/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define NOMEM_THRESHOLD_COUNT			8
#define ZERO_BUFFER_SIZE			0x100000

#define OWNER_BDEV		0x2

#define OBJECT_BDEV_IO		0x2

#define TRACE_GROUP_BDEV	0x3
#define TRACE_BDEV_IO_START	SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x0)
#define TRACE_BDEV_IO_DONE	SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x1)

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

#define SPDK_BDEV_POOL_ALIGNMENT 512

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

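/*
 * Global state for the bdev layer: the spdk_bdev_io mempool, the small and large
 * data buffer pools, the list of registered bdev modules and the list of registered
 * bdevs. A single instance, g_bdev_mgr, is shared by all threads.
 */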
struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	struct spdk_mempool *buf_small_pool;
	struct spdk_mempool *buf_large_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;

	bool init_complete;
	bool module_init_complete;

	pthread_mutex_t mutex;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.init_complete = false,
	.module_init_complete = false,
	.mutex = PTHREAD_MUTEX_INITIALIZER,
};

typedef void (*lock_range_cb)(void *ctx, int status);

struct lba_range {
	uint64_t			offset;
	uint64_t			length;
	void				*locked_ctx;
	struct spdk_bdev_channel	*owner_ch;
	TAILQ_ENTRY(lba_range)		tailq;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.small_buf_pool_size = BUF_SMALL_POOL_SIZE,
	.large_buf_pool_size = BUF_LARGE_POOL_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 *  For remaining bytes, allowed to run negative if an I/O is submitted when
	 *  some bytes are remaining, but the I/O is bigger than that amount. The
	 *  excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO. */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update for the submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limits, one entry per rate limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	bdev_io_stailq_t need_buf_small;
	bdev_io_stailq_t need_buf_large;

	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t	per_thread_cache_count;
	uint32_t	bdev_io_cache_size;

	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their I/O awaiting retry here. This makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t		nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t		nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel	*shared_ch;

	/* Refcount of bdev channels using this resource */
	uint32_t		ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t		io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t		io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t		io_locked;

	uint32_t		flags;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t		start_tsc;
	uint64_t		interval_tsc;
	__itt_string_handle	*handle;
	struct spdk_bdev_io_stat prev_stat;
#endif

	bdev_io_tailq_t		queued_resets;

	lba_range_tailq_t	locked_ranges;
};

struct media_event_entry {
	struct spdk_bdev_media_event	event;
	TAILQ_ENTRY(media_event_entry)	tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev		*bdev;
	struct spdk_thread		*thread;
	struct {
		bool open_with_ext;
		union {
			spdk_bdev_remove_cb_t remove_fn;
			spdk_bdev_event_cb_t event_fn;
		};
		void *ctx;
	}				callback;
	bool				closed;
	bool				write;
	pthread_mutex_t			mutex;
	uint32_t			refs;
	TAILQ_HEAD(, media_event_entry)	pending_media_events;
	TAILQ_HEAD(, media_event_entry)	free_media_events;
	struct media_event_entry	*media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc)	link;

	uint64_t		timeout_in_sec;
	spdk_bdev_io_timeout_cb	cb_fn;
	void			*cb_arg;
	struct spdk_poller	*io_timeout_poller;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static void bdev_write_zero_buffer_next(void *_bdev_io);

static void bdev_enable_qos_msg(struct spdk_io_channel_iter *i);
static void bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status);

static int
bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			  struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
			  uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg);
static int
bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			   struct iovec *iov, int iovcnt, void *md_buf,
			   uint64_t offset_blocks, uint64_t num_blocks,
			   spdk_bdev_io_completion_cb cb, void *cb_arg);

static int
bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		    uint64_t offset, uint64_t length,
		    lock_range_cb cb_fn, void *cb_arg);

static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg);

static inline void bdev_io_complete(void *ctx);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort);

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

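	/* Copy each field out of g_bdev_opts only if it fits within the opts_size the
	 * caller passed, so callers built against an older, smaller spdk_bdev_opts
	 * layout remain compatible. */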
	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	/* Do not remove this statement. You should always update this statement when you add a new field,
	 * and do not forget to add the SET_FIELD statement for your added field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization.  A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

	if (opts->small_buf_pool_size < BUF_SMALL_POOL_SIZE) {
		SPDK_ERRLOG("small_buf_pool_size must be at least %" PRIu32 "\n", BUF_SMALL_POOL_SIZE);
		return -1;
	}

	if (opts->large_buf_pool_size < BUF_LARGE_POOL_SIZE) {
		SPDK_ERRLOG("large_buf_pool_size must be at least %" PRIu32 "\n", BUF_LARGE_POOL_SIZE);
		return -1;
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}

struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;
	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;
	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}

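/*
 * Offer a bdev to the registered modules: every module may inspect it via
 * examine_config(); afterwards either the claiming module (if the bdev is
 * claimed) or every module gets a chance to examine_disk() it.
 */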
static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	uint32_t action;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config && bdev_ok_to_examine(bdev)) {
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n",
					    module->name);
			}
		}
	}

	if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) {
		if (bdev->internal.claim_module->examine_disk) {
			bdev->internal.claim_module->internal.action_in_progress++;
			bdev->internal.claim_module->examine_disk(bdev);
		}
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_disk && bdev_ok_to_examine(bdev)) {
			module->internal.action_in_progress++;
			module->examine_disk(bdev);
		}
	}
}

int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_module == NULL) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_alias *tmp;
	struct spdk_bdev *bdev = spdk_bdev_first();

	while (bdev != NULL) {
		if (strcmp(bdev_name, bdev->name) == 0) {
			return bdev;
		}

		TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
			if (strcmp(bdev_name, tmp->alias) == 0) {
				return bdev;
			}
		}

		bdev = spdk_bdev_next(bdev);
	}

	return NULL;
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}

static void
_copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt)
{
	int i;
	size_t len;

	for (i = 0; i < iovcnt; i++) {
		len = spdk_min(iovs[i].iov_len, buf_len);
		memcpy(buf, iovs[i].iov_base, len);
		buf += len;
		buf_len -= len;
	}
}

static void
_copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len)
{
	int i;
	size_t len;

	for (i = 0; i < iovcnt; i++) {
		len = spdk_min(iovs[i].iov_len, buf_len);
		memcpy(iovs[i].iov_base, buf, len);
		buf += len;
		buf_len -= len;
	}
}

static void
_bdev_io_set_bounce_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	/* save original iovec */
	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;
	/* if this is write path, copy data from original buffer to bounce buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt);
	}
}

static void
_bdev_io_set_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	/* save original md_buf */
	bdev_io->internal.orig_md_buf = bdev_io->u.bdev.md_buf;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		memcpy(md_buf, bdev_io->internal.orig_md_buf, len);
	}
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, void *buf, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.buf = buf;
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t md_len, alignment;
	void *aligned_buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, buf, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_set_bounce_buf(bdev_io, aligned_buf, len);
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	if (spdk_bdev_is_md_separate(bdev)) {
		aligned_buf = (char *)aligned_buf + len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)aligned_buf & (alignment - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_set_bounce_md_buf(bdev_io, aligned_buf, md_len);
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, aligned_buf, md_len);
		}
	}
	bdev_io_get_buf_complete(bdev_io, buf, true);
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_mempool *pool;
	struct spdk_bdev_io *tmp;
	bdev_io_stailq_t *stailq;
	struct spdk_bdev_mgmt_channel *ch;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ?
		 bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
	alignment = spdk_bdev_get_buf_align(bdev);
	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	if (buf_len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
	    SPDK_BDEV_POOL_ALIGNMENT) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &ch->need_buf_large;
	}

	if (STAILQ_EMPTY(stailq)) {
		spdk_mempool_put(pool, buf);
	} else {
		tmp = STAILQ_FIRST(stailq);
		STAILQ_REMOVE_HEAD(stailq, internal.buf_link);
		_bdev_io_set_buf(tmp, buf, tmp->internal.buf_len);
	}
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.buf != NULL);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
	bdev_io->internal.buf = NULL;
}

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static void
_bdev_io_unset_bounce_buf(struct spdk_bdev_io *bdev_io)
{
	if (spdk_likely(bdev_io->internal.orig_iovcnt == 0)) {
		assert(bdev_io->internal.orig_md_buf == NULL);
		return;
	}

	/* if this is read path, copy data from bounce buffer to original buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
	    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
				  bdev_io->internal.orig_iovcnt,
				  bdev_io->internal.bounce_iov.iov_base,
				  bdev_io->internal.bounce_iov.iov_len);
	}
	/* set original buffer for this io */
	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
	/* disable bouncing buffer for this io */
	bdev_io->internal.orig_iovcnt = 0;
	bdev_io->internal.orig_iovs = NULL;

	/* do the same for metadata buffer */
	if (spdk_unlikely(bdev_io->internal.orig_md_buf != NULL)) {
		assert(spdk_bdev_is_md_separate(bdev_io->bdev));

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
		    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
			memcpy(bdev_io->internal.orig_md_buf, bdev_io->u.bdev.md_buf,
			       bdev_io->u.bdev.num_blocks * spdk_bdev_get_md_size(bdev_io->bdev));
		}

		bdev_io->u.bdev.md_buf = bdev_io->internal.orig_md_buf;
		bdev_io->internal.orig_md_buf = NULL;
	}

	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
	 */
	bdev_io_put_buf(bdev_io);
}

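/*
 * Pull a data buffer for this I/O from the small or large buffer pool based on the
 * padded length. If the chosen pool is empty, queue the I/O on the management
 * channel until another I/O returns a buffer.
 */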
static void
bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_mempool *pool;
	bdev_io_stailq_t *stailq;
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	uint64_t alignment, md_len;
	void *buf;

	alignment = spdk_bdev_get_buf_align(bdev);
	md_len = spdk_bdev_is_md_separate(bdev) ?
		 bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	if (len + alignment + md_len > SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) +
	    SPDK_BDEV_POOL_ALIGNMENT) {
		SPDK_ERRLOG("Length + alignment %" PRIu64 " is larger than allowed\n",
			    len + alignment);
		bdev_io_get_buf_complete(bdev_io, NULL, false);
		return;
	}

	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	bdev_io->internal.buf_len = len;

	if (len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
	    SPDK_BDEV_POOL_ALIGNMENT) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &mgmt_ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &mgmt_ch->need_buf_large;
	}

	buf = spdk_mempool_get(pool);
	if (!buf) {
		STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link);
	} else {
		_bdev_io_set_buf(bdev_io, buf, len);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t alignment;

	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	alignment = spdk_bdev_get_buf_align(bdev);

	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
		/* Buffer already present and aligned */
		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
		return;
	}

	bdev_io_get_buf(bdev_io, len);
}

void
spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(cb != NULL);
	assert(bdev_io->internal.get_aux_buf_cb == NULL);
	bdev_io->internal.get_aux_buf_cb = cb;
	bdev_io_get_buf(bdev_io, len);
}

static int
bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

static void
bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	int i;
	struct spdk_bdev_qos *qos = bdev->internal.qos;
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	if (!qos) {
		return;
	}

	spdk_bdev_get_qos_rate_limits(bdev, limits);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] > 0) {
			spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
		}
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

void
spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_module *bdev_module;
	struct spdk_bdev *bdev;

	assert(w != NULL);

	spdk_json_write_array_begin(w);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_set_options");
	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
	spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine);
	spdk_json_write_object_end(w);
	spdk_json_write_object_end(w);

	bdev_examine_allowlist_config_json(w);

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->config_json) {
			bdev_module->config_json(w);
		}
	}

	pthread_mutex_lock(&g_bdev_mgr.mutex);

	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
		if (bdev->fn_table->write_config_json) {
			bdev->fn_table->write_config_json(bdev, w);
		}

		bdev_qos_config_json(bdev, w);
	}

	pthread_mutex_unlock(&g_bdev_mgr.mutex);

	spdk_json_write_array_end(w);
}

static int
bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;
	uint32_t i;

	STAILQ_INIT(&ch->need_buf_small);
	STAILQ_INIT(&ch->need_buf_large);

	STAILQ_INIT(&ch->per_thread_cache);
	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;

	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
	ch->per_thread_cache_count = 0;
	for (i = 0; i < ch->bdev_io_cache_size; i++) {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		assert(bdev_io != NULL);
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
	}

	TAILQ_INIT(&ch->shared_resources);
	TAILQ_INIT(&ch->io_wait_queue);

	return 0;
}

static void
bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;

	if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
		SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n");
	}

	if (!TAILQ_EMPTY(&ch->shared_resources)) {
		SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n");
	}

	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}

	assert(ch->per_thread_cache_count == 0);
}

static void
bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	if (rc == 0) {
		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (m->init_complete) {
				m->init_complete();
			}
		}
	}

	cb_fn(cb_arg, rc);
}

static void
bdev_module_action_complete(void)
{
	struct spdk_bdev_module *m;

	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->internal.action_in_progress > 0) {
			return;
		}
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
	 */
	bdev_init_complete(0);
}

static void
bdev_module_action_done(struct spdk_bdev_module *module)
{
	assert(module->internal.action_in_progress > 0);
	module->internal.action_in_progress--;
	bdev_module_action_complete();
}

void
spdk_bdev_module_init_done(struct spdk_bdev_module *module)
{
	bdev_module_action_done(module);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
{
	bdev_module_action_done(module);
}

/** The last initialized bdev module */
static struct spdk_bdev_module *g_resume_bdev_module = NULL;

static void
bdev_init_failed(void *cb_arg)
{
	struct spdk_bdev_module *module = cb_arg;

	module->internal.action_in_progress--;
	bdev_init_complete(-1);
}

static int
bdev_modules_init(void)
{
	struct spdk_bdev_module *module;
	int rc = 0;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		g_resume_bdev_module = module;
		if (module->async_init) {
			module->internal.action_in_progress = 1;
		}
		rc = module->module_init();
		if (rc != 0) {
			/* Bump action_in_progress to prevent other modules from completing modules_init.
			 * Send a message to defer application shutdown until resources are cleaned up. */
			module->internal.action_in_progress = 1;
			spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
			return rc;
		}
	}

	g_resume_bdev_module = NULL;
	return 0;
}

void
spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
{
	int cache_size;
	int rc = 0;
	char mempool_name[32];

	assert(cb_fn != NULL);

	g_init_cb_fn = cb_fn;
	g_init_cb_arg = cb_arg;

	spdk_notify_type_register("bdev_register");
	spdk_notify_type_register("bdev_unregister");

	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());

	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
				  g_bdev_opts.bdev_io_pool_size,
				  sizeof(struct spdk_bdev_io) +
				  bdev_module_get_max_ctx_size(),
				  0,
				  SPDK_ENV_SOCKET_ID_ANY);

	if (g_bdev_mgr.bdev_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		bdev_init_complete(-1);
		return;
	}

	/**
	 * Ensure no more than half of the total buffers end up in local caches, by
	 * using spdk_env_get_core_count() to determine how many local caches we need
	 * to account for.
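	 * For example, with the default BUF_SMALL_POOL_SIZE of 8191 and 4 cores, each
	 * per-core cache is capped at 8191 / (2 * 4) = 1023 buffers.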
	 */
	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());

	g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
				    g_bdev_opts.small_buf_pool_size,
				    SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
				    SPDK_BDEV_POOL_ALIGNMENT,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_small_pool) {
		SPDK_ERRLOG("create rbuf small pool failed\n");
		bdev_init_complete(-1);
		return;
	}

	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());

	g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
				    g_bdev_opts.large_buf_pool_size,
				    SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) +
				    SPDK_BDEV_POOL_ALIGNMENT,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_large_pool) {
		SPDK_ERRLOG("create rbuf large pool failed\n");
		bdev_init_complete(-1);
		return;
	}

	g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
					      NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
	if (!g_bdev_mgr.zero_buffer) {
		SPDK_ERRLOG("create bdev zero buffer failed\n");
		bdev_init_complete(-1);
		return;
	}

#ifdef SPDK_CONFIG_VTUNE
	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
#endif

	spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create,
				bdev_mgmt_channel_destroy,
				sizeof(struct spdk_bdev_mgmt_channel),
				"bdev_mgr");

	rc = bdev_modules_init();
	g_bdev_mgr.module_init_complete = true;
	if (rc != 0) {
		SPDK_ERRLOG("bdev modules init failed\n");
		return;
	}

	bdev_module_action_complete();
}

static void
bdev_mgr_unregister_cb(void *io_device)
{
	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;

	if (g_bdev_mgr.bdev_io_pool) {
		if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
			SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
				    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
				    g_bdev_opts.bdev_io_pool_size);
		}

		spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
	}

	if (g_bdev_mgr.buf_small_pool) {
		if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != g_bdev_opts.small_buf_pool_size) {
			SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
				    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
				    g_bdev_opts.small_buf_pool_size);
			assert(false);
		}

		spdk_mempool_free(g_bdev_mgr.buf_small_pool);
	}

	if (g_bdev_mgr.buf_large_pool) {
		if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != g_bdev_opts.large_buf_pool_size) {
			SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
				    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
				    g_bdev_opts.large_buf_pool_size);
			assert(false);
		}

		spdk_mempool_free(g_bdev_mgr.buf_large_pool);
	}

	spdk_free(g_bdev_mgr.zero_buffer);

	bdev_examine_allowlist_free();

	cb_fn(g_fini_cb_arg);
	g_fini_cb_fn = NULL;
	g_fini_cb_arg = NULL;
	g_bdev_mgr.init_complete = false;
	g_bdev_mgr.module_init_complete = false;
	pthread_mutex_destroy(&g_bdev_mgr.mutex);
}

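/*
 * Tear the bdev modules down in reverse registration order. A module flagged
 * async_fini pauses the walk here until it calls spdk_bdev_module_finish_done().
 */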
static void
bdev_module_finish_iter(void *arg)
{
	struct spdk_bdev_module *bdev_module;

	/* FIXME: Handling initialization failures is broken now,
	 * so we won't even try cleaning up after successfully
	 * initialized modules. If module_init_complete is false,
	 * just call bdev_mgr_unregister_cb.
	 */
	if (!g_bdev_mgr.module_init_complete) {
		bdev_mgr_unregister_cb(NULL);
		return;
	}

	/* Start iterating from the last touched module */
	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
	} else {
		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
					 internal.tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini) {
			/* Save our place so we can resume later. We must
			 * save the variable here, before calling module_fini()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_finish_done() and re-enter
			 * this function to continue iterating. */
			g_resume_bdev_module = bdev_module;
		}

		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}

		if (bdev_module->async_fini) {
			return;
		}

		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
					 internal.tailq);
	}

	g_resume_bdev_module = NULL;
	spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
}

void
spdk_bdev_module_finish_done(void)
{
	if (spdk_get_thread() != g_fini_thread) {
		spdk_thread_send_msg(g_fini_thread, bdev_module_finish_iter, NULL);
	} else {
		bdev_module_finish_iter(NULL);
	}
}

static void
bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
{
	struct spdk_bdev *bdev = cb_arg;

	if (bdeverrno && bdev) {
		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
			     bdev->name);

		/*
		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
		 * bdev; try to continue by manually removing this bdev from the list and continue
		 * with the next bdev in the list.
		 */
		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
	}

	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
		SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
		/*
		 * Bdev module finish needs to be deferred as we might be in the middle of some context
		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
		 * after returning.
		 */
		spdk_thread_send_msg(spdk_get_thread(), bdev_module_finish_iter, NULL);
		return;
	}

	/*
	 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem
	 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
	 * to detect clean shutdown as opposed to run-time hot removal of the underlying
	 * base bdevs.
	 *
	 * Also, walk the list in the reverse order.
	 */
	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
		if (bdev->internal.claim_module != NULL) {
			SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n",
				      bdev->name, bdev->internal.claim_module->name);
			continue;
		}

		SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name);
		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
		return;
	}

	/*
	 * If any bdev fails to unclaim underlying bdev properly, we may face the
	 * case of bdev list consisting of claimed bdevs only (if claims are managed
	 * correctly, this would mean there's a loop in the claims graph which is
	 * clearly impossible). Warn and unregister last bdev on the list then.
	 */
	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
		SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
		return;
	}
}

void
spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_module *m;

	assert(cb_fn != NULL);

	g_fini_thread = spdk_get_thread();

	g_fini_cb_fn = cb_fn;
	g_fini_cb_arg = cb_arg;

	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->fini_start) {
			m->fini_start();
		}
	}

	bdev_finish_unregister_bdevs_iter(NULL, 0);
}

struct spdk_bdev_io *
bdev_channel_get_io(struct spdk_bdev_channel *channel)
{
	struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
	struct spdk_bdev_io *bdev_io;

	if (ch->per_thread_cache_count > 0) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
	} else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
		/*
		 * Don't try to look for bdev_ios in the global pool if there are
		 * waiters on bdev_ios - we don't want this caller to jump the line.
		 */
		bdev_io = NULL;
	} else {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
	}

	return bdev_io;
}

void
spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_mgmt_channel *ch;

	assert(bdev_io != NULL);
	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	if (bdev_io->internal.buf != NULL) {
		bdev_io_put_buf(bdev_io);
	}

	if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
		while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
			struct spdk_bdev_io_wait_entry *entry;

			entry = TAILQ_FIRST(&ch->io_wait_queue);
			TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
			entry->cb_fn(entry->cb_arg);
		}
	} else {
		/* We should never have a full cache with entries on the io wait queue. */
		assert(TAILQ_EMPTY(&ch->io_wait_queue));
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}
}

static bool
bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
{
	assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);

	switch (limit) {
	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
		return true;
	case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
	case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
	case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
		return false;
	case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
	default:
		return false;
	}
}

static bool
bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return true;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		if (bdev_io->u.bdev.zcopy.start) {
			return true;
		} else {
			return false;
		}
	default:
		return false;
	}
}

static bool
bdev_is_read_io(struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		/* Bit 1 (0x2) set for read operation */
		if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
			return true;
		} else {
			return false;
		}
	case SPDK_BDEV_IO_TYPE_READ:
		return true;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Populate to read from disk */
		if (bdev_io->u.bdev.zcopy.populate) {
			return true;
		} else {
			return false;
		}
	default:
		return false;
	}
}

static uint64_t
bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		return bdev_io->u.nvme_passthru.nbytes;
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return bdev_io->u.bdev.num_blocks * bdev->blocklen;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Track the data in the start phase only */
		if (bdev_io->u.bdev.zcopy.start) {
			return bdev_io->u.bdev.num_blocks * bdev->blocklen;
		} else {
			return 0;
		}
	default:
		return 0;
	}
}

static bool
bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) {
		return true;
	} else {
		return false;
	}
}

static bool
bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (bdev_is_read_io(io) == false) {
		return false;
	}

	return bdev_qos_rw_queue_io(limit, io);
}

static bool
bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (bdev_is_read_io(io) == true) {
		return false;
	}

	return bdev_qos_rw_queue_io(limit, io);
}

static void
bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	limit->remaining_this_timeslice--;
}

static void
bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io);
}

static void
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (bdev_is_read_io(io) == false) {
		return;
	}

	return bdev_qos_rw_bps_update_quota(limit, io);
}

static void
bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (bdev_is_read_io(io) == true) {
		return;
	}

	return bdev_qos_rw_bps_update_quota(limit, io);
}

static void
bdev_qos_set_ops(struct spdk_bdev_qos *qos)
{
	int i;

	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
			qos->rate_limits[i].queue_io = NULL;
			qos->rate_limits[i].update_quota = NULL;
			continue;
		}

		switch (i) {
		case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
			qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io;
			qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota;
			break;
		case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
			qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io;
			qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota;
			break;
		case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
			qos->rate_limits[i].queue_io = bdev_qos_r_queue_io;
			qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota;
			break;
		case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
			qos->rate_limits[i].queue_io = bdev_qos_w_queue_io;
			qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota;
			break;
		default:
			break;
		}
	}
}

static void
_bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch,
			    struct spdk_bdev_io *bdev_io,
			    enum spdk_bdev_io_status status)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	bdev_io->internal.in_submit_request = true;
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
	spdk_bdev_io_complete(bdev_io, status);
	bdev_io->internal.in_submit_request = false;
}

static inline void
bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_io_channel *ch = bdev_ch->channel;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
		struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch;
		struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;

		if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) ||
		    bdev_abort_buf_io(&mgmt_channel->need_buf_small, bio_to_abort) ||
		    bdev_abort_buf_io(&mgmt_channel->need_buf_large, bio_to_abort)) {
			_bdev_io_complete_in_submit(bdev_ch, bdev_io,
						    SPDK_BDEV_IO_STATUS_SUCCESS);
			return;
		}
	}

	if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch->io_outstanding++;
		shared_resource->io_outstanding++;
		bdev_io->internal.in_submit_request = true;
		bdev->fn_table->submit_request(ch, bdev_io);
		bdev_io->internal.in_submit_request = false;
	} else {
		TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
	}
}

static int
bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos)
{
	struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL;
	int i, submitted_ios = 0;

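	/*
	 * Submit queued I/O in FIFO order. For an I/O that is subject to rate limiting,
	 * stop as soon as any enabled limit says it has to wait; otherwise charge every
	 * enabled limit's quota before handing the I/O to the module.
	 */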
	TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) {
		if (bdev_qos_io_to_limit(bdev_io) == true) {
			for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
				if (!qos->rate_limits[i].queue_io) {
					continue;
				}

				if (qos->rate_limits[i].queue_io(&qos->rate_limits[i],
								 bdev_io) == true) {
					return submitted_ios;
				}
			}
			for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
				if (!qos->rate_limits[i].update_quota) {
					continue;
				}

				qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io);
			}
		}

		TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
		bdev_io_do_submit(ch, bdev_io);
		submitted_ios++;
	}

	return submitted_ios;
}

static void
bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn)
{
	int rc;

	bdev_io->internal.waitq_entry.bdev = bdev_io->bdev;
	bdev_io->internal.waitq_entry.cb_fn = cb_fn;
	bdev_io->internal.waitq_entry.cb_arg = bdev_io;
	rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch),
				     &bdev_io->internal.waitq_entry);
	if (rc != 0) {
		SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
	}
}

static bool
bdev_io_type_can_split(uint8_t type)
{
	assert(type != SPDK_BDEV_IO_TYPE_INVALID);
	assert(type < SPDK_BDEV_NUM_IO_TYPES);

	/* Only split READ and WRITE I/O.  Theoretically other types of I/O like
	 * UNMAP could be split, but these types of I/O are typically much larger
	 * in size (sometimes the size of the entire block device), and the bdev
	 * module can more efficiently split these types of I/O.  Plus those types
	 * of I/O do not have a payload, which makes the splitting process simpler.
	 */
	if (type == SPDK_BDEV_IO_TYPE_READ || type == SPDK_BDEV_IO_TYPE_WRITE) {
		return true;
	} else {
		return false;
	}
}

static bool
bdev_io_should_split(struct spdk_bdev_io *bdev_io)
{
	uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary;
	uint32_t max_size = bdev_io->bdev->max_segment_size;
	int max_segs = bdev_io->bdev->max_num_segments;

	io_boundary = bdev_io->bdev->split_on_optimal_io_boundary ? io_boundary : 0;

	if (spdk_likely(!io_boundary && !max_segs && !max_size)) {
		return false;
	}

	if (!bdev_io_type_can_split(bdev_io->type)) {
		return false;
	}

	if (io_boundary) {
		uint64_t start_stripe, end_stripe;

		start_stripe = bdev_io->u.bdev.offset_blocks;
		end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1;
		/* Avoid expensive div operations if possible.  These spdk_u32 functions are very cheap. */
*/ 1947 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 1948 start_stripe >>= spdk_u32log2(io_boundary); 1949 end_stripe >>= spdk_u32log2(io_boundary); 1950 } else { 1951 start_stripe /= io_boundary; 1952 end_stripe /= io_boundary; 1953 } 1954 1955 if (start_stripe != end_stripe) { 1956 return true; 1957 } 1958 } 1959 1960 if (max_segs) { 1961 if (bdev_io->u.bdev.iovcnt > max_segs) { 1962 return true; 1963 } 1964 } 1965 1966 if (max_size) { 1967 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 1968 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 1969 return true; 1970 } 1971 } 1972 } 1973 1974 return false; 1975 } 1976 1977 static uint32_t 1978 _to_next_boundary(uint64_t offset, uint32_t boundary) 1979 { 1980 return (boundary - (offset % boundary)); 1981 } 1982 1983 static void 1984 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 1985 1986 static void 1987 _bdev_io_split(void *_bdev_io) 1988 { 1989 struct iovec *parent_iov, *iov; 1990 struct spdk_bdev_io *bdev_io = _bdev_io; 1991 struct spdk_bdev *bdev = bdev_io->bdev; 1992 uint64_t parent_offset, current_offset, remaining; 1993 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 1994 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 1995 uint32_t iovcnt, iov_len, child_iovsize; 1996 uint32_t blocklen = bdev->blocklen; 1997 uint32_t io_boundary = bdev->optimal_io_boundary; 1998 uint32_t max_segment_size = bdev->max_segment_size; 1999 uint32_t max_child_iovcnt = bdev->max_num_segments; 2000 void *md_buf = NULL; 2001 int rc; 2002 2003 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2004 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, BDEV_IO_NUM_CHILD_IOV) : 2005 BDEV_IO_NUM_CHILD_IOV; 2006 io_boundary = bdev->split_on_optimal_io_boundary ? 
io_boundary : UINT32_MAX; 2007 2008 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2009 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2010 parent_offset = bdev_io->u.bdev.offset_blocks; 2011 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2012 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2013 2014 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2015 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2016 if (parent_iov_offset < parent_iov->iov_len) { 2017 break; 2018 } 2019 parent_iov_offset -= parent_iov->iov_len; 2020 } 2021 2022 child_iovcnt = 0; 2023 while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { 2024 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2025 to_next_boundary = spdk_min(remaining, to_next_boundary); 2026 to_next_boundary_bytes = to_next_boundary * blocklen; 2027 2028 iov = &bdev_io->child_iov[child_iovcnt]; 2029 iovcnt = 0; 2030 2031 if (bdev_io->u.bdev.md_buf) { 2032 md_buf = (char *)bdev_io->u.bdev.md_buf + 2033 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2034 } 2035 2036 child_iovsize = spdk_min(BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2037 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2038 iovcnt < child_iovsize) { 2039 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2040 iov_len = parent_iov->iov_len - parent_iov_offset; 2041 2042 iov_len = spdk_min(iov_len, max_segment_size); 2043 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2044 to_next_boundary_bytes -= iov_len; 2045 2046 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2047 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2048 2049 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2050 parent_iov_offset += iov_len; 2051 } else { 2052 parent_iovpos++; 2053 parent_iov_offset = 0; 2054 } 2055 child_iovcnt++; 2056 iovcnt++; 2057 } 2058 2059 if (to_next_boundary_bytes > 0) { 2060 /* We had to stop this child I/O early because we ran out of 2061 * child_iov space or were limited by max_num_segments. 2062 * Ensure the iovs to be aligned with block size and 2063 * then adjust to_next_boundary before starting the 2064 * child I/O. 2065 */ 2066 assert(child_iovcnt == BDEV_IO_NUM_CHILD_IOV || 2067 iovcnt == child_iovsize); 2068 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2069 if (to_last_block_bytes != 0) { 2070 uint32_t child_iovpos = child_iovcnt - 1; 2071 /* don't decrease child_iovcnt when it equals to BDEV_IO_NUM_CHILD_IOV 2072 * so the loop will naturally end 2073 */ 2074 2075 to_last_block_bytes = blocklen - to_last_block_bytes; 2076 to_next_boundary_bytes += to_last_block_bytes; 2077 while (to_last_block_bytes > 0 && iovcnt > 0) { 2078 iov_len = spdk_min(to_last_block_bytes, 2079 bdev_io->child_iov[child_iovpos].iov_len); 2080 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2081 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2082 child_iovpos--; 2083 if (--iovcnt == 0) { 2084 /* If the child IO is less than a block size just return. 2085 * If the first child IO of any split round is less than 2086 * a block size, an error exit. 
2087 */ 2088 if (bdev_io->u.bdev.split_outstanding == 0) { 2089 SPDK_ERRLOG("The first child io was less than a block size\n"); 2090 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2091 spdk_trace_record_tsc(spdk_get_ticks(), TRACE_BDEV_IO_DONE, 0, 0, 2092 (uintptr_t)bdev_io, 0); 2093 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2094 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2095 } 2096 2097 return; 2098 } 2099 } 2100 2101 to_last_block_bytes -= iov_len; 2102 2103 if (parent_iov_offset == 0) { 2104 parent_iovpos--; 2105 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 2106 } 2107 parent_iov_offset -= iov_len; 2108 } 2109 2110 assert(to_last_block_bytes == 0); 2111 } 2112 to_next_boundary -= to_next_boundary_bytes / blocklen; 2113 } 2114 2115 bdev_io->u.bdev.split_outstanding++; 2116 2117 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 2118 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2119 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2120 iov, iovcnt, md_buf, current_offset, 2121 to_next_boundary, 2122 bdev_io_split_done, bdev_io); 2123 } else { 2124 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2125 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2126 iov, iovcnt, md_buf, current_offset, 2127 to_next_boundary, 2128 bdev_io_split_done, bdev_io); 2129 } 2130 2131 if (rc == 0) { 2132 current_offset += to_next_boundary; 2133 remaining -= to_next_boundary; 2134 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2135 bdev_io->u.bdev.split_remaining_num_blocks = remaining; 2136 } else { 2137 bdev_io->u.bdev.split_outstanding--; 2138 if (rc == -ENOMEM) { 2139 if (bdev_io->u.bdev.split_outstanding == 0) { 2140 /* No I/O is outstanding. Hence we should wait here. */ 2141 bdev_queue_io_wait_with_cb(bdev_io, _bdev_io_split); 2142 } 2143 } else { 2144 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2145 if (bdev_io->u.bdev.split_outstanding == 0) { 2146 spdk_trace_record_tsc(spdk_get_ticks(), TRACE_BDEV_IO_DONE, 0, 0, 2147 (uintptr_t)bdev_io, 0); 2148 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2149 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2150 } 2151 } 2152 2153 return; 2154 } 2155 } 2156 } 2157 2158 static void 2159 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2160 { 2161 struct spdk_bdev_io *parent_io = cb_arg; 2162 2163 spdk_bdev_free_io(bdev_io); 2164 2165 if (!success) { 2166 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2167 /* If any child I/O failed, stop further splitting process. */ 2168 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 2169 parent_io->u.bdev.split_remaining_num_blocks = 0; 2170 } 2171 parent_io->u.bdev.split_outstanding--; 2172 if (parent_io->u.bdev.split_outstanding != 0) { 2173 return; 2174 } 2175 2176 /* 2177 * Parent I/O finishes when all blocks are consumed. 2178 */ 2179 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2180 assert(parent_io->internal.cb != bdev_io_split_done); 2181 spdk_trace_record_tsc(spdk_get_ticks(), TRACE_BDEV_IO_DONE, 0, 0, 2182 (uintptr_t)parent_io, 0); 2183 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2184 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2185 parent_io->internal.caller_ctx); 2186 return; 2187 } 2188 2189 /* 2190 * Continue with the splitting process. 
This function will complete the parent I/O if the 2191 * splitting is done. 2192 */ 2193 _bdev_io_split(parent_io); 2194 } 2195 2196 static void 2197 bdev_io_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success); 2198 2199 static void 2200 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2201 { 2202 assert(bdev_io_type_can_split(bdev_io->type)); 2203 2204 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2205 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2206 bdev_io->u.bdev.split_outstanding = 0; 2207 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2208 2209 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2210 _bdev_io_split(bdev_io); 2211 } else { 2212 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2213 spdk_bdev_io_get_buf(bdev_io, bdev_io_split_get_buf_cb, 2214 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2215 } 2216 } 2217 2218 static void 2219 bdev_io_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2220 { 2221 if (!success) { 2222 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2223 return; 2224 } 2225 2226 _bdev_io_split(bdev_io); 2227 } 2228 2229 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 2230 * be inlined, at least on some compilers. 2231 */ 2232 static inline void 2233 _bdev_io_submit(void *ctx) 2234 { 2235 struct spdk_bdev_io *bdev_io = ctx; 2236 struct spdk_bdev *bdev = bdev_io->bdev; 2237 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2238 uint64_t tsc; 2239 2240 tsc = spdk_get_ticks(); 2241 bdev_io->internal.submit_tsc = tsc; 2242 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, bdev_io->type); 2243 2244 if (spdk_likely(bdev_ch->flags == 0)) { 2245 bdev_io_do_submit(bdev_ch, bdev_io); 2246 return; 2247 } 2248 2249 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2250 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2251 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2252 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2253 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2254 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2255 } else { 2256 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2257 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2258 } 2259 } else { 2260 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2261 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2262 } 2263 } 2264 2265 bool 2266 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2267 2268 bool 2269 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2270 { 2271 if (range1->length == 0 || range2->length == 0) { 2272 return false; 2273 } 2274 2275 if (range1->offset + range1->length <= range2->offset) { 2276 return false; 2277 } 2278 2279 if (range2->offset + range2->length <= range1->offset) { 2280 return false; 2281 } 2282 2283 return true; 2284 } 2285 2286 static bool 2287 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 2288 { 2289 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2290 struct lba_range r; 2291 2292 switch (bdev_io->type) { 2293 case SPDK_BDEV_IO_TYPE_NVME_IO: 2294 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2295 /* Don't try to decode the NVMe command - just assume 
worst-case and that 2296 * it overlaps a locked range. 2297 */ 2298 return true; 2299 case SPDK_BDEV_IO_TYPE_WRITE: 2300 case SPDK_BDEV_IO_TYPE_UNMAP: 2301 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2302 case SPDK_BDEV_IO_TYPE_ZCOPY: 2303 r.offset = bdev_io->u.bdev.offset_blocks; 2304 r.length = bdev_io->u.bdev.num_blocks; 2305 if (!bdev_lba_range_overlapped(range, &r)) { 2306 /* This I/O doesn't overlap the specified LBA range. */ 2307 return false; 2308 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 2309 /* This I/O overlaps, but the I/O is on the same channel that locked this 2310 * range, and the caller_ctx is the same as the locked_ctx. This means 2311 * that this I/O is associated with the lock, and is allowed to execute. 2312 */ 2313 return false; 2314 } else { 2315 return true; 2316 } 2317 default: 2318 return false; 2319 } 2320 } 2321 2322 void 2323 bdev_io_submit(struct spdk_bdev_io *bdev_io) 2324 { 2325 struct spdk_bdev *bdev = bdev_io->bdev; 2326 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 2327 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2328 2329 assert(thread != NULL); 2330 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2331 2332 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 2333 struct lba_range *range; 2334 2335 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 2336 if (bdev_io_range_is_locked(bdev_io, range)) { 2337 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 2338 return; 2339 } 2340 } 2341 } 2342 2343 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 2344 2345 if (bdev_io_should_split(bdev_io)) { 2346 bdev_io->internal.submit_tsc = spdk_get_ticks(); 2347 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 2348 (uintptr_t)bdev_io, bdev_io->type); 2349 bdev_io_split(NULL, bdev_io); 2350 return; 2351 } 2352 2353 if (ch->flags & BDEV_CH_QOS_ENABLED) { 2354 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 2355 _bdev_io_submit(bdev_io); 2356 } else { 2357 bdev_io->internal.io_submit_ch = ch; 2358 bdev_io->internal.ch = bdev->internal.qos->ch; 2359 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 2360 } 2361 } else { 2362 _bdev_io_submit(bdev_io); 2363 } 2364 } 2365 2366 static void 2367 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 2368 { 2369 struct spdk_bdev *bdev = bdev_io->bdev; 2370 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2371 struct spdk_io_channel *ch = bdev_ch->channel; 2372 2373 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2374 2375 bdev_io->internal.in_submit_request = true; 2376 bdev->fn_table->submit_request(ch, bdev_io); 2377 bdev_io->internal.in_submit_request = false; 2378 } 2379 2380 void 2381 bdev_io_init(struct spdk_bdev_io *bdev_io, 2382 struct spdk_bdev *bdev, void *cb_arg, 2383 spdk_bdev_io_completion_cb cb) 2384 { 2385 bdev_io->bdev = bdev; 2386 bdev_io->internal.caller_ctx = cb_arg; 2387 bdev_io->internal.cb = cb; 2388 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 2389 bdev_io->internal.in_submit_request = false; 2390 bdev_io->internal.buf = NULL; 2391 bdev_io->internal.io_submit_ch = NULL; 2392 bdev_io->internal.orig_iovs = NULL; 2393 bdev_io->internal.orig_iovcnt = 0; 2394 bdev_io->internal.orig_md_buf = NULL; 2395 bdev_io->internal.error.nvme.cdw0 = 0; 2396 bdev_io->num_retries = 0; 2397 bdev_io->internal.get_buf_cb = NULL; 2398 bdev_io->internal.get_aux_buf_cb = NULL; 2399 } 2400 2401 static bool 2402 
bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2403 { 2404 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 2405 } 2406 2407 bool 2408 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2409 { 2410 bool supported; 2411 2412 supported = bdev_io_type_supported(bdev, io_type); 2413 2414 if (!supported) { 2415 switch (io_type) { 2416 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2417 /* The bdev layer will emulate write zeroes as long as write is supported. */ 2418 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 2419 break; 2420 case SPDK_BDEV_IO_TYPE_ZCOPY: 2421 /* Zero copy can be emulated with regular read and write */ 2422 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ) && 2423 bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 2424 break; 2425 default: 2426 break; 2427 } 2428 } 2429 2430 return supported; 2431 } 2432 2433 int 2434 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2435 { 2436 if (bdev->fn_table->dump_info_json) { 2437 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 2438 } 2439 2440 return 0; 2441 } 2442 2443 static void 2444 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 2445 { 2446 uint32_t max_per_timeslice = 0; 2447 int i; 2448 2449 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2450 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2451 qos->rate_limits[i].max_per_timeslice = 0; 2452 continue; 2453 } 2454 2455 max_per_timeslice = qos->rate_limits[i].limit * 2456 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 2457 2458 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 2459 qos->rate_limits[i].min_per_timeslice); 2460 2461 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 2462 } 2463 2464 bdev_qos_set_ops(qos); 2465 } 2466 2467 static int 2468 bdev_channel_poll_qos(void *arg) 2469 { 2470 struct spdk_bdev_qos *qos = arg; 2471 uint64_t now = spdk_get_ticks(); 2472 int i; 2473 2474 if (now < (qos->last_timeslice + qos->timeslice_size)) { 2475 /* We received our callback earlier than expected - return 2476 * immediately and wait to do accounting until at least one 2477 * timeslice has actually expired. This should never happen 2478 * with a well-behaved timer implementation. 2479 */ 2480 return SPDK_POLLER_IDLE; 2481 } 2482 2483 /* Reset for next round of rate limiting */ 2484 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2485 /* We may have allowed the IOs or bytes to slightly overrun in the last 2486 * timeslice. remaining_this_timeslice is signed, so if it's negative 2487 * here, we'll account for the overrun so that the next timeslice will 2488 * be appropriately reduced. 
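		 * As a rough illustration (numbers are not from this file): with a
		 * 10 MB/s bytes limit the 1 ms quota is about 10485 bytes, so a
		 * 64 KiB write admitted when only a few bytes remained leaves
		 * remaining_this_timeslice deeply negative here, and several
		 * following timeslices of quota go toward paying that debt back
		 * before new I/O are released.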
2489 */ 2490 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 2491 qos->rate_limits[i].remaining_this_timeslice = 0; 2492 } 2493 } 2494 2495 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 2496 qos->last_timeslice += qos->timeslice_size; 2497 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2498 qos->rate_limits[i].remaining_this_timeslice += 2499 qos->rate_limits[i].max_per_timeslice; 2500 } 2501 } 2502 2503 return bdev_qos_io_submit(qos->ch, qos); 2504 } 2505 2506 static void 2507 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 2508 { 2509 struct spdk_bdev_shared_resource *shared_resource; 2510 struct lba_range *range; 2511 2512 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 2513 range = TAILQ_FIRST(&ch->locked_ranges); 2514 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 2515 free(range); 2516 } 2517 2518 spdk_put_io_channel(ch->channel); 2519 2520 shared_resource = ch->shared_resource; 2521 2522 assert(TAILQ_EMPTY(&ch->io_locked)); 2523 assert(TAILQ_EMPTY(&ch->io_submitted)); 2524 assert(ch->io_outstanding == 0); 2525 assert(shared_resource->ref > 0); 2526 shared_resource->ref--; 2527 if (shared_resource->ref == 0) { 2528 assert(shared_resource->io_outstanding == 0); 2529 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 2530 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 2531 free(shared_resource); 2532 } 2533 } 2534 2535 /* Caller must hold bdev->internal.mutex. */ 2536 static void 2537 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 2538 { 2539 struct spdk_bdev_qos *qos = bdev->internal.qos; 2540 int i; 2541 2542 /* Rate limiting on this bdev enabled */ 2543 if (qos) { 2544 if (qos->ch == NULL) { 2545 struct spdk_io_channel *io_ch; 2546 2547 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 2548 bdev->name, spdk_get_thread()); 2549 2550 /* No qos channel has been selected, so set one up */ 2551 2552 /* Take another reference to ch */ 2553 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 2554 assert(io_ch != NULL); 2555 qos->ch = ch; 2556 2557 qos->thread = spdk_io_channel_get_thread(io_ch); 2558 2559 TAILQ_INIT(&qos->queued); 2560 2561 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2562 if (bdev_qos_is_iops_rate_limit(i) == true) { 2563 qos->rate_limits[i].min_per_timeslice = 2564 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 2565 } else { 2566 qos->rate_limits[i].min_per_timeslice = 2567 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 2568 } 2569 2570 if (qos->rate_limits[i].limit == 0) { 2571 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 2572 } 2573 } 2574 bdev_qos_update_max_quota_per_timeslice(qos); 2575 qos->timeslice_size = 2576 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 2577 qos->last_timeslice = spdk_get_ticks(); 2578 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 2579 qos, 2580 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 2581 } 2582 2583 ch->flags |= BDEV_CH_QOS_ENABLED; 2584 } 2585 } 2586 2587 struct poll_timeout_ctx { 2588 struct spdk_bdev_desc *desc; 2589 uint64_t timeout_in_sec; 2590 spdk_bdev_io_timeout_cb cb_fn; 2591 void *cb_arg; 2592 }; 2593 2594 static void 2595 bdev_desc_free(struct spdk_bdev_desc *desc) 2596 { 2597 pthread_mutex_destroy(&desc->mutex); 2598 free(desc->media_events_buffer); 2599 free(desc); 2600 } 2601 2602 static void 2603 bdev_channel_poll_timeout_io_done(struct spdk_io_channel_iter *i, int status) 2604 { 2605 struct poll_timeout_ctx *ctx = 
spdk_io_channel_iter_get_ctx(i); 2606 struct spdk_bdev_desc *desc = ctx->desc; 2607 2608 free(ctx); 2609 2610 pthread_mutex_lock(&desc->mutex); 2611 desc->refs--; 2612 if (desc->closed == true && desc->refs == 0) { 2613 pthread_mutex_unlock(&desc->mutex); 2614 bdev_desc_free(desc); 2615 return; 2616 } 2617 pthread_mutex_unlock(&desc->mutex); 2618 } 2619 2620 static void 2621 bdev_channel_poll_timeout_io(struct spdk_io_channel_iter *i) 2622 { 2623 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 2624 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 2625 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 2626 struct spdk_bdev_desc *desc = ctx->desc; 2627 struct spdk_bdev_io *bdev_io; 2628 uint64_t now; 2629 2630 pthread_mutex_lock(&desc->mutex); 2631 if (desc->closed == true) { 2632 pthread_mutex_unlock(&desc->mutex); 2633 spdk_for_each_channel_continue(i, -1); 2634 return; 2635 } 2636 pthread_mutex_unlock(&desc->mutex); 2637 2638 now = spdk_get_ticks(); 2639 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 2640 /* Exclude any I/O that are generated via splitting. */ 2641 if (bdev_io->internal.cb == bdev_io_split_done) { 2642 continue; 2643 } 2644 2645 /* Once we find an I/O that has not timed out, we can immediately 2646 * exit the loop. 2647 */ 2648 if (now < (bdev_io->internal.submit_tsc + 2649 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 2650 goto end; 2651 } 2652 2653 if (bdev_io->internal.desc == desc) { 2654 ctx->cb_fn(ctx->cb_arg, bdev_io); 2655 } 2656 } 2657 2658 end: 2659 spdk_for_each_channel_continue(i, 0); 2660 } 2661 2662 static int 2663 bdev_poll_timeout_io(void *arg) 2664 { 2665 struct spdk_bdev_desc *desc = arg; 2666 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 2667 struct poll_timeout_ctx *ctx; 2668 2669 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 2670 if (!ctx) { 2671 SPDK_ERRLOG("failed to allocate memory\n"); 2672 return SPDK_POLLER_BUSY; 2673 } 2674 ctx->desc = desc; 2675 ctx->cb_arg = desc->cb_arg; 2676 ctx->cb_fn = desc->cb_fn; 2677 ctx->timeout_in_sec = desc->timeout_in_sec; 2678 2679 /* Take a ref on the descriptor in case it gets closed while we are checking 2680 * all of the channels. 
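	 * The matching decrement is done in bdev_channel_poll_timeout_io_done();
	 * if the descriptor was closed while the iteration was in flight and this
	 * was the last reference, that completion callback frees it via
	 * bdev_desc_free().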
2681 */ 2682 pthread_mutex_lock(&desc->mutex); 2683 desc->refs++; 2684 pthread_mutex_unlock(&desc->mutex); 2685 2686 spdk_for_each_channel(__bdev_to_io_dev(bdev), 2687 bdev_channel_poll_timeout_io, 2688 ctx, 2689 bdev_channel_poll_timeout_io_done); 2690 2691 return SPDK_POLLER_BUSY; 2692 } 2693 2694 int 2695 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 2696 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 2697 { 2698 assert(desc->thread == spdk_get_thread()); 2699 2700 spdk_poller_unregister(&desc->io_timeout_poller); 2701 2702 if (timeout_in_sec) { 2703 assert(cb_fn != NULL); 2704 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 2705 desc, 2706 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 2707 1000); 2708 if (desc->io_timeout_poller == NULL) { 2709 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 2710 return -1; 2711 } 2712 } 2713 2714 desc->cb_fn = cb_fn; 2715 desc->cb_arg = cb_arg; 2716 desc->timeout_in_sec = timeout_in_sec; 2717 2718 return 0; 2719 } 2720 2721 static int 2722 bdev_channel_create(void *io_device, void *ctx_buf) 2723 { 2724 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 2725 struct spdk_bdev_channel *ch = ctx_buf; 2726 struct spdk_io_channel *mgmt_io_ch; 2727 struct spdk_bdev_mgmt_channel *mgmt_ch; 2728 struct spdk_bdev_shared_resource *shared_resource; 2729 struct lba_range *range; 2730 2731 ch->bdev = bdev; 2732 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 2733 if (!ch->channel) { 2734 return -1; 2735 } 2736 2737 assert(ch->histogram == NULL); 2738 if (bdev->internal.histogram_enabled) { 2739 ch->histogram = spdk_histogram_data_alloc(); 2740 if (ch->histogram == NULL) { 2741 SPDK_ERRLOG("Could not allocate histogram\n"); 2742 } 2743 } 2744 2745 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 2746 if (!mgmt_io_ch) { 2747 spdk_put_io_channel(ch->channel); 2748 return -1; 2749 } 2750 2751 mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); 2752 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 2753 if (shared_resource->shared_ch == ch->channel) { 2754 spdk_put_io_channel(mgmt_io_ch); 2755 shared_resource->ref++; 2756 break; 2757 } 2758 } 2759 2760 if (shared_resource == NULL) { 2761 shared_resource = calloc(1, sizeof(*shared_resource)); 2762 if (shared_resource == NULL) { 2763 spdk_put_io_channel(ch->channel); 2764 spdk_put_io_channel(mgmt_io_ch); 2765 return -1; 2766 } 2767 2768 shared_resource->mgmt_ch = mgmt_ch; 2769 shared_resource->io_outstanding = 0; 2770 TAILQ_INIT(&shared_resource->nomem_io); 2771 shared_resource->nomem_threshold = 0; 2772 shared_resource->shared_ch = ch->channel; 2773 shared_resource->ref = 1; 2774 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 2775 } 2776 2777 memset(&ch->stat, 0, sizeof(ch->stat)); 2778 ch->stat.ticks_rate = spdk_get_ticks_hz(); 2779 ch->io_outstanding = 0; 2780 TAILQ_INIT(&ch->queued_resets); 2781 TAILQ_INIT(&ch->locked_ranges); 2782 ch->flags = 0; 2783 ch->shared_resource = shared_resource; 2784 2785 TAILQ_INIT(&ch->io_submitted); 2786 TAILQ_INIT(&ch->io_locked); 2787 2788 #ifdef SPDK_CONFIG_VTUNE 2789 { 2790 char *name; 2791 __itt_init_ittlib(NULL, 0); 2792 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 2793 if (!name) { 2794 bdev_channel_destroy_resource(ch); 2795 return -1; 2796 } 2797 ch->handle = __itt_string_handle_create(name); 2798 free(name); 2799 ch->start_tsc = spdk_get_ticks(); 2800 ch->interval_tsc = spdk_get_ticks_hz() / 100; 2801 memset(&ch->prev_stat, 0, 
sizeof(ch->prev_stat)); 2802 } 2803 #endif 2804 2805 pthread_mutex_lock(&bdev->internal.mutex); 2806 bdev_enable_qos(bdev, ch); 2807 2808 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 2809 struct lba_range *new_range; 2810 2811 new_range = calloc(1, sizeof(*new_range)); 2812 if (new_range == NULL) { 2813 pthread_mutex_unlock(&bdev->internal.mutex); 2814 bdev_channel_destroy_resource(ch); 2815 return -1; 2816 } 2817 new_range->length = range->length; 2818 new_range->offset = range->offset; 2819 new_range->locked_ctx = range->locked_ctx; 2820 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 2821 } 2822 2823 pthread_mutex_unlock(&bdev->internal.mutex); 2824 2825 return 0; 2826 } 2827 2828 /* 2829 * Abort I/O that are waiting on a data buffer. These types of I/O are 2830 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. 2831 */ 2832 static void 2833 bdev_abort_all_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 2834 { 2835 bdev_io_stailq_t tmp; 2836 struct spdk_bdev_io *bdev_io; 2837 2838 STAILQ_INIT(&tmp); 2839 2840 while (!STAILQ_EMPTY(queue)) { 2841 bdev_io = STAILQ_FIRST(queue); 2842 STAILQ_REMOVE_HEAD(queue, internal.buf_link); 2843 if (bdev_io->internal.ch == ch) { 2844 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2845 } else { 2846 STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); 2847 } 2848 } 2849 2850 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 2851 } 2852 2853 /* 2854 * Abort I/O that are queued waiting for submission. These types of I/O are 2855 * linked using the spdk_bdev_io link TAILQ_ENTRY. 2856 */ 2857 static void 2858 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 2859 { 2860 struct spdk_bdev_io *bdev_io, *tmp; 2861 2862 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 2863 if (bdev_io->internal.ch == ch) { 2864 TAILQ_REMOVE(queue, bdev_io, internal.link); 2865 /* 2866 * spdk_bdev_io_complete() assumes that the completed I/O had 2867 * been submitted to the bdev module. Since in this case it 2868 * hadn't, bump io_outstanding to account for the decrement 2869 * that spdk_bdev_io_complete() will do. 
2870 */ 2871 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 2872 ch->io_outstanding++; 2873 ch->shared_resource->io_outstanding++; 2874 } 2875 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2876 } 2877 } 2878 } 2879 2880 static bool 2881 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 2882 { 2883 struct spdk_bdev_io *bdev_io; 2884 2885 TAILQ_FOREACH(bdev_io, queue, internal.link) { 2886 if (bdev_io == bio_to_abort) { 2887 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 2888 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 2889 return true; 2890 } 2891 } 2892 2893 return false; 2894 } 2895 2896 static bool 2897 bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort) 2898 { 2899 struct spdk_bdev_io *bdev_io; 2900 2901 STAILQ_FOREACH(bdev_io, queue, internal.buf_link) { 2902 if (bdev_io == bio_to_abort) { 2903 STAILQ_REMOVE(queue, bio_to_abort, spdk_bdev_io, internal.buf_link); 2904 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 2905 return true; 2906 } 2907 } 2908 2909 return false; 2910 } 2911 2912 static void 2913 bdev_qos_channel_destroy(void *cb_arg) 2914 { 2915 struct spdk_bdev_qos *qos = cb_arg; 2916 2917 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 2918 spdk_poller_unregister(&qos->poller); 2919 2920 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 2921 2922 free(qos); 2923 } 2924 2925 static int 2926 bdev_qos_destroy(struct spdk_bdev *bdev) 2927 { 2928 int i; 2929 2930 /* 2931 * Cleanly shutting down the QoS poller is tricky, because 2932 * during the asynchronous operation the user could open 2933 * a new descriptor and create a new channel, spawning 2934 * a new QoS poller. 2935 * 2936 * The strategy is to create a new QoS structure here and swap it 2937 * in. The shutdown path then continues to refer to the old one 2938 * until it completes and then releases it. 2939 */ 2940 struct spdk_bdev_qos *new_qos, *old_qos; 2941 2942 old_qos = bdev->internal.qos; 2943 2944 new_qos = calloc(1, sizeof(*new_qos)); 2945 if (!new_qos) { 2946 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 2947 return -ENOMEM; 2948 } 2949 2950 /* Copy the old QoS data into the newly allocated structure */ 2951 memcpy(new_qos, old_qos, sizeof(*new_qos)); 2952 2953 /* Zero out the key parts of the QoS structure */ 2954 new_qos->ch = NULL; 2955 new_qos->thread = NULL; 2956 new_qos->poller = NULL; 2957 TAILQ_INIT(&new_qos->queued); 2958 /* 2959 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 2960 * It will be used later for the new QoS structure. 2961 */ 2962 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2963 new_qos->rate_limits[i].remaining_this_timeslice = 0; 2964 new_qos->rate_limits[i].min_per_timeslice = 0; 2965 new_qos->rate_limits[i].max_per_timeslice = 0; 2966 } 2967 2968 bdev->internal.qos = new_qos; 2969 2970 if (old_qos->thread == NULL) { 2971 free(old_qos); 2972 } else { 2973 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 2974 } 2975 2976 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 2977 * been destroyed yet. The destruction path will end up waiting for the final 2978 * channel to be put before it releases resources. 
*/ 2979 2980 return 0; 2981 } 2982 2983 static void 2984 bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 2985 { 2986 total->bytes_read += add->bytes_read; 2987 total->num_read_ops += add->num_read_ops; 2988 total->bytes_written += add->bytes_written; 2989 total->num_write_ops += add->num_write_ops; 2990 total->bytes_unmapped += add->bytes_unmapped; 2991 total->num_unmap_ops += add->num_unmap_ops; 2992 total->read_latency_ticks += add->read_latency_ticks; 2993 total->write_latency_ticks += add->write_latency_ticks; 2994 total->unmap_latency_ticks += add->unmap_latency_ticks; 2995 } 2996 2997 static void 2998 bdev_channel_destroy(void *io_device, void *ctx_buf) 2999 { 3000 struct spdk_bdev_channel *ch = ctx_buf; 3001 struct spdk_bdev_mgmt_channel *mgmt_ch; 3002 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 3003 3004 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 3005 spdk_get_thread()); 3006 3007 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 3008 pthread_mutex_lock(&ch->bdev->internal.mutex); 3009 bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); 3010 pthread_mutex_unlock(&ch->bdev->internal.mutex); 3011 3012 mgmt_ch = shared_resource->mgmt_ch; 3013 3014 bdev_abort_all_queued_io(&ch->queued_resets, ch); 3015 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 3016 bdev_abort_all_buf_io(&mgmt_ch->need_buf_small, ch); 3017 bdev_abort_all_buf_io(&mgmt_ch->need_buf_large, ch); 3018 3019 if (ch->histogram) { 3020 spdk_histogram_data_free(ch->histogram); 3021 } 3022 3023 bdev_channel_destroy_resource(ch); 3024 } 3025 3026 int 3027 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 3028 { 3029 struct spdk_bdev_alias *tmp; 3030 3031 if (alias == NULL) { 3032 SPDK_ERRLOG("Empty alias passed\n"); 3033 return -EINVAL; 3034 } 3035 3036 if (spdk_bdev_get_by_name(alias)) { 3037 SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias); 3038 return -EEXIST; 3039 } 3040 3041 tmp = calloc(1, sizeof(*tmp)); 3042 if (tmp == NULL) { 3043 SPDK_ERRLOG("Unable to allocate alias\n"); 3044 return -ENOMEM; 3045 } 3046 3047 tmp->alias = strdup(alias); 3048 if (tmp->alias == NULL) { 3049 free(tmp); 3050 SPDK_ERRLOG("Unable to allocate alias\n"); 3051 return -ENOMEM; 3052 } 3053 3054 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 3055 3056 return 0; 3057 } 3058 3059 int 3060 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 3061 { 3062 struct spdk_bdev_alias *tmp; 3063 3064 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 3065 if (strcmp(alias, tmp->alias) == 0) { 3066 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 3067 free(tmp->alias); 3068 free(tmp); 3069 return 0; 3070 } 3071 } 3072 3073 SPDK_INFOLOG(bdev, "Alias %s does not exists\n", alias); 3074 3075 return -ENOENT; 3076 } 3077 3078 void 3079 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 3080 { 3081 struct spdk_bdev_alias *p, *tmp; 3082 3083 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 3084 TAILQ_REMOVE(&bdev->aliases, p, tailq); 3085 free(p->alias); 3086 free(p); 3087 } 3088 } 3089 3090 struct spdk_io_channel * 3091 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 3092 { 3093 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 3094 } 3095 3096 void * 3097 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 3098 { 3099 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3100 void *ctx = NULL; 3101 3102 if (bdev->fn_table->get_module_ctx) 
{ 3103 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 3104 } 3105 3106 return ctx; 3107 } 3108 3109 const char * 3110 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 3111 { 3112 return bdev->module->name; 3113 } 3114 3115 const char * 3116 spdk_bdev_get_name(const struct spdk_bdev *bdev) 3117 { 3118 return bdev->name; 3119 } 3120 3121 const char * 3122 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 3123 { 3124 return bdev->product_name; 3125 } 3126 3127 const struct spdk_bdev_aliases_list * 3128 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 3129 { 3130 return &bdev->aliases; 3131 } 3132 3133 uint32_t 3134 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 3135 { 3136 return bdev->blocklen; 3137 } 3138 3139 uint32_t 3140 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 3141 { 3142 return bdev->write_unit_size; 3143 } 3144 3145 uint64_t 3146 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 3147 { 3148 return bdev->blockcnt; 3149 } 3150 3151 const char * 3152 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 3153 { 3154 return qos_rpc_type[type]; 3155 } 3156 3157 void 3158 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 3159 { 3160 int i; 3161 3162 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 3163 3164 pthread_mutex_lock(&bdev->internal.mutex); 3165 if (bdev->internal.qos) { 3166 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3167 if (bdev->internal.qos->rate_limits[i].limit != 3168 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3169 limits[i] = bdev->internal.qos->rate_limits[i].limit; 3170 if (bdev_qos_is_iops_rate_limit(i) == false) { 3171 /* Change from Byte to Megabyte which is user visible. */ 3172 limits[i] = limits[i] / 1024 / 1024; 3173 } 3174 } 3175 } 3176 } 3177 pthread_mutex_unlock(&bdev->internal.mutex); 3178 } 3179 3180 size_t 3181 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 3182 { 3183 return 1 << bdev->required_alignment; 3184 } 3185 3186 uint32_t 3187 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 3188 { 3189 return bdev->optimal_io_boundary; 3190 } 3191 3192 bool 3193 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 3194 { 3195 return bdev->write_cache; 3196 } 3197 3198 const struct spdk_uuid * 3199 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 3200 { 3201 return &bdev->uuid; 3202 } 3203 3204 uint16_t 3205 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 3206 { 3207 return bdev->acwu; 3208 } 3209 3210 uint32_t 3211 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 3212 { 3213 return bdev->md_len; 3214 } 3215 3216 bool 3217 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 3218 { 3219 return (bdev->md_len != 0) && bdev->md_interleave; 3220 } 3221 3222 bool 3223 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 3224 { 3225 return (bdev->md_len != 0) && !bdev->md_interleave; 3226 } 3227 3228 bool 3229 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 3230 { 3231 return bdev->zoned; 3232 } 3233 3234 uint32_t 3235 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 3236 { 3237 if (spdk_bdev_is_md_interleaved(bdev)) { 3238 return bdev->blocklen - bdev->md_len; 3239 } else { 3240 return bdev->blocklen; 3241 } 3242 } 3243 3244 static uint32_t 3245 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 3246 { 3247 if (!spdk_bdev_is_md_interleaved(bdev)) { 3248 return bdev->blocklen + bdev->md_len; 3249 } else { 3250 return bdev->blocklen; 3251 } 3252 } 3253 3254 enum spdk_dif_type 
spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 3255 { 3256 if (bdev->md_len != 0) { 3257 return bdev->dif_type; 3258 } else { 3259 return SPDK_DIF_DISABLE; 3260 } 3261 } 3262 3263 bool 3264 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 3265 { 3266 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 3267 return bdev->dif_is_head_of_md; 3268 } else { 3269 return false; 3270 } 3271 } 3272 3273 bool 3274 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 3275 enum spdk_dif_check_type check_type) 3276 { 3277 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 3278 return false; 3279 } 3280 3281 switch (check_type) { 3282 case SPDK_DIF_CHECK_TYPE_REFTAG: 3283 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 3284 case SPDK_DIF_CHECK_TYPE_APPTAG: 3285 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 3286 case SPDK_DIF_CHECK_TYPE_GUARD: 3287 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 3288 default: 3289 return false; 3290 } 3291 } 3292 3293 uint64_t 3294 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 3295 { 3296 return bdev->internal.measured_queue_depth; 3297 } 3298 3299 uint64_t 3300 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 3301 { 3302 return bdev->internal.period; 3303 } 3304 3305 uint64_t 3306 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 3307 { 3308 return bdev->internal.weighted_io_time; 3309 } 3310 3311 uint64_t 3312 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 3313 { 3314 return bdev->internal.io_time; 3315 } 3316 3317 static void 3318 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) 3319 { 3320 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3321 3322 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 3323 3324 if (bdev->internal.measured_queue_depth) { 3325 bdev->internal.io_time += bdev->internal.period; 3326 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 3327 } 3328 } 3329 3330 static void 3331 _calculate_measured_qd(struct spdk_io_channel_iter *i) 3332 { 3333 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3334 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 3335 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); 3336 3337 bdev->internal.temporary_queue_depth += ch->io_outstanding; 3338 spdk_for_each_channel_continue(i, 0); 3339 } 3340 3341 static int 3342 bdev_calculate_measured_queue_depth(void *ctx) 3343 { 3344 struct spdk_bdev *bdev = ctx; 3345 bdev->internal.temporary_queue_depth = 0; 3346 spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, 3347 _calculate_measured_qd_cpl); 3348 return SPDK_POLLER_BUSY; 3349 } 3350 3351 void 3352 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 3353 { 3354 bdev->internal.period = period; 3355 3356 if (bdev->internal.qd_poller != NULL) { 3357 spdk_poller_unregister(&bdev->internal.qd_poller); 3358 bdev->internal.measured_queue_depth = UINT64_MAX; 3359 } 3360 3361 if (period != 0) { 3362 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, bdev, 3363 period); 3364 } 3365 } 3366 3367 static void 3368 _resize_notify(void *arg) 3369 { 3370 struct spdk_bdev_desc *desc = arg; 3371 3372 pthread_mutex_lock(&desc->mutex); 3373 desc->refs--; 3374 if (!desc->closed) { 3375 pthread_mutex_unlock(&desc->mutex); 3376 desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, 3377 desc->bdev, 3378 
desc->callback.ctx); 3379 return; 3380 } else if (0 == desc->refs) { 3381 /* This descriptor was closed after this resize_notify message was sent. 3382 * spdk_bdev_close() could not free the descriptor since this message was 3383 * in flight, so we free it now using bdev_desc_free(). 3384 */ 3385 pthread_mutex_unlock(&desc->mutex); 3386 bdev_desc_free(desc); 3387 return; 3388 } 3389 pthread_mutex_unlock(&desc->mutex); 3390 } 3391 3392 int 3393 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 3394 { 3395 struct spdk_bdev_desc *desc; 3396 int ret; 3397 3398 pthread_mutex_lock(&bdev->internal.mutex); 3399 3400 /* bdev has open descriptors */ 3401 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 3402 bdev->blockcnt > size) { 3403 ret = -EBUSY; 3404 } else { 3405 bdev->blockcnt = size; 3406 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 3407 pthread_mutex_lock(&desc->mutex); 3408 if (desc->callback.open_with_ext && !desc->closed) { 3409 desc->refs++; 3410 spdk_thread_send_msg(desc->thread, _resize_notify, desc); 3411 } 3412 pthread_mutex_unlock(&desc->mutex); 3413 } 3414 ret = 0; 3415 } 3416 3417 pthread_mutex_unlock(&bdev->internal.mutex); 3418 3419 return ret; 3420 } 3421 3422 /* 3423 * Convert I/O offset and length from bytes to blocks. 3424 * 3425 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 3426 */ 3427 static uint64_t 3428 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 3429 uint64_t num_bytes, uint64_t *num_blocks) 3430 { 3431 uint32_t block_size = bdev->blocklen; 3432 uint8_t shift_cnt; 3433 3434 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 3435 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 3436 shift_cnt = spdk_u32log2(block_size); 3437 *offset_blocks = offset_bytes >> shift_cnt; 3438 *num_blocks = num_bytes >> shift_cnt; 3439 return (offset_bytes - (*offset_blocks << shift_cnt)) | 3440 (num_bytes - (*num_blocks << shift_cnt)); 3441 } else { 3442 *offset_blocks = offset_bytes / block_size; 3443 *num_blocks = num_bytes / block_size; 3444 return (offset_bytes % block_size) | (num_bytes % block_size); 3445 } 3446 } 3447 3448 static bool 3449 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 3450 { 3451 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 3452 * has been an overflow and hence the offset has been wrapped around */ 3453 if (offset_blocks + num_blocks < offset_blocks) { 3454 return false; 3455 } 3456 3457 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 3458 if (offset_blocks + num_blocks > bdev->blockcnt) { 3459 return false; 3460 } 3461 3462 return true; 3463 } 3464 3465 static bool 3466 _bdev_io_check_md_buf(const struct iovec *iovs, const void *md_buf) 3467 { 3468 return _is_buf_allocated(iovs) == (md_buf != NULL); 3469 } 3470 3471 static int 3472 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 3473 void *md_buf, int64_t offset_blocks, uint64_t num_blocks, 3474 spdk_bdev_io_completion_cb cb, void *cb_arg) 3475 { 3476 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3477 struct spdk_bdev_io *bdev_io; 3478 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3479 3480 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3481 return -EINVAL; 3482 } 3483 3484 bdev_io = bdev_channel_get_io(channel); 3485 if (!bdev_io) { 
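		/* Both the per-thread bdev_io cache and the shared pool are exhausted.
		 * -ENOMEM is transient: callers normally register a wait entry and
		 * retry once an outstanding I/O completes.  Illustrative caller-side
		 * sketch (retry_read and ctx are hypothetical names, not part of this
		 * file):
		 *
		 *   struct spdk_bdev_io_wait_entry entry;
		 *
		 *   entry.bdev = bdev;
		 *   entry.cb_fn = retry_read;
		 *   entry.cb_arg = ctx;
		 *   spdk_bdev_queue_io_wait(bdev, ch, &entry);
		 */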
3486 return -ENOMEM; 3487 } 3488 3489 bdev_io->internal.ch = channel; 3490 bdev_io->internal.desc = desc; 3491 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 3492 bdev_io->u.bdev.iovs = &bdev_io->iov; 3493 bdev_io->u.bdev.iovs[0].iov_base = buf; 3494 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 3495 bdev_io->u.bdev.iovcnt = 1; 3496 bdev_io->u.bdev.md_buf = md_buf; 3497 bdev_io->u.bdev.num_blocks = num_blocks; 3498 bdev_io->u.bdev.offset_blocks = offset_blocks; 3499 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3500 3501 bdev_io_submit(bdev_io); 3502 return 0; 3503 } 3504 3505 int 3506 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3507 void *buf, uint64_t offset, uint64_t nbytes, 3508 spdk_bdev_io_completion_cb cb, void *cb_arg) 3509 { 3510 uint64_t offset_blocks, num_blocks; 3511 3512 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3513 nbytes, &num_blocks) != 0) { 3514 return -EINVAL; 3515 } 3516 3517 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 3518 } 3519 3520 int 3521 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3522 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 3523 spdk_bdev_io_completion_cb cb, void *cb_arg) 3524 { 3525 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 3526 } 3527 3528 int 3529 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3530 void *buf, void *md_buf, int64_t offset_blocks, uint64_t num_blocks, 3531 spdk_bdev_io_completion_cb cb, void *cb_arg) 3532 { 3533 struct iovec iov = { 3534 .iov_base = buf, 3535 }; 3536 3537 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3538 return -EINVAL; 3539 } 3540 3541 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 3542 return -EINVAL; 3543 } 3544 3545 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 3546 cb, cb_arg); 3547 } 3548 3549 int 3550 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3551 struct iovec *iov, int iovcnt, 3552 uint64_t offset, uint64_t nbytes, 3553 spdk_bdev_io_completion_cb cb, void *cb_arg) 3554 { 3555 uint64_t offset_blocks, num_blocks; 3556 3557 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3558 nbytes, &num_blocks) != 0) { 3559 return -EINVAL; 3560 } 3561 3562 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 3563 } 3564 3565 static int 3566 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3567 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 3568 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg) 3569 { 3570 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3571 struct spdk_bdev_io *bdev_io; 3572 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3573 3574 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3575 return -EINVAL; 3576 } 3577 3578 bdev_io = bdev_channel_get_io(channel); 3579 if (!bdev_io) { 3580 return -ENOMEM; 3581 } 3582 3583 bdev_io->internal.ch = channel; 3584 bdev_io->internal.desc = desc; 3585 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 3586 bdev_io->u.bdev.iovs = iov; 3587 bdev_io->u.bdev.iovcnt = iovcnt; 3588 bdev_io->u.bdev.md_buf = md_buf; 3589 bdev_io->u.bdev.num_blocks = num_blocks; 3590 bdev_io->u.bdev.offset_blocks = offset_blocks; 3591 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3592 3593 bdev_io_submit(bdev_io); 3594 
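	/* A return value of 0 only means the request was accepted; the result is
	 * reported asynchronously through cb(bdev_io, success, cb_arg) on the
	 * submitting thread, and the callback is expected to release the bdev_io
	 * with spdk_bdev_free_io().
	 */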
return 0; 3595 } 3596 3597 int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3598 struct iovec *iov, int iovcnt, 3599 uint64_t offset_blocks, uint64_t num_blocks, 3600 spdk_bdev_io_completion_cb cb, void *cb_arg) 3601 { 3602 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 3603 num_blocks, cb, cb_arg); 3604 } 3605 3606 int 3607 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3608 struct iovec *iov, int iovcnt, void *md_buf, 3609 uint64_t offset_blocks, uint64_t num_blocks, 3610 spdk_bdev_io_completion_cb cb, void *cb_arg) 3611 { 3612 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3613 return -EINVAL; 3614 } 3615 3616 if (!_bdev_io_check_md_buf(iov, md_buf)) { 3617 return -EINVAL; 3618 } 3619 3620 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 3621 num_blocks, cb, cb_arg); 3622 } 3623 3624 static int 3625 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3626 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3627 spdk_bdev_io_completion_cb cb, void *cb_arg) 3628 { 3629 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3630 struct spdk_bdev_io *bdev_io; 3631 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3632 3633 if (!desc->write) { 3634 return -EBADF; 3635 } 3636 3637 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3638 return -EINVAL; 3639 } 3640 3641 bdev_io = bdev_channel_get_io(channel); 3642 if (!bdev_io) { 3643 return -ENOMEM; 3644 } 3645 3646 bdev_io->internal.ch = channel; 3647 bdev_io->internal.desc = desc; 3648 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 3649 bdev_io->u.bdev.iovs = &bdev_io->iov; 3650 bdev_io->u.bdev.iovs[0].iov_base = buf; 3651 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 3652 bdev_io->u.bdev.iovcnt = 1; 3653 bdev_io->u.bdev.md_buf = md_buf; 3654 bdev_io->u.bdev.num_blocks = num_blocks; 3655 bdev_io->u.bdev.offset_blocks = offset_blocks; 3656 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3657 3658 bdev_io_submit(bdev_io); 3659 return 0; 3660 } 3661 3662 int 3663 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3664 void *buf, uint64_t offset, uint64_t nbytes, 3665 spdk_bdev_io_completion_cb cb, void *cb_arg) 3666 { 3667 uint64_t offset_blocks, num_blocks; 3668 3669 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3670 nbytes, &num_blocks) != 0) { 3671 return -EINVAL; 3672 } 3673 3674 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 3675 } 3676 3677 int 3678 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3679 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 3680 spdk_bdev_io_completion_cb cb, void *cb_arg) 3681 { 3682 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 3683 cb, cb_arg); 3684 } 3685 3686 int 3687 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3688 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3689 spdk_bdev_io_completion_cb cb, void *cb_arg) 3690 { 3691 struct iovec iov = { 3692 .iov_base = buf, 3693 }; 3694 3695 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3696 return -EINVAL; 3697 } 3698 3699 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 3700 return -EINVAL; 3701 } 3702 3703 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 3704 
cb, cb_arg); 3705 } 3706 3707 static int 3708 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3709 struct iovec *iov, int iovcnt, void *md_buf, 3710 uint64_t offset_blocks, uint64_t num_blocks, 3711 spdk_bdev_io_completion_cb cb, void *cb_arg) 3712 { 3713 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3714 struct spdk_bdev_io *bdev_io; 3715 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3716 3717 if (!desc->write) { 3718 return -EBADF; 3719 } 3720 3721 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3722 return -EINVAL; 3723 } 3724 3725 bdev_io = bdev_channel_get_io(channel); 3726 if (!bdev_io) { 3727 return -ENOMEM; 3728 } 3729 3730 bdev_io->internal.ch = channel; 3731 bdev_io->internal.desc = desc; 3732 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 3733 bdev_io->u.bdev.iovs = iov; 3734 bdev_io->u.bdev.iovcnt = iovcnt; 3735 bdev_io->u.bdev.md_buf = md_buf; 3736 bdev_io->u.bdev.num_blocks = num_blocks; 3737 bdev_io->u.bdev.offset_blocks = offset_blocks; 3738 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3739 3740 bdev_io_submit(bdev_io); 3741 return 0; 3742 } 3743 3744 int 3745 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3746 struct iovec *iov, int iovcnt, 3747 uint64_t offset, uint64_t len, 3748 spdk_bdev_io_completion_cb cb, void *cb_arg) 3749 { 3750 uint64_t offset_blocks, num_blocks; 3751 3752 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3753 len, &num_blocks) != 0) { 3754 return -EINVAL; 3755 } 3756 3757 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 3758 } 3759 3760 int 3761 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3762 struct iovec *iov, int iovcnt, 3763 uint64_t offset_blocks, uint64_t num_blocks, 3764 spdk_bdev_io_completion_cb cb, void *cb_arg) 3765 { 3766 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 3767 num_blocks, cb, cb_arg); 3768 } 3769 3770 int 3771 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3772 struct iovec *iov, int iovcnt, void *md_buf, 3773 uint64_t offset_blocks, uint64_t num_blocks, 3774 spdk_bdev_io_completion_cb cb, void *cb_arg) 3775 { 3776 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3777 return -EINVAL; 3778 } 3779 3780 if (!_bdev_io_check_md_buf(iov, md_buf)) { 3781 return -EINVAL; 3782 } 3783 3784 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 3785 num_blocks, cb, cb_arg); 3786 } 3787 3788 static void 3789 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3790 { 3791 struct spdk_bdev_io *parent_io = cb_arg; 3792 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 3793 int i, rc = 0; 3794 3795 if (!success) { 3796 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3797 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 3798 spdk_bdev_free_io(bdev_io); 3799 return; 3800 } 3801 3802 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 3803 rc = memcmp(read_buf, 3804 parent_io->u.bdev.iovs[i].iov_base, 3805 parent_io->u.bdev.iovs[i].iov_len); 3806 if (rc) { 3807 break; 3808 } 3809 read_buf += parent_io->u.bdev.iovs[i].iov_len; 3810 } 3811 3812 spdk_bdev_free_io(bdev_io); 3813 3814 if (rc == 0) { 3815 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3816 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 3817 } else { 3818 
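		/* The read-back data differs from the caller's buffers: report a
		 * miscompare rather than a generic failure, mirroring what a native
		 * COMPARE command would return.
		 */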
parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 3819 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 3820 } 3821 } 3822 3823 static void 3824 bdev_compare_do_read(void *_bdev_io) 3825 { 3826 struct spdk_bdev_io *bdev_io = _bdev_io; 3827 int rc; 3828 3829 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 3830 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 3831 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3832 bdev_compare_do_read_done, bdev_io); 3833 3834 if (rc == -ENOMEM) { 3835 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 3836 } else if (rc != 0) { 3837 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3838 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3839 } 3840 } 3841 3842 static int 3843 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3844 struct iovec *iov, int iovcnt, void *md_buf, 3845 uint64_t offset_blocks, uint64_t num_blocks, 3846 spdk_bdev_io_completion_cb cb, void *cb_arg) 3847 { 3848 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3849 struct spdk_bdev_io *bdev_io; 3850 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3851 3852 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3853 return -EINVAL; 3854 } 3855 3856 bdev_io = bdev_channel_get_io(channel); 3857 if (!bdev_io) { 3858 return -ENOMEM; 3859 } 3860 3861 bdev_io->internal.ch = channel; 3862 bdev_io->internal.desc = desc; 3863 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 3864 bdev_io->u.bdev.iovs = iov; 3865 bdev_io->u.bdev.iovcnt = iovcnt; 3866 bdev_io->u.bdev.md_buf = md_buf; 3867 bdev_io->u.bdev.num_blocks = num_blocks; 3868 bdev_io->u.bdev.offset_blocks = offset_blocks; 3869 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3870 3871 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 3872 bdev_io_submit(bdev_io); 3873 return 0; 3874 } 3875 3876 bdev_compare_do_read(bdev_io); 3877 3878 return 0; 3879 } 3880 3881 int 3882 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3883 struct iovec *iov, int iovcnt, 3884 uint64_t offset_blocks, uint64_t num_blocks, 3885 spdk_bdev_io_completion_cb cb, void *cb_arg) 3886 { 3887 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 3888 num_blocks, cb, cb_arg); 3889 } 3890 3891 int 3892 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3893 struct iovec *iov, int iovcnt, void *md_buf, 3894 uint64_t offset_blocks, uint64_t num_blocks, 3895 spdk_bdev_io_completion_cb cb, void *cb_arg) 3896 { 3897 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3898 return -EINVAL; 3899 } 3900 3901 if (!_bdev_io_check_md_buf(iov, md_buf)) { 3902 return -EINVAL; 3903 } 3904 3905 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 3906 num_blocks, cb, cb_arg); 3907 } 3908 3909 static int 3910 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3911 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3912 spdk_bdev_io_completion_cb cb, void *cb_arg) 3913 { 3914 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3915 struct spdk_bdev_io *bdev_io; 3916 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3917 3918 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3919 return -EINVAL; 3920 } 3921 3922 bdev_io = bdev_channel_get_io(channel); 3923 if (!bdev_io) { 3924 return -ENOMEM; 3925 } 3926 3927 
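	/* The caller's single contiguous buffer is described by the iovec that is
	 * embedded in the bdev_io itself, so no separate iovec allocation is needed.
	 */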
bdev_io->internal.ch = channel; 3928 bdev_io->internal.desc = desc; 3929 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 3930 bdev_io->u.bdev.iovs = &bdev_io->iov; 3931 bdev_io->u.bdev.iovs[0].iov_base = buf; 3932 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 3933 bdev_io->u.bdev.iovcnt = 1; 3934 bdev_io->u.bdev.md_buf = md_buf; 3935 bdev_io->u.bdev.num_blocks = num_blocks; 3936 bdev_io->u.bdev.offset_blocks = offset_blocks; 3937 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3938 3939 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 3940 bdev_io_submit(bdev_io); 3941 return 0; 3942 } 3943 3944 bdev_compare_do_read(bdev_io); 3945 3946 return 0; 3947 } 3948 3949 int 3950 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3951 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 3952 spdk_bdev_io_completion_cb cb, void *cb_arg) 3953 { 3954 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 3955 cb, cb_arg); 3956 } 3957 3958 int 3959 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3960 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3961 spdk_bdev_io_completion_cb cb, void *cb_arg) 3962 { 3963 struct iovec iov = { 3964 .iov_base = buf, 3965 }; 3966 3967 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3968 return -EINVAL; 3969 } 3970 3971 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 3972 return -EINVAL; 3973 } 3974 3975 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 3976 cb, cb_arg); 3977 } 3978 3979 static void 3980 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 3981 { 3982 struct spdk_bdev_io *bdev_io = ctx; 3983 3984 if (unlock_status) { 3985 SPDK_ERRLOG("LBA range unlock failed\n"); 3986 } 3987 3988 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 3989 false, bdev_io->internal.caller_ctx); 3990 } 3991 3992 static void 3993 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 3994 { 3995 bdev_io->internal.status = status; 3996 3997 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 3998 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3999 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 4000 } 4001 4002 static void 4003 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4004 { 4005 struct spdk_bdev_io *parent_io = cb_arg; 4006 4007 if (!success) { 4008 SPDK_ERRLOG("Compare and write operation failed\n"); 4009 } 4010 4011 spdk_bdev_free_io(bdev_io); 4012 4013 bdev_comparev_and_writev_blocks_unlock(parent_io, 4014 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 4015 } 4016 4017 static void 4018 bdev_compare_and_write_do_write(void *_bdev_io) 4019 { 4020 struct spdk_bdev_io *bdev_io = _bdev_io; 4021 int rc; 4022 4023 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 4024 spdk_io_channel_from_ctx(bdev_io->internal.ch), 4025 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 4026 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4027 bdev_compare_and_write_do_write_done, bdev_io); 4028 4029 4030 if (rc == -ENOMEM) { 4031 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 4032 } else if (rc != 0) { 4033 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 4034 } 4035 } 4036 4037 static void 4038 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4039 { 4040 struct spdk_bdev_io *parent_io = cb_arg; 4041 4042 spdk_bdev_free_io(bdev_io); 4043 4044 if (!success) { 4045 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 4046 return; 4047 } 4048 4049 bdev_compare_and_write_do_write(parent_io); 4050 } 4051 4052 static void 4053 bdev_compare_and_write_do_compare(void *_bdev_io) 4054 { 4055 struct spdk_bdev_io *bdev_io = _bdev_io; 4056 int rc; 4057 4058 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 4059 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 4060 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4061 bdev_compare_and_write_do_compare_done, bdev_io); 4062 4063 if (rc == -ENOMEM) { 4064 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 4065 } else if (rc != 0) { 4066 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 4067 } 4068 } 4069 4070 static void 4071 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 4072 { 4073 struct spdk_bdev_io *bdev_io = ctx; 4074 4075 if (status) { 4076 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 4077 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4078 return; 4079 } 4080 4081 bdev_compare_and_write_do_compare(bdev_io); 4082 } 4083 4084 int 4085 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4086 struct iovec *compare_iov, int compare_iovcnt, 4087 struct iovec *write_iov, int write_iovcnt, 4088 uint64_t offset_blocks, uint64_t num_blocks, 4089 spdk_bdev_io_completion_cb cb, void *cb_arg) 4090 { 4091 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4092 struct spdk_bdev_io *bdev_io; 4093 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4094 4095 if (!desc->write) { 4096 return -EBADF; 4097 } 4098 4099 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4100 return -EINVAL; 4101 } 4102 4103 if (num_blocks > bdev->acwu) { 4104 return -EINVAL; 4105 } 4106 4107 bdev_io = bdev_channel_get_io(channel); 4108 if (!bdev_io) { 4109 return -ENOMEM; 4110 } 4111 4112 bdev_io->internal.ch = channel; 4113 bdev_io->internal.desc = desc; 4114 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 4115 bdev_io->u.bdev.iovs = compare_iov; 4116 bdev_io->u.bdev.iovcnt = compare_iovcnt; 4117 bdev_io->u.bdev.fused_iovs = write_iov; 4118 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 4119 bdev_io->u.bdev.md_buf = NULL; 4120 bdev_io->u.bdev.num_blocks = num_blocks; 4121 bdev_io->u.bdev.offset_blocks = offset_blocks; 4122 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4123 4124 if (bdev_io_type_supported(bdev, 
SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 4125 bdev_io_submit(bdev_io); 4126 return 0; 4127 } 4128 4129 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 4130 bdev_comparev_and_writev_blocks_locked, bdev_io); 4131 } 4132 4133 static void 4134 bdev_zcopy_get_buf(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 4135 { 4136 if (!success) { 4137 /* Don't use spdk_bdev_io_complete here - this bdev_io was never actually submitted. */ 4138 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 4139 bdev_io->internal.cb(bdev_io, success, bdev_io->internal.caller_ctx); 4140 return; 4141 } 4142 4143 if (bdev_io->u.bdev.zcopy.populate) { 4144 /* Read the real data into the buffer */ 4145 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4146 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 4147 bdev_io_submit(bdev_io); 4148 return; 4149 } 4150 4151 /* Don't use spdk_bdev_io_complete here - this bdev_io was never actually submitted. */ 4152 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4153 bdev_io->internal.cb(bdev_io, success, bdev_io->internal.caller_ctx); 4154 } 4155 4156 int 4157 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4158 uint64_t offset_blocks, uint64_t num_blocks, 4159 bool populate, 4160 spdk_bdev_io_completion_cb cb, void *cb_arg) 4161 { 4162 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4163 struct spdk_bdev_io *bdev_io; 4164 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4165 4166 if (!desc->write) { 4167 return -EBADF; 4168 } 4169 4170 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4171 return -EINVAL; 4172 } 4173 4174 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 4175 return -ENOTSUP; 4176 } 4177 4178 bdev_io = bdev_channel_get_io(channel); 4179 if (!bdev_io) { 4180 return -ENOMEM; 4181 } 4182 4183 bdev_io->internal.ch = channel; 4184 bdev_io->internal.desc = desc; 4185 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 4186 bdev_io->u.bdev.num_blocks = num_blocks; 4187 bdev_io->u.bdev.offset_blocks = offset_blocks; 4188 bdev_io->u.bdev.iovs = NULL; 4189 bdev_io->u.bdev.iovcnt = 0; 4190 bdev_io->u.bdev.md_buf = NULL; 4191 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 4192 bdev_io->u.bdev.zcopy.commit = 0; 4193 bdev_io->u.bdev.zcopy.start = 1; 4194 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4195 4196 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 4197 bdev_io_submit(bdev_io); 4198 } else { 4199 /* Emulate zcopy by allocating a buffer */ 4200 spdk_bdev_io_get_buf(bdev_io, bdev_zcopy_get_buf, 4201 bdev_io->u.bdev.num_blocks * bdev->blocklen); 4202 } 4203 4204 return 0; 4205 } 4206 4207 int 4208 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 4209 spdk_bdev_io_completion_cb cb, void *cb_arg) 4210 { 4211 struct spdk_bdev *bdev = bdev_io->bdev; 4212 4213 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 4214 /* This can happen if the zcopy was emulated in start */ 4215 if (bdev_io->u.bdev.zcopy.start != 1) { 4216 return -EINVAL; 4217 } 4218 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 4219 } 4220 4221 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 4222 return -EINVAL; 4223 } 4224 4225 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 4226 bdev_io->u.bdev.zcopy.start = 0; 4227 bdev_io->internal.caller_ctx = cb_arg; 4228 bdev_io->internal.cb = cb; 4229 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 4230 4231 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 4232 bdev_io_submit(bdev_io); 4233 return 0; 4234 } 4235 4236 if (!bdev_io->u.bdev.zcopy.commit) { 4237 /* Don't use spdk_bdev_io_complete here - this bdev_io was never actually submitted. */ 4238 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4239 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 4240 return 0; 4241 } 4242 4243 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4244 bdev_io_submit(bdev_io); 4245 4246 return 0; 4247 } 4248 4249 int 4250 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4251 uint64_t offset, uint64_t len, 4252 spdk_bdev_io_completion_cb cb, void *cb_arg) 4253 { 4254 uint64_t offset_blocks, num_blocks; 4255 4256 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4257 len, &num_blocks) != 0) { 4258 return -EINVAL; 4259 } 4260 4261 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4262 } 4263 4264 int 4265 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4266 uint64_t offset_blocks, uint64_t num_blocks, 4267 spdk_bdev_io_completion_cb cb, void *cb_arg) 4268 { 4269 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4270 struct spdk_bdev_io *bdev_io; 4271 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4272 4273 if (!desc->write) { 4274 return -EBADF; 4275 } 4276 4277 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4278 return -EINVAL; 4279 } 4280 4281 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 4282 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 4283 return -ENOTSUP; 4284 } 4285 4286 bdev_io = bdev_channel_get_io(channel); 4287 4288 if (!bdev_io) { 4289 return -ENOMEM; 4290 } 4291 4292 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 4293 bdev_io->internal.ch = channel; 4294 bdev_io->internal.desc = desc; 4295 bdev_io->u.bdev.offset_blocks = offset_blocks; 4296 bdev_io->u.bdev.num_blocks = num_blocks; 4297 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4298 4299 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 4300 bdev_io_submit(bdev_io); 4301 return 0; 4302 } 4303 4304 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 4305 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 4306 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 4307 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 4308 bdev_write_zero_buffer_next(bdev_io); 4309 4310 return 0; 4311 } 4312 4313 int 4314 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4315 uint64_t offset, uint64_t nbytes, 4316 spdk_bdev_io_completion_cb cb, void *cb_arg) 4317 { 4318 uint64_t offset_blocks, num_blocks; 4319 4320 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4321 nbytes, &num_blocks) != 0) { 4322 return -EINVAL; 4323 } 4324 4325 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4326 } 4327 4328 int 4329 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4330 uint64_t offset_blocks, uint64_t num_blocks, 4331 spdk_bdev_io_completion_cb cb, void *cb_arg) 4332 { 4333 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4334 struct spdk_bdev_io *bdev_io; 4335 struct 
spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4336 4337 if (!desc->write) { 4338 return -EBADF; 4339 } 4340 4341 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4342 return -EINVAL; 4343 } 4344 4345 if (num_blocks == 0) { 4346 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 4347 return -EINVAL; 4348 } 4349 4350 bdev_io = bdev_channel_get_io(channel); 4351 if (!bdev_io) { 4352 return -ENOMEM; 4353 } 4354 4355 bdev_io->internal.ch = channel; 4356 bdev_io->internal.desc = desc; 4357 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 4358 4359 bdev_io->u.bdev.iovs = &bdev_io->iov; 4360 bdev_io->u.bdev.iovs[0].iov_base = NULL; 4361 bdev_io->u.bdev.iovs[0].iov_len = 0; 4362 bdev_io->u.bdev.iovcnt = 1; 4363 4364 bdev_io->u.bdev.offset_blocks = offset_blocks; 4365 bdev_io->u.bdev.num_blocks = num_blocks; 4366 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4367 4368 bdev_io_submit(bdev_io); 4369 return 0; 4370 } 4371 4372 int 4373 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4374 uint64_t offset, uint64_t length, 4375 spdk_bdev_io_completion_cb cb, void *cb_arg) 4376 { 4377 uint64_t offset_blocks, num_blocks; 4378 4379 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4380 length, &num_blocks) != 0) { 4381 return -EINVAL; 4382 } 4383 4384 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4385 } 4386 4387 int 4388 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4389 uint64_t offset_blocks, uint64_t num_blocks, 4390 spdk_bdev_io_completion_cb cb, void *cb_arg) 4391 { 4392 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4393 struct spdk_bdev_io *bdev_io; 4394 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4395 4396 if (!desc->write) { 4397 return -EBADF; 4398 } 4399 4400 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4401 return -EINVAL; 4402 } 4403 4404 bdev_io = bdev_channel_get_io(channel); 4405 if (!bdev_io) { 4406 return -ENOMEM; 4407 } 4408 4409 bdev_io->internal.ch = channel; 4410 bdev_io->internal.desc = desc; 4411 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 4412 bdev_io->u.bdev.iovs = NULL; 4413 bdev_io->u.bdev.iovcnt = 0; 4414 bdev_io->u.bdev.offset_blocks = offset_blocks; 4415 bdev_io->u.bdev.num_blocks = num_blocks; 4416 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4417 4418 bdev_io_submit(bdev_io); 4419 return 0; 4420 } 4421 4422 static void 4423 bdev_reset_dev(struct spdk_io_channel_iter *i, int status) 4424 { 4425 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 4426 struct spdk_bdev_io *bdev_io; 4427 4428 bdev_io = TAILQ_FIRST(&ch->queued_resets); 4429 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 4430 bdev_io_submit_reset(bdev_io); 4431 } 4432 4433 static void 4434 bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 4435 { 4436 struct spdk_io_channel *ch; 4437 struct spdk_bdev_channel *channel; 4438 struct spdk_bdev_mgmt_channel *mgmt_channel; 4439 struct spdk_bdev_shared_resource *shared_resource; 4440 bdev_io_tailq_t tmp_queued; 4441 4442 TAILQ_INIT(&tmp_queued); 4443 4444 ch = spdk_io_channel_iter_get_channel(i); 4445 channel = spdk_io_channel_get_ctx(ch); 4446 shared_resource = channel->shared_resource; 4447 mgmt_channel = shared_resource->mgmt_ch; 4448 4449 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 4450 4451 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 4452 /* The QoS object is always valid and readable while 4453 * the channel flag is set, so the lock here should not 4454 * be 
necessary. We're not in the fast path though, so 4455 * just take it anyway. */ 4456 pthread_mutex_lock(&channel->bdev->internal.mutex); 4457 if (channel->bdev->internal.qos->ch == channel) { 4458 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 4459 } 4460 pthread_mutex_unlock(&channel->bdev->internal.mutex); 4461 } 4462 4463 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 4464 bdev_abort_all_buf_io(&mgmt_channel->need_buf_small, channel); 4465 bdev_abort_all_buf_io(&mgmt_channel->need_buf_large, channel); 4466 bdev_abort_all_queued_io(&tmp_queued, channel); 4467 4468 spdk_for_each_channel_continue(i, 0); 4469 } 4470 4471 static void 4472 bdev_start_reset(void *ctx) 4473 { 4474 struct spdk_bdev_channel *ch = ctx; 4475 4476 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel, 4477 ch, bdev_reset_dev); 4478 } 4479 4480 static void 4481 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 4482 { 4483 struct spdk_bdev *bdev = ch->bdev; 4484 4485 assert(!TAILQ_EMPTY(&ch->queued_resets)); 4486 4487 pthread_mutex_lock(&bdev->internal.mutex); 4488 if (bdev->internal.reset_in_progress == NULL) { 4489 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 4490 /* 4491 * Take a channel reference for the target bdev for the life of this 4492 * reset. This guards against the channel getting destroyed while 4493 * spdk_for_each_channel() calls related to this reset IO are in 4494 * progress. We will release the reference when this reset is 4495 * completed. 4496 */ 4497 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 4498 bdev_start_reset(ch); 4499 } 4500 pthread_mutex_unlock(&bdev->internal.mutex); 4501 } 4502 4503 int 4504 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4505 spdk_bdev_io_completion_cb cb, void *cb_arg) 4506 { 4507 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4508 struct spdk_bdev_io *bdev_io; 4509 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4510 4511 bdev_io = bdev_channel_get_io(channel); 4512 if (!bdev_io) { 4513 return -ENOMEM; 4514 } 4515 4516 bdev_io->internal.ch = channel; 4517 bdev_io->internal.desc = desc; 4518 bdev_io->internal.submit_tsc = spdk_get_ticks(); 4519 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 4520 bdev_io->u.reset.ch_ref = NULL; 4521 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4522 4523 pthread_mutex_lock(&bdev->internal.mutex); 4524 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 4525 pthread_mutex_unlock(&bdev->internal.mutex); 4526 4527 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 4528 internal.ch_link); 4529 4530 bdev_channel_start_reset(channel); 4531 4532 return 0; 4533 } 4534 4535 void 4536 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 4537 struct spdk_bdev_io_stat *stat) 4538 { 4539 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4540 4541 *stat = channel->stat; 4542 } 4543 4544 static void 4545 bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 4546 { 4547 void *io_device = spdk_io_channel_iter_get_io_device(i); 4548 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 4549 4550 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 4551 bdev_iostat_ctx->cb_arg, 0); 4552 free(bdev_iostat_ctx); 4553 } 4554 4555 static void 4556 bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 4557 { 4558 struct 
spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 4559 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 4560 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4561 4562 bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); 4563 spdk_for_each_channel_continue(i, 0); 4564 } 4565 4566 void 4567 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 4568 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 4569 { 4570 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 4571 4572 assert(bdev != NULL); 4573 assert(stat != NULL); 4574 assert(cb != NULL); 4575 4576 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 4577 if (bdev_iostat_ctx == NULL) { 4578 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 4579 cb(bdev, stat, cb_arg, -ENOMEM); 4580 return; 4581 } 4582 4583 bdev_iostat_ctx->stat = stat; 4584 bdev_iostat_ctx->cb = cb; 4585 bdev_iostat_ctx->cb_arg = cb_arg; 4586 4587 /* Start with the statistics from previously deleted channels. */ 4588 pthread_mutex_lock(&bdev->internal.mutex); 4589 bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); 4590 pthread_mutex_unlock(&bdev->internal.mutex); 4591 4592 /* Then iterate and add the statistics from each existing channel. */ 4593 spdk_for_each_channel(__bdev_to_io_dev(bdev), 4594 bdev_get_each_channel_stat, 4595 bdev_iostat_ctx, 4596 bdev_get_device_stat_done); 4597 } 4598 4599 int 4600 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4601 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 4602 spdk_bdev_io_completion_cb cb, void *cb_arg) 4603 { 4604 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4605 struct spdk_bdev_io *bdev_io; 4606 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4607 4608 if (!desc->write) { 4609 return -EBADF; 4610 } 4611 4612 bdev_io = bdev_channel_get_io(channel); 4613 if (!bdev_io) { 4614 return -ENOMEM; 4615 } 4616 4617 bdev_io->internal.ch = channel; 4618 bdev_io->internal.desc = desc; 4619 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 4620 bdev_io->u.nvme_passthru.cmd = *cmd; 4621 bdev_io->u.nvme_passthru.buf = buf; 4622 bdev_io->u.nvme_passthru.nbytes = nbytes; 4623 bdev_io->u.nvme_passthru.md_buf = NULL; 4624 bdev_io->u.nvme_passthru.md_len = 0; 4625 4626 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4627 4628 bdev_io_submit(bdev_io); 4629 return 0; 4630 } 4631 4632 int 4633 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4634 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 4635 spdk_bdev_io_completion_cb cb, void *cb_arg) 4636 { 4637 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4638 struct spdk_bdev_io *bdev_io; 4639 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4640 4641 if (!desc->write) { 4642 /* 4643 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 4644 * to easily determine if the command is a read or write, but for now just 4645 * do not allow io_passthru with a read-only descriptor. 
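	 * Callers that need NVMe passthru therefore have to open the bdev for
	 * writing (i.e. pass write == true to spdk_bdev_open() or
	 * spdk_bdev_open_ext()).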
4646 */ 4647 return -EBADF; 4648 } 4649 4650 bdev_io = bdev_channel_get_io(channel); 4651 if (!bdev_io) { 4652 return -ENOMEM; 4653 } 4654 4655 bdev_io->internal.ch = channel; 4656 bdev_io->internal.desc = desc; 4657 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 4658 bdev_io->u.nvme_passthru.cmd = *cmd; 4659 bdev_io->u.nvme_passthru.buf = buf; 4660 bdev_io->u.nvme_passthru.nbytes = nbytes; 4661 bdev_io->u.nvme_passthru.md_buf = NULL; 4662 bdev_io->u.nvme_passthru.md_len = 0; 4663 4664 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4665 4666 bdev_io_submit(bdev_io); 4667 return 0; 4668 } 4669 4670 int 4671 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4672 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 4673 spdk_bdev_io_completion_cb cb, void *cb_arg) 4674 { 4675 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4676 struct spdk_bdev_io *bdev_io; 4677 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4678 4679 if (!desc->write) { 4680 /* 4681 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 4682 * to easily determine if the command is a read or write, but for now just 4683 * do not allow io_passthru with a read-only descriptor. 4684 */ 4685 return -EBADF; 4686 } 4687 4688 bdev_io = bdev_channel_get_io(channel); 4689 if (!bdev_io) { 4690 return -ENOMEM; 4691 } 4692 4693 bdev_io->internal.ch = channel; 4694 bdev_io->internal.desc = desc; 4695 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 4696 bdev_io->u.nvme_passthru.cmd = *cmd; 4697 bdev_io->u.nvme_passthru.buf = buf; 4698 bdev_io->u.nvme_passthru.nbytes = nbytes; 4699 bdev_io->u.nvme_passthru.md_buf = md_buf; 4700 bdev_io->u.nvme_passthru.md_len = md_len; 4701 4702 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4703 4704 bdev_io_submit(bdev_io); 4705 return 0; 4706 } 4707 4708 static void bdev_abort_retry(void *ctx); 4709 static void bdev_abort(struct spdk_bdev_io *parent_io); 4710 4711 static void 4712 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4713 { 4714 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 4715 struct spdk_bdev_io *parent_io = cb_arg; 4716 struct spdk_bdev_io *bio_to_abort, *tmp_io; 4717 4718 bio_to_abort = bdev_io->u.abort.bio_to_abort; 4719 4720 spdk_bdev_free_io(bdev_io); 4721 4722 if (!success) { 4723 /* Check if the target I/O completed in the meantime. */ 4724 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 4725 if (tmp_io == bio_to_abort) { 4726 break; 4727 } 4728 } 4729 4730 /* If the target I/O still exists, set the parent to failed. */ 4731 if (tmp_io != NULL) { 4732 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4733 } 4734 } 4735 4736 parent_io->u.bdev.split_outstanding--; 4737 if (parent_io->u.bdev.split_outstanding == 0) { 4738 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 4739 bdev_abort_retry(parent_io); 4740 } else { 4741 bdev_io_complete(parent_io); 4742 } 4743 } 4744 } 4745 4746 static int 4747 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 4748 struct spdk_bdev_io *bio_to_abort, 4749 spdk_bdev_io_completion_cb cb, void *cb_arg) 4750 { 4751 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4752 struct spdk_bdev_io *bdev_io; 4753 4754 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 4755 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 4756 /* TODO: Abort reset or abort request. 
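		 * Until that is implemented, such requests are rejected with
		 * -ENOTSUP and _bdev_abort() marks the parent abort as failed.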
		 */
		return -ENOTSUP;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (bdev_io == NULL) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) {
		bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort;

		/* Parent abort request is not submitted directly, but to manage its
		 * execution add it to the submitted list here.
		 */
		bdev_io->internal.submit_tsc = spdk_get_ticks();
		TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link);

		bdev_abort(bdev_io);

		return 0;
	}

	bdev_io->u.abort.bio_to_abort = bio_to_abort;

	/* Submit the abort request to the underlying bdev module. */
	bdev_io_submit(bdev_io);

	return 0;
}

static uint32_t
_bdev_abort(struct spdk_bdev_io *parent_io)
{
	struct spdk_bdev_desc *desc = parent_io->internal.desc;
	struct spdk_bdev_channel *channel = parent_io->internal.ch;
	void *bio_cb_arg;
	struct spdk_bdev_io *bio_to_abort;
	uint32_t matched_ios;
	int rc;

	bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg;

	/* matched_ios is returned and will be kept by the caller.
	 *
	 * This function is used for two cases: 1) the same cb_arg is used for
	 * multiple I/Os, and 2) a single large I/O is split into smaller ones.
	 * Incrementing split_outstanding directly here may confuse readers,
	 * especially for the 1st case.
	 *
	 * Completion of I/O abort is processed after stack unwinding. Hence this trick
	 * works as expected.
	 */
	matched_ios = 0;
	parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;

	TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) {
		if (bio_to_abort->internal.caller_ctx != bio_cb_arg) {
			continue;
		}

		if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) {
			/* Any I/O which was submitted after this abort command should be excluded. */
			continue;
		}

		rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
		if (rc != 0) {
			if (rc == -ENOMEM) {
				parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
			} else {
				parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
			}
			break;
		}
		matched_ios++;
	}

	return matched_ios;
}

static void
bdev_abort_retry(void *ctx)
{
	struct spdk_bdev_io *parent_io = ctx;
	uint32_t matched_ios;

	matched_ios = _bdev_abort(parent_io);

	if (matched_ios == 0) {
		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
		} else {
			/* For retry, the case that no target I/O was found is success
			 * because it means target I/Os completed in the meantime.
			 */
			bdev_io_complete(parent_io);
		}
		return;
	}

	/* Use split_outstanding to manage the progress of aborting I/Os. */
	parent_io->u.bdev.split_outstanding = matched_ios;
}
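
/*
 * Example: a minimal, illustrative sketch of how a caller might use
 * spdk_bdev_abort() (defined below) to abort all of its outstanding I/O that
 * was submitted with a given cb_arg. The struct my_request and the callbacks
 * are hypothetical and not part of this file.
 *
 *	static void
 *	my_abort_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		struct my_request *req = cb_arg;
 *
 *		// See bdev_abort()/_bdev_abort() above for the exact semantics
 *		// of success for a parent abort request.
 *		req->abort_ok = success;
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	static int
 *	my_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
 *		 struct my_request *req)
 *	{
 *		// Abort every I/O previously submitted on ch with req as its cb_arg.
 *		return spdk_bdev_abort(desc, ch, req, my_abort_done, req);
 *	}
 *
 * A return value of -ENOMEM means no spdk_bdev_io was available; the caller
 * can use spdk_bdev_queue_io_wait() to be notified when one is freed and retry.
 */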

static void
bdev_abort(struct spdk_bdev_io *parent_io)
{
	uint32_t matched_ios;

	matched_ios = _bdev_abort(parent_io);

	if (matched_ios == 0) {
		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
		} else {
			/* The case where no target I/O was found is a failure. */
			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
			bdev_io_complete(parent_io);
		}
		return;
	}

	/* Use split_outstanding to manage the progress of aborting I/Os. */
	parent_io->u.bdev.split_outstanding = matched_ios;
}

int
spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		void *bio_cb_arg,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	struct spdk_bdev_io *bdev_io;

	if (bio_cb_arg == NULL) {
		return -EINVAL;
	}

	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
		return -ENOTSUP;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (bdev_io == NULL) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->internal.submit_tsc = spdk_get_ticks();
	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg;

	/* Parent abort request is not submitted directly, but to manage its execution,
	 * add it to the submitted list here.
	 */
	TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link);

	bdev_abort(bdev_io);

	return 0;
}

int
spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
			struct spdk_bdev_io_wait_entry *entry)
{
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch;

	if (bdev != entry->bdev) {
		SPDK_ERRLOG("bdevs do not match\n");
		return -EINVAL;
	}

	if (mgmt_ch->per_thread_cache_count > 0) {
		SPDK_ERRLOG("Cannot queue io_wait if a spdk_bdev_io is available in the per-thread cache\n");
		return -EINVAL;
	}

	TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link);
	return 0;
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev *bdev = bdev_ch->bdev;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller. Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
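		 *
		 * The threshold itself is set in spdk_bdev_io_complete() as
		 * max(outstanding / 2, outstanding - NOMEM_THRESHOLD_COUNT). For
		 * example, with the default NOMEM_THRESHOLD_COUNT of 8, a shared
		 * resource with 64 outstanding I/O gets a threshold of
		 * max(32, 56) = 56 and retries once 8 more I/O complete, while one
		 * with only 10 outstanding gets max(5, 2) = 5 and waits for half
		 * of them to complete.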
4964 */ 4965 return; 4966 } 4967 4968 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 4969 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 4970 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 4971 bdev_io->internal.ch->io_outstanding++; 4972 shared_resource->io_outstanding++; 4973 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 4974 bdev_io->internal.error.nvme.cdw0 = 0; 4975 bdev_io->num_retries++; 4976 bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 4977 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 4978 break; 4979 } 4980 } 4981 } 4982 4983 static inline void 4984 bdev_io_complete(void *ctx) 4985 { 4986 struct spdk_bdev_io *bdev_io = ctx; 4987 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 4988 uint64_t tsc, tsc_diff; 4989 4990 if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { 4991 /* 4992 * Send the completion to the thread that originally submitted the I/O, 4993 * which may not be the current thread in the case of QoS. 4994 */ 4995 if (bdev_io->internal.io_submit_ch) { 4996 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 4997 bdev_io->internal.io_submit_ch = NULL; 4998 } 4999 5000 /* 5001 * Defer completion to avoid potential infinite recursion if the 5002 * user's completion callback issues a new I/O. 5003 */ 5004 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 5005 bdev_io_complete, bdev_io); 5006 return; 5007 } 5008 5009 tsc = spdk_get_ticks(); 5010 tsc_diff = tsc - bdev_io->internal.submit_tsc; 5011 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 0); 5012 5013 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 5014 5015 if (bdev_io->internal.ch->histogram) { 5016 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 5017 } 5018 5019 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5020 switch (bdev_io->type) { 5021 case SPDK_BDEV_IO_TYPE_READ: 5022 bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5023 bdev_io->internal.ch->stat.num_read_ops++; 5024 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5025 break; 5026 case SPDK_BDEV_IO_TYPE_WRITE: 5027 bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5028 bdev_io->internal.ch->stat.num_write_ops++; 5029 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5030 break; 5031 case SPDK_BDEV_IO_TYPE_UNMAP: 5032 bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5033 bdev_io->internal.ch->stat.num_unmap_ops++; 5034 bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff; 5035 break; 5036 case SPDK_BDEV_IO_TYPE_ZCOPY: 5037 /* Track the data in the start phase only */ 5038 if (bdev_io->u.bdev.zcopy.start) { 5039 if (bdev_io->u.bdev.zcopy.populate) { 5040 bdev_io->internal.ch->stat.bytes_read += 5041 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5042 bdev_io->internal.ch->stat.num_read_ops++; 5043 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5044 } else { 5045 bdev_io->internal.ch->stat.bytes_written += 5046 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5047 bdev_io->internal.ch->stat.num_write_ops++; 5048 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5049 } 5050 } 5051 break; 5052 default: 5053 break; 5054 } 5055 } 5056 5057 #ifdef SPDK_CONFIG_VTUNE 5058 uint64_t now_tsc = spdk_get_ticks(); 5059 if (now_tsc > 
(bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 5060 uint64_t data[5]; 5061 5062 data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; 5063 data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; 5064 data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; 5065 data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; 5066 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 5067 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 5068 5069 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 5070 __itt_metadata_u64, 5, data); 5071 5072 bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; 5073 bdev_io->internal.ch->start_tsc = now_tsc; 5074 } 5075 #endif 5076 5077 assert(bdev_io->internal.cb != NULL); 5078 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 5079 5080 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 5081 bdev_io->internal.caller_ctx); 5082 } 5083 5084 static void 5085 bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 5086 { 5087 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 5088 5089 if (bdev_io->u.reset.ch_ref != NULL) { 5090 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 5091 bdev_io->u.reset.ch_ref = NULL; 5092 } 5093 5094 bdev_io_complete(bdev_io); 5095 } 5096 5097 static void 5098 bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 5099 { 5100 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 5101 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5102 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 5103 struct spdk_bdev_io *queued_reset; 5104 5105 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 5106 while (!TAILQ_EMPTY(&ch->queued_resets)) { 5107 queued_reset = TAILQ_FIRST(&ch->queued_resets); 5108 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 5109 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 5110 } 5111 5112 spdk_for_each_channel_continue(i, 0); 5113 } 5114 5115 void 5116 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 5117 { 5118 struct spdk_bdev *bdev = bdev_io->bdev; 5119 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5120 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 5121 5122 bdev_io->internal.status = status; 5123 5124 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 5125 bool unlock_channels = false; 5126 5127 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 5128 SPDK_ERRLOG("NOMEM returned for reset\n"); 5129 } 5130 pthread_mutex_lock(&bdev->internal.mutex); 5131 if (bdev_io == bdev->internal.reset_in_progress) { 5132 bdev->internal.reset_in_progress = NULL; 5133 unlock_channels = true; 5134 } 5135 pthread_mutex_unlock(&bdev->internal.mutex); 5136 5137 if (unlock_channels) { 5138 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unfreeze_channel, 5139 bdev_io, bdev_reset_complete); 5140 return; 5141 } 5142 } else { 5143 _bdev_io_unset_bounce_buf(bdev_io); 5144 5145 assert(bdev_ch->io_outstanding > 0); 5146 assert(shared_resource->io_outstanding > 0); 5147 bdev_ch->io_outstanding--; 5148 shared_resource->io_outstanding--; 5149 5150 if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) { 5151 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 5152 
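			/* The module ran out of resources for this I/O; park it on the
			 * shared nomem_io list and let bdev_ch_retry_io() resubmit it
			 * once enough outstanding I/O has completed.
			 */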
/* 5153 * Wait for some of the outstanding I/O to complete before we 5154 * retry any of the nomem_io. Normally we will wait for 5155 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 5156 * depth channels we will instead wait for half to complete. 5157 */ 5158 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 5159 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 5160 return; 5161 } 5162 5163 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 5164 bdev_ch_retry_io(bdev_ch); 5165 } 5166 } 5167 5168 bdev_io_complete(bdev_io); 5169 } 5170 5171 void 5172 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 5173 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 5174 { 5175 if (sc == SPDK_SCSI_STATUS_GOOD) { 5176 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5177 } else { 5178 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 5179 bdev_io->internal.error.scsi.sc = sc; 5180 bdev_io->internal.error.scsi.sk = sk; 5181 bdev_io->internal.error.scsi.asc = asc; 5182 bdev_io->internal.error.scsi.ascq = ascq; 5183 } 5184 5185 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5186 } 5187 5188 void 5189 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 5190 int *sc, int *sk, int *asc, int *ascq) 5191 { 5192 assert(sc != NULL); 5193 assert(sk != NULL); 5194 assert(asc != NULL); 5195 assert(ascq != NULL); 5196 5197 switch (bdev_io->internal.status) { 5198 case SPDK_BDEV_IO_STATUS_SUCCESS: 5199 *sc = SPDK_SCSI_STATUS_GOOD; 5200 *sk = SPDK_SCSI_SENSE_NO_SENSE; 5201 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 5202 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 5203 break; 5204 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 5205 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 5206 break; 5207 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 5208 *sc = bdev_io->internal.error.scsi.sc; 5209 *sk = bdev_io->internal.error.scsi.sk; 5210 *asc = bdev_io->internal.error.scsi.asc; 5211 *ascq = bdev_io->internal.error.scsi.ascq; 5212 break; 5213 default: 5214 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 5215 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 5216 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 5217 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 5218 break; 5219 } 5220 } 5221 5222 void 5223 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 5224 { 5225 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 5226 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5227 } else { 5228 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 5229 } 5230 5231 bdev_io->internal.error.nvme.cdw0 = cdw0; 5232 bdev_io->internal.error.nvme.sct = sct; 5233 bdev_io->internal.error.nvme.sc = sc; 5234 5235 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5236 } 5237 5238 void 5239 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 5240 { 5241 assert(sct != NULL); 5242 assert(sc != NULL); 5243 assert(cdw0 != NULL); 5244 5245 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 5246 *sct = bdev_io->internal.error.nvme.sct; 5247 *sc = bdev_io->internal.error.nvme.sc; 5248 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5249 *sct = SPDK_NVME_SCT_GENERIC; 5250 *sc = SPDK_NVME_SC_SUCCESS; 5251 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 5252 *sct = SPDK_NVME_SCT_GENERIC; 5253 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 5254 } else { 5255 *sct 
= SPDK_NVME_SCT_GENERIC; 5256 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5257 } 5258 5259 *cdw0 = bdev_io->internal.error.nvme.cdw0; 5260 } 5261 5262 void 5263 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 5264 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 5265 { 5266 assert(first_sct != NULL); 5267 assert(first_sc != NULL); 5268 assert(second_sct != NULL); 5269 assert(second_sc != NULL); 5270 assert(cdw0 != NULL); 5271 5272 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 5273 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 5274 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 5275 *first_sct = bdev_io->internal.error.nvme.sct; 5276 *first_sc = bdev_io->internal.error.nvme.sc; 5277 *second_sct = SPDK_NVME_SCT_GENERIC; 5278 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5279 } else { 5280 *first_sct = SPDK_NVME_SCT_GENERIC; 5281 *first_sc = SPDK_NVME_SC_SUCCESS; 5282 *second_sct = bdev_io->internal.error.nvme.sct; 5283 *second_sc = bdev_io->internal.error.nvme.sc; 5284 } 5285 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5286 *first_sct = SPDK_NVME_SCT_GENERIC; 5287 *first_sc = SPDK_NVME_SC_SUCCESS; 5288 *second_sct = SPDK_NVME_SCT_GENERIC; 5289 *second_sc = SPDK_NVME_SC_SUCCESS; 5290 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 5291 *first_sct = SPDK_NVME_SCT_GENERIC; 5292 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5293 *second_sct = SPDK_NVME_SCT_GENERIC; 5294 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5295 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 5296 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 5297 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 5298 *second_sct = SPDK_NVME_SCT_GENERIC; 5299 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5300 } else { 5301 *first_sct = SPDK_NVME_SCT_GENERIC; 5302 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5303 *second_sct = SPDK_NVME_SCT_GENERIC; 5304 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5305 } 5306 5307 *cdw0 = bdev_io->internal.error.nvme.cdw0; 5308 } 5309 5310 struct spdk_thread * 5311 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 5312 { 5313 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 5314 } 5315 5316 struct spdk_io_channel * 5317 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 5318 { 5319 return bdev_io->internal.ch->channel; 5320 } 5321 5322 static int 5323 bdev_init(struct spdk_bdev *bdev) 5324 { 5325 char *bdev_name; 5326 5327 assert(bdev->module != NULL); 5328 5329 if (!bdev->name) { 5330 SPDK_ERRLOG("Bdev name is NULL\n"); 5331 return -EINVAL; 5332 } 5333 5334 if (!strlen(bdev->name)) { 5335 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 5336 return -EINVAL; 5337 } 5338 5339 if (spdk_bdev_get_by_name(bdev->name)) { 5340 SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name); 5341 return -EEXIST; 5342 } 5343 5344 /* Users often register their own I/O devices using the bdev name. In 5345 * order to avoid conflicts, prepend bdev_. 
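	 * For example, a bdev registered as "Malloc0" ends up with
	 * "bdev_Malloc0" as its io_device name.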
*/ 5346 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 5347 if (!bdev_name) { 5348 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 5349 return -ENOMEM; 5350 } 5351 5352 bdev->internal.status = SPDK_BDEV_STATUS_READY; 5353 bdev->internal.measured_queue_depth = UINT64_MAX; 5354 bdev->internal.claim_module = NULL; 5355 bdev->internal.qd_poller = NULL; 5356 bdev->internal.qos = NULL; 5357 5358 /* If the user didn't specify a uuid, generate one. */ 5359 if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 5360 spdk_uuid_generate(&bdev->uuid); 5361 } 5362 5363 if (spdk_bdev_get_buf_align(bdev) > 1) { 5364 if (bdev->split_on_optimal_io_boundary) { 5365 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 5366 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 5367 } else { 5368 bdev->split_on_optimal_io_boundary = true; 5369 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 5370 } 5371 } 5372 5373 /* If the user didn't specify a write unit size, set it to one. */ 5374 if (bdev->write_unit_size == 0) { 5375 bdev->write_unit_size = 1; 5376 } 5377 5378 /* Set ACWU value to 1 if bdev module did not set it (does not support it natively) */ 5379 if (bdev->acwu == 0) { 5380 bdev->acwu = 1; 5381 } 5382 5383 TAILQ_INIT(&bdev->internal.open_descs); 5384 TAILQ_INIT(&bdev->internal.locked_ranges); 5385 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 5386 5387 TAILQ_INIT(&bdev->aliases); 5388 5389 bdev->internal.reset_in_progress = NULL; 5390 5391 spdk_io_device_register(__bdev_to_io_dev(bdev), 5392 bdev_channel_create, bdev_channel_destroy, 5393 sizeof(struct spdk_bdev_channel), 5394 bdev_name); 5395 5396 free(bdev_name); 5397 5398 pthread_mutex_init(&bdev->internal.mutex, NULL); 5399 return 0; 5400 } 5401 5402 static void 5403 bdev_destroy_cb(void *io_device) 5404 { 5405 int rc; 5406 struct spdk_bdev *bdev; 5407 spdk_bdev_unregister_cb cb_fn; 5408 void *cb_arg; 5409 5410 bdev = __bdev_from_io_dev(io_device); 5411 cb_fn = bdev->internal.unregister_cb; 5412 cb_arg = bdev->internal.unregister_ctx; 5413 5414 rc = bdev->fn_table->destruct(bdev->ctxt); 5415 if (rc < 0) { 5416 SPDK_ERRLOG("destruct failed\n"); 5417 } 5418 if (rc <= 0 && cb_fn != NULL) { 5419 cb_fn(cb_arg, rc); 5420 } 5421 } 5422 5423 5424 static void 5425 bdev_fini(struct spdk_bdev *bdev) 5426 { 5427 pthread_mutex_destroy(&bdev->internal.mutex); 5428 5429 free(bdev->internal.qos); 5430 5431 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 5432 } 5433 5434 static void 5435 bdev_start(struct spdk_bdev *bdev) 5436 { 5437 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 5438 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 5439 5440 /* Examine configuration before initializing I/O */ 5441 bdev_examine(bdev); 5442 } 5443 5444 int 5445 spdk_bdev_register(struct spdk_bdev *bdev) 5446 { 5447 int rc = bdev_init(bdev); 5448 5449 if (rc == 0) { 5450 bdev_start(bdev); 5451 } 5452 5453 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 5454 return rc; 5455 } 5456 5457 int 5458 spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count) 5459 { 5460 SPDK_ERRLOG("This function is deprecated. 
Use spdk_bdev_register() instead.\n"); 5461 return spdk_bdev_register(vbdev); 5462 } 5463 5464 void 5465 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 5466 { 5467 if (bdev->internal.unregister_cb != NULL) { 5468 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 5469 } 5470 } 5471 5472 static void 5473 _remove_notify(void *arg) 5474 { 5475 struct spdk_bdev_desc *desc = arg; 5476 5477 pthread_mutex_lock(&desc->mutex); 5478 desc->refs--; 5479 5480 if (!desc->closed) { 5481 pthread_mutex_unlock(&desc->mutex); 5482 if (desc->callback.open_with_ext) { 5483 desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); 5484 } else { 5485 desc->callback.remove_fn(desc->callback.ctx); 5486 } 5487 return; 5488 } else if (0 == desc->refs) { 5489 /* This descriptor was closed after this remove_notify message was sent. 5490 * spdk_bdev_close() could not free the descriptor since this message was 5491 * in flight, so we free it now using bdev_desc_free(). 5492 */ 5493 pthread_mutex_unlock(&desc->mutex); 5494 bdev_desc_free(desc); 5495 return; 5496 } 5497 pthread_mutex_unlock(&desc->mutex); 5498 } 5499 5500 /* Must be called while holding bdev->internal.mutex. 5501 * returns: 0 - bdev removed and ready to be destructed. 5502 * -EBUSY - bdev can't be destructed yet. */ 5503 static int 5504 bdev_unregister_unsafe(struct spdk_bdev *bdev) 5505 { 5506 struct spdk_bdev_desc *desc, *tmp; 5507 int rc = 0; 5508 5509 /* Notify each descriptor about hotremoval */ 5510 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 5511 rc = -EBUSY; 5512 pthread_mutex_lock(&desc->mutex); 5513 /* 5514 * Defer invocation of the event_cb to a separate message that will 5515 * run later on its thread. This ensures this context unwinds and 5516 * we don't recursively unregister this bdev again if the event_cb 5517 * immediately closes its descriptor. 5518 */ 5519 desc->refs++; 5520 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 5521 pthread_mutex_unlock(&desc->mutex); 5522 } 5523 5524 /* If there are no descriptors, proceed removing the bdev */ 5525 if (rc == 0) { 5526 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 5527 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 5528 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 5529 } 5530 5531 return rc; 5532 } 5533 5534 void 5535 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 5536 { 5537 struct spdk_thread *thread; 5538 int rc; 5539 5540 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 5541 5542 thread = spdk_get_thread(); 5543 if (!thread) { 5544 /* The user called this from a non-SPDK thread. */ 5545 if (cb_fn != NULL) { 5546 cb_fn(cb_arg, -ENOTSUP); 5547 } 5548 return; 5549 } 5550 5551 pthread_mutex_lock(&g_bdev_mgr.mutex); 5552 pthread_mutex_lock(&bdev->internal.mutex); 5553 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 5554 pthread_mutex_unlock(&bdev->internal.mutex); 5555 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5556 if (cb_fn) { 5557 cb_fn(cb_arg, -EBUSY); 5558 } 5559 return; 5560 } 5561 5562 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 5563 bdev->internal.unregister_cb = cb_fn; 5564 bdev->internal.unregister_ctx = cb_arg; 5565 5566 /* Call under lock. 
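	 * bdev_unregister_unsafe() must be called with bdev->internal.mutex held;
	 * both it and g_bdev_mgr.mutex are held here and dropped right after it
	 * returns.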
*/ 5567 rc = bdev_unregister_unsafe(bdev); 5568 pthread_mutex_unlock(&bdev->internal.mutex); 5569 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5570 5571 if (rc == 0) { 5572 bdev_fini(bdev); 5573 } 5574 } 5575 5576 static void 5577 bdev_dummy_event_cb(void *remove_ctx) 5578 { 5579 SPDK_DEBUGLOG(bdev, "Bdev remove event received with no remove callback specified"); 5580 } 5581 5582 static int 5583 bdev_start_qos(struct spdk_bdev *bdev) 5584 { 5585 struct set_qos_limit_ctx *ctx; 5586 5587 /* Enable QoS */ 5588 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 5589 ctx = calloc(1, sizeof(*ctx)); 5590 if (ctx == NULL) { 5591 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 5592 return -ENOMEM; 5593 } 5594 ctx->bdev = bdev; 5595 spdk_for_each_channel(__bdev_to_io_dev(bdev), 5596 bdev_enable_qos_msg, ctx, 5597 bdev_enable_qos_done); 5598 } 5599 5600 return 0; 5601 } 5602 5603 static int 5604 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 5605 { 5606 struct spdk_thread *thread; 5607 int rc = 0; 5608 5609 thread = spdk_get_thread(); 5610 if (!thread) { 5611 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 5612 return -ENOTSUP; 5613 } 5614 5615 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 5616 spdk_get_thread()); 5617 5618 desc->bdev = bdev; 5619 desc->thread = thread; 5620 desc->write = write; 5621 5622 pthread_mutex_lock(&bdev->internal.mutex); 5623 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 5624 pthread_mutex_unlock(&bdev->internal.mutex); 5625 return -ENODEV; 5626 } 5627 5628 if (write && bdev->internal.claim_module) { 5629 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 5630 bdev->name, bdev->internal.claim_module->name); 5631 pthread_mutex_unlock(&bdev->internal.mutex); 5632 return -EPERM; 5633 } 5634 5635 rc = bdev_start_qos(bdev); 5636 if (rc != 0) { 5637 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 5638 pthread_mutex_unlock(&bdev->internal.mutex); 5639 return rc; 5640 } 5641 5642 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 5643 5644 pthread_mutex_unlock(&bdev->internal.mutex); 5645 5646 return 0; 5647 } 5648 5649 int 5650 spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb, 5651 void *remove_ctx, struct spdk_bdev_desc **_desc) 5652 { 5653 struct spdk_bdev_desc *desc; 5654 int rc; 5655 5656 desc = calloc(1, sizeof(*desc)); 5657 if (desc == NULL) { 5658 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 5659 return -ENOMEM; 5660 } 5661 5662 if (remove_cb == NULL) { 5663 remove_cb = bdev_dummy_event_cb; 5664 } 5665 5666 TAILQ_INIT(&desc->pending_media_events); 5667 TAILQ_INIT(&desc->free_media_events); 5668 5669 desc->callback.open_with_ext = false; 5670 desc->callback.remove_fn = remove_cb; 5671 desc->callback.ctx = remove_ctx; 5672 pthread_mutex_init(&desc->mutex, NULL); 5673 5674 pthread_mutex_lock(&g_bdev_mgr.mutex); 5675 5676 rc = bdev_open(bdev, write, desc); 5677 if (rc != 0) { 5678 bdev_desc_free(desc); 5679 desc = NULL; 5680 } 5681 5682 *_desc = desc; 5683 5684 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5685 5686 return rc; 5687 } 5688 5689 int 5690 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 5691 void *event_ctx, struct spdk_bdev_desc **_desc) 5692 { 5693 struct spdk_bdev_desc *desc; 5694 struct spdk_bdev *bdev; 5695 unsigned int event_id; 5696 int rc; 5697 5698 if (event_cb == NULL) { 5699 SPDK_ERRLOG("Missing event callback function\n"); 
5700 return -EINVAL; 5701 } 5702 5703 pthread_mutex_lock(&g_bdev_mgr.mutex); 5704 5705 bdev = spdk_bdev_get_by_name(bdev_name); 5706 5707 if (bdev == NULL) { 5708 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 5709 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5710 return -ENODEV; 5711 } 5712 5713 desc = calloc(1, sizeof(*desc)); 5714 if (desc == NULL) { 5715 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 5716 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5717 return -ENOMEM; 5718 } 5719 5720 TAILQ_INIT(&desc->pending_media_events); 5721 TAILQ_INIT(&desc->free_media_events); 5722 5723 desc->callback.open_with_ext = true; 5724 desc->callback.event_fn = event_cb; 5725 desc->callback.ctx = event_ctx; 5726 pthread_mutex_init(&desc->mutex, NULL); 5727 5728 if (bdev->media_events) { 5729 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 5730 sizeof(*desc->media_events_buffer)); 5731 if (desc->media_events_buffer == NULL) { 5732 SPDK_ERRLOG("Failed to initialize media event pool\n"); 5733 bdev_desc_free(desc); 5734 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5735 return -ENOMEM; 5736 } 5737 5738 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 5739 TAILQ_INSERT_TAIL(&desc->free_media_events, 5740 &desc->media_events_buffer[event_id], tailq); 5741 } 5742 } 5743 5744 rc = bdev_open(bdev, write, desc); 5745 if (rc != 0) { 5746 bdev_desc_free(desc); 5747 desc = NULL; 5748 } 5749 5750 *_desc = desc; 5751 5752 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5753 5754 return rc; 5755 } 5756 5757 void 5758 spdk_bdev_close(struct spdk_bdev_desc *desc) 5759 { 5760 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5761 int rc; 5762 5763 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 5764 spdk_get_thread()); 5765 5766 assert(desc->thread == spdk_get_thread()); 5767 5768 spdk_poller_unregister(&desc->io_timeout_poller); 5769 5770 pthread_mutex_lock(&bdev->internal.mutex); 5771 pthread_mutex_lock(&desc->mutex); 5772 5773 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 5774 5775 desc->closed = true; 5776 5777 if (0 == desc->refs) { 5778 pthread_mutex_unlock(&desc->mutex); 5779 bdev_desc_free(desc); 5780 } else { 5781 pthread_mutex_unlock(&desc->mutex); 5782 } 5783 5784 /* If no more descriptors, kill QoS channel */ 5785 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 5786 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 5787 bdev->name, spdk_get_thread()); 5788 5789 if (bdev_qos_destroy(bdev)) { 5790 /* There isn't anything we can do to recover here. Just let the 5791 * old QoS poller keep running. The QoS handling won't change 5792 * cores when the user allocates a new channel, but it won't break. */ 5793 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 5794 } 5795 } 5796 5797 spdk_bdev_set_qd_sampling_period(bdev, 0); 5798 5799 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 5800 rc = bdev_unregister_unsafe(bdev); 5801 pthread_mutex_unlock(&bdev->internal.mutex); 5802 5803 if (rc == 0) { 5804 bdev_fini(bdev); 5805 } 5806 } else { 5807 pthread_mutex_unlock(&bdev->internal.mutex); 5808 } 5809 } 5810 5811 int 5812 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 5813 struct spdk_bdev_module *module) 5814 { 5815 if (bdev->internal.claim_module != NULL) { 5816 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 5817 bdev->internal.claim_module->name); 5818 return -EPERM; 5819 } 5820 5821 if (desc && !desc->write) { 5822 desc->write = true; 5823 } 5824 5825 bdev->internal.claim_module = module; 5826 return 0; 5827 } 5828 5829 void 5830 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 5831 { 5832 assert(bdev->internal.claim_module != NULL); 5833 bdev->internal.claim_module = NULL; 5834 } 5835 5836 struct spdk_bdev * 5837 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 5838 { 5839 assert(desc != NULL); 5840 return desc->bdev; 5841 } 5842 5843 void 5844 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 5845 { 5846 struct iovec *iovs; 5847 int iovcnt; 5848 5849 if (bdev_io == NULL) { 5850 return; 5851 } 5852 5853 switch (bdev_io->type) { 5854 case SPDK_BDEV_IO_TYPE_READ: 5855 case SPDK_BDEV_IO_TYPE_WRITE: 5856 case SPDK_BDEV_IO_TYPE_ZCOPY: 5857 iovs = bdev_io->u.bdev.iovs; 5858 iovcnt = bdev_io->u.bdev.iovcnt; 5859 break; 5860 default: 5861 iovs = NULL; 5862 iovcnt = 0; 5863 break; 5864 } 5865 5866 if (iovp) { 5867 *iovp = iovs; 5868 } 5869 if (iovcntp) { 5870 *iovcntp = iovcnt; 5871 } 5872 } 5873 5874 void * 5875 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 5876 { 5877 if (bdev_io == NULL) { 5878 return NULL; 5879 } 5880 5881 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 5882 return NULL; 5883 } 5884 5885 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 5886 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 5887 return bdev_io->u.bdev.md_buf; 5888 } 5889 5890 return NULL; 5891 } 5892 5893 void * 5894 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 5895 { 5896 if (bdev_io == NULL) { 5897 assert(false); 5898 return NULL; 5899 } 5900 5901 return bdev_io->internal.caller_ctx; 5902 } 5903 5904 void 5905 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 5906 { 5907 5908 if (spdk_bdev_module_list_find(bdev_module->name)) { 5909 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 5910 assert(false); 5911 } 5912 5913 /* 5914 * Modules with examine callbacks must be initialized first, so they are 5915 * ready to handle examine callbacks from later modules that will 5916 * register physical bdevs. 
5917 */ 5918 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 5919 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 5920 } else { 5921 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 5922 } 5923 } 5924 5925 struct spdk_bdev_module * 5926 spdk_bdev_module_list_find(const char *name) 5927 { 5928 struct spdk_bdev_module *bdev_module; 5929 5930 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 5931 if (strcmp(name, bdev_module->name) == 0) { 5932 break; 5933 } 5934 } 5935 5936 return bdev_module; 5937 } 5938 5939 static void 5940 bdev_write_zero_buffer_next(void *_bdev_io) 5941 { 5942 struct spdk_bdev_io *bdev_io = _bdev_io; 5943 uint64_t num_bytes, num_blocks; 5944 void *md_buf = NULL; 5945 int rc; 5946 5947 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 5948 bdev_io->u.bdev.split_remaining_num_blocks, 5949 ZERO_BUFFER_SIZE); 5950 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 5951 5952 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 5953 md_buf = (char *)g_bdev_mgr.zero_buffer + 5954 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 5955 } 5956 5957 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 5958 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5959 g_bdev_mgr.zero_buffer, md_buf, 5960 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 5961 bdev_write_zero_buffer_done, bdev_io); 5962 if (rc == 0) { 5963 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 5964 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 5965 } else if (rc == -ENOMEM) { 5966 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 5967 } else { 5968 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5969 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5970 } 5971 } 5972 5973 static void 5974 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5975 { 5976 struct spdk_bdev_io *parent_io = cb_arg; 5977 5978 spdk_bdev_free_io(bdev_io); 5979 5980 if (!success) { 5981 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5982 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5983 return; 5984 } 5985 5986 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 5987 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5988 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5989 return; 5990 } 5991 5992 bdev_write_zero_buffer_next(parent_io); 5993 } 5994 5995 static void 5996 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 5997 { 5998 pthread_mutex_lock(&ctx->bdev->internal.mutex); 5999 ctx->bdev->internal.qos_mod_in_progress = false; 6000 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6001 6002 if (ctx->cb_fn) { 6003 ctx->cb_fn(ctx->cb_arg, status); 6004 } 6005 free(ctx); 6006 } 6007 6008 static void 6009 bdev_disable_qos_done(void *cb_arg) 6010 { 6011 struct set_qos_limit_ctx *ctx = cb_arg; 6012 struct spdk_bdev *bdev = ctx->bdev; 6013 struct spdk_bdev_io *bdev_io; 6014 struct spdk_bdev_qos *qos; 6015 6016 pthread_mutex_lock(&bdev->internal.mutex); 6017 qos = bdev->internal.qos; 6018 bdev->internal.qos = NULL; 6019 pthread_mutex_unlock(&bdev->internal.mutex); 6020 6021 while (!TAILQ_EMPTY(&qos->queued)) { 6022 /* Send queued I/O back to their original thread for resubmission. 
*/ 6023 bdev_io = TAILQ_FIRST(&qos->queued); 6024 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 6025 6026 if (bdev_io->internal.io_submit_ch) { 6027 /* 6028 * Channel was changed when sending it to the QoS thread - change it back 6029 * before sending it back to the original thread. 6030 */ 6031 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 6032 bdev_io->internal.io_submit_ch = NULL; 6033 } 6034 6035 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6036 _bdev_io_submit, bdev_io); 6037 } 6038 6039 if (qos->thread != NULL) { 6040 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 6041 spdk_poller_unregister(&qos->poller); 6042 } 6043 6044 free(qos); 6045 6046 bdev_set_qos_limit_done(ctx, 0); 6047 } 6048 6049 static void 6050 bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) 6051 { 6052 void *io_device = spdk_io_channel_iter_get_io_device(i); 6053 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 6054 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6055 struct spdk_thread *thread; 6056 6057 pthread_mutex_lock(&bdev->internal.mutex); 6058 thread = bdev->internal.qos->thread; 6059 pthread_mutex_unlock(&bdev->internal.mutex); 6060 6061 if (thread != NULL) { 6062 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 6063 } else { 6064 bdev_disable_qos_done(ctx); 6065 } 6066 } 6067 6068 static void 6069 bdev_disable_qos_msg(struct spdk_io_channel_iter *i) 6070 { 6071 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 6072 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 6073 6074 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 6075 6076 spdk_for_each_channel_continue(i, 0); 6077 } 6078 6079 static void 6080 bdev_update_qos_rate_limit_msg(void *cb_arg) 6081 { 6082 struct set_qos_limit_ctx *ctx = cb_arg; 6083 struct spdk_bdev *bdev = ctx->bdev; 6084 6085 pthread_mutex_lock(&bdev->internal.mutex); 6086 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 6087 pthread_mutex_unlock(&bdev->internal.mutex); 6088 6089 bdev_set_qos_limit_done(ctx, 0); 6090 } 6091 6092 static void 6093 bdev_enable_qos_msg(struct spdk_io_channel_iter *i) 6094 { 6095 void *io_device = spdk_io_channel_iter_get_io_device(i); 6096 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 6097 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 6098 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 6099 6100 pthread_mutex_lock(&bdev->internal.mutex); 6101 bdev_enable_qos(bdev, bdev_ch); 6102 pthread_mutex_unlock(&bdev->internal.mutex); 6103 spdk_for_each_channel_continue(i, 0); 6104 } 6105 6106 static void 6107 bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) 6108 { 6109 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6110 6111 bdev_set_qos_limit_done(ctx, status); 6112 } 6113 6114 static void 6115 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 6116 { 6117 int i; 6118 6119 assert(bdev->internal.qos != NULL); 6120 6121 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6122 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 6123 bdev->internal.qos->rate_limits[i].limit = limits[i]; 6124 6125 if (limits[i] == 0) { 6126 bdev->internal.qos->rate_limits[i].limit = 6127 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 6128 } 6129 } 6130 } 6131 } 6132 6133 void 6134 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 6135 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 6136 { 6137 struct set_qos_limit_ctx *ctx; 6138 uint32_t 
limit_set_complement; 6139 uint64_t min_limit_per_sec; 6140 int i; 6141 bool disable_rate_limit = true; 6142 6143 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6144 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 6145 continue; 6146 } 6147 6148 if (limits[i] > 0) { 6149 disable_rate_limit = false; 6150 } 6151 6152 if (bdev_qos_is_iops_rate_limit(i) == true) { 6153 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 6154 } else { 6155 /* Change from megabyte to byte rate limit */ 6156 limits[i] = limits[i] * 1024 * 1024; 6157 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 6158 } 6159 6160 limit_set_complement = limits[i] % min_limit_per_sec; 6161 if (limit_set_complement) { 6162 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 6163 limits[i], min_limit_per_sec); 6164 limits[i] += min_limit_per_sec - limit_set_complement; 6165 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 6166 } 6167 } 6168 6169 ctx = calloc(1, sizeof(*ctx)); 6170 if (ctx == NULL) { 6171 cb_fn(cb_arg, -ENOMEM); 6172 return; 6173 } 6174 6175 ctx->cb_fn = cb_fn; 6176 ctx->cb_arg = cb_arg; 6177 ctx->bdev = bdev; 6178 6179 pthread_mutex_lock(&bdev->internal.mutex); 6180 if (bdev->internal.qos_mod_in_progress) { 6181 pthread_mutex_unlock(&bdev->internal.mutex); 6182 free(ctx); 6183 cb_fn(cb_arg, -EAGAIN); 6184 return; 6185 } 6186 bdev->internal.qos_mod_in_progress = true; 6187 6188 if (disable_rate_limit == true && bdev->internal.qos) { 6189 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6190 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 6191 (bdev->internal.qos->rate_limits[i].limit > 0 && 6192 bdev->internal.qos->rate_limits[i].limit != 6193 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 6194 disable_rate_limit = false; 6195 break; 6196 } 6197 } 6198 } 6199 6200 if (disable_rate_limit == false) { 6201 if (bdev->internal.qos == NULL) { 6202 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 6203 if (!bdev->internal.qos) { 6204 pthread_mutex_unlock(&bdev->internal.mutex); 6205 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 6206 bdev_set_qos_limit_done(ctx, -ENOMEM); 6207 return; 6208 } 6209 } 6210 6211 if (bdev->internal.qos->thread == NULL) { 6212 /* Enabling */ 6213 bdev_set_qos_rate_limits(bdev, limits); 6214 6215 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6216 bdev_enable_qos_msg, ctx, 6217 bdev_enable_qos_done); 6218 } else { 6219 /* Updating */ 6220 bdev_set_qos_rate_limits(bdev, limits); 6221 6222 spdk_thread_send_msg(bdev->internal.qos->thread, 6223 bdev_update_qos_rate_limit_msg, ctx); 6224 } 6225 } else { 6226 if (bdev->internal.qos != NULL) { 6227 bdev_set_qos_rate_limits(bdev, limits); 6228 6229 /* Disabling */ 6230 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6231 bdev_disable_qos_msg, ctx, 6232 bdev_disable_qos_msg_done); 6233 } else { 6234 pthread_mutex_unlock(&bdev->internal.mutex); 6235 bdev_set_qos_limit_done(ctx, 0); 6236 return; 6237 } 6238 } 6239 6240 pthread_mutex_unlock(&bdev->internal.mutex); 6241 } 6242 6243 struct spdk_bdev_histogram_ctx { 6244 spdk_bdev_histogram_status_cb cb_fn; 6245 void *cb_arg; 6246 struct spdk_bdev *bdev; 6247 int status; 6248 }; 6249 6250 static void 6251 bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status) 6252 { 6253 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6254 6255 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6256 ctx->bdev->internal.histogram_in_progress = false; 6257 
pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6258 ctx->cb_fn(ctx->cb_arg, ctx->status); 6259 free(ctx); 6260 } 6261 6262 static void 6263 bdev_histogram_disable_channel(struct spdk_io_channel_iter *i) 6264 { 6265 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6266 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6267 6268 if (ch->histogram != NULL) { 6269 spdk_histogram_data_free(ch->histogram); 6270 ch->histogram = NULL; 6271 } 6272 spdk_for_each_channel_continue(i, 0); 6273 } 6274 6275 static void 6276 bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status) 6277 { 6278 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6279 6280 if (status != 0) { 6281 ctx->status = status; 6282 ctx->bdev->internal.histogram_enabled = false; 6283 spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), bdev_histogram_disable_channel, ctx, 6284 bdev_histogram_disable_channel_cb); 6285 } else { 6286 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6287 ctx->bdev->internal.histogram_in_progress = false; 6288 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6289 ctx->cb_fn(ctx->cb_arg, ctx->status); 6290 free(ctx); 6291 } 6292 } 6293 6294 static void 6295 bdev_histogram_enable_channel(struct spdk_io_channel_iter *i) 6296 { 6297 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6298 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6299 int status = 0; 6300 6301 if (ch->histogram == NULL) { 6302 ch->histogram = spdk_histogram_data_alloc(); 6303 if (ch->histogram == NULL) { 6304 status = -ENOMEM; 6305 } 6306 } 6307 6308 spdk_for_each_channel_continue(i, status); 6309 } 6310 6311 void 6312 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 6313 void *cb_arg, bool enable) 6314 { 6315 struct spdk_bdev_histogram_ctx *ctx; 6316 6317 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 6318 if (ctx == NULL) { 6319 cb_fn(cb_arg, -ENOMEM); 6320 return; 6321 } 6322 6323 ctx->bdev = bdev; 6324 ctx->status = 0; 6325 ctx->cb_fn = cb_fn; 6326 ctx->cb_arg = cb_arg; 6327 6328 pthread_mutex_lock(&bdev->internal.mutex); 6329 if (bdev->internal.histogram_in_progress) { 6330 pthread_mutex_unlock(&bdev->internal.mutex); 6331 free(ctx); 6332 cb_fn(cb_arg, -EAGAIN); 6333 return; 6334 } 6335 6336 bdev->internal.histogram_in_progress = true; 6337 pthread_mutex_unlock(&bdev->internal.mutex); 6338 6339 bdev->internal.histogram_enabled = enable; 6340 6341 if (enable) { 6342 /* Allocate histogram for each channel */ 6343 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_enable_channel, ctx, 6344 bdev_histogram_enable_channel_cb); 6345 } else { 6346 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_disable_channel, ctx, 6347 bdev_histogram_disable_channel_cb); 6348 } 6349 } 6350 6351 struct spdk_bdev_histogram_data_ctx { 6352 spdk_bdev_histogram_data_cb cb_fn; 6353 void *cb_arg; 6354 struct spdk_bdev *bdev; 6355 /** merged histogram data from all channels */ 6356 struct spdk_histogram_data *histogram; 6357 }; 6358 6359 static void 6360 bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status) 6361 { 6362 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6363 6364 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 6365 free(ctx); 6366 } 6367 6368 static void 6369 bdev_histogram_get_channel(struct spdk_io_channel_iter *i) 6370 { 6371 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6372 struct spdk_bdev_channel *ch = 
spdk_io_channel_get_ctx(_ch); 6373 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6374 int status = 0; 6375 6376 if (ch->histogram == NULL) { 6377 status = -EFAULT; 6378 } else { 6379 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 6380 } 6381 6382 spdk_for_each_channel_continue(i, status); 6383 } 6384 6385 void 6386 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 6387 spdk_bdev_histogram_data_cb cb_fn, 6388 void *cb_arg) 6389 { 6390 struct spdk_bdev_histogram_data_ctx *ctx; 6391 6392 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 6393 if (ctx == NULL) { 6394 cb_fn(cb_arg, -ENOMEM, NULL); 6395 return; 6396 } 6397 6398 ctx->bdev = bdev; 6399 ctx->cb_fn = cb_fn; 6400 ctx->cb_arg = cb_arg; 6401 6402 ctx->histogram = histogram; 6403 6404 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_get_channel, ctx, 6405 bdev_histogram_get_channel_cb); 6406 } 6407 6408 size_t 6409 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 6410 size_t max_events) 6411 { 6412 struct media_event_entry *entry; 6413 size_t num_events = 0; 6414 6415 for (; num_events < max_events; ++num_events) { 6416 entry = TAILQ_FIRST(&desc->pending_media_events); 6417 if (entry == NULL) { 6418 break; 6419 } 6420 6421 events[num_events] = entry->event; 6422 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 6423 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 6424 } 6425 6426 return num_events; 6427 } 6428 6429 int 6430 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 6431 size_t num_events) 6432 { 6433 struct spdk_bdev_desc *desc; 6434 struct media_event_entry *entry; 6435 size_t event_id; 6436 int rc = 0; 6437 6438 assert(bdev->media_events); 6439 6440 pthread_mutex_lock(&bdev->internal.mutex); 6441 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 6442 if (desc->write) { 6443 break; 6444 } 6445 } 6446 6447 if (desc == NULL || desc->media_events_buffer == NULL) { 6448 rc = -ENODEV; 6449 goto out; 6450 } 6451 6452 for (event_id = 0; event_id < num_events; ++event_id) { 6453 entry = TAILQ_FIRST(&desc->free_media_events); 6454 if (entry == NULL) { 6455 break; 6456 } 6457 6458 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 6459 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 6460 entry->event = events[event_id]; 6461 } 6462 6463 rc = event_id; 6464 out: 6465 pthread_mutex_unlock(&bdev->internal.mutex); 6466 return rc; 6467 } 6468 6469 void 6470 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 6471 { 6472 struct spdk_bdev_desc *desc; 6473 6474 pthread_mutex_lock(&bdev->internal.mutex); 6475 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 6476 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 6477 desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, 6478 desc->callback.ctx); 6479 } 6480 } 6481 pthread_mutex_unlock(&bdev->internal.mutex); 6482 } 6483 6484 struct locked_lba_range_ctx { 6485 struct lba_range range; 6486 struct spdk_bdev *bdev; 6487 struct lba_range *current_range; 6488 struct lba_range *owner_range; 6489 struct spdk_poller *poller; 6490 lock_range_cb cb_fn; 6491 void *cb_arg; 6492 }; 6493 6494 static void 6495 bdev_lock_error_cleanup_cb(struct spdk_io_channel_iter *i, int status) 6496 { 6497 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6498 6499 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 6500 free(ctx); 6501 } 6502 6503 static void 6504 
bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i); 6505 6506 static void 6507 bdev_lock_lba_range_cb(struct spdk_io_channel_iter *i, int status) 6508 { 6509 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6510 struct spdk_bdev *bdev = ctx->bdev; 6511 6512 if (status == -ENOMEM) { 6513 /* One of the channels could not allocate a range object. 6514 * So we have to go back and clean up any ranges that were 6515 * allocated successfully before we return error status to 6516 * the caller. We can reuse the unlock function to do that 6517 * clean up. 6518 */ 6519 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6520 bdev_unlock_lba_range_get_channel, ctx, 6521 bdev_lock_error_cleanup_cb); 6522 return; 6523 } 6524 6525 /* All channels have locked this range and no I/O overlapping the range 6526 * are outstanding! Set the owner_ch for the range object for the 6527 * locking channel, so that this channel will know that it is allowed 6528 * to write to this range. 6529 */ 6530 ctx->owner_range->owner_ch = ctx->range.owner_ch; 6531 ctx->cb_fn(ctx->cb_arg, status); 6532 6533 /* Don't free the ctx here. Its range is in the bdev's global list of 6534 * locked ranges still, and will be removed and freed when this range 6535 * is later unlocked. 6536 */ 6537 } 6538 6539 static int 6540 bdev_lock_lba_range_check_io(void *_i) 6541 { 6542 struct spdk_io_channel_iter *i = _i; 6543 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6544 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6545 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6546 struct lba_range *range = ctx->current_range; 6547 struct spdk_bdev_io *bdev_io; 6548 6549 spdk_poller_unregister(&ctx->poller); 6550 6551 /* The range is now in the locked_ranges, so no new IO can be submitted to this 6552 * range. But we need to wait until any outstanding IO overlapping with this range 6553 * are completed. 6554 */ 6555 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 6556 if (bdev_io_range_is_locked(bdev_io, range)) { 6557 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 6558 return SPDK_POLLER_BUSY; 6559 } 6560 } 6561 6562 spdk_for_each_channel_continue(i, 0); 6563 return SPDK_POLLER_BUSY; 6564 } 6565 6566 static void 6567 bdev_lock_lba_range_get_channel(struct spdk_io_channel_iter *i) 6568 { 6569 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6570 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6571 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6572 struct lba_range *range; 6573 6574 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 6575 if (range->length == ctx->range.length && 6576 range->offset == ctx->range.offset && 6577 range->locked_ctx == ctx->range.locked_ctx) { 6578 /* This range already exists on this channel, so don't add 6579 * it again. This can happen when a new channel is created 6580 * while the for_each_channel operation is in progress. 6581 * Do not check for outstanding I/O in that case, since the 6582 * range was locked before any I/O could be submitted to the 6583 * new channel. 
6584 */ 6585 spdk_for_each_channel_continue(i, 0); 6586 return; 6587 } 6588 } 6589 6590 range = calloc(1, sizeof(*range)); 6591 if (range == NULL) { 6592 spdk_for_each_channel_continue(i, -ENOMEM); 6593 return; 6594 } 6595 6596 range->length = ctx->range.length; 6597 range->offset = ctx->range.offset; 6598 range->locked_ctx = ctx->range.locked_ctx; 6599 ctx->current_range = range; 6600 if (ctx->range.owner_ch == ch) { 6601 /* This is the range object for the channel that will hold 6602 * the lock. Store it in the ctx object so that we can easily 6603 * set its owner_ch after the lock is finally acquired. 6604 */ 6605 ctx->owner_range = range; 6606 } 6607 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 6608 bdev_lock_lba_range_check_io(i); 6609 } 6610 6611 static void 6612 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 6613 { 6614 assert(spdk_get_thread() == ctx->range.owner_ch->channel->thread); 6615 6616 /* We will add a copy of this range to each channel now. */ 6617 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_lock_lba_range_get_channel, ctx, 6618 bdev_lock_lba_range_cb); 6619 } 6620 6621 static bool 6622 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 6623 { 6624 struct lba_range *r; 6625 6626 TAILQ_FOREACH(r, tailq, tailq) { 6627 if (bdev_lba_range_overlapped(range, r)) { 6628 return true; 6629 } 6630 } 6631 return false; 6632 } 6633 6634 static int 6635 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 6636 uint64_t offset, uint64_t length, 6637 lock_range_cb cb_fn, void *cb_arg) 6638 { 6639 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6640 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6641 struct locked_lba_range_ctx *ctx; 6642 6643 if (cb_arg == NULL) { 6644 SPDK_ERRLOG("cb_arg must not be NULL\n"); 6645 return -EINVAL; 6646 } 6647 6648 ctx = calloc(1, sizeof(*ctx)); 6649 if (ctx == NULL) { 6650 return -ENOMEM; 6651 } 6652 6653 ctx->range.offset = offset; 6654 ctx->range.length = length; 6655 ctx->range.owner_ch = ch; 6656 ctx->range.locked_ctx = cb_arg; 6657 ctx->bdev = bdev; 6658 ctx->cb_fn = cb_fn; 6659 ctx->cb_arg = cb_arg; 6660 6661 pthread_mutex_lock(&bdev->internal.mutex); 6662 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 6663 /* There is an active lock overlapping with this range. 6664 * Put it on the pending list until this range no 6665 * longer overlaps with another. 6666 */ 6667 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 6668 } else { 6669 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 6670 bdev_lock_lba_range_ctx(bdev, ctx); 6671 } 6672 pthread_mutex_unlock(&bdev->internal.mutex); 6673 return 0; 6674 } 6675 6676 static void 6677 bdev_lock_lba_range_ctx_msg(void *_ctx) 6678 { 6679 struct locked_lba_range_ctx *ctx = _ctx; 6680 6681 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 6682 } 6683 6684 static void 6685 bdev_unlock_lba_range_cb(struct spdk_io_channel_iter *i, int status) 6686 { 6687 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6688 struct locked_lba_range_ctx *pending_ctx; 6689 struct spdk_bdev_channel *ch = ctx->range.owner_ch; 6690 struct spdk_bdev *bdev = ch->bdev; 6691 struct lba_range *range, *tmp; 6692 6693 pthread_mutex_lock(&bdev->internal.mutex); 6694 /* Check if there are any pending locked ranges that overlap with this range 6695 * that was just unlocked. 
If there are, check that it doesn't overlap with any 6696 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 6697 * the lock process. 6698 */ 6699 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 6700 if (bdev_lba_range_overlapped(range, &ctx->range) && 6701 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 6702 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 6703 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 6704 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 6705 spdk_thread_send_msg(pending_ctx->range.owner_ch->channel->thread, 6706 bdev_lock_lba_range_ctx_msg, pending_ctx); 6707 } 6708 } 6709 pthread_mutex_unlock(&bdev->internal.mutex); 6710 6711 ctx->cb_fn(ctx->cb_arg, status); 6712 free(ctx); 6713 } 6714 6715 static void 6716 bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i) 6717 { 6718 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6719 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6720 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6721 TAILQ_HEAD(, spdk_bdev_io) io_locked; 6722 struct spdk_bdev_io *bdev_io; 6723 struct lba_range *range; 6724 6725 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 6726 if (ctx->range.offset == range->offset && 6727 ctx->range.length == range->length && 6728 ctx->range.locked_ctx == range->locked_ctx) { 6729 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 6730 free(range); 6731 break; 6732 } 6733 } 6734 6735 /* Note: we should almost always be able to assert that the range specified 6736 * was found. But there are some very rare corner cases where a new channel 6737 * gets created simultaneously with a range unlock, where this function 6738 * would execute on that new channel and wouldn't have the range. 6739 * We also use this to clean up range allocations when a later allocation 6740 * fails in the locking path. 6741 * So we can't actually assert() here. 6742 */ 6743 6744 /* Swap the locked IO into a temporary list, and then try to submit them again. 6745 * We could hyper-optimize this to only resubmit locked I/O that overlap 6746 * with the range that was just unlocked, but this isn't a performance path so 6747 * we go for simplicity here. 6748 */ 6749 TAILQ_INIT(&io_locked); 6750 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 6751 while (!TAILQ_EMPTY(&io_locked)) { 6752 bdev_io = TAILQ_FIRST(&io_locked); 6753 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 6754 bdev_io_submit(bdev_io); 6755 } 6756 6757 spdk_for_each_channel_continue(i, 0); 6758 } 6759 6760 static int 6761 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 6762 uint64_t offset, uint64_t length, 6763 lock_range_cb cb_fn, void *cb_arg) 6764 { 6765 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6766 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6767 struct locked_lba_range_ctx *ctx; 6768 struct lba_range *range; 6769 bool range_found = false; 6770 6771 /* Let's make sure the specified channel actually has a lock on 6772 * the specified range. Note that the range must match exactly. 
	 */
	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
			range_found = true;
			break;
		}
	}

	if (!range_found) {
		return -EINVAL;
	}

	pthread_mutex_lock(&bdev->internal.mutex);
	/* We confirmed that this channel has locked the specified range. To
	 * start the unlock process, we find the range in the bdev's locked_ranges
	 * and remove it. This ensures new channels don't inherit the locked range.
	 * Then we will send a message to each channel (including the one specified
	 * here) to remove the range from its per-channel list.
	 */
	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->locked_ctx == cb_arg) {
			break;
		}
	}
	if (range == NULL) {
		assert(false);
		pthread_mutex_unlock(&bdev->internal.mutex);
		return -EINVAL;
	}
	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
	pthread_mutex_unlock(&bdev->internal.mutex);

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unlock_lba_range_get_channel, ctx,
			      bdev_unlock_lba_range_cb);
	return 0;
}

SPDK_LOG_REGISTER_COMPONENT(bdev)

SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
	spdk_trace_register_owner(OWNER_BDEV, 'b');
	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
	spdk_trace_register_description("BDEV_IO_START", TRACE_BDEV_IO_START, OWNER_BDEV,
					OBJECT_BDEV_IO, 1, 0, "type: ");
	spdk_trace_register_description("BDEV_IO_DONE", TRACE_BDEV_IO_DONE, OWNER_BDEV,
					OBJECT_BDEV_IO, 0, 0, "");
}
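
/*
 * Illustrative sketch (not part of this file) of how a bdev module typically
 * drives the registration path above (bdev_init() and bdev_start(), reached
 * through spdk_bdev_register()) by embedding a struct spdk_bdev in its own
 * per-disk object. "struct my_bdev", my_bdev_fn_table, my_bdev_module and the
 * example values are hypothetical.
 *
 *	struct my_bdev {
 *		struct spdk_bdev	bdev;
 *		// module-private state would live here
 *	};
 *
 *	static void
 *	my_bdev_unregister_done(void *cb_arg, int bdeverrno)
 *	{
 *		// bdeverrno is 0 on success, or the destruct error reported via
 *		// bdev_destroy_cb()/spdk_bdev_destruct_done().
 *	}
 *
 *	static int
 *	my_bdev_create(struct my_bdev *mb)
 *	{
 *		mb->bdev.name = strdup("my_bdev0");
 *		mb->bdev.product_name = "Example Disk";
 *		mb->bdev.blocklen = 512;
 *		mb->bdev.blockcnt = 1024 * 1024;
 *		mb->bdev.fn_table = &my_bdev_fn_table;
 *		mb->bdev.module = &my_bdev_module;
 *		mb->bdev.ctxt = mb;
 *
 *		// On success the bdev is added to the global list and examined
 *		// by the registered bdev modules.
 *		return spdk_bdev_register(&mb->bdev);
 *	}
 *
 *	static void
 *	my_bdev_delete(struct my_bdev *mb)
 *	{
 *		// Must be called from an SPDK thread. Open descriptors are
 *		// notified of hotremoval first; the callback runs once the
 *		// module's destruct callback has completed.
 *		spdk_bdev_unregister(&mb->bdev, my_bdev_unregister_done, NULL);
 *	}
 */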
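
/*
 * Illustrative sketch (not used by this file) of the consumer-side open/close
 * API implemented above: spdk_bdev_open_ext() looks the bdev up by name,
 * registers an event callback, and returns a descriptor that must be closed
 * with spdk_bdev_close() on the same SPDK thread that opened it. The bdev
 * name "Malloc0" and the example_* names are just examples.
 *
 *	static struct spdk_bdev_desc *g_desc;
 *
 *	static void
 *	example_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
 *	{
 *		if (type == SPDK_BDEV_EVENT_REMOVE && g_desc != NULL) {
 *			// Runs on the opening thread; closing the descriptor lets a
 *			// pending spdk_bdev_unregister() complete.
 *			spdk_bdev_close(g_desc);
 *			g_desc = NULL;
 *		}
 *	}
 *
 *	static int
 *	example_open(void)
 *	{
 *		int rc;
 *
 *		rc = spdk_bdev_open_ext("Malloc0", true, example_event_cb, NULL, &g_desc);
 *		if (rc != 0) {
 *			// -ENODEV: no such bdev, -EPERM: claimed by a module,
 *			// -ENOTSUP: called from a non-SPDK thread.
 *			return rc;
 *		}
 *		// Per-thread I/O channels are then obtained with
 *		// spdk_bdev_get_io_channel(g_desc) before submitting I/O.
 *		return 0;
 *	}
 */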
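
/*
 * Illustrative sketch of spdk_bdev_set_qos_rate_limits() as implemented above.
 * The limits array is indexed by the rate-limit type enums from spdk/bdev.h
 * (assumed here to be SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT and friends). A value
 * of SPDK_BDEV_QOS_LIMIT_NOT_DEFINED leaves that limit unchanged, 0 disables
 * it, IOPS limits are rounded up to a multiple of 1000, and byte limits are
 * given in MB/s and converted to bytes internally.
 *
 *	static void
 *	qos_set_done(void *cb_arg, int status)
 *	{
 *		// status: 0 on success, -EAGAIN if another QoS change is already
 *		// in progress, -ENOMEM if the QoS object could not be allocated.
 *	}
 *
 *	static void
 *	qos_example(struct spdk_bdev *bdev)
 *	{
 *		uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
 *		int i;
 *
 *		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
 *			limits[i] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
 *		}
 *		limits[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT] = 10000;	// IOPS
 *		limits[SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT] = 100;		// MB/s
 *
 *		spdk_bdev_set_qos_rate_limits(bdev, limits, qos_set_done, NULL);
 *	}
 */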
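
/*
 * Illustrative sketch of the histogram API above: enabling allocates a
 * per-channel spdk_histogram_data, and spdk_bdev_histogram_get() merges all
 * per-channel data into a caller-provided histogram. The allocators used are
 * spdk_histogram_data_alloc()/spdk_histogram_data_free(), as elsewhere in
 * this file; the histogram_* helper names are hypothetical.
 *
 *	static void
 *	histogram_data_cb(void *cb_arg, int status, struct spdk_histogram_data *histogram)
 *	{
 *		if (status == 0) {
 *			// Inspect the merged data here.
 *		}
 *		spdk_histogram_data_free(histogram);
 *	}
 *
 *	static void
 *	histogram_status_cb(void *cb_arg, int status)
 *	{
 *		struct spdk_bdev *bdev = cb_arg;
 *		struct spdk_histogram_data *histogram;
 *
 *		if (status != 0) {
 *			return;
 *		}
 *
 *		histogram = spdk_histogram_data_alloc();
 *		if (histogram != NULL) {
 *			spdk_bdev_histogram_get(bdev, histogram, histogram_data_cb, NULL);
 *		}
 *	}
 *
 *	static void
 *	histogram_example(struct spdk_bdev *bdev)
 *	{
 *		spdk_bdev_histogram_enable(bdev, histogram_status_cb, bdev, true);
 *	}
 */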
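
/*
 * Illustrative sketch of the media-event path above: a producer queues events
 * with spdk_bdev_push_media_events() and then calls
 * spdk_bdev_notify_media_management(), which delivers
 * SPDK_BDEV_EVENT_MEDIA_MANAGEMENT to descriptors with events pending; the
 * consumer drains them with spdk_bdev_get_media_events(). The global
 * descriptor and the per-event handling are hypothetical.
 *
 *	static struct spdk_bdev_desc *g_media_desc;	// set when the bdev was opened
 *
 *	static void
 *	media_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
 *	{
 *		struct spdk_bdev_media_event events[32];
 *		size_t i, num_events;
 *
 *		if (type != SPDK_BDEV_EVENT_MEDIA_MANAGEMENT) {
 *			return;
 *		}
 *
 *		do {
 *			num_events = spdk_bdev_get_media_events(g_media_desc, events,
 *								SPDK_COUNTOF(events));
 *			for (i = 0; i < num_events; i++) {
 *				// Handle events[i]; see struct spdk_bdev_media_event
 *				// in spdk/bdev.h for the reported fields.
 *			}
 *		} while (num_events > 0);
 *	}
 */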