/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"
#include "spdk/conf.h"

#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"

#include "spdk/bdev_module.h"
#include "spdk_internal/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define NOMEM_THRESHOLD_COUNT			8
#define ZERO_BUFFER_SIZE			0x100000

#define OWNER_BDEV		0x2

#define OBJECT_BDEV_IO		0x2

#define TRACE_GROUP_BDEV	0x3
#define TRACE_BDEV_IO_START	SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x0)
#define TRACE_BDEV_IO_DONE	SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x1)

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

#define SPDK_BDEV_POOL_ALIGNMENT 512
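
/*
 * Human-readable names for each QoS rate limit type, in enum order (R/W IOPS,
 * R/W bandwidth, read bandwidth, write bandwidth). qos_conf_type holds the
 * legacy config-file key names; qos_rpc_type holds the JSON-RPC parameter
 * names used below in bdev_qos_config_json().
 */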
static const char *qos_conf_type[] = {"Limit_IOPS",
        "Limit_BPS", "Limit_Read_BPS", "Limit_Write_BPS"
};
static const char *qos_rpc_type[] = {"rw_ios_per_sec",
        "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
};

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

struct spdk_bdev_mgr {
        struct spdk_mempool *bdev_io_pool;

        struct spdk_mempool *buf_small_pool;
        struct spdk_mempool *buf_large_pool;

        void *zero_buffer;

        TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

        struct spdk_bdev_list bdevs;

        bool init_complete;
        bool module_init_complete;

        pthread_mutex_t mutex;

#ifdef SPDK_CONFIG_VTUNE
        __itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
        .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
        .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
        .init_complete = false,
        .module_init_complete = false,
        .mutex = PTHREAD_MUTEX_INITIALIZER,
};

typedef void (*lock_range_cb)(void *ctx, int status);

struct lba_range {
        uint64_t offset;
        uint64_t length;
        void *locked_ctx;
        struct spdk_bdev_channel *owner_ch;
        TAILQ_ENTRY(lba_range) tailq;
};

static struct spdk_bdev_opts g_bdev_opts = {
        .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
        .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
        .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
};

static spdk_bdev_init_cb g_init_cb_fn = NULL;
static void *g_init_cb_arg = NULL;

static spdk_bdev_fini_cb g_fini_cb_fn = NULL;
static void *g_fini_cb_arg = NULL;
static struct spdk_thread *g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
        /** IOs or bytes allowed per second (i.e., 1s). */
        uint64_t limit;

        /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
         *  For remaining bytes, allowed to run negative if an I/O is submitted when
         *  some bytes are remaining, but the I/O is bigger than that amount. The
         *  excess will be deducted from the next timeslice.
         */
        int64_t remaining_this_timeslice;

        /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
        uint32_t min_per_timeslice;

        /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
        uint32_t max_per_timeslice;

        /** Function to check whether to queue the IO. */
        bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

        /** Function to update for the submitted IO. */
        void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
        /** Rate limits for this bdev, one entry per limit type. */
        struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

        /** The channel that all I/O are funneled through. */
        struct spdk_bdev_channel *ch;

        /** The thread on which the poller is running. */
        struct spdk_thread *thread;

        /** Queue of I/O waiting to be issued. */
        bdev_io_tailq_t queued;

        /** Size of a timeslice in tsc ticks. */
        uint64_t timeslice_size;

        /** Timestamp of start of last timeslice. */
        uint64_t last_timeslice;

        /** Poller that processes queued I/O commands each time slice. */
        struct spdk_poller *poller;
};
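
/*
 * Per-thread management channel state shared by all bdev channels created on
 * that thread: the wait queues for data buffers, the per-thread bdev_io cache,
 * and the lists of shared resources and bdev_io waiters.
 */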
struct spdk_bdev_mgmt_channel {
        bdev_io_stailq_t need_buf_small;
        bdev_io_stailq_t need_buf_large;

        /*
         * Each thread keeps a cache of bdev_io - this allows
         * bdev threads which are *not* DPDK threads to still
         * benefit from a per-thread bdev_io cache.  Without
         * this, non-DPDK threads fetching from the mempool
         * incur a cmpxchg on get and put.
         */
        bdev_io_stailq_t per_thread_cache;
        uint32_t per_thread_cache_count;
        uint32_t bdev_io_cache_size;

        TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
        TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their I/O awaiting retry here, which makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
        /* The bdev management channel */
        struct spdk_bdev_mgmt_channel *mgmt_ch;

        /*
         * Count of I/O submitted to bdev module and waiting for completion.
         * Incremented before submit_request() is called on an spdk_bdev_io.
         */
        uint64_t io_outstanding;

        /*
         * Queue of IO awaiting retry because of a previous NOMEM status returned
         * on this channel.
         */
        bdev_io_tailq_t nomem_io;

        /*
         * Threshold which io_outstanding must drop to before retrying nomem_io.
         */
        uint64_t nomem_threshold;

        /* I/O channel allocated by a bdev module */
        struct spdk_io_channel *shared_ch;

        /* Refcount of bdev channels using this resource */
        uint32_t ref;

        TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)
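
/*
 * Per-thread channel for a single bdev: it wraps the module's own I/O channel,
 * tracks per-channel statistics and outstanding I/O, and holds the queues used
 * for resets and locked LBA ranges.
 */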
struct spdk_bdev_channel {
        struct spdk_bdev *bdev;

        /* The channel for the underlying device */
        struct spdk_io_channel *channel;

        /* Per io_device per thread data */
        struct spdk_bdev_shared_resource *shared_resource;

        struct spdk_bdev_io_stat stat;

        /*
         * Count of I/O submitted to the underlying dev module through this channel
         * and waiting for completion.
         */
        uint64_t io_outstanding;

        /*
         * List of all submitted I/Os including I/O that are generated via splitting.
         */
        bdev_io_tailq_t io_submitted;

        /*
         * List of spdk_bdev_io that are currently queued because they write to a locked
         * LBA range.
         */
        bdev_io_tailq_t io_locked;

        uint32_t flags;

        struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
        uint64_t start_tsc;
        uint64_t interval_tsc;
        __itt_string_handle *handle;
        struct spdk_bdev_io_stat prev_stat;
#endif

        bdev_io_tailq_t queued_resets;

        lba_range_tailq_t locked_ranges;
};

struct media_event_entry {
        struct spdk_bdev_media_event event;
        TAILQ_ENTRY(media_event_entry) tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
        struct spdk_bdev *bdev;
        struct spdk_thread *thread;
        struct {
                bool open_with_ext;
                union {
                        spdk_bdev_remove_cb_t remove_fn;
                        spdk_bdev_event_cb_t event_fn;
                };
                void *ctx;
        } callback;
        bool closed;
        bool write;
        pthread_mutex_t mutex;
        uint32_t refs;
        TAILQ_HEAD(, media_event_entry) pending_media_events;
        TAILQ_HEAD(, media_event_entry) free_media_events;
        struct media_event_entry *media_events_buffer;
        TAILQ_ENTRY(spdk_bdev_desc) link;

        uint64_t timeout_in_sec;
        spdk_bdev_io_timeout_cb cb_fn;
        void *cb_arg;
        struct spdk_poller *io_timeout_poller;
};

struct spdk_bdev_iostat_ctx {
        struct spdk_bdev_io_stat *stat;
        spdk_bdev_get_device_stat_cb cb;
        void *cb_arg;
};

struct set_qos_limit_ctx {
        void (*cb_fn)(void *cb_arg, int status);
        void *cb_arg;
        struct spdk_bdev *bdev;
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static void bdev_write_zero_buffer_next(void *_bdev_io);

static void bdev_enable_qos_msg(struct spdk_io_channel_iter *i);
static void bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status);

static int
bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
                          struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
                          uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg);
static int
bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
                           struct iovec *iov, int iovcnt, void *md_buf,
                           uint64_t offset_blocks, uint64_t num_blocks,
                           spdk_bdev_io_completion_cb cb, void *cb_arg);

static int
bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
                    uint64_t offset, uint64_t length,
                    lock_range_cb cb_fn, void *cb_arg);

static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
                      uint64_t offset, uint64_t length,
                      lock_range_cb cb_fn, void *cb_arg);

static inline void bdev_io_complete(void *ctx);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort);

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts)
{
        *opts = g_bdev_opts;
}
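
/*
 * Apply new global bdev options. The bdev_io pool must be able to populate
 * every thread's bdev_io cache, so bdev_io_pool_size has to be at least
 * bdev_io_cache_size * (spdk_thread_get_count() + 1); with the default cache
 * size of 256 and, for example, 4 threads that is 1280 entries.
 */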
int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
        uint32_t min_pool_size;

        /*
         * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
         * initialization.  A second mgmt_ch will be created on the same thread when the application starts
         * but before the deferred put_io_channel event is executed for the first mgmt_ch.
         */
        min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
        if (opts->bdev_io_pool_size < min_pool_size) {
                SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
                            " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
                            spdk_thread_get_count());
                SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
                return -1;
        }

        g_bdev_opts = *opts;
        return 0;
}

struct spdk_bdev_examine_item {
        char *name;
        TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
                        g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
        struct spdk_bdev_examine_item *item;
        TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
                if (strcmp(name, item->name) == 0) {
                        return true;
                }
        }
        return false;
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
        struct spdk_bdev_alias *tmp;
        if (bdev_examine_allowlist_check(bdev->name)) {
                return true;
        }
        TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
                if (bdev_examine_allowlist_check(tmp->alias)) {
                        return true;
                }
        }
        return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
        if (g_bdev_opts.bdev_auto_examine) {
                return true;
        } else {
                return bdev_in_examine_allowlist(bdev);
        }
}

static void
bdev_examine(struct spdk_bdev *bdev)
{
        struct spdk_bdev_module *module;
        uint32_t action;

        TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
                if (module->examine_config && bdev_ok_to_examine(bdev)) {
                        action = module->internal.action_in_progress;
                        module->internal.action_in_progress++;
                        module->examine_config(bdev);
                        if (action != module->internal.action_in_progress) {
                                SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n",
                                            module->name);
                        }
                }
        }

        if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) {
                if (bdev->internal.claim_module->examine_disk) {
                        bdev->internal.claim_module->internal.action_in_progress++;
                        bdev->internal.claim_module->examine_disk(bdev);
                }
                return;
        }

        TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
                if (module->examine_disk && bdev_ok_to_examine(bdev)) {
                        module->internal.action_in_progress++;
                        module->examine_disk(bdev);
                }
        }
}

int
spdk_bdev_examine(const char *name)
{
        struct spdk_bdev *bdev;
        struct spdk_bdev_examine_item *item;

        if (g_bdev_opts.bdev_auto_examine) {
                SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
                return -EINVAL;
        }

        if (bdev_examine_allowlist_check(name)) {
                SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
                return -EEXIST;
        }

        item = calloc(1, sizeof(*item));
        if (!item) {
                return -ENOMEM;
        }
        item->name = strdup(name);
        if (!item->name) {
                free(item);
                return -ENOMEM;
        }
        TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

        bdev = spdk_bdev_get_by_name(name);
        if (bdev) {
                bdev_examine(bdev);
        }
        return 0;
}
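
/*
 * Replay the manual-examine allowlist when the configuration is saved: one
 * bdev_examine RPC entry is emitted per allowlisted name.
 */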
static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
        struct spdk_bdev_examine_item *item;
        TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
                spdk_json_write_object_begin(w);
                spdk_json_write_named_string(w, "method", "bdev_examine");
                spdk_json_write_named_object_begin(w, "params");
                spdk_json_write_named_string(w, "name", item->name);
                spdk_json_write_object_end(w);
                spdk_json_write_object_end(w);
        }
}

struct spdk_bdev *
spdk_bdev_first(void)
{
        struct spdk_bdev *bdev;

        bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
        if (bdev) {
                SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
        }

        return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
        struct spdk_bdev *bdev;

        bdev = TAILQ_NEXT(prev, internal.link);
        if (bdev) {
                SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
        }

        return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
        while (bdev != NULL) {
                if (bdev->internal.claim_module == NULL) {
                        return bdev;
                } else {
                        bdev = TAILQ_NEXT(bdev, internal.link);
                }
        }

        return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
        struct spdk_bdev *bdev;

        bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

        if (bdev) {
                SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
        }

        return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
        struct spdk_bdev *bdev;

        bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

        if (bdev) {
                SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
        }

        return bdev;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
        struct spdk_bdev_alias *tmp;
        struct spdk_bdev *bdev = spdk_bdev_first();

        while (bdev != NULL) {
                if (strcmp(bdev_name, bdev->name) == 0) {
                        return bdev;
                }

                TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
                        if (strcmp(bdev_name, tmp->alias) == 0) {
                                return bdev;
                        }
                }

                bdev = spdk_bdev_next(bdev);
        }

        return NULL;
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
        struct iovec *iovs;

        if (bdev_io->u.bdev.iovs == NULL) {
                bdev_io->u.bdev.iovs = &bdev_io->iov;
                bdev_io->u.bdev.iovcnt = 1;
        }

        iovs = bdev_io->u.bdev.iovs;

        assert(iovs != NULL);
        assert(bdev_io->u.bdev.iovcnt >= 1);

        iovs[0].iov_base = buf;
        iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
        assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
        bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
        if (iovs == NULL) {
                return false;
        }

        return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
        int i;
        uintptr_t iov_base;

        if (spdk_likely(alignment == 1)) {
                return true;
        }

        for (i = 0; i < iovcnt; i++) {
                iov_base = (uintptr_t)iovs[i].iov_base;
                if ((iov_base & (alignment - 1)) != 0) {
                        return false;
                }
        }

        return true;
}
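
/*
 * Scatter/gather helpers for the bounce-buffer path: copy an iovec array into
 * a contiguous buffer (write path) and back out of it (read path). Copying
 * stops once either the iovecs or the buffer are exhausted.
 */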
static void
_copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt)
{
        int i;
        size_t len;

        for (i = 0; i < iovcnt; i++) {
                len = spdk_min(iovs[i].iov_len, buf_len);
                memcpy(buf, iovs[i].iov_base, len);
                buf += len;
                buf_len -= len;
        }
}

static void
_copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len)
{
        int i;
        size_t len;

        for (i = 0; i < iovcnt; i++) {
                len = spdk_min(iovs[i].iov_len, buf_len);
                memcpy(iovs[i].iov_base, buf, len);
                buf += len;
                buf_len -= len;
        }
}

static void
_bdev_io_set_bounce_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
        /* save original iovec */
        bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
        bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
        /* set bounce iov */
        bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
        bdev_io->u.bdev.iovcnt = 1;
        /* set bounce buffer for this operation */
        bdev_io->u.bdev.iovs[0].iov_base = buf;
        bdev_io->u.bdev.iovs[0].iov_len = len;
        /* if this is write path, copy data from original buffer to bounce buffer */
        if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
                _copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt);
        }
}

static void
_bdev_io_set_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
        /* save original md_buf */
        bdev_io->internal.orig_md_buf = bdev_io->u.bdev.md_buf;
        /* set bounce md_buf */
        bdev_io->u.bdev.md_buf = md_buf;

        if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
                memcpy(md_buf, bdev_io->internal.orig_md_buf, len);
        }
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, void *buf, bool status)
{
        struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);

        if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
                bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
                bdev_io->internal.get_aux_buf_cb = NULL;
        } else {
                assert(bdev_io->internal.get_buf_cb != NULL);
                bdev_io->internal.buf = buf;
                bdev_io->internal.get_buf_cb(ch, bdev_io, status);
                bdev_io->internal.get_buf_cb = NULL;
        }
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
        struct spdk_bdev *bdev = bdev_io->bdev;
        bool buf_allocated;
        uint64_t md_len, alignment;
        void *aligned_buf;

        if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
                bdev_io_get_buf_complete(bdev_io, buf, true);
                return;
        }

        alignment = spdk_bdev_get_buf_align(bdev);
        buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
        aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

        if (buf_allocated) {
                _bdev_io_set_bounce_buf(bdev_io, aligned_buf, len);
        } else {
                spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
        }

        if (spdk_bdev_is_md_separate(bdev)) {
                aligned_buf = (char *)aligned_buf + len;
                md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

                assert(((uintptr_t)aligned_buf & (alignment - 1)) == 0);

                if (bdev_io->u.bdev.md_buf != NULL) {
                        _bdev_io_set_bounce_md_buf(bdev_io, aligned_buf, md_len);
                } else {
                        spdk_bdev_io_set_md_buf(bdev_io, aligned_buf, md_len);
                }
        }
        bdev_io_get_buf_complete(bdev_io, buf, true);
}
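
/*
 * Return a data buffer to its pool once an I/O is done with it. If another
 * bdev_io is already waiting for a buffer of the same class (small or large),
 * hand the buffer straight to that waiter instead of going through the pool.
 */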
static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
        struct spdk_bdev *bdev = bdev_io->bdev;
        struct spdk_mempool *pool;
        struct spdk_bdev_io *tmp;
        bdev_io_stailq_t *stailq;
        struct spdk_bdev_mgmt_channel *ch;
        uint64_t md_len, alignment;

        md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
        alignment = spdk_bdev_get_buf_align(bdev);
        ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

        if (buf_len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
            SPDK_BDEV_POOL_ALIGNMENT) {
                pool = g_bdev_mgr.buf_small_pool;
                stailq = &ch->need_buf_small;
        } else {
                pool = g_bdev_mgr.buf_large_pool;
                stailq = &ch->need_buf_large;
        }

        if (STAILQ_EMPTY(stailq)) {
                spdk_mempool_put(pool, buf);
        } else {
                tmp = STAILQ_FIRST(stailq);
                STAILQ_REMOVE_HEAD(stailq, internal.buf_link);
                _bdev_io_set_buf(tmp, buf, tmp->internal.buf_len);
        }
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
        assert(bdev_io->internal.buf != NULL);
        _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
        bdev_io->internal.buf = NULL;
}

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
        uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

        assert(buf != NULL);
        _bdev_io_put_buf(bdev_io, buf, len);
}

static void
_bdev_io_unset_bounce_buf(struct spdk_bdev_io *bdev_io)
{
        if (spdk_likely(bdev_io->internal.orig_iovcnt == 0)) {
                assert(bdev_io->internal.orig_md_buf == NULL);
                return;
        }

        /* if this is read path, copy data from bounce buffer to original buffer */
        if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
            bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
                _copy_buf_to_iovs(bdev_io->internal.orig_iovs,
                                  bdev_io->internal.orig_iovcnt,
                                  bdev_io->internal.bounce_iov.iov_base,
                                  bdev_io->internal.bounce_iov.iov_len);
        }
        /* set original buffer for this io */
        bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
        bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
        /* disable bouncing buffer for this io */
        bdev_io->internal.orig_iovcnt = 0;
        bdev_io->internal.orig_iovs = NULL;

        /* do the same for metadata buffer */
        if (spdk_unlikely(bdev_io->internal.orig_md_buf != NULL)) {
                assert(spdk_bdev_is_md_separate(bdev_io->bdev));

                if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
                    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
                        memcpy(bdev_io->internal.orig_md_buf, bdev_io->u.bdev.md_buf,
                               bdev_io->u.bdev.num_blocks * spdk_bdev_get_md_size(bdev_io->bdev));
                }

                bdev_io->u.bdev.md_buf = bdev_io->internal.orig_md_buf;
                bdev_io->internal.orig_md_buf = NULL;
        }

        /* We want to free the bounce buffer here since we know we're done with it (as opposed
         * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
         */
        bdev_io_put_buf(bdev_io);
}
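
/*
 * Obtain an aligned data buffer (plus separate metadata, if the bdev uses it)
 * for a bdev_io. The buffer comes from the small or large pool depending on
 * the required size; if the pool is empty, the bdev_io is queued on the mgmt
 * channel and receives the next buffer returned via _bdev_io_put_buf().
 */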
static void
bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
{
        struct spdk_bdev *bdev = bdev_io->bdev;
        struct spdk_mempool *pool;
        bdev_io_stailq_t *stailq;
        struct spdk_bdev_mgmt_channel *mgmt_ch;
        uint64_t alignment, md_len;
        void *buf;

        alignment = spdk_bdev_get_buf_align(bdev);
        md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

        if (len + alignment + md_len > SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) +
            SPDK_BDEV_POOL_ALIGNMENT) {
                SPDK_ERRLOG("Length + alignment %" PRIu64 " is larger than allowed\n",
                            len + alignment);
                bdev_io_get_buf_complete(bdev_io, NULL, false);
                return;
        }

        mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

        bdev_io->internal.buf_len = len;

        if (len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
            SPDK_BDEV_POOL_ALIGNMENT) {
                pool = g_bdev_mgr.buf_small_pool;
                stailq = &mgmt_ch->need_buf_small;
        } else {
                pool = g_bdev_mgr.buf_large_pool;
                stailq = &mgmt_ch->need_buf_large;
        }

        buf = spdk_mempool_get(pool);
        if (!buf) {
                STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link);
        } else {
                _bdev_io_set_buf(bdev_io, buf, len);
        }
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
        struct spdk_bdev *bdev = bdev_io->bdev;
        uint64_t alignment;

        assert(cb != NULL);
        bdev_io->internal.get_buf_cb = cb;

        alignment = spdk_bdev_get_buf_align(bdev);

        if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
            _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
                /* Buffer already present and aligned */
                cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
                return;
        }

        bdev_io_get_buf(bdev_io, len);
}

void
spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
{
        uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

        assert(cb != NULL);
        assert(bdev_io->internal.get_aux_buf_cb == NULL);
        bdev_io->internal.get_aux_buf_cb = cb;
        bdev_io_get_buf(bdev_io, len);
}

static int
bdev_module_get_max_ctx_size(void)
{
        struct spdk_bdev_module *bdev_module;
        int max_bdev_module_size = 0;

        TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
                if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
                        max_bdev_module_size = bdev_module->get_ctx_size();
                }
        }

        return max_bdev_module_size;
}

void
spdk_bdev_config_text(FILE *fp)
{
        struct spdk_bdev_module *bdev_module;

        TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
                if (bdev_module->config_text) {
                        bdev_module->config_text(fp);
                }
        }
}

static void
bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
        int i;
        struct spdk_bdev_qos *qos = bdev->internal.qos;
        uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

        if (!qos) {
                return;
        }

        spdk_bdev_get_qos_rate_limits(bdev, limits);

        spdk_json_write_object_begin(w);
        spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");

        spdk_json_write_named_object_begin(w, "params");
        spdk_json_write_named_string(w, "name", bdev->name);
        for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
                if (limits[i] > 0) {
                        spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
                }
        }
        spdk_json_write_object_end(w);

        spdk_json_write_object_end(w);
}
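
/*
 * Write the whole bdev subsystem configuration as an array of RPC invocations:
 * the global bdev_set_options call, the examine allowlist, each module's
 * config, and a bdev_set_qos_limit entry for every bdev with QoS configured.
 */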
void
spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
{
        struct spdk_bdev_module *bdev_module;
        struct spdk_bdev *bdev;

        assert(w != NULL);

        spdk_json_write_array_begin(w);

        spdk_json_write_object_begin(w);
        spdk_json_write_named_string(w, "method", "bdev_set_options");
        spdk_json_write_named_object_begin(w, "params");
        spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
        spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
        spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine);
        spdk_json_write_object_end(w);
        spdk_json_write_object_end(w);

        bdev_examine_allowlist_config_json(w);

        TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
                if (bdev_module->config_json) {
                        bdev_module->config_json(w);
                }
        }

        pthread_mutex_lock(&g_bdev_mgr.mutex);

        TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
                if (bdev->fn_table->write_config_json) {
                        bdev->fn_table->write_config_json(bdev, w);
                }

                bdev_qos_config_json(bdev, w);
        }

        pthread_mutex_unlock(&g_bdev_mgr.mutex);

        spdk_json_write_array_end(w);
}

static int
bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
{
        struct spdk_bdev_mgmt_channel *ch = ctx_buf;
        struct spdk_bdev_io *bdev_io;
        uint32_t i;

        STAILQ_INIT(&ch->need_buf_small);
        STAILQ_INIT(&ch->need_buf_large);

        STAILQ_INIT(&ch->per_thread_cache);
        ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;

        /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
        ch->per_thread_cache_count = 0;
        for (i = 0; i < ch->bdev_io_cache_size; i++) {
                bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
                assert(bdev_io != NULL);
                ch->per_thread_cache_count++;
                STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
        }

        TAILQ_INIT(&ch->shared_resources);
        TAILQ_INIT(&ch->io_wait_queue);

        return 0;
}

static void
bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
        struct spdk_bdev_mgmt_channel *ch = ctx_buf;
        struct spdk_bdev_io *bdev_io;

        if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
                SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n");
        }

        if (!TAILQ_EMPTY(&ch->shared_resources)) {
                SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n");
        }

        while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
                bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
                STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
                ch->per_thread_cache_count--;
                spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
        }

        assert(ch->per_thread_cache_count == 0);
}
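
/*
 * Called once module init and all outstanding init/examine actions have
 * finished (or with rc != 0 on failure): notify modules that registered an
 * init_complete callback, then invoke the spdk_bdev_initialize() callback.
 */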
static void
bdev_init_complete(int rc)
{
        spdk_bdev_init_cb cb_fn = g_init_cb_fn;
        void *cb_arg = g_init_cb_arg;
        struct spdk_bdev_module *m;

        g_bdev_mgr.init_complete = true;
        g_init_cb_fn = NULL;
        g_init_cb_arg = NULL;

        /*
         * For modules that need to know when subsystem init is complete,
         * inform them now.
         */
        if (rc == 0) {
                TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
                        if (m->init_complete) {
                                m->init_complete();
                        }
                }
        }

        cb_fn(cb_arg, rc);
}

static void
bdev_module_action_complete(void)
{
        struct spdk_bdev_module *m;

        /*
         * Don't finish bdev subsystem initialization if
         * module pre-initialization is still in progress, or
         * the subsystem has already been initialized.
         */
        if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
                return;
        }

        /*
         * Check all bdev modules for inits/examinations in progress. If any
         * exist, return immediately since we cannot finish bdev subsystem
         * initialization until all are completed.
         */
        TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
                if (m->internal.action_in_progress > 0) {
                        return;
                }
        }

        /*
         * Modules already finished initialization - now that all
         * the bdev modules have finished their asynchronous I/O
         * processing, the entire bdev layer can be marked as complete.
         */
        bdev_init_complete(0);
}

static void
bdev_module_action_done(struct spdk_bdev_module *module)
{
        assert(module->internal.action_in_progress > 0);
        module->internal.action_in_progress--;
        bdev_module_action_complete();
}

void
spdk_bdev_module_init_done(struct spdk_bdev_module *module)
{
        bdev_module_action_done(module);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
{
        bdev_module_action_done(module);
}

/** The last initialized bdev module */
static struct spdk_bdev_module *g_resume_bdev_module = NULL;

static void
bdev_init_failed(void *cb_arg)
{
        struct spdk_bdev_module *module = cb_arg;

        module->internal.action_in_progress--;
        bdev_init_complete(-1);
}

static int
bdev_modules_init(void)
{
        struct spdk_bdev_module *module;
        int rc = 0;

        TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
                g_resume_bdev_module = module;
                if (module->async_init) {
                        module->internal.action_in_progress = 1;
                }
                rc = module->module_init();
                if (rc != 0) {
                        /* Bump action_in_progress to prevent other modules from completing modules_init.
                         * Send a message to defer application shutdown until resources are cleaned up. */
                        module->internal.action_in_progress = 1;
                        spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
                        return rc;
                }
        }

        g_resume_bdev_module = NULL;
        return 0;
}
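
/*
 * spdk_bdev_initialize() below still honors the legacy INI-style config file.
 * A minimal sketch of the section it parses (values are illustrative only):
 *
 *   [Bdev]
 *     BdevIoPoolSize 65535
 *     BdevIoCacheSize 256
 */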
void
spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
{
        struct spdk_conf_section *sp;
        struct spdk_bdev_opts bdev_opts;
        int32_t bdev_io_pool_size, bdev_io_cache_size;
        int cache_size;
        int rc = 0;
        char mempool_name[32];

        assert(cb_fn != NULL);

        sp = spdk_conf_find_section(NULL, "Bdev");
        if (sp != NULL) {
                spdk_bdev_get_opts(&bdev_opts);

                bdev_io_pool_size = spdk_conf_section_get_intval(sp, "BdevIoPoolSize");
                if (bdev_io_pool_size >= 0) {
                        bdev_opts.bdev_io_pool_size = bdev_io_pool_size;
                }

                bdev_io_cache_size = spdk_conf_section_get_intval(sp, "BdevIoCacheSize");
                if (bdev_io_cache_size >= 0) {
                        bdev_opts.bdev_io_cache_size = bdev_io_cache_size;
                }

                if (spdk_bdev_set_opts(&bdev_opts)) {
                        bdev_init_complete(-1);
                        return;
                }

                assert(memcmp(&bdev_opts, &g_bdev_opts, sizeof(bdev_opts)) == 0);
        }

        g_init_cb_fn = cb_fn;
        g_init_cb_arg = cb_arg;

        spdk_notify_type_register("bdev_register");
        spdk_notify_type_register("bdev_unregister");

        snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());

        g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
                                  g_bdev_opts.bdev_io_pool_size,
                                  sizeof(struct spdk_bdev_io) +
                                  bdev_module_get_max_ctx_size(),
                                  0,
                                  SPDK_ENV_SOCKET_ID_ANY);

        if (g_bdev_mgr.bdev_io_pool == NULL) {
                SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
                bdev_init_complete(-1);
                return;
        }

        /**
         * Ensure no more than half of the total buffers end up in local caches, by
         * using spdk_env_get_core_count() to determine how many local caches we need
         * to account for.
         */
        cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count());
        snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());

        g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
                                    BUF_SMALL_POOL_SIZE,
                                    SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
                                    SPDK_BDEV_POOL_ALIGNMENT,
                                    cache_size,
                                    SPDK_ENV_SOCKET_ID_ANY);
        if (!g_bdev_mgr.buf_small_pool) {
                SPDK_ERRLOG("create rbuf small pool failed\n");
                bdev_init_complete(-1);
                return;
        }

        cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count());
        snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());

        g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
                                    BUF_LARGE_POOL_SIZE,
                                    SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) +
                                    SPDK_BDEV_POOL_ALIGNMENT,
                                    cache_size,
                                    SPDK_ENV_SOCKET_ID_ANY);
        if (!g_bdev_mgr.buf_large_pool) {
                SPDK_ERRLOG("create rbuf large pool failed\n");
                bdev_init_complete(-1);
                return;
        }

        g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
                                              NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
        if (!g_bdev_mgr.zero_buffer) {
                SPDK_ERRLOG("create bdev zero buffer failed\n");
                bdev_init_complete(-1);
                return;
        }

#ifdef SPDK_CONFIG_VTUNE
        g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
#endif

        spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create,
                                bdev_mgmt_channel_destroy,
                                sizeof(struct spdk_bdev_mgmt_channel),
                                "bdev_mgr");

        rc = bdev_modules_init();
        g_bdev_mgr.module_init_complete = true;
        if (rc != 0) {
                SPDK_ERRLOG("bdev modules init failed\n");
                return;
        }

        bdev_module_action_complete();
}
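
/*
 * Final teardown of the bdev manager, run after the bdev_mgr io_device is
 * unregistered: verify that every pooled object was returned, free the pools
 * and the zero buffer, then invoke the spdk_bdev_finish() callback.
 */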
static void
bdev_mgr_unregister_cb(void *io_device)
{
        spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;

        if (g_bdev_mgr.bdev_io_pool) {
                if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
                        SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
                                    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
                                    g_bdev_opts.bdev_io_pool_size);
                }

                spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
        }

        if (g_bdev_mgr.buf_small_pool) {
                if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) {
                        SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
                                    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
                                    BUF_SMALL_POOL_SIZE);
                        assert(false);
                }

                spdk_mempool_free(g_bdev_mgr.buf_small_pool);
        }

        if (g_bdev_mgr.buf_large_pool) {
                if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) {
                        SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
                                    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
                                    BUF_LARGE_POOL_SIZE);
                        assert(false);
                }

                spdk_mempool_free(g_bdev_mgr.buf_large_pool);
        }

        spdk_free(g_bdev_mgr.zero_buffer);

        cb_fn(g_fini_cb_arg);
        g_fini_cb_fn = NULL;
        g_fini_cb_arg = NULL;
        g_bdev_mgr.init_complete = false;
        g_bdev_mgr.module_init_complete = false;
        pthread_mutex_destroy(&g_bdev_mgr.mutex);
}

static void
bdev_module_finish_iter(void *arg)
{
        struct spdk_bdev_module *bdev_module;

        /* FIXME: Handling initialization failures is broken now,
         * so we won't even try cleaning up after successfully
         * initialized modules. If module_init_complete is false,
         * just call bdev_mgr_unregister_cb().
         */
        if (!g_bdev_mgr.module_init_complete) {
                bdev_mgr_unregister_cb(NULL);
                return;
        }

        /* Start iterating from the last touched module */
        if (!g_resume_bdev_module) {
                bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
        } else {
                bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
                                         internal.tailq);
        }

        while (bdev_module) {
                if (bdev_module->async_fini) {
                        /* Save our place so we can resume later. We must
                         * save the variable here, before calling module_fini()
                         * below, because in some cases the module may immediately
                         * call spdk_bdev_module_finish_done() and re-enter
                         * this function to continue iterating. */
                        g_resume_bdev_module = bdev_module;
                }

                if (bdev_module->module_fini) {
                        bdev_module->module_fini();
                }

                if (bdev_module->async_fini) {
                        return;
                }

                bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
                                         internal.tailq);
        }

        g_resume_bdev_module = NULL;
        spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
}

void
spdk_bdev_module_finish_done(void)
{
        if (spdk_get_thread() != g_fini_thread) {
                spdk_thread_send_msg(g_fini_thread, bdev_module_finish_iter, NULL);
        } else {
                bdev_module_finish_iter(NULL);
        }
}
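
/*
 * Unregister bdevs one at a time during shutdown, walking the global list in
 * reverse and skipping claimed bdevs so that virtual bdevs are torn down
 * before their base bdevs. Once the list is empty, defer to module finish.
 */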
static void
bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
{
        struct spdk_bdev *bdev = cb_arg;

        if (bdeverrno && bdev) {
                SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
                             bdev->name);

                /*
                 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
                 * bdev; try to continue by manually removing this bdev from the list and continue
                 * with the next bdev in the list.
                 */
                TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
        }

        if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
                SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n");
                /*
                 * Bdev module finish needs to be deferred as we might be in the middle of some context
                 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
                 * after returning.
                 */
                spdk_thread_send_msg(spdk_get_thread(), bdev_module_finish_iter, NULL);
                return;
        }

        /*
         * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem
         * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
         * to detect clean shutdown as opposed to run-time hot removal of the underlying
         * base bdevs.
         *
         * Also, walk the list in the reverse order.
         */
        for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
             bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
                if (bdev->internal.claim_module != NULL) {
                        SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Skipping claimed bdev '%s'(<-'%s').\n",
                                      bdev->name, bdev->internal.claim_module->name);
                        continue;
                }

                SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name);
                spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
                return;
        }

        /*
         * If any bdev fails to unclaim underlying bdev properly, we may face the
         * case of bdev list consisting of claimed bdevs only (if claims are managed
         * correctly, this would mean there's a loop in the claims graph which is
         * clearly impossible). Warn and unregister last bdev on the list then.
         */
        for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
             bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
                SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
                spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
                return;
        }
}

void
spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
{
        struct spdk_bdev_module *m;

        assert(cb_fn != NULL);

        g_fini_thread = spdk_get_thread();

        g_fini_cb_fn = cb_fn;
        g_fini_cb_arg = cb_arg;

        TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
                if (m->fini_start) {
                        m->fini_start();
                }
        }

        bdev_finish_unregister_bdevs_iter(NULL, 0);
}
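
/*
 * Take a bdev_io from the calling thread's cache when possible. The global
 * pool is used only when the cache is empty and nobody is already queued on
 * the io_wait_queue, so waiters are served in order.
 */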
struct spdk_bdev_io *
bdev_channel_get_io(struct spdk_bdev_channel *channel)
{
        struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
        struct spdk_bdev_io *bdev_io;

        if (ch->per_thread_cache_count > 0) {
                bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
                STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
                ch->per_thread_cache_count--;
        } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
                /*
                 * Don't try to look for bdev_ios in the global pool if there are
                 * waiters on bdev_ios - we don't want this caller to jump the line.
                 */
                bdev_io = NULL;
        } else {
                bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
        }

        return bdev_io;
}

void
spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
{
        struct spdk_bdev_mgmt_channel *ch;

        assert(bdev_io != NULL);
        assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);

        ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

        if (bdev_io->internal.buf != NULL) {
                bdev_io_put_buf(bdev_io);
        }

        if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
                ch->per_thread_cache_count++;
                STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
                while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
                        struct spdk_bdev_io_wait_entry *entry;

                        entry = TAILQ_FIRST(&ch->io_wait_queue);
                        TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
                        entry->cb_fn(entry->cb_arg);
                }
        } else {
                /* We should never have a full cache with entries on the io wait queue. */
                assert(TAILQ_EMPTY(&ch->io_wait_queue));
                spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
        }
}

static bool
bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
{
        assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);

        switch (limit) {
        case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
                return true;
        case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
        case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
        case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
                return false;
        case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
        default:
                return false;
        }
}

static bool
bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
{
        switch (bdev_io->type) {
        case SPDK_BDEV_IO_TYPE_NVME_IO:
        case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
        case SPDK_BDEV_IO_TYPE_READ:
        case SPDK_BDEV_IO_TYPE_WRITE:
                return true;
        case SPDK_BDEV_IO_TYPE_ZCOPY:
                if (bdev_io->u.bdev.zcopy.start) {
                        return true;
                } else {
                        return false;
                }
        default:
                return false;
        }
}

static bool
bdev_is_read_io(struct spdk_bdev_io *bdev_io)
{
        switch (bdev_io->type) {
        case SPDK_BDEV_IO_TYPE_NVME_IO:
        case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
                /* Bit 1 (0x2) set for read operation */
                if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
                        return true;
                } else {
                        return false;
                }
        case SPDK_BDEV_IO_TYPE_READ:
                return true;
        case SPDK_BDEV_IO_TYPE_ZCOPY:
                /* Populate to read from disk */
                if (bdev_io->u.bdev.zcopy.populate) {
                        return true;
                } else {
                        return false;
                }
        default:
                return false;
        }
}

static uint64_t
bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
{
        struct spdk_bdev *bdev = bdev_io->bdev;

        switch (bdev_io->type) {
        case SPDK_BDEV_IO_TYPE_NVME_IO:
        case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
                return bdev_io->u.nvme_passthru.nbytes;
        case SPDK_BDEV_IO_TYPE_READ:
        case SPDK_BDEV_IO_TYPE_WRITE:
                return bdev_io->u.bdev.num_blocks * bdev->blocklen;
        case SPDK_BDEV_IO_TYPE_ZCOPY:
                /* Track the data in the start phase only */
                if (bdev_io->u.bdev.zcopy.start) {
                        return bdev_io->u.bdev.num_blocks * bdev->blocklen;
                } else {
                        return 0;
                }
        default:
                return 0;
        }
}
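
/*
 * The helpers below implement the queue_io and update_quota callbacks wired up
 * in bdev_qos_set_ops(): queue_io decides whether an I/O must wait for the
 * next timeslice, update_quota charges a submitted I/O against the remaining
 * quota (per I/O or per byte, optionally restricted to reads or writes).
 */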
static bool
bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
        if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) {
                return true;
        } else {
                return false;
        }
}

static bool
bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
        if (bdev_is_read_io(io) == false) {
                return false;
        }

        return bdev_qos_rw_queue_io(limit, io);
}

static bool
bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
        if (bdev_is_read_io(io) == true) {
                return false;
        }

        return bdev_qos_rw_queue_io(limit, io);
}

static void
bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
        limit->remaining_this_timeslice--;
}

static void
bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
        limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io);
}

static void
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
        if (bdev_is_read_io(io) == false) {
                return;
        }

        return bdev_qos_rw_bps_update_quota(limit, io);
}

static void
bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
        if (bdev_is_read_io(io) == true) {
                return;
        }

        return bdev_qos_rw_bps_update_quota(limit, io);
}

static void
bdev_qos_set_ops(struct spdk_bdev_qos *qos)
{
        int i;

        for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
                if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
                        qos->rate_limits[i].queue_io = NULL;
                        qos->rate_limits[i].update_quota = NULL;
                        continue;
                }

                switch (i) {
                case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
                        qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io;
                        qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota;
                        break;
                case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
                        qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io;
                        qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota;
                        break;
                case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
                        qos->rate_limits[i].queue_io = bdev_qos_r_queue_io;
                        qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota;
                        break;
                case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
                        qos->rate_limits[i].queue_io = bdev_qos_w_queue_io;
                        qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota;
                        break;
                default:
                        break;
                }
        }
}

static void
_bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch,
                            struct spdk_bdev_io *bdev_io,
                            enum spdk_bdev_io_status status)
{
        struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

        bdev_io->internal.in_submit_request = true;
        bdev_ch->io_outstanding++;
        shared_resource->io_outstanding++;
        spdk_bdev_io_complete(bdev_io, status);
        bdev_io->internal.in_submit_request = false;
}
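
/*
 * Hand an I/O to the bdev module. Aborts are first matched against the nomem
 * and buffer wait queues so a still-queued I/O can be aborted without reaching
 * the module; everything else is submitted directly or, if the channel is in
 * the NOMEM state, appended to nomem_io to preserve ordering.
 */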
static inline void
bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
{
        struct spdk_bdev *bdev = bdev_io->bdev;
        struct spdk_io_channel *ch = bdev_ch->channel;
        struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

        if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
                struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch;
                struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;

                if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) ||
                    bdev_abort_buf_io(&mgmt_channel->need_buf_small, bio_to_abort) ||
                    bdev_abort_buf_io(&mgmt_channel->need_buf_large, bio_to_abort)) {
                        _bdev_io_complete_in_submit(bdev_ch, bdev_io,
                                                    SPDK_BDEV_IO_STATUS_SUCCESS);
                        return;
                }
        }

        if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
                bdev_ch->io_outstanding++;
                shared_resource->io_outstanding++;
                bdev_io->internal.in_submit_request = true;
                bdev->fn_table->submit_request(ch, bdev_io);
                bdev_io->internal.in_submit_request = false;
        } else {
                TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
        }
}

static int
bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos)
{
        struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL;
        int i, submitted_ios = 0;

        TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) {
                if (bdev_qos_io_to_limit(bdev_io) == true) {
                        for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
                                if (!qos->rate_limits[i].queue_io) {
                                        continue;
                                }

                                if (qos->rate_limits[i].queue_io(&qos->rate_limits[i],
                                                                 bdev_io) == true) {
                                        return submitted_ios;
                                }
                        }
                        for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
                                if (!qos->rate_limits[i].update_quota) {
                                        continue;
                                }

                                qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io);
                        }
                }

                TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
                bdev_io_do_submit(ch, bdev_io);
                submitted_ios++;
        }

        return submitted_ios;
}

static void
bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn)
{
        int rc;

        bdev_io->internal.waitq_entry.bdev = bdev_io->bdev;
        bdev_io->internal.waitq_entry.cb_fn = cb_fn;
        bdev_io->internal.waitq_entry.cb_arg = bdev_io;
        rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch),
                                     &bdev_io->internal.waitq_entry);
        if (rc != 0) {
                SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc);
                bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
                bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
        }
}
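
/*
 * I/O splitting: READ/WRITE I/O that cross the bdev's optimal_io_boundary are
 * split into boundary-aligned children. For example, with a 128-block boundary
 * a 60-block I/O starting at block 100 touches stripes 0 and 1 (blocks 100-127
 * and 128-159) and is therefore issued as two child I/O.
 */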
static bool
bdev_io_type_can_split(uint8_t type)
{
        assert(type != SPDK_BDEV_IO_TYPE_INVALID);
        assert(type < SPDK_BDEV_NUM_IO_TYPES);

        /* Only split READ and WRITE I/O.  Theoretically other types of I/O like
         * UNMAP could be split, but these types of I/O are typically much larger
         * in size (sometimes the size of the entire block device), and the bdev
         * module can more efficiently split these types of I/O.  Plus those types
         * of I/O do not have a payload, which makes the splitting process simpler.
         */
        if (type == SPDK_BDEV_IO_TYPE_READ || type == SPDK_BDEV_IO_TYPE_WRITE) {
                return true;
        } else {
                return false;
        }
}

static bool
bdev_io_should_split(struct spdk_bdev_io *bdev_io)
{
        uint64_t start_stripe, end_stripe;
        uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary;

        if (io_boundary == 0) {
                return false;
        }

        if (!bdev_io_type_can_split(bdev_io->type)) {
                return false;
        }

        start_stripe = bdev_io->u.bdev.offset_blocks;
        end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1;
        /* Avoid expensive div operations if possible.  These spdk_u32 functions are very cheap. */
        if (spdk_likely(spdk_u32_is_pow2(io_boundary))) {
                start_stripe >>= spdk_u32log2(io_boundary);
                end_stripe >>= spdk_u32log2(io_boundary);
        } else {
                start_stripe /= io_boundary;
                end_stripe /= io_boundary;
        }
        return (start_stripe != end_stripe);
}

static uint32_t
_to_next_boundary(uint64_t offset, uint32_t boundary)
{
        return (boundary - (offset % boundary));
}

static void
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
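
/*
 * Build and submit the next child of a split I/O: gather up to
 * BDEV_IO_NUM_CHILD_IOV iovec entries covering the blocks up to the next
 * optimal_io_boundary, trim the tail so the child stays block aligned, and
 * issue it through bdev_readv/writev_blocks_with_md(). On -ENOMEM the parent
 * waits for a free bdev_io and retries.
 */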
(to_next_boundary_bytes > 0) { 1984 /* We had to stop this child I/O early because we ran out of 1985 * child_iov space. Ensure the iovs to be aligned with block 1986 * size and then adjust to_next_boundary before starting the 1987 * child I/O. 1988 */ 1989 assert(child_iovcnt == BDEV_IO_NUM_CHILD_IOV); 1990 to_last_block_bytes = to_next_boundary_bytes % blocklen; 1991 if (to_last_block_bytes != 0) { 1992 uint32_t child_iovpos = child_iovcnt - 1; 1993 /* don't decrease child_iovcnt so the loop will naturally end */ 1994 1995 to_last_block_bytes = blocklen - to_last_block_bytes; 1996 to_next_boundary_bytes += to_last_block_bytes; 1997 while (to_last_block_bytes > 0 && iovcnt > 0) { 1998 iov_len = spdk_min(to_last_block_bytes, 1999 bdev_io->child_iov[child_iovpos].iov_len); 2000 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2001 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2002 child_iovpos--; 2003 if (--iovcnt == 0) { 2004 return; 2005 } 2006 } 2007 to_last_block_bytes -= iov_len; 2008 } 2009 2010 assert(to_last_block_bytes == 0); 2011 } 2012 to_next_boundary -= to_next_boundary_bytes / blocklen; 2013 } 2014 2015 bdev_io->u.bdev.split_outstanding++; 2016 2017 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 2018 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2019 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2020 iov, iovcnt, md_buf, current_offset, 2021 to_next_boundary, 2022 bdev_io_split_done, bdev_io); 2023 } else { 2024 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2025 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2026 iov, iovcnt, md_buf, current_offset, 2027 to_next_boundary, 2028 bdev_io_split_done, bdev_io); 2029 } 2030 2031 if (rc == 0) { 2032 current_offset += to_next_boundary; 2033 remaining -= to_next_boundary; 2034 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2035 bdev_io->u.bdev.split_remaining_num_blocks = remaining; 2036 } else { 2037 bdev_io->u.bdev.split_outstanding--; 2038 if (rc == -ENOMEM) { 2039 if (bdev_io->u.bdev.split_outstanding == 0) { 2040 /* No I/O is outstanding. Hence we should wait here. */ 2041 bdev_queue_io_wait_with_cb(bdev_io, _bdev_io_split); 2042 } 2043 } else { 2044 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2045 if (bdev_io->u.bdev.split_outstanding == 0) { 2046 spdk_trace_record_tsc(spdk_get_ticks(), TRACE_BDEV_IO_DONE, 0, 0, 2047 (uintptr_t)bdev_io, 0); 2048 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2049 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2050 } 2051 } 2052 2053 return; 2054 } 2055 } 2056 } 2057 2058 static void 2059 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2060 { 2061 struct spdk_bdev_io *parent_io = cb_arg; 2062 2063 spdk_bdev_free_io(bdev_io); 2064 2065 if (!success) { 2066 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2067 /* If any child I/O failed, stop further splitting process. */ 2068 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 2069 parent_io->u.bdev.split_remaining_num_blocks = 0; 2070 } 2071 parent_io->u.bdev.split_outstanding--; 2072 if (parent_io->u.bdev.split_outstanding != 0) { 2073 return; 2074 } 2075 2076 /* 2077 * Parent I/O finishes when all blocks are consumed. 
2078 */ 2079 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2080 assert(parent_io->internal.cb != bdev_io_split_done); 2081 spdk_trace_record_tsc(spdk_get_ticks(), TRACE_BDEV_IO_DONE, 0, 0, 2082 (uintptr_t)parent_io, 0); 2083 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2084 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2085 parent_io->internal.caller_ctx); 2086 return; 2087 } 2088 2089 /* 2090 * Continue with the splitting process. This function will complete the parent I/O if the 2091 * splitting is done. 2092 */ 2093 _bdev_io_split(parent_io); 2094 } 2095 2096 static void 2097 bdev_io_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success); 2098 2099 static void 2100 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2101 { 2102 assert(bdev_io_type_can_split(bdev_io->type)); 2103 2104 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2105 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2106 bdev_io->u.bdev.split_outstanding = 0; 2107 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2108 2109 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2110 _bdev_io_split(bdev_io); 2111 } else { 2112 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2113 spdk_bdev_io_get_buf(bdev_io, bdev_io_split_get_buf_cb, 2114 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2115 } 2116 } 2117 2118 static void 2119 bdev_io_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2120 { 2121 if (!success) { 2122 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2123 return; 2124 } 2125 2126 bdev_io_split(ch, bdev_io); 2127 } 2128 2129 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 2130 * be inlined, at least on some compilers. 
2131 */ 2132 static inline void 2133 _bdev_io_submit(void *ctx) 2134 { 2135 struct spdk_bdev_io *bdev_io = ctx; 2136 struct spdk_bdev *bdev = bdev_io->bdev; 2137 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2138 uint64_t tsc; 2139 2140 tsc = spdk_get_ticks(); 2141 bdev_io->internal.submit_tsc = tsc; 2142 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, bdev_io->type); 2143 2144 if (spdk_likely(bdev_ch->flags == 0)) { 2145 bdev_io_do_submit(bdev_ch, bdev_io); 2146 return; 2147 } 2148 2149 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2150 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2151 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2152 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2153 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2154 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2155 } else { 2156 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2157 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2158 } 2159 } else { 2160 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2161 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2162 } 2163 } 2164 2165 bool 2166 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2167 2168 bool 2169 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2170 { 2171 if (range1->length == 0 || range2->length == 0) { 2172 return false; 2173 } 2174 2175 if (range1->offset + range1->length <= range2->offset) { 2176 return false; 2177 } 2178 2179 if (range2->offset + range2->length <= range1->offset) { 2180 return false; 2181 } 2182 2183 return true; 2184 } 2185 2186 static bool 2187 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 2188 { 2189 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2190 struct lba_range r; 2191 2192 switch (bdev_io->type) { 2193 case SPDK_BDEV_IO_TYPE_NVME_IO: 2194 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2195 /* Don't try to decode the NVMe command - just assume worst-case and that 2196 * it overlaps a locked range. 2197 */ 2198 return true; 2199 case SPDK_BDEV_IO_TYPE_WRITE: 2200 case SPDK_BDEV_IO_TYPE_UNMAP: 2201 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2202 case SPDK_BDEV_IO_TYPE_ZCOPY: 2203 r.offset = bdev_io->u.bdev.offset_blocks; 2204 r.length = bdev_io->u.bdev.num_blocks; 2205 if (!bdev_lba_range_overlapped(range, &r)) { 2206 /* This I/O doesn't overlap the specified LBA range. */ 2207 return false; 2208 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 2209 /* This I/O overlaps, but the I/O is on the same channel that locked this 2210 * range, and the caller_ctx is the same as the locked_ctx. This means 2211 * that this I/O is associated with the lock, and is allowed to execute. 
2212 */ 2213 return false; 2214 } else { 2215 return true; 2216 } 2217 default: 2218 return false; 2219 } 2220 } 2221 2222 void 2223 bdev_io_submit(struct spdk_bdev_io *bdev_io) 2224 { 2225 struct spdk_bdev *bdev = bdev_io->bdev; 2226 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 2227 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2228 2229 assert(thread != NULL); 2230 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2231 2232 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 2233 struct lba_range *range; 2234 2235 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 2236 if (bdev_io_range_is_locked(bdev_io, range)) { 2237 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 2238 return; 2239 } 2240 } 2241 } 2242 2243 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 2244 2245 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bdev_io)) { 2246 bdev_io->internal.submit_tsc = spdk_get_ticks(); 2247 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 2248 (uintptr_t)bdev_io, bdev_io->type); 2249 bdev_io_split(NULL, bdev_io); 2250 return; 2251 } 2252 2253 if (ch->flags & BDEV_CH_QOS_ENABLED) { 2254 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 2255 _bdev_io_submit(bdev_io); 2256 } else { 2257 bdev_io->internal.io_submit_ch = ch; 2258 bdev_io->internal.ch = bdev->internal.qos->ch; 2259 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 2260 } 2261 } else { 2262 _bdev_io_submit(bdev_io); 2263 } 2264 } 2265 2266 static void 2267 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 2268 { 2269 struct spdk_bdev *bdev = bdev_io->bdev; 2270 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2271 struct spdk_io_channel *ch = bdev_ch->channel; 2272 2273 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2274 2275 bdev_io->internal.in_submit_request = true; 2276 bdev->fn_table->submit_request(ch, bdev_io); 2277 bdev_io->internal.in_submit_request = false; 2278 } 2279 2280 void 2281 bdev_io_init(struct spdk_bdev_io *bdev_io, 2282 struct spdk_bdev *bdev, void *cb_arg, 2283 spdk_bdev_io_completion_cb cb) 2284 { 2285 bdev_io->bdev = bdev; 2286 bdev_io->internal.caller_ctx = cb_arg; 2287 bdev_io->internal.cb = cb; 2288 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 2289 bdev_io->internal.in_submit_request = false; 2290 bdev_io->internal.buf = NULL; 2291 bdev_io->internal.io_submit_ch = NULL; 2292 bdev_io->internal.orig_iovs = NULL; 2293 bdev_io->internal.orig_iovcnt = 0; 2294 bdev_io->internal.orig_md_buf = NULL; 2295 bdev_io->internal.error.nvme.cdw0 = 0; 2296 bdev_io->num_retries = 0; 2297 bdev_io->internal.get_buf_cb = NULL; 2298 bdev_io->internal.get_aux_buf_cb = NULL; 2299 } 2300 2301 static bool 2302 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2303 { 2304 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 2305 } 2306 2307 bool 2308 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2309 { 2310 bool supported; 2311 2312 supported = bdev_io_type_supported(bdev, io_type); 2313 2314 if (!supported) { 2315 switch (io_type) { 2316 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2317 /* The bdev layer will emulate write zeroes as long as write is supported. 
*/ 2318 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 2319 break; 2320 case SPDK_BDEV_IO_TYPE_ZCOPY: 2321 /* Zero copy can be emulated with regular read and write */ 2322 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ) && 2323 bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 2324 break; 2325 default: 2326 break; 2327 } 2328 } 2329 2330 return supported; 2331 } 2332 2333 int 2334 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2335 { 2336 if (bdev->fn_table->dump_info_json) { 2337 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 2338 } 2339 2340 return 0; 2341 } 2342 2343 static void 2344 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 2345 { 2346 uint32_t max_per_timeslice = 0; 2347 int i; 2348 2349 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2350 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2351 qos->rate_limits[i].max_per_timeslice = 0; 2352 continue; 2353 } 2354 2355 max_per_timeslice = qos->rate_limits[i].limit * 2356 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 2357 2358 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 2359 qos->rate_limits[i].min_per_timeslice); 2360 2361 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 2362 } 2363 2364 bdev_qos_set_ops(qos); 2365 } 2366 2367 static int 2368 bdev_channel_poll_qos(void *arg) 2369 { 2370 struct spdk_bdev_qos *qos = arg; 2371 uint64_t now = spdk_get_ticks(); 2372 int i; 2373 2374 if (now < (qos->last_timeslice + qos->timeslice_size)) { 2375 /* We received our callback earlier than expected - return 2376 * immediately and wait to do accounting until at least one 2377 * timeslice has actually expired. This should never happen 2378 * with a well-behaved timer implementation. 2379 */ 2380 return SPDK_POLLER_IDLE; 2381 } 2382 2383 /* Reset for next round of rate limiting */ 2384 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2385 /* We may have allowed the IOs or bytes to slightly overrun in the last 2386 * timeslice. remaining_this_timeslice is signed, so if it's negative 2387 * here, we'll account for the overrun so that the next timeslice will 2388 * be appropriately reduced. 
2389 */ 2390 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 2391 qos->rate_limits[i].remaining_this_timeslice = 0; 2392 } 2393 } 2394 2395 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 2396 qos->last_timeslice += qos->timeslice_size; 2397 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2398 qos->rate_limits[i].remaining_this_timeslice += 2399 qos->rate_limits[i].max_per_timeslice; 2400 } 2401 } 2402 2403 return bdev_qos_io_submit(qos->ch, qos); 2404 } 2405 2406 static void 2407 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 2408 { 2409 struct spdk_bdev_shared_resource *shared_resource; 2410 struct lba_range *range; 2411 2412 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 2413 range = TAILQ_FIRST(&ch->locked_ranges); 2414 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 2415 free(range); 2416 } 2417 2418 spdk_put_io_channel(ch->channel); 2419 2420 shared_resource = ch->shared_resource; 2421 2422 assert(TAILQ_EMPTY(&ch->io_locked)); 2423 assert(TAILQ_EMPTY(&ch->io_submitted)); 2424 assert(ch->io_outstanding == 0); 2425 assert(shared_resource->ref > 0); 2426 shared_resource->ref--; 2427 if (shared_resource->ref == 0) { 2428 assert(shared_resource->io_outstanding == 0); 2429 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 2430 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 2431 free(shared_resource); 2432 } 2433 } 2434 2435 /* Caller must hold bdev->internal.mutex. */ 2436 static void 2437 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 2438 { 2439 struct spdk_bdev_qos *qos = bdev->internal.qos; 2440 int i; 2441 2442 /* Rate limiting on this bdev enabled */ 2443 if (qos) { 2444 if (qos->ch == NULL) { 2445 struct spdk_io_channel *io_ch; 2446 2447 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 2448 bdev->name, spdk_get_thread()); 2449 2450 /* No qos channel has been selected, so set one up */ 2451 2452 /* Take another reference to ch */ 2453 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 2454 assert(io_ch != NULL); 2455 qos->ch = ch; 2456 2457 qos->thread = spdk_io_channel_get_thread(io_ch); 2458 2459 TAILQ_INIT(&qos->queued); 2460 2461 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2462 if (bdev_qos_is_iops_rate_limit(i) == true) { 2463 qos->rate_limits[i].min_per_timeslice = 2464 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 2465 } else { 2466 qos->rate_limits[i].min_per_timeslice = 2467 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 2468 } 2469 2470 if (qos->rate_limits[i].limit == 0) { 2471 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 2472 } 2473 } 2474 bdev_qos_update_max_quota_per_timeslice(qos); 2475 qos->timeslice_size = 2476 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 2477 qos->last_timeslice = spdk_get_ticks(); 2478 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 2479 qos, 2480 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 2481 } 2482 2483 ch->flags |= BDEV_CH_QOS_ENABLED; 2484 } 2485 } 2486 2487 struct poll_timeout_ctx { 2488 struct spdk_bdev_desc *desc; 2489 uint64_t timeout_in_sec; 2490 spdk_bdev_io_timeout_cb cb_fn; 2491 void *cb_arg; 2492 }; 2493 2494 static void 2495 bdev_desc_free(struct spdk_bdev_desc *desc) 2496 { 2497 pthread_mutex_destroy(&desc->mutex); 2498 free(desc->media_events_buffer); 2499 free(desc); 2500 } 2501 2502 static void 2503 bdev_channel_poll_timeout_io_done(struct spdk_io_channel_iter *i, int status) 2504 { 2505 struct poll_timeout_ctx *ctx = 
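/*
 * Illustrative note (not part of the original file): this completion callback
 * is passed to spdk_for_each_channel() by bdev_poll_timeout_io() below and
 * runs after bdev_channel_poll_timeout_io() has visited every channel. It
 * drops the descriptor reference taken before the iteration and frees the
 * descriptor if it was closed in the meantime. A caller arms this machinery
 * from the thread that opened the descriptor (see the assert in
 * spdk_bdev_set_timeout() below); my_timeout_cb and my_ctx are hypothetical:
 *
 *     static void
 *     my_timeout_cb(void *cb_arg, struct spdk_bdev_io *bdev_io)
 *     {
 *             SPDK_ERRLOG("I/O exceeded the configured timeout\n");
 *     }
 *
 *     rc = spdk_bdev_set_timeout(desc, 30, my_timeout_cb, my_ctx);
 */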
spdk_io_channel_iter_get_ctx(i); 2506 struct spdk_bdev_desc *desc = ctx->desc; 2507 2508 free(ctx); 2509 2510 pthread_mutex_lock(&desc->mutex); 2511 desc->refs--; 2512 if (desc->closed == true && desc->refs == 0) { 2513 pthread_mutex_unlock(&desc->mutex); 2514 bdev_desc_free(desc); 2515 return; 2516 } 2517 pthread_mutex_unlock(&desc->mutex); 2518 } 2519 2520 static void 2521 bdev_channel_poll_timeout_io(struct spdk_io_channel_iter *i) 2522 { 2523 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 2524 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 2525 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 2526 struct spdk_bdev_desc *desc = ctx->desc; 2527 struct spdk_bdev_io *bdev_io; 2528 uint64_t now; 2529 2530 pthread_mutex_lock(&desc->mutex); 2531 if (desc->closed == true) { 2532 pthread_mutex_unlock(&desc->mutex); 2533 spdk_for_each_channel_continue(i, -1); 2534 return; 2535 } 2536 pthread_mutex_unlock(&desc->mutex); 2537 2538 now = spdk_get_ticks(); 2539 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 2540 /* Exclude any I/O that are generated via splitting. */ 2541 if (bdev_io->internal.cb == bdev_io_split_done) { 2542 continue; 2543 } 2544 2545 /* Once we find an I/O that has not timed out, we can immediately 2546 * exit the loop. 2547 */ 2548 if (now < (bdev_io->internal.submit_tsc + 2549 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 2550 goto end; 2551 } 2552 2553 if (bdev_io->internal.desc == desc) { 2554 ctx->cb_fn(ctx->cb_arg, bdev_io); 2555 } 2556 } 2557 2558 end: 2559 spdk_for_each_channel_continue(i, 0); 2560 } 2561 2562 static int 2563 bdev_poll_timeout_io(void *arg) 2564 { 2565 struct spdk_bdev_desc *desc = arg; 2566 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 2567 struct poll_timeout_ctx *ctx; 2568 2569 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 2570 if (!ctx) { 2571 SPDK_ERRLOG("failed to allocate memory\n"); 2572 return SPDK_POLLER_BUSY; 2573 } 2574 ctx->desc = desc; 2575 ctx->cb_arg = desc->cb_arg; 2576 ctx->cb_fn = desc->cb_fn; 2577 ctx->timeout_in_sec = desc->timeout_in_sec; 2578 2579 /* Take a ref on the descriptor in case it gets closed while we are checking 2580 * all of the channels. 
2581 */ 2582 pthread_mutex_lock(&desc->mutex); 2583 desc->refs++; 2584 pthread_mutex_unlock(&desc->mutex); 2585 2586 spdk_for_each_channel(__bdev_to_io_dev(bdev), 2587 bdev_channel_poll_timeout_io, 2588 ctx, 2589 bdev_channel_poll_timeout_io_done); 2590 2591 return SPDK_POLLER_BUSY; 2592 } 2593 2594 int 2595 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 2596 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 2597 { 2598 assert(desc->thread == spdk_get_thread()); 2599 2600 spdk_poller_unregister(&desc->io_timeout_poller); 2601 2602 if (timeout_in_sec) { 2603 assert(cb_fn != NULL); 2604 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 2605 desc, 2606 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 2607 1000); 2608 if (desc->io_timeout_poller == NULL) { 2609 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 2610 return -1; 2611 } 2612 } 2613 2614 desc->cb_fn = cb_fn; 2615 desc->cb_arg = cb_arg; 2616 desc->timeout_in_sec = timeout_in_sec; 2617 2618 return 0; 2619 } 2620 2621 static int 2622 bdev_channel_create(void *io_device, void *ctx_buf) 2623 { 2624 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 2625 struct spdk_bdev_channel *ch = ctx_buf; 2626 struct spdk_io_channel *mgmt_io_ch; 2627 struct spdk_bdev_mgmt_channel *mgmt_ch; 2628 struct spdk_bdev_shared_resource *shared_resource; 2629 struct lba_range *range; 2630 2631 ch->bdev = bdev; 2632 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 2633 if (!ch->channel) { 2634 return -1; 2635 } 2636 2637 assert(ch->histogram == NULL); 2638 if (bdev->internal.histogram_enabled) { 2639 ch->histogram = spdk_histogram_data_alloc(); 2640 if (ch->histogram == NULL) { 2641 SPDK_ERRLOG("Could not allocate histogram\n"); 2642 } 2643 } 2644 2645 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 2646 if (!mgmt_io_ch) { 2647 spdk_put_io_channel(ch->channel); 2648 return -1; 2649 } 2650 2651 mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); 2652 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 2653 if (shared_resource->shared_ch == ch->channel) { 2654 spdk_put_io_channel(mgmt_io_ch); 2655 shared_resource->ref++; 2656 break; 2657 } 2658 } 2659 2660 if (shared_resource == NULL) { 2661 shared_resource = calloc(1, sizeof(*shared_resource)); 2662 if (shared_resource == NULL) { 2663 spdk_put_io_channel(ch->channel); 2664 spdk_put_io_channel(mgmt_io_ch); 2665 return -1; 2666 } 2667 2668 shared_resource->mgmt_ch = mgmt_ch; 2669 shared_resource->io_outstanding = 0; 2670 TAILQ_INIT(&shared_resource->nomem_io); 2671 shared_resource->nomem_threshold = 0; 2672 shared_resource->shared_ch = ch->channel; 2673 shared_resource->ref = 1; 2674 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 2675 } 2676 2677 memset(&ch->stat, 0, sizeof(ch->stat)); 2678 ch->stat.ticks_rate = spdk_get_ticks_hz(); 2679 ch->io_outstanding = 0; 2680 TAILQ_INIT(&ch->queued_resets); 2681 TAILQ_INIT(&ch->locked_ranges); 2682 ch->flags = 0; 2683 ch->shared_resource = shared_resource; 2684 2685 TAILQ_INIT(&ch->io_submitted); 2686 TAILQ_INIT(&ch->io_locked); 2687 2688 #ifdef SPDK_CONFIG_VTUNE 2689 { 2690 char *name; 2691 __itt_init_ittlib(NULL, 0); 2692 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 2693 if (!name) { 2694 bdev_channel_destroy_resource(ch); 2695 return -1; 2696 } 2697 ch->handle = __itt_string_handle_create(name); 2698 free(name); 2699 ch->start_tsc = spdk_get_ticks(); 2700 ch->interval_tsc = spdk_get_ticks_hz() / 100; 2701 memset(&ch->prev_stat, 0, 
sizeof(ch->prev_stat)); 2702 } 2703 #endif 2704 2705 pthread_mutex_lock(&bdev->internal.mutex); 2706 bdev_enable_qos(bdev, ch); 2707 2708 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 2709 struct lba_range *new_range; 2710 2711 new_range = calloc(1, sizeof(*new_range)); 2712 if (new_range == NULL) { 2713 pthread_mutex_unlock(&bdev->internal.mutex); 2714 bdev_channel_destroy_resource(ch); 2715 return -1; 2716 } 2717 new_range->length = range->length; 2718 new_range->offset = range->offset; 2719 new_range->locked_ctx = range->locked_ctx; 2720 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 2721 } 2722 2723 pthread_mutex_unlock(&bdev->internal.mutex); 2724 2725 return 0; 2726 } 2727 2728 /* 2729 * Abort I/O that are waiting on a data buffer. These types of I/O are 2730 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. 2731 */ 2732 static void 2733 bdev_abort_all_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 2734 { 2735 bdev_io_stailq_t tmp; 2736 struct spdk_bdev_io *bdev_io; 2737 2738 STAILQ_INIT(&tmp); 2739 2740 while (!STAILQ_EMPTY(queue)) { 2741 bdev_io = STAILQ_FIRST(queue); 2742 STAILQ_REMOVE_HEAD(queue, internal.buf_link); 2743 if (bdev_io->internal.ch == ch) { 2744 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2745 } else { 2746 STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); 2747 } 2748 } 2749 2750 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 2751 } 2752 2753 /* 2754 * Abort I/O that are queued waiting for submission. These types of I/O are 2755 * linked using the spdk_bdev_io link TAILQ_ENTRY. 2756 */ 2757 static void 2758 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 2759 { 2760 struct spdk_bdev_io *bdev_io, *tmp; 2761 2762 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 2763 if (bdev_io->internal.ch == ch) { 2764 TAILQ_REMOVE(queue, bdev_io, internal.link); 2765 /* 2766 * spdk_bdev_io_complete() assumes that the completed I/O had 2767 * been submitted to the bdev module. Since in this case it 2768 * hadn't, bump io_outstanding to account for the decrement 2769 * that spdk_bdev_io_complete() will do. 
2770 */ 2771 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 2772 ch->io_outstanding++; 2773 ch->shared_resource->io_outstanding++; 2774 } 2775 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2776 } 2777 } 2778 } 2779 2780 static bool 2781 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 2782 { 2783 struct spdk_bdev_io *bdev_io; 2784 2785 TAILQ_FOREACH(bdev_io, queue, internal.link) { 2786 if (bdev_io == bio_to_abort) { 2787 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 2788 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 2789 return true; 2790 } 2791 } 2792 2793 return false; 2794 } 2795 2796 static bool 2797 bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort) 2798 { 2799 struct spdk_bdev_io *bdev_io; 2800 2801 STAILQ_FOREACH(bdev_io, queue, internal.buf_link) { 2802 if (bdev_io == bio_to_abort) { 2803 STAILQ_REMOVE(queue, bio_to_abort, spdk_bdev_io, internal.buf_link); 2804 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 2805 return true; 2806 } 2807 } 2808 2809 return false; 2810 } 2811 2812 static void 2813 bdev_qos_channel_destroy(void *cb_arg) 2814 { 2815 struct spdk_bdev_qos *qos = cb_arg; 2816 2817 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 2818 spdk_poller_unregister(&qos->poller); 2819 2820 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Free QoS %p.\n", qos); 2821 2822 free(qos); 2823 } 2824 2825 static int 2826 bdev_qos_destroy(struct spdk_bdev *bdev) 2827 { 2828 int i; 2829 2830 /* 2831 * Cleanly shutting down the QoS poller is tricky, because 2832 * during the asynchronous operation the user could open 2833 * a new descriptor and create a new channel, spawning 2834 * a new QoS poller. 2835 * 2836 * The strategy is to create a new QoS structure here and swap it 2837 * in. The shutdown path then continues to refer to the old one 2838 * until it completes and then releases it. 2839 */ 2840 struct spdk_bdev_qos *new_qos, *old_qos; 2841 2842 old_qos = bdev->internal.qos; 2843 2844 new_qos = calloc(1, sizeof(*new_qos)); 2845 if (!new_qos) { 2846 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 2847 return -ENOMEM; 2848 } 2849 2850 /* Copy the old QoS data into the newly allocated structure */ 2851 memcpy(new_qos, old_qos, sizeof(*new_qos)); 2852 2853 /* Zero out the key parts of the QoS structure */ 2854 new_qos->ch = NULL; 2855 new_qos->thread = NULL; 2856 new_qos->poller = NULL; 2857 TAILQ_INIT(&new_qos->queued); 2858 /* 2859 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 2860 * It will be used later for the new QoS structure. 2861 */ 2862 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2863 new_qos->rate_limits[i].remaining_this_timeslice = 0; 2864 new_qos->rate_limits[i].min_per_timeslice = 0; 2865 new_qos->rate_limits[i].max_per_timeslice = 0; 2866 } 2867 2868 bdev->internal.qos = new_qos; 2869 2870 if (old_qos->thread == NULL) { 2871 free(old_qos); 2872 } else { 2873 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 2874 } 2875 2876 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 2877 * been destroyed yet. The destruction path will end up waiting for the final 2878 * channel to be put before it releases resources. 
*/ 2879 2880 return 0; 2881 } 2882 2883 static void 2884 bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 2885 { 2886 total->bytes_read += add->bytes_read; 2887 total->num_read_ops += add->num_read_ops; 2888 total->bytes_written += add->bytes_written; 2889 total->num_write_ops += add->num_write_ops; 2890 total->bytes_unmapped += add->bytes_unmapped; 2891 total->num_unmap_ops += add->num_unmap_ops; 2892 total->read_latency_ticks += add->read_latency_ticks; 2893 total->write_latency_ticks += add->write_latency_ticks; 2894 total->unmap_latency_ticks += add->unmap_latency_ticks; 2895 } 2896 2897 static void 2898 bdev_channel_destroy(void *io_device, void *ctx_buf) 2899 { 2900 struct spdk_bdev_channel *ch = ctx_buf; 2901 struct spdk_bdev_mgmt_channel *mgmt_ch; 2902 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 2903 2904 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 2905 spdk_get_thread()); 2906 2907 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 2908 pthread_mutex_lock(&ch->bdev->internal.mutex); 2909 bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); 2910 pthread_mutex_unlock(&ch->bdev->internal.mutex); 2911 2912 mgmt_ch = shared_resource->mgmt_ch; 2913 2914 bdev_abort_all_queued_io(&ch->queued_resets, ch); 2915 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 2916 bdev_abort_all_buf_io(&mgmt_ch->need_buf_small, ch); 2917 bdev_abort_all_buf_io(&mgmt_ch->need_buf_large, ch); 2918 2919 if (ch->histogram) { 2920 spdk_histogram_data_free(ch->histogram); 2921 } 2922 2923 bdev_channel_destroy_resource(ch); 2924 } 2925 2926 int 2927 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 2928 { 2929 struct spdk_bdev_alias *tmp; 2930 2931 if (alias == NULL) { 2932 SPDK_ERRLOG("Empty alias passed\n"); 2933 return -EINVAL; 2934 } 2935 2936 if (spdk_bdev_get_by_name(alias)) { 2937 SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias); 2938 return -EEXIST; 2939 } 2940 2941 tmp = calloc(1, sizeof(*tmp)); 2942 if (tmp == NULL) { 2943 SPDK_ERRLOG("Unable to allocate alias\n"); 2944 return -ENOMEM; 2945 } 2946 2947 tmp->alias = strdup(alias); 2948 if (tmp->alias == NULL) { 2949 free(tmp); 2950 SPDK_ERRLOG("Unable to allocate alias\n"); 2951 return -ENOMEM; 2952 } 2953 2954 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 2955 2956 return 0; 2957 } 2958 2959 int 2960 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 2961 { 2962 struct spdk_bdev_alias *tmp; 2963 2964 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 2965 if (strcmp(alias, tmp->alias) == 0) { 2966 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 2967 free(tmp->alias); 2968 free(tmp); 2969 return 0; 2970 } 2971 } 2972 2973 SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exists\n", alias); 2974 2975 return -ENOENT; 2976 } 2977 2978 void 2979 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 2980 { 2981 struct spdk_bdev_alias *p, *tmp; 2982 2983 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 2984 TAILQ_REMOVE(&bdev->aliases, p, tailq); 2985 free(p->alias); 2986 free(p); 2987 } 2988 } 2989 2990 struct spdk_io_channel * 2991 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 2992 { 2993 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 2994 } 2995 2996 const char * 2997 spdk_bdev_get_name(const struct spdk_bdev *bdev) 2998 { 2999 return bdev->name; 3000 } 3001 3002 const char * 3003 spdk_bdev_get_product_name(const struct spdk_bdev 
*bdev) 3004 { 3005 return bdev->product_name; 3006 } 3007 3008 const struct spdk_bdev_aliases_list * 3009 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 3010 { 3011 return &bdev->aliases; 3012 } 3013 3014 uint32_t 3015 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 3016 { 3017 return bdev->blocklen; 3018 } 3019 3020 uint32_t 3021 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 3022 { 3023 return bdev->write_unit_size; 3024 } 3025 3026 uint64_t 3027 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 3028 { 3029 return bdev->blockcnt; 3030 } 3031 3032 const char * 3033 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 3034 { 3035 return qos_rpc_type[type]; 3036 } 3037 3038 void 3039 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 3040 { 3041 int i; 3042 3043 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 3044 3045 pthread_mutex_lock(&bdev->internal.mutex); 3046 if (bdev->internal.qos) { 3047 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3048 if (bdev->internal.qos->rate_limits[i].limit != 3049 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3050 limits[i] = bdev->internal.qos->rate_limits[i].limit; 3051 if (bdev_qos_is_iops_rate_limit(i) == false) { 3052 /* Change from Byte to Megabyte which is user visible. */ 3053 limits[i] = limits[i] / 1024 / 1024; 3054 } 3055 } 3056 } 3057 } 3058 pthread_mutex_unlock(&bdev->internal.mutex); 3059 } 3060 3061 size_t 3062 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 3063 { 3064 return 1 << bdev->required_alignment; 3065 } 3066 3067 uint32_t 3068 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 3069 { 3070 return bdev->optimal_io_boundary; 3071 } 3072 3073 bool 3074 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 3075 { 3076 return bdev->write_cache; 3077 } 3078 3079 const struct spdk_uuid * 3080 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 3081 { 3082 return &bdev->uuid; 3083 } 3084 3085 uint16_t 3086 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 3087 { 3088 return bdev->acwu; 3089 } 3090 3091 uint32_t 3092 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 3093 { 3094 return bdev->md_len; 3095 } 3096 3097 bool 3098 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 3099 { 3100 return (bdev->md_len != 0) && bdev->md_interleave; 3101 } 3102 3103 bool 3104 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 3105 { 3106 return (bdev->md_len != 0) && !bdev->md_interleave; 3107 } 3108 3109 bool 3110 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 3111 { 3112 return bdev->zoned; 3113 } 3114 3115 uint32_t 3116 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 3117 { 3118 if (spdk_bdev_is_md_interleaved(bdev)) { 3119 return bdev->blocklen - bdev->md_len; 3120 } else { 3121 return bdev->blocklen; 3122 } 3123 } 3124 3125 static uint32_t 3126 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 3127 { 3128 if (!spdk_bdev_is_md_interleaved(bdev)) { 3129 return bdev->blocklen + bdev->md_len; 3130 } else { 3131 return bdev->blocklen; 3132 } 3133 } 3134 3135 enum spdk_dif_type spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 3136 { 3137 if (bdev->md_len != 0) { 3138 return bdev->dif_type; 3139 } else { 3140 return SPDK_DIF_DISABLE; 3141 } 3142 } 3143 3144 bool 3145 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 3146 { 3147 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 3148 return bdev->dif_is_head_of_md; 3149 } else { 3150 return false; 3151 } 3152 } 3153 3154 bool 3155 
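/*
 * Illustrative usage sketch (not part of the original file): a consumer that
 * needs to generate or verify protection information might combine the DIF
 * getters above with the per-flag check that follows, e.g.:
 *
 *     if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_GUARD)) {
 *             // compute or verify the guard tag for each block
 *     }
 *
 * Setting up the actual DIF context (e.g. via spdk_dif_ctx_init()) is outside
 * the scope of this file.
 */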
spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 3156 enum spdk_dif_check_type check_type) 3157 { 3158 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 3159 return false; 3160 } 3161 3162 switch (check_type) { 3163 case SPDK_DIF_CHECK_TYPE_REFTAG: 3164 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 3165 case SPDK_DIF_CHECK_TYPE_APPTAG: 3166 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 3167 case SPDK_DIF_CHECK_TYPE_GUARD: 3168 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 3169 default: 3170 return false; 3171 } 3172 } 3173 3174 uint64_t 3175 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 3176 { 3177 return bdev->internal.measured_queue_depth; 3178 } 3179 3180 uint64_t 3181 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 3182 { 3183 return bdev->internal.period; 3184 } 3185 3186 uint64_t 3187 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 3188 { 3189 return bdev->internal.weighted_io_time; 3190 } 3191 3192 uint64_t 3193 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 3194 { 3195 return bdev->internal.io_time; 3196 } 3197 3198 static void 3199 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) 3200 { 3201 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3202 3203 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 3204 3205 if (bdev->internal.measured_queue_depth) { 3206 bdev->internal.io_time += bdev->internal.period; 3207 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 3208 } 3209 } 3210 3211 static void 3212 _calculate_measured_qd(struct spdk_io_channel_iter *i) 3213 { 3214 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3215 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 3216 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); 3217 3218 bdev->internal.temporary_queue_depth += ch->io_outstanding; 3219 spdk_for_each_channel_continue(i, 0); 3220 } 3221 3222 static int 3223 bdev_calculate_measured_queue_depth(void *ctx) 3224 { 3225 struct spdk_bdev *bdev = ctx; 3226 bdev->internal.temporary_queue_depth = 0; 3227 spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, 3228 _calculate_measured_qd_cpl); 3229 return SPDK_POLLER_BUSY; 3230 } 3231 3232 void 3233 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 3234 { 3235 bdev->internal.period = period; 3236 3237 if (bdev->internal.qd_poller != NULL) { 3238 spdk_poller_unregister(&bdev->internal.qd_poller); 3239 bdev->internal.measured_queue_depth = UINT64_MAX; 3240 } 3241 3242 if (period != 0) { 3243 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, bdev, 3244 period); 3245 } 3246 } 3247 3248 static void 3249 _resize_notify(void *arg) 3250 { 3251 struct spdk_bdev_desc *desc = arg; 3252 3253 pthread_mutex_lock(&desc->mutex); 3254 desc->refs--; 3255 if (!desc->closed) { 3256 pthread_mutex_unlock(&desc->mutex); 3257 desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, 3258 desc->bdev, 3259 desc->callback.ctx); 3260 return; 3261 } else if (0 == desc->refs) { 3262 /* This descriptor was closed after this resize_notify message was sent. 3263 * spdk_bdev_close() could not free the descriptor since this message was 3264 * in flight, so we free it now using bdev_desc_free(). 
3265 */ 3266 pthread_mutex_unlock(&desc->mutex); 3267 bdev_desc_free(desc); 3268 return; 3269 } 3270 pthread_mutex_unlock(&desc->mutex); 3271 } 3272 3273 int 3274 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 3275 { 3276 struct spdk_bdev_desc *desc; 3277 int ret; 3278 3279 pthread_mutex_lock(&bdev->internal.mutex); 3280 3281 /* bdev has open descriptors */ 3282 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 3283 bdev->blockcnt > size) { 3284 ret = -EBUSY; 3285 } else { 3286 bdev->blockcnt = size; 3287 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 3288 pthread_mutex_lock(&desc->mutex); 3289 if (desc->callback.open_with_ext && !desc->closed) { 3290 desc->refs++; 3291 spdk_thread_send_msg(desc->thread, _resize_notify, desc); 3292 } 3293 pthread_mutex_unlock(&desc->mutex); 3294 } 3295 ret = 0; 3296 } 3297 3298 pthread_mutex_unlock(&bdev->internal.mutex); 3299 3300 return ret; 3301 } 3302 3303 /* 3304 * Convert I/O offset and length from bytes to blocks. 3305 * 3306 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 3307 */ 3308 static uint64_t 3309 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 3310 uint64_t num_bytes, uint64_t *num_blocks) 3311 { 3312 uint32_t block_size = bdev->blocklen; 3313 uint8_t shift_cnt; 3314 3315 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 3316 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 3317 shift_cnt = spdk_u32log2(block_size); 3318 *offset_blocks = offset_bytes >> shift_cnt; 3319 *num_blocks = num_bytes >> shift_cnt; 3320 return (offset_bytes - (*offset_blocks << shift_cnt)) | 3321 (num_bytes - (*num_blocks << shift_cnt)); 3322 } else { 3323 *offset_blocks = offset_bytes / block_size; 3324 *num_blocks = num_bytes / block_size; 3325 return (offset_bytes % block_size) | (num_bytes % block_size); 3326 } 3327 } 3328 3329 static bool 3330 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 3331 { 3332 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 3333 * has been an overflow and hence the offset has been wrapped around */ 3334 if (offset_blocks + num_blocks < offset_blocks) { 3335 return false; 3336 } 3337 3338 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 3339 if (offset_blocks + num_blocks > bdev->blockcnt) { 3340 return false; 3341 } 3342 3343 return true; 3344 } 3345 3346 static bool 3347 _bdev_io_check_md_buf(const struct iovec *iovs, const void *md_buf) 3348 { 3349 return _is_buf_allocated(iovs) == (md_buf != NULL); 3350 } 3351 3352 static int 3353 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 3354 void *md_buf, int64_t offset_blocks, uint64_t num_blocks, 3355 spdk_bdev_io_completion_cb cb, void *cb_arg) 3356 { 3357 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3358 struct spdk_bdev_io *bdev_io; 3359 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3360 3361 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3362 return -EINVAL; 3363 } 3364 3365 bdev_io = bdev_channel_get_io(channel); 3366 if (!bdev_io) { 3367 return -ENOMEM; 3368 } 3369 3370 bdev_io->internal.ch = channel; 3371 bdev_io->internal.desc = desc; 3372 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 3373 bdev_io->u.bdev.iovs = &bdev_io->iov; 3374 bdev_io->u.bdev.iovs[0].iov_base = buf; 3375 bdev_io->u.bdev.iovs[0].iov_len = num_blocks 
* bdev->blocklen; 3376 bdev_io->u.bdev.iovcnt = 1; 3377 bdev_io->u.bdev.md_buf = md_buf; 3378 bdev_io->u.bdev.num_blocks = num_blocks; 3379 bdev_io->u.bdev.offset_blocks = offset_blocks; 3380 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3381 3382 bdev_io_submit(bdev_io); 3383 return 0; 3384 } 3385 3386 int 3387 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3388 void *buf, uint64_t offset, uint64_t nbytes, 3389 spdk_bdev_io_completion_cb cb, void *cb_arg) 3390 { 3391 uint64_t offset_blocks, num_blocks; 3392 3393 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3394 nbytes, &num_blocks) != 0) { 3395 return -EINVAL; 3396 } 3397 3398 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 3399 } 3400 3401 int 3402 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3403 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 3404 spdk_bdev_io_completion_cb cb, void *cb_arg) 3405 { 3406 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 3407 } 3408 3409 int 3410 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3411 void *buf, void *md_buf, int64_t offset_blocks, uint64_t num_blocks, 3412 spdk_bdev_io_completion_cb cb, void *cb_arg) 3413 { 3414 struct iovec iov = { 3415 .iov_base = buf, 3416 }; 3417 3418 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3419 return -EINVAL; 3420 } 3421 3422 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 3423 return -EINVAL; 3424 } 3425 3426 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 3427 cb, cb_arg); 3428 } 3429 3430 int 3431 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3432 struct iovec *iov, int iovcnt, 3433 uint64_t offset, uint64_t nbytes, 3434 spdk_bdev_io_completion_cb cb, void *cb_arg) 3435 { 3436 uint64_t offset_blocks, num_blocks; 3437 3438 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3439 nbytes, &num_blocks) != 0) { 3440 return -EINVAL; 3441 } 3442 3443 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 3444 } 3445 3446 static int 3447 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3448 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 3449 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg) 3450 { 3451 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3452 struct spdk_bdev_io *bdev_io; 3453 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3454 3455 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3456 return -EINVAL; 3457 } 3458 3459 bdev_io = bdev_channel_get_io(channel); 3460 if (!bdev_io) { 3461 return -ENOMEM; 3462 } 3463 3464 bdev_io->internal.ch = channel; 3465 bdev_io->internal.desc = desc; 3466 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 3467 bdev_io->u.bdev.iovs = iov; 3468 bdev_io->u.bdev.iovcnt = iovcnt; 3469 bdev_io->u.bdev.md_buf = md_buf; 3470 bdev_io->u.bdev.num_blocks = num_blocks; 3471 bdev_io->u.bdev.offset_blocks = offset_blocks; 3472 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3473 3474 bdev_io_submit(bdev_io); 3475 return 0; 3476 } 3477 3478 int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3479 struct iovec *iov, int iovcnt, 3480 uint64_t offset_blocks, uint64_t num_blocks, 3481 spdk_bdev_io_completion_cb cb, void *cb_arg) 3482 { 3483 return 
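/*
 * Note (illustrative, not part of the original file): the public block-based
 * wrappers all funnel into the *_with_md() helpers with md_buf == NULL. The
 * byte-based variants above (spdk_bdev_read()/spdk_bdev_readv()) only succeed
 * when offset and nbytes are multiples of the block size; with a hypothetical
 * 512-byte block size, offset 4096 maps to block 8, while offset 4100 makes
 * bdev_bytes_to_blocks() return nonzero and the call fails with -EINVAL.
 */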
bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 3484 num_blocks, cb, cb_arg); 3485 } 3486 3487 int 3488 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3489 struct iovec *iov, int iovcnt, void *md_buf, 3490 uint64_t offset_blocks, uint64_t num_blocks, 3491 spdk_bdev_io_completion_cb cb, void *cb_arg) 3492 { 3493 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3494 return -EINVAL; 3495 } 3496 3497 if (!_bdev_io_check_md_buf(iov, md_buf)) { 3498 return -EINVAL; 3499 } 3500 3501 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 3502 num_blocks, cb, cb_arg); 3503 } 3504 3505 static int 3506 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3507 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3508 spdk_bdev_io_completion_cb cb, void *cb_arg) 3509 { 3510 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3511 struct spdk_bdev_io *bdev_io; 3512 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3513 3514 if (!desc->write) { 3515 return -EBADF; 3516 } 3517 3518 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3519 return -EINVAL; 3520 } 3521 3522 bdev_io = bdev_channel_get_io(channel); 3523 if (!bdev_io) { 3524 return -ENOMEM; 3525 } 3526 3527 bdev_io->internal.ch = channel; 3528 bdev_io->internal.desc = desc; 3529 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 3530 bdev_io->u.bdev.iovs = &bdev_io->iov; 3531 bdev_io->u.bdev.iovs[0].iov_base = buf; 3532 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 3533 bdev_io->u.bdev.iovcnt = 1; 3534 bdev_io->u.bdev.md_buf = md_buf; 3535 bdev_io->u.bdev.num_blocks = num_blocks; 3536 bdev_io->u.bdev.offset_blocks = offset_blocks; 3537 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3538 3539 bdev_io_submit(bdev_io); 3540 return 0; 3541 } 3542 3543 int 3544 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3545 void *buf, uint64_t offset, uint64_t nbytes, 3546 spdk_bdev_io_completion_cb cb, void *cb_arg) 3547 { 3548 uint64_t offset_blocks, num_blocks; 3549 3550 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3551 nbytes, &num_blocks) != 0) { 3552 return -EINVAL; 3553 } 3554 3555 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 3556 } 3557 3558 int 3559 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3560 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 3561 spdk_bdev_io_completion_cb cb, void *cb_arg) 3562 { 3563 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 3564 cb, cb_arg); 3565 } 3566 3567 int 3568 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3569 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3570 spdk_bdev_io_completion_cb cb, void *cb_arg) 3571 { 3572 struct iovec iov = { 3573 .iov_base = buf, 3574 }; 3575 3576 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3577 return -EINVAL; 3578 } 3579 3580 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 3581 return -EINVAL; 3582 } 3583 3584 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 3585 cb, cb_arg); 3586 } 3587 3588 static int 3589 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3590 struct iovec *iov, int iovcnt, void *md_buf, 3591 uint64_t offset_blocks, uint64_t num_blocks, 3592 spdk_bdev_io_completion_cb cb, 
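/*
 * Note (illustrative, not part of the original file): like the other write
 * paths in this file, this helper rejects descriptors that were not opened
 * writable (the !desc->write check below returns -EBADF). Callers typically
 * obtain a writable descriptor by opening the bdev with write == true, e.g.
 * via spdk_bdev_open_ext().
 */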
void *cb_arg) 3593 { 3594 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3595 struct spdk_bdev_io *bdev_io; 3596 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3597 3598 if (!desc->write) { 3599 return -EBADF; 3600 } 3601 3602 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3603 return -EINVAL; 3604 } 3605 3606 bdev_io = bdev_channel_get_io(channel); 3607 if (!bdev_io) { 3608 return -ENOMEM; 3609 } 3610 3611 bdev_io->internal.ch = channel; 3612 bdev_io->internal.desc = desc; 3613 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 3614 bdev_io->u.bdev.iovs = iov; 3615 bdev_io->u.bdev.iovcnt = iovcnt; 3616 bdev_io->u.bdev.md_buf = md_buf; 3617 bdev_io->u.bdev.num_blocks = num_blocks; 3618 bdev_io->u.bdev.offset_blocks = offset_blocks; 3619 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3620 3621 bdev_io_submit(bdev_io); 3622 return 0; 3623 } 3624 3625 int 3626 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3627 struct iovec *iov, int iovcnt, 3628 uint64_t offset, uint64_t len, 3629 spdk_bdev_io_completion_cb cb, void *cb_arg) 3630 { 3631 uint64_t offset_blocks, num_blocks; 3632 3633 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3634 len, &num_blocks) != 0) { 3635 return -EINVAL; 3636 } 3637 3638 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 3639 } 3640 3641 int 3642 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3643 struct iovec *iov, int iovcnt, 3644 uint64_t offset_blocks, uint64_t num_blocks, 3645 spdk_bdev_io_completion_cb cb, void *cb_arg) 3646 { 3647 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 3648 num_blocks, cb, cb_arg); 3649 } 3650 3651 int 3652 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3653 struct iovec *iov, int iovcnt, void *md_buf, 3654 uint64_t offset_blocks, uint64_t num_blocks, 3655 spdk_bdev_io_completion_cb cb, void *cb_arg) 3656 { 3657 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3658 return -EINVAL; 3659 } 3660 3661 if (!_bdev_io_check_md_buf(iov, md_buf)) { 3662 return -EINVAL; 3663 } 3664 3665 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 3666 num_blocks, cb, cb_arg); 3667 } 3668 3669 static void 3670 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3671 { 3672 struct spdk_bdev_io *parent_io = cb_arg; 3673 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 3674 int i, rc = 0; 3675 3676 if (!success) { 3677 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3678 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 3679 spdk_bdev_free_io(bdev_io); 3680 return; 3681 } 3682 3683 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 3684 rc = memcmp(read_buf, 3685 parent_io->u.bdev.iovs[i].iov_base, 3686 parent_io->u.bdev.iovs[i].iov_len); 3687 if (rc) { 3688 break; 3689 } 3690 read_buf += parent_io->u.bdev.iovs[i].iov_len; 3691 } 3692 3693 spdk_bdev_free_io(bdev_io); 3694 3695 if (rc == 0) { 3696 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3697 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 3698 } else { 3699 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 3700 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 3701 } 3702 } 3703 3704 static void 3705 bdev_compare_do_read(void *_bdev_io) 3706 { 3707 struct spdk_bdev_io *bdev_io = _bdev_io; 
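/*
 * Note (illustrative, not part of the original file): this is the emulated
 * COMPARE path. bdev_comparev_blocks_with_md() below only takes it when the
 * module does not report support for SPDK_BDEV_IO_TYPE_COMPARE; the bdev
 * layer then reads the LBA range (letting the read path allocate a buffer,
 * since NULL is passed as buf) and bdev_compare_do_read_done() above
 * memcmp()s the result against the caller's iovecs, completing with
 * SPDK_BDEV_IO_STATUS_MISCOMPARE on a mismatch. A minimal caller sketch,
 * where my_compare_done is a hypothetical completion callback:
 *
 *     static void
 *     my_compare_done(struct spdk_bdev_io *io, bool success, void *cb_arg)
 *     {
 *             spdk_bdev_free_io(io);
 *     }
 *
 *     rc = spdk_bdev_comparev_blocks(desc, io_ch, iovs, iovcnt,
 *                                    offset_blocks, num_blocks,
 *                                    my_compare_done, NULL);
 */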
3708 int rc; 3709 3710 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 3711 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 3712 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3713 bdev_compare_do_read_done, bdev_io); 3714 3715 if (rc == -ENOMEM) { 3716 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 3717 } else if (rc != 0) { 3718 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3719 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3720 } 3721 } 3722 3723 static int 3724 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3725 struct iovec *iov, int iovcnt, void *md_buf, 3726 uint64_t offset_blocks, uint64_t num_blocks, 3727 spdk_bdev_io_completion_cb cb, void *cb_arg) 3728 { 3729 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3730 struct spdk_bdev_io *bdev_io; 3731 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3732 3733 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3734 return -EINVAL; 3735 } 3736 3737 bdev_io = bdev_channel_get_io(channel); 3738 if (!bdev_io) { 3739 return -ENOMEM; 3740 } 3741 3742 bdev_io->internal.ch = channel; 3743 bdev_io->internal.desc = desc; 3744 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 3745 bdev_io->u.bdev.iovs = iov; 3746 bdev_io->u.bdev.iovcnt = iovcnt; 3747 bdev_io->u.bdev.md_buf = md_buf; 3748 bdev_io->u.bdev.num_blocks = num_blocks; 3749 bdev_io->u.bdev.offset_blocks = offset_blocks; 3750 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3751 3752 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 3753 bdev_io_submit(bdev_io); 3754 return 0; 3755 } 3756 3757 bdev_compare_do_read(bdev_io); 3758 3759 return 0; 3760 } 3761 3762 int 3763 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3764 struct iovec *iov, int iovcnt, 3765 uint64_t offset_blocks, uint64_t num_blocks, 3766 spdk_bdev_io_completion_cb cb, void *cb_arg) 3767 { 3768 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 3769 num_blocks, cb, cb_arg); 3770 } 3771 3772 int 3773 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3774 struct iovec *iov, int iovcnt, void *md_buf, 3775 uint64_t offset_blocks, uint64_t num_blocks, 3776 spdk_bdev_io_completion_cb cb, void *cb_arg) 3777 { 3778 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3779 return -EINVAL; 3780 } 3781 3782 if (!_bdev_io_check_md_buf(iov, md_buf)) { 3783 return -EINVAL; 3784 } 3785 3786 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 3787 num_blocks, cb, cb_arg); 3788 } 3789 3790 static int 3791 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3792 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3793 spdk_bdev_io_completion_cb cb, void *cb_arg) 3794 { 3795 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3796 struct spdk_bdev_io *bdev_io; 3797 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3798 3799 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3800 return -EINVAL; 3801 } 3802 3803 bdev_io = bdev_channel_get_io(channel); 3804 if (!bdev_io) { 3805 return -ENOMEM; 3806 } 3807 3808 bdev_io->internal.ch = channel; 3809 bdev_io->internal.desc = desc; 3810 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 3811 bdev_io->u.bdev.iovs = &bdev_io->iov; 3812 bdev_io->u.bdev.iovs[0].iov_base = buf; 3813 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 3814 
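/*
 * Note (illustrative, not part of the original file): the single-buffer
 * variants in this file (read, write, compare) wrap the caller's flat buffer
 * in the embedded one-element iovec, so the rest of the stack only ever deals
 * with iovec arrays. The setup here is equivalent to the caller passing:
 *
 *     struct iovec iov = {
 *             .iov_base = buf,
 *             .iov_len  = num_blocks * bdev->blocklen,
 *     };
 *
 * to the corresponding vectored API.
 */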
bdev_io->u.bdev.iovcnt = 1; 3815 bdev_io->u.bdev.md_buf = md_buf; 3816 bdev_io->u.bdev.num_blocks = num_blocks; 3817 bdev_io->u.bdev.offset_blocks = offset_blocks; 3818 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3819 3820 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 3821 bdev_io_submit(bdev_io); 3822 return 0; 3823 } 3824 3825 bdev_compare_do_read(bdev_io); 3826 3827 return 0; 3828 } 3829 3830 int 3831 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3832 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 3833 spdk_bdev_io_completion_cb cb, void *cb_arg) 3834 { 3835 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 3836 cb, cb_arg); 3837 } 3838 3839 int 3840 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3841 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3842 spdk_bdev_io_completion_cb cb, void *cb_arg) 3843 { 3844 struct iovec iov = { 3845 .iov_base = buf, 3846 }; 3847 3848 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3849 return -EINVAL; 3850 } 3851 3852 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 3853 return -EINVAL; 3854 } 3855 3856 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 3857 cb, cb_arg); 3858 } 3859 3860 static void 3861 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 3862 { 3863 struct spdk_bdev_io *bdev_io = ctx; 3864 3865 if (unlock_status) { 3866 SPDK_ERRLOG("LBA range unlock failed\n"); 3867 } 3868 3869 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 3870 false, bdev_io->internal.caller_ctx); 3871 } 3872 3873 static void 3874 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 3875 { 3876 bdev_io->internal.status = status; 3877 3878 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 3879 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3880 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 3881 } 3882 3883 static void 3884 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3885 { 3886 struct spdk_bdev_io *parent_io = cb_arg; 3887 3888 if (!success) { 3889 SPDK_ERRLOG("Compare and write operation failed\n"); 3890 } 3891 3892 spdk_bdev_free_io(bdev_io); 3893 3894 bdev_comparev_and_writev_blocks_unlock(parent_io, 3895 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 3896 } 3897 3898 static void 3899 bdev_compare_and_write_do_write(void *_bdev_io) 3900 { 3901 struct spdk_bdev_io *bdev_io = _bdev_io; 3902 int rc; 3903 3904 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 3905 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3906 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 3907 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3908 bdev_compare_and_write_do_write_done, bdev_io); 3909 3910 3911 if (rc == -ENOMEM) { 3912 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 3913 } else if (rc != 0) { 3914 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3915 } 3916 } 3917 3918 static void 3919 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3920 { 3921 struct spdk_bdev_io *parent_io = cb_arg; 3922 3923 spdk_bdev_free_io(bdev_io); 3924 3925 if (!success) { 3926 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 3927 return; 3928 } 3929 3930 bdev_compare_and_write_do_write(parent_io); 3931 } 3932 3933 static void 3934 bdev_compare_and_write_do_compare(void *_bdev_io) 3935 { 3936 struct spdk_bdev_io *bdev_io = _bdev_io; 3937 int rc; 3938 3939 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 3940 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 3941 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3942 bdev_compare_and_write_do_compare_done, bdev_io); 3943 3944 if (rc == -ENOMEM) { 3945 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 3946 } else if (rc != 0) { 3947 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 3948 } 3949 } 3950 3951 static void 3952 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 3953 { 3954 struct spdk_bdev_io *bdev_io = ctx; 3955 3956 if (status) { 3957 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 3958 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3959 return; 3960 } 3961 3962 bdev_compare_and_write_do_compare(bdev_io); 3963 } 3964 3965 int 3966 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3967 struct iovec *compare_iov, int compare_iovcnt, 3968 struct iovec *write_iov, int write_iovcnt, 3969 uint64_t offset_blocks, uint64_t num_blocks, 3970 spdk_bdev_io_completion_cb cb, void *cb_arg) 3971 { 3972 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3973 struct spdk_bdev_io *bdev_io; 3974 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3975 3976 if (!desc->write) { 3977 return -EBADF; 3978 } 3979 3980 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3981 return -EINVAL; 3982 } 3983 3984 if (num_blocks > bdev->acwu) { 3985 return -EINVAL; 3986 } 3987 3988 bdev_io = bdev_channel_get_io(channel); 3989 if (!bdev_io) { 3990 return -ENOMEM; 3991 } 3992 3993 bdev_io->internal.ch = channel; 3994 bdev_io->internal.desc = desc; 3995 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 3996 bdev_io->u.bdev.iovs = compare_iov; 3997 bdev_io->u.bdev.iovcnt = compare_iovcnt; 3998 bdev_io->u.bdev.fused_iovs = write_iov; 3999 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 4000 bdev_io->u.bdev.md_buf = NULL; 4001 bdev_io->u.bdev.num_blocks = num_blocks; 4002 bdev_io->u.bdev.offset_blocks = offset_blocks; 4003 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4004 4005 if (bdev_io_type_supported(bdev, 
SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 4006 bdev_io_submit(bdev_io); 4007 return 0; 4008 } 4009 4010 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 4011 bdev_comparev_and_writev_blocks_locked, bdev_io); 4012 } 4013 4014 static void 4015 bdev_zcopy_get_buf(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 4016 { 4017 if (!success) { 4018 /* Don't use spdk_bdev_io_complete here - this bdev_io was never actually submitted. */ 4019 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 4020 bdev_io->internal.cb(bdev_io, success, bdev_io->internal.caller_ctx); 4021 return; 4022 } 4023 4024 if (bdev_io->u.bdev.zcopy.populate) { 4025 /* Read the real data into the buffer */ 4026 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4027 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 4028 bdev_io_submit(bdev_io); 4029 return; 4030 } 4031 4032 /* Don't use spdk_bdev_io_complete here - this bdev_io was never actually submitted. */ 4033 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4034 bdev_io->internal.cb(bdev_io, success, bdev_io->internal.caller_ctx); 4035 } 4036 4037 int 4038 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4039 uint64_t offset_blocks, uint64_t num_blocks, 4040 bool populate, 4041 spdk_bdev_io_completion_cb cb, void *cb_arg) 4042 { 4043 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4044 struct spdk_bdev_io *bdev_io; 4045 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4046 4047 if (!desc->write) { 4048 return -EBADF; 4049 } 4050 4051 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4052 return -EINVAL; 4053 } 4054 4055 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 4056 return -ENOTSUP; 4057 } 4058 4059 bdev_io = bdev_channel_get_io(channel); 4060 if (!bdev_io) { 4061 return -ENOMEM; 4062 } 4063 4064 bdev_io->internal.ch = channel; 4065 bdev_io->internal.desc = desc; 4066 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 4067 bdev_io->u.bdev.num_blocks = num_blocks; 4068 bdev_io->u.bdev.offset_blocks = offset_blocks; 4069 bdev_io->u.bdev.iovs = NULL; 4070 bdev_io->u.bdev.iovcnt = 0; 4071 bdev_io->u.bdev.md_buf = NULL; 4072 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 4073 bdev_io->u.bdev.zcopy.commit = 0; 4074 bdev_io->u.bdev.zcopy.start = 1; 4075 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4076 4077 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 4078 bdev_io_submit(bdev_io); 4079 } else { 4080 /* Emulate zcopy by allocating a buffer */ 4081 spdk_bdev_io_get_buf(bdev_io, bdev_zcopy_get_buf, 4082 bdev_io->u.bdev.num_blocks * bdev->blocklen); 4083 } 4084 4085 return 0; 4086 } 4087 4088 int 4089 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 4090 spdk_bdev_io_completion_cb cb, void *cb_arg) 4091 { 4092 struct spdk_bdev *bdev = bdev_io->bdev; 4093 4094 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 4095 /* This can happen if the zcopy was emulated in start */ 4096 if (bdev_io->u.bdev.zcopy.start != 1) { 4097 return -EINVAL; 4098 } 4099 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 4100 } 4101 4102 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 4103 return -EINVAL; 4104 } 4105 4106 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 4107 bdev_io->u.bdev.zcopy.start = 0; 4108 bdev_io->internal.caller_ctx = cb_arg; 4109 bdev_io->internal.cb = cb; 4110 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 4111 4112 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 4113 bdev_io_submit(bdev_io); 4114 return 0; 4115 } 4116 4117 if (!bdev_io->u.bdev.zcopy.commit) { 4118 /* Don't use spdk_bdev_io_complete here - this bdev_io was never actually submitted. */ 4119 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4120 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 4121 return 0; 4122 } 4123 4124 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4125 bdev_io_submit(bdev_io); 4126 4127 return 0; 4128 } 4129 4130 int 4131 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4132 uint64_t offset, uint64_t len, 4133 spdk_bdev_io_completion_cb cb, void *cb_arg) 4134 { 4135 uint64_t offset_blocks, num_blocks; 4136 4137 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4138 len, &num_blocks) != 0) { 4139 return -EINVAL; 4140 } 4141 4142 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4143 } 4144 4145 int 4146 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4147 uint64_t offset_blocks, uint64_t num_blocks, 4148 spdk_bdev_io_completion_cb cb, void *cb_arg) 4149 { 4150 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4151 struct spdk_bdev_io *bdev_io; 4152 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4153 4154 if (!desc->write) { 4155 return -EBADF; 4156 } 4157 4158 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4159 return -EINVAL; 4160 } 4161 4162 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 4163 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 4164 return -ENOTSUP; 4165 } 4166 4167 bdev_io = bdev_channel_get_io(channel); 4168 4169 if (!bdev_io) { 4170 return -ENOMEM; 4171 } 4172 4173 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 4174 bdev_io->internal.ch = channel; 4175 bdev_io->internal.desc = desc; 4176 bdev_io->u.bdev.offset_blocks = offset_blocks; 4177 bdev_io->u.bdev.num_blocks = num_blocks; 4178 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4179 4180 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 4181 bdev_io_submit(bdev_io); 4182 return 0; 4183 } 4184 4185 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 4186 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 4187 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 4188 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 4189 bdev_write_zero_buffer_next(bdev_io); 4190 4191 return 0; 4192 } 4193 4194 int 4195 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4196 uint64_t offset, uint64_t nbytes, 4197 spdk_bdev_io_completion_cb cb, void *cb_arg) 4198 { 4199 uint64_t offset_blocks, num_blocks; 4200 4201 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4202 nbytes, &num_blocks) != 0) { 4203 return -EINVAL; 4204 } 4205 4206 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4207 } 4208 4209 int 4210 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4211 uint64_t offset_blocks, uint64_t num_blocks, 4212 spdk_bdev_io_completion_cb cb, void *cb_arg) 4213 { 4214 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4215 struct spdk_bdev_io *bdev_io; 4216 struct 
spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4217 4218 if (!desc->write) { 4219 return -EBADF; 4220 } 4221 4222 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4223 return -EINVAL; 4224 } 4225 4226 if (num_blocks == 0) { 4227 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 4228 return -EINVAL; 4229 } 4230 4231 bdev_io = bdev_channel_get_io(channel); 4232 if (!bdev_io) { 4233 return -ENOMEM; 4234 } 4235 4236 bdev_io->internal.ch = channel; 4237 bdev_io->internal.desc = desc; 4238 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 4239 4240 bdev_io->u.bdev.iovs = &bdev_io->iov; 4241 bdev_io->u.bdev.iovs[0].iov_base = NULL; 4242 bdev_io->u.bdev.iovs[0].iov_len = 0; 4243 bdev_io->u.bdev.iovcnt = 1; 4244 4245 bdev_io->u.bdev.offset_blocks = offset_blocks; 4246 bdev_io->u.bdev.num_blocks = num_blocks; 4247 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4248 4249 bdev_io_submit(bdev_io); 4250 return 0; 4251 } 4252 4253 int 4254 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4255 uint64_t offset, uint64_t length, 4256 spdk_bdev_io_completion_cb cb, void *cb_arg) 4257 { 4258 uint64_t offset_blocks, num_blocks; 4259 4260 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4261 length, &num_blocks) != 0) { 4262 return -EINVAL; 4263 } 4264 4265 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4266 } 4267 4268 int 4269 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4270 uint64_t offset_blocks, uint64_t num_blocks, 4271 spdk_bdev_io_completion_cb cb, void *cb_arg) 4272 { 4273 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4274 struct spdk_bdev_io *bdev_io; 4275 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4276 4277 if (!desc->write) { 4278 return -EBADF; 4279 } 4280 4281 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4282 return -EINVAL; 4283 } 4284 4285 bdev_io = bdev_channel_get_io(channel); 4286 if (!bdev_io) { 4287 return -ENOMEM; 4288 } 4289 4290 bdev_io->internal.ch = channel; 4291 bdev_io->internal.desc = desc; 4292 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 4293 bdev_io->u.bdev.iovs = NULL; 4294 bdev_io->u.bdev.iovcnt = 0; 4295 bdev_io->u.bdev.offset_blocks = offset_blocks; 4296 bdev_io->u.bdev.num_blocks = num_blocks; 4297 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4298 4299 bdev_io_submit(bdev_io); 4300 return 0; 4301 } 4302 4303 static void 4304 bdev_reset_dev(struct spdk_io_channel_iter *i, int status) 4305 { 4306 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 4307 struct spdk_bdev_io *bdev_io; 4308 4309 bdev_io = TAILQ_FIRST(&ch->queued_resets); 4310 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 4311 bdev_io_submit_reset(bdev_io); 4312 } 4313 4314 static void 4315 bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 4316 { 4317 struct spdk_io_channel *ch; 4318 struct spdk_bdev_channel *channel; 4319 struct spdk_bdev_mgmt_channel *mgmt_channel; 4320 struct spdk_bdev_shared_resource *shared_resource; 4321 bdev_io_tailq_t tmp_queued; 4322 4323 TAILQ_INIT(&tmp_queued); 4324 4325 ch = spdk_io_channel_iter_get_channel(i); 4326 channel = spdk_io_channel_get_ctx(ch); 4327 shared_resource = channel->shared_resource; 4328 mgmt_channel = shared_resource->mgmt_ch; 4329 4330 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 4331 4332 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 4333 /* The QoS object is always valid and readable while 4334 * the channel flag is set, so the lock here should not 4335 * be 
necessary. We're not in the fast path though, so 4336 * just take it anyway. */ 4337 pthread_mutex_lock(&channel->bdev->internal.mutex); 4338 if (channel->bdev->internal.qos->ch == channel) { 4339 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 4340 } 4341 pthread_mutex_unlock(&channel->bdev->internal.mutex); 4342 } 4343 4344 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 4345 bdev_abort_all_buf_io(&mgmt_channel->need_buf_small, channel); 4346 bdev_abort_all_buf_io(&mgmt_channel->need_buf_large, channel); 4347 bdev_abort_all_queued_io(&tmp_queued, channel); 4348 4349 spdk_for_each_channel_continue(i, 0); 4350 } 4351 4352 static void 4353 bdev_start_reset(void *ctx) 4354 { 4355 struct spdk_bdev_channel *ch = ctx; 4356 4357 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel, 4358 ch, bdev_reset_dev); 4359 } 4360 4361 static void 4362 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 4363 { 4364 struct spdk_bdev *bdev = ch->bdev; 4365 4366 assert(!TAILQ_EMPTY(&ch->queued_resets)); 4367 4368 pthread_mutex_lock(&bdev->internal.mutex); 4369 if (bdev->internal.reset_in_progress == NULL) { 4370 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 4371 /* 4372 * Take a channel reference for the target bdev for the life of this 4373 * reset. This guards against the channel getting destroyed while 4374 * spdk_for_each_channel() calls related to this reset IO are in 4375 * progress. We will release the reference when this reset is 4376 * completed. 4377 */ 4378 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 4379 bdev_start_reset(ch); 4380 } 4381 pthread_mutex_unlock(&bdev->internal.mutex); 4382 } 4383 4384 int 4385 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4386 spdk_bdev_io_completion_cb cb, void *cb_arg) 4387 { 4388 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4389 struct spdk_bdev_io *bdev_io; 4390 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4391 4392 bdev_io = bdev_channel_get_io(channel); 4393 if (!bdev_io) { 4394 return -ENOMEM; 4395 } 4396 4397 bdev_io->internal.ch = channel; 4398 bdev_io->internal.desc = desc; 4399 bdev_io->internal.submit_tsc = spdk_get_ticks(); 4400 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 4401 bdev_io->u.reset.ch_ref = NULL; 4402 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4403 4404 pthread_mutex_lock(&bdev->internal.mutex); 4405 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 4406 pthread_mutex_unlock(&bdev->internal.mutex); 4407 4408 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 4409 internal.ch_link); 4410 4411 bdev_channel_start_reset(channel); 4412 4413 return 0; 4414 } 4415 4416 void 4417 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 4418 struct spdk_bdev_io_stat *stat) 4419 { 4420 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4421 4422 *stat = channel->stat; 4423 } 4424 4425 static void 4426 bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 4427 { 4428 void *io_device = spdk_io_channel_iter_get_io_device(i); 4429 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 4430 4431 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 4432 bdev_iostat_ctx->cb_arg, 0); 4433 free(bdev_iostat_ctx); 4434 } 4435 4436 static void 4437 bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 4438 { 4439 struct 
spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 4440 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 4441 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4442 4443 bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); 4444 spdk_for_each_channel_continue(i, 0); 4445 } 4446 4447 void 4448 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 4449 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 4450 { 4451 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 4452 4453 assert(bdev != NULL); 4454 assert(stat != NULL); 4455 assert(cb != NULL); 4456 4457 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 4458 if (bdev_iostat_ctx == NULL) { 4459 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 4460 cb(bdev, stat, cb_arg, -ENOMEM); 4461 return; 4462 } 4463 4464 bdev_iostat_ctx->stat = stat; 4465 bdev_iostat_ctx->cb = cb; 4466 bdev_iostat_ctx->cb_arg = cb_arg; 4467 4468 /* Start with the statistics from previously deleted channels. */ 4469 pthread_mutex_lock(&bdev->internal.mutex); 4470 bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); 4471 pthread_mutex_unlock(&bdev->internal.mutex); 4472 4473 /* Then iterate and add the statistics from each existing channel. */ 4474 spdk_for_each_channel(__bdev_to_io_dev(bdev), 4475 bdev_get_each_channel_stat, 4476 bdev_iostat_ctx, 4477 bdev_get_device_stat_done); 4478 } 4479 4480 int 4481 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4482 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 4483 spdk_bdev_io_completion_cb cb, void *cb_arg) 4484 { 4485 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4486 struct spdk_bdev_io *bdev_io; 4487 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4488 4489 if (!desc->write) { 4490 return -EBADF; 4491 } 4492 4493 bdev_io = bdev_channel_get_io(channel); 4494 if (!bdev_io) { 4495 return -ENOMEM; 4496 } 4497 4498 bdev_io->internal.ch = channel; 4499 bdev_io->internal.desc = desc; 4500 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 4501 bdev_io->u.nvme_passthru.cmd = *cmd; 4502 bdev_io->u.nvme_passthru.buf = buf; 4503 bdev_io->u.nvme_passthru.nbytes = nbytes; 4504 bdev_io->u.nvme_passthru.md_buf = NULL; 4505 bdev_io->u.nvme_passthru.md_len = 0; 4506 4507 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4508 4509 bdev_io_submit(bdev_io); 4510 return 0; 4511 } 4512 4513 int 4514 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4515 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 4516 spdk_bdev_io_completion_cb cb, void *cb_arg) 4517 { 4518 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4519 struct spdk_bdev_io *bdev_io; 4520 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4521 4522 if (!desc->write) { 4523 /* 4524 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 4525 * to easily determine if the command is a read or write, but for now just 4526 * do not allow io_passthru with a read-only descriptor. 
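 * Callers that need NVMe passthru therefore have to open the bdev with write == true,
 * e.g. via spdk_bdev_open_ext(bdev_name, true, event_cb, event_ctx, &desc).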
4527 */ 4528 return -EBADF; 4529 } 4530 4531 bdev_io = bdev_channel_get_io(channel); 4532 if (!bdev_io) { 4533 return -ENOMEM; 4534 } 4535 4536 bdev_io->internal.ch = channel; 4537 bdev_io->internal.desc = desc; 4538 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 4539 bdev_io->u.nvme_passthru.cmd = *cmd; 4540 bdev_io->u.nvme_passthru.buf = buf; 4541 bdev_io->u.nvme_passthru.nbytes = nbytes; 4542 bdev_io->u.nvme_passthru.md_buf = NULL; 4543 bdev_io->u.nvme_passthru.md_len = 0; 4544 4545 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4546 4547 bdev_io_submit(bdev_io); 4548 return 0; 4549 } 4550 4551 int 4552 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4553 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 4554 spdk_bdev_io_completion_cb cb, void *cb_arg) 4555 { 4556 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4557 struct spdk_bdev_io *bdev_io; 4558 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4559 4560 if (!desc->write) { 4561 /* 4562 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 4563 * to easily determine if the command is a read or write, but for now just 4564 * do not allow io_passthru with a read-only descriptor. 4565 */ 4566 return -EBADF; 4567 } 4568 4569 bdev_io = bdev_channel_get_io(channel); 4570 if (!bdev_io) { 4571 return -ENOMEM; 4572 } 4573 4574 bdev_io->internal.ch = channel; 4575 bdev_io->internal.desc = desc; 4576 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 4577 bdev_io->u.nvme_passthru.cmd = *cmd; 4578 bdev_io->u.nvme_passthru.buf = buf; 4579 bdev_io->u.nvme_passthru.nbytes = nbytes; 4580 bdev_io->u.nvme_passthru.md_buf = md_buf; 4581 bdev_io->u.nvme_passthru.md_len = md_len; 4582 4583 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4584 4585 bdev_io_submit(bdev_io); 4586 return 0; 4587 } 4588 4589 static void bdev_abort_retry(void *ctx); 4590 static void bdev_abort(struct spdk_bdev_io *parent_io); 4591 4592 static void 4593 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4594 { 4595 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 4596 struct spdk_bdev_io *parent_io = cb_arg; 4597 struct spdk_bdev_io *bio_to_abort, *tmp_io; 4598 4599 bio_to_abort = bdev_io->u.abort.bio_to_abort; 4600 4601 spdk_bdev_free_io(bdev_io); 4602 4603 if (!success) { 4604 /* Check if the target I/O completed in the meantime. */ 4605 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 4606 if (tmp_io == bio_to_abort) { 4607 break; 4608 } 4609 } 4610 4611 /* If the target I/O still exists, set the parent to failed. */ 4612 if (tmp_io != NULL) { 4613 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4614 } 4615 } 4616 4617 parent_io->u.bdev.split_outstanding--; 4618 if (parent_io->u.bdev.split_outstanding == 0) { 4619 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 4620 bdev_abort_retry(parent_io); 4621 } else { 4622 bdev_io_complete(parent_io); 4623 } 4624 } 4625 } 4626 4627 static int 4628 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 4629 struct spdk_bdev_io *bio_to_abort, 4630 spdk_bdev_io_completion_cb cb, void *cb_arg) 4631 { 4632 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4633 struct spdk_bdev_io *bdev_io; 4634 4635 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 4636 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 4637 /* TODO: Abort reset or abort request. 
*/ 4638 return -ENOTSUP; 4639 } 4640 4641 bdev_io = bdev_channel_get_io(channel); 4642 if (bdev_io == NULL) { 4643 return -ENOMEM; 4644 } 4645 4646 bdev_io->internal.ch = channel; 4647 bdev_io->internal.desc = desc; 4648 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 4649 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4650 4651 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { 4652 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 4653 4654 /* Parent abort request is not submitted directly, but to manage its 4655 * execution add it to the submitted list here. 4656 */ 4657 bdev_io->internal.submit_tsc = spdk_get_ticks(); 4658 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 4659 4660 bdev_abort(bdev_io); 4661 4662 return 0; 4663 } 4664 4665 bdev_io->u.abort.bio_to_abort = bio_to_abort; 4666 4667 /* Submit the abort request to the underlying bdev module. */ 4668 bdev_io_submit(bdev_io); 4669 4670 return 0; 4671 } 4672 4673 static uint32_t 4674 _bdev_abort(struct spdk_bdev_io *parent_io) 4675 { 4676 struct spdk_bdev_desc *desc = parent_io->internal.desc; 4677 struct spdk_bdev_channel *channel = parent_io->internal.ch; 4678 void *bio_cb_arg; 4679 struct spdk_bdev_io *bio_to_abort; 4680 uint32_t matched_ios; 4681 int rc; 4682 4683 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 4684 4685 /* matched_ios is returned and will be kept by the caller. 4686 * 4687 * This function is used in two cases: 1) the same cb_arg is used for 4688 * multiple I/Os, and 2) a single large I/O is split into smaller ones. 4689 * Incrementing split_outstanding directly here may confuse readers, especially 4690 * for the 1st case. 4691 * 4692 * Completion of I/O abort is processed after stack unwinding. Hence this trick 4693 * works as expected. 4694 */ 4695 matched_ios = 0; 4696 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4697 4698 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 4699 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 4700 continue; 4701 } 4702 4703 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 4704 /* Any I/O which was submitted after this abort command should be excluded. */ 4705 continue; 4706 } 4707 4708 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 4709 if (rc != 0) { 4710 if (rc == -ENOMEM) { 4711 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 4712 } else { 4713 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4714 } 4715 break; 4716 } 4717 matched_ios++; 4718 } 4719 4720 return matched_ios; 4721 } 4722 4723 static void 4724 bdev_abort_retry(void *ctx) 4725 { 4726 struct spdk_bdev_io *parent_io = ctx; 4727 uint32_t matched_ios; 4728 4729 matched_ios = _bdev_abort(parent_io); 4730 4731 if (matched_ios == 0) { 4732 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 4733 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 4734 } else { 4735 /* For retry, the case that no target I/O was found is success 4736 * because it means target I/Os completed in the meantime. 4737 */ 4738 bdev_io_complete(parent_io); 4739 } 4740 return; 4741 } 4742 4743 /* Use split_outstanding to manage the progress of aborting I/Os.
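 * Each matched I/O gets its own child abort request; bdev_abort_io_done() decrements
 * split_outstanding and, once it reaches zero, either retries the parent (after a
 * child hit -ENOMEM) or completes it. A minimal caller-side sketch, with hypothetical
 * names (io_ch, original_io_cb_arg, abort_done_cb) and error handling elided:
 *
 *     rc = spdk_bdev_abort(desc, io_ch, original_io_cb_arg, abort_done_cb, NULL);
 *     if (rc == -ENOMEM) {
 *             queue an spdk_bdev_io_wait_entry and call spdk_bdev_abort() again later
 *     }
 *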
*/ 4744 parent_io->u.bdev.split_outstanding = matched_ios; 4745 } 4746 4747 static void 4748 bdev_abort(struct spdk_bdev_io *parent_io) 4749 { 4750 uint32_t matched_ios; 4751 4752 matched_ios = _bdev_abort(parent_io); 4753 4754 if (matched_ios == 0) { 4755 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 4756 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 4757 } else { 4758 /* For the initial submission, the case that no target I/O was found is a failure. */ 4759 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4760 bdev_io_complete(parent_io); 4761 } 4762 return; 4763 } 4764 4765 /* Use split_outstanding to manage the progress of aborting I/Os. */ 4766 parent_io->u.bdev.split_outstanding = matched_ios; 4767 } 4768 4769 int 4770 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4771 void *bio_cb_arg, 4772 spdk_bdev_io_completion_cb cb, void *cb_arg) 4773 { 4774 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4775 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4776 struct spdk_bdev_io *bdev_io; 4777 4778 if (bio_cb_arg == NULL) { 4779 return -EINVAL; 4780 } 4781 4782 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 4783 return -ENOTSUP; 4784 } 4785 4786 bdev_io = bdev_channel_get_io(channel); 4787 if (bdev_io == NULL) { 4788 return -ENOMEM; 4789 } 4790 4791 bdev_io->internal.ch = channel; 4792 bdev_io->internal.desc = desc; 4793 bdev_io->internal.submit_tsc = spdk_get_ticks(); 4794 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 4795 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4796 4797 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 4798 4799 /* Parent abort request is not submitted directly, but to manage its execution, 4800 * add it to the submitted list here. 4801 */ 4802 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 4803 4804 bdev_abort(bdev_io); 4805 4806 return 0; 4807 } 4808 4809 int 4810 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 4811 struct spdk_bdev_io_wait_entry *entry) 4812 { 4813 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4814 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 4815 4816 if (bdev != entry->bdev) { 4817 SPDK_ERRLOG("bdevs do not match\n"); 4818 return -EINVAL; 4819 } 4820 4821 if (mgmt_ch->per_thread_cache_count > 0) { 4822 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 4823 return -EINVAL; 4824 } 4825 4826 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 4827 return 0; 4828 } 4829 4830 static void 4831 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 4832 { 4833 struct spdk_bdev *bdev = bdev_ch->bdev; 4834 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 4835 struct spdk_bdev_io *bdev_io; 4836 4837 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 4838 /* 4839 * Allow some more I/O to complete before retrying the nomem_io queue. 4840 * Some drivers (such as nvme) cannot immediately take a new I/O in 4841 * the context of a completion, because the resources for the I/O are 4842 * not released until control returns to the bdev poller. Also, we 4843 * may require several small I/O to complete before a larger I/O 4844 * (that requires splitting) can be submitted.
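 * The threshold checked here is recomputed in spdk_bdev_io_complete() whenever a
 * request fails with NOMEM, as max(io_outstanding / 2, io_outstanding - NOMEM_THRESHOLD_COUNT).
 * For example, with 64 I/O outstanding the nomem queue is left alone until 8 of them
 * complete, while a channel with only 8 outstanding waits for half of them.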
4845 */ 4846 return; 4847 } 4848 4849 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 4850 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 4851 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 4852 bdev_io->internal.ch->io_outstanding++; 4853 shared_resource->io_outstanding++; 4854 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 4855 bdev_io->internal.error.nvme.cdw0 = 0; 4856 bdev_io->num_retries++; 4857 bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 4858 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 4859 break; 4860 } 4861 } 4862 } 4863 4864 static inline void 4865 bdev_io_complete(void *ctx) 4866 { 4867 struct spdk_bdev_io *bdev_io = ctx; 4868 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 4869 uint64_t tsc, tsc_diff; 4870 4871 if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { 4872 /* 4873 * Send the completion to the thread that originally submitted the I/O, 4874 * which may not be the current thread in the case of QoS. 4875 */ 4876 if (bdev_io->internal.io_submit_ch) { 4877 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 4878 bdev_io->internal.io_submit_ch = NULL; 4879 } 4880 4881 /* 4882 * Defer completion to avoid potential infinite recursion if the 4883 * user's completion callback issues a new I/O. 4884 */ 4885 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 4886 bdev_io_complete, bdev_io); 4887 return; 4888 } 4889 4890 tsc = spdk_get_ticks(); 4891 tsc_diff = tsc - bdev_io->internal.submit_tsc; 4892 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 0); 4893 4894 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 4895 4896 if (bdev_io->internal.ch->histogram) { 4897 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 4898 } 4899 4900 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 4901 switch (bdev_io->type) { 4902 case SPDK_BDEV_IO_TYPE_READ: 4903 bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 4904 bdev_io->internal.ch->stat.num_read_ops++; 4905 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 4906 break; 4907 case SPDK_BDEV_IO_TYPE_WRITE: 4908 bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 4909 bdev_io->internal.ch->stat.num_write_ops++; 4910 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 4911 break; 4912 case SPDK_BDEV_IO_TYPE_UNMAP: 4913 bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 4914 bdev_io->internal.ch->stat.num_unmap_ops++; 4915 bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff; 4916 break; 4917 case SPDK_BDEV_IO_TYPE_ZCOPY: 4918 /* Track the data in the start phase only */ 4919 if (bdev_io->u.bdev.zcopy.start) { 4920 if (bdev_io->u.bdev.zcopy.populate) { 4921 bdev_io->internal.ch->stat.bytes_read += 4922 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 4923 bdev_io->internal.ch->stat.num_read_ops++; 4924 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 4925 } else { 4926 bdev_io->internal.ch->stat.bytes_written += 4927 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 4928 bdev_io->internal.ch->stat.num_write_ops++; 4929 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 4930 } 4931 } 4932 break; 4933 default: 4934 break; 4935 } 4936 } 4937 4938 #ifdef SPDK_CONFIG_VTUNE 4939 uint64_t now_tsc = spdk_get_ticks(); 4940 if (now_tsc > 
(bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 4941 uint64_t data[5]; 4942 4943 data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; 4944 data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; 4945 data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; 4946 data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; 4947 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 4948 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 4949 4950 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 4951 __itt_metadata_u64, 5, data); 4952 4953 bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; 4954 bdev_io->internal.ch->start_tsc = now_tsc; 4955 } 4956 #endif 4957 4958 assert(bdev_io->internal.cb != NULL); 4959 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 4960 4961 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 4962 bdev_io->internal.caller_ctx); 4963 } 4964 4965 static void 4966 bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 4967 { 4968 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 4969 4970 if (bdev_io->u.reset.ch_ref != NULL) { 4971 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 4972 bdev_io->u.reset.ch_ref = NULL; 4973 } 4974 4975 bdev_io_complete(bdev_io); 4976 } 4977 4978 static void 4979 bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 4980 { 4981 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 4982 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4983 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 4984 struct spdk_bdev_io *queued_reset; 4985 4986 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 4987 while (!TAILQ_EMPTY(&ch->queued_resets)) { 4988 queued_reset = TAILQ_FIRST(&ch->queued_resets); 4989 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 4990 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 4991 } 4992 4993 spdk_for_each_channel_continue(i, 0); 4994 } 4995 4996 void 4997 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 4998 { 4999 struct spdk_bdev *bdev = bdev_io->bdev; 5000 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5001 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 5002 5003 bdev_io->internal.status = status; 5004 5005 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 5006 bool unlock_channels = false; 5007 5008 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 5009 SPDK_ERRLOG("NOMEM returned for reset\n"); 5010 } 5011 pthread_mutex_lock(&bdev->internal.mutex); 5012 if (bdev_io == bdev->internal.reset_in_progress) { 5013 bdev->internal.reset_in_progress = NULL; 5014 unlock_channels = true; 5015 } 5016 pthread_mutex_unlock(&bdev->internal.mutex); 5017 5018 if (unlock_channels) { 5019 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unfreeze_channel, 5020 bdev_io, bdev_reset_complete); 5021 return; 5022 } 5023 } else { 5024 _bdev_io_unset_bounce_buf(bdev_io); 5025 5026 assert(bdev_ch->io_outstanding > 0); 5027 assert(shared_resource->io_outstanding > 0); 5028 bdev_ch->io_outstanding--; 5029 shared_resource->io_outstanding--; 5030 5031 if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) { 5032 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 5033 
/* 5034 * Wait for some of the outstanding I/O to complete before we 5035 * retry any of the nomem_io. Normally we will wait for 5036 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 5037 * depth channels we will instead wait for half to complete. 5038 */ 5039 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 5040 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 5041 return; 5042 } 5043 5044 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 5045 bdev_ch_retry_io(bdev_ch); 5046 } 5047 } 5048 5049 bdev_io_complete(bdev_io); 5050 } 5051 5052 void 5053 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 5054 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 5055 { 5056 if (sc == SPDK_SCSI_STATUS_GOOD) { 5057 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5058 } else { 5059 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 5060 bdev_io->internal.error.scsi.sc = sc; 5061 bdev_io->internal.error.scsi.sk = sk; 5062 bdev_io->internal.error.scsi.asc = asc; 5063 bdev_io->internal.error.scsi.ascq = ascq; 5064 } 5065 5066 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5067 } 5068 5069 void 5070 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 5071 int *sc, int *sk, int *asc, int *ascq) 5072 { 5073 assert(sc != NULL); 5074 assert(sk != NULL); 5075 assert(asc != NULL); 5076 assert(ascq != NULL); 5077 5078 switch (bdev_io->internal.status) { 5079 case SPDK_BDEV_IO_STATUS_SUCCESS: 5080 *sc = SPDK_SCSI_STATUS_GOOD; 5081 *sk = SPDK_SCSI_SENSE_NO_SENSE; 5082 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 5083 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 5084 break; 5085 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 5086 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 5087 break; 5088 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 5089 *sc = bdev_io->internal.error.scsi.sc; 5090 *sk = bdev_io->internal.error.scsi.sk; 5091 *asc = bdev_io->internal.error.scsi.asc; 5092 *ascq = bdev_io->internal.error.scsi.ascq; 5093 break; 5094 default: 5095 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 5096 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 5097 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 5098 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 5099 break; 5100 } 5101 } 5102 5103 void 5104 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 5105 { 5106 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 5107 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5108 } else { 5109 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 5110 } 5111 5112 bdev_io->internal.error.nvme.cdw0 = cdw0; 5113 bdev_io->internal.error.nvme.sct = sct; 5114 bdev_io->internal.error.nvme.sc = sc; 5115 5116 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5117 } 5118 5119 void 5120 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 5121 { 5122 assert(sct != NULL); 5123 assert(sc != NULL); 5124 assert(cdw0 != NULL); 5125 5126 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 5127 *sct = bdev_io->internal.error.nvme.sct; 5128 *sc = bdev_io->internal.error.nvme.sc; 5129 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5130 *sct = SPDK_NVME_SCT_GENERIC; 5131 *sc = SPDK_NVME_SC_SUCCESS; 5132 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 5133 *sct = SPDK_NVME_SCT_GENERIC; 5134 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 5135 } else { 5136 *sct 
= SPDK_NVME_SCT_GENERIC; 5137 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5138 } 5139 5140 *cdw0 = bdev_io->internal.error.nvme.cdw0; 5141 } 5142 5143 void 5144 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 5145 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 5146 { 5147 assert(first_sct != NULL); 5148 assert(first_sc != NULL); 5149 assert(second_sct != NULL); 5150 assert(second_sc != NULL); 5151 assert(cdw0 != NULL); 5152 5153 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 5154 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 5155 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 5156 *first_sct = bdev_io->internal.error.nvme.sct; 5157 *first_sc = bdev_io->internal.error.nvme.sc; 5158 *second_sct = SPDK_NVME_SCT_GENERIC; 5159 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5160 } else { 5161 *first_sct = SPDK_NVME_SCT_GENERIC; 5162 *first_sc = SPDK_NVME_SC_SUCCESS; 5163 *second_sct = bdev_io->internal.error.nvme.sct; 5164 *second_sc = bdev_io->internal.error.nvme.sc; 5165 } 5166 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5167 *first_sct = SPDK_NVME_SCT_GENERIC; 5168 *first_sc = SPDK_NVME_SC_SUCCESS; 5169 *second_sct = SPDK_NVME_SCT_GENERIC; 5170 *second_sc = SPDK_NVME_SC_SUCCESS; 5171 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 5172 *first_sct = SPDK_NVME_SCT_GENERIC; 5173 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5174 *second_sct = SPDK_NVME_SCT_GENERIC; 5175 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5176 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 5177 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 5178 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 5179 *second_sct = SPDK_NVME_SCT_GENERIC; 5180 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5181 } else { 5182 *first_sct = SPDK_NVME_SCT_GENERIC; 5183 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5184 *second_sct = SPDK_NVME_SCT_GENERIC; 5185 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5186 } 5187 5188 *cdw0 = bdev_io->internal.error.nvme.cdw0; 5189 } 5190 5191 struct spdk_thread * 5192 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 5193 { 5194 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 5195 } 5196 5197 struct spdk_io_channel * 5198 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 5199 { 5200 return bdev_io->internal.ch->channel; 5201 } 5202 5203 static void 5204 bdev_qos_config_limit(struct spdk_bdev *bdev, uint64_t *limits) 5205 { 5206 uint64_t min_qos_set; 5207 int i; 5208 5209 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 5210 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 5211 break; 5212 } 5213 } 5214 5215 if (i == SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) { 5216 SPDK_ERRLOG("Invalid rate limits set.\n"); 5217 return; 5218 } 5219 5220 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 5221 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 5222 continue; 5223 } 5224 5225 if (bdev_qos_is_iops_rate_limit(i) == true) { 5226 min_qos_set = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 5227 } else { 5228 min_qos_set = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 5229 } 5230 5231 if (limits[i] == 0 || limits[i] % min_qos_set) { 5232 SPDK_ERRLOG("Assigned limit %" PRIu64 " on bdev %s is not multiple of %" PRIu64 "\n", 5233 limits[i], bdev->name, min_qos_set); 5234 SPDK_ERRLOG("Failed to enable QoS on this bdev %s\n", bdev->name); 5235 return; 5236 } 5237 } 5238 5239 if (!bdev->internal.qos) { 5240 
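/* The QoS tracking structure is allocated lazily, the first time any rate limit is configured for this bdev. */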
bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 5241 if (!bdev->internal.qos) { 5242 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 5243 return; 5244 } 5245 } 5246 5247 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 5248 bdev->internal.qos->rate_limits[i].limit = limits[i]; 5249 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS type:%d set:%lu\n", 5250 bdev->name, i, limits[i]); 5251 } 5252 5253 return; 5254 } 5255 5256 static void 5257 bdev_qos_config(struct spdk_bdev *bdev) 5258 { 5259 struct spdk_conf_section *sp = NULL; 5260 const char *val = NULL; 5261 int i = 0, j = 0; 5262 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {}; 5263 bool config_qos = false; 5264 5265 sp = spdk_conf_find_section(NULL, "QoS"); 5266 if (!sp) { 5267 return; 5268 } 5269 5270 while (j < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) { 5271 limits[j] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 5272 5273 i = 0; 5274 while (true) { 5275 val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 0); 5276 if (!val) { 5277 break; 5278 } 5279 5280 if (strcmp(bdev->name, val) != 0) { 5281 i++; 5282 continue; 5283 } 5284 5285 val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 1); 5286 if (val) { 5287 if (bdev_qos_is_iops_rate_limit(j) == true) { 5288 limits[j] = strtoull(val, NULL, 10); 5289 } else { 5290 limits[j] = strtoull(val, NULL, 10) * 1024 * 1024; 5291 } 5292 config_qos = true; 5293 } 5294 5295 break; 5296 } 5297 5298 j++; 5299 } 5300 5301 if (config_qos == true) { 5302 bdev_qos_config_limit(bdev, limits); 5303 } 5304 5305 return; 5306 } 5307 5308 static int 5309 bdev_init(struct spdk_bdev *bdev) 5310 { 5311 char *bdev_name; 5312 5313 assert(bdev->module != NULL); 5314 5315 if (!bdev->name) { 5316 SPDK_ERRLOG("Bdev name is NULL\n"); 5317 return -EINVAL; 5318 } 5319 5320 if (!strlen(bdev->name)) { 5321 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 5322 return -EINVAL; 5323 } 5324 5325 if (spdk_bdev_get_by_name(bdev->name)) { 5326 SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name); 5327 return -EEXIST; 5328 } 5329 5330 /* Users often register their own I/O devices using the bdev name. In 5331 * order to avoid conflicts, prepend bdev_. */ 5332 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 5333 if (!bdev_name) { 5334 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 5335 return -ENOMEM; 5336 } 5337 5338 bdev->internal.status = SPDK_BDEV_STATUS_READY; 5339 bdev->internal.measured_queue_depth = UINT64_MAX; 5340 bdev->internal.claim_module = NULL; 5341 bdev->internal.qd_poller = NULL; 5342 bdev->internal.qos = NULL; 5343 5344 /* If the user didn't specify a uuid, generate one. */ 5345 if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 5346 spdk_uuid_generate(&bdev->uuid); 5347 } 5348 5349 if (spdk_bdev_get_buf_align(bdev) > 1) { 5350 if (bdev->split_on_optimal_io_boundary) { 5351 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 5352 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 5353 } else { 5354 bdev->split_on_optimal_io_boundary = true; 5355 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 5356 } 5357 } 5358 5359 /* If the user didn't specify a write unit size, set it to one. 
*/ 5360 if (bdev->write_unit_size == 0) { 5361 bdev->write_unit_size = 1; 5362 } 5363 5364 /* Set ACWU value to 1 if bdev module did not set it (does not support it natively) */ 5365 if (bdev->acwu == 0) { 5366 bdev->acwu = 1; 5367 } 5368 5369 TAILQ_INIT(&bdev->internal.open_descs); 5370 TAILQ_INIT(&bdev->internal.locked_ranges); 5371 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 5372 5373 TAILQ_INIT(&bdev->aliases); 5374 5375 bdev->internal.reset_in_progress = NULL; 5376 5377 bdev_qos_config(bdev); 5378 5379 spdk_io_device_register(__bdev_to_io_dev(bdev), 5380 bdev_channel_create, bdev_channel_destroy, 5381 sizeof(struct spdk_bdev_channel), 5382 bdev_name); 5383 5384 free(bdev_name); 5385 5386 pthread_mutex_init(&bdev->internal.mutex, NULL); 5387 return 0; 5388 } 5389 5390 static void 5391 bdev_destroy_cb(void *io_device) 5392 { 5393 int rc; 5394 struct spdk_bdev *bdev; 5395 spdk_bdev_unregister_cb cb_fn; 5396 void *cb_arg; 5397 5398 bdev = __bdev_from_io_dev(io_device); 5399 cb_fn = bdev->internal.unregister_cb; 5400 cb_arg = bdev->internal.unregister_ctx; 5401 5402 rc = bdev->fn_table->destruct(bdev->ctxt); 5403 if (rc < 0) { 5404 SPDK_ERRLOG("destruct failed\n"); 5405 } 5406 if (rc <= 0 && cb_fn != NULL) { 5407 cb_fn(cb_arg, rc); 5408 } 5409 } 5410 5411 5412 static void 5413 bdev_fini(struct spdk_bdev *bdev) 5414 { 5415 pthread_mutex_destroy(&bdev->internal.mutex); 5416 5417 free(bdev->internal.qos); 5418 5419 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 5420 } 5421 5422 static void 5423 bdev_start(struct spdk_bdev *bdev) 5424 { 5425 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name); 5426 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 5427 5428 /* Examine configuration before initializing I/O */ 5429 bdev_examine(bdev); 5430 } 5431 5432 int 5433 spdk_bdev_register(struct spdk_bdev *bdev) 5434 { 5435 int rc = bdev_init(bdev); 5436 5437 if (rc == 0) { 5438 bdev_start(bdev); 5439 } 5440 5441 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 5442 return rc; 5443 } 5444 5445 int 5446 spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count) 5447 { 5448 SPDK_ERRLOG("This function is deprecated. Use spdk_bdev_register() instead.\n"); 5449 return spdk_bdev_register(vbdev); 5450 } 5451 5452 void 5453 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 5454 { 5455 if (bdev->internal.unregister_cb != NULL) { 5456 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 5457 } 5458 } 5459 5460 static void 5461 _remove_notify(void *arg) 5462 { 5463 struct spdk_bdev_desc *desc = arg; 5464 5465 pthread_mutex_lock(&desc->mutex); 5466 desc->refs--; 5467 5468 if (!desc->closed) { 5469 pthread_mutex_unlock(&desc->mutex); 5470 if (desc->callback.open_with_ext) { 5471 desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); 5472 } else { 5473 desc->callback.remove_fn(desc->callback.ctx); 5474 } 5475 return; 5476 } else if (0 == desc->refs) { 5477 /* This descriptor was closed after this remove_notify message was sent. 5478 * spdk_bdev_close() could not free the descriptor since this message was 5479 * in flight, so we free it now using bdev_desc_free(). 5480 */ 5481 pthread_mutex_unlock(&desc->mutex); 5482 bdev_desc_free(desc); 5483 return; 5484 } 5485 pthread_mutex_unlock(&desc->mutex); 5486 } 5487 5488 /* Must be called while holding bdev->internal.mutex. 5489 * returns: 0 - bdev removed and ready to be destructed. 
5490 * -EBUSY - bdev can't be destructed yet. */ 5491 static int 5492 bdev_unregister_unsafe(struct spdk_bdev *bdev) 5493 { 5494 struct spdk_bdev_desc *desc, *tmp; 5495 int rc = 0; 5496 5497 /* Notify each descriptor about hotremoval */ 5498 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 5499 rc = -EBUSY; 5500 pthread_mutex_lock(&desc->mutex); 5501 /* 5502 * Defer invocation of the event_cb to a separate message that will 5503 * run later on its thread. This ensures this context unwinds and 5504 * we don't recursively unregister this bdev again if the event_cb 5505 * immediately closes its descriptor. 5506 */ 5507 desc->refs++; 5508 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 5509 pthread_mutex_unlock(&desc->mutex); 5510 } 5511 5512 /* If there are no descriptors, proceed removing the bdev */ 5513 if (rc == 0) { 5514 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 5515 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list done\n", bdev->name); 5516 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 5517 } 5518 5519 return rc; 5520 } 5521 5522 void 5523 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 5524 { 5525 struct spdk_thread *thread; 5526 int rc; 5527 5528 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name); 5529 5530 thread = spdk_get_thread(); 5531 if (!thread) { 5532 /* The user called this from a non-SPDK thread. */ 5533 if (cb_fn != NULL) { 5534 cb_fn(cb_arg, -ENOTSUP); 5535 } 5536 return; 5537 } 5538 5539 pthread_mutex_lock(&g_bdev_mgr.mutex); 5540 pthread_mutex_lock(&bdev->internal.mutex); 5541 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 5542 pthread_mutex_unlock(&bdev->internal.mutex); 5543 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5544 if (cb_fn) { 5545 cb_fn(cb_arg, -EBUSY); 5546 } 5547 return; 5548 } 5549 5550 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 5551 bdev->internal.unregister_cb = cb_fn; 5552 bdev->internal.unregister_ctx = cb_arg; 5553 5554 /* Call under lock. 
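 * bdev_unregister_unsafe() must be called with bdev->internal.mutex held; it returns 0
 * only when no descriptors remain open, otherwise it notifies every open descriptor and
 * returns -EBUSY, and the final teardown is finished later from spdk_bdev_close().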
*/ 5555 rc = bdev_unregister_unsafe(bdev); 5556 pthread_mutex_unlock(&bdev->internal.mutex); 5557 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5558 5559 if (rc == 0) { 5560 bdev_fini(bdev); 5561 } 5562 } 5563 5564 static void 5565 bdev_dummy_event_cb(void *remove_ctx) 5566 { 5567 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev remove event received with no remove callback specified"); 5568 } 5569 5570 static int 5571 bdev_start_qos(struct spdk_bdev *bdev) 5572 { 5573 struct set_qos_limit_ctx *ctx; 5574 5575 /* Enable QoS */ 5576 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 5577 ctx = calloc(1, sizeof(*ctx)); 5578 if (ctx == NULL) { 5579 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 5580 return -ENOMEM; 5581 } 5582 ctx->bdev = bdev; 5583 spdk_for_each_channel(__bdev_to_io_dev(bdev), 5584 bdev_enable_qos_msg, ctx, 5585 bdev_enable_qos_done); 5586 } 5587 5588 return 0; 5589 } 5590 5591 static int 5592 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 5593 { 5594 struct spdk_thread *thread; 5595 int rc = 0; 5596 5597 thread = spdk_get_thread(); 5598 if (!thread) { 5599 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 5600 return -ENOTSUP; 5601 } 5602 5603 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 5604 spdk_get_thread()); 5605 5606 desc->bdev = bdev; 5607 desc->thread = thread; 5608 desc->write = write; 5609 5610 pthread_mutex_lock(&bdev->internal.mutex); 5611 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 5612 pthread_mutex_unlock(&bdev->internal.mutex); 5613 return -ENODEV; 5614 } 5615 5616 if (write && bdev->internal.claim_module) { 5617 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 5618 bdev->name, bdev->internal.claim_module->name); 5619 pthread_mutex_unlock(&bdev->internal.mutex); 5620 return -EPERM; 5621 } 5622 5623 rc = bdev_start_qos(bdev); 5624 if (rc != 0) { 5625 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 5626 pthread_mutex_unlock(&bdev->internal.mutex); 5627 return rc; 5628 } 5629 5630 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 5631 5632 pthread_mutex_unlock(&bdev->internal.mutex); 5633 5634 return 0; 5635 } 5636 5637 int 5638 spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb, 5639 void *remove_ctx, struct spdk_bdev_desc **_desc) 5640 { 5641 struct spdk_bdev_desc *desc; 5642 int rc; 5643 5644 desc = calloc(1, sizeof(*desc)); 5645 if (desc == NULL) { 5646 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 5647 return -ENOMEM; 5648 } 5649 5650 if (remove_cb == NULL) { 5651 remove_cb = bdev_dummy_event_cb; 5652 } 5653 5654 TAILQ_INIT(&desc->pending_media_events); 5655 TAILQ_INIT(&desc->free_media_events); 5656 5657 desc->callback.open_with_ext = false; 5658 desc->callback.remove_fn = remove_cb; 5659 desc->callback.ctx = remove_ctx; 5660 pthread_mutex_init(&desc->mutex, NULL); 5661 5662 pthread_mutex_lock(&g_bdev_mgr.mutex); 5663 5664 rc = bdev_open(bdev, write, desc); 5665 if (rc != 0) { 5666 bdev_desc_free(desc); 5667 desc = NULL; 5668 } 5669 5670 *_desc = desc; 5671 5672 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5673 5674 return rc; 5675 } 5676 5677 int 5678 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 5679 void *event_ctx, struct spdk_bdev_desc **_desc) 5680 { 5681 struct spdk_bdev_desc *desc; 5682 struct spdk_bdev *bdev; 5683 unsigned int event_id; 5684 int rc; 5685 5686 if (event_cb == NULL) { 5687 SPDK_ERRLOG("Missing event 
callback function\n"); 5688 return -EINVAL; 5689 } 5690 5691 pthread_mutex_lock(&g_bdev_mgr.mutex); 5692 5693 bdev = spdk_bdev_get_by_name(bdev_name); 5694 5695 if (bdev == NULL) { 5696 SPDK_ERRLOG("Failed to find bdev with name: %s\n", bdev_name); 5697 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5698 return -EINVAL; 5699 } 5700 5701 desc = calloc(1, sizeof(*desc)); 5702 if (desc == NULL) { 5703 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 5704 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5705 return -ENOMEM; 5706 } 5707 5708 TAILQ_INIT(&desc->pending_media_events); 5709 TAILQ_INIT(&desc->free_media_events); 5710 5711 desc->callback.open_with_ext = true; 5712 desc->callback.event_fn = event_cb; 5713 desc->callback.ctx = event_ctx; 5714 pthread_mutex_init(&desc->mutex, NULL); 5715 5716 if (bdev->media_events) { 5717 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 5718 sizeof(*desc->media_events_buffer)); 5719 if (desc->media_events_buffer == NULL) { 5720 SPDK_ERRLOG("Failed to initialize media event pool\n"); 5721 bdev_desc_free(desc); 5722 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5723 return -ENOMEM; 5724 } 5725 5726 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 5727 TAILQ_INSERT_TAIL(&desc->free_media_events, 5728 &desc->media_events_buffer[event_id], tailq); 5729 } 5730 } 5731 5732 rc = bdev_open(bdev, write, desc); 5733 if (rc != 0) { 5734 bdev_desc_free(desc); 5735 desc = NULL; 5736 } 5737 5738 *_desc = desc; 5739 5740 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5741 5742 return rc; 5743 } 5744 5745 void 5746 spdk_bdev_close(struct spdk_bdev_desc *desc) 5747 { 5748 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5749 int rc; 5750 5751 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 5752 spdk_get_thread()); 5753 5754 assert(desc->thread == spdk_get_thread()); 5755 5756 spdk_poller_unregister(&desc->io_timeout_poller); 5757 5758 pthread_mutex_lock(&bdev->internal.mutex); 5759 pthread_mutex_lock(&desc->mutex); 5760 5761 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 5762 5763 desc->closed = true; 5764 5765 if (0 == desc->refs) { 5766 pthread_mutex_unlock(&desc->mutex); 5767 bdev_desc_free(desc); 5768 } else { 5769 pthread_mutex_unlock(&desc->mutex); 5770 } 5771 5772 /* If no more descriptors, kill QoS channel */ 5773 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 5774 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 5775 bdev->name, spdk_get_thread()); 5776 5777 if (bdev_qos_destroy(bdev)) { 5778 /* There isn't anything we can do to recover here. Just let the 5779 * old QoS poller keep running. The QoS handling won't change 5780 * cores when the user allocates a new channel, but it won't break. */ 5781 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 5782 } 5783 } 5784 5785 spdk_bdev_set_qd_sampling_period(bdev, 0); 5786 5787 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 5788 rc = bdev_unregister_unsafe(bdev); 5789 pthread_mutex_unlock(&bdev->internal.mutex); 5790 5791 if (rc == 0) { 5792 bdev_fini(bdev); 5793 } 5794 } else { 5795 pthread_mutex_unlock(&bdev->internal.mutex); 5796 } 5797 } 5798 5799 int 5800 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 5801 struct spdk_bdev_module *module) 5802 { 5803 if (bdev->internal.claim_module != NULL) { 5804 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 5805 bdev->internal.claim_module->name); 5806 return -EPERM; 5807 } 5808 5809 if (desc && !desc->write) { 5810 desc->write = true; 5811 } 5812 5813 bdev->internal.claim_module = module; 5814 return 0; 5815 } 5816 5817 void 5818 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 5819 { 5820 assert(bdev->internal.claim_module != NULL); 5821 bdev->internal.claim_module = NULL; 5822 } 5823 5824 struct spdk_bdev * 5825 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 5826 { 5827 assert(desc != NULL); 5828 return desc->bdev; 5829 } 5830 5831 void 5832 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 5833 { 5834 struct iovec *iovs; 5835 int iovcnt; 5836 5837 if (bdev_io == NULL) { 5838 return; 5839 } 5840 5841 switch (bdev_io->type) { 5842 case SPDK_BDEV_IO_TYPE_READ: 5843 case SPDK_BDEV_IO_TYPE_WRITE: 5844 case SPDK_BDEV_IO_TYPE_ZCOPY: 5845 iovs = bdev_io->u.bdev.iovs; 5846 iovcnt = bdev_io->u.bdev.iovcnt; 5847 break; 5848 default: 5849 iovs = NULL; 5850 iovcnt = 0; 5851 break; 5852 } 5853 5854 if (iovp) { 5855 *iovp = iovs; 5856 } 5857 if (iovcntp) { 5858 *iovcntp = iovcnt; 5859 } 5860 } 5861 5862 void * 5863 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 5864 { 5865 if (bdev_io == NULL) { 5866 return NULL; 5867 } 5868 5869 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 5870 return NULL; 5871 } 5872 5873 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 5874 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 5875 return bdev_io->u.bdev.md_buf; 5876 } 5877 5878 return NULL; 5879 } 5880 5881 void * 5882 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 5883 { 5884 if (bdev_io == NULL) { 5885 assert(false); 5886 return NULL; 5887 } 5888 5889 return bdev_io->internal.caller_ctx; 5890 } 5891 5892 void 5893 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 5894 { 5895 5896 if (spdk_bdev_module_list_find(bdev_module->name)) { 5897 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 5898 assert(false); 5899 } 5900 5901 /* 5902 * Modules with examine callbacks must be initialized first, so they are 5903 * ready to handle examine callbacks from later modules that will 5904 * register physical bdevs. 
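* Putting those modules at the head of g_bdev_mgr.bdev_modules accomplishes this, since module initialization walks the list from head to tail.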
5905 */ 5906 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 5907 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 5908 } else { 5909 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 5910 } 5911 } 5912 5913 struct spdk_bdev_module * 5914 spdk_bdev_module_list_find(const char *name) 5915 { 5916 struct spdk_bdev_module *bdev_module; 5917 5918 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 5919 if (strcmp(name, bdev_module->name) == 0) { 5920 break; 5921 } 5922 } 5923 5924 return bdev_module; 5925 } 5926 5927 static void 5928 bdev_write_zero_buffer_next(void *_bdev_io) 5929 { 5930 struct spdk_bdev_io *bdev_io = _bdev_io; 5931 uint64_t num_bytes, num_blocks; 5932 void *md_buf = NULL; 5933 int rc; 5934 5935 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 5936 bdev_io->u.bdev.split_remaining_num_blocks, 5937 ZERO_BUFFER_SIZE); 5938 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 5939 5940 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 5941 md_buf = (char *)g_bdev_mgr.zero_buffer + 5942 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 5943 } 5944 5945 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 5946 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5947 g_bdev_mgr.zero_buffer, md_buf, 5948 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 5949 bdev_write_zero_buffer_done, bdev_io); 5950 if (rc == 0) { 5951 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 5952 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 5953 } else if (rc == -ENOMEM) { 5954 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 5955 } else { 5956 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5957 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5958 } 5959 } 5960 5961 static void 5962 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5963 { 5964 struct spdk_bdev_io *parent_io = cb_arg; 5965 5966 spdk_bdev_free_io(bdev_io); 5967 5968 if (!success) { 5969 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5970 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5971 return; 5972 } 5973 5974 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 5975 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5976 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5977 return; 5978 } 5979 5980 bdev_write_zero_buffer_next(parent_io); 5981 } 5982 5983 static void 5984 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 5985 { 5986 pthread_mutex_lock(&ctx->bdev->internal.mutex); 5987 ctx->bdev->internal.qos_mod_in_progress = false; 5988 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 5989 5990 if (ctx->cb_fn) { 5991 ctx->cb_fn(ctx->cb_arg, status); 5992 } 5993 free(ctx); 5994 } 5995 5996 static void 5997 bdev_disable_qos_done(void *cb_arg) 5998 { 5999 struct set_qos_limit_ctx *ctx = cb_arg; 6000 struct spdk_bdev *bdev = ctx->bdev; 6001 struct spdk_bdev_io *bdev_io; 6002 struct spdk_bdev_qos *qos; 6003 6004 pthread_mutex_lock(&bdev->internal.mutex); 6005 qos = bdev->internal.qos; 6006 bdev->internal.qos = NULL; 6007 pthread_mutex_unlock(&bdev->internal.mutex); 6008 6009 while (!TAILQ_EMPTY(&qos->queued)) { 6010 /* Send queued I/O back to their original thread for resubmission. 
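Each bdev_io recorded its submitting channel in internal.io_submit_ch when it was funneled to the QoS thread; that channel is restored below so the I/O is resubmitted on the thread it originally came from.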
*/ 6011 bdev_io = TAILQ_FIRST(&qos->queued); 6012 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 6013 6014 if (bdev_io->internal.io_submit_ch) { 6015 /* 6016 * Channel was changed when sending it to the QoS thread - change it back 6017 * before sending it back to the original thread. 6018 */ 6019 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 6020 bdev_io->internal.io_submit_ch = NULL; 6021 } 6022 6023 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6024 _bdev_io_submit, bdev_io); 6025 } 6026 6027 if (qos->thread != NULL) { 6028 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 6029 spdk_poller_unregister(&qos->poller); 6030 } 6031 6032 free(qos); 6033 6034 bdev_set_qos_limit_done(ctx, 0); 6035 } 6036 6037 static void 6038 bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) 6039 { 6040 void *io_device = spdk_io_channel_iter_get_io_device(i); 6041 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 6042 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6043 struct spdk_thread *thread; 6044 6045 pthread_mutex_lock(&bdev->internal.mutex); 6046 thread = bdev->internal.qos->thread; 6047 pthread_mutex_unlock(&bdev->internal.mutex); 6048 6049 if (thread != NULL) { 6050 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 6051 } else { 6052 bdev_disable_qos_done(ctx); 6053 } 6054 } 6055 6056 static void 6057 bdev_disable_qos_msg(struct spdk_io_channel_iter *i) 6058 { 6059 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 6060 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 6061 6062 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 6063 6064 spdk_for_each_channel_continue(i, 0); 6065 } 6066 6067 static void 6068 bdev_update_qos_rate_limit_msg(void *cb_arg) 6069 { 6070 struct set_qos_limit_ctx *ctx = cb_arg; 6071 struct spdk_bdev *bdev = ctx->bdev; 6072 6073 pthread_mutex_lock(&bdev->internal.mutex); 6074 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 6075 pthread_mutex_unlock(&bdev->internal.mutex); 6076 6077 bdev_set_qos_limit_done(ctx, 0); 6078 } 6079 6080 static void 6081 bdev_enable_qos_msg(struct spdk_io_channel_iter *i) 6082 { 6083 void *io_device = spdk_io_channel_iter_get_io_device(i); 6084 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 6085 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 6086 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 6087 6088 pthread_mutex_lock(&bdev->internal.mutex); 6089 bdev_enable_qos(bdev, bdev_ch); 6090 pthread_mutex_unlock(&bdev->internal.mutex); 6091 spdk_for_each_channel_continue(i, 0); 6092 } 6093 6094 static void 6095 bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) 6096 { 6097 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6098 6099 bdev_set_qos_limit_done(ctx, status); 6100 } 6101 6102 static void 6103 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 6104 { 6105 int i; 6106 6107 assert(bdev->internal.qos != NULL); 6108 6109 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6110 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 6111 bdev->internal.qos->rate_limits[i].limit = limits[i]; 6112 6113 if (limits[i] == 0) { 6114 bdev->internal.qos->rate_limits[i].limit = 6115 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 6116 } 6117 } 6118 } 6119 } 6120 6121 void 6122 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 6123 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 6124 { 6125 struct set_qos_limit_ctx *ctx; 6126 uint32_t 
limit_set_complement; 6127 uint64_t min_limit_per_sec; 6128 int i; 6129 bool disable_rate_limit = true; 6130 6131 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6132 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 6133 continue; 6134 } 6135 6136 if (limits[i] > 0) { 6137 disable_rate_limit = false; 6138 } 6139 6140 if (bdev_qos_is_iops_rate_limit(i) == true) { 6141 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 6142 } else { 6143 /* Change from megabyte to byte rate limit */ 6144 limits[i] = limits[i] * 1024 * 1024; 6145 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 6146 } 6147 6148 limit_set_complement = limits[i] % min_limit_per_sec; 6149 if (limit_set_complement) { 6150 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 6151 limits[i], min_limit_per_sec); 6152 limits[i] += min_limit_per_sec - limit_set_complement; 6153 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 6154 } 6155 } 6156 6157 ctx = calloc(1, sizeof(*ctx)); 6158 if (ctx == NULL) { 6159 cb_fn(cb_arg, -ENOMEM); 6160 return; 6161 } 6162 6163 ctx->cb_fn = cb_fn; 6164 ctx->cb_arg = cb_arg; 6165 ctx->bdev = bdev; 6166 6167 pthread_mutex_lock(&bdev->internal.mutex); 6168 if (bdev->internal.qos_mod_in_progress) { 6169 pthread_mutex_unlock(&bdev->internal.mutex); 6170 free(ctx); 6171 cb_fn(cb_arg, -EAGAIN); 6172 return; 6173 } 6174 bdev->internal.qos_mod_in_progress = true; 6175 6176 if (disable_rate_limit == true && bdev->internal.qos) { 6177 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6178 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 6179 (bdev->internal.qos->rate_limits[i].limit > 0 && 6180 bdev->internal.qos->rate_limits[i].limit != 6181 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 6182 disable_rate_limit = false; 6183 break; 6184 } 6185 } 6186 } 6187 6188 if (disable_rate_limit == false) { 6189 if (bdev->internal.qos == NULL) { 6190 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 6191 if (!bdev->internal.qos) { 6192 pthread_mutex_unlock(&bdev->internal.mutex); 6193 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 6194 bdev_set_qos_limit_done(ctx, -ENOMEM); 6195 return; 6196 } 6197 } 6198 6199 if (bdev->internal.qos->thread == NULL) { 6200 /* Enabling */ 6201 bdev_set_qos_rate_limits(bdev, limits); 6202 6203 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6204 bdev_enable_qos_msg, ctx, 6205 bdev_enable_qos_done); 6206 } else { 6207 /* Updating */ 6208 bdev_set_qos_rate_limits(bdev, limits); 6209 6210 spdk_thread_send_msg(bdev->internal.qos->thread, 6211 bdev_update_qos_rate_limit_msg, ctx); 6212 } 6213 } else { 6214 if (bdev->internal.qos != NULL) { 6215 bdev_set_qos_rate_limits(bdev, limits); 6216 6217 /* Disabling */ 6218 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6219 bdev_disable_qos_msg, ctx, 6220 bdev_disable_qos_msg_done); 6221 } else { 6222 pthread_mutex_unlock(&bdev->internal.mutex); 6223 bdev_set_qos_limit_done(ctx, 0); 6224 return; 6225 } 6226 } 6227 6228 pthread_mutex_unlock(&bdev->internal.mutex); 6229 } 6230 6231 struct spdk_bdev_histogram_ctx { 6232 spdk_bdev_histogram_status_cb cb_fn; 6233 void *cb_arg; 6234 struct spdk_bdev *bdev; 6235 int status; 6236 }; 6237 6238 static void 6239 bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status) 6240 { 6241 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6242 6243 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6244 ctx->bdev->internal.histogram_in_progress = false; 6245 
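/* Drop the mutex before invoking the user callback; the callback is free to start another histogram operation, which takes this same mutex. */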
pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6246 ctx->cb_fn(ctx->cb_arg, ctx->status); 6247 free(ctx); 6248 } 6249 6250 static void 6251 bdev_histogram_disable_channel(struct spdk_io_channel_iter *i) 6252 { 6253 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6254 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6255 6256 if (ch->histogram != NULL) { 6257 spdk_histogram_data_free(ch->histogram); 6258 ch->histogram = NULL; 6259 } 6260 spdk_for_each_channel_continue(i, 0); 6261 } 6262 6263 static void 6264 bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status) 6265 { 6266 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6267 6268 if (status != 0) { 6269 ctx->status = status; 6270 ctx->bdev->internal.histogram_enabled = false; 6271 spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), bdev_histogram_disable_channel, ctx, 6272 bdev_histogram_disable_channel_cb); 6273 } else { 6274 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6275 ctx->bdev->internal.histogram_in_progress = false; 6276 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6277 ctx->cb_fn(ctx->cb_arg, ctx->status); 6278 free(ctx); 6279 } 6280 } 6281 6282 static void 6283 bdev_histogram_enable_channel(struct spdk_io_channel_iter *i) 6284 { 6285 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6286 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6287 int status = 0; 6288 6289 if (ch->histogram == NULL) { 6290 ch->histogram = spdk_histogram_data_alloc(); 6291 if (ch->histogram == NULL) { 6292 status = -ENOMEM; 6293 } 6294 } 6295 6296 spdk_for_each_channel_continue(i, status); 6297 } 6298 6299 void 6300 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 6301 void *cb_arg, bool enable) 6302 { 6303 struct spdk_bdev_histogram_ctx *ctx; 6304 6305 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 6306 if (ctx == NULL) { 6307 cb_fn(cb_arg, -ENOMEM); 6308 return; 6309 } 6310 6311 ctx->bdev = bdev; 6312 ctx->status = 0; 6313 ctx->cb_fn = cb_fn; 6314 ctx->cb_arg = cb_arg; 6315 6316 pthread_mutex_lock(&bdev->internal.mutex); 6317 if (bdev->internal.histogram_in_progress) { 6318 pthread_mutex_unlock(&bdev->internal.mutex); 6319 free(ctx); 6320 cb_fn(cb_arg, -EAGAIN); 6321 return; 6322 } 6323 6324 bdev->internal.histogram_in_progress = true; 6325 pthread_mutex_unlock(&bdev->internal.mutex); 6326 6327 bdev->internal.histogram_enabled = enable; 6328 6329 if (enable) { 6330 /* Allocate histogram for each channel */ 6331 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_enable_channel, ctx, 6332 bdev_histogram_enable_channel_cb); 6333 } else { 6334 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_disable_channel, ctx, 6335 bdev_histogram_disable_channel_cb); 6336 } 6337 } 6338 6339 struct spdk_bdev_histogram_data_ctx { 6340 spdk_bdev_histogram_data_cb cb_fn; 6341 void *cb_arg; 6342 struct spdk_bdev *bdev; 6343 /** merged histogram data from all channels */ 6344 struct spdk_histogram_data *histogram; 6345 }; 6346 6347 static void 6348 bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status) 6349 { 6350 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6351 6352 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 6353 free(ctx); 6354 } 6355 6356 static void 6357 bdev_histogram_get_channel(struct spdk_io_channel_iter *i) 6358 { 6359 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6360 struct spdk_bdev_channel *ch = 
spdk_io_channel_get_ctx(_ch); 6361 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6362 int status = 0; 6363 6364 if (ch->histogram == NULL) { 6365 status = -EFAULT; 6366 } else { 6367 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 6368 } 6369 6370 spdk_for_each_channel_continue(i, status); 6371 } 6372 6373 void 6374 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 6375 spdk_bdev_histogram_data_cb cb_fn, 6376 void *cb_arg) 6377 { 6378 struct spdk_bdev_histogram_data_ctx *ctx; 6379 6380 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 6381 if (ctx == NULL) { 6382 cb_fn(cb_arg, -ENOMEM, NULL); 6383 return; 6384 } 6385 6386 ctx->bdev = bdev; 6387 ctx->cb_fn = cb_fn; 6388 ctx->cb_arg = cb_arg; 6389 6390 ctx->histogram = histogram; 6391 6392 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_get_channel, ctx, 6393 bdev_histogram_get_channel_cb); 6394 } 6395 6396 size_t 6397 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 6398 size_t max_events) 6399 { 6400 struct media_event_entry *entry; 6401 size_t num_events = 0; 6402 6403 for (; num_events < max_events; ++num_events) { 6404 entry = TAILQ_FIRST(&desc->pending_media_events); 6405 if (entry == NULL) { 6406 break; 6407 } 6408 6409 events[num_events] = entry->event; 6410 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 6411 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 6412 } 6413 6414 return num_events; 6415 } 6416 6417 int 6418 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 6419 size_t num_events) 6420 { 6421 struct spdk_bdev_desc *desc; 6422 struct media_event_entry *entry; 6423 size_t event_id; 6424 int rc = 0; 6425 6426 assert(bdev->media_events); 6427 6428 pthread_mutex_lock(&bdev->internal.mutex); 6429 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 6430 if (desc->write) { 6431 break; 6432 } 6433 } 6434 6435 if (desc == NULL || desc->media_events_buffer == NULL) { 6436 rc = -ENODEV; 6437 goto out; 6438 } 6439 6440 for (event_id = 0; event_id < num_events; ++event_id) { 6441 entry = TAILQ_FIRST(&desc->free_media_events); 6442 if (entry == NULL) { 6443 break; 6444 } 6445 6446 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 6447 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 6448 entry->event = events[event_id]; 6449 } 6450 6451 rc = event_id; 6452 out: 6453 pthread_mutex_unlock(&bdev->internal.mutex); 6454 return rc; 6455 } 6456 6457 void 6458 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 6459 { 6460 struct spdk_bdev_desc *desc; 6461 6462 pthread_mutex_lock(&bdev->internal.mutex); 6463 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 6464 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 6465 desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, 6466 desc->callback.ctx); 6467 } 6468 } 6469 pthread_mutex_unlock(&bdev->internal.mutex); 6470 } 6471 6472 struct locked_lba_range_ctx { 6473 struct lba_range range; 6474 struct spdk_bdev *bdev; 6475 struct lba_range *current_range; 6476 struct lba_range *owner_range; 6477 struct spdk_poller *poller; 6478 lock_range_cb cb_fn; 6479 void *cb_arg; 6480 }; 6481 6482 static void 6483 bdev_lock_error_cleanup_cb(struct spdk_io_channel_iter *i, int status) 6484 { 6485 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6486 6487 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 6488 free(ctx); 6489 } 6490 6491 static void 6492 
bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i); 6493 6494 static void 6495 bdev_lock_lba_range_cb(struct spdk_io_channel_iter *i, int status) 6496 { 6497 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6498 struct spdk_bdev *bdev = ctx->bdev; 6499 6500 if (status == -ENOMEM) { 6501 /* One of the channels could not allocate a range object. 6502 * So we have to go back and clean up any ranges that were 6503 * allocated successfully before we return error status to 6504 * the caller. We can reuse the unlock function to do that 6505 * clean up. 6506 */ 6507 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6508 bdev_unlock_lba_range_get_channel, ctx, 6509 bdev_lock_error_cleanup_cb); 6510 return; 6511 } 6512 6513 /* All channels have locked this range and no I/O overlapping the range 6514 * are outstanding! Set the owner_ch for the range object for the 6515 * locking channel, so that this channel will know that it is allowed 6516 * to write to this range. 6517 */ 6518 ctx->owner_range->owner_ch = ctx->range.owner_ch; 6519 ctx->cb_fn(ctx->cb_arg, status); 6520 6521 /* Don't free the ctx here. Its range is in the bdev's global list of 6522 * locked ranges still, and will be removed and freed when this range 6523 * is later unlocked. 6524 */ 6525 } 6526 6527 static int 6528 bdev_lock_lba_range_check_io(void *_i) 6529 { 6530 struct spdk_io_channel_iter *i = _i; 6531 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6532 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6533 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6534 struct lba_range *range = ctx->current_range; 6535 struct spdk_bdev_io *bdev_io; 6536 6537 spdk_poller_unregister(&ctx->poller); 6538 6539 /* The range is now in the locked_ranges, so no new IO can be submitted to this 6540 * range. But we need to wait until any outstanding IO overlapping with this range 6541 * are completed. 6542 */ 6543 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 6544 if (bdev_io_range_is_locked(bdev_io, range)) { 6545 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 6546 return SPDK_POLLER_BUSY; 6547 } 6548 } 6549 6550 spdk_for_each_channel_continue(i, 0); 6551 return SPDK_POLLER_BUSY; 6552 } 6553 6554 static void 6555 bdev_lock_lba_range_get_channel(struct spdk_io_channel_iter *i) 6556 { 6557 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6558 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6559 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6560 struct lba_range *range; 6561 6562 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 6563 if (range->length == ctx->range.length && 6564 range->offset == ctx->range.offset && 6565 range->locked_ctx == ctx->range.locked_ctx) { 6566 /* This range already exists on this channel, so don't add 6567 * it again. This can happen when a new channel is created 6568 * while the for_each_channel operation is in progress. 6569 * Do not check for outstanding I/O in that case, since the 6570 * range was locked before any I/O could be submitted to the 6571 * new channel. 
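* (bdev_channel_create() copies bdev->internal.locked_ranges into each new channel, which is how the duplicate arises.)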
6572 */ 6573 spdk_for_each_channel_continue(i, 0); 6574 return; 6575 } 6576 } 6577 6578 range = calloc(1, sizeof(*range)); 6579 if (range == NULL) { 6580 spdk_for_each_channel_continue(i, -ENOMEM); 6581 return; 6582 } 6583 6584 range->length = ctx->range.length; 6585 range->offset = ctx->range.offset; 6586 range->locked_ctx = ctx->range.locked_ctx; 6587 ctx->current_range = range; 6588 if (ctx->range.owner_ch == ch) { 6589 /* This is the range object for the channel that will hold 6590 * the lock. Store it in the ctx object so that we can easily 6591 * set its owner_ch after the lock is finally acquired. 6592 */ 6593 ctx->owner_range = range; 6594 } 6595 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 6596 bdev_lock_lba_range_check_io(i); 6597 } 6598 6599 static void 6600 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 6601 { 6602 assert(spdk_get_thread() == ctx->range.owner_ch->channel->thread); 6603 6604 /* We will add a copy of this range to each channel now. */ 6605 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_lock_lba_range_get_channel, ctx, 6606 bdev_lock_lba_range_cb); 6607 } 6608 6609 static bool 6610 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 6611 { 6612 struct lba_range *r; 6613 6614 TAILQ_FOREACH(r, tailq, tailq) { 6615 if (bdev_lba_range_overlapped(range, r)) { 6616 return true; 6617 } 6618 } 6619 return false; 6620 } 6621 6622 static int 6623 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 6624 uint64_t offset, uint64_t length, 6625 lock_range_cb cb_fn, void *cb_arg) 6626 { 6627 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6628 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6629 struct locked_lba_range_ctx *ctx; 6630 6631 if (cb_arg == NULL) { 6632 SPDK_ERRLOG("cb_arg must not be NULL\n"); 6633 return -EINVAL; 6634 } 6635 6636 ctx = calloc(1, sizeof(*ctx)); 6637 if (ctx == NULL) { 6638 return -ENOMEM; 6639 } 6640 6641 ctx->range.offset = offset; 6642 ctx->range.length = length; 6643 ctx->range.owner_ch = ch; 6644 ctx->range.locked_ctx = cb_arg; 6645 ctx->bdev = bdev; 6646 ctx->cb_fn = cb_fn; 6647 ctx->cb_arg = cb_arg; 6648 6649 pthread_mutex_lock(&bdev->internal.mutex); 6650 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 6651 /* There is an active lock overlapping with this range. 6652 * Put it on the pending list until this range no 6653 * longer overlaps with another. 6654 */ 6655 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 6656 } else { 6657 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 6658 bdev_lock_lba_range_ctx(bdev, ctx); 6659 } 6660 pthread_mutex_unlock(&bdev->internal.mutex); 6661 return 0; 6662 } 6663 6664 static void 6665 bdev_lock_lba_range_ctx_msg(void *_ctx) 6666 { 6667 struct locked_lba_range_ctx *ctx = _ctx; 6668 6669 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 6670 } 6671 6672 static void 6673 bdev_unlock_lba_range_cb(struct spdk_io_channel_iter *i, int status) 6674 { 6675 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6676 struct locked_lba_range_ctx *pending_ctx; 6677 struct spdk_bdev_channel *ch = ctx->range.owner_ch; 6678 struct spdk_bdev *bdev = ch->bdev; 6679 struct lba_range *range, *tmp; 6680 6681 pthread_mutex_lock(&bdev->internal.mutex); 6682 /* Check if there are any pending locked ranges that overlap with this range 6683 * that was just unlocked. 
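* (Pending requests were parked in bdev_lock_lba_range() because they overlapped a range that was locked at the time.)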
If there are, check that it doesn't overlap with any 6684 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 6685 * the lock process. 6686 */ 6687 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 6688 if (bdev_lba_range_overlapped(range, &ctx->range) && 6689 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 6690 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 6691 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 6692 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 6693 spdk_thread_send_msg(pending_ctx->range.owner_ch->channel->thread, 6694 bdev_lock_lba_range_ctx_msg, pending_ctx); 6695 } 6696 } 6697 pthread_mutex_unlock(&bdev->internal.mutex); 6698 6699 ctx->cb_fn(ctx->cb_arg, status); 6700 free(ctx); 6701 } 6702 6703 static void 6704 bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i) 6705 { 6706 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6707 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6708 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6709 TAILQ_HEAD(, spdk_bdev_io) io_locked; 6710 struct spdk_bdev_io *bdev_io; 6711 struct lba_range *range; 6712 6713 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 6714 if (ctx->range.offset == range->offset && 6715 ctx->range.length == range->length && 6716 ctx->range.locked_ctx == range->locked_ctx) { 6717 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 6718 free(range); 6719 break; 6720 } 6721 } 6722 6723 /* Note: we should almost always be able to assert that the range specified 6724 * was found. But there are some very rare corner cases where a new channel 6725 * gets created simultaneously with a range unlock, where this function 6726 * would execute on that new channel and wouldn't have the range. 6727 * We also use this to clean up range allocations when a later allocation 6728 * fails in the locking path. 6729 * So we can't actually assert() here. 6730 */ 6731 6732 /* Swap the locked IO into a temporary list, and then try to submit them again. 6733 * We could hyper-optimize this to only resubmit locked I/O that overlap 6734 * with the range that was just unlocked, but this isn't a performance path so 6735 * we go for simplicity here. 6736 */ 6737 TAILQ_INIT(&io_locked); 6738 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 6739 while (!TAILQ_EMPTY(&io_locked)) { 6740 bdev_io = TAILQ_FIRST(&io_locked); 6741 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 6742 bdev_io_submit(bdev_io); 6743 } 6744 6745 spdk_for_each_channel_continue(i, 0); 6746 } 6747 6748 static int 6749 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 6750 uint64_t offset, uint64_t length, 6751 lock_range_cb cb_fn, void *cb_arg) 6752 { 6753 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6754 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6755 struct locked_lba_range_ctx *ctx; 6756 struct lba_range *range; 6757 bool range_found = false; 6758 6759 /* Let's make sure the specified channel actually has a lock on 6760 * the specified range. Note that the range must match exactly. 
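* The per-channel locked_ranges list can be scanned without the bdev mutex because it is only modified from this channel's thread; the bdev-level list is checked under the mutex below.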
6761 */ 6762 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 6763 if (range->offset == offset && range->length == length && 6764 range->owner_ch == ch && range->locked_ctx == cb_arg) { 6765 range_found = true; 6766 break; 6767 } 6768 } 6769 6770 if (!range_found) { 6771 return -EINVAL; 6772 } 6773 6774 pthread_mutex_lock(&bdev->internal.mutex); 6775 /* We confirmed that this channel has locked the specified range. To 6776 * start the unlock the process, we find the range in the bdev's locked_ranges 6777 * and remove it. This ensures new channels don't inherit the locked range. 6778 * Then we will send a message to each channel (including the one specified 6779 * here) to remove the range from its per-channel list. 6780 */ 6781 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 6782 if (range->offset == offset && range->length == length && 6783 range->locked_ctx == cb_arg) { 6784 break; 6785 } 6786 } 6787 if (range == NULL) { 6788 assert(false); 6789 pthread_mutex_unlock(&bdev->internal.mutex); 6790 return -EINVAL; 6791 } 6792 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 6793 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 6794 pthread_mutex_unlock(&bdev->internal.mutex); 6795 6796 ctx->cb_fn = cb_fn; 6797 ctx->cb_arg = cb_arg; 6798 6799 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unlock_lba_range_get_channel, ctx, 6800 bdev_unlock_lba_range_cb); 6801 return 0; 6802 } 6803 6804 SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV) 6805 6806 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 6807 { 6808 spdk_trace_register_owner(OWNER_BDEV, 'b'); 6809 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 6810 spdk_trace_register_description("BDEV_IO_START", TRACE_BDEV_IO_START, OWNER_BDEV, 6811 OBJECT_BDEV_IO, 1, 0, "type: "); 6812 spdk_trace_register_description("BDEV_IO_DONE", TRACE_BDEV_IO_DONE, OWNER_BDEV, 6813 OBJECT_BDEV_IO, 0, 0, ""); 6814 } 6815
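/*
 * Illustrative usage sketch -- not part of upstream bdev.c. It shows how a
 * caller might exercise the descriptor and QoS APIs implemented above:
 * opening a bdev with spdk_bdev_open_ext(), issuing a single read, closing
 * the descriptor, and applying rate limits with spdk_bdev_set_qos_rate_limits().
 * The bdev name "Malloc0" and all example_* identifiers are hypothetical;
 * everything else uses only public SPDK APIs. A standalone application would
 * include "spdk/bdev.h", "spdk/thread.h" and "spdk/log.h", run this on an
 * SPDK thread, and allocate the data buffer from spdk_dma_malloc(). Error
 * handling is intentionally minimal.
 */

struct example_bdev_ctx {
	struct spdk_bdev_desc	*desc;
	struct spdk_io_channel	*ch;
};

/* Event callback required by spdk_bdev_open_ext(). SPDK_BDEV_EVENT_REMOVE is
 * delivered on the opening thread when the bdev is being unregistered; a real
 * application would quiesce I/O, release its channels and close the descriptor. */
static void
example_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
{
	SPDK_NOTICELOG("received event %d for bdev %s\n", type, spdk_bdev_get_name(bdev));
}

static void
example_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct example_bdev_ctx *ctx = cb_arg;

	SPDK_NOTICELOG("read %s\n", success ? "succeeded" : "failed");
	spdk_bdev_free_io(bdev_io);

	/* Done with the bdev: release the channel first, then the descriptor. */
	spdk_put_io_channel(ctx->ch);
	spdk_bdev_close(ctx->desc);
}

/* Open "Malloc0" read-only, take an I/O channel on the current thread and read
 * 4 KiB from offset 0. buf must be DMA-safe memory of at least 4 KiB and the
 * offset/length must respect the bdev's block size. */
static int
example_open_and_read(struct example_bdev_ctx *ctx, void *buf)
{
	int rc;

	rc = spdk_bdev_open_ext("Malloc0", false, example_bdev_event_cb, ctx, &ctx->desc);
	if (rc != 0) {
		return rc;
	}

	ctx->ch = spdk_bdev_get_io_channel(ctx->desc);
	if (ctx->ch == NULL) {
		spdk_bdev_close(ctx->desc);
		return -ENOMEM;
	}

	rc = spdk_bdev_read(ctx->desc, ctx->ch, buf, 0, 4096, example_read_done, ctx);
	if (rc != 0) {
		spdk_put_io_channel(ctx->ch);
		spdk_bdev_close(ctx->desc);
	}

	return rc;
}

static void
example_qos_done(void *cb_arg, int status)
{
	SPDK_NOTICELOG("QoS update completed with status %d\n", status);
}

/* Cap a bdev at 10000 read/write IOPS and 100 MB/s aggregate bandwidth while
 * leaving the read-only and write-only byte limits unchanged. As in
 * spdk_bdev_set_qos_rate_limits(), byte limits are expressed in MB/s, a value
 * of 0 disables that particular limit, and SPDK_BDEV_QOS_LIMIT_NOT_DEFINED
 * (UINT64_MAX) leaves it untouched. Must be called from an SPDK thread. */
static void
example_set_qos(struct spdk_bdev *bdev)
{
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {
		[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT]	= 10000,
		[SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT]	= 100,
		[SPDK_BDEV_QOS_R_BPS_RATE_LIMIT]	= SPDK_BDEV_QOS_LIMIT_NOT_DEFINED,
		[SPDK_BDEV_QOS_W_BPS_RATE_LIMIT]	= SPDK_BDEV_QOS_LIMIT_NOT_DEFINED,
	};

	spdk_bdev_set_qos_rate_limits(bdev, limits, example_qos_done, NULL);
}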