/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"
#include "spdk/conf.h"

#include "spdk/env.h"
#include "spdk/event.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/util.h"

#include "spdk/bdev_module.h"
#include "spdk_internal/log.h"
#include "spdk/string.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define BUF_SMALL_POOL_SIZE			8192
#define BUF_LARGE_POOL_SIZE			1024
#define NOMEM_THRESHOLD_COUNT			8
#define ZERO_BUFFER_SIZE			0x100000
#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_SEC_TO_USEC			1000000ULL
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		10000
#define SPDK_BDEV_QOS_MIN_BW_IN_MB_PER_SEC	10

enum spdk_bdev_qos_type {
	SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT = 0,
	SPDK_BDEV_QOS_RW_BYTEPS_RATE_LIMIT,
	SPDK_BDEV_QOS_NUM_TYPES /* Keep last */
};

static const char *qos_type_str[SPDK_BDEV_QOS_NUM_TYPES] = {"Limit_IOPS", "Limit_BWPS"};

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	struct spdk_mempool *buf_small_pool;
	struct spdk_mempool *buf_large_pool;

	void *zero_buffer;

	TAILQ_HEAD(, spdk_bdev_module) bdev_modules;

	TAILQ_HEAD(, spdk_bdev) bdevs;

	bool init_complete;
	bool module_init_complete;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
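	/* Both init flags below start out false; spdk_bdev_modules_init() and
	 * spdk_bdev_init_complete() set them as initialization progresses.
	 */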
	.init_complete = false,
	.module_init_complete = false,
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos {
	/** Rate limit, in I/O per second */
	uint64_t iops_rate_limit;

	/** Rate limit, in bytes per second */
	uint64_t byte_rate_limit;

	/** The channel that all I/O are funneled through */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Maximum allowed IOs to be issued in one timeslice (e.g., 1ms) and
	 *  only valid for the master channel which manages the outstanding IOs. */
	uint64_t max_ios_per_timeslice;

	/** Maximum allowed bytes to be issued in one timeslice (e.g., 1ms) and
	 *  only valid for the master channel which manages the outstanding IOs. */
	uint64_t max_byte_per_timeslice;

	/** IOs submitted in one timeslice (e.g., 1ms) */
	uint64_t io_submitted_this_timeslice;

	/** Bytes submitted in one timeslice (e.g., 1ms) */
	uint64_t byte_submitted_this_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	bdev_io_stailq_t need_buf_small;
	bdev_io_stailq_t need_buf_large;

	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache. Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;
	uint32_t bdev_io_cache_size;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * will queue here their IO that awaits retry. It makes it possible to retry sending
 * IO to one bdev after IO from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
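	 * (Initialized to zero when the shared resource is created in
	 * spdk_bdev_channel_create().)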
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat stat;

	/*
	 * Count of I/O submitted through this channel and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	bdev_io_tailq_t queued_resets;

	uint32_t flags;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t		start_tsc;
	uint64_t		interval_tsc;
	__itt_string_handle	*handle;
	struct spdk_bdev_io_stat prev_stat;
#endif

};

struct spdk_bdev_desc {
	struct spdk_bdev	*bdev;
	spdk_bdev_remove_cb_t	remove_cb;
	void			*remove_ctx;
	bool			remove_scheduled;
	bool			write;
	TAILQ_ENTRY(spdk_bdev_desc) link;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

/*
 * A bdev's io_device handle is the spdk_bdev pointer offset by one byte, which keeps
 * the handle passed to spdk_get_io_channel() distinct from the spdk_bdev pointer itself.
 */
#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))

static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts)
{
	*opts = g_bdev_opts;
}

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
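	 * For example, with the default bdev_io_cache_size of 256 and 4 threads,
	 * bdev_io_pool_size must be at least 256 * (4 + 1) = 1280.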
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

	g_bdev_opts = *opts;
	return 0;
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_module == NULL) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_alias *tmp;
	struct spdk_bdev *bdev = spdk_bdev_first();

	while (bdev != NULL) {
		if (strcmp(bdev_name, bdev->name) == 0) {
			return bdev;
		}

		TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
			if (strcmp(bdev_name, tmp->alias) == 0) {
				return bdev;
			}
		}

		bdev = spdk_bdev_next(bdev);
	}

	return NULL;
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

static void
spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_mempool *pool;
	struct spdk_bdev_io *tmp;
	void *buf, *aligned_buf;
	bdev_io_stailq_t *stailq;
	struct spdk_bdev_mgmt_channel *ch;

	assert(bdev_io->u.bdev.iovcnt == 1);

	buf = bdev_io->internal.buf;
	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	bdev_io->internal.buf = NULL;

	if (bdev_io->internal.buf_len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &ch->need_buf_large;
	}

	if (STAILQ_EMPTY(stailq)) {
		spdk_mempool_put(pool, buf);
	} else {
		/* Hand the freed buffer directly to the first I/O waiting for one. */
		tmp = STAILQ_FIRST(stailq);

		aligned_buf = (void *)(((uintptr_t)buf + 511) & ~511UL);
		spdk_bdev_io_set_buf(tmp,
				     aligned_buf, tmp->internal.buf_len);

		STAILQ_REMOVE_HEAD(stailq, internal.buf_link);
		tmp->internal.buf = buf;
		tmp->internal.get_buf_cb(tmp->internal.ch->channel, tmp);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_mempool *pool;
	bdev_io_stailq_t *stailq;
	void *buf, *aligned_buf;
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	assert(cb != NULL);
	assert(bdev_io->u.bdev.iovs != NULL);

	if (spdk_unlikely(bdev_io->u.bdev.iovs[0].iov_base != NULL)) {
		/* Buffer already present */
		cb(bdev_io->internal.ch->channel, bdev_io);
		return;
	}

	assert(len <= SPDK_BDEV_LARGE_BUF_MAX_SIZE);
	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	bdev_io->internal.buf_len = len;
	bdev_io->internal.get_buf_cb = cb;
	if (len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &mgmt_ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &mgmt_ch->need_buf_large;
	}

	buf = spdk_mempool_get(pool);

	if (!buf) {
		STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link);
	} else {
		aligned_buf = (void *)(((uintptr_t)buf + 511) & ~511UL);
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);

		bdev_io->internal.buf = buf;
		bdev_io->internal.get_buf_cb(bdev_io->internal.ch->channel, bdev_io);
	}
}

static int
spdk_bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

void
spdk_bdev_config_text(FILE *fp)
{
	struct spdk_bdev_module *bdev_module;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->config_text) {
			bdev_module->config_text(fp);
		}
	}
}

void
spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_module *bdev_module;
	struct spdk_bdev *bdev;

	assert(w != NULL);

	spdk_json_write_array_begin(w);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "set_bdev_options");
	spdk_json_write_name(w, "params");
	spdk_json_write_object_begin(w);
	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
	spdk_json_write_object_end(w);
	spdk_json_write_object_end(w);

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->config_json) {
			bdev_module->config_json(w);
		}
	}

	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
		spdk_bdev_config_json(bdev, w);
	}

	spdk_json_write_array_end(w);
}

static int
spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;
	uint32_t i;

	STAILQ_INIT(&ch->need_buf_small);
	STAILQ_INIT(&ch->need_buf_large);

	STAILQ_INIT(&ch->per_thread_cache);
	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;

	/* Pre-populate bdev_io cache to ensure this
	 * thread cannot be starved. */
	ch->per_thread_cache_count = 0;
	for (i = 0; i < ch->bdev_io_cache_size; i++) {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		assert(bdev_io != NULL);
		ch->per_thread_cache_count++;
		STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, internal.buf_link);
	}

	TAILQ_INIT(&ch->shared_resources);
	TAILQ_INIT(&ch->io_wait_queue);

	return 0;
}

static void
spdk_bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;

	if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
		SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n");
	}

	if (!TAILQ_EMPTY(&ch->shared_resources)) {
		SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n");
	}

	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}

	assert(ch->per_thread_cache_count == 0);
}

static void
spdk_bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	if (rc == 0) {
		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (m->init_complete) {
				m->init_complete();
			}
		}
	}

	cb_fn(cb_arg, rc);
}

static void
spdk_bdev_module_action_complete(void)
{
	struct spdk_bdev_module *m;

	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->internal.action_in_progress > 0) {
			return;
		}
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
	 */
	spdk_bdev_init_complete(0);
}

static void
spdk_bdev_module_action_done(struct spdk_bdev_module *module)
{
	assert(module->internal.action_in_progress > 0);
	module->internal.action_in_progress--;
	spdk_bdev_module_action_complete();
}

void
spdk_bdev_module_init_done(struct spdk_bdev_module *module)
{
	spdk_bdev_module_action_done(module);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
{
	spdk_bdev_module_action_done(module);
}

static int
spdk_bdev_modules_init(void)
{
	struct spdk_bdev_module *module;
	int rc = 0;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		rc = module->module_init();
		if (rc != 0) {
			break;
		}
	}

	g_bdev_mgr.module_init_complete = true;
	return rc;
}

static void
spdk_bdev_init_failed_complete(void *cb_arg)
{
	spdk_bdev_init_complete(-1);
}

static void
spdk_bdev_init_failed(void *cb_arg)
{
	spdk_bdev_finish(spdk_bdev_init_failed_complete, NULL);
}

void
spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
{
	struct spdk_conf_section *sp;
	struct spdk_bdev_opts bdev_opts;
	int32_t bdev_io_pool_size, bdev_io_cache_size;
	int cache_size;
	int rc = 0;
	char mempool_name[32];

	assert(cb_fn != NULL);

	sp = spdk_conf_find_section(NULL, "Bdev");
	if (sp != NULL) {
		spdk_bdev_get_opts(&bdev_opts);

		bdev_io_pool_size = spdk_conf_section_get_intval(sp, "BdevIoPoolSize");
		if (bdev_io_pool_size >= 0) {
			bdev_opts.bdev_io_pool_size = bdev_io_pool_size;
		}

		bdev_io_cache_size = spdk_conf_section_get_intval(sp, "BdevIoCacheSize");
		if (bdev_io_cache_size >= 0) {
			bdev_opts.bdev_io_cache_size = bdev_io_cache_size;
		}

		if (spdk_bdev_set_opts(&bdev_opts)) {
			spdk_bdev_init_complete(-1);
			return;
		}

		assert(memcmp(&bdev_opts, &g_bdev_opts, sizeof(bdev_opts)) == 0);
	}

	g_init_cb_fn = cb_fn;
	g_init_cb_arg = cb_arg;

	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());

	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
				  g_bdev_opts.bdev_io_pool_size,
				  sizeof(struct spdk_bdev_io) +
				  spdk_bdev_module_get_max_ctx_size(),
				  0,
				  SPDK_ENV_SOCKET_ID_ANY);

	if (g_bdev_mgr.bdev_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	/**
	 * Ensure no more than half of the total buffers end up in local caches, by
	 * using spdk_thread_get_count() to determine how many local caches we need
	 * to account for.
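	 * For example, with BUF_SMALL_POOL_SIZE of 8192 and 4 threads, each
	 * per-thread cache is limited to 8192 / (2 * 4) = 1024 small buffers.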
	 */
	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_thread_get_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());

	g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
				    BUF_SMALL_POOL_SIZE,
				    SPDK_BDEV_SMALL_BUF_MAX_SIZE + 512,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_small_pool) {
		SPDK_ERRLOG("create rbuf small pool failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_thread_get_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());

	g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
				    BUF_LARGE_POOL_SIZE,
				    SPDK_BDEV_LARGE_BUF_MAX_SIZE + 512,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_large_pool) {
		SPDK_ERRLOG("create rbuf large pool failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
				 NULL);
	if (!g_bdev_mgr.zero_buffer) {
		SPDK_ERRLOG("create bdev zero buffer failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

#ifdef SPDK_CONFIG_VTUNE
	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
#endif

	spdk_io_device_register(&g_bdev_mgr, spdk_bdev_mgmt_channel_create,
				spdk_bdev_mgmt_channel_destroy,
				sizeof(struct spdk_bdev_mgmt_channel));

	rc = spdk_bdev_modules_init();
	if (rc != 0) {
		SPDK_ERRLOG("bdev modules init failed\n");
		spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_init_failed, NULL);
		return;
	}

	spdk_bdev_module_action_complete();
}

static void
spdk_bdev_mgr_unregister_cb(void *io_device)
{
	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;

	if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
		SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
			    g_bdev_opts.bdev_io_pool_size);
	}

	if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) {
		SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
			    BUF_SMALL_POOL_SIZE);
		assert(false);
	}

	if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) {
		SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
			    BUF_LARGE_POOL_SIZE);
		assert(false);
	}

	spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
	spdk_mempool_free(g_bdev_mgr.buf_small_pool);
	spdk_mempool_free(g_bdev_mgr.buf_large_pool);
	spdk_dma_free(g_bdev_mgr.zero_buffer);

	cb_fn(g_fini_cb_arg);
	g_fini_cb_fn = NULL;
	g_fini_cb_arg = NULL;
}

static struct spdk_bdev_module *g_resume_bdev_module = NULL;

static void
spdk_bdev_module_finish_iter(void *arg)
{
	struct spdk_bdev_module *bdev_module;

	/* Start iterating from the last touched module */
	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_FIRST(&g_bdev_mgr.bdev_modules);
	} else {
		bdev_module = TAILQ_NEXT(g_resume_bdev_module, internal.tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini) {
			/* Save our place so we can resume later.
			 * We must save the variable here, before calling module_fini()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_finish_done() and re-enter
			 * this function to continue iterating. */
			g_resume_bdev_module = bdev_module;
		}

		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}

		if (bdev_module->async_fini) {
			return;
		}

		bdev_module = TAILQ_NEXT(bdev_module, internal.tailq);
	}

	g_resume_bdev_module = NULL;
	spdk_io_device_unregister(&g_bdev_mgr, spdk_bdev_mgr_unregister_cb);
}

void
spdk_bdev_module_finish_done(void)
{
	if (spdk_get_thread() != g_fini_thread) {
		spdk_thread_send_msg(g_fini_thread, spdk_bdev_module_finish_iter, NULL);
	} else {
		spdk_bdev_module_finish_iter(NULL);
	}
}

static void
_spdk_bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
{
	struct spdk_bdev *bdev = cb_arg;

	if (bdeverrno && bdev) {
		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
			     bdev->name);

		/*
		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
		 * bdev; try to continue by manually removing this bdev from the list and continue
		 * with the next bdev in the list.
		 */
		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
	}

	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n");
		/*
		 * Bdev module finish needs to be deferred, as we might be in the middle of some context
		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
		 * after returning.
		 */
		spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_module_finish_iter, NULL);
		return;
	}

	/*
	 * Unregister the first bdev in the list.
	 *
	 * spdk_bdev_unregister() will handle the case where the bdev has open descriptors by
	 * calling the remove_cb of the descriptors first.
	 *
	 * Once this bdev and all of its open descriptors have been cleaned up, this function
	 * will be called again via the unregister completion callback to continue the cleanup
	 * process with the next bdev.
	 */
	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name);
	spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev);
}

void
spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
{
	assert(cb_fn != NULL);

	g_fini_thread = spdk_get_thread();

	g_fini_cb_fn = cb_fn;
	g_fini_cb_arg = cb_arg;

	_spdk_bdev_finish_unregister_bdevs_iter(NULL, 0);
}

static struct spdk_bdev_io *
spdk_bdev_get_io(struct spdk_bdev_channel *channel)
{
	struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
	struct spdk_bdev_io *bdev_io;

	if (ch->per_thread_cache_count > 0) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
	} else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
		/*
		 * Don't try to look for bdev_ios in the global pool if there are
		 * waiters on bdev_ios - we don't want this caller to jump the line.
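		 * Waiters are serviced from the head of io_wait_queue in
		 * spdk_bdev_free_io() as cached bdev_io are returned.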
		 */
		bdev_io = NULL;
	} else {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
	}

	return bdev_io;
}

void
spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_mgmt_channel *ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	assert(bdev_io != NULL);
	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);

	if (bdev_io->internal.buf != NULL) {
		spdk_bdev_io_put_buf(bdev_io);
	}

	if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
		ch->per_thread_cache_count++;
		STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, internal.buf_link);
		while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
			struct spdk_bdev_io_wait_entry *entry;

			entry = TAILQ_FIRST(&ch->io_wait_queue);
			TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
			entry->cb_fn(entry->cb_arg);
		}
	} else {
		/* We should never have a full cache with entries on the io wait queue. */
		assert(TAILQ_EMPTY(&ch->io_wait_queue));
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}
}

static uint64_t
_spdk_bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		return bdev_io->u.nvme_passthru.nbytes;
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_UNMAP:
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		return bdev_io->u.bdev.num_blocks * bdev->blocklen;
	default:
		return 0;
	}
}

static void
_spdk_bdev_qos_io_submit(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_io *bdev_io = NULL;
	struct spdk_bdev *bdev = ch->bdev;
	struct spdk_bdev_qos *qos = bdev->internal.qos;
	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;

	while (!TAILQ_EMPTY(&qos->queued)) {
		if (qos->max_ios_per_timeslice > 0 &&
		    qos->io_submitted_this_timeslice >= qos->max_ios_per_timeslice) {
			break;
		}

		if (qos->max_byte_per_timeslice > 0 &&
		    qos->byte_submitted_this_timeslice >= qos->max_byte_per_timeslice) {
			break;
		}

		bdev_io = TAILQ_FIRST(&qos->queued);
		TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
		qos->io_submitted_this_timeslice++;
		qos->byte_submitted_this_timeslice += _spdk_bdev_get_io_size_in_byte(bdev_io);
		ch->io_outstanding++;
		shared_resource->io_outstanding++;
		bdev->fn_table->submit_request(ch->channel, bdev_io);
	}
}

static void
_spdk_bdev_io_submit(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_io_channel *ch = bdev_ch->channel;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	bdev_io->internal.submit_tsc = spdk_get_ticks();
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
	bdev_io->internal.in_submit_request = true;
	if (spdk_likely(bdev_ch->flags == 0)) {
		if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
			bdev->fn_table->submit_request(ch, bdev_io);
		} else {
			bdev_ch->io_outstanding--;
			shared_resource->io_outstanding--;
			TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io,
					  internal.link);
		}
	} else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	} else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
		bdev_ch->io_outstanding--;
		shared_resource->io_outstanding--;
		TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link);
		_spdk_bdev_qos_io_submit(bdev_ch);
	} else {
		SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
	bdev_io->internal.in_submit_request = false;
}

static void
spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_thread *thread = spdk_io_channel_get_thread(bdev_io->internal.ch->channel);

	assert(thread != NULL);
	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);

	if (bdev_io->internal.ch->flags & BDEV_CH_QOS_ENABLED) {
		if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) {
			_spdk_bdev_io_submit(bdev_io);
		} else {
			/* Remember the submitting channel, then funnel this I/O through the
			 * QoS channel on the QoS thread. */
			bdev_io->internal.io_submit_ch = bdev_io->internal.ch;
			bdev_io->internal.ch = bdev->internal.qos->ch;
			spdk_thread_send_msg(bdev->internal.qos->thread, _spdk_bdev_io_submit, bdev_io);
		}
	} else {
		_spdk_bdev_io_submit(bdev_io);
	}
}

static void
spdk_bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_io_channel *ch = bdev_ch->channel;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);

	bdev_io->internal.in_submit_request = true;
	bdev->fn_table->submit_request(ch, bdev_io);
	bdev_io->internal.in_submit_request = false;
}

static void
spdk_bdev_io_init(struct spdk_bdev_io *bdev_io,
		  struct spdk_bdev *bdev, void *cb_arg,
		  spdk_bdev_io_completion_cb cb)
{
	bdev_io->bdev = bdev;
	bdev_io->internal.caller_ctx = cb_arg;
	bdev_io->internal.cb = cb;
	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
	bdev_io->internal.in_submit_request = false;
	bdev_io->internal.buf = NULL;
	bdev_io->internal.io_submit_ch = NULL;
}

static bool
_spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
{
	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
}

bool
spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
{
	bool supported;

	supported = _spdk_bdev_io_type_supported(bdev, io_type);

	if (!supported) {
		switch (io_type) {
		case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
			/* The bdev layer will emulate write zeroes as long as write is supported.
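			 * (See spdk_bdev_write_zeroes_blocks(), which falls back to plain writes
			 * from the shared zero buffer when WRITE_ZEROES is not supported.)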
			 */
			supported = _spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
			break;
		default:
			break;
		}
	}

	return supported;
}

int
spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	if (bdev->fn_table->dump_info_json) {
		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
	}

	return 0;
}

void
spdk_bdev_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	assert(bdev != NULL);
	assert(w != NULL);

	if (bdev->fn_table->write_config_json) {
		bdev->fn_table->write_config_json(bdev, w);
	} else {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "name", bdev->name);
		spdk_json_write_object_end(w);
	}
}

static void
spdk_bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
{
	uint64_t max_ios_per_timeslice = 0, max_byte_per_timeslice = 0;

	/* For example, a 10000 I/O per second limit with a 1000 usec timeslice
	 * allows 10 I/O per timeslice. */
	if (qos->iops_rate_limit > 0) {
		max_ios_per_timeslice = qos->iops_rate_limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC /
					SPDK_BDEV_SEC_TO_USEC;
		qos->max_ios_per_timeslice = spdk_max(max_ios_per_timeslice,
						      SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE);
	}

	if (qos->byte_rate_limit > 0) {
		max_byte_per_timeslice = qos->byte_rate_limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC /
					 SPDK_BDEV_SEC_TO_USEC;
		qos->max_byte_per_timeslice = spdk_max(max_byte_per_timeslice,
						       SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE);
	}
}

static int
spdk_bdev_channel_poll_qos(void *arg)
{
	struct spdk_bdev_qos *qos = arg;

	/* Reset for next round of rate limiting */
	qos->io_submitted_this_timeslice = 0;

	/* If more bytes were submitted in the last timeslice than the quota allowed,
	 * carry the excess over so that less is allowed in this timeslice. */
	if (qos->byte_submitted_this_timeslice > qos->max_byte_per_timeslice) {
		qos->byte_submitted_this_timeslice -= qos->max_byte_per_timeslice;
	} else {
		qos->byte_submitted_this_timeslice = 0;
	}

	_spdk_bdev_qos_io_submit(qos->ch);

	return -1;
}

static void
_spdk_bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_shared_resource *shared_resource;

	if (!ch) {
		return;
	}

	if (ch->channel) {
		spdk_put_io_channel(ch->channel);
	}

	assert(ch->io_outstanding == 0);

	shared_resource = ch->shared_resource;
	if (shared_resource) {
		assert(ch->io_outstanding == 0);
		assert(shared_resource->ref > 0);
		shared_resource->ref--;
		if (shared_resource->ref == 0) {
			assert(shared_resource->io_outstanding == 0);
			TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link);
			spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch));
			free(shared_resource);
		}
	}
}

/* Caller must hold bdev->internal.mutex.
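 * spdk_bdev_channel_create() takes the mutex around its call to _spdk_bdev_enable_qos().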
 */
static void
_spdk_bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_qos *qos = bdev->internal.qos;

	/* Rate limiting on this bdev enabled */
	if (qos) {
		if (qos->ch == NULL) {
			struct spdk_io_channel *io_ch;

			SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch,
				      bdev->name, spdk_get_thread());

			/* No qos channel has been selected, so set one up */

			/* Take another reference to ch */
			io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev));
			qos->ch = ch;

			qos->thread = spdk_io_channel_get_thread(io_ch);

			TAILQ_INIT(&qos->queued);
			spdk_bdev_qos_update_max_quota_per_timeslice(qos);
			qos->io_submitted_this_timeslice = 0;
			qos->byte_submitted_this_timeslice = 0;

			qos->poller = spdk_poller_register(spdk_bdev_channel_poll_qos,
							   qos,
							   SPDK_BDEV_QOS_TIMESLICE_IN_USEC);
		}

		ch->flags |= BDEV_CH_QOS_ENABLED;
	}
}

static int
spdk_bdev_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
	struct spdk_bdev_channel *ch = ctx_buf;
	struct spdk_io_channel *mgmt_io_ch;
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	struct spdk_bdev_shared_resource *shared_resource;

	ch->bdev = bdev;
	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
	if (!ch->channel) {
		return -1;
	}

	mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr);
	if (!mgmt_io_ch) {
		return -1;
	}

	mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch);
	TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) {
		if (shared_resource->shared_ch == ch->channel) {
			spdk_put_io_channel(mgmt_io_ch);
			shared_resource->ref++;
			break;
		}
	}

	if (shared_resource == NULL) {
		shared_resource = calloc(1, sizeof(*shared_resource));
		if (shared_resource == NULL) {
			spdk_put_io_channel(mgmt_io_ch);
			return -1;
		}

		shared_resource->mgmt_ch = mgmt_ch;
		shared_resource->io_outstanding = 0;
		TAILQ_INIT(&shared_resource->nomem_io);
		shared_resource->nomem_threshold = 0;
		shared_resource->shared_ch = ch->channel;
		shared_resource->ref = 1;
		TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link);
	}

	memset(&ch->stat, 0, sizeof(ch->stat));
	ch->stat.ticks_rate = spdk_get_ticks_hz();
	ch->io_outstanding = 0;
	TAILQ_INIT(&ch->queued_resets);
	ch->flags = 0;
	ch->shared_resource = shared_resource;

#ifdef SPDK_CONFIG_VTUNE
	{
		char *name;
		__itt_init_ittlib(NULL, 0);
		name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
		if (!name) {
			_spdk_bdev_channel_destroy_resource(ch);
			return -1;
		}
		ch->handle = __itt_string_handle_create(name);
		free(name);
		ch->start_tsc = spdk_get_ticks();
		ch->interval_tsc = spdk_get_ticks_hz() / 100;
		memset(&ch->prev_stat, 0, sizeof(ch->prev_stat));
	}
#endif

	pthread_mutex_lock(&bdev->internal.mutex);
	_spdk_bdev_enable_qos(bdev, ch);
	pthread_mutex_unlock(&bdev->internal.mutex);

	return 0;
}

/*
 * Abort I/O that are waiting on a data buffer. These types of I/O are
 * linked using the spdk_bdev_io internal.buf_link STAILQ_ENTRY.
 */
static void
_spdk_bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch)
{
	bdev_io_stailq_t tmp;
	struct spdk_bdev_io *bdev_io;

	STAILQ_INIT(&tmp);

	while (!STAILQ_EMPTY(queue)) {
		bdev_io = STAILQ_FIRST(queue);
		STAILQ_REMOVE_HEAD(queue, internal.buf_link);
		if (bdev_io->internal.ch == ch) {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		} else {
			STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link);
		}
	}

	STAILQ_SWAP(&tmp, queue, spdk_bdev_io);
}

/*
 * Abort I/O that are queued waiting for submission. These types of I/O are
 * linked using the spdk_bdev_io link TAILQ_ENTRY.
 */
static void
_spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_io *bdev_io, *tmp;

	TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) {
		if (bdev_io->internal.ch == ch) {
			TAILQ_REMOVE(queue, bdev_io, internal.link);
			/*
			 * spdk_bdev_io_complete() assumes that the completed I/O had
			 * been submitted to the bdev module. Since in this case it
			 * hadn't, bump io_outstanding to account for the decrement
			 * that spdk_bdev_io_complete() will do.
			 */
			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
				ch->io_outstanding++;
				ch->shared_resource->io_outstanding++;
			}
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}
}

static void
spdk_bdev_qos_channel_destroy(void *cb_arg)
{
	struct spdk_bdev_qos *qos = cb_arg;

	spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
	spdk_poller_unregister(&qos->poller);

	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Free QoS %p.\n", qos);

	free(qos);
}

static int
spdk_bdev_qos_destroy(struct spdk_bdev *bdev)
{
	/*
	 * Cleanly shutting down the QoS poller is tricky, because
	 * during the asynchronous operation the user could open
	 * a new descriptor and create a new channel, spawning
	 * a new QoS poller.
	 *
	 * The strategy is to create a new QoS structure here and swap it
	 * in. The shutdown path then continues to refer to the old one
	 * until it completes and then releases it.
	 */
	struct spdk_bdev_qos *new_qos, *old_qos;

	old_qos = bdev->internal.qos;

	new_qos = calloc(1, sizeof(*new_qos));
	if (!new_qos) {
		SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n");
		return -ENOMEM;
	}

	/* Copy the old QoS data into the newly allocated structure */
	memcpy(new_qos, old_qos, sizeof(*new_qos));

	/* Zero out the key parts of the QoS structure */
	new_qos->ch = NULL;
	new_qos->thread = NULL;
	new_qos->max_ios_per_timeslice = 0;
	new_qos->max_byte_per_timeslice = 0;
	new_qos->io_submitted_this_timeslice = 0;
	new_qos->byte_submitted_this_timeslice = 0;
	new_qos->poller = NULL;
	TAILQ_INIT(&new_qos->queued);

	bdev->internal.qos = new_qos;

	if (old_qos->thread == NULL) {
		free(old_qos);
	} else {
		spdk_thread_send_msg(old_qos->thread, spdk_bdev_qos_channel_destroy,
				     old_qos);
	}

	/* It is safe to continue with destroying the bdev even though the QoS channel hasn't
	 * been destroyed yet. The destruction path will end up waiting for the final
	 * channel to be put before it releases resources.
	 */

	return 0;
}

static void
_spdk_bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add)
{
	total->bytes_read += add->bytes_read;
	total->num_read_ops += add->num_read_ops;
	total->bytes_written += add->bytes_written;
	total->num_write_ops += add->num_write_ops;
	total->read_latency_ticks += add->read_latency_ticks;
	total->write_latency_ticks += add->write_latency_ticks;
}

static void
spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_channel *ch = ctx_buf;
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;

	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
		      spdk_get_thread());

	/* This channel is going away, so add its statistics into the bdev so that they don't get lost. */
	pthread_mutex_lock(&ch->bdev->internal.mutex);
	_spdk_bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat);
	pthread_mutex_unlock(&ch->bdev->internal.mutex);

	mgmt_ch = shared_resource->mgmt_ch;

	_spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
	_spdk_bdev_abort_queued_io(&shared_resource->nomem_io, ch);
	_spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_small, ch);
	_spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_large, ch);

	_spdk_bdev_channel_destroy_resource(ch);
}

int
spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
{
	struct spdk_bdev_alias *tmp;

	if (alias == NULL) {
		SPDK_ERRLOG("Empty alias passed\n");
		return -EINVAL;
	}

	if (spdk_bdev_get_by_name(alias)) {
		SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias);
		return -EEXIST;
	}

	tmp = calloc(1, sizeof(*tmp));
	if (tmp == NULL) {
		SPDK_ERRLOG("Unable to allocate alias\n");
		return -ENOMEM;
	}

	tmp->alias = strdup(alias);
	if (tmp->alias == NULL) {
		free(tmp);
		SPDK_ERRLOG("Unable to allocate alias\n");
		return -ENOMEM;
	}

	TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);

	return 0;
}

int
spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
{
	struct spdk_bdev_alias *tmp;

	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (strcmp(alias, tmp->alias) == 0) {
			TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
			free(tmp->alias);
			free(tmp);
			return 0;
		}
	}

	SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exist\n", alias);

	return -ENOENT;
}

struct spdk_io_channel *
spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
{
	return spdk_get_io_channel(__bdev_to_io_dev(desc->bdev));
}

const char *
spdk_bdev_get_name(const struct spdk_bdev *bdev)
{
	return bdev->name;
}

const char *
spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
{
	return bdev->product_name;
}

const struct spdk_bdev_aliases_list *
spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
{
	return &bdev->aliases;
}

uint32_t
spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
{
	return bdev->blocklen;
}

uint64_t
spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
{
	return bdev->blockcnt;
}

uint64_t
spdk_bdev_get_qos_ios_per_sec(struct
			      spdk_bdev *bdev)
{
	uint64_t iops_rate_limit = 0;

	pthread_mutex_lock(&bdev->internal.mutex);
	if (bdev->internal.qos) {
		iops_rate_limit = bdev->internal.qos->iops_rate_limit;
	}
	pthread_mutex_unlock(&bdev->internal.mutex);

	return iops_rate_limit;
}

size_t
spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
{
	/* TODO: push this logic down to the bdev modules */
	if (bdev->need_aligned_buffer) {
		return bdev->blocklen;
	}

	return 1;
}

uint32_t
spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
{
	return bdev->optimal_io_boundary;
}

bool
spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
{
	return bdev->write_cache;
}

const struct spdk_uuid *
spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
{
	return &bdev->uuid;
}

uint64_t
spdk_bdev_get_qd(const struct spdk_bdev *bdev)
{
	return bdev->internal.measured_queue_depth;
}

uint64_t
spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev)
{
	return bdev->internal.period;
}

uint64_t
spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev)
{
	return bdev->internal.weighted_io_time;
}

uint64_t
spdk_bdev_get_io_time(const struct spdk_bdev *bdev)
{
	return bdev->internal.io_time;
}

static void
_calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i);

	bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth;

	if (bdev->internal.measured_queue_depth) {
		bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth;
	}
}

static void
_calculate_measured_qd(struct spdk_io_channel_iter *i)
{
	struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i);
	struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch);

	bdev->internal.temporary_queue_depth += ch->io_outstanding;
	spdk_for_each_channel_continue(i, 0);
}

static int
spdk_bdev_calculate_measured_queue_depth(void *ctx)
{
	struct spdk_bdev *bdev = ctx;

	bdev->internal.temporary_queue_depth = 0;
	spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev,
			      _calculate_measured_qd_cpl);
	return 0;
}

void
spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period)
{
	bdev->internal.period = period;

	if (bdev->internal.qd_poller != NULL) {
		spdk_poller_unregister(&bdev->internal.qd_poller);
		bdev->internal.measured_queue_depth = UINT64_MAX;
	}

	if (period != 0) {
		bdev->internal.qd_poller = spdk_poller_register(spdk_bdev_calculate_measured_queue_depth, bdev,
					   period);
	}
}

int
spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
{
	int ret;

	pthread_mutex_lock(&bdev->internal.mutex);

	/* bdev has open descriptors */
	if (!TAILQ_EMPTY(&bdev->internal.open_descs) &&
	    bdev->blockcnt > size) {
		ret = -EBUSY;
	} else {
		bdev->blockcnt = size;
		ret = 0;
	}

	pthread_mutex_unlock(&bdev->internal.mutex);

	return ret;
}

/*
 * Convert I/O offset and length from bytes to blocks.
 *
 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size.
 */
static uint64_t
spdk_bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks,
			  uint64_t num_bytes, uint64_t *num_blocks)
{
	uint32_t block_size = bdev->blocklen;

	*offset_blocks = offset_bytes / block_size;
	*num_blocks = num_bytes / block_size;

	return (offset_bytes % block_size) | (num_bytes % block_size);
}

static bool
spdk_bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
{
	/* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there
	 * has been an overflow and hence the offset has been wrapped around */
	if (offset_blocks + num_blocks < offset_blocks) {
		return false;
	}

	/* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
	if (offset_blocks + num_blocks > bdev->blockcnt) {
		return false;
	}

	return true;
}

int
spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
	       void *buf, uint64_t offset, uint64_t nbytes,
	       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		      void *buf, uint64_t offset_blocks, uint64_t num_blocks,
		      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.bdev.iovs = &bdev_io->iov;
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		struct iovec *iov, int iovcnt,
		uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
}

int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			   struct iovec *iov, int iovcnt,
			   uint64_t offset_blocks, uint64_t num_blocks,
			   spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel =
		spdk_io_channel_get_ctx(ch);

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.bdev.iovs = iov;
	bdev_io->u.bdev.iovcnt = iovcnt;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		void *buf, uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       void *buf, uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.bdev.iovs = &bdev_io->iov;
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		 struct iovec *iov, int iovcnt,
		 uint64_t offset, uint64_t len,
		 spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			struct iovec *iov, int iovcnt,
			uint64_t offset_blocks, uint64_t num_blocks,
			spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.bdev.iovs = iov;
	bdev_io->u.bdev.iovcnt = iovcnt;
	bdev_io->u.bdev.num_blocks = num_blocks;
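	/* The iovec array is referenced directly (not copied), so it must remain
	 * valid until the completion callback is invoked. */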
bdev_io->u.bdev.offset_blocks = offset_blocks; 1961 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 1962 1963 spdk_bdev_io_submit(bdev_io); 1964 return 0; 1965 } 1966 1967 int 1968 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1969 uint64_t offset, uint64_t len, 1970 spdk_bdev_io_completion_cb cb, void *cb_arg) 1971 { 1972 uint64_t offset_blocks, num_blocks; 1973 1974 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) { 1975 return -EINVAL; 1976 } 1977 1978 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 1979 } 1980 1981 int 1982 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1983 uint64_t offset_blocks, uint64_t num_blocks, 1984 spdk_bdev_io_completion_cb cb, void *cb_arg) 1985 { 1986 struct spdk_bdev *bdev = desc->bdev; 1987 struct spdk_bdev_io *bdev_io; 1988 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 1989 uint64_t len; 1990 bool split_request = false; 1991 1992 if (!desc->write) { 1993 return -EBADF; 1994 } 1995 1996 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 1997 return -EINVAL; 1998 } 1999 2000 bdev_io = spdk_bdev_get_io(channel); 2001 2002 if (!bdev_io) { 2003 return -ENOMEM; 2004 } 2005 2006 bdev_io->internal.ch = channel; 2007 bdev_io->u.bdev.offset_blocks = offset_blocks; 2008 2009 if (_spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 2010 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 2011 bdev_io->u.bdev.num_blocks = num_blocks; 2012 bdev_io->u.bdev.iovs = NULL; 2013 bdev_io->u.bdev.iovcnt = 0; 2014 2015 } else if (_spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 2016 assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE); 2017 2018 len = spdk_bdev_get_block_size(bdev) * num_blocks; 2019 2020 if (len > ZERO_BUFFER_SIZE) { 2021 split_request = true; 2022 len = ZERO_BUFFER_SIZE; 2023 } 2024 2025 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 2026 bdev_io->u.bdev.iovs = &bdev_io->iov; 2027 bdev_io->u.bdev.iovs[0].iov_base = g_bdev_mgr.zero_buffer; 2028 bdev_io->u.bdev.iovs[0].iov_len = len; 2029 bdev_io->u.bdev.iovcnt = 1; 2030 bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev); 2031 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks - bdev_io->u.bdev.num_blocks; 2032 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks + bdev_io->u.bdev.num_blocks; 2033 } else { 2034 spdk_bdev_free_io(bdev_io); 2035 return -ENOTSUP; 2036 } 2037 2038 if (split_request) { 2039 bdev_io->u.bdev.stored_user_cb = cb; 2040 spdk_bdev_io_init(bdev_io, bdev, cb_arg, spdk_bdev_write_zeroes_split); 2041 } else { 2042 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2043 } 2044 spdk_bdev_io_submit(bdev_io); 2045 return 0; 2046 } 2047 2048 int 2049 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2050 uint64_t offset, uint64_t nbytes, 2051 spdk_bdev_io_completion_cb cb, void *cb_arg) 2052 { 2053 uint64_t offset_blocks, num_blocks; 2054 2055 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 2056 return -EINVAL; 2057 } 2058 2059 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 2060 } 2061 2062 int 2063 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2064 uint64_t offset_blocks, uint64_t num_blocks, 2065 spdk_bdev_io_completion_cb cb, void *cb_arg) 2066 { 2067 struct spdk_bdev *bdev = desc->bdev; 2068 struct spdk_bdev_io *bdev_io; 2069 
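	/* Unmap carries no data payload: only offset_blocks/num_blocks are meaningful
	 * to the backing module, and the zero-length iovec filled in below is just a
	 * placeholder. A zero-block unmap is rejected with -EINVAL. */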
struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2070 2071 if (!desc->write) { 2072 return -EBADF; 2073 } 2074 2075 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2076 return -EINVAL; 2077 } 2078 2079 if (num_blocks == 0) { 2080 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 2081 return -EINVAL; 2082 } 2083 2084 bdev_io = spdk_bdev_get_io(channel); 2085 if (!bdev_io) { 2086 return -ENOMEM; 2087 } 2088 2089 bdev_io->internal.ch = channel; 2090 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 2091 2092 bdev_io->u.bdev.iovs = &bdev_io->iov; 2093 bdev_io->u.bdev.iovs[0].iov_base = NULL; 2094 bdev_io->u.bdev.iovs[0].iov_len = 0; 2095 bdev_io->u.bdev.iovcnt = 1; 2096 2097 bdev_io->u.bdev.offset_blocks = offset_blocks; 2098 bdev_io->u.bdev.num_blocks = num_blocks; 2099 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2100 2101 spdk_bdev_io_submit(bdev_io); 2102 return 0; 2103 } 2104 2105 int 2106 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2107 uint64_t offset, uint64_t length, 2108 spdk_bdev_io_completion_cb cb, void *cb_arg) 2109 { 2110 uint64_t offset_blocks, num_blocks; 2111 2112 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, length, &num_blocks) != 0) { 2113 return -EINVAL; 2114 } 2115 2116 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 2117 } 2118 2119 int 2120 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2121 uint64_t offset_blocks, uint64_t num_blocks, 2122 spdk_bdev_io_completion_cb cb, void *cb_arg) 2123 { 2124 struct spdk_bdev *bdev = desc->bdev; 2125 struct spdk_bdev_io *bdev_io; 2126 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2127 2128 if (!desc->write) { 2129 return -EBADF; 2130 } 2131 2132 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2133 return -EINVAL; 2134 } 2135 2136 bdev_io = spdk_bdev_get_io(channel); 2137 if (!bdev_io) { 2138 return -ENOMEM; 2139 } 2140 2141 bdev_io->internal.ch = channel; 2142 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 2143 bdev_io->u.bdev.iovs = NULL; 2144 bdev_io->u.bdev.iovcnt = 0; 2145 bdev_io->u.bdev.offset_blocks = offset_blocks; 2146 bdev_io->u.bdev.num_blocks = num_blocks; 2147 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2148 2149 spdk_bdev_io_submit(bdev_io); 2150 return 0; 2151 } 2152 2153 static void 2154 _spdk_bdev_reset_dev(struct spdk_io_channel_iter *i, int status) 2155 { 2156 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 2157 struct spdk_bdev_io *bdev_io; 2158 2159 bdev_io = TAILQ_FIRST(&ch->queued_resets); 2160 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 2161 spdk_bdev_io_submit_reset(bdev_io); 2162 } 2163 2164 static void 2165 _spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 2166 { 2167 struct spdk_io_channel *ch; 2168 struct spdk_bdev_channel *channel; 2169 struct spdk_bdev_mgmt_channel *mgmt_channel; 2170 struct spdk_bdev_shared_resource *shared_resource; 2171 bdev_io_tailq_t tmp_queued; 2172 2173 TAILQ_INIT(&tmp_queued); 2174 2175 ch = spdk_io_channel_iter_get_channel(i); 2176 channel = spdk_io_channel_get_ctx(ch); 2177 shared_resource = channel->shared_resource; 2178 mgmt_channel = shared_resource->mgmt_ch; 2179 2180 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 2181 2182 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 2183 /* The QoS object is always valid and readable while 2184 * the channel flag is set, so the lock here should not 2185 * be necessary. 
We're not in the fast path though, so 2186 * just take it anyway. */ 2187 pthread_mutex_lock(&channel->bdev->internal.mutex); 2188 if (channel->bdev->internal.qos->ch == channel) { 2189 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 2190 } 2191 pthread_mutex_unlock(&channel->bdev->internal.mutex); 2192 } 2193 2194 _spdk_bdev_abort_queued_io(&shared_resource->nomem_io, channel); 2195 _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel); 2196 _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel); 2197 _spdk_bdev_abort_queued_io(&tmp_queued, channel); 2198 2199 spdk_for_each_channel_continue(i, 0); 2200 } 2201 2202 static void 2203 _spdk_bdev_start_reset(void *ctx) 2204 { 2205 struct spdk_bdev_channel *ch = ctx; 2206 2207 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), _spdk_bdev_reset_freeze_channel, 2208 ch, _spdk_bdev_reset_dev); 2209 } 2210 2211 static void 2212 _spdk_bdev_channel_start_reset(struct spdk_bdev_channel *ch) 2213 { 2214 struct spdk_bdev *bdev = ch->bdev; 2215 2216 assert(!TAILQ_EMPTY(&ch->queued_resets)); 2217 2218 pthread_mutex_lock(&bdev->internal.mutex); 2219 if (bdev->internal.reset_in_progress == NULL) { 2220 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 2221 /* 2222 * Take a channel reference for the target bdev for the life of this 2223 * reset. This guards against the channel getting destroyed while 2224 * spdk_for_each_channel() calls related to this reset IO are in 2225 * progress. We will release the reference when this reset is 2226 * completed. 2227 */ 2228 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 2229 _spdk_bdev_start_reset(ch); 2230 } 2231 pthread_mutex_unlock(&bdev->internal.mutex); 2232 } 2233 2234 int 2235 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2236 spdk_bdev_io_completion_cb cb, void *cb_arg) 2237 { 2238 struct spdk_bdev *bdev = desc->bdev; 2239 struct spdk_bdev_io *bdev_io; 2240 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2241 2242 bdev_io = spdk_bdev_get_io(channel); 2243 if (!bdev_io) { 2244 return -ENOMEM; 2245 } 2246 2247 bdev_io->internal.ch = channel; 2248 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 2249 bdev_io->u.reset.ch_ref = NULL; 2250 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2251 2252 pthread_mutex_lock(&bdev->internal.mutex); 2253 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 2254 pthread_mutex_unlock(&bdev->internal.mutex); 2255 2256 _spdk_bdev_channel_start_reset(channel); 2257 2258 return 0; 2259 } 2260 2261 void 2262 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 2263 struct spdk_bdev_io_stat *stat) 2264 { 2265 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2266 2267 *stat = channel->stat; 2268 } 2269 2270 static void 2271 _spdk_bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 2272 { 2273 void *io_device = spdk_io_channel_iter_get_io_device(i); 2274 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 2275 2276 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 2277 bdev_iostat_ctx->cb_arg, 0); 2278 free(bdev_iostat_ctx); 2279 } 2280 2281 static void 2282 _spdk_bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 2283 { 2284 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 2285 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2286 struct 
spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2287 2288 _spdk_bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); 2289 spdk_for_each_channel_continue(i, 0); 2290 } 2291 2292 void 2293 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 2294 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 2295 { 2296 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 2297 2298 assert(bdev != NULL); 2299 assert(stat != NULL); 2300 assert(cb != NULL); 2301 2302 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 2303 if (bdev_iostat_ctx == NULL) { 2304 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 2305 cb(bdev, stat, cb_arg, -ENOMEM); 2306 return; 2307 } 2308 2309 bdev_iostat_ctx->stat = stat; 2310 bdev_iostat_ctx->cb = cb; 2311 bdev_iostat_ctx->cb_arg = cb_arg; 2312 2313 /* Start with the statistics from previously deleted channels. */ 2314 pthread_mutex_lock(&bdev->internal.mutex); 2315 _spdk_bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); 2316 pthread_mutex_unlock(&bdev->internal.mutex); 2317 2318 /* Then iterate and add the statistics from each existing channel. */ 2319 spdk_for_each_channel(__bdev_to_io_dev(bdev), 2320 _spdk_bdev_get_each_channel_stat, 2321 bdev_iostat_ctx, 2322 _spdk_bdev_get_device_stat_done); 2323 } 2324 2325 int 2326 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2327 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 2328 spdk_bdev_io_completion_cb cb, void *cb_arg) 2329 { 2330 struct spdk_bdev *bdev = desc->bdev; 2331 struct spdk_bdev_io *bdev_io; 2332 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2333 2334 if (!desc->write) { 2335 return -EBADF; 2336 } 2337 2338 bdev_io = spdk_bdev_get_io(channel); 2339 if (!bdev_io) { 2340 return -ENOMEM; 2341 } 2342 2343 bdev_io->internal.ch = channel; 2344 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 2345 bdev_io->u.nvme_passthru.cmd = *cmd; 2346 bdev_io->u.nvme_passthru.buf = buf; 2347 bdev_io->u.nvme_passthru.nbytes = nbytes; 2348 bdev_io->u.nvme_passthru.md_buf = NULL; 2349 bdev_io->u.nvme_passthru.md_len = 0; 2350 2351 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2352 2353 spdk_bdev_io_submit(bdev_io); 2354 return 0; 2355 } 2356 2357 int 2358 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2359 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 2360 spdk_bdev_io_completion_cb cb, void *cb_arg) 2361 { 2362 struct spdk_bdev *bdev = desc->bdev; 2363 struct spdk_bdev_io *bdev_io; 2364 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2365 2366 if (!desc->write) { 2367 /* 2368 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 2369 * to easily determine if the command is a read or write, but for now just 2370 * do not allow io_passthru with a read-only descriptor. 
2371 */ 2372 return -EBADF; 2373 } 2374 2375 bdev_io = spdk_bdev_get_io(channel); 2376 if (!bdev_io) { 2377 return -ENOMEM; 2378 } 2379 2380 bdev_io->internal.ch = channel; 2381 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 2382 bdev_io->u.nvme_passthru.cmd = *cmd; 2383 bdev_io->u.nvme_passthru.buf = buf; 2384 bdev_io->u.nvme_passthru.nbytes = nbytes; 2385 bdev_io->u.nvme_passthru.md_buf = NULL; 2386 bdev_io->u.nvme_passthru.md_len = 0; 2387 2388 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2389 2390 spdk_bdev_io_submit(bdev_io); 2391 return 0; 2392 } 2393 2394 int 2395 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2396 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 2397 spdk_bdev_io_completion_cb cb, void *cb_arg) 2398 { 2399 struct spdk_bdev *bdev = desc->bdev; 2400 struct spdk_bdev_io *bdev_io; 2401 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2402 2403 if (!desc->write) { 2404 /* 2405 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 2406 * to easily determine if the command is a read or write, but for now just 2407 * do not allow io_passthru with a read-only descriptor. 2408 */ 2409 return -EBADF; 2410 } 2411 2412 bdev_io = spdk_bdev_get_io(channel); 2413 if (!bdev_io) { 2414 return -ENOMEM; 2415 } 2416 2417 bdev_io->internal.ch = channel; 2418 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 2419 bdev_io->u.nvme_passthru.cmd = *cmd; 2420 bdev_io->u.nvme_passthru.buf = buf; 2421 bdev_io->u.nvme_passthru.nbytes = nbytes; 2422 bdev_io->u.nvme_passthru.md_buf = md_buf; 2423 bdev_io->u.nvme_passthru.md_len = md_len; 2424 2425 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2426 2427 spdk_bdev_io_submit(bdev_io); 2428 return 0; 2429 } 2430 2431 int 2432 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 2433 struct spdk_bdev_io_wait_entry *entry) 2434 { 2435 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2436 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 2437 2438 if (bdev != entry->bdev) { 2439 SPDK_ERRLOG("bdevs do not match\n"); 2440 return -EINVAL; 2441 } 2442 2443 if (mgmt_ch->per_thread_cache_count > 0) { 2444 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 2445 return -EINVAL; 2446 } 2447 2448 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 2449 return 0; 2450 } 2451 2452 static void 2453 _spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 2454 { 2455 struct spdk_bdev *bdev = bdev_ch->bdev; 2456 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2457 struct spdk_bdev_io *bdev_io; 2458 2459 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 2460 /* 2461 * Allow some more I/O to complete before retrying the nomem_io queue. 2462 * Some drivers (such as nvme) cannot immediately take a new I/O in 2463 * the context of a completion, because the resources for the I/O are 2464 * not released until control returns to the bdev poller. Also, we 2465 * may require several small I/O to complete before a larger I/O 2466 * (that requires splitting) can be submitted. 
2467 */ 2468 return; 2469 } 2470 2471 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 2472 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 2473 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 2474 bdev_io->internal.ch->io_outstanding++; 2475 shared_resource->io_outstanding++; 2476 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 2477 bdev->fn_table->submit_request(bdev_io->internal.ch->channel, bdev_io); 2478 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 2479 break; 2480 } 2481 } 2482 } 2483 2484 static inline void 2485 _spdk_bdev_io_complete(void *ctx) 2486 { 2487 struct spdk_bdev_io *bdev_io = ctx; 2488 2489 if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { 2490 /* 2491 * Send the completion to the thread that originally submitted the I/O, 2492 * which may not be the current thread in the case of QoS. 2493 */ 2494 if (bdev_io->internal.io_submit_ch) { 2495 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2496 bdev_io->internal.io_submit_ch = NULL; 2497 } 2498 2499 /* 2500 * Defer completion to avoid potential infinite recursion if the 2501 * user's completion callback issues a new I/O. 2502 */ 2503 spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->internal.ch->channel), 2504 _spdk_bdev_io_complete, bdev_io); 2505 return; 2506 } 2507 2508 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 2509 switch (bdev_io->type) { 2510 case SPDK_BDEV_IO_TYPE_READ: 2511 bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 2512 bdev_io->internal.ch->stat.num_read_ops++; 2513 bdev_io->internal.ch->stat.read_latency_ticks += (spdk_get_ticks() - bdev_io->internal.submit_tsc); 2514 break; 2515 case SPDK_BDEV_IO_TYPE_WRITE: 2516 bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 2517 bdev_io->internal.ch->stat.num_write_ops++; 2518 bdev_io->internal.ch->stat.write_latency_ticks += (spdk_get_ticks() - bdev_io->internal.submit_tsc); 2519 break; 2520 default: 2521 break; 2522 } 2523 } 2524 2525 #ifdef SPDK_CONFIG_VTUNE 2526 uint64_t now_tsc = spdk_get_ticks(); 2527 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 2528 uint64_t data[5]; 2529 2530 data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; 2531 data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; 2532 data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; 2533 data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; 2534 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
2535 bdev_io->bdev->fn_table->get_spin_time(bdev_io->internal.ch->channel) : 0; 2536 2537 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 2538 __itt_metadata_u64, 5, data); 2539 2540 bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; 2541 bdev_io->internal.ch->start_tsc = now_tsc; 2542 } 2543 #endif 2544 2545 assert(bdev_io->internal.cb != NULL); 2546 assert(spdk_get_thread() == spdk_io_channel_get_thread(bdev_io->internal.ch->channel)); 2547 2548 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2549 bdev_io->internal.caller_ctx); 2550 } 2551 2552 static void 2553 _spdk_bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 2554 { 2555 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 2556 2557 if (bdev_io->u.reset.ch_ref != NULL) { 2558 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 2559 bdev_io->u.reset.ch_ref = NULL; 2560 } 2561 2562 _spdk_bdev_io_complete(bdev_io); 2563 } 2564 2565 static void 2566 _spdk_bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 2567 { 2568 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2569 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 2570 2571 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 2572 if (!TAILQ_EMPTY(&ch->queued_resets)) { 2573 _spdk_bdev_channel_start_reset(ch); 2574 } 2575 2576 spdk_for_each_channel_continue(i, 0); 2577 } 2578 2579 void 2580 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 2581 { 2582 struct spdk_bdev *bdev = bdev_io->bdev; 2583 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2584 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2585 2586 bdev_io->internal.status = status; 2587 2588 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 2589 bool unlock_channels = false; 2590 2591 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 2592 SPDK_ERRLOG("NOMEM returned for reset\n"); 2593 } 2594 pthread_mutex_lock(&bdev->internal.mutex); 2595 if (bdev_io == bdev->internal.reset_in_progress) { 2596 bdev->internal.reset_in_progress = NULL; 2597 unlock_channels = true; 2598 } 2599 pthread_mutex_unlock(&bdev->internal.mutex); 2600 2601 if (unlock_channels) { 2602 spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_unfreeze_channel, 2603 bdev_io, _spdk_bdev_reset_complete); 2604 return; 2605 } 2606 } else { 2607 assert(bdev_ch->io_outstanding > 0); 2608 assert(shared_resource->io_outstanding > 0); 2609 bdev_ch->io_outstanding--; 2610 shared_resource->io_outstanding--; 2611 2612 if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) { 2613 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 2614 /* 2615 * Wait for some of the outstanding I/O to complete before we 2616 * retry any of the nomem_io. Normally we will wait for 2617 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 2618 * depth channels we will instead wait for half to complete. 
2619 */ 2620 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 2621 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 2622 return; 2623 } 2624 2625 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 2626 _spdk_bdev_ch_retry_io(bdev_ch); 2627 } 2628 } 2629 2630 _spdk_bdev_io_complete(bdev_io); 2631 } 2632 2633 void 2634 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 2635 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 2636 { 2637 if (sc == SPDK_SCSI_STATUS_GOOD) { 2638 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2639 } else { 2640 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 2641 bdev_io->internal.error.scsi.sc = sc; 2642 bdev_io->internal.error.scsi.sk = sk; 2643 bdev_io->internal.error.scsi.asc = asc; 2644 bdev_io->internal.error.scsi.ascq = ascq; 2645 } 2646 2647 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 2648 } 2649 2650 void 2651 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 2652 int *sc, int *sk, int *asc, int *ascq) 2653 { 2654 assert(sc != NULL); 2655 assert(sk != NULL); 2656 assert(asc != NULL); 2657 assert(ascq != NULL); 2658 2659 switch (bdev_io->internal.status) { 2660 case SPDK_BDEV_IO_STATUS_SUCCESS: 2661 *sc = SPDK_SCSI_STATUS_GOOD; 2662 *sk = SPDK_SCSI_SENSE_NO_SENSE; 2663 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 2664 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 2665 break; 2666 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 2667 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 2668 break; 2669 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 2670 *sc = bdev_io->internal.error.scsi.sc; 2671 *sk = bdev_io->internal.error.scsi.sk; 2672 *asc = bdev_io->internal.error.scsi.asc; 2673 *ascq = bdev_io->internal.error.scsi.ascq; 2674 break; 2675 default: 2676 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 2677 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 2678 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 2679 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 2680 break; 2681 } 2682 } 2683 2684 void 2685 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc) 2686 { 2687 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 2688 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2689 } else { 2690 bdev_io->internal.error.nvme.sct = sct; 2691 bdev_io->internal.error.nvme.sc = sc; 2692 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 2693 } 2694 2695 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 2696 } 2697 2698 void 2699 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc) 2700 { 2701 assert(sct != NULL); 2702 assert(sc != NULL); 2703 2704 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 2705 *sct = bdev_io->internal.error.nvme.sct; 2706 *sc = bdev_io->internal.error.nvme.sc; 2707 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 2708 *sct = SPDK_NVME_SCT_GENERIC; 2709 *sc = SPDK_NVME_SC_SUCCESS; 2710 } else { 2711 *sct = SPDK_NVME_SCT_GENERIC; 2712 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2713 } 2714 } 2715 2716 struct spdk_thread * 2717 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 2718 { 2719 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 2720 } 2721 2722 static void 2723 _spdk_bdev_qos_config_type(struct spdk_bdev *bdev, uint64_t qos_set, 2724 enum spdk_bdev_qos_type qos_type) 2725 { 2726 uint64_t min_qos_set = 0; 2727 2728 switch (qos_type) { 2729 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 
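		/* IOPS limits are given directly in I/Os per second; byte-per-second
		 * limits (the next case) are given in MB/s in the config file and
		 * converted to bytes per second further down. */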
2730 min_qos_set = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 2731 break; 2732 case SPDK_BDEV_QOS_RW_BYTEPS_RATE_LIMIT: 2733 min_qos_set = SPDK_BDEV_QOS_MIN_BW_IN_MB_PER_SEC; 2734 break; 2735 default: 2736 SPDK_ERRLOG("Unsupported QoS type.\n"); 2737 return; 2738 } 2739 2740 if (qos_set % min_qos_set) { 2741 SPDK_ERRLOG("Assigned QoS %" PRIu64 " on bdev %s is not multiple of %lu\n", 2742 qos_set, bdev->name, min_qos_set); 2743 SPDK_ERRLOG("Failed to enable QoS on this bdev %s\n", bdev->name); 2744 return; 2745 } 2746 2747 if (!bdev->internal.qos) { 2748 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 2749 if (!bdev->internal.qos) { 2750 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 2751 return; 2752 } 2753 } 2754 2755 switch (qos_type) { 2756 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2757 bdev->internal.qos->iops_rate_limit = qos_set; 2758 break; 2759 case SPDK_BDEV_QOS_RW_BYTEPS_RATE_LIMIT: 2760 bdev->internal.qos->byte_rate_limit = qos_set * 1024 * 1024; 2761 break; 2762 default: 2763 break; 2764 } 2765 2766 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS type:%d set:%lu\n", 2767 bdev->name, qos_type, qos_set); 2768 2769 return; 2770 } 2771 2772 static void 2773 _spdk_bdev_qos_config(struct spdk_bdev *bdev) 2774 { 2775 struct spdk_conf_section *sp = NULL; 2776 const char *val = NULL; 2777 uint64_t qos_set = 0; 2778 int i = 0, j = 0; 2779 2780 sp = spdk_conf_find_section(NULL, "QoS"); 2781 if (!sp) { 2782 return; 2783 } 2784 2785 while (j < SPDK_BDEV_QOS_NUM_TYPES) { 2786 i = 0; 2787 while (true) { 2788 val = spdk_conf_section_get_nmval(sp, qos_type_str[j], i, 0); 2789 if (!val) { 2790 break; 2791 } 2792 2793 if (strcmp(bdev->name, val) != 0) { 2794 i++; 2795 continue; 2796 } 2797 2798 val = spdk_conf_section_get_nmval(sp, qos_type_str[j], i, 1); 2799 if (val) { 2800 qos_set = strtoull(val, NULL, 10); 2801 _spdk_bdev_qos_config_type(bdev, qos_set, j); 2802 } 2803 2804 break; 2805 } 2806 2807 j++; 2808 } 2809 2810 return; 2811 } 2812 2813 static int 2814 spdk_bdev_init(struct spdk_bdev *bdev) 2815 { 2816 assert(bdev->module != NULL); 2817 2818 if (!bdev->name) { 2819 SPDK_ERRLOG("Bdev name is NULL\n"); 2820 return -EINVAL; 2821 } 2822 2823 if (spdk_bdev_get_by_name(bdev->name)) { 2824 SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name); 2825 return -EEXIST; 2826 } 2827 2828 bdev->internal.status = SPDK_BDEV_STATUS_READY; 2829 bdev->internal.measured_queue_depth = UINT64_MAX; 2830 2831 TAILQ_INIT(&bdev->internal.open_descs); 2832 2833 TAILQ_INIT(&bdev->aliases); 2834 2835 bdev->internal.reset_in_progress = NULL; 2836 2837 _spdk_bdev_qos_config(bdev); 2838 2839 spdk_io_device_register(__bdev_to_io_dev(bdev), 2840 spdk_bdev_channel_create, spdk_bdev_channel_destroy, 2841 sizeof(struct spdk_bdev_channel)); 2842 2843 pthread_mutex_init(&bdev->internal.mutex, NULL); 2844 return 0; 2845 } 2846 2847 static void 2848 spdk_bdev_destroy_cb(void *io_device) 2849 { 2850 int rc; 2851 struct spdk_bdev *bdev; 2852 spdk_bdev_unregister_cb cb_fn; 2853 void *cb_arg; 2854 2855 bdev = __bdev_from_io_dev(io_device); 2856 cb_fn = bdev->internal.unregister_cb; 2857 cb_arg = bdev->internal.unregister_ctx; 2858 2859 rc = bdev->fn_table->destruct(bdev->ctxt); 2860 if (rc < 0) { 2861 SPDK_ERRLOG("destruct failed\n"); 2862 } 2863 if (rc <= 0 && cb_fn != NULL) { 2864 cb_fn(cb_arg, rc); 2865 } 2866 } 2867 2868 2869 static void 2870 spdk_bdev_fini(struct spdk_bdev *bdev) 2871 { 2872 pthread_mutex_destroy(&bdev->internal.mutex); 2873 2874 free(bdev->internal.qos); 2875 2876 
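	/* Unregistering the io_device is asynchronous: spdk_bdev_destroy_cb runs once
	 * every channel has been released, and it is what finally calls the module's
	 * destruct callback and the user's unregister callback. */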
spdk_io_device_unregister(__bdev_to_io_dev(bdev), spdk_bdev_destroy_cb); 2877 } 2878 2879 static void 2880 spdk_bdev_start(struct spdk_bdev *bdev) 2881 { 2882 struct spdk_bdev_module *module; 2883 uint32_t action; 2884 2885 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name); 2886 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 2887 2888 /* Examine configuration before initializing I/O */ 2889 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2890 if (module->examine_config) { 2891 action = module->internal.action_in_progress; 2892 module->internal.action_in_progress++; 2893 module->examine_config(bdev); 2894 if (action != module->internal.action_in_progress) { 2895 SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", 2896 module->name); 2897 } 2898 } 2899 } 2900 2901 if (bdev->internal.claim_module) { 2902 return; 2903 } 2904 2905 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2906 if (module->examine_disk) { 2907 module->internal.action_in_progress++; 2908 module->examine_disk(bdev); 2909 } 2910 } 2911 } 2912 2913 int 2914 spdk_bdev_register(struct spdk_bdev *bdev) 2915 { 2916 int rc = spdk_bdev_init(bdev); 2917 2918 if (rc == 0) { 2919 spdk_bdev_start(bdev); 2920 } 2921 2922 return rc; 2923 } 2924 2925 static void 2926 spdk_vbdev_remove_base_bdevs(struct spdk_bdev *vbdev) 2927 { 2928 struct spdk_bdev **bdevs; 2929 struct spdk_bdev *base; 2930 size_t i, j, k; 2931 bool found; 2932 2933 /* Iterate over base bdevs to remove vbdev from them. */ 2934 for (i = 0; i < vbdev->internal.base_bdevs_cnt; i++) { 2935 found = false; 2936 base = vbdev->internal.base_bdevs[i]; 2937 2938 for (j = 0; j < base->vbdevs_cnt; j++) { 2939 if (base->vbdevs[j] != vbdev) { 2940 continue; 2941 } 2942 2943 for (k = j; k + 1 < base->vbdevs_cnt; k++) { 2944 base->vbdevs[k] = base->vbdevs[k + 1]; 2945 } 2946 2947 base->vbdevs_cnt--; 2948 if (base->vbdevs_cnt > 0) { 2949 bdevs = realloc(base->vbdevs, base->vbdevs_cnt * sizeof(bdevs[0])); 2950 /* It would be odd if shrinking memory block fail. */ 2951 assert(bdevs); 2952 base->vbdevs = bdevs; 2953 } else { 2954 free(base->vbdevs); 2955 base->vbdevs = NULL; 2956 } 2957 2958 found = true; 2959 break; 2960 } 2961 2962 if (!found) { 2963 SPDK_WARNLOG("Bdev '%s' is not base bdev of '%s'.\n", base->name, vbdev->name); 2964 } 2965 } 2966 2967 free(vbdev->internal.base_bdevs); 2968 vbdev->internal.base_bdevs = NULL; 2969 vbdev->internal.base_bdevs_cnt = 0; 2970 } 2971 2972 static int 2973 spdk_vbdev_set_base_bdevs(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, size_t cnt) 2974 { 2975 struct spdk_bdev **vbdevs; 2976 struct spdk_bdev *base; 2977 size_t i; 2978 2979 /* Adding base bdevs isn't supported (yet?). */ 2980 assert(vbdev->internal.base_bdevs_cnt == 0); 2981 2982 vbdev->internal.base_bdevs = malloc(cnt * sizeof(vbdev->internal.base_bdevs[0])); 2983 if (!vbdev->internal.base_bdevs) { 2984 SPDK_ERRLOG("%s - realloc() failed\n", vbdev->name); 2985 return -ENOMEM; 2986 } 2987 2988 memcpy(vbdev->internal.base_bdevs, base_bdevs, cnt * sizeof(vbdev->internal.base_bdevs[0])); 2989 vbdev->internal.base_bdevs_cnt = cnt; 2990 2991 /* Iterate over base bdevs to add this vbdev to them. 
*/ 2992 for (i = 0; i < cnt; i++) { 2993 base = vbdev->internal.base_bdevs[i]; 2994 2995 assert(base != NULL); 2996 assert(base->internal.claim_module != NULL); 2997 2998 vbdevs = realloc(base->vbdevs, (base->vbdevs_cnt + 1) * sizeof(vbdevs[0])); 2999 if (!vbdevs) { 3000 SPDK_ERRLOG("%s - realloc() failed\n", base->name); 3001 spdk_vbdev_remove_base_bdevs(vbdev); 3002 return -ENOMEM; 3003 } 3004 3005 vbdevs[base->vbdevs_cnt] = vbdev; 3006 base->vbdevs = vbdevs; 3007 base->vbdevs_cnt++; 3008 } 3009 3010 return 0; 3011 } 3012 3013 int 3014 spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count) 3015 { 3016 int rc; 3017 3018 rc = spdk_bdev_init(vbdev); 3019 if (rc) { 3020 return rc; 3021 } 3022 3023 if (base_bdev_count == 0) { 3024 spdk_bdev_start(vbdev); 3025 return 0; 3026 } 3027 3028 rc = spdk_vbdev_set_base_bdevs(vbdev, base_bdevs, base_bdev_count); 3029 if (rc) { 3030 spdk_bdev_fini(vbdev); 3031 return rc; 3032 } 3033 3034 spdk_bdev_start(vbdev); 3035 return 0; 3036 3037 } 3038 3039 void 3040 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 3041 { 3042 if (bdev->internal.unregister_cb != NULL) { 3043 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 3044 } 3045 } 3046 3047 static void 3048 _remove_notify(void *arg) 3049 { 3050 struct spdk_bdev_desc *desc = arg; 3051 3052 desc->remove_cb(desc->remove_ctx); 3053 } 3054 3055 void 3056 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 3057 { 3058 struct spdk_bdev_desc *desc, *tmp; 3059 bool do_destruct = true; 3060 struct spdk_thread *thread; 3061 3062 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name); 3063 3064 thread = spdk_get_thread(); 3065 if (!thread) { 3066 /* The user called this from a non-SPDK thread. */ 3067 if (cb_fn != NULL) { 3068 cb_fn(cb_arg, -ENOTSUP); 3069 } 3070 return; 3071 } 3072 3073 pthread_mutex_lock(&bdev->internal.mutex); 3074 3075 spdk_vbdev_remove_base_bdevs(bdev); 3076 3077 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 3078 bdev->internal.unregister_cb = cb_fn; 3079 bdev->internal.unregister_ctx = cb_arg; 3080 3081 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 3082 if (desc->remove_cb) { 3083 do_destruct = false; 3084 /* 3085 * Defer invocation of the remove_cb to a separate message that will 3086 * run later on this thread. This ensures this context unwinds and 3087 * we don't recursively unregister this bdev again if the remove_cb 3088 * immediately closes its descriptor. 3089 */ 3090 if (!desc->remove_scheduled) { 3091 /* Avoid scheduling removal of the same descriptor multiple times. 
*/ 3092 desc->remove_scheduled = true; 3093 spdk_thread_send_msg(thread, _remove_notify, desc); 3094 } 3095 } 3096 } 3097 3098 if (!do_destruct) { 3099 pthread_mutex_unlock(&bdev->internal.mutex); 3100 return; 3101 } 3102 3103 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 3104 pthread_mutex_unlock(&bdev->internal.mutex); 3105 3106 spdk_bdev_fini(bdev); 3107 } 3108 3109 int 3110 spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb, 3111 void *remove_ctx, struct spdk_bdev_desc **_desc) 3112 { 3113 struct spdk_bdev_desc *desc; 3114 3115 desc = calloc(1, sizeof(*desc)); 3116 if (desc == NULL) { 3117 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 3118 return -ENOMEM; 3119 } 3120 3121 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 3122 spdk_get_thread()); 3123 3124 pthread_mutex_lock(&bdev->internal.mutex); 3125 3126 if (write && bdev->internal.claim_module) { 3127 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 3128 bdev->name, bdev->internal.claim_module->name); 3129 free(desc); 3130 pthread_mutex_unlock(&bdev->internal.mutex); 3131 return -EPERM; 3132 } 3133 3134 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 3135 3136 desc->bdev = bdev; 3137 desc->remove_cb = remove_cb; 3138 desc->remove_ctx = remove_ctx; 3139 desc->write = write; 3140 *_desc = desc; 3141 3142 pthread_mutex_unlock(&bdev->internal.mutex); 3143 3144 return 0; 3145 } 3146 3147 void 3148 spdk_bdev_close(struct spdk_bdev_desc *desc) 3149 { 3150 struct spdk_bdev *bdev = desc->bdev; 3151 bool do_unregister = false; 3152 3153 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 3154 spdk_get_thread()); 3155 3156 pthread_mutex_lock(&bdev->internal.mutex); 3157 3158 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 3159 free(desc); 3160 3161 /* If no more descriptors, kill QoS channel */ 3162 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 3163 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 3164 bdev->name, spdk_get_thread()); 3165 3166 if (spdk_bdev_qos_destroy(bdev)) { 3167 /* There isn't anything we can do to recover here. Just let the 3168 * old QoS poller keep running. The QoS handling won't change 3169 * cores when the user allocates a new channel, but it won't break. */ 3170 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 3171 } 3172 } 3173 3174 spdk_bdev_set_qd_sampling_period(bdev, 0); 3175 3176 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 3177 do_unregister = true; 3178 } 3179 pthread_mutex_unlock(&bdev->internal.mutex); 3180 3181 if (do_unregister == true) { 3182 spdk_bdev_unregister(bdev, bdev->internal.unregister_cb, bdev->internal.unregister_ctx); 3183 } 3184 } 3185 3186 int 3187 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 3188 struct spdk_bdev_module *module) 3189 { 3190 if (bdev->internal.claim_module != NULL) { 3191 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 3192 bdev->internal.claim_module->name); 3193 return -EPERM; 3194 } 3195 3196 if (desc && !desc->write) { 3197 desc->write = true; 3198 } 3199 3200 bdev->internal.claim_module = module; 3201 return 0; 3202 } 3203 3204 void 3205 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 3206 { 3207 assert(bdev->internal.claim_module != NULL); 3208 bdev->internal.claim_module = NULL; 3209 } 3210 3211 struct spdk_bdev * 3212 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 3213 { 3214 return desc->bdev; 3215 } 3216 3217 void 3218 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 3219 { 3220 struct iovec *iovs; 3221 int iovcnt; 3222 3223 if (bdev_io == NULL) { 3224 return; 3225 } 3226 3227 switch (bdev_io->type) { 3228 case SPDK_BDEV_IO_TYPE_READ: 3229 iovs = bdev_io->u.bdev.iovs; 3230 iovcnt = bdev_io->u.bdev.iovcnt; 3231 break; 3232 case SPDK_BDEV_IO_TYPE_WRITE: 3233 iovs = bdev_io->u.bdev.iovs; 3234 iovcnt = bdev_io->u.bdev.iovcnt; 3235 break; 3236 default: 3237 iovs = NULL; 3238 iovcnt = 0; 3239 break; 3240 } 3241 3242 if (iovp) { 3243 *iovp = iovs; 3244 } 3245 if (iovcntp) { 3246 *iovcntp = iovcnt; 3247 } 3248 } 3249 3250 void 3251 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 3252 { 3253 3254 if (spdk_bdev_module_list_find(bdev_module->name)) { 3255 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 3256 assert(false); 3257 } 3258 3259 if (bdev_module->async_init) { 3260 bdev_module->internal.action_in_progress = 1; 3261 } 3262 3263 /* 3264 * Modules with examine callbacks must be initialized first, so they are 3265 * ready to handle examine callbacks from later modules that will 3266 * register physical bdevs. 3267 */ 3268 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 3269 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 3270 } else { 3271 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 3272 } 3273 } 3274 3275 struct spdk_bdev_module * 3276 spdk_bdev_module_list_find(const char *name) 3277 { 3278 struct spdk_bdev_module *bdev_module; 3279 3280 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 3281 if (strcmp(name, bdev_module->name) == 0) { 3282 break; 3283 } 3284 } 3285 3286 return bdev_module; 3287 } 3288 3289 static void 3290 spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3291 { 3292 uint64_t len; 3293 3294 if (!success) { 3295 bdev_io->internal.cb = bdev_io->u.bdev.stored_user_cb; 3296 _spdk_bdev_io_complete(bdev_io); 3297 return; 3298 } 3299 3300 /* no need to perform the error checking from write_zeroes_blocks because this request already passed those checks. 
*/ 3301 len = spdk_min(spdk_bdev_get_block_size(bdev_io->bdev) * bdev_io->u.bdev.split_remaining_num_blocks, 3302 ZERO_BUFFER_SIZE); 3303 3304 bdev_io->u.bdev.offset_blocks = bdev_io->u.bdev.split_current_offset_blocks; 3305 bdev_io->u.bdev.iovs[0].iov_len = len; 3306 bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev_io->bdev); 3307 bdev_io->u.bdev.split_remaining_num_blocks -= bdev_io->u.bdev.num_blocks; 3308 bdev_io->u.bdev.split_current_offset_blocks += bdev_io->u.bdev.num_blocks; 3309 3310 /* if this round completes the i/o, change the callback to be the original user callback */ 3311 if (bdev_io->u.bdev.split_remaining_num_blocks == 0) { 3312 spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, bdev_io->u.bdev.stored_user_cb); 3313 } else { 3314 spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, spdk_bdev_write_zeroes_split); 3315 } 3316 spdk_bdev_io_submit(bdev_io); 3317 } 3318 3319 struct set_qos_limit_ctx { 3320 void (*cb_fn)(void *cb_arg, int status); 3321 void *cb_arg; 3322 struct spdk_bdev *bdev; 3323 }; 3324 3325 static void 3326 _spdk_bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 3327 { 3328 pthread_mutex_lock(&ctx->bdev->internal.mutex); 3329 ctx->bdev->internal.qos_mod_in_progress = false; 3330 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 3331 3332 ctx->cb_fn(ctx->cb_arg, status); 3333 free(ctx); 3334 } 3335 3336 static void 3337 _spdk_bdev_disable_qos_done(void *cb_arg) 3338 { 3339 struct set_qos_limit_ctx *ctx = cb_arg; 3340 struct spdk_bdev *bdev = ctx->bdev; 3341 struct spdk_bdev_io *bdev_io; 3342 struct spdk_bdev_qos *qos; 3343 3344 pthread_mutex_lock(&bdev->internal.mutex); 3345 qos = bdev->internal.qos; 3346 bdev->internal.qos = NULL; 3347 pthread_mutex_unlock(&bdev->internal.mutex); 3348 3349 while (!TAILQ_EMPTY(&qos->queued)) { 3350 /* Send queued I/O back to their original thread for resubmission. */ 3351 bdev_io = TAILQ_FIRST(&qos->queued); 3352 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 3353 3354 if (bdev_io->internal.io_submit_ch) { 3355 /* 3356 * Channel was changed when sending it to the QoS thread - change it back 3357 * before sending it back to the original thread. 
3358 */ 3359 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 3360 bdev_io->internal.io_submit_ch = NULL; 3361 } 3362 3363 spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->internal.ch->channel), 3364 _spdk_bdev_io_submit, bdev_io); 3365 } 3366 3367 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3368 spdk_poller_unregister(&qos->poller); 3369 3370 free(qos); 3371 3372 _spdk_bdev_set_qos_limit_done(ctx, 0); 3373 } 3374 3375 static void 3376 _spdk_bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) 3377 { 3378 void *io_device = spdk_io_channel_iter_get_io_device(i); 3379 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3380 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3381 struct spdk_thread *thread; 3382 3383 pthread_mutex_lock(&bdev->internal.mutex); 3384 thread = bdev->internal.qos->thread; 3385 pthread_mutex_unlock(&bdev->internal.mutex); 3386 3387 spdk_thread_send_msg(thread, _spdk_bdev_disable_qos_done, ctx); 3388 } 3389 3390 static void 3391 _spdk_bdev_disable_qos_msg(struct spdk_io_channel_iter *i) 3392 { 3393 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 3394 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 3395 3396 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 3397 3398 spdk_for_each_channel_continue(i, 0); 3399 } 3400 3401 static void 3402 _spdk_bdev_update_qos_limit_iops_msg(void *cb_arg) 3403 { 3404 struct set_qos_limit_ctx *ctx = cb_arg; 3405 struct spdk_bdev *bdev = ctx->bdev; 3406 3407 pthread_mutex_lock(&bdev->internal.mutex); 3408 spdk_bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 3409 pthread_mutex_unlock(&bdev->internal.mutex); 3410 3411 _spdk_bdev_set_qos_limit_done(ctx, 0); 3412 } 3413 3414 static void 3415 _spdk_bdev_enable_qos_msg(struct spdk_io_channel_iter *i) 3416 { 3417 void *io_device = spdk_io_channel_iter_get_io_device(i); 3418 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3419 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 3420 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 3421 3422 pthread_mutex_lock(&bdev->internal.mutex); 3423 _spdk_bdev_enable_qos(bdev, bdev_ch); 3424 pthread_mutex_unlock(&bdev->internal.mutex); 3425 spdk_for_each_channel_continue(i, 0); 3426 } 3427 3428 static void 3429 _spdk_bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) 3430 { 3431 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3432 3433 _spdk_bdev_set_qos_limit_done(ctx, status); 3434 } 3435 3436 void 3437 spdk_bdev_set_qos_limit_iops(struct spdk_bdev *bdev, uint64_t ios_per_sec, 3438 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 3439 { 3440 struct set_qos_limit_ctx *ctx; 3441 3442 if (ios_per_sec > 0 && ios_per_sec % SPDK_BDEV_QOS_MIN_IOS_PER_SEC) { 3443 SPDK_ERRLOG("Requested ios_per_sec limit %" PRIu64 " is not a multiple of %u\n", 3444 ios_per_sec, SPDK_BDEV_QOS_MIN_IOS_PER_SEC); 3445 cb_fn(cb_arg, -EINVAL); 3446 return; 3447 } 3448 3449 ctx = calloc(1, sizeof(*ctx)); 3450 if (ctx == NULL) { 3451 cb_fn(cb_arg, -ENOMEM); 3452 return; 3453 } 3454 3455 ctx->cb_fn = cb_fn; 3456 ctx->cb_arg = cb_arg; 3457 ctx->bdev = bdev; 3458 3459 pthread_mutex_lock(&bdev->internal.mutex); 3460 if (bdev->internal.qos_mod_in_progress) { 3461 pthread_mutex_unlock(&bdev->internal.mutex); 3462 free(ctx); 3463 cb_fn(cb_arg, -EAGAIN); 3464 return; 3465 } 3466 bdev->internal.qos_mod_in_progress = true; 3467 3468 if (ios_per_sec > 0) { 3469 if (bdev->internal.qos == NULL) { 3470 /* Enabling */ 3471 
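			/* First time QoS is enabled on this bdev: allocate the QoS object
			 * here and record the limit; the QoS thread and channel are then
			 * set up as _spdk_bdev_enable_qos_msg() visits each existing
			 * channel. */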
			bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
			if (!bdev->internal.qos) {
				pthread_mutex_unlock(&bdev->internal.mutex);
				SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
				free(ctx);
				cb_fn(cb_arg, -ENOMEM);
				return;
			}

			bdev->internal.qos->iops_rate_limit = ios_per_sec;
			spdk_for_each_channel(__bdev_to_io_dev(bdev),
					      _spdk_bdev_enable_qos_msg, ctx,
					      _spdk_bdev_enable_qos_done);
		} else {
			/* Updating */
			bdev->internal.qos->iops_rate_limit = ios_per_sec;
			spdk_thread_send_msg(bdev->internal.qos->thread, _spdk_bdev_update_qos_limit_iops_msg, ctx);
		}
	} else {
		if (bdev->internal.qos != NULL) {
			/* Disabling */
			spdk_for_each_channel(__bdev_to_io_dev(bdev),
					      _spdk_bdev_disable_qos_msg, ctx,
					      _spdk_bdev_disable_qos_msg_done);
		} else {
			pthread_mutex_unlock(&bdev->internal.mutex);
			_spdk_bdev_set_qos_limit_done(ctx, 0);
			return;
		}
	}

	pthread_mutex_unlock(&bdev->internal.mutex);
}

SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV)
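
/*
 * Illustrative sketch only -- not part of the library. It shows how a caller
 * might drive spdk_bdev_set_qos_limit_iops() as defined above. The bdev name
 * "Malloc0" and the function names in this sketch are hypothetical.
 *
 *	static void
 *	qos_set_done(void *cb_arg, int status)
 *	{
 *		if (status != 0) {
 *			SPDK_ERRLOG("Setting the QoS limit failed: %d\n", status);
 *		}
 *	}
 *
 *	static void
 *	enable_qos_example(void)
 *	{
 *		struct spdk_bdev *bdev = spdk_bdev_get_by_name("Malloc0");
 *
 *		if (bdev != NULL) {
 *			// 0 disables the limit; otherwise the value must be a
 *			// multiple of SPDK_BDEV_QOS_MIN_IOS_PER_SEC (10000).
 *			spdk_bdev_set_qos_limit_iops(bdev, 20000, qos_set_done, NULL);
 *		}
 *	}
 */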