/*-
 *   BSD LICENSE
 *
 *   Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/env.h"
#include "spdk/event.h"
#include "spdk/io_channel.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/util.h"

#include "spdk_internal/bdev.h"
#include "spdk_internal/log.h"
#include "spdk/string.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define BUF_SMALL_POOL_SIZE			8192
#define BUF_LARGE_POOL_SIZE			1024
#define NOMEM_THRESHOLD_COUNT			8
#define ZERO_BUFFER_SIZE			0x100000
#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_SEC_TO_USEC			1000000ULL
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1

typedef TAILQ_HEAD(, spdk_bdev_io) bdev_io_tailq_t;
typedef STAILQ_HEAD(, spdk_bdev_io) bdev_io_stailq_t;

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	struct spdk_mempool *buf_small_pool;
	struct spdk_mempool *buf_large_pool;

	void *zero_buffer;

	TAILQ_HEAD(, spdk_bdev_module) bdev_modules;

	TAILQ_HEAD(, spdk_bdev) bdevs;

	bool init_complete;
	bool module_init_complete;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.init_complete = false,
	.module_init_complete = false,
};

static spdk_bdev_init_cb g_init_cb_fn = NULL;
static void *g_init_cb_arg = NULL;

static spdk_bdev_fini_cb g_fini_cb_fn = NULL;
static void *g_fini_cb_arg = NULL;
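
/*
 * Thread on which spdk_bdev_finish() was called.  Module finish work is
 * funneled back to this thread via spdk_bdev_module_finish_done().
 */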
static struct spdk_thread *g_fini_thread = NULL;


struct spdk_bdev_mgmt_channel {
	bdev_io_stailq_t need_buf_small;
	bdev_io_stailq_t need_buf_large;

	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;

	TAILQ_HEAD(, spdk_bdev_module_channel) module_channels;
};

struct spdk_bdev_desc {
	struct spdk_bdev *bdev;
	spdk_bdev_remove_cb_t remove_cb;
	void *remove_ctx;
	bool write;
	TAILQ_ENTRY(spdk_bdev_desc) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Channel for the bdev manager */
	struct spdk_io_channel *mgmt_channel;

	struct spdk_bdev_io_stat stat;

	bdev_io_tailq_t queued_resets;

	uint32_t flags;

	/*
	 * Rate limiting on this channel.
	 * Queue of I/O awaiting issue because QoS rate limiting kicked in
	 * on this channel.
	 */
	bdev_io_tailq_t qos_io;

	/*
	 * Rate limiting on this channel.
	 * Maximum number of I/O allowed to be issued in one timeslice (e.g., 1ms).
	 * Only valid for the master channel which manages the outstanding I/O.
	 */
	uint64_t qos_max_ios_per_timeslice;

	/*
	 * Rate limiting on this channel.
	 * Number of I/O submitted in the current timeslice (e.g., 1ms).
	 */
	uint64_t io_submitted_this_timeslice;

	/*
	 * Rate limiting on this channel.
	 * QoS poller that runs periodically, once per millisecond timeslice.
	 */
	struct spdk_poller *qos_poller;

	/* Per-device channel */
	struct spdk_bdev_module_channel *module_ch;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t start_tsc;
	uint64_t interval_tsc;
	__itt_string_handle *handle;
#endif

};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))

/*
 * Per-module (or per-io_device) channel. Multiple bdevs built on the same io_device
 * will queue their I/O awaiting retry here. This makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_module_channel {
	/*
	 * Count of I/O submitted to the bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of I/O awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
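	 * A value of 0 (the initial value) means retries wait until all I/O
	 * outstanding on this module channel have completed.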
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *module_ch;

	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_module_channel) link;
};

static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, link);
	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->claim_module == NULL) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, link));

	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_alias *tmp;
	struct spdk_bdev *bdev = spdk_bdev_first();

	while (bdev != NULL) {
		if (strcmp(bdev_name, bdev->name) == 0) {
			return bdev;
		}

		TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
			if (strcmp(bdev_name, tmp->alias) == 0) {
				return bdev;
			}
		}

		bdev = spdk_bdev_next(bdev);
	}

	return NULL;
}

static void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	assert(bdev_io->get_buf_cb != NULL);
	assert(buf != NULL);
	assert(bdev_io->u.bdev.iovs != NULL);

	bdev_io->buf = buf;
	bdev_io->u.bdev.iovs[0].iov_base = (void *)((unsigned long)((char *)buf + 512) & ~511UL);
	bdev_io->u.bdev.iovs[0].iov_len = bdev_io->buf_len;
	bdev_io->get_buf_cb(bdev_io->ch->channel, bdev_io);
}

static void
spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_mempool *pool;
	struct spdk_bdev_io *tmp;
	void *buf;
	bdev_io_stailq_t *stailq;
	struct spdk_bdev_mgmt_channel *ch;

	assert(bdev_io->u.bdev.iovcnt == 1);

	buf = bdev_io->buf;
	ch = bdev_io->mgmt_ch;

	if (bdev_io->buf_len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &ch->need_buf_large;
	}

	if (STAILQ_EMPTY(stailq)) {
		spdk_mempool_put(pool, buf);
	} else {
		tmp = STAILQ_FIRST(stailq);
		STAILQ_REMOVE_HEAD(stailq, buf_link);
		spdk_bdev_io_set_buf(tmp, buf);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_mempool *pool;
	bdev_io_stailq_t *stailq;
	void *buf = NULL;
	struct spdk_bdev_mgmt_channel *ch;

	assert(cb != NULL);
	assert(bdev_io->u.bdev.iovs != NULL);

	if (spdk_unlikely(bdev_io->u.bdev.iovs[0].iov_base != NULL)) {
		/* Buffer already present */
		cb(bdev_io->ch->channel, bdev_io);
		return;
	}

	assert(len <= SPDK_BDEV_LARGE_BUF_MAX_SIZE);
	ch = spdk_io_channel_get_ctx(bdev_io->ch->mgmt_channel);

	bdev_io->buf_len = len;
	bdev_io->get_buf_cb = cb;
	if (len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &ch->need_buf_large;
	}

	buf = spdk_mempool_get(pool);

	if (!buf) {
		STAILQ_INSERT_TAIL(stailq, bdev_io, buf_link);
	} else {
		spdk_bdev_io_set_buf(bdev_io, buf);
	}
}

static int
spdk_bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

void
spdk_bdev_config_text(FILE *fp)
{
	struct spdk_bdev_module *bdev_module;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->config_text) {
			bdev_module->config_text(fp);
		}
	}
}

static int
spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;

	STAILQ_INIT(&ch->need_buf_small);
	STAILQ_INIT(&ch->need_buf_large);

	STAILQ_INIT(&ch->per_thread_cache);
	ch->per_thread_cache_count = 0;

	TAILQ_INIT(&ch->module_channels);

	return 0;
}

static void
spdk_bdev_mgmt_channel_free_resources(struct spdk_bdev_mgmt_channel *ch)
{
	struct spdk_bdev_io *bdev_io;

	if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
		SPDK_ERRLOG("Pending I/O list wasn't empty on channel free\n");
	}

	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, buf_link);
		ch->per_thread_cache_count--;
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}

	assert(ch->per_thread_cache_count == 0);
}

static void
spdk_bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;

	spdk_bdev_mgmt_channel_free_resources(ch);
}

static void
spdk_bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	cb_fn(cb_arg, rc);
}

static void
spdk_bdev_module_action_complete(void)
{
	struct spdk_bdev_module *m;

	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
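	 * Modules report completion through spdk_bdev_module_init_done() or
	 * spdk_bdev_module_examine_done(), both of which decrement the module's
	 * action_in_progress counter.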
	 */
	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, tailq) {
		if (m->action_in_progress > 0) {
			return;
		}
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
	 */
	spdk_bdev_init_complete(0);
}

static void
spdk_bdev_module_action_done(struct spdk_bdev_module *module)
{
	assert(module->action_in_progress > 0);
	module->action_in_progress--;
	spdk_bdev_module_action_complete();
}

void
spdk_bdev_module_init_done(struct spdk_bdev_module *module)
{
	spdk_bdev_module_action_done(module);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
{
	spdk_bdev_module_action_done(module);
}

static int
spdk_bdev_modules_init(void)
{
	struct spdk_bdev_module *module;
	int rc = 0;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) {
		rc = module->module_init();
		if (rc != 0) {
			break;
		}
	}

	g_bdev_mgr.module_init_complete = true;
	return rc;
}

void
spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
{
	int cache_size;
	int rc = 0;
	char mempool_name[32];

	assert(cb_fn != NULL);

	g_init_cb_fn = cb_fn;
	g_init_cb_arg = cb_arg;

	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());

	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
				  SPDK_BDEV_IO_POOL_SIZE,
				  sizeof(struct spdk_bdev_io) +
				  spdk_bdev_module_get_max_ctx_size(),
				  0,
				  SPDK_ENV_SOCKET_ID_ANY);

	if (g_bdev_mgr.bdev_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	/**
	 * Ensure no more than half of the total buffers end up in local caches, by
	 * using spdk_env_get_core_count() to determine how many local caches we need
	 * to account for.
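	 * For example, with BUF_SMALL_POOL_SIZE of 8192 and 4 cores, each
	 * per-core cache is capped at 8192 / (2 * 4) = 1024 buffers.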
	 */
	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());

	g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
				    BUF_SMALL_POOL_SIZE,
				    SPDK_BDEV_SMALL_BUF_MAX_SIZE + 512,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_small_pool) {
		SPDK_ERRLOG("create rbuf small pool failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());

	g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
				    BUF_LARGE_POOL_SIZE,
				    SPDK_BDEV_LARGE_BUF_MAX_SIZE + 512,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_large_pool) {
		SPDK_ERRLOG("create rbuf large pool failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
				 NULL);
	if (!g_bdev_mgr.zero_buffer) {
		SPDK_ERRLOG("create bdev zero buffer failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

#ifdef SPDK_CONFIG_VTUNE
	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
#endif

	spdk_io_device_register(&g_bdev_mgr, spdk_bdev_mgmt_channel_create,
				spdk_bdev_mgmt_channel_destroy,
				sizeof(struct spdk_bdev_mgmt_channel));

	rc = spdk_bdev_modules_init();
	if (rc != 0) {
		SPDK_ERRLOG("bdev modules init failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	spdk_bdev_module_action_complete();
}

static void
spdk_bdev_module_finish_cb(void *io_device)
{
	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;

	cb_fn(g_fini_cb_arg);
	g_fini_cb_fn = NULL;
	g_fini_cb_arg = NULL;
}

static void
spdk_bdev_module_finish_complete(struct spdk_io_channel_iter *i, int status)
{
	if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != SPDK_BDEV_IO_POOL_SIZE) {
		SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
			    SPDK_BDEV_IO_POOL_SIZE);
	}

	if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) {
		SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
			    BUF_SMALL_POOL_SIZE);
		assert(false);
	}

	if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) {
		SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
			    BUF_LARGE_POOL_SIZE);
		assert(false);
	}

	spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
	spdk_mempool_free(g_bdev_mgr.buf_small_pool);
	spdk_mempool_free(g_bdev_mgr.buf_large_pool);
	spdk_dma_free(g_bdev_mgr.zero_buffer);

	spdk_io_device_unregister(&g_bdev_mgr, spdk_bdev_module_finish_cb);
}

static void
mgmt_channel_free_resources(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_mgmt_channel *ch = spdk_io_channel_get_ctx(_ch);

	spdk_bdev_mgmt_channel_free_resources(ch);
	spdk_for_each_channel_continue(i, 0);
}

static void
spdk_bdev_module_finish_iter(void *arg)
{
	/* Notice that this variable is static. It is saved between calls to
	 * this function.
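	 * It records the module whose asynchronous fini we are currently
	 * waiting on.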
	 */
	static struct spdk_bdev_module *resume_bdev_module = NULL;
	struct spdk_bdev_module *bdev_module;

	/* Start iterating from the last touched module */
	if (!resume_bdev_module) {
		bdev_module = TAILQ_FIRST(&g_bdev_mgr.bdev_modules);
	} else {
		bdev_module = TAILQ_NEXT(resume_bdev_module, tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini) {
			/* Save our place so we can resume later. We must
			 * save the variable here, before calling module_fini()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_finish_done() and re-enter
			 * this function to continue iterating. */
			resume_bdev_module = bdev_module;
		}

		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}

		if (bdev_module->async_fini) {
			return;
		}

		bdev_module = TAILQ_NEXT(bdev_module, tailq);
	}

	resume_bdev_module = NULL;
	spdk_for_each_channel(&g_bdev_mgr, mgmt_channel_free_resources, NULL,
			      spdk_bdev_module_finish_complete);
}

void
spdk_bdev_module_finish_done(void)
{
	if (spdk_get_thread() != g_fini_thread) {
		spdk_thread_send_msg(g_fini_thread, spdk_bdev_module_finish_iter, NULL);
	} else {
		spdk_bdev_module_finish_iter(NULL);
	}
}

static void
_spdk_bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
{
	struct spdk_bdev *bdev = cb_arg;

	if (bdeverrno && bdev) {
		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
			     bdev->name);

		/*
		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
		 * bdev; try to continue by manually removing this bdev from the list and continue
		 * with the next bdev in the list.
		 */
		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link);
	}

	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n");
		spdk_bdev_module_finish_iter(NULL);
		return;
	}

	/*
	 * Unregister the first bdev in the list.
	 *
	 * spdk_bdev_unregister() will handle the case where the bdev has open descriptors by
	 * calling the remove_cb of the descriptors first.
	 *
	 * Once this bdev and all of its open descriptors have been cleaned up, this function
	 * will be called again via the unregister completion callback to continue the cleanup
	 * process with the next bdev.
	 */
	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name);
	spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev);
}

static void
_spdk_bdev_finish_unregister_bdevs(void)
{
	_spdk_bdev_finish_unregister_bdevs_iter(NULL, 0);
}

void
spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
{
	assert(cb_fn != NULL);

	g_fini_thread = spdk_get_thread();

	g_fini_cb_fn = cb_fn;
	g_fini_cb_arg = cb_arg;

	_spdk_bdev_finish_unregister_bdevs();
}

static struct spdk_bdev_io *
spdk_bdev_get_io(struct spdk_io_channel *_ch)
{
	struct spdk_bdev_mgmt_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct spdk_bdev_io *bdev_io;

	if (ch->per_thread_cache_count > 0) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, buf_link);
		ch->per_thread_cache_count--;
	} else {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		if (!bdev_io) {
			SPDK_ERRLOG("Unable to get spdk_bdev_io\n");
			return NULL;
		}
	}

	bdev_io->mgmt_ch = ch;

	return bdev_io;
}

static void
spdk_bdev_put_io(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_mgmt_channel *ch = bdev_io->mgmt_ch;

	if (bdev_io->buf != NULL) {
		spdk_bdev_io_put_buf(bdev_io);
	}

	if (ch->per_thread_cache_count < SPDK_BDEV_IO_CACHE_SIZE) {
		ch->per_thread_cache_count++;
		STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, buf_link);
	} else {
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}
}

static void
_spdk_bdev_qos_io_submit(void *ctx)
{
	struct spdk_bdev_channel *ch = ctx;
	struct spdk_bdev_io *bdev_io = NULL;
	struct spdk_bdev *bdev = ch->bdev;
	struct spdk_bdev_module_channel *shared_ch = ch->module_ch;

	while (!TAILQ_EMPTY(&ch->qos_io)) {
		if (ch->io_submitted_this_timeslice < ch->qos_max_ios_per_timeslice) {
			bdev_io = TAILQ_FIRST(&ch->qos_io);
			TAILQ_REMOVE(&ch->qos_io, bdev_io, link);
			ch->io_submitted_this_timeslice++;
			shared_ch->io_outstanding++;
			bdev->fn_table->submit_request(ch->channel, bdev_io);
		} else {
			break;
		}
	}
}

static void
_spdk_bdev_io_submit(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
	struct spdk_io_channel *ch = bdev_ch->channel;
	struct spdk_bdev_module_channel *shared_ch = bdev_ch->module_ch;

	bdev_io->submit_tsc = spdk_get_ticks();
	shared_ch->io_outstanding++;
	bdev_io->in_submit_request = true;
	if (spdk_likely(bdev_ch->flags == 0)) {
		if (spdk_likely(TAILQ_EMPTY(&shared_ch->nomem_io))) {
			bdev->fn_table->submit_request(ch, bdev_io);
		} else {
			shared_ch->io_outstanding--;
			TAILQ_INSERT_TAIL(&shared_ch->nomem_io, bdev_io, link);
		}
	} else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	} else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
		shared_ch->io_outstanding--;
		TAILQ_INSERT_TAIL(&bdev_ch->qos_io, bdev_io, link);
		_spdk_bdev_qos_io_submit(bdev_ch);
	} else {
		SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
	bdev_io->in_submit_request = false;
}

static void
spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);

	/* QoS channel and thread have been properly configured */
	if (bdev->ios_per_sec > 0 && bdev->qos_channel && bdev->qos_thread) {
		bdev_io->io_submit_ch = bdev_io->ch;
		bdev_io->ch = bdev->qos_channel;
		spdk_thread_send_msg(bdev->qos_thread, _spdk_bdev_io_submit, bdev_io);
	} else {
		_spdk_bdev_io_submit(bdev_io);
	}
}

static void
spdk_bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
	struct spdk_io_channel *ch = bdev_ch->channel;

	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);

	bdev_io->in_submit_request = true;
	bdev->fn_table->submit_request(ch, bdev_io);
	bdev_io->in_submit_request = false;
}

static void
spdk_bdev_io_init(struct spdk_bdev_io *bdev_io,
		  struct spdk_bdev *bdev, void *cb_arg,
		  spdk_bdev_io_completion_cb cb)
{
	bdev_io->bdev = bdev;
	bdev_io->caller_ctx = cb_arg;
	bdev_io->cb = cb;
	bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
	bdev_io->in_submit_request = false;
	bdev_io->buf = NULL;
	bdev_io->io_submit_ch = NULL;
}

bool
spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
{
	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
}

int
spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	if (bdev->fn_table->dump_info_json) {
		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
	}

	return 0;
}

static void
spdk_bdev_qos_get_max_ios_per_timeslice(struct spdk_bdev *bdev)
{
	uint64_t qos_max_ios_per_timeslice = 0;

	qos_max_ios_per_timeslice = bdev->ios_per_sec * SPDK_BDEV_QOS_TIMESLICE_IN_USEC /
				    SPDK_BDEV_SEC_TO_USEC;
	bdev->qos_channel->qos_max_ios_per_timeslice = spdk_max(qos_max_ios_per_timeslice,
			SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE);
}

static void
spdk_bdev_channel_poll_qos(void *arg)
{
	struct spdk_bdev_channel *ch = arg;
	struct spdk_bdev *bdev = ch->bdev;

	/* Reset for next round of rate limiting */
	ch->io_submitted_this_timeslice = 0;
	spdk_bdev_qos_get_max_ios_per_timeslice(bdev);

	_spdk_bdev_qos_io_submit(ch);
}

static void
spdk_bdev_qos_register_poller(void *ctx)
{
	struct spdk_bdev_channel *ch = ctx;

	ch->qos_poller = spdk_poller_register(spdk_bdev_channel_poll_qos, ch,
					      SPDK_BDEV_QOS_TIMESLICE_IN_USEC);
}

static void
spdk_bdev_qos_unregister_poller(void *ctx)
{
	struct spdk_poller *poller = ctx;

	spdk_poller_unregister(&poller);
}

static int
_spdk_bdev_channel_create(struct spdk_bdev_channel *ch, void *io_device)
{
	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	struct spdk_bdev_module_channel *shared_ch;

	ch->bdev = bdev;
	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
	if (!ch->channel) {
		return -1;
	}

	ch->mgmt_channel = spdk_get_io_channel(&g_bdev_mgr);
	if (!ch->mgmt_channel) {
		return -1;
	}

	mgmt_ch = spdk_io_channel_get_ctx(ch->mgmt_channel);
	TAILQ_FOREACH(shared_ch, &mgmt_ch->module_channels, link) {
		if (shared_ch->module_ch == ch->channel) {
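			/* An existing per-module channel already wraps this
			 * I/O channel; just take another reference on it.
			 */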
			shared_ch->ref++;
			break;
		}
	}

	if (shared_ch == NULL) {
		shared_ch = calloc(1, sizeof(*shared_ch));
		if (!shared_ch) {
			return -1;
		}

		shared_ch->io_outstanding = 0;
		TAILQ_INIT(&shared_ch->nomem_io);
		shared_ch->nomem_threshold = 0;
		shared_ch->module_ch = ch->channel;
		shared_ch->ref = 1;
		TAILQ_INSERT_TAIL(&mgmt_ch->module_channels, shared_ch, link);
	}

	memset(&ch->stat, 0, sizeof(ch->stat));
	TAILQ_INIT(&ch->queued_resets);
	TAILQ_INIT(&ch->qos_io);
	ch->qos_max_ios_per_timeslice = 0;
	ch->io_submitted_this_timeslice = 0;
	ch->qos_poller = NULL;
	ch->flags = 0;
	ch->module_ch = shared_ch;

	return 0;
}

static void
_spdk_bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_mgmt_channel *mgmt_channel;
	struct spdk_bdev_module_channel *shared_ch = NULL;

	if (!ch) {
		return;
	}

	if (ch->channel) {
		spdk_put_io_channel(ch->channel);
	}

	if (ch->mgmt_channel) {
		shared_ch = ch->module_ch;
		if (shared_ch) {
			assert(shared_ch->ref > 0);
			shared_ch->ref--;
			if (shared_ch->ref == 0) {
				mgmt_channel = spdk_io_channel_get_ctx(ch->mgmt_channel);
				assert(shared_ch->io_outstanding == 0);
				TAILQ_REMOVE(&mgmt_channel->module_channels, shared_ch, link);
				free(shared_ch);
			}
		}
		spdk_put_io_channel(ch->mgmt_channel);
	}
}

static int
_spdk_bdev_qos_channel_create(struct spdk_bdev *bdev)
{
	bdev->qos_channel = calloc(1, sizeof(struct spdk_bdev_channel));
	if (!bdev->qos_channel) {
		return -1;
	}

	bdev->qos_thread = spdk_get_thread();
	if (!bdev->qos_thread) {
		return -1;
	}

	if (_spdk_bdev_channel_create(bdev->qos_channel, __bdev_to_io_dev(bdev)) != 0) {
		return -1;
	}

	bdev->qos_channel->flags |= BDEV_CH_QOS_ENABLED;
	spdk_bdev_qos_get_max_ios_per_timeslice(bdev);
	spdk_bdev_qos_register_poller(bdev->qos_channel);

	return 0;
}

static void
_spdk_bdev_qos_channel_destroy(void *ctx)
{
	struct spdk_bdev_channel *qos_channel = ctx;
	struct spdk_bdev *bdev = NULL;
	struct spdk_poller *poller = NULL;

	if (!qos_channel) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "QoS channel already NULL\n");
		return;
	}

	bdev = qos_channel->bdev;
	poller = qos_channel->qos_poller;

	assert(bdev->qos_thread == spdk_get_thread());
	assert(bdev->qos_channel == qos_channel);

	free(bdev->qos_channel);
	bdev->qos_channel = NULL;
	bdev->qos_thread = NULL;

	if (!poller) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "QoS poller already NULL\n");
	} else {
		spdk_bdev_qos_unregister_poller(poller);
	}
}

static void
spdk_bdev_qos_channel_create_async(void *ctx)
{
	struct spdk_bdev *bdev = ctx;

	if (!bdev->qos_channel) {
		if (_spdk_bdev_qos_channel_create(bdev) != 0) {
			SPDK_ERRLOG("Failed to create QoS channel\n");
			_spdk_bdev_channel_destroy_resource(bdev->qos_channel);
			_spdk_bdev_qos_channel_destroy(bdev->qos_channel);
		}
	}
}

static int
spdk_bdev_qos_channel_create(void *ctx)
{
	struct spdk_bdev *bdev = ctx;
	struct spdk_thread *qos_thread = bdev->qos_thread;

	/*
	 * There is an asynchronous destroy in progress.
	 * Send a message to that thread to defer the creation.
	 */
	if (bdev->qos_channel_destroying == true) {
		if (qos_thread) {
			spdk_thread_send_msg(qos_thread,
					     spdk_bdev_qos_channel_create_async, bdev);
			return 0;
		}
	}

	if (!bdev->qos_channel) {
		return _spdk_bdev_qos_channel_create(bdev);
	} else {
		return 0;
	}
}

static int
spdk_bdev_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
	struct spdk_bdev_channel *ch = ctx_buf;

	if (_spdk_bdev_channel_create(ch, io_device) != 0) {
		_spdk_bdev_channel_destroy_resource(ch);
		return -1;
	}

	/* Rate limiting on this bdev enabled */
	if (bdev->ios_per_sec > 0) {
		if (spdk_bdev_qos_channel_create(bdev) != 0) {
			_spdk_bdev_channel_destroy_resource(ch);
			_spdk_bdev_channel_destroy_resource(bdev->qos_channel);
			_spdk_bdev_qos_channel_destroy(bdev->qos_channel);
			return -1;
		}
	}

	pthread_mutex_lock(&bdev->mutex);
	bdev->channel_count++;
	pthread_mutex_unlock(&bdev->mutex);

#ifdef SPDK_CONFIG_VTUNE
	{
		char *name;
		__itt_init_ittlib(NULL, 0);
		name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
		if (!name) {
			_spdk_bdev_channel_destroy_resource(ch);
			_spdk_bdev_channel_destroy_resource(bdev->qos_channel);
			_spdk_bdev_qos_channel_destroy(bdev->qos_channel);
			return -1;
		}
		ch->handle = __itt_string_handle_create(name);
		free(name);
		ch->start_tsc = spdk_get_ticks();
		ch->interval_tsc = spdk_get_ticks_hz() / 100;
	}
#endif

	return 0;
}

/*
 * Abort I/O that are waiting on a data buffer. These types of I/O are
 * linked using the spdk_bdev_io buf_link TAILQ_ENTRY.
 */
static void
_spdk_bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch)
{
	bdev_io_stailq_t tmp;
	struct spdk_bdev_io *bdev_io;

	STAILQ_INIT(&tmp);

	while (!STAILQ_EMPTY(queue)) {
		bdev_io = STAILQ_FIRST(queue);
		STAILQ_REMOVE_HEAD(queue, buf_link);
		if (bdev_io->ch == ch) {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		} else {
			STAILQ_INSERT_TAIL(&tmp, bdev_io, buf_link);
		}
	}

	STAILQ_SWAP(&tmp, queue, spdk_bdev_io);
}

/*
 * Abort I/O that are queued waiting for submission. These types of I/O are
 * linked using the spdk_bdev_io link TAILQ_ENTRY.
 */
static void
_spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_io *bdev_io, *tmp;

	TAILQ_FOREACH_SAFE(bdev_io, queue, link, tmp) {
		if (bdev_io->ch == ch) {
			TAILQ_REMOVE(queue, bdev_io, link);
			/*
			 * spdk_bdev_io_complete() assumes that the completed I/O had
			 *  been submitted to the bdev module.  Since in this case it
			 *  hadn't, bump io_outstanding to account for the decrement
			 *  that spdk_bdev_io_complete() will do.
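			 *  Reset I/O are not counted in io_outstanding, so they
			 *  are excluded from the adjustment below.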
			 */
			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
				ch->module_ch->io_outstanding++;
			}
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}
}

static void
_spdk_bdev_channel_destroy(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_mgmt_channel *mgmt_channel;
	struct spdk_bdev_module_channel *shared_ch = ch->module_ch;

	mgmt_channel = spdk_io_channel_get_ctx(ch->mgmt_channel);

	_spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
	_spdk_bdev_abort_queued_io(&ch->qos_io, ch);
	_spdk_bdev_abort_queued_io(&shared_ch->nomem_io, ch);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, ch);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, ch);

	_spdk_bdev_channel_destroy_resource(ch);
}

static void
spdk_bdev_qos_channel_destroy(void *ctx)
{
	struct spdk_bdev *bdev = ctx;

	bdev->qos_channel_destroying = false;

	_spdk_bdev_channel_destroy(bdev->qos_channel);
	_spdk_bdev_qos_channel_destroy(bdev->qos_channel);
}

static void
spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_channel *ch = ctx_buf;
	struct spdk_bdev *bdev = ch->bdev;
	uint32_t channel_count = 0;

	_spdk_bdev_channel_destroy(ch);

	pthread_mutex_lock(&bdev->mutex);
	bdev->channel_count--;
	channel_count = bdev->channel_count;
	pthread_mutex_unlock(&bdev->mutex);

	/* Destroy QoS channel as no active bdev channels remain */
	if (channel_count == 0 && bdev->ios_per_sec > 0 && bdev->qos_thread) {
		if (bdev->qos_thread == spdk_get_thread()) {
			spdk_bdev_qos_channel_destroy(bdev);
		} else {
			bdev->qos_channel_destroying = true;
			spdk_thread_send_msg(bdev->qos_thread,
					     spdk_bdev_qos_channel_destroy, bdev);
		}
	}
}

int
spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
{
	struct spdk_bdev_alias *tmp;

	if (alias == NULL) {
		SPDK_ERRLOG("Empty alias passed\n");
		return -EINVAL;
	}

	if (spdk_bdev_get_by_name(alias)) {
		SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias);
		return -EEXIST;
	}

	tmp = calloc(1, sizeof(*tmp));
	if (tmp == NULL) {
		SPDK_ERRLOG("Unable to allocate alias\n");
		return -ENOMEM;
	}

	tmp->alias = strdup(alias);
	if (tmp->alias == NULL) {
		free(tmp);
		SPDK_ERRLOG("Unable to allocate alias\n");
		return -ENOMEM;
	}

	TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);

	return 0;
}

int
spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
{
	struct spdk_bdev_alias *tmp;

	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (strcmp(alias, tmp->alias) == 0) {
			TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
			free(tmp->alias);
			free(tmp);
			return 0;
		}
	}

	SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exist\n", alias);

	return -ENOENT;
}

struct spdk_io_channel *
spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
{
	return spdk_get_io_channel(__bdev_to_io_dev(desc->bdev));
}

const char *
spdk_bdev_get_name(const struct spdk_bdev *bdev)
{
	return bdev->name;
}

const char *
spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
{
	return bdev->product_name;
}

const struct spdk_bdev_aliases_list *
spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
{
	return &bdev->aliases;
}

uint32_t
spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
{
	return bdev->blocklen;
}

uint64_t
spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
{
	return bdev->blockcnt;
}

size_t
spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
{
	/* TODO: push this logic down to the bdev modules */
	if (bdev->need_aligned_buffer) {
		return bdev->blocklen;
	}

	return 1;
}

uint32_t
spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
{
	return bdev->optimal_io_boundary;
}

bool
spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
{
	return bdev->write_cache;
}

const struct spdk_uuid *
spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
{
	return &bdev->uuid;
}

int
spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
{
	int ret;

	pthread_mutex_lock(&bdev->mutex);

	/* bdev has open descriptors */
	if (!TAILQ_EMPTY(&bdev->open_descs) &&
	    bdev->blockcnt > size) {
		ret = -EBUSY;
	} else {
		bdev->blockcnt = size;
		ret = 0;
	}

	pthread_mutex_unlock(&bdev->mutex);

	return ret;
}

/*
 * Convert I/O offset and length from bytes to blocks.
 *
 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size.
 */
static uint64_t
spdk_bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks,
			  uint64_t num_bytes, uint64_t *num_blocks)
{
	uint32_t block_size = bdev->blocklen;

	*offset_blocks = offset_bytes / block_size;
	*num_blocks = num_bytes / block_size;

	return (offset_bytes % block_size) | (num_bytes % block_size);
}

static bool
spdk_bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
{
	/* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there
	 * has been an overflow and hence the offset has been wrapped around */
	if (offset_blocks + num_blocks < offset_blocks) {
		return false;
	}

	/* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
	if (offset_blocks + num_blocks > bdev->blockcnt) {
		return false;
	}

	return true;
}

int
spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
	       void *buf, uint64_t offset, uint64_t nbytes,
	       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		      void *buf, uint64_t offset_blocks, uint64_t num_blocks,
		      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

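	/* Allocate a bdev_io from this thread's cache, falling back to the
	 * global bdev_io pool when the cache is empty.
	 */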
	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during read\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.bdev.iov.iov_base = buf;
	bdev_io->u.bdev.iov.iov_len = num_blocks * bdev->blocklen;
	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		struct iovec *iov, int iovcnt,
		uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
}

int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			   struct iovec *iov, int iovcnt,
			   uint64_t offset_blocks, uint64_t num_blocks,
			   spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during read\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.bdev.iovs = iov;
	bdev_io->u.bdev.iovcnt = iovcnt;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		void *buf, uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       void *buf, uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during write\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.bdev.iov.iov_base = buf;
	bdev_io->u.bdev.iov.iov_len = num_blocks * bdev->blocklen;
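	/* Wrap the single contiguous buffer in the embedded iovec so that
	 * backends always receive an iovec array.
	 */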
	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		 struct iovec *iov, int iovcnt,
		 uint64_t offset, uint64_t len,
		 spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			struct iovec *iov, int iovcnt,
			uint64_t offset_blocks, uint64_t num_blocks,
			spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during writev\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.bdev.iovs = iov;
	bdev_io->u.bdev.iovcnt = iovcnt;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset, uint64_t len,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      uint64_t offset_blocks, uint64_t num_blocks,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	uint64_t len;
	bool split_request = false;

	if (num_blocks > UINT64_MAX / spdk_bdev_get_block_size(bdev)) {
		SPDK_ERRLOG("length argument out of range in write_zeroes\n");
		return -ERANGE;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);

	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during write_zeroes\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->u.bdev.offset_blocks = offset_blocks;

	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
		bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
		bdev_io->u.bdev.num_blocks = num_blocks;
		bdev_io->u.bdev.iovs = NULL;
		bdev_io->u.bdev.iovcnt = 0;

	} else {
		assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE);

		len = spdk_bdev_get_block_size(bdev) * num_blocks;

		if (len > ZERO_BUFFER_SIZE) {
			split_request = true;
			len = ZERO_BUFFER_SIZE;
		}

		bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
		bdev_io->u.bdev.iov.iov_base = g_bdev_mgr.zero_buffer;
		bdev_io->u.bdev.iov.iov_len = len;
		bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
		bdev_io->u.bdev.iovcnt = 1;
		bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev);
		bdev_io->split_remaining_num_blocks = num_blocks - bdev_io->u.bdev.num_blocks;
		bdev_io->split_current_offset_blocks = offset_blocks + bdev_io->u.bdev.num_blocks;
	}

	if (split_request) {
		bdev_io->stored_user_cb = cb;
		spdk_bdev_io_init(bdev_io, bdev, cb_arg, spdk_bdev_write_zeroes_split);
	} else {
		spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
	}
	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	if (num_blocks == 0) {
		SPDK_ERRLOG("Can't unmap 0 blocks\n");
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during unmap\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
	bdev_io->u.bdev.iov.iov_base = NULL;
	bdev_io->u.bdev.iov.iov_len = 0;
	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		uint64_t offset, uint64_t length,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, length, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
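
	/* Like the other mutating commands, flush requires a descriptor that
	 * was opened for writing.
	 */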
	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during flush\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
	bdev_io->u.bdev.iovs = NULL;
	bdev_io->u.bdev.iovcnt = 0;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

static void
_spdk_bdev_reset_dev(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev_io *bdev_io;

	bdev_io = TAILQ_FIRST(&ch->queued_resets);
	TAILQ_REMOVE(&ch->queued_resets, bdev_io, link);
	spdk_bdev_io_submit_reset(bdev_io);
}

static void
_spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch;
	struct spdk_bdev_channel *channel;
	struct spdk_bdev_mgmt_channel *mgmt_channel;
	struct spdk_bdev_module_channel *shared_ch;

	ch = spdk_io_channel_iter_get_channel(i);
	channel = spdk_io_channel_get_ctx(ch);
	mgmt_channel = spdk_io_channel_get_ctx(channel->mgmt_channel);
	shared_ch = channel->module_ch;

	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;

	_spdk_bdev_abort_queued_io(&shared_ch->nomem_io, channel);
	_spdk_bdev_abort_queued_io(&channel->qos_io, channel);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel);

	spdk_for_each_channel_continue(i, 0);
}

static void
_spdk_bdev_start_reset(void *ctx)
{
	struct spdk_bdev_channel *ch = ctx;

	spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), _spdk_bdev_reset_freeze_channel,
			      ch, _spdk_bdev_reset_dev);
}

static void
_spdk_bdev_channel_start_reset(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev *bdev = ch->bdev;

	assert(!TAILQ_EMPTY(&ch->queued_resets));

	pthread_mutex_lock(&bdev->mutex);
	if (bdev->reset_in_progress == NULL) {
		bdev->reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
		/*
		 * Take a channel reference for the target bdev for the life of this
		 *  reset.  This guards against the channel getting destroyed while
		 *  spdk_for_each_channel() calls related to this reset IO are in
		 *  progress.  We will release the reference when this reset is
		 *  completed.
		 */
		bdev->reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
		_spdk_bdev_start_reset(ch);
	}
	pthread_mutex_unlock(&bdev->mutex);
}

int
spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during reset\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
	bdev_io->u.reset.ch_ref = NULL;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	pthread_mutex_lock(&bdev->mutex);
	TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, link);
	pthread_mutex_unlock(&bdev->mutex);

	_spdk_bdev_channel_start_reset(channel);

	return 0;
}

void
spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
		      struct spdk_bdev_io_stat *stat)
{
#ifdef SPDK_CONFIG_VTUNE
	SPDK_ERRLOG("Calling spdk_bdev_get_io_stat is not allowed when VTune integration is enabled.\n");
	memset(stat, 0, sizeof(*stat));
	return;
#endif

	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	channel->stat.ticks_rate = spdk_get_ticks_hz();
	*stat = channel->stat;
	memset(&channel->stat, 0, sizeof(channel->stat));
}

int
spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_admin_passthru\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;
	bdev_io->u.nvme_passthru.md_buf = NULL;
	bdev_io->u.nvme_passthru.md_len = 0;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			   const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
			   spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		/*
		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
		 *  to easily determine if the command is a read or write, but for now just
		 *  do not allow io_passthru with a read-only descriptor.
void
spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
		      struct spdk_bdev_io_stat *stat)
{
#ifdef SPDK_CONFIG_VTUNE
	SPDK_ERRLOG("Calling spdk_bdev_get_io_stat is not allowed when VTune integration is enabled.\n");
	memset(stat, 0, sizeof(*stat));
	return;
#endif

	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	channel->stat.ticks_rate = spdk_get_ticks_hz();
	*stat = channel->stat;
	memset(&channel->stat, 0, sizeof(channel->stat));
}

int
spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_admin_passthru\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;
	bdev_io->u.nvme_passthru.md_buf = NULL;
	bdev_io->u.nvme_passthru.md_len = 0;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			   const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
			   spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		/*
		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
		 * to easily determine if the command is a read or write, but for now just
		 * do not allow io_passthru with a read-only descriptor.
		 */
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_io_passthru\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;
	bdev_io->u.nvme_passthru.md_buf = NULL;
	bdev_io->u.nvme_passthru.md_len = 0;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
			      void *md_buf, size_t md_len,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		/*
		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
		 * to easily determine if the command is a read or write, but for now just
		 * do not allow io_passthru with a read-only descriptor.
		 */
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_io_passthru_md\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;
	bdev_io->u.nvme_passthru.md_buf = md_buf;
	bdev_io->u.nvme_passthru.md_len = md_len;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

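/*
 * Illustrative usage sketch (not part of the implementation): issuing an Identify
 * Controller admin command through the passthru interface.  The descriptor must have
 * been opened for write; io_ch, identify_buf and admin_done are assumptions made for
 * the example only.
 *
 *	struct spdk_nvme_cmd cmd = {};
 *
 *	cmd.opc = SPDK_NVME_OPC_IDENTIFY;
 *	cmd.cdw10 = 1;	(CNS value 1 = Identify Controller)
 *	rc = spdk_bdev_nvme_admin_passthru(desc, io_ch, &cmd, identify_buf, 4096,
 *					   admin_done, NULL);
 */
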
int
spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io is NULL\n");
		return -1;
	}

	if (bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING) {
		SPDK_ERRLOG("bdev_io is in pending state\n");
		assert(false);
		return -1;
	}

	spdk_bdev_put_io(bdev_io);

	return 0;
}

static void
_spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev *bdev = bdev_ch->bdev;
	struct spdk_bdev_module_channel *shared_ch = bdev_ch->module_ch;
	struct spdk_bdev_io *bdev_io;

	if (shared_ch->io_outstanding > shared_ch->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller.  Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_ch->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_ch->nomem_io);
		TAILQ_REMOVE(&shared_ch->nomem_io, bdev_io, link);
		shared_ch->io_outstanding++;
		bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev->fn_table->submit_request(bdev_io->ch->channel, bdev_io);
		if (bdev_io->status == SPDK_BDEV_IO_STATUS_NOMEM) {
			break;
		}
	}
}

static void
_spdk_bdev_qos_io_complete(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;

	bdev_io->cb(bdev_io, bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS, bdev_io->caller_ctx);
}

static void
_spdk_bdev_io_complete(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;

	assert(bdev_io->cb != NULL);

	if (bdev_io->io_submit_ch) {
		bdev_io->ch = bdev_io->io_submit_ch;
		bdev_io->io_submit_ch = NULL;
		spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->ch->channel),
				     _spdk_bdev_qos_io_complete, bdev_io);
	} else {
		bdev_io->cb(bdev_io, bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS,
			    bdev_io->caller_ctx);
	}
}

static void
_spdk_bdev_reset_complete(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i);

	if (bdev_io->u.reset.ch_ref != NULL) {
		spdk_put_io_channel(bdev_io->u.reset.ch_ref);
		bdev_io->u.reset.ch_ref = NULL;
	}

	_spdk_bdev_io_complete(bdev_io);
}

static void
_spdk_bdev_unfreeze_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);

	ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS;
	if (!TAILQ_EMPTY(&ch->queued_resets)) {
		_spdk_bdev_channel_start_reset(ch);
	}

	spdk_for_each_channel_continue(i, 0);
}

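/*
 * spdk_bdev_io_complete() below is the single completion entry point that bdev modules
 * call exactly once for every submitted spdk_bdev_io.  SPDK_BDEV_IO_STATUS_NOMEM does
 * not reach the user: the I/O is re-queued on the shared nomem_io list and resubmitted
 * by _spdk_bdev_ch_retry_io() once enough outstanding I/O have completed.  Reset
 * completions additionally unfreeze every channel that was frozen when the reset started.
 */
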
void
spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
	struct spdk_bdev_module_channel *shared_ch = bdev_ch->module_ch;

	bdev_io->status = status;

	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
		bool unlock_channels = false;

		if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
			SPDK_ERRLOG("NOMEM returned for reset\n");
		}
		pthread_mutex_lock(&bdev->mutex);
		if (bdev_io == bdev->reset_in_progress) {
			bdev->reset_in_progress = NULL;
			unlock_channels = true;
		}
		pthread_mutex_unlock(&bdev->mutex);

		if (unlock_channels) {
			spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_unfreeze_channel,
					      bdev_io, _spdk_bdev_reset_complete);
			return;
		}
	} else {
		assert(shared_ch->io_outstanding > 0);
		shared_ch->io_outstanding--;
		if (spdk_likely(status != SPDK_BDEV_IO_STATUS_NOMEM)) {
			if (spdk_unlikely(!TAILQ_EMPTY(&shared_ch->nomem_io))) {
				_spdk_bdev_ch_retry_io(bdev_ch);
			}
		} else {
			TAILQ_INSERT_HEAD(&shared_ch->nomem_io, bdev_io, link);
			/*
			 * Wait for some of the outstanding I/O to complete before we
			 * retry any of the nomem_io.  Normally we will wait for
			 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
			 * depth channels we will instead wait for half to complete.
			 * For example, with 100 I/O outstanding the threshold is
			 * max(50, 92) = 92, while with only 6 outstanding it is
			 * max(3, -2) = 3.
			 */
			shared_ch->nomem_threshold = spdk_max((int64_t)shared_ch->io_outstanding / 2,
							      (int64_t)shared_ch->io_outstanding - NOMEM_THRESHOLD_COUNT);
			return;
		}
	}

	if (status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_READ:
			bdev_ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev->blocklen;
			bdev_ch->stat.num_read_ops++;
			bdev_ch->stat.read_latency_ticks += (spdk_get_ticks() - bdev_io->submit_tsc);
			break;
		case SPDK_BDEV_IO_TYPE_WRITE:
			bdev_ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev->blocklen;
			bdev_ch->stat.num_write_ops++;
			bdev_ch->stat.write_latency_ticks += (spdk_get_ticks() - bdev_io->submit_tsc);
			break;
		default:
			break;
		}
	}

#ifdef SPDK_CONFIG_VTUNE
	uint64_t now_tsc = spdk_get_ticks();
	if (now_tsc > (bdev_ch->start_tsc + bdev_ch->interval_tsc)) {
		uint64_t data[5];

		data[0] = bdev_ch->stat.num_read_ops;
		data[1] = bdev_ch->stat.bytes_read;
		data[2] = bdev_ch->stat.num_write_ops;
		data[3] = bdev_ch->stat.bytes_written;
		data[4] = bdev->fn_table->get_spin_time ?
			  bdev->fn_table->get_spin_time(bdev_ch->channel) : 0;

		__itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_ch->handle,
				   __itt_metadata_u64, 5, data);

		memset(&bdev_ch->stat, 0, sizeof(bdev_ch->stat));
		bdev_ch->start_tsc = now_tsc;
	}
#endif

	if (bdev_io->in_submit_request) {
		/*
		 * Defer completion to avoid potential infinite recursion if the
		 * user's completion callback issues a new I/O.
		 */
		spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_ch->channel),
				     _spdk_bdev_io_complete, bdev_io);
	} else {
		_spdk_bdev_io_complete(bdev_io);
	}
}

void
spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
				  enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
{
	if (sc == SPDK_SCSI_STATUS_GOOD) {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
		bdev_io->error.scsi.sc = sc;
		bdev_io->error.scsi.sk = sk;
		bdev_io->error.scsi.asc = asc;
		bdev_io->error.scsi.ascq = ascq;
	}

	spdk_bdev_io_complete(bdev_io, bdev_io->status);
}

void
spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
			     int *sc, int *sk, int *asc, int *ascq)
{
	assert(sc != NULL);
	assert(sk != NULL);
	assert(asc != NULL);
	assert(ascq != NULL);

	switch (bdev_io->status) {
	case SPDK_BDEV_IO_STATUS_SUCCESS:
		*sc = SPDK_SCSI_STATUS_GOOD;
		*sk = SPDK_SCSI_SENSE_NO_SENSE;
		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
		break;
	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
		spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
		break;
	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
		*sc = bdev_io->error.scsi.sc;
		*sk = bdev_io->error.scsi.sk;
		*asc = bdev_io->error.scsi.asc;
		*ascq = bdev_io->error.scsi.ascq;
		break;
	default:
		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
		*sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
		break;
	}
}

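/*
 * Illustrative usage sketch (not part of the implementation): a SCSI target layer can
 * turn any completed bdev I/O into SCSI status plus sense data in its completion
 * callback.  scsi_task_set_status() is a hypothetical helper named only for the example.
 *
 *	int sc, sk, asc, ascq;
 *
 *	spdk_bdev_io_get_scsi_status(bdev_io, &sc, &sk, &asc, &ascq);
 *	scsi_task_set_status(task, sc, sk, asc, ascq);
 */
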
void
spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc)
{
	if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else {
		bdev_io->error.nvme.sct = sct;
		bdev_io->error.nvme.sc = sc;
		bdev_io->status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
	}

	spdk_bdev_io_complete(bdev_io, bdev_io->status);
}

void
spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc)
{
	assert(sct != NULL);
	assert(sc != NULL);

	if (bdev_io->status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
		*sct = bdev_io->error.nvme.sct;
		*sc = bdev_io->error.nvme.sc;
	} else if (bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		*sct = SPDK_NVME_SCT_GENERIC;
		*sc = SPDK_NVME_SC_SUCCESS;
	} else {
		*sct = SPDK_NVME_SCT_GENERIC;
		*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
	}
}

struct spdk_thread *
spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io)
{
	return spdk_io_channel_get_thread(bdev_io->ch->channel);
}

static int
_spdk_bdev_register(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;

	assert(bdev->module != NULL);

	if (!bdev->name) {
		SPDK_ERRLOG("Bdev name is NULL\n");
		return -EINVAL;
	}

	if (spdk_bdev_get_by_name(bdev->name)) {
		SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name);
		return -EEXIST;
	}

	bdev->status = SPDK_BDEV_STATUS_READY;

	TAILQ_INIT(&bdev->open_descs);

	TAILQ_INIT(&bdev->vbdevs);
	TAILQ_INIT(&bdev->base_bdevs);

	TAILQ_INIT(&bdev->aliases);

	bdev->reset_in_progress = NULL;

	spdk_io_device_register(__bdev_to_io_dev(bdev),
				spdk_bdev_channel_create, spdk_bdev_channel_destroy,
				sizeof(struct spdk_bdev_channel));

	pthread_mutex_init(&bdev->mutex, NULL);
	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name);
	TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, link);

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) {
		if (module->examine) {
			module->action_in_progress++;
			module->examine(bdev);
		}
	}

	return 0;
}

int
spdk_bdev_register(struct spdk_bdev *bdev)
{
	return _spdk_bdev_register(bdev);
}

int
spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count)
{
	int i, rc;

	rc = _spdk_bdev_register(vbdev);
	if (rc) {
		return rc;
	}

	for (i = 0; i < base_bdev_count; i++) {
		assert(base_bdevs[i] != NULL);
		assert(base_bdevs[i]->claim_module != NULL);
		TAILQ_INSERT_TAIL(&vbdev->base_bdevs, base_bdevs[i], base_bdev_link);
		TAILQ_INSERT_TAIL(&base_bdevs[i]->vbdevs, vbdev, vbdev_link);
	}

	return 0;
}

void
spdk_bdev_unregister_done(struct spdk_bdev *bdev, int bdeverrno)
{
	if (bdev->unregister_cb != NULL) {
		bdev->unregister_cb(bdev->unregister_ctx, bdeverrno);
	}
}

static void
_remove_notify(void *arg)
{
	struct spdk_bdev_desc *desc = arg;

	desc->remove_cb(desc->remove_ctx);
}

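/*
 * Unregister semantics (summary of the function below): if any open descriptor supplied
 * a remove_cb, the bdev is only marked SPDK_BDEV_STATUS_REMOVING and each descriptor is
 * notified via a deferred _remove_notify() message; the module's destruct callback runs
 * later, when spdk_bdev_close() drops the last descriptor and calls spdk_bdev_unregister()
 * again.  Only when no descriptor wants notification is the bdev torn down immediately.
 */
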
void
spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_desc *desc, *tmp;
	int rc;
	bool do_destruct = true;
	struct spdk_bdev *base_bdev;

	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name);

	pthread_mutex_lock(&bdev->mutex);

	if (!TAILQ_EMPTY(&bdev->base_bdevs)) {
		TAILQ_FOREACH(base_bdev, &bdev->base_bdevs, base_bdev_link) {
			TAILQ_REMOVE(&base_bdev->vbdevs, bdev, vbdev_link);
		}
	}

	bdev->status = SPDK_BDEV_STATUS_REMOVING;
	bdev->unregister_cb = cb_fn;
	bdev->unregister_ctx = cb_arg;

	TAILQ_FOREACH_SAFE(desc, &bdev->open_descs, link, tmp) {
		if (desc->remove_cb) {
			do_destruct = false;
			/*
			 * Defer invocation of the remove_cb to a separate message that will
			 * run later on this thread.  This ensures this context unwinds and
			 * we don't recursively unregister this bdev again if the remove_cb
			 * immediately closes its descriptor.
			 */
			spdk_thread_send_msg(spdk_get_thread(), _remove_notify, desc);
		}
	}

	if (!do_destruct) {
		pthread_mutex_unlock(&bdev->mutex);
		return;
	}

	TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link);
	pthread_mutex_unlock(&bdev->mutex);

	pthread_mutex_destroy(&bdev->mutex);

	spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL);

	rc = bdev->fn_table->destruct(bdev->ctxt);
	if (rc < 0) {
		SPDK_ERRLOG("destruct failed\n");
	}
	if (rc <= 0 && cb_fn != NULL) {
		cb_fn(cb_arg, rc);
	}
}

int
spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb,
	       void *remove_ctx, struct spdk_bdev_desc **_desc)
{
	struct spdk_bdev_desc *desc;

	desc = calloc(1, sizeof(*desc));
	if (desc == NULL) {
		SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n");
		return -ENOMEM;
	}

	pthread_mutex_lock(&bdev->mutex);

	if (write && bdev->claim_module) {
		SPDK_INFOLOG(SPDK_LOG_BDEV, "Could not open %s - already claimed\n", bdev->name);
		free(desc);
		pthread_mutex_unlock(&bdev->mutex);
		return -EPERM;
	}

	TAILQ_INSERT_TAIL(&bdev->open_descs, desc, link);

	desc->bdev = bdev;
	desc->remove_cb = remove_cb;
	desc->remove_ctx = remove_ctx;
	desc->write = write;
	*_desc = desc;

	pthread_mutex_unlock(&bdev->mutex);

	return 0;
}

void
spdk_bdev_close(struct spdk_bdev_desc *desc)
{
	struct spdk_bdev *bdev = desc->bdev;
	bool do_unregister = false;

	pthread_mutex_lock(&bdev->mutex);

	TAILQ_REMOVE(&bdev->open_descs, desc, link);
	free(desc);

	if (bdev->status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->open_descs)) {
		do_unregister = true;
	}
	pthread_mutex_unlock(&bdev->mutex);

	if (do_unregister == true) {
		spdk_bdev_unregister(bdev, bdev->unregister_cb, bdev->unregister_ctx);
	}
}

int
spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
			    struct spdk_bdev_module *module)
{
	if (bdev->claim_module != NULL) {
		SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name,
			    bdev->claim_module->name);
		return -EPERM;
	}

	if (desc && !desc->write) {
		desc->write = true;
	}

	bdev->claim_module = module;
	return 0;
}

void
spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
{
	assert(bdev->claim_module != NULL);
	bdev->claim_module = NULL;
}

struct spdk_bdev *
spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
{
	return desc->bdev;
}

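/*
 * Illustrative usage sketch (not part of the implementation): a virtual bdev module
 * typically opens and claims its base bdev before building on top of it, and releases
 * the claim from its destruct path.  my_module, my_remove_cb and the omitted error
 * handling are assumptions made for the example only.
 *
 *	spdk_bdev_open(base, false, my_remove_cb, base, &desc);
 *	spdk_bdev_module_claim_bdev(base, desc, &my_module);
 *	...
 *	spdk_bdev_module_release_bdev(base);
 *	spdk_bdev_close(desc);
 */
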
void
spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
{
	struct iovec *iovs;
	int iovcnt;

	if (bdev_io == NULL) {
		return;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		iovs = bdev_io->u.bdev.iovs;
		iovcnt = bdev_io->u.bdev.iovcnt;
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		iovs = bdev_io->u.bdev.iovs;
		iovcnt = bdev_io->u.bdev.iovcnt;
		break;
	default:
		iovs = NULL;
		iovcnt = 0;
		break;
	}

	if (iovp) {
		*iovp = iovs;
	}
	if (iovcntp) {
		*iovcntp = iovcnt;
	}
}

void
spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module)
{
	if (spdk_bdev_module_list_find(bdev_module->name)) {
		fprintf(stderr, "ERROR: module '%s' already registered.\n", bdev_module->name);
		assert(false);
	}

	if (bdev_module->async_init) {
		bdev_module->action_in_progress = 1;
	}

	/*
	 * Modules with examine callbacks must be initialized first, so they are
	 * ready to handle examine callbacks from later modules that will
	 * register physical bdevs.
	 */
	if (bdev_module->examine != NULL) {
		TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, tailq);
	} else {
		TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, tailq);
	}
}

struct spdk_bdev_module *
spdk_bdev_module_list_find(const char *name)
{
	struct spdk_bdev_module *bdev_module;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (strcmp(name, bdev_module->name) == 0) {
			break;
		}
	}

	return bdev_module;
}

static void
spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	uint64_t len;

	if (!success) {
		bdev_io->cb = bdev_io->stored_user_cb;
		_spdk_bdev_io_complete(bdev_io);
		return;
	}

	/*
	 * No need to perform the error checking from write_zeroes_blocks because
	 * this request already passed those checks.
	 */
	len = spdk_min(spdk_bdev_get_block_size(bdev_io->bdev) * bdev_io->split_remaining_num_blocks,
		       ZERO_BUFFER_SIZE);

	bdev_io->u.bdev.offset_blocks = bdev_io->split_current_offset_blocks;
	bdev_io->u.bdev.iov.iov_len = len;
	bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev_io->bdev);
	bdev_io->split_remaining_num_blocks -= bdev_io->u.bdev.num_blocks;
	bdev_io->split_current_offset_blocks += bdev_io->u.bdev.num_blocks;

	/* If this round completes the I/O, change the callback to be the original user callback. */
	if (bdev_io->split_remaining_num_blocks == 0) {
		spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, bdev_io->stored_user_cb);
	} else {
		spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, spdk_bdev_write_zeroes_split);
	}
	spdk_bdev_io_submit(bdev_io);
}

SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV)