1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. 5 * Copyright (c) Intel Corporation. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * * Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * * Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * * Neither the name of Intel Corporation nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 26 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 #include "spdk/stdinc.h" 36 37 #include "spdk/bdev.h" 38 39 #include "spdk/env.h" 40 #include "spdk/event.h" 41 #include "spdk/io_channel.h" 42 #include "spdk/likely.h" 43 #include "spdk/queue.h" 44 #include "spdk/nvme_spec.h" 45 #include "spdk/scsi_spec.h" 46 #include "spdk/util.h" 47 48 #include "spdk_internal/bdev.h" 49 #include "spdk_internal/log.h" 50 #include "spdk/string.h" 51 52 #ifdef SPDK_CONFIG_VTUNE 53 #include "ittnotify.h" 54 #include "ittnotify_types.h" 55 int __itt_init_ittlib(const char *, __itt_group_id); 56 #endif 57 58 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024) 59 #define SPDK_BDEV_IO_CACHE_SIZE 256 60 #define BUF_SMALL_POOL_SIZE 8192 61 #define BUF_LARGE_POOL_SIZE 1024 62 #define NOMEM_THRESHOLD_COUNT 8 63 #define ZERO_BUFFER_SIZE 0x100000 64 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 65 #define SPDK_BDEV_SEC_TO_USEC 1000000ULL 66 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 67 68 typedef TAILQ_HEAD(, spdk_bdev_io) bdev_io_tailq_t; 69 typedef STAILQ_HEAD(, spdk_bdev_io) bdev_io_stailq_t; 70 71 struct spdk_bdev_mgr { 72 struct spdk_mempool *bdev_io_pool; 73 74 struct spdk_mempool *buf_small_pool; 75 struct spdk_mempool *buf_large_pool; 76 77 void *zero_buffer; 78 79 TAILQ_HEAD(, spdk_bdev_module) bdev_modules; 80 81 TAILQ_HEAD(, spdk_bdev) bdevs; 82 83 bool init_complete; 84 bool module_init_complete; 85 86 #ifdef SPDK_CONFIG_VTUNE 87 __itt_domain *domain; 88 #endif 89 }; 90 91 static struct spdk_bdev_mgr g_bdev_mgr = { 92 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 93 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 94 .init_complete = false, 95 .module_init_complete = false, 96 }; 97 98 static spdk_bdev_init_cb g_init_cb_fn = NULL; 99 static void *g_init_cb_arg = NULL; 100 101 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 102 static void *g_fini_cb_arg = NULL; 103 
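/*
 * Worked example (illustration only) of how the QoS constants above are
 * consumed by spdk_bdev_qos_get_max_ios_per_timeslice() later in this file:
 *
 *   qos_max_ios_per_timeslice = ios_per_sec * SPDK_BDEV_QOS_TIMESLICE_IN_USEC /
 *                               SPDK_BDEV_SEC_TO_USEC;
 *
 * A bdev limited to 20000 IOPS gets 20000 * 1000 / 1000000 = 20 I/O per 1 ms
 * timeslice.  A limit of 500 IOPS would compute to 0, so the result is clamped
 * up to SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE (1); very low limits are therefore
 * effectively rounded up to one I/O per timeslice.
 */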
static struct spdk_thread *g_fini_thread = NULL;


struct spdk_bdev_mgmt_channel {
	bdev_io_stailq_t need_buf_small;
	bdev_io_stailq_t need_buf_large;

	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;

	TAILQ_HEAD(, spdk_bdev_module_channel) module_channels;
};

struct spdk_bdev_desc {
	struct spdk_bdev		*bdev;
	spdk_bdev_remove_cb_t		remove_cb;
	void				*remove_ctx;
	bool				write;
	TAILQ_ENTRY(spdk_bdev_desc)	link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;

	/* Channel for the bdev manager */
	struct spdk_io_channel	*mgmt_channel;

	struct spdk_bdev_io_stat stat;

	bdev_io_tailq_t		queued_resets;

	uint32_t		flags;

	/*
	 * Rate limiting on this channel.
	 * Queue of I/O awaiting submission because QoS rate limiting is in
	 * effect on this channel.
	 */
	bdev_io_tailq_t		qos_io;

	/*
	 * Rate limiting on this channel.
	 * Maximum number of I/O allowed to be issued in one timeslice (e.g., 1ms);
	 * only valid for the master channel, which manages the outstanding I/O.
	 */
	uint64_t		qos_max_ios_per_timeslice;

	/*
	 * Rate limiting on this channel.
	 * Number of I/O submitted in the current timeslice (e.g., 1ms).
	 */
	uint64_t		io_submitted_this_timeslice;

	/*
	 * Rate limiting on this channel.
	 * QoS poller that runs once per timeslice to reset the quota and
	 * resubmit queued I/O.
	 */
	struct spdk_poller	*qos_poller;

	/* Per-device channel */
	struct spdk_bdev_module_channel *module_ch;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t		start_tsc;
	uint64_t		interval_tsc;
	__itt_string_handle	*handle;
#endif

};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))

/*
 * Per-module (or per-io_device) channel. Multiple bdevs built on the same io_device
 * will queue their I/O awaiting retry here. This makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_module_channel {
	/*
	 * Count of I/O submitted to the bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

	/*
	 * Queue of I/O awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t		nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
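	 * (Roughly: when a bdev module completes an I/O with
	 * SPDK_BDEV_IO_STATUS_NOMEM, the I/O is parked on nomem_io and this
	 * threshold is set somewhat below the then-current io_outstanding;
	 * _spdk_bdev_ch_retry_io() declines to drain nomem_io until
	 * io_outstanding has dropped back to this value, so a batch of
	 * completions can return before retrying.  See NOMEM_THRESHOLD_COUNT
	 * above.)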
209 */ 210 uint64_t nomem_threshold; 211 212 /* I/O channel allocated by a bdev module */ 213 struct spdk_io_channel *module_ch; 214 215 uint32_t ref; 216 217 TAILQ_ENTRY(spdk_bdev_module_channel) link; 218 }; 219 220 static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 221 222 struct spdk_bdev * 223 spdk_bdev_first(void) 224 { 225 struct spdk_bdev *bdev; 226 227 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 228 if (bdev) { 229 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name); 230 } 231 232 return bdev; 233 } 234 235 struct spdk_bdev * 236 spdk_bdev_next(struct spdk_bdev *prev) 237 { 238 struct spdk_bdev *bdev; 239 240 bdev = TAILQ_NEXT(prev, link); 241 if (bdev) { 242 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name); 243 } 244 245 return bdev; 246 } 247 248 static struct spdk_bdev * 249 _bdev_next_leaf(struct spdk_bdev *bdev) 250 { 251 while (bdev != NULL) { 252 if (bdev->claim_module == NULL) { 253 return bdev; 254 } else { 255 bdev = TAILQ_NEXT(bdev, link); 256 } 257 } 258 259 return bdev; 260 } 261 262 struct spdk_bdev * 263 spdk_bdev_first_leaf(void) 264 { 265 struct spdk_bdev *bdev; 266 267 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 268 269 if (bdev) { 270 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name); 271 } 272 273 return bdev; 274 } 275 276 struct spdk_bdev * 277 spdk_bdev_next_leaf(struct spdk_bdev *prev) 278 { 279 struct spdk_bdev *bdev; 280 281 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, link)); 282 283 if (bdev) { 284 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name); 285 } 286 287 return bdev; 288 } 289 290 struct spdk_bdev * 291 spdk_bdev_get_by_name(const char *bdev_name) 292 { 293 struct spdk_bdev_alias *tmp; 294 struct spdk_bdev *bdev = spdk_bdev_first(); 295 296 while (bdev != NULL) { 297 if (strcmp(bdev_name, bdev->name) == 0) { 298 return bdev; 299 } 300 301 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 302 if (strcmp(bdev_name, tmp->alias) == 0) { 303 return bdev; 304 } 305 } 306 307 bdev = spdk_bdev_next(bdev); 308 } 309 310 return NULL; 311 } 312 313 static void 314 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf) 315 { 316 assert(bdev_io->get_buf_cb != NULL); 317 assert(buf != NULL); 318 assert(bdev_io->u.bdev.iovs != NULL); 319 320 bdev_io->buf = buf; 321 bdev_io->u.bdev.iovs[0].iov_base = (void *)((unsigned long)((char *)buf + 512) & ~511UL); 322 bdev_io->u.bdev.iovs[0].iov_len = bdev_io->buf_len; 323 bdev_io->get_buf_cb(bdev_io->ch->channel, bdev_io); 324 } 325 326 static void 327 spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 328 { 329 struct spdk_mempool *pool; 330 struct spdk_bdev_io *tmp; 331 void *buf; 332 bdev_io_stailq_t *stailq; 333 struct spdk_bdev_mgmt_channel *ch; 334 335 assert(bdev_io->u.bdev.iovcnt == 1); 336 337 buf = bdev_io->buf; 338 ch = bdev_io->mgmt_ch; 339 340 if (bdev_io->buf_len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) { 341 pool = g_bdev_mgr.buf_small_pool; 342 stailq = &ch->need_buf_small; 343 } else { 344 pool = g_bdev_mgr.buf_large_pool; 345 stailq = &ch->need_buf_large; 346 } 347 348 if (STAILQ_EMPTY(stailq)) { 349 spdk_mempool_put(pool, buf); 350 } else { 351 tmp = STAILQ_FIRST(stailq); 352 STAILQ_REMOVE_HEAD(stailq, buf_link); 353 spdk_bdev_io_set_buf(tmp, buf); 354 } 355 } 356 357 void 358 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 359 { 360 struct spdk_mempool *pool; 361 bdev_io_stailq_t *stailq; 362 void *buf = 
NULL; 363 struct spdk_bdev_mgmt_channel *ch; 364 365 assert(cb != NULL); 366 assert(bdev_io->u.bdev.iovs != NULL); 367 368 if (spdk_unlikely(bdev_io->u.bdev.iovs[0].iov_base != NULL)) { 369 /* Buffer already present */ 370 cb(bdev_io->ch->channel, bdev_io); 371 return; 372 } 373 374 assert(len <= SPDK_BDEV_LARGE_BUF_MAX_SIZE); 375 ch = spdk_io_channel_get_ctx(bdev_io->ch->mgmt_channel); 376 377 bdev_io->buf_len = len; 378 bdev_io->get_buf_cb = cb; 379 if (len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) { 380 pool = g_bdev_mgr.buf_small_pool; 381 stailq = &ch->need_buf_small; 382 } else { 383 pool = g_bdev_mgr.buf_large_pool; 384 stailq = &ch->need_buf_large; 385 } 386 387 buf = spdk_mempool_get(pool); 388 389 if (!buf) { 390 STAILQ_INSERT_TAIL(stailq, bdev_io, buf_link); 391 } else { 392 spdk_bdev_io_set_buf(bdev_io, buf); 393 } 394 } 395 396 static int 397 spdk_bdev_module_get_max_ctx_size(void) 398 { 399 struct spdk_bdev_module *bdev_module; 400 int max_bdev_module_size = 0; 401 402 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) { 403 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 404 max_bdev_module_size = bdev_module->get_ctx_size(); 405 } 406 } 407 408 return max_bdev_module_size; 409 } 410 411 void 412 spdk_bdev_config_text(FILE *fp) 413 { 414 struct spdk_bdev_module *bdev_module; 415 416 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) { 417 if (bdev_module->config_text) { 418 bdev_module->config_text(fp); 419 } 420 } 421 } 422 423 static int 424 spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 425 { 426 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 427 428 STAILQ_INIT(&ch->need_buf_small); 429 STAILQ_INIT(&ch->need_buf_large); 430 431 STAILQ_INIT(&ch->per_thread_cache); 432 ch->per_thread_cache_count = 0; 433 434 TAILQ_INIT(&ch->module_channels); 435 436 return 0; 437 } 438 439 static void 440 spdk_bdev_mgmt_channel_free_resources(struct spdk_bdev_mgmt_channel *ch) 441 { 442 struct spdk_bdev_io *bdev_io; 443 444 if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) { 445 SPDK_ERRLOG("Pending I/O list wasn't empty on channel free\n"); 446 } 447 448 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 449 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 450 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, buf_link); 451 ch->per_thread_cache_count--; 452 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 453 } 454 455 assert(ch->per_thread_cache_count == 0); 456 } 457 458 static void 459 spdk_bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 460 { 461 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 462 463 spdk_bdev_mgmt_channel_free_resources(ch); 464 } 465 466 static void 467 spdk_bdev_init_complete(int rc) 468 { 469 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 470 void *cb_arg = g_init_cb_arg; 471 472 g_bdev_mgr.init_complete = true; 473 g_init_cb_fn = NULL; 474 g_init_cb_arg = NULL; 475 476 cb_fn(cb_arg, rc); 477 } 478 479 static void 480 spdk_bdev_module_action_complete(void) 481 { 482 struct spdk_bdev_module *m; 483 484 /* 485 * Don't finish bdev subsystem initialization if 486 * module pre-initialization is still in progress, or 487 * the subsystem been already initialized. 488 */ 489 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 490 return; 491 } 492 493 /* 494 * Check all bdev modules for inits/examinations in progress. If any 495 * exist, return immediately since we cannot finish bdev subsystem 496 * initialization until all are completed. 
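	 * (Illustrative flow: action_in_progress is bumped when asynchronous
	 * work is started on a module's behalf, e.g. before its examine()
	 * callback is invoked, and the module later calls
	 * spdk_bdev_module_init_done() or spdk_bdev_module_examine_done(),
	 * which decrements the count in spdk_bdev_module_action_done() and
	 * re-runs this check.)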
	 */
	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, tailq) {
		if (m->action_in_progress > 0) {
			return;
		}
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
	 */
	spdk_bdev_init_complete(0);
}

static void
spdk_bdev_module_action_done(struct spdk_bdev_module *module)
{
	assert(module->action_in_progress > 0);
	module->action_in_progress--;
	spdk_bdev_module_action_complete();
}

void
spdk_bdev_module_init_done(struct spdk_bdev_module *module)
{
	spdk_bdev_module_action_done(module);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
{
	spdk_bdev_module_action_done(module);
}

static int
spdk_bdev_modules_init(void)
{
	struct spdk_bdev_module *module;
	int rc = 0;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) {
		rc = module->module_init();
		if (rc != 0) {
			break;
		}
	}

	g_bdev_mgr.module_init_complete = true;
	return rc;
}

void
spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
{
	int cache_size;
	int rc = 0;
	char mempool_name[32];

	assert(cb_fn != NULL);

	g_init_cb_fn = cb_fn;
	g_init_cb_arg = cb_arg;

	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());

	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
				  SPDK_BDEV_IO_POOL_SIZE,
				  sizeof(struct spdk_bdev_io) +
				  spdk_bdev_module_get_max_ctx_size(),
				  0,
				  SPDK_ENV_SOCKET_ID_ANY);

	if (g_bdev_mgr.bdev_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	/**
	 * Ensure no more than half of the total buffers end up in local caches, by
	 * using spdk_env_get_core_count() to determine how many local caches we need
	 * to account for.
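	 * For example, with BUF_SMALL_POOL_SIZE = 8192 and 4 cores this gives
	 * 8192 / (2 * 4) = 1024 cached buffers per core, so at most
	 * 4 * 1024 = 4096 buffers (half the pool) can sit in per-core caches.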
579 */ 580 cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count()); 581 snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid()); 582 583 g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name, 584 BUF_SMALL_POOL_SIZE, 585 SPDK_BDEV_SMALL_BUF_MAX_SIZE + 512, 586 cache_size, 587 SPDK_ENV_SOCKET_ID_ANY); 588 if (!g_bdev_mgr.buf_small_pool) { 589 SPDK_ERRLOG("create rbuf small pool failed\n"); 590 spdk_bdev_init_complete(-1); 591 return; 592 } 593 594 cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count()); 595 snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid()); 596 597 g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name, 598 BUF_LARGE_POOL_SIZE, 599 SPDK_BDEV_LARGE_BUF_MAX_SIZE + 512, 600 cache_size, 601 SPDK_ENV_SOCKET_ID_ANY); 602 if (!g_bdev_mgr.buf_large_pool) { 603 SPDK_ERRLOG("create rbuf large pool failed\n"); 604 spdk_bdev_init_complete(-1); 605 return; 606 } 607 608 g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 609 NULL); 610 if (!g_bdev_mgr.zero_buffer) { 611 SPDK_ERRLOG("create bdev zero buffer failed\n"); 612 spdk_bdev_init_complete(-1); 613 return; 614 } 615 616 #ifdef SPDK_CONFIG_VTUNE 617 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 618 #endif 619 620 spdk_io_device_register(&g_bdev_mgr, spdk_bdev_mgmt_channel_create, 621 spdk_bdev_mgmt_channel_destroy, 622 sizeof(struct spdk_bdev_mgmt_channel)); 623 624 rc = spdk_bdev_modules_init(); 625 if (rc != 0) { 626 SPDK_ERRLOG("bdev modules init failed\n"); 627 spdk_bdev_init_complete(-1); 628 return; 629 } 630 631 spdk_bdev_module_action_complete(); 632 } 633 634 static void 635 spdk_bdev_module_finish_cb(void *io_device) 636 { 637 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 638 639 cb_fn(g_fini_cb_arg); 640 g_fini_cb_fn = NULL; 641 g_fini_cb_arg = NULL; 642 } 643 644 static void 645 spdk_bdev_module_finish_complete(struct spdk_io_channel_iter *i, int status) 646 { 647 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != SPDK_BDEV_IO_POOL_SIZE) { 648 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 649 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 650 SPDK_BDEV_IO_POOL_SIZE); 651 } 652 653 if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) { 654 SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n", 655 spdk_mempool_count(g_bdev_mgr.buf_small_pool), 656 BUF_SMALL_POOL_SIZE); 657 assert(false); 658 } 659 660 if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) { 661 SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n", 662 spdk_mempool_count(g_bdev_mgr.buf_large_pool), 663 BUF_LARGE_POOL_SIZE); 664 assert(false); 665 } 666 667 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 668 spdk_mempool_free(g_bdev_mgr.buf_small_pool); 669 spdk_mempool_free(g_bdev_mgr.buf_large_pool); 670 spdk_dma_free(g_bdev_mgr.zero_buffer); 671 672 spdk_io_device_unregister(&g_bdev_mgr, spdk_bdev_module_finish_cb); 673 } 674 675 static void 676 mgmt_channel_free_resources(struct spdk_io_channel_iter *i) 677 { 678 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 679 struct spdk_bdev_mgmt_channel *ch = spdk_io_channel_get_ctx(_ch); 680 681 spdk_bdev_mgmt_channel_free_resources(ch); 682 spdk_for_each_channel_continue(i, 0); 683 } 684 685 static void 686 spdk_bdev_module_finish_iter(void *arg) 687 { 688 /* Notice that this variable is static. It is saved between calls to 689 * this function. 
*/ 690 static struct spdk_bdev_module *resume_bdev_module = NULL; 691 struct spdk_bdev_module *bdev_module; 692 693 /* Start iterating from the last touched module */ 694 if (!resume_bdev_module) { 695 bdev_module = TAILQ_FIRST(&g_bdev_mgr.bdev_modules); 696 } else { 697 bdev_module = TAILQ_NEXT(resume_bdev_module, tailq); 698 } 699 700 while (bdev_module) { 701 if (bdev_module->async_fini) { 702 /* Save our place so we can resume later. We must 703 * save the variable here, before calling module_fini() 704 * below, because in some cases the module may immediately 705 * call spdk_bdev_module_finish_done() and re-enter 706 * this function to continue iterating. */ 707 resume_bdev_module = bdev_module; 708 } 709 710 if (bdev_module->module_fini) { 711 bdev_module->module_fini(); 712 } 713 714 if (bdev_module->async_fini) { 715 return; 716 } 717 718 bdev_module = TAILQ_NEXT(bdev_module, tailq); 719 } 720 721 resume_bdev_module = NULL; 722 spdk_for_each_channel(&g_bdev_mgr, mgmt_channel_free_resources, NULL, 723 spdk_bdev_module_finish_complete); 724 } 725 726 void 727 spdk_bdev_module_finish_done(void) 728 { 729 if (spdk_get_thread() != g_fini_thread) { 730 spdk_thread_send_msg(g_fini_thread, spdk_bdev_module_finish_iter, NULL); 731 } else { 732 spdk_bdev_module_finish_iter(NULL); 733 } 734 } 735 736 static void 737 _spdk_bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 738 { 739 struct spdk_bdev *bdev = cb_arg; 740 741 if (bdeverrno && bdev) { 742 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 743 bdev->name); 744 745 /* 746 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 747 * bdev; try to continue by manually removing this bdev from the list and continue 748 * with the next bdev in the list. 749 */ 750 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link); 751 } 752 753 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 754 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n"); 755 spdk_bdev_module_finish_iter(NULL); 756 return; 757 } 758 759 /* 760 * Unregister the first bdev in the list. 761 * 762 * spdk_bdev_unregister() will handle the case where the bdev has open descriptors by 763 * calling the remove_cb of the descriptors first. 764 * 765 * Once this bdev and all of its open descriptors have been cleaned up, this function 766 * will be called again via the unregister completion callback to continue the cleanup 767 * process with the next bdev. 
768 */ 769 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 770 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name); 771 spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev); 772 } 773 774 static void 775 _spdk_bdev_finish_unregister_bdevs(void) 776 { 777 _spdk_bdev_finish_unregister_bdevs_iter(NULL, 0); 778 } 779 780 void 781 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 782 { 783 assert(cb_fn != NULL); 784 785 g_fini_thread = spdk_get_thread(); 786 787 g_fini_cb_fn = cb_fn; 788 g_fini_cb_arg = cb_arg; 789 790 _spdk_bdev_finish_unregister_bdevs(); 791 } 792 793 static struct spdk_bdev_io * 794 spdk_bdev_get_io(struct spdk_io_channel *_ch) 795 { 796 struct spdk_bdev_mgmt_channel *ch = spdk_io_channel_get_ctx(_ch); 797 struct spdk_bdev_io *bdev_io; 798 799 if (ch->per_thread_cache_count > 0) { 800 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 801 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, buf_link); 802 ch->per_thread_cache_count--; 803 } else { 804 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 805 if (!bdev_io) { 806 SPDK_ERRLOG("Unable to get spdk_bdev_io\n"); 807 return NULL; 808 } 809 } 810 811 bdev_io->mgmt_ch = ch; 812 813 return bdev_io; 814 } 815 816 static void 817 spdk_bdev_put_io(struct spdk_bdev_io *bdev_io) 818 { 819 struct spdk_bdev_mgmt_channel *ch = bdev_io->mgmt_ch; 820 821 if (bdev_io->buf != NULL) { 822 spdk_bdev_io_put_buf(bdev_io); 823 } 824 825 if (ch->per_thread_cache_count < SPDK_BDEV_IO_CACHE_SIZE) { 826 ch->per_thread_cache_count++; 827 STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, buf_link); 828 } else { 829 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 830 } 831 } 832 833 static void 834 _spdk_bdev_qos_io_submit(void *ctx) 835 { 836 struct spdk_bdev_channel *ch = ctx; 837 struct spdk_bdev_io *bdev_io = NULL; 838 struct spdk_bdev *bdev = ch->bdev; 839 struct spdk_bdev_module_channel *shared_ch = ch->module_ch; 840 841 while (!TAILQ_EMPTY(&ch->qos_io)) { 842 if (ch->io_submitted_this_timeslice < ch->qos_max_ios_per_timeslice) { 843 bdev_io = TAILQ_FIRST(&ch->qos_io); 844 TAILQ_REMOVE(&ch->qos_io, bdev_io, link); 845 ch->io_submitted_this_timeslice++; 846 shared_ch->io_outstanding++; 847 bdev->fn_table->submit_request(ch->channel, bdev_io); 848 } else { 849 break; 850 } 851 } 852 } 853 854 static void 855 _spdk_bdev_io_submit(void *ctx) 856 { 857 struct spdk_bdev_io *bdev_io = ctx; 858 struct spdk_bdev *bdev = bdev_io->bdev; 859 struct spdk_bdev_channel *bdev_ch = bdev_io->ch; 860 struct spdk_io_channel *ch = bdev_ch->channel; 861 struct spdk_bdev_module_channel *shared_ch = bdev_ch->module_ch; 862 863 bdev_io->submit_tsc = spdk_get_ticks(); 864 shared_ch->io_outstanding++; 865 bdev_io->in_submit_request = true; 866 if (spdk_likely(bdev_ch->flags == 0)) { 867 if (spdk_likely(TAILQ_EMPTY(&shared_ch->nomem_io))) { 868 bdev->fn_table->submit_request(ch, bdev_io); 869 } else { 870 shared_ch->io_outstanding--; 871 TAILQ_INSERT_TAIL(&shared_ch->nomem_io, bdev_io, link); 872 } 873 } else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 874 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 875 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 876 shared_ch->io_outstanding--; 877 TAILQ_INSERT_TAIL(&bdev_ch->qos_io, bdev_io, link); 878 _spdk_bdev_qos_io_submit(bdev_ch); 879 } else { 880 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 881 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 882 } 883 bdev_io->in_submit_request = false; 884 } 885 886 static void 887 
spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io) 888 { 889 struct spdk_bdev *bdev = bdev_io->bdev; 890 891 assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING); 892 893 /* QoS channel and thread have been properly configured */ 894 if (bdev->ios_per_sec > 0 && bdev->qos_channel && bdev->qos_thread) { 895 bdev_io->io_submit_ch = bdev_io->ch; 896 bdev_io->ch = bdev->qos_channel; 897 spdk_thread_send_msg(bdev->qos_thread, _spdk_bdev_io_submit, bdev_io); 898 } else { 899 _spdk_bdev_io_submit(bdev_io); 900 } 901 } 902 903 static void 904 spdk_bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 905 { 906 struct spdk_bdev *bdev = bdev_io->bdev; 907 struct spdk_bdev_channel *bdev_ch = bdev_io->ch; 908 struct spdk_io_channel *ch = bdev_ch->channel; 909 910 assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING); 911 912 bdev_io->in_submit_request = true; 913 bdev->fn_table->submit_request(ch, bdev_io); 914 bdev_io->in_submit_request = false; 915 } 916 917 static void 918 spdk_bdev_io_init(struct spdk_bdev_io *bdev_io, 919 struct spdk_bdev *bdev, void *cb_arg, 920 spdk_bdev_io_completion_cb cb) 921 { 922 bdev_io->bdev = bdev; 923 bdev_io->caller_ctx = cb_arg; 924 bdev_io->cb = cb; 925 bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING; 926 bdev_io->in_submit_request = false; 927 bdev_io->buf = NULL; 928 bdev_io->io_submit_ch = NULL; 929 } 930 931 bool 932 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 933 { 934 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 935 } 936 937 int 938 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 939 { 940 if (bdev->fn_table->dump_info_json) { 941 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 942 } 943 944 return 0; 945 } 946 947 static void 948 spdk_bdev_qos_get_max_ios_per_timeslice(struct spdk_bdev *bdev) 949 { 950 uint64_t qos_max_ios_per_timeslice = 0; 951 952 qos_max_ios_per_timeslice = bdev->ios_per_sec * SPDK_BDEV_QOS_TIMESLICE_IN_USEC / 953 SPDK_BDEV_SEC_TO_USEC; 954 bdev->qos_channel->qos_max_ios_per_timeslice = spdk_max(qos_max_ios_per_timeslice, 955 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE); 956 } 957 958 static int 959 spdk_bdev_channel_poll_qos(void *arg) 960 { 961 struct spdk_bdev_channel *ch = arg; 962 struct spdk_bdev *bdev = ch->bdev; 963 964 /* Reset for next round of rate limiting */ 965 ch->io_submitted_this_timeslice = 0; 966 spdk_bdev_qos_get_max_ios_per_timeslice(bdev); 967 968 _spdk_bdev_qos_io_submit(ch); 969 970 return -1; 971 } 972 973 static int 974 _spdk_bdev_channel_create(struct spdk_bdev_channel *ch, void *io_device) 975 { 976 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 977 struct spdk_bdev_mgmt_channel *mgmt_ch; 978 struct spdk_bdev_module_channel *shared_ch; 979 980 ch->bdev = bdev; 981 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 982 if (!ch->channel) { 983 return -1; 984 } 985 986 ch->mgmt_channel = spdk_get_io_channel(&g_bdev_mgr); 987 if (!ch->mgmt_channel) { 988 return -1; 989 } 990 991 mgmt_ch = spdk_io_channel_get_ctx(ch->mgmt_channel); 992 TAILQ_FOREACH(shared_ch, &mgmt_ch->module_channels, link) { 993 if (shared_ch->module_ch == ch->channel) { 994 shared_ch->ref++; 995 break; 996 } 997 } 998 999 if (shared_ch == NULL) { 1000 shared_ch = calloc(1, sizeof(*shared_ch)); 1001 if (!shared_ch) { 1002 return -1; 1003 } 1004 1005 shared_ch->io_outstanding = 0; 1006 TAILQ_INIT(&shared_ch->nomem_io); 1007 shared_ch->nomem_threshold = 0; 1008 shared_ch->module_ch = ch->channel; 1009 shared_ch->ref = 1; 1010 
TAILQ_INSERT_TAIL(&mgmt_ch->module_channels, shared_ch, link); 1011 } 1012 1013 memset(&ch->stat, 0, sizeof(ch->stat)); 1014 TAILQ_INIT(&ch->queued_resets); 1015 TAILQ_INIT(&ch->qos_io); 1016 ch->qos_max_ios_per_timeslice = 0; 1017 ch->io_submitted_this_timeslice = 0; 1018 ch->qos_poller = NULL; 1019 ch->flags = 0; 1020 ch->module_ch = shared_ch; 1021 1022 return 0; 1023 } 1024 1025 static void 1026 _spdk_bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 1027 { 1028 struct spdk_bdev_mgmt_channel *mgmt_channel; 1029 struct spdk_bdev_module_channel *shared_ch = NULL; 1030 1031 if (!ch) { 1032 return; 1033 } 1034 1035 if (ch->channel) { 1036 spdk_put_io_channel(ch->channel); 1037 } 1038 1039 if (ch->mgmt_channel) { 1040 shared_ch = ch->module_ch; 1041 if (shared_ch) { 1042 assert(shared_ch->ref > 0); 1043 shared_ch->ref--; 1044 if (shared_ch->ref == 0) { 1045 mgmt_channel = spdk_io_channel_get_ctx(ch->mgmt_channel); 1046 assert(shared_ch->io_outstanding == 0); 1047 TAILQ_REMOVE(&mgmt_channel->module_channels, shared_ch, link); 1048 free(shared_ch); 1049 } 1050 } 1051 spdk_put_io_channel(ch->mgmt_channel); 1052 } 1053 } 1054 1055 /* Caller must hold bdev->mutex. */ 1056 static int 1057 spdk_bdev_qos_channel_create(struct spdk_bdev *bdev) 1058 { 1059 assert(bdev->qos_channel == NULL); 1060 assert(bdev->qos_thread == NULL); 1061 1062 bdev->qos_channel = calloc(1, sizeof(struct spdk_bdev_channel)); 1063 if (!bdev->qos_channel) { 1064 return -1; 1065 } 1066 1067 bdev->qos_thread = spdk_get_thread(); 1068 if (!bdev->qos_thread) { 1069 free(bdev->qos_channel); 1070 bdev->qos_channel = NULL; 1071 return -1; 1072 } 1073 1074 if (_spdk_bdev_channel_create(bdev->qos_channel, __bdev_to_io_dev(bdev)) != 0) { 1075 free(bdev->qos_channel); 1076 bdev->qos_channel = NULL; 1077 bdev->qos_thread = NULL; 1078 return -1; 1079 } 1080 1081 bdev->qos_channel->flags |= BDEV_CH_QOS_ENABLED; 1082 spdk_bdev_qos_get_max_ios_per_timeslice(bdev); 1083 bdev->qos_channel->qos_poller = spdk_poller_register( 1084 spdk_bdev_channel_poll_qos, 1085 bdev->qos_channel, 1086 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 1087 1088 return 0; 1089 } 1090 1091 static int 1092 spdk_bdev_channel_create(void *io_device, void *ctx_buf) 1093 { 1094 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 1095 struct spdk_bdev_channel *ch = ctx_buf; 1096 1097 if (_spdk_bdev_channel_create(ch, io_device) != 0) { 1098 _spdk_bdev_channel_destroy_resource(ch); 1099 return -1; 1100 } 1101 1102 #ifdef SPDK_CONFIG_VTUNE 1103 { 1104 char *name; 1105 __itt_init_ittlib(NULL, 0); 1106 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 1107 if (!name) { 1108 _spdk_bdev_channel_destroy_resource(ch); 1109 return -1; 1110 } 1111 ch->handle = __itt_string_handle_create(name); 1112 free(name); 1113 ch->start_tsc = spdk_get_ticks(); 1114 ch->interval_tsc = spdk_get_ticks_hz() / 100; 1115 } 1116 #endif 1117 1118 pthread_mutex_lock(&bdev->mutex); 1119 1120 /* Rate limiting on this bdev enabled */ 1121 if (bdev->ios_per_sec > 0 && bdev->qos_channel == NULL) { 1122 if (spdk_bdev_qos_channel_create(bdev) != 0) { 1123 _spdk_bdev_channel_destroy_resource(ch); 1124 pthread_mutex_unlock(&bdev->mutex); 1125 return -1; 1126 } 1127 } 1128 1129 bdev->channel_count++; 1130 1131 pthread_mutex_unlock(&bdev->mutex); 1132 1133 return 0; 1134 } 1135 1136 /* 1137 * Abort I/O that are waiting on a data buffer. These types of I/O are 1138 * linked using the spdk_bdev_io buf_link TAILQ_ENTRY. 
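 * I/O queued for submission (queued_resets, qos_io, nomem_io) use the separate
 * `link` entry instead and are aborted by _spdk_bdev_abort_queued_io() below.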
1139 */ 1140 static void 1141 _spdk_bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 1142 { 1143 bdev_io_stailq_t tmp; 1144 struct spdk_bdev_io *bdev_io; 1145 1146 STAILQ_INIT(&tmp); 1147 1148 while (!STAILQ_EMPTY(queue)) { 1149 bdev_io = STAILQ_FIRST(queue); 1150 STAILQ_REMOVE_HEAD(queue, buf_link); 1151 if (bdev_io->ch == ch) { 1152 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 1153 } else { 1154 STAILQ_INSERT_TAIL(&tmp, bdev_io, buf_link); 1155 } 1156 } 1157 1158 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 1159 } 1160 1161 /* 1162 * Abort I/O that are queued waiting for submission. These types of I/O are 1163 * linked using the spdk_bdev_io link TAILQ_ENTRY. 1164 */ 1165 static void 1166 _spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 1167 { 1168 struct spdk_bdev_io *bdev_io, *tmp; 1169 1170 TAILQ_FOREACH_SAFE(bdev_io, queue, link, tmp) { 1171 if (bdev_io->ch == ch) { 1172 TAILQ_REMOVE(queue, bdev_io, link); 1173 /* 1174 * spdk_bdev_io_complete() assumes that the completed I/O had 1175 * been submitted to the bdev module. Since in this case it 1176 * hadn't, bump io_outstanding to account for the decrement 1177 * that spdk_bdev_io_complete() will do. 1178 */ 1179 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 1180 ch->module_ch->io_outstanding++; 1181 } 1182 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 1183 } 1184 } 1185 } 1186 1187 static void 1188 _spdk_bdev_channel_destroy(struct spdk_bdev_channel *ch) 1189 { 1190 struct spdk_bdev_mgmt_channel *mgmt_channel; 1191 struct spdk_bdev_module_channel *shared_ch = ch->module_ch; 1192 1193 mgmt_channel = spdk_io_channel_get_ctx(ch->mgmt_channel); 1194 1195 _spdk_bdev_abort_queued_io(&ch->queued_resets, ch); 1196 _spdk_bdev_abort_queued_io(&ch->qos_io, ch); 1197 _spdk_bdev_abort_queued_io(&shared_ch->nomem_io, ch); 1198 _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, ch); 1199 _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, ch); 1200 1201 _spdk_bdev_channel_destroy_resource(ch); 1202 } 1203 1204 static void 1205 spdk_bdev_qos_channel_destroy(void *ctx) 1206 { 1207 struct spdk_bdev_channel *qos_channel = ctx; 1208 1209 _spdk_bdev_channel_destroy(qos_channel); 1210 1211 spdk_poller_unregister(&qos_channel->qos_poller); 1212 free(qos_channel); 1213 } 1214 1215 static void 1216 spdk_bdev_channel_destroy(void *io_device, void *ctx_buf) 1217 { 1218 struct spdk_bdev_channel *ch = ctx_buf; 1219 struct spdk_bdev *bdev = ch->bdev; 1220 1221 _spdk_bdev_channel_destroy(ch); 1222 1223 pthread_mutex_lock(&bdev->mutex); 1224 bdev->channel_count--; 1225 if (bdev->channel_count == 0 && bdev->qos_channel != NULL) { 1226 /* All I/O channels for this bdev have been destroyed - destroy the QoS channel. */ 1227 spdk_thread_send_msg(bdev->qos_thread, spdk_bdev_qos_channel_destroy, 1228 bdev->qos_channel); 1229 1230 /* 1231 * Set qos_channel to NULL within the critical section so that 1232 * if another channel is created, it will see qos_channel == NULL and 1233 * re-create the QoS channel even if the asynchronous qos_channel_destroy 1234 * isn't finished yet. 
1235 */ 1236 bdev->qos_channel = NULL; 1237 bdev->qos_thread = NULL; 1238 } 1239 pthread_mutex_unlock(&bdev->mutex); 1240 } 1241 1242 int 1243 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 1244 { 1245 struct spdk_bdev_alias *tmp; 1246 1247 if (alias == NULL) { 1248 SPDK_ERRLOG("Empty alias passed\n"); 1249 return -EINVAL; 1250 } 1251 1252 if (spdk_bdev_get_by_name(alias)) { 1253 SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias); 1254 return -EEXIST; 1255 } 1256 1257 tmp = calloc(1, sizeof(*tmp)); 1258 if (tmp == NULL) { 1259 SPDK_ERRLOG("Unable to allocate alias\n"); 1260 return -ENOMEM; 1261 } 1262 1263 tmp->alias = strdup(alias); 1264 if (tmp->alias == NULL) { 1265 free(tmp); 1266 SPDK_ERRLOG("Unable to allocate alias\n"); 1267 return -ENOMEM; 1268 } 1269 1270 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 1271 1272 return 0; 1273 } 1274 1275 int 1276 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 1277 { 1278 struct spdk_bdev_alias *tmp; 1279 1280 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 1281 if (strcmp(alias, tmp->alias) == 0) { 1282 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 1283 free(tmp->alias); 1284 free(tmp); 1285 return 0; 1286 } 1287 } 1288 1289 SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exists\n", alias); 1290 1291 return -ENOENT; 1292 } 1293 1294 struct spdk_io_channel * 1295 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 1296 { 1297 return spdk_get_io_channel(__bdev_to_io_dev(desc->bdev)); 1298 } 1299 1300 const char * 1301 spdk_bdev_get_name(const struct spdk_bdev *bdev) 1302 { 1303 return bdev->name; 1304 } 1305 1306 const char * 1307 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 1308 { 1309 return bdev->product_name; 1310 } 1311 1312 const struct spdk_bdev_aliases_list * 1313 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 1314 { 1315 return &bdev->aliases; 1316 } 1317 1318 uint32_t 1319 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 1320 { 1321 return bdev->blocklen; 1322 } 1323 1324 uint64_t 1325 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 1326 { 1327 return bdev->blockcnt; 1328 } 1329 1330 size_t 1331 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 1332 { 1333 /* TODO: push this logic down to the bdev modules */ 1334 if (bdev->need_aligned_buffer) { 1335 return bdev->blocklen; 1336 } 1337 1338 return 1; 1339 } 1340 1341 uint32_t 1342 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 1343 { 1344 return bdev->optimal_io_boundary; 1345 } 1346 1347 bool 1348 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 1349 { 1350 return bdev->write_cache; 1351 } 1352 1353 const struct spdk_uuid * 1354 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 1355 { 1356 return &bdev->uuid; 1357 } 1358 1359 int 1360 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 1361 { 1362 int ret; 1363 1364 pthread_mutex_lock(&bdev->mutex); 1365 1366 /* bdev has open descriptors */ 1367 if (!TAILQ_EMPTY(&bdev->open_descs) && 1368 bdev->blockcnt > size) { 1369 ret = -EBUSY; 1370 } else { 1371 bdev->blockcnt = size; 1372 ret = 0; 1373 } 1374 1375 pthread_mutex_unlock(&bdev->mutex); 1376 1377 return ret; 1378 } 1379 1380 /* 1381 * Convert I/O offset and length from bytes to blocks. 1382 * 1383 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 
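 *
 * For example, with a 512-byte block size an offset of 4096 bytes maps to
 * offset_blocks = 8; an offset of 4097 bytes leaves a remainder of 1, so the
 * return value is non-zero and callers such as spdk_bdev_read() reject the
 * request with -EINVAL.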
1384 */ 1385 static uint64_t 1386 spdk_bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 1387 uint64_t num_bytes, uint64_t *num_blocks) 1388 { 1389 uint32_t block_size = bdev->blocklen; 1390 1391 *offset_blocks = offset_bytes / block_size; 1392 *num_blocks = num_bytes / block_size; 1393 1394 return (offset_bytes % block_size) | (num_bytes % block_size); 1395 } 1396 1397 static bool 1398 spdk_bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 1399 { 1400 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 1401 * has been an overflow and hence the offset has been wrapped around */ 1402 if (offset_blocks + num_blocks < offset_blocks) { 1403 return false; 1404 } 1405 1406 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 1407 if (offset_blocks + num_blocks > bdev->blockcnt) { 1408 return false; 1409 } 1410 1411 return true; 1412 } 1413 1414 int 1415 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1416 void *buf, uint64_t offset, uint64_t nbytes, 1417 spdk_bdev_io_completion_cb cb, void *cb_arg) 1418 { 1419 uint64_t offset_blocks, num_blocks; 1420 1421 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 1422 return -EINVAL; 1423 } 1424 1425 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 1426 } 1427 1428 int 1429 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1430 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 1431 spdk_bdev_io_completion_cb cb, void *cb_arg) 1432 { 1433 struct spdk_bdev *bdev = desc->bdev; 1434 struct spdk_bdev_io *bdev_io; 1435 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 1436 1437 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 1438 return -EINVAL; 1439 } 1440 1441 bdev_io = spdk_bdev_get_io(channel->mgmt_channel); 1442 if (!bdev_io) { 1443 SPDK_ERRLOG("spdk_bdev_io memory allocation failed duing read\n"); 1444 return -ENOMEM; 1445 } 1446 1447 bdev_io->ch = channel; 1448 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 1449 bdev_io->u.bdev.iov.iov_base = buf; 1450 bdev_io->u.bdev.iov.iov_len = num_blocks * bdev->blocklen; 1451 bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov; 1452 bdev_io->u.bdev.iovcnt = 1; 1453 bdev_io->u.bdev.num_blocks = num_blocks; 1454 bdev_io->u.bdev.offset_blocks = offset_blocks; 1455 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 1456 1457 spdk_bdev_io_submit(bdev_io); 1458 return 0; 1459 } 1460 1461 int 1462 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1463 struct iovec *iov, int iovcnt, 1464 uint64_t offset, uint64_t nbytes, 1465 spdk_bdev_io_completion_cb cb, void *cb_arg) 1466 { 1467 uint64_t offset_blocks, num_blocks; 1468 1469 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 1470 return -EINVAL; 1471 } 1472 1473 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 1474 } 1475 1476 int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1477 struct iovec *iov, int iovcnt, 1478 uint64_t offset_blocks, uint64_t num_blocks, 1479 spdk_bdev_io_completion_cb cb, void *cb_arg) 1480 { 1481 struct spdk_bdev *bdev = desc->bdev; 1482 struct spdk_bdev_io *bdev_io; 1483 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 1484 1485 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, 
num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during read\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.bdev.iovs = iov;
	bdev_io->u.bdev.iovcnt = iovcnt;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		void *buf, uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       void *buf, uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during write\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.bdev.iov.iov_base = buf;
	bdev_io->u.bdev.iov.iov_len = num_blocks * bdev->blocklen;
	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		 struct iovec *iov, int iovcnt,
		 uint64_t offset, uint64_t len,
		 spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			struct iovec *iov, int iovcnt,
			uint64_t offset_blocks, uint64_t num_blocks,
			spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during writev\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type =
SPDK_BDEV_IO_TYPE_WRITE; 1599 bdev_io->u.bdev.iovs = iov; 1600 bdev_io->u.bdev.iovcnt = iovcnt; 1601 bdev_io->u.bdev.num_blocks = num_blocks; 1602 bdev_io->u.bdev.offset_blocks = offset_blocks; 1603 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 1604 1605 spdk_bdev_io_submit(bdev_io); 1606 return 0; 1607 } 1608 1609 int 1610 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1611 uint64_t offset, uint64_t len, 1612 spdk_bdev_io_completion_cb cb, void *cb_arg) 1613 { 1614 uint64_t offset_blocks, num_blocks; 1615 1616 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) { 1617 return -EINVAL; 1618 } 1619 1620 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 1621 } 1622 1623 int 1624 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1625 uint64_t offset_blocks, uint64_t num_blocks, 1626 spdk_bdev_io_completion_cb cb, void *cb_arg) 1627 { 1628 struct spdk_bdev *bdev = desc->bdev; 1629 struct spdk_bdev_io *bdev_io; 1630 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 1631 uint64_t len; 1632 bool split_request = false; 1633 1634 if (num_blocks > UINT64_MAX / spdk_bdev_get_block_size(bdev)) { 1635 SPDK_ERRLOG("length argument out of range in write_zeroes\n"); 1636 return -ERANGE; 1637 } 1638 1639 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 1640 return -EINVAL; 1641 } 1642 1643 bdev_io = spdk_bdev_get_io(channel->mgmt_channel); 1644 1645 if (!bdev_io) { 1646 SPDK_ERRLOG("bdev_io memory allocation failed duing write_zeroes\n"); 1647 return -ENOMEM; 1648 } 1649 1650 bdev_io->ch = channel; 1651 bdev_io->u.bdev.offset_blocks = offset_blocks; 1652 1653 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 1654 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 1655 bdev_io->u.bdev.num_blocks = num_blocks; 1656 bdev_io->u.bdev.iovs = NULL; 1657 bdev_io->u.bdev.iovcnt = 0; 1658 1659 } else { 1660 assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE); 1661 1662 len = spdk_bdev_get_block_size(bdev) * num_blocks; 1663 1664 if (len > ZERO_BUFFER_SIZE) { 1665 split_request = true; 1666 len = ZERO_BUFFER_SIZE; 1667 } 1668 1669 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 1670 bdev_io->u.bdev.iov.iov_base = g_bdev_mgr.zero_buffer; 1671 bdev_io->u.bdev.iov.iov_len = len; 1672 bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov; 1673 bdev_io->u.bdev.iovcnt = 1; 1674 bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev); 1675 bdev_io->split_remaining_num_blocks = num_blocks - bdev_io->u.bdev.num_blocks; 1676 bdev_io->split_current_offset_blocks = offset_blocks + bdev_io->u.bdev.num_blocks; 1677 } 1678 1679 if (split_request) { 1680 bdev_io->stored_user_cb = cb; 1681 spdk_bdev_io_init(bdev_io, bdev, cb_arg, spdk_bdev_write_zeroes_split); 1682 } else { 1683 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 1684 } 1685 spdk_bdev_io_submit(bdev_io); 1686 return 0; 1687 } 1688 1689 int 1690 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1691 uint64_t offset, uint64_t nbytes, 1692 spdk_bdev_io_completion_cb cb, void *cb_arg) 1693 { 1694 uint64_t offset_blocks, num_blocks; 1695 1696 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 1697 return -EINVAL; 1698 } 1699 1700 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 1701 } 1702 1703 int 1704 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1705 
uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	if (num_blocks == 0) {
		SPDK_ERRLOG("Can't unmap 0 blocks\n");
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during unmap\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
	bdev_io->u.bdev.iov.iov_base = NULL;
	bdev_io->u.bdev.iov.iov_len = 0;
	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		uint64_t offset, uint64_t length,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, length, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during flush\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
	bdev_io->u.bdev.iovs = NULL;
	bdev_io->u.bdev.iovcnt = 0;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

static void
_spdk_bdev_reset_dev(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev_io *bdev_io;

	bdev_io = TAILQ_FIRST(&ch->queued_resets);
	TAILQ_REMOVE(&ch->queued_resets, bdev_io, link);
	spdk_bdev_io_submit_reset(bdev_io);
}

static void
_spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch;
	struct spdk_bdev_channel *channel;
	struct spdk_bdev_mgmt_channel *mgmt_channel;
	struct spdk_bdev_module_channel *shared_ch;

	ch = spdk_io_channel_iter_get_channel(i);
	channel = spdk_io_channel_get_ctx(ch);
	mgmt_channel = spdk_io_channel_get_ctx(channel->mgmt_channel);
	shared_ch = channel->module_ch;

	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;

_spdk_bdev_abort_queued_io(&shared_ch->nomem_io, channel); 1821 _spdk_bdev_abort_queued_io(&channel->qos_io, channel); 1822 _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel); 1823 _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel); 1824 1825 spdk_for_each_channel_continue(i, 0); 1826 } 1827 1828 static void 1829 _spdk_bdev_start_reset(void *ctx) 1830 { 1831 struct spdk_bdev_channel *ch = ctx; 1832 1833 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), _spdk_bdev_reset_freeze_channel, 1834 ch, _spdk_bdev_reset_dev); 1835 } 1836 1837 static void 1838 _spdk_bdev_channel_start_reset(struct spdk_bdev_channel *ch) 1839 { 1840 struct spdk_bdev *bdev = ch->bdev; 1841 1842 assert(!TAILQ_EMPTY(&ch->queued_resets)); 1843 1844 pthread_mutex_lock(&bdev->mutex); 1845 if (bdev->reset_in_progress == NULL) { 1846 bdev->reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 1847 /* 1848 * Take a channel reference for the target bdev for the life of this 1849 * reset. This guards against the channel getting destroyed while 1850 * spdk_for_each_channel() calls related to this reset IO are in 1851 * progress. We will release the reference when this reset is 1852 * completed. 1853 */ 1854 bdev->reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 1855 _spdk_bdev_start_reset(ch); 1856 } 1857 pthread_mutex_unlock(&bdev->mutex); 1858 } 1859 1860 int 1861 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1862 spdk_bdev_io_completion_cb cb, void *cb_arg) 1863 { 1864 struct spdk_bdev *bdev = desc->bdev; 1865 struct spdk_bdev_io *bdev_io; 1866 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 1867 1868 bdev_io = spdk_bdev_get_io(channel->mgmt_channel); 1869 if (!bdev_io) { 1870 SPDK_ERRLOG("bdev_io memory allocation failed duing reset\n"); 1871 return -ENOMEM; 1872 } 1873 1874 bdev_io->ch = channel; 1875 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 1876 bdev_io->u.reset.ch_ref = NULL; 1877 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 1878 1879 pthread_mutex_lock(&bdev->mutex); 1880 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, link); 1881 pthread_mutex_unlock(&bdev->mutex); 1882 1883 _spdk_bdev_channel_start_reset(channel); 1884 1885 return 0; 1886 } 1887 1888 void 1889 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 1890 struct spdk_bdev_io_stat *stat) 1891 { 1892 #ifdef SPDK_CONFIG_VTUNE 1893 SPDK_ERRLOG("Calling spdk_bdev_get_io_stat is not allowed when VTune integration is enabled.\n"); 1894 memset(stat, 0, sizeof(*stat)); 1895 return; 1896 #endif 1897 1898 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 1899 1900 channel->stat.ticks_rate = spdk_get_ticks_hz(); 1901 *stat = channel->stat; 1902 memset(&channel->stat, 0, sizeof(channel->stat)); 1903 } 1904 1905 int 1906 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1907 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 1908 spdk_bdev_io_completion_cb cb, void *cb_arg) 1909 { 1910 struct spdk_bdev *bdev = desc->bdev; 1911 struct spdk_bdev_io *bdev_io; 1912 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 1913 1914 if (!desc->write) { 1915 return -EBADF; 1916 } 1917 1918 bdev_io = spdk_bdev_get_io(channel->mgmt_channel); 1919 if (!bdev_io) { 1920 SPDK_ERRLOG("bdev_io memory allocation failed during nvme_admin_passthru\n"); 1921 return -ENOMEM; 1922 } 1923 1924 bdev_io->ch = channel; 1925 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 1926 bdev_io->u.nvme_passthru.cmd = 
*cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;
	bdev_io->u.nvme_passthru.md_buf = NULL;
	bdev_io->u.nvme_passthru.md_len = 0;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			   const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
			   spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		/*
		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
		 * to easily determine if the command is a read or write, but for now just
		 * do not allow io_passthru with a read-only descriptor.
		 */
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_io_passthru\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;
	bdev_io->u.nvme_passthru.md_buf = NULL;
	bdev_io->u.nvme_passthru.md_len = 0;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		/*
		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
		 * to easily determine if the command is a read or write, but for now just
		 * do not allow io_passthru with a read-only descriptor.
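		 * (Whether a descriptor allows writes is fixed when it is opened;
		 * see the `write` member of struct spdk_bdev_desc near the top of
		 * this file.)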
int
spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
			      void *md_buf, size_t md_len,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		/*
		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
		 * to easily determine if the command is a read or write, but for now just
		 * do not allow io_passthru with a read-only descriptor.
		 */
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io(channel->mgmt_channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_io_passthru_md\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;
	bdev_io->u.nvme_passthru.md_buf = md_buf;
	bdev_io->u.nvme_passthru.md_len = md_len;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io is NULL\n");
		return -1;
	}

	if (bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING) {
		SPDK_ERRLOG("bdev_io is in pending state\n");
		assert(false);
		return -1;
	}

	spdk_bdev_put_io(bdev_io);

	return 0;
}

static void
_spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev *bdev = bdev_ch->bdev;
	struct spdk_bdev_module_channel *shared_ch = bdev_ch->module_ch;
	struct spdk_bdev_io *bdev_io;

	if (shared_ch->io_outstanding > shared_ch->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller. Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_ch->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_ch->nomem_io);
		TAILQ_REMOVE(&shared_ch->nomem_io, bdev_io, link);
		shared_ch->io_outstanding++;
		bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev->fn_table->submit_request(bdev_io->ch->channel, bdev_io);
		if (bdev_io->status == SPDK_BDEV_IO_STATUS_NOMEM) {
			break;
		}
	}
}

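/*
 * Note on the NOMEM retry threshold (illustrative numbers, using the
 * NOMEM_THRESHOLD_COUNT of 8 defined at the top of this file): when an I/O
 * completes with SPDK_BDEV_IO_STATUS_NOMEM, nomem_threshold is set to
 * max(io_outstanding / 2, io_outstanding - NOMEM_THRESHOLD_COUNT). For
 * example, with 100 I/O outstanding the nomem_io queue is retried once
 * outstanding I/O drops to 92 (100 - 8); on a shallow channel with only 4
 * outstanding, it instead waits until half (2) have completed.
 */
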
static void
_spdk_bdev_qos_io_complete(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;

	bdev_io->cb(bdev_io, bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS, bdev_io->caller_ctx);
}

static void
_spdk_bdev_io_complete(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;

	assert(bdev_io->cb != NULL);

	if (bdev_io->io_submit_ch) {
		bdev_io->ch = bdev_io->io_submit_ch;
		bdev_io->io_submit_ch = NULL;
		spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->ch->channel),
				     _spdk_bdev_qos_io_complete, bdev_io);
	} else {
		bdev_io->cb(bdev_io, bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS,
			    bdev_io->caller_ctx);
	}
}

static void
_spdk_bdev_reset_complete(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i);

	if (bdev_io->u.reset.ch_ref != NULL) {
		spdk_put_io_channel(bdev_io->u.reset.ch_ref);
		bdev_io->u.reset.ch_ref = NULL;
	}

	_spdk_bdev_io_complete(bdev_io);
}

static void
_spdk_bdev_unfreeze_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);

	ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS;
	if (!TAILQ_EMPTY(&ch->queued_resets)) {
		_spdk_bdev_channel_start_reset(ch);
	}

	spdk_for_each_channel_continue(i, 0);
}

void
spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
	struct spdk_bdev_module_channel *shared_ch = bdev_ch->module_ch;

	bdev_io->status = status;

	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
		bool unlock_channels = false;

		if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
			SPDK_ERRLOG("NOMEM returned for reset\n");
		}
		pthread_mutex_lock(&bdev->mutex);
		if (bdev_io == bdev->reset_in_progress) {
			bdev->reset_in_progress = NULL;
			unlock_channels = true;
		}
		pthread_mutex_unlock(&bdev->mutex);

		if (unlock_channels) {
			spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_unfreeze_channel,
					      bdev_io, _spdk_bdev_reset_complete);
			return;
		}
	} else {
		assert(shared_ch->io_outstanding > 0);
		shared_ch->io_outstanding--;
		if (spdk_likely(status != SPDK_BDEV_IO_STATUS_NOMEM)) {
			if (spdk_unlikely(!TAILQ_EMPTY(&shared_ch->nomem_io))) {
				_spdk_bdev_ch_retry_io(bdev_ch);
			}
		} else {
			TAILQ_INSERT_HEAD(&shared_ch->nomem_io, bdev_io, link);
			/*
			 * Wait for some of the outstanding I/O to complete before we
			 * retry any of the nomem_io. Normally we will wait for
			 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
			 * depth channels we will instead wait for half to complete.
			 */
			shared_ch->nomem_threshold = spdk_max((int64_t)shared_ch->io_outstanding / 2,
							      (int64_t)shared_ch->io_outstanding - NOMEM_THRESHOLD_COUNT);
			return;
		}
	}

	if (status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_READ:
			bdev_ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev->blocklen;
			bdev_ch->stat.num_read_ops++;
			bdev_ch->stat.read_latency_ticks += (spdk_get_ticks() - bdev_io->submit_tsc);
			break;
		case SPDK_BDEV_IO_TYPE_WRITE:
			bdev_ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev->blocklen;
			bdev_ch->stat.num_write_ops++;
			bdev_ch->stat.write_latency_ticks += (spdk_get_ticks() - bdev_io->submit_tsc);
			break;
		default:
			break;
		}
	}

#ifdef SPDK_CONFIG_VTUNE
	uint64_t now_tsc = spdk_get_ticks();
	if (now_tsc > (bdev_ch->start_tsc + bdev_ch->interval_tsc)) {
		uint64_t data[5];

		data[0] = bdev_ch->stat.num_read_ops;
		data[1] = bdev_ch->stat.bytes_read;
		data[2] = bdev_ch->stat.num_write_ops;
		data[3] = bdev_ch->stat.bytes_written;
		data[4] = bdev->fn_table->get_spin_time ?
			  bdev->fn_table->get_spin_time(bdev_ch->channel) : 0;

		__itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_ch->handle,
				   __itt_metadata_u64, 5, data);

		memset(&bdev_ch->stat, 0, sizeof(bdev_ch->stat));
		bdev_ch->start_tsc = now_tsc;
	}
#endif

	if (bdev_io->in_submit_request) {
		/*
		 * Defer completion to avoid potential infinite recursion if the
		 * user's completion callback issues a new I/O.
		 */
		spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_ch->channel),
				     _spdk_bdev_io_complete, bdev_io);
	} else {
		_spdk_bdev_io_complete(bdev_io);
	}
}

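/*
 * Module-side usage sketch (illustrative only): a backend's completion path
 * normally calls spdk_bdev_io_complete() once the request it submitted has
 * finished, and reports SPDK_BDEV_IO_STATUS_NOMEM when it is temporarily out
 * of internal resources so the I/O is parked on nomem_io and retried later.
 * "my_request_done" is an assumed example name, not part of this file.
 *
 *	static void
 *	my_request_done(void *ctx, int rc)
 *	{
 *		struct spdk_bdev_io *bdev_io = ctx;
 *
 *		spdk_bdev_io_complete(bdev_io, rc == 0 ?
 *				      SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED);
 *	}
 */
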
void
spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
				  enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
{
	if (sc == SPDK_SCSI_STATUS_GOOD) {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
		bdev_io->error.scsi.sc = sc;
		bdev_io->error.scsi.sk = sk;
		bdev_io->error.scsi.asc = asc;
		bdev_io->error.scsi.ascq = ascq;
	}

	spdk_bdev_io_complete(bdev_io, bdev_io->status);
}

void
spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
			     int *sc, int *sk, int *asc, int *ascq)
{
	assert(sc != NULL);
	assert(sk != NULL);
	assert(asc != NULL);
	assert(ascq != NULL);

	switch (bdev_io->status) {
	case SPDK_BDEV_IO_STATUS_SUCCESS:
		*sc = SPDK_SCSI_STATUS_GOOD;
		*sk = SPDK_SCSI_SENSE_NO_SENSE;
		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
		break;
	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
		spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
		break;
	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
		*sc = bdev_io->error.scsi.sc;
		*sk = bdev_io->error.scsi.sk;
		*asc = bdev_io->error.scsi.asc;
		*ascq = bdev_io->error.scsi.ascq;
		break;
	default:
		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
		*sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
		break;
	}
}

void
spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc)
{
	if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else {
		bdev_io->error.nvme.sct = sct;
		bdev_io->error.nvme.sc = sc;
		bdev_io->status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
	}

	spdk_bdev_io_complete(bdev_io, bdev_io->status);
}

void
spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc)
{
	assert(sct != NULL);
	assert(sc != NULL);

	if (bdev_io->status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
		*sct = bdev_io->error.nvme.sct;
		*sc = bdev_io->error.nvme.sc;
	} else if (bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		*sct = SPDK_NVME_SCT_GENERIC;
		*sc = SPDK_NVME_SC_SUCCESS;
	} else {
		*sct = SPDK_NVME_SCT_GENERIC;
		*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
	}
}

struct spdk_thread *
spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io)
{
	return spdk_io_channel_get_thread(bdev_io->ch->channel);
}

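/*
 * Usage sketch (illustrative only): a SCSI target layer can translate whatever
 * status the backing bdev produced into SCSI sense data inside its completion
 * callback, regardless of whether the error originated as SCSI, NVMe, or a
 * generic bdev failure:
 *
 *	int sc, sk, asc, ascq;
 *
 *	spdk_bdev_io_get_scsi_status(bdev_io, &sc, &sk, &asc, &ascq);
 *	// build the CHECK CONDITION / sense payload from sc, sk, asc, ascq
 */
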
static int
_spdk_bdev_register(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;

	assert(bdev->module != NULL);

	if (!bdev->name) {
		SPDK_ERRLOG("Bdev name is NULL\n");
		return -EINVAL;
	}

	if (spdk_bdev_get_by_name(bdev->name)) {
		SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name);
		return -EEXIST;
	}

	bdev->status = SPDK_BDEV_STATUS_READY;

	TAILQ_INIT(&bdev->open_descs);

	TAILQ_INIT(&bdev->vbdevs);
	TAILQ_INIT(&bdev->base_bdevs);

	TAILQ_INIT(&bdev->aliases);

	bdev->reset_in_progress = NULL;

	spdk_io_device_register(__bdev_to_io_dev(bdev),
				spdk_bdev_channel_create, spdk_bdev_channel_destroy,
				sizeof(struct spdk_bdev_channel));

	pthread_mutex_init(&bdev->mutex, NULL);
	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name);
	TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, link);

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) {
		if (module->examine) {
			module->action_in_progress++;
			module->examine(bdev);
		}
	}

	return 0;
}

int
spdk_bdev_register(struct spdk_bdev *bdev)
{
	return _spdk_bdev_register(bdev);
}

int
spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count)
{
	int i, rc;

	rc = _spdk_bdev_register(vbdev);
	if (rc) {
		return rc;
	}

	for (i = 0; i < base_bdev_count; i++) {
		assert(base_bdevs[i] != NULL);
		assert(base_bdevs[i]->claim_module != NULL);
		TAILQ_INSERT_TAIL(&vbdev->base_bdevs, base_bdevs[i], base_bdev_link);
		TAILQ_INSERT_TAIL(&base_bdevs[i]->vbdevs, vbdev, vbdev_link);
	}

	return 0;
}

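/*
 * Usage sketch (illustrative only): a bdev module typically fills in the
 * public fields of its struct spdk_bdev and then registers it. The field
 * names below follow the bdev structure used elsewhere in this library;
 * "my_fn_table", "my_module" and the geometry values are assumptions made
 * for the example.
 *
 *	bdev->name = spdk_sprintf_alloc("mydisk%d", id);
 *	bdev->product_name = "My Disk";
 *	bdev->blocklen = 512;
 *	bdev->blockcnt = num_blocks;
 *	bdev->fn_table = &my_fn_table;
 *	bdev->module = &my_module;
 *
 *	rc = spdk_bdev_register(bdev);
 */
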
void
spdk_bdev_unregister_done(struct spdk_bdev *bdev, int bdeverrno)
{
	if (bdev->unregister_cb != NULL) {
		bdev->unregister_cb(bdev->unregister_ctx, bdeverrno);
	}
}

static void
_remove_notify(void *arg)
{
	struct spdk_bdev_desc *desc = arg;

	desc->remove_cb(desc->remove_ctx);
}

void
spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_desc *desc, *tmp;
	int rc;
	bool do_destruct = true;
	struct spdk_bdev *base_bdev;

	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name);

	pthread_mutex_lock(&bdev->mutex);

	if (!TAILQ_EMPTY(&bdev->base_bdevs)) {
		TAILQ_FOREACH(base_bdev, &bdev->base_bdevs, base_bdev_link) {
			TAILQ_REMOVE(&base_bdev->vbdevs, bdev, vbdev_link);
		}
	}

	bdev->status = SPDK_BDEV_STATUS_REMOVING;
	bdev->unregister_cb = cb_fn;
	bdev->unregister_ctx = cb_arg;

	TAILQ_FOREACH_SAFE(desc, &bdev->open_descs, link, tmp) {
		if (desc->remove_cb) {
			do_destruct = false;
			/*
			 * Defer invocation of the remove_cb to a separate message that will
			 * run later on this thread. This ensures this context unwinds and
			 * we don't recursively unregister this bdev again if the remove_cb
			 * immediately closes its descriptor.
			 */
			spdk_thread_send_msg(spdk_get_thread(), _remove_notify, desc);
		}
	}

	if (!do_destruct) {
		pthread_mutex_unlock(&bdev->mutex);
		return;
	}

	TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link);
	pthread_mutex_unlock(&bdev->mutex);

	pthread_mutex_destroy(&bdev->mutex);

	spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL);

	rc = bdev->fn_table->destruct(bdev->ctxt);
	if (rc < 0) {
		SPDK_ERRLOG("destruct failed\n");
	}
	if (rc <= 0 && cb_fn != NULL) {
		cb_fn(cb_arg, rc);
	}
}

int
spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb,
	       void *remove_ctx, struct spdk_bdev_desc **_desc)
{
	struct spdk_bdev_desc *desc;

	desc = calloc(1, sizeof(*desc));
	if (desc == NULL) {
		SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n");
		return -ENOMEM;
	}

	pthread_mutex_lock(&bdev->mutex);

	if (write && bdev->claim_module) {
		SPDK_INFOLOG(SPDK_LOG_BDEV, "Could not open %s - already claimed\n", bdev->name);
		free(desc);
		pthread_mutex_unlock(&bdev->mutex);
		return -EPERM;
	}

	TAILQ_INSERT_TAIL(&bdev->open_descs, desc, link);

	desc->bdev = bdev;
	desc->remove_cb = remove_cb;
	desc->remove_ctx = remove_ctx;
	desc->write = write;
	*_desc = desc;

	pthread_mutex_unlock(&bdev->mutex);

	return 0;
}

void
spdk_bdev_close(struct spdk_bdev_desc *desc)
{
	struct spdk_bdev *bdev = desc->bdev;
	bool do_unregister = false;

	pthread_mutex_lock(&bdev->mutex);

	TAILQ_REMOVE(&bdev->open_descs, desc, link);
	free(desc);

	if (bdev->status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->open_descs)) {
		do_unregister = true;
	}
	pthread_mutex_unlock(&bdev->mutex);

	if (do_unregister == true) {
		spdk_bdev_unregister(bdev, bdev->unregister_cb, bdev->unregister_ctx);
	}
}

int
spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
			    struct spdk_bdev_module *module)
{
	if (bdev->claim_module != NULL) {
		SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name,
			    bdev->claim_module->name);
		return -EPERM;
	}

	if (desc && !desc->write) {
		desc->write = true;
	}

	bdev->claim_module = module;
	return 0;
}

void
spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
{
	assert(bdev->claim_module != NULL);
	bdev->claim_module = NULL;
}

struct spdk_bdev *
spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
{
	return desc->bdev;
}

void
spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
{
	struct iovec *iovs;
	int iovcnt;

	if (bdev_io == NULL) {
		return;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		iovs = bdev_io->u.bdev.iovs;
		iovcnt = bdev_io->u.bdev.iovcnt;
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		iovs = bdev_io->u.bdev.iovs;
		iovcnt = bdev_io->u.bdev.iovcnt;
		break;
	default:
		iovs = NULL;
		iovcnt = 0;
		break;
	}

	if (iovp) {
		*iovp = iovs;
	}
	if (iovcntp) {
		*iovcntp = iovcnt;
	}
}

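/*
 * Usage sketch (illustrative only): opening a bdev read/write with a
 * hot-remove callback and closing it again. "g_desc" and "hot_remove_cb" are
 * example names assumed for the sketch.
 *
 *	static void
 *	hot_remove_cb(void *ctx)
 *	{
 *		// the bdev is being unregistered; release our descriptor
 *		spdk_bdev_close(g_desc);
 *	}
 *
 *	if (spdk_bdev_open(bdev, true, hot_remove_cb, NULL, &g_desc) != 0) {
 *		SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(bdev));
 *	}
 */
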
void
spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module)
{
	if (spdk_bdev_module_list_find(bdev_module->name)) {
		fprintf(stderr, "ERROR: module '%s' already registered.\n", bdev_module->name);
		assert(false);
	}

	if (bdev_module->async_init) {
		bdev_module->action_in_progress = 1;
	}

	/*
	 * Modules with examine callbacks must be initialized first, so they are
	 * ready to handle examine callbacks from later modules that will
	 * register physical bdevs.
	 */
	if (bdev_module->examine != NULL) {
		TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, tailq);
	} else {
		TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, tailq);
	}
}

struct spdk_bdev_module *
spdk_bdev_module_list_find(const char *name)
{
	struct spdk_bdev_module *bdev_module;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (strcmp(name, bdev_module->name) == 0) {
			break;
		}
	}

	return bdev_module;
}

static void
spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	uint64_t len;

	if (!success) {
		bdev_io->cb = bdev_io->stored_user_cb;
		_spdk_bdev_io_complete(bdev_io);
		return;
	}

	/* no need to perform the error checking from write_zeroes_blocks because this request already passed those checks. */
	len = spdk_min(spdk_bdev_get_block_size(bdev_io->bdev) * bdev_io->split_remaining_num_blocks,
		       ZERO_BUFFER_SIZE);

	bdev_io->u.bdev.offset_blocks = bdev_io->split_current_offset_blocks;
	bdev_io->u.bdev.iov.iov_len = len;
	bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev_io->bdev);
	bdev_io->split_remaining_num_blocks -= bdev_io->u.bdev.num_blocks;
	bdev_io->split_current_offset_blocks += bdev_io->u.bdev.num_blocks;

	/* if this round completes the i/o, change the callback to be the original user callback */
	if (bdev_io->split_remaining_num_blocks == 0) {
		spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, bdev_io->stored_user_cb);
	} else {
		spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, spdk_bdev_write_zeroes_split);
	}
	spdk_bdev_io_submit(bdev_io);
}

SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV)
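/*
 * Note on spdk_bdev_write_zeroes_split() above (illustrative arithmetic): each
 * round zeroes at most ZERO_BUFFER_SIZE bytes (1 MiB as defined at the top of
 * this file). Assuming a 512-byte block size, that is 2048 blocks per round,
 * so a 5000-block write-zeroes request that needs splitting would be issued as
 * three chained requests of 2048, 2048 and 904 blocks, with the user's
 * callback restored only for the final round.
 */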