/*-
 *   BSD LICENSE
 *
 *   Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"
#include "spdk/conf.h"

#include "spdk/env.h"
#include "spdk/event.h"
#include "spdk/io_channel.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/util.h"

#include "spdk_internal/bdev.h"
#include "spdk_internal/log.h"
#include "spdk/string.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define BUF_SMALL_POOL_SIZE			8192
#define BUF_LARGE_POOL_SIZE			1024
#define NOMEM_THRESHOLD_COUNT			8
#define ZERO_BUFFER_SIZE			0x100000
#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_SEC_TO_USEC			1000000ULL
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		10000

typedef TAILQ_HEAD(, spdk_bdev_io) bdev_io_tailq_t;
typedef STAILQ_HEAD(, spdk_bdev_io) bdev_io_stailq_t;

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	struct spdk_mempool *buf_small_pool;
	struct spdk_mempool *buf_large_pool;

	void *zero_buffer;

	TAILQ_HEAD(, spdk_bdev_module) bdev_modules;

	TAILQ_HEAD(, spdk_bdev) bdevs;

	bool init_complete;
	bool module_init_complete;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain	*domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.init_complete = false,
	.module_init_complete = false,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;


struct spdk_bdev_mgmt_channel {
	bdev_io_stailq_t need_buf_small;
	bdev_io_stailq_t need_buf_large;

	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t	per_thread_cache_count;
};

/*
 * Per-module (or per-io_device) channel. Multiple bdevs built on the same io_device
 * will queue here their IO that awaits retry. It makes it possible to retry sending
 * IO to one bdev after IO from another bdev completes.
 */
struct spdk_bdev_module_channel {

	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel	*mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t		nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t		nomem_threshold;

	TAILQ_ENTRY(spdk_bdev_module_channel) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;

	/* Channel for the bdev module */
	struct spdk_bdev_module_channel	*module_ch;

	struct spdk_bdev_io_stat stat;

	/*
	 * Count of I/O submitted through this channel and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

	bdev_io_tailq_t		queued_resets;

	uint32_t		flags;

	/*
	 * Rate limiting on this channel.
	 * Queue of IO awaiting issue because of QoS rate limiting
	 * on this channel.
	 */
	bdev_io_tailq_t		qos_io;

	/*
	 * Rate limiting on this channel.
	 * Maximum allowed IOs to be issued in one timeslice (e.g., 1ms) and
	 * only valid for the master channel which manages the outstanding IOs.
	 */
	uint64_t		qos_max_ios_per_timeslice;

	/*
	 * Rate limiting on this channel.
	 * IO submitted in the current timeslice (e.g., 1ms).
	 */
	uint64_t		io_submitted_this_timeslice;

	/*
	 * Rate limiting on this channel.
	 * QoS poller that runs periodically (once per millisecond).
	 */
	struct spdk_poller	*qos_poller;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t		start_tsc;
	uint64_t		interval_tsc;
	__itt_string_handle	*handle;
#endif

};

struct spdk_bdev_desc {
	struct spdk_bdev	*bdev;
	spdk_bdev_remove_cb_t	remove_cb;
	void			*remove_ctx;
	bool			write;
	TAILQ_ENTRY(spdk_bdev_desc) link;
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))

static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, link);
	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->claim_module == NULL) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, link));

	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_alias *tmp;
	struct spdk_bdev *bdev = spdk_bdev_first();

	while (bdev != NULL) {
		if (strcmp(bdev_name, bdev->name) == 0) {
			return bdev;
		}

		TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
			if (strcmp(bdev_name, tmp->alias) == 0) {
				return bdev;
			}
		}

		bdev = spdk_bdev_next(bdev);
	}

	return NULL;
}

static void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	assert(bdev_io->get_buf_cb != NULL);
	assert(buf != NULL);
	assert(bdev_io->u.bdev.iovs != NULL);

	bdev_io->buf = buf;
	bdev_io->u.bdev.iovs[0].iov_base = (void *)((unsigned long)((char *)buf + 512) & ~511UL);
	bdev_io->u.bdev.iovs[0].iov_len = bdev_io->buf_len;
	bdev_io->get_buf_cb(bdev_io->ch->channel, bdev_io);
}

static void
spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_mempool *pool;
	struct spdk_bdev_io *tmp;
	void *buf;
	bdev_io_stailq_t *stailq;
	struct spdk_bdev_mgmt_channel *ch;

	assert(bdev_io->u.bdev.iovcnt == 1);

	buf = bdev_io->buf;
	ch = bdev_io->ch->module_ch->mgmt_ch;

	if (bdev_io->buf_len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &ch->need_buf_large;
	}

	if (STAILQ_EMPTY(stailq)) {
		spdk_mempool_put(pool, buf);
	} else {
		tmp = STAILQ_FIRST(stailq);
		STAILQ_REMOVE_HEAD(stailq, buf_link);
		spdk_bdev_io_set_buf(tmp, buf);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_mempool *pool;
	bdev_io_stailq_t *stailq;
	void *buf = NULL;
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	assert(cb != NULL);
	assert(bdev_io->u.bdev.iovs != NULL);

	if (spdk_unlikely(bdev_io->u.bdev.iovs[0].iov_base != NULL)) {
		/* Buffer already present */
		cb(bdev_io->ch->channel, bdev_io);
		return;
	}

	assert(len <= SPDK_BDEV_LARGE_BUF_MAX_SIZE);
	mgmt_ch = bdev_io->ch->module_ch->mgmt_ch;

	bdev_io->buf_len = len;
	bdev_io->get_buf_cb = cb;
	if (len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &mgmt_ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &mgmt_ch->need_buf_large;
	}

	buf = spdk_mempool_get(pool);

	if (!buf) {
		STAILQ_INSERT_TAIL(stailq, bdev_io, buf_link);
	} else {
		spdk_bdev_io_set_buf(bdev_io, buf);
	}
}
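
/*
 * Illustrative sketch (not part of this file): a bdev module that needs a data
 * buffer for a READ request would typically call spdk_bdev_io_get_buf() from its
 * submit path and finish the submission in the callback. The names below
 * (example_submit_read, example_read_get_buf_cb) are hypothetical.
 *
 *	static void
 *	example_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
 *	{
 *		// bdev_io->u.bdev.iovs[0] now points at a buffer from the small or
 *		// large pool; issue the actual read here.
 *	}
 *
 *	static void
 *	example_submit_read(struct spdk_bdev_io *bdev_io)
 *	{
 *		spdk_bdev_io_get_buf(bdev_io, example_read_get_buf_cb,
 *				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 *	}
 *
 * If no buffer is available, the I/O is parked on need_buf_small/need_buf_large
 * and resumed by spdk_bdev_io_put_buf() when another I/O releases its buffer.
 */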

static int
spdk_bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

void
spdk_bdev_config_text(FILE *fp)
{
	struct spdk_bdev_module *bdev_module;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->config_text) {
			bdev_module->config_text(fp);
		}
	}
}

void
spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_module *bdev_module;
	struct spdk_bdev *bdev;

	assert(w != NULL);

	spdk_json_write_array_begin(w);

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->config_json) {
			bdev_module->config_json(w);
		}
	}

	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, link) {
		spdk_bdev_config_json(bdev, w);
	}

	spdk_json_write_array_end(w);
}

static int
spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;

	STAILQ_INIT(&ch->need_buf_small);
	STAILQ_INIT(&ch->need_buf_large);

	STAILQ_INIT(&ch->per_thread_cache);
	ch->per_thread_cache_count = 0;

	return 0;
}

static void
spdk_bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;

	if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
		SPDK_ERRLOG("Pending I/O list wasn't empty on channel free\n");
	}

	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, buf_link);
		ch->per_thread_cache_count--;
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}

	assert(ch->per_thread_cache_count == 0);
}

static void
spdk_bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	cb_fn(cb_arg, rc);
}

static void
spdk_bdev_module_action_complete(void)
{
	struct spdk_bdev_module *m;

	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, tailq) {
		if (m->action_in_progress > 0) {
			return;
		}
	}

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, tailq) {
		if (m->init_complete) {
			m->init_complete();
		}
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
	 */
	spdk_bdev_init_complete(0);
}

static void
spdk_bdev_module_action_done(struct spdk_bdev_module *module)
{
	assert(module->action_in_progress > 0);
	module->action_in_progress--;
	spdk_bdev_module_action_complete();
}

void
spdk_bdev_module_init_done(struct spdk_bdev_module *module)
{
	spdk_bdev_module_action_done(module);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
{
	spdk_bdev_module_action_done(module);
}

static int
spdk_bdev_module_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_module_channel *ch = ctx_buf;
	struct spdk_io_channel *mgmt_ch;

	ch->io_outstanding = 0;
	TAILQ_INIT(&ch->nomem_io);
	ch->nomem_threshold = 0;

	mgmt_ch = spdk_get_io_channel(&g_bdev_mgr);
	if (!mgmt_ch) {
		return -1;
	}

	ch->mgmt_ch = spdk_io_channel_get_ctx(mgmt_ch);

	return 0;
}

static void
spdk_bdev_module_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_module_channel *ch = ctx_buf;

	assert(ch->io_outstanding == 0);
	assert(TAILQ_EMPTY(&ch->nomem_io));

	spdk_put_io_channel(spdk_io_channel_from_ctx(ch->mgmt_ch));
}

static int
spdk_bdev_modules_init(void)
{
	struct spdk_bdev_module *module;
	int rc = 0;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) {
		spdk_io_device_register(module,
					spdk_bdev_module_channel_create,
					spdk_bdev_module_channel_destroy,
					sizeof(struct spdk_bdev_module_channel));
		rc = module->module_init();
		if (rc != 0) {
			break;
		}
	}

	g_bdev_mgr.module_init_complete = true;
	return rc;
}

void
spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
{
	int cache_size;
	int rc = 0;
	char mempool_name[32];

	assert(cb_fn != NULL);

	g_init_cb_fn = cb_fn;
	g_init_cb_arg = cb_arg;

	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());

	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
				  SPDK_BDEV_IO_POOL_SIZE,
				  sizeof(struct spdk_bdev_io) +
				  spdk_bdev_module_get_max_ctx_size(),
				  0,
				  SPDK_ENV_SOCKET_ID_ANY);

	if (g_bdev_mgr.bdev_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	/**
	 * Ensure no more than half of the total buffers end up in local caches, by
	 * using spdk_env_get_core_count() to determine how many local caches we need
	 * to account for.
	 */
	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());

	g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
				    BUF_SMALL_POOL_SIZE,
				    SPDK_BDEV_SMALL_BUF_MAX_SIZE + 512,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_small_pool) {
		SPDK_ERRLOG("create rbuf small pool failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());

	g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
				    BUF_LARGE_POOL_SIZE,
				    SPDK_BDEV_LARGE_BUF_MAX_SIZE + 512,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_large_pool) {
		SPDK_ERRLOG("create rbuf large pool failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
				 NULL);
	if (!g_bdev_mgr.zero_buffer) {
		SPDK_ERRLOG("create bdev zero buffer failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

#ifdef SPDK_CONFIG_VTUNE
	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
#endif

	spdk_io_device_register(&g_bdev_mgr, spdk_bdev_mgmt_channel_create,
				spdk_bdev_mgmt_channel_destroy,
				sizeof(struct spdk_bdev_mgmt_channel));

	rc = spdk_bdev_modules_init();
	if (rc != 0) {
		SPDK_ERRLOG("bdev modules init failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	spdk_bdev_module_action_complete();
}

static void
spdk_bdev_mgr_unregister_cb(void *io_device)
{
	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;

	cb_fn(g_fini_cb_arg);
	g_fini_cb_fn = NULL;
	g_fini_cb_arg = NULL;
}

static struct spdk_bdev_module *g_resume_bdev_module = NULL;

static void
spdk_bdev_module_finish_iter(void *arg)
{
	struct spdk_bdev_module *bdev_module;

	/* Start iterating from the last touched module */
	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_FIRST(&g_bdev_mgr.bdev_modules);
	} else {
		bdev_module = TAILQ_NEXT(g_resume_bdev_module, tailq);
	}

	if (bdev_module) {
		/* Save our place so we can resume later. We must
		 * save the variable here, before calling module_fini()
		 * below, because in some cases the module may immediately
		 * call spdk_bdev_module_finish_done() and re-enter
		 * this function to continue iterating.
		 */
		g_resume_bdev_module = bdev_module;

		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}

		if (!bdev_module->async_fini) {
			spdk_bdev_module_finish_done();
		}

		return;
	}

	g_resume_bdev_module = NULL;

	if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != SPDK_BDEV_IO_POOL_SIZE) {
		SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
			    SPDK_BDEV_IO_POOL_SIZE);
	}

	if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) {
		SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
			    BUF_SMALL_POOL_SIZE);
		assert(false);
	}

	if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) {
		SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
			    BUF_LARGE_POOL_SIZE);
		assert(false);
	}

	spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
	spdk_mempool_free(g_bdev_mgr.buf_small_pool);
	spdk_mempool_free(g_bdev_mgr.buf_large_pool);
	spdk_dma_free(g_bdev_mgr.zero_buffer);

	spdk_io_device_unregister(&g_bdev_mgr, spdk_bdev_mgr_unregister_cb);
}

static void
spdk_bdev_module_unregister_cb(void *io_device)
{
	if (spdk_get_thread() != g_fini_thread) {
		spdk_thread_send_msg(g_fini_thread, spdk_bdev_module_finish_iter, NULL);
	} else {
		spdk_bdev_module_finish_iter(NULL);
	}
}

void
spdk_bdev_module_finish_done(void)
{
	spdk_io_device_unregister(g_resume_bdev_module, spdk_bdev_module_unregister_cb);
}

static void
_spdk_bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
{
	struct spdk_bdev *bdev = cb_arg;

	if (bdeverrno && bdev) {
		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
			     bdev->name);

		/*
		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
		 * bdev; try to continue by manually removing this bdev from the list and continue
		 * with the next bdev in the list.
		 */
		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link);
	}

	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n");
		/*
		 * Bdev module finish needs to be deferred, since we might be in the middle of
		 * some context (like bdev part free) that will use this bdev (or private bdev
		 * driver ctx data) after returning.
		 */
		spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_module_finish_iter, NULL);
		return;
	}

	/*
	 * Unregister the first bdev in the list.
	 *
	 * spdk_bdev_unregister() will handle the case where the bdev has open descriptors by
	 * calling the remove_cb of the descriptors first.
	 *
	 * Once this bdev and all of its open descriptors have been cleaned up, this function
	 * will be called again via the unregister completion callback to continue the cleanup
	 * process with the next bdev.
	 */
	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name);
	spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev);
}

void
spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
{
	assert(cb_fn != NULL);

	g_fini_thread = spdk_get_thread();

	g_fini_cb_fn = cb_fn;
	g_fini_cb_arg = cb_arg;

	_spdk_bdev_finish_unregister_bdevs_iter(NULL, 0);
}

static struct spdk_bdev_io *
spdk_bdev_get_io(struct spdk_bdev_channel *channel)
{
	struct spdk_bdev_mgmt_channel *ch = channel->module_ch->mgmt_ch;
	struct spdk_bdev_io *bdev_io;

	if (ch->per_thread_cache_count > 0) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, buf_link);
		ch->per_thread_cache_count--;
	} else {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		if (!bdev_io) {
			SPDK_ERRLOG("Unable to get spdk_bdev_io\n");
			return NULL;
		}
	}

	return bdev_io;
}

static void
spdk_bdev_put_io(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_mgmt_channel *ch = bdev_io->ch->module_ch->mgmt_ch;

	if (bdev_io->buf != NULL) {
		spdk_bdev_io_put_buf(bdev_io);
	}

	if (ch->per_thread_cache_count < SPDK_BDEV_IO_CACHE_SIZE) {
		ch->per_thread_cache_count++;
		STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, buf_link);
	} else {
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}
}

static void
_spdk_bdev_qos_io_submit(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_io *bdev_io = NULL;
	struct spdk_bdev *bdev = ch->bdev;
	struct spdk_bdev_module_channel *module_ch = ch->module_ch;

	while (!TAILQ_EMPTY(&ch->qos_io)) {
		if (ch->io_submitted_this_timeslice < ch->qos_max_ios_per_timeslice) {
			bdev_io = TAILQ_FIRST(&ch->qos_io);
			TAILQ_REMOVE(&ch->qos_io, bdev_io, link);
			ch->io_submitted_this_timeslice++;
			ch->io_outstanding++;
			module_ch->io_outstanding++;
			bdev->fn_table->submit_request(ch->channel, bdev_io);
		} else {
			break;
		}
	}
}

static void
_spdk_bdev_io_submit(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
	struct spdk_io_channel *ch = bdev_ch->channel;
	struct spdk_bdev_module_channel *module_ch = bdev_ch->module_ch;

	bdev_io->submit_tsc = spdk_get_ticks();
	bdev_ch->io_outstanding++;
	module_ch->io_outstanding++;
	bdev_io->in_submit_request = true;
	if (spdk_likely(bdev_ch->flags == 0)) {
		if (spdk_likely(TAILQ_EMPTY(&module_ch->nomem_io))) {
			bdev->fn_table->submit_request(ch, bdev_io);
		} else {
			bdev_ch->io_outstanding--;
			module_ch->io_outstanding--;
			TAILQ_INSERT_TAIL(&module_ch->nomem_io, bdev_io, link);
		}
	} else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	} else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
		bdev_ch->io_outstanding--;
		module_ch->io_outstanding--;
		TAILQ_INSERT_TAIL(&bdev_ch->qos_io, bdev_io, link);
		_spdk_bdev_qos_io_submit(bdev_ch);
	} else {
		SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
	bdev_io->in_submit_request = false;
}

static void
spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);

	/* QoS channel and thread have been properly configured */
	if (bdev->ios_per_sec > 0 && bdev->qos_channel && bdev->qos_thread) {
		bdev_io->io_submit_ch = bdev_io->ch;
		bdev_io->ch = bdev->qos_channel;
		spdk_thread_send_msg(bdev->qos_thread, _spdk_bdev_io_submit, bdev_io);
	} else {
		_spdk_bdev_io_submit(bdev_io);
	}
}

static void
spdk_bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
	struct spdk_io_channel *ch = bdev_ch->channel;

	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);

	bdev_io->in_submit_request = true;
	bdev->fn_table->submit_request(ch, bdev_io);
	bdev_io->in_submit_request = false;
}

static void
spdk_bdev_io_init(struct spdk_bdev_io *bdev_io,
		  struct spdk_bdev *bdev, void *cb_arg,
		  spdk_bdev_io_completion_cb cb)
{
	bdev_io->bdev = bdev;
	bdev_io->caller_ctx = cb_arg;
	bdev_io->cb = cb;
	bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
	bdev_io->in_submit_request = false;
	bdev_io->buf = NULL;
	bdev_io->io_submit_ch = NULL;
}

bool
spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
{
	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
}

int
spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	if (bdev->fn_table->dump_info_json) {
		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
	}

	return 0;
}

void
spdk_bdev_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	assert(bdev != NULL);
	assert(w != NULL);

	if (bdev->fn_table->write_config_json) {
		bdev->fn_table->write_config_json(bdev, w);
	} else {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "name", bdev->name);
		spdk_json_write_object_end(w);
	}
}

static void
spdk_bdev_qos_get_max_ios_per_timeslice(struct spdk_bdev_channel *qos_ch)
{
	uint64_t qos_max_ios_per_timeslice = 0;
	struct spdk_bdev *bdev = qos_ch->bdev;

	qos_max_ios_per_timeslice = bdev->ios_per_sec * SPDK_BDEV_QOS_TIMESLICE_IN_USEC /
				    SPDK_BDEV_SEC_TO_USEC;
	qos_ch->qos_max_ios_per_timeslice = spdk_max(qos_max_ios_per_timeslice,
					    SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE);
}

static int
spdk_bdev_channel_poll_qos(void *arg)
{
	struct spdk_bdev_channel *ch = arg;

	/* Reset for next round of rate limiting */
	ch->io_submitted_this_timeslice = 0;
	spdk_bdev_qos_get_max_ios_per_timeslice(ch);

	_spdk_bdev_qos_io_submit(ch);

	return -1;
}
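
/*
 * Worked example of the timeslice math above (illustrative only): with
 * bdev->ios_per_sec = 20000 and SPDK_BDEV_QOS_TIMESLICE_IN_USEC = 1000, the
 * limit is 20000 * 1000 / 1000000 = 20 I/O per 1 ms timeslice. A very small
 * rate such as 500 IO/s would compute to 0 and is clamped up to
 * SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE (1), so at least one I/O can be issued
 * every time spdk_bdev_channel_poll_qos() resets the timeslice.
 */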

static int
_spdk_bdev_channel_create(struct spdk_bdev_channel *ch, void *io_device)
{
	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);

	ch->bdev = bdev;
	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
	if (!ch->channel) {
		return -1;
	}

	ch->module_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(bdev->module));

	memset(&ch->stat, 0, sizeof(ch->stat));
	ch->io_outstanding = 0;
	TAILQ_INIT(&ch->queued_resets);
	TAILQ_INIT(&ch->qos_io);
	ch->qos_max_ios_per_timeslice = 0;
	ch->io_submitted_this_timeslice = 0;
	ch->qos_poller = NULL;
	ch->flags = 0;

	return 0;
}

static void
_spdk_bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
{
	if (!ch) {
		return;
	}

	if (ch->channel) {
		spdk_put_io_channel(ch->channel);
	}

	if (ch->module_ch) {
		spdk_put_io_channel(spdk_io_channel_from_ctx(ch->module_ch));
	}
}

/* Caller must hold bdev->mutex. */
static int
spdk_bdev_qos_channel_create(struct spdk_bdev *bdev)
{
	assert(bdev->qos_channel == NULL);
	assert(bdev->qos_thread == NULL);

	bdev->qos_channel = calloc(1, sizeof(struct spdk_bdev_channel));
	if (!bdev->qos_channel) {
		return -1;
	}

	bdev->qos_thread = spdk_get_thread();
	if (!bdev->qos_thread) {
		free(bdev->qos_channel);
		bdev->qos_channel = NULL;
		return -1;
	}

	if (_spdk_bdev_channel_create(bdev->qos_channel, __bdev_to_io_dev(bdev)) != 0) {
		free(bdev->qos_channel);
		bdev->qos_channel = NULL;
		bdev->qos_thread = NULL;
		return -1;
	}

	bdev->qos_channel->flags |= BDEV_CH_QOS_ENABLED;
	spdk_bdev_qos_get_max_ios_per_timeslice(bdev->qos_channel);
	bdev->qos_channel->qos_poller = spdk_poller_register(
						spdk_bdev_channel_poll_qos,
						bdev->qos_channel,
						SPDK_BDEV_QOS_TIMESLICE_IN_USEC);

	return 0;
}

static int
spdk_bdev_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
	struct spdk_bdev_channel *ch = ctx_buf;

	if (_spdk_bdev_channel_create(ch, io_device) != 0) {
		_spdk_bdev_channel_destroy_resource(ch);
		return -1;
	}

#ifdef SPDK_CONFIG_VTUNE
	{
		char *name;
		__itt_init_ittlib(NULL, 0);
		name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
		if (!name) {
			_spdk_bdev_channel_destroy_resource(ch);
			return -1;
		}
		ch->handle = __itt_string_handle_create(name);
		free(name);
		ch->start_tsc = spdk_get_ticks();
		ch->interval_tsc = spdk_get_ticks_hz() / 100;
	}
#endif

	pthread_mutex_lock(&bdev->mutex);

	/* Rate limiting on this bdev enabled */
	if (bdev->ios_per_sec > 0 && bdev->qos_channel == NULL) {
		if (spdk_bdev_qos_channel_create(bdev) != 0) {
			_spdk_bdev_channel_destroy_resource(ch);
			pthread_mutex_unlock(&bdev->mutex);
			return -1;
		}
	}

	bdev->channel_count++;

	pthread_mutex_unlock(&bdev->mutex);

	return 0;
}

/*
 * Abort I/O that are waiting on a data buffer. These types of I/O are
 * linked using the spdk_bdev_io buf_link STAILQ_ENTRY.
 */
static void
_spdk_bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch)
{
	bdev_io_stailq_t tmp;
	struct spdk_bdev_io *bdev_io;

	STAILQ_INIT(&tmp);

	while (!STAILQ_EMPTY(queue)) {
		bdev_io = STAILQ_FIRST(queue);
		STAILQ_REMOVE_HEAD(queue, buf_link);
		if (bdev_io->ch == ch) {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		} else {
			STAILQ_INSERT_TAIL(&tmp, bdev_io, buf_link);
		}
	}

	STAILQ_SWAP(&tmp, queue, spdk_bdev_io);
}

/*
 * Abort I/O that are queued waiting for submission. These types of I/O are
 * linked using the spdk_bdev_io link TAILQ_ENTRY.
 */
static void
_spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_io *bdev_io, *tmp;

	TAILQ_FOREACH_SAFE(bdev_io, queue, link, tmp) {
		if (bdev_io->ch == ch) {
			TAILQ_REMOVE(queue, bdev_io, link);
			/*
			 * spdk_bdev_io_complete() assumes that the completed I/O had
			 * been submitted to the bdev module. Since in this case it
			 * hadn't, bump io_outstanding to account for the decrement
			 * that spdk_bdev_io_complete() will do.
			 */
			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
				ch->io_outstanding++;
				ch->module_ch->io_outstanding++;
			}
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}
}

static void
_spdk_bdev_channel_destroy(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	struct spdk_bdev_module_channel *module_ch = ch->module_ch;

	mgmt_ch = module_ch->mgmt_ch;

	_spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
	_spdk_bdev_abort_queued_io(&ch->qos_io, ch);
	_spdk_bdev_abort_queued_io(&module_ch->nomem_io, ch);
	_spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_small, ch);
	_spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_large, ch);

	_spdk_bdev_channel_destroy_resource(ch);
}

static void
spdk_bdev_qos_channel_destroy(void *ctx)
{
	struct spdk_bdev_channel *qos_channel = ctx;

	_spdk_bdev_channel_destroy(qos_channel);

	spdk_poller_unregister(&qos_channel->qos_poller);
	free(qos_channel);
}

static void
spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_channel *ch = ctx_buf;
	struct spdk_bdev *bdev = ch->bdev;

	_spdk_bdev_channel_destroy(ch);

	pthread_mutex_lock(&bdev->mutex);
	bdev->channel_count--;
	if (bdev->channel_count == 0 && bdev->qos_channel != NULL) {
		/* All I/O channels for this bdev have been destroyed - destroy the QoS channel. */
		spdk_thread_send_msg(bdev->qos_thread, spdk_bdev_qos_channel_destroy,
				     bdev->qos_channel);

		/*
		 * Set qos_channel to NULL within the critical section so that
		 * if another channel is created, it will see qos_channel == NULL and
		 * re-create the QoS channel even if the asynchronous qos_channel_destroy
		 * isn't finished yet.
		 */
		bdev->qos_channel = NULL;
		bdev->qos_thread = NULL;
	}
	pthread_mutex_unlock(&bdev->mutex);
}

int
spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
{
	struct spdk_bdev_alias *tmp;

	if (alias == NULL) {
		SPDK_ERRLOG("Empty alias passed\n");
		return -EINVAL;
	}

	if (spdk_bdev_get_by_name(alias)) {
		SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias);
		return -EEXIST;
	}

	tmp = calloc(1, sizeof(*tmp));
	if (tmp == NULL) {
		SPDK_ERRLOG("Unable to allocate alias\n");
		return -ENOMEM;
	}

	tmp->alias = strdup(alias);
	if (tmp->alias == NULL) {
		free(tmp);
		SPDK_ERRLOG("Unable to allocate alias\n");
		return -ENOMEM;
	}

	TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);

	return 0;
}

int
spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
{
	struct spdk_bdev_alias *tmp;

	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (strcmp(alias, tmp->alias) == 0) {
			TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
			free(tmp->alias);
			free(tmp);
			return 0;
		}
	}

	SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exist\n", alias);

	return -ENOENT;
}

struct spdk_io_channel *
spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
{
	return spdk_get_io_channel(__bdev_to_io_dev(desc->bdev));
}

const char *
spdk_bdev_get_name(const struct spdk_bdev *bdev)
{
	return bdev->name;
}

const char *
spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
{
	return bdev->product_name;
}

const struct spdk_bdev_aliases_list *
spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
{
	return &bdev->aliases;
}

uint32_t
spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
{
	return bdev->blocklen;
}

uint64_t
spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
{
	return bdev->blockcnt;
}

size_t
spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
{
	/* TODO: push this logic down to the bdev modules */
	if (bdev->need_aligned_buffer) {
		return bdev->blocklen;
	}

	return 1;
}

uint32_t
spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
{
	return bdev->optimal_io_boundary;
}

bool
spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
{
	return bdev->write_cache;
}

const struct spdk_uuid *
spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
{
	return &bdev->uuid;
}

int
spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
{
	int ret;

	pthread_mutex_lock(&bdev->mutex);

	/* bdev has open descriptors */
	if (!TAILQ_EMPTY(&bdev->open_descs) &&
	    bdev->blockcnt > size) {
		ret = -EBUSY;
	} else {
		bdev->blockcnt = size;
		ret = 0;
	}

	pthread_mutex_unlock(&bdev->mutex);

	return ret;
}

/*
 * Convert I/O offset and length from bytes to blocks.
 *
 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size.
 */
static uint64_t
spdk_bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks,
			  uint64_t num_bytes, uint64_t *num_blocks)
{
	uint32_t block_size = bdev->blocklen;

	*offset_blocks = offset_bytes / block_size;
	*num_blocks = num_bytes / block_size;

	return (offset_bytes % block_size) | (num_bytes % block_size);
}

static bool
spdk_bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
{
	/* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there
	 * has been an overflow and hence the offset has been wrapped around */
	if (offset_blocks + num_blocks < offset_blocks) {
		return false;
	}

	/* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
	if (offset_blocks + num_blocks > bdev->blockcnt) {
		return false;
	}

	return true;
}

int
spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
	       void *buf, uint64_t offset, uint64_t nbytes,
	       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		      void *buf, uint64_t offset_blocks, uint64_t num_blocks,
		      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during read\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.bdev.iov.iov_base = buf;
	bdev_io->u.bdev.iov.iov_len = num_blocks * bdev->blocklen;
	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		struct iovec *iov, int iovcnt,
		uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       struct iovec *iov, int iovcnt,
		       uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during read\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.bdev.iovs = iov;
	bdev_io->u.bdev.iovcnt = iovcnt;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		void *buf, uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       void *buf, uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during write\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.bdev.iov.iov_base = buf;
	bdev_io->u.bdev.iov.iov_len = num_blocks * bdev->blocklen;
	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		 struct iovec *iov, int iovcnt,
		 uint64_t offset, uint64_t len,
		 spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			struct iovec *iov, int iovcnt,
			uint64_t offset_blocks, uint64_t num_blocks,
			spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during writev\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.bdev.iovs = iov;
	bdev_io->u.bdev.iovcnt = iovcnt;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset, uint64_t len,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      uint64_t offset_blocks, uint64_t num_blocks,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	uint64_t len;
	bool split_request = false;

	if (num_blocks > UINT64_MAX / spdk_bdev_get_block_size(bdev)) {
		SPDK_ERRLOG("length argument out of range in write_zeroes\n");
		return -ERANGE;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);

	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during write_zeroes\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->u.bdev.offset_blocks = offset_blocks;

	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
		bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
		bdev_io->u.bdev.num_blocks = num_blocks;
		bdev_io->u.bdev.iovs = NULL;
		bdev_io->u.bdev.iovcnt = 0;

	} else {
		assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE);

		len = spdk_bdev_get_block_size(bdev) * num_blocks;

		if (len > ZERO_BUFFER_SIZE) {
			split_request = true;
			len = ZERO_BUFFER_SIZE;
		}

		bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
		bdev_io->u.bdev.iov.iov_base = g_bdev_mgr.zero_buffer;
		bdev_io->u.bdev.iov.iov_len = len;
		bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
		bdev_io->u.bdev.iovcnt = 1;
		bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev);
		bdev_io->u.bdev.split_remaining_num_blocks = num_blocks - bdev_io->u.bdev.num_blocks;
		bdev_io->u.bdev.split_current_offset_blocks = offset_blocks + bdev_io->u.bdev.num_blocks;
	}

	if (split_request) {
		bdev_io->u.bdev.stored_user_cb = cb;
		spdk_bdev_io_init(bdev_io, bdev, cb_arg, spdk_bdev_write_zeroes_split);
	} else {
		spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
	}
	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	if (num_blocks == 0) {
		SPDK_ERRLOG("Can't unmap 0 bytes\n");
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during unmap\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
	bdev_io->u.bdev.iov.iov_base = NULL;
	bdev_io->u.bdev.iov.iov_len = 0;
	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		uint64_t offset, uint64_t length,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, length, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during flush\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
	bdev_io->u.bdev.iovs = NULL;
	bdev_io->u.bdev.iovcnt = 0;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

static void
_spdk_bdev_reset_dev(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev_io *bdev_io;

	bdev_io = TAILQ_FIRST(&ch->queued_resets);
	TAILQ_REMOVE(&ch->queued_resets, bdev_io, link);
	spdk_bdev_io_submit_reset(bdev_io);
}

static void
_spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch;
	struct spdk_bdev_channel *channel;
	struct spdk_bdev_mgmt_channel *mgmt_channel;
	struct spdk_bdev_module_channel *module_ch;

	ch = spdk_io_channel_iter_get_channel(i);
	channel = spdk_io_channel_get_ctx(ch);
	module_ch = channel->module_ch;
	mgmt_channel = module_ch->mgmt_ch;

	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;

	_spdk_bdev_abort_queued_io(&module_ch->nomem_io, channel);
	_spdk_bdev_abort_queued_io(&channel->qos_io, channel);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel);

	spdk_for_each_channel_continue(i, 0);
}

static void
_spdk_bdev_reset_freeze_qos_channel(void *ctx)
{
	struct spdk_bdev *bdev = ctx;
	struct spdk_bdev_mgmt_channel *mgmt_channel = NULL;
	struct spdk_bdev_channel *qos_channel = bdev->qos_channel;
	struct spdk_bdev_module_channel *module_ch = NULL;

	if (qos_channel) {
		module_ch = qos_channel->module_ch;
		mgmt_channel = module_ch->mgmt_ch;

		qos_channel->flags |= BDEV_CH_RESET_IN_PROGRESS;

		_spdk_bdev_abort_queued_io(&module_ch->nomem_io, qos_channel);
		_spdk_bdev_abort_queued_io(&qos_channel->qos_io, qos_channel);
		_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, qos_channel);
		_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, qos_channel);
	}
}

static void
_spdk_bdev_start_reset(void *ctx)
{
	struct spdk_bdev_channel *ch = ctx;

	spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), _spdk_bdev_reset_freeze_channel,
			      ch, _spdk_bdev_reset_dev);
}

static void
_spdk_bdev_channel_start_reset(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev *bdev = ch->bdev;

	assert(!TAILQ_EMPTY(&ch->queued_resets));

	pthread_mutex_lock(&bdev->mutex);
	if (bdev->reset_in_progress == NULL) {
		bdev->reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
		/*
		 * Take a channel reference for the target bdev for the life of this
		 * reset.  This guards against the channel getting destroyed while
		 * spdk_for_each_channel() calls related to this reset IO are in
		 * progress.  We will release the reference when this reset is
		 * completed.
		 */
		bdev->reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
		_spdk_bdev_start_reset(ch);
	}
	pthread_mutex_unlock(&bdev->mutex);
}

int
spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during reset\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
	bdev_io->u.reset.ch_ref = NULL;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	pthread_mutex_lock(&bdev->mutex);
	TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, link);
	pthread_mutex_unlock(&bdev->mutex);

	_spdk_bdev_channel_start_reset(channel);

	/* Explicitly handle the QoS bdev channel, since it has no associated IO channel. */
	if (bdev->qos_thread) {
		spdk_thread_send_msg(bdev->qos_thread,
				     _spdk_bdev_reset_freeze_qos_channel, bdev);
	}

	return 0;
}

void
spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
		      struct spdk_bdev_io_stat *stat)
{
#ifdef SPDK_CONFIG_VTUNE
	SPDK_ERRLOG("Calling spdk_bdev_get_io_stat is not allowed when VTune integration is enabled.\n");
	memset(stat, 0, sizeof(*stat));
	return;
#endif

	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	channel->stat.ticks_rate = spdk_get_ticks_hz();
	*stat = channel->stat;
	memset(&channel->stat, 0, sizeof(channel->stat));
}

int
spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_admin_passthru\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;
	bdev_io->u.nvme_passthru.md_buf = NULL;
	bdev_io->u.nvme_passthru.md_len = 0;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			   const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
			   spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		/*
		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
		 * to easily determine if the command is a read or write, but for now just
		 * do not allow io_passthru with a read-only descriptor.
		 */
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_io_passthru\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;
	bdev_io->u.nvme_passthru.md_buf = NULL;
	bdev_io->u.nvme_passthru.md_len = 0;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		/*
		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
		 * to easily determine if the command is a read or write, but for now just
		 * do not allow io_passthru with a read-only descriptor.
		 */
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_io_passthru_md\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;
	bdev_io->u.nvme_passthru.md_buf = md_buf;
	bdev_io->u.nvme_passthru.md_len = md_len;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io is NULL\n");
		return -1;
	}

	if (bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING) {
		SPDK_ERRLOG("bdev_io is in pending state\n");
		assert(false);
		return -1;
	}

	spdk_bdev_put_io(bdev_io);

	return 0;
}

static void
_spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev *bdev = bdev_ch->bdev;
	struct spdk_bdev_module_channel *module_ch = bdev_ch->module_ch;
	struct spdk_bdev_io *bdev_io;

	if (module_ch->io_outstanding > module_ch->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller. Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
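		 * As a concrete illustration: if 100 I/O were outstanding when the NOMEM
		 * was recorded, spdk_bdev_io_complete() sets nomem_threshold to 92
		 * (100 - NOMEM_THRESHOLD_COUNT), so retries begin only after 8 more
		 * completions; at a queue depth of 10 the threshold would instead be
		 * 5 (half of the outstanding I/O).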
2098 */ 2099 return; 2100 } 2101 2102 while (!TAILQ_EMPTY(&module_ch->nomem_io)) { 2103 bdev_io = TAILQ_FIRST(&module_ch->nomem_io); 2104 TAILQ_REMOVE(&module_ch->nomem_io, bdev_io, link); 2105 bdev_io->ch->io_outstanding++; 2106 module_ch->io_outstanding++; 2107 bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING; 2108 bdev->fn_table->submit_request(bdev_io->ch->channel, bdev_io); 2109 if (bdev_io->status == SPDK_BDEV_IO_STATUS_NOMEM) { 2110 break; 2111 } 2112 } 2113 } 2114 2115 static inline void 2116 _spdk_bdev_io_complete(void *ctx) 2117 { 2118 struct spdk_bdev_io *bdev_io = ctx; 2119 2120 if (spdk_unlikely(bdev_io->in_submit_request || bdev_io->io_submit_ch)) { 2121 /* 2122 * Send the completion to the thread that originally submitted the I/O, 2123 * which may not be the current thread in the case of QoS. 2124 */ 2125 if (bdev_io->io_submit_ch) { 2126 bdev_io->ch = bdev_io->io_submit_ch; 2127 bdev_io->io_submit_ch = NULL; 2128 } 2129 2130 /* 2131 * Defer completion to avoid potential infinite recursion if the 2132 * user's completion callback issues a new I/O. 2133 */ 2134 spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->ch->channel), 2135 _spdk_bdev_io_complete, bdev_io); 2136 return; 2137 } 2138 2139 assert(bdev_io->cb != NULL); 2140 assert(spdk_get_thread() == spdk_io_channel_get_thread(bdev_io->ch->channel)); 2141 2142 bdev_io->cb(bdev_io, bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS, 2143 bdev_io->caller_ctx); 2144 } 2145 2146 static void 2147 _spdk_bdev_unfreeze_qos_channel(void *ctx) 2148 { 2149 struct spdk_bdev *bdev = ctx; 2150 2151 if (bdev->qos_channel) { 2152 bdev->qos_channel->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 2153 assert(TAILQ_EMPTY(&bdev->qos_channel->queued_resets)); 2154 } 2155 } 2156 2157 static void 2158 _spdk_bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 2159 { 2160 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 2161 2162 if (bdev_io->u.reset.ch_ref != NULL) { 2163 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 2164 bdev_io->u.reset.ch_ref = NULL; 2165 } 2166 2167 _spdk_bdev_io_complete(bdev_io); 2168 } 2169 2170 static void 2171 _spdk_bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 2172 { 2173 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2174 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 2175 2176 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 2177 if (!TAILQ_EMPTY(&ch->queued_resets)) { 2178 _spdk_bdev_channel_start_reset(ch); 2179 } 2180 2181 spdk_for_each_channel_continue(i, 0); 2182 } 2183 2184 void 2185 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 2186 { 2187 struct spdk_bdev *bdev = bdev_io->bdev; 2188 struct spdk_bdev_channel *bdev_ch = bdev_io->ch; 2189 struct spdk_bdev_module_channel *module_ch = bdev_ch->module_ch; 2190 2191 bdev_io->status = status; 2192 2193 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 2194 bool unlock_channels = false; 2195 2196 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 2197 SPDK_ERRLOG("NOMEM returned for reset\n"); 2198 } 2199 pthread_mutex_lock(&bdev->mutex); 2200 if (bdev_io == bdev->reset_in_progress) { 2201 bdev->reset_in_progress = NULL; 2202 unlock_channels = true; 2203 } 2204 pthread_mutex_unlock(&bdev->mutex); 2205 2206 if (unlock_channels) { 2207 /* Explicitly handle the QoS bdev channel as no IO channel associated */ 2208 if (bdev->qos_thread) { 2209 spdk_thread_send_msg(bdev->qos_thread, 2210 _spdk_bdev_unfreeze_qos_channel, bdev); 2211 } 2212 2213 
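			/*
			 * Unfreeze every channel for this bdev; _spdk_bdev_reset_complete() then
			 * drops the reset's channel reference and completes the reset bdev_io
			 * once the iteration has visited all channels.
			 */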
spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_unfreeze_channel, 2214 bdev_io, _spdk_bdev_reset_complete); 2215 return; 2216 } 2217 } else { 2218 assert(bdev_ch->io_outstanding > 0); 2219 assert(module_ch->io_outstanding > 0); 2220 bdev_ch->io_outstanding--; 2221 module_ch->io_outstanding--; 2222 2223 if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) { 2224 TAILQ_INSERT_HEAD(&module_ch->nomem_io, bdev_io, link); 2225 /* 2226 * Wait for some of the outstanding I/O to complete before we 2227 * retry any of the nomem_io. Normally we will wait for 2228 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 2229 * depth channels we will instead wait for half to complete. 2230 */ 2231 module_ch->nomem_threshold = spdk_max((int64_t)module_ch->io_outstanding / 2, 2232 (int64_t)module_ch->io_outstanding - NOMEM_THRESHOLD_COUNT); 2233 return; 2234 } 2235 2236 if (spdk_unlikely(!TAILQ_EMPTY(&module_ch->nomem_io))) { 2237 _spdk_bdev_ch_retry_io(bdev_ch); 2238 } 2239 } 2240 2241 if (status == SPDK_BDEV_IO_STATUS_SUCCESS) { 2242 switch (bdev_io->type) { 2243 case SPDK_BDEV_IO_TYPE_READ: 2244 bdev_ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev->blocklen; 2245 bdev_ch->stat.num_read_ops++; 2246 bdev_ch->stat.read_latency_ticks += (spdk_get_ticks() - bdev_io->submit_tsc); 2247 break; 2248 case SPDK_BDEV_IO_TYPE_WRITE: 2249 bdev_ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev->blocklen; 2250 bdev_ch->stat.num_write_ops++; 2251 bdev_ch->stat.write_latency_ticks += (spdk_get_ticks() - bdev_io->submit_tsc); 2252 break; 2253 default: 2254 break; 2255 } 2256 } 2257 2258 #ifdef SPDK_CONFIG_VTUNE 2259 uint64_t now_tsc = spdk_get_ticks(); 2260 if (now_tsc > (bdev_ch->start_tsc + bdev_ch->interval_tsc)) { 2261 uint64_t data[5]; 2262 2263 data[0] = bdev_ch->stat.num_read_ops; 2264 data[1] = bdev_ch->stat.bytes_read; 2265 data[2] = bdev_ch->stat.num_write_ops; 2266 data[3] = bdev_ch->stat.bytes_written; 2267 data[4] = bdev->fn_table->get_spin_time ? 
2268 bdev->fn_table->get_spin_time(bdev_ch->channel) : 0; 2269 2270 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_ch->handle, 2271 __itt_metadata_u64, 5, data); 2272 2273 memset(&bdev_ch->stat, 0, sizeof(bdev_ch->stat)); 2274 bdev_ch->start_tsc = now_tsc; 2275 } 2276 #endif 2277 2278 _spdk_bdev_io_complete(bdev_io); 2279 } 2280 2281 void 2282 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 2283 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 2284 { 2285 if (sc == SPDK_SCSI_STATUS_GOOD) { 2286 bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS; 2287 } else { 2288 bdev_io->status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 2289 bdev_io->error.scsi.sc = sc; 2290 bdev_io->error.scsi.sk = sk; 2291 bdev_io->error.scsi.asc = asc; 2292 bdev_io->error.scsi.ascq = ascq; 2293 } 2294 2295 spdk_bdev_io_complete(bdev_io, bdev_io->status); 2296 } 2297 2298 void 2299 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 2300 int *sc, int *sk, int *asc, int *ascq) 2301 { 2302 assert(sc != NULL); 2303 assert(sk != NULL); 2304 assert(asc != NULL); 2305 assert(ascq != NULL); 2306 2307 switch (bdev_io->status) { 2308 case SPDK_BDEV_IO_STATUS_SUCCESS: 2309 *sc = SPDK_SCSI_STATUS_GOOD; 2310 *sk = SPDK_SCSI_SENSE_NO_SENSE; 2311 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 2312 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 2313 break; 2314 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 2315 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 2316 break; 2317 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 2318 *sc = bdev_io->error.scsi.sc; 2319 *sk = bdev_io->error.scsi.sk; 2320 *asc = bdev_io->error.scsi.asc; 2321 *ascq = bdev_io->error.scsi.ascq; 2322 break; 2323 default: 2324 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 2325 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 2326 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 2327 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 2328 break; 2329 } 2330 } 2331 2332 void 2333 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc) 2334 { 2335 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 2336 bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS; 2337 } else { 2338 bdev_io->error.nvme.sct = sct; 2339 bdev_io->error.nvme.sc = sc; 2340 bdev_io->status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 2341 } 2342 2343 spdk_bdev_io_complete(bdev_io, bdev_io->status); 2344 } 2345 2346 void 2347 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc) 2348 { 2349 assert(sct != NULL); 2350 assert(sc != NULL); 2351 2352 if (bdev_io->status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 2353 *sct = bdev_io->error.nvme.sct; 2354 *sc = bdev_io->error.nvme.sc; 2355 } else if (bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) { 2356 *sct = SPDK_NVME_SCT_GENERIC; 2357 *sc = SPDK_NVME_SC_SUCCESS; 2358 } else { 2359 *sct = SPDK_NVME_SCT_GENERIC; 2360 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2361 } 2362 } 2363 2364 struct spdk_thread * 2365 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 2366 { 2367 return spdk_io_channel_get_thread(bdev_io->ch->channel); 2368 } 2369 2370 static void 2371 _spdk_bdev_qos_config(struct spdk_bdev *bdev) 2372 { 2373 struct spdk_conf_section *sp = NULL; 2374 const char *val = NULL; 2375 int ios_per_sec = 0; 2376 int i = 0; 2377 2378 sp = spdk_conf_find_section(NULL, "QoS"); 2379 if (!sp) { 2380 return; 2381 } 2382 2383 while (true) { 2384 val = spdk_conf_section_get_nmval(sp, "Limit_IOPS", i, 0); 2385 if (!val) { 2386 break; 2387 } 2388 2389 if (strcmp(bdev->name, val) != 0) { 2390 i++; 2391 continue; 2392 
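			/* Not the bdev being configured - move on to the next Limit_IOPS entry. */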
		}

		val = spdk_conf_section_get_nmval(sp, "Limit_IOPS", i, 1);
		if (!val) {
			return;
		}

		ios_per_sec = (int)strtol(val, NULL, 10);
		if (ios_per_sec > 0) {
			if (ios_per_sec % SPDK_BDEV_QOS_MIN_IOS_PER_SEC) {
				SPDK_ERRLOG("Assigned IOPS %u on bdev %s is not a multiple of %u\n",
					    ios_per_sec, bdev->name, SPDK_BDEV_QOS_MIN_IOS_PER_SEC);
				SPDK_ERRLOG("Failed to enable QoS on this bdev %s\n", bdev->name);
			} else {
				bdev->ios_per_sec = (uint64_t)ios_per_sec;
				SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS:%lu\n",
					      bdev->name, bdev->ios_per_sec);
			}
		}

		return;
	}
}

static int
spdk_bdev_init(struct spdk_bdev *bdev)
{
	assert(bdev->module != NULL);

	if (!bdev->name) {
		SPDK_ERRLOG("Bdev name is NULL\n");
		return -EINVAL;
	}

	if (spdk_bdev_get_by_name(bdev->name)) {
		SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name);
		return -EEXIST;
	}

	bdev->status = SPDK_BDEV_STATUS_READY;

	TAILQ_INIT(&bdev->open_descs);

	TAILQ_INIT(&bdev->aliases);

	bdev->reset_in_progress = NULL;

	_spdk_bdev_qos_config(bdev);

	spdk_io_device_register(__bdev_to_io_dev(bdev),
				spdk_bdev_channel_create, spdk_bdev_channel_destroy,
				sizeof(struct spdk_bdev_channel));

	pthread_mutex_init(&bdev->mutex, NULL);
	return 0;
}

static void
spdk_bdev_destroy_cb(void *io_device)
{
	int rc;
	struct spdk_bdev *bdev;
	spdk_bdev_unregister_cb cb_fn;
	void *cb_arg;

	bdev = __bdev_from_io_dev(io_device);
	cb_fn = bdev->unregister_cb;
	cb_arg = bdev->unregister_ctx;

	rc = bdev->fn_table->destruct(bdev->ctxt);
	if (rc < 0) {
		SPDK_ERRLOG("destruct failed\n");
	}
	if (rc <= 0 && cb_fn != NULL) {
		cb_fn(cb_arg, rc);
	}
}


static void
spdk_bdev_fini(struct spdk_bdev *bdev)
{
	pthread_mutex_destroy(&bdev->mutex);

	spdk_io_device_unregister(__bdev_to_io_dev(bdev), spdk_bdev_destroy_cb);
}

static void
spdk_bdev_start(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;

	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name);
	TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, link);

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) {
		if (module->examine) {
			module->action_in_progress++;
			module->examine(bdev);
		}
	}
}

int
spdk_bdev_register(struct spdk_bdev *bdev)
{
	int rc = spdk_bdev_init(bdev);

	if (rc == 0) {
		spdk_bdev_start(bdev);
	}

	return rc;
}

static void
spdk_vbdev_remove_base_bdevs(struct spdk_bdev *vbdev)
{
	struct spdk_bdev **bdevs;
	struct spdk_bdev *base;
	size_t i, j, k;
	bool found;

	/* Iterate over base bdevs to remove vbdev from them. */
	for (i = 0; i < vbdev->base_bdevs_cnt; i++) {
		found = false;
		base = vbdev->base_bdevs[i];

		for (j = 0; j < base->vbdevs_cnt; j++) {
			if (base->vbdevs[j] != vbdev) {
				continue;
			}

			for (k = j; k + 1 < base->vbdevs_cnt; k++) {
				base->vbdevs[k] = base->vbdevs[k + 1];
			}

			base->vbdevs_cnt--;
			if (base->vbdevs_cnt > 0) {
				bdevs = realloc(base->vbdevs, base->vbdevs_cnt * sizeof(bdevs[0]));
				/* It would be odd if shrinking a memory block failed. */
				assert(bdevs);
				base->vbdevs = bdevs;
			} else {
				free(base->vbdevs);
				base->vbdevs = NULL;
			}

			found = true;
			break;
		}

		if (!found) {
			SPDK_WARNLOG("Bdev '%s' is not base bdev of '%s'.\n", base->name, vbdev->name);
		}
	}

	free(vbdev->base_bdevs);
	vbdev->base_bdevs = NULL;
	vbdev->base_bdevs_cnt = 0;
}

static int
spdk_vbdev_set_base_bdevs(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, size_t cnt)
{
	struct spdk_bdev **vbdevs;
	struct spdk_bdev *base;
	size_t i;

	/* Adding base bdevs isn't supported (yet?). */
	assert(vbdev->base_bdevs_cnt == 0);

	vbdev->base_bdevs = malloc(cnt * sizeof(vbdev->base_bdevs[0]));
	if (!vbdev->base_bdevs) {
		SPDK_ERRLOG("%s - malloc() failed\n", vbdev->name);
		return -ENOMEM;
	}

	memcpy(vbdev->base_bdevs, base_bdevs, cnt * sizeof(vbdev->base_bdevs[0]));
	vbdev->base_bdevs_cnt = cnt;

	/* Iterate over base bdevs to add this vbdev to them. */
	for (i = 0; i < cnt; i++) {
		base = vbdev->base_bdevs[i];

		assert(base != NULL);
		assert(base->claim_module != NULL);

		vbdevs = realloc(base->vbdevs, (base->vbdevs_cnt + 1) * sizeof(vbdevs[0]));
		if (!vbdevs) {
			SPDK_ERRLOG("%s - realloc() failed\n", base->name);
			spdk_vbdev_remove_base_bdevs(vbdev);
			return -ENOMEM;
		}

		vbdevs[base->vbdevs_cnt] = vbdev;
		base->vbdevs = vbdevs;
		base->vbdevs_cnt++;
	}

	return 0;
}

int
spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count)
{
	int rc;

	rc = spdk_bdev_init(vbdev);
	if (rc) {
		return rc;
	}

	if (base_bdev_count == 0) {
		spdk_bdev_start(vbdev);
		return 0;
	}

	rc = spdk_vbdev_set_base_bdevs(vbdev, base_bdevs, base_bdev_count);
	if (rc) {
		spdk_bdev_fini(vbdev);
		return rc;
	}

	spdk_bdev_start(vbdev);
	return 0;
}

void
spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno)
{
	if (bdev->unregister_cb != NULL) {
		bdev->unregister_cb(bdev->unregister_ctx, bdeverrno);
	}
}

static void
_remove_notify(void *arg)
{
	struct spdk_bdev_desc *desc = arg;

	desc->remove_cb(desc->remove_ctx);
}

void
spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_desc *desc, *tmp;
	bool do_destruct = true;

	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name);

	pthread_mutex_lock(&bdev->mutex);

	spdk_vbdev_remove_base_bdevs(bdev);

	bdev->status = SPDK_BDEV_STATUS_REMOVING;
	bdev->unregister_cb = cb_fn;
	bdev->unregister_ctx = cb_arg;

	TAILQ_FOREACH_SAFE(desc, &bdev->open_descs, link, tmp) {
		if (desc->remove_cb) {
			do_destruct = false;
			/*
			 * Defer invocation of the remove_cb to a separate message that will
			 * run later on this thread. This ensures this context unwinds and
			 * we don't recursively unregister this bdev again if the remove_cb
			 * immediately closes its descriptor.
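			 * The descriptor itself remains valid until the application calls
			 * spdk_bdev_close(); once the last descriptor of a bdev in the
			 * REMOVING state is closed, the deferred unregister is retried there.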
2661 */ 2662 spdk_thread_send_msg(spdk_get_thread(), _remove_notify, desc); 2663 } 2664 } 2665 2666 if (!do_destruct) { 2667 pthread_mutex_unlock(&bdev->mutex); 2668 return; 2669 } 2670 2671 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link); 2672 pthread_mutex_unlock(&bdev->mutex); 2673 2674 spdk_bdev_fini(bdev); 2675 } 2676 2677 int 2678 spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb, 2679 void *remove_ctx, struct spdk_bdev_desc **_desc) 2680 { 2681 struct spdk_bdev_desc *desc; 2682 2683 desc = calloc(1, sizeof(*desc)); 2684 if (desc == NULL) { 2685 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 2686 return -ENOMEM; 2687 } 2688 2689 pthread_mutex_lock(&bdev->mutex); 2690 2691 if (write && bdev->claim_module) { 2692 SPDK_INFOLOG(SPDK_LOG_BDEV, "Could not open %s - already claimed\n", bdev->name); 2693 free(desc); 2694 pthread_mutex_unlock(&bdev->mutex); 2695 return -EPERM; 2696 } 2697 2698 TAILQ_INSERT_TAIL(&bdev->open_descs, desc, link); 2699 2700 desc->bdev = bdev; 2701 desc->remove_cb = remove_cb; 2702 desc->remove_ctx = remove_ctx; 2703 desc->write = write; 2704 *_desc = desc; 2705 2706 pthread_mutex_unlock(&bdev->mutex); 2707 2708 return 0; 2709 } 2710 2711 void 2712 spdk_bdev_close(struct spdk_bdev_desc *desc) 2713 { 2714 struct spdk_bdev *bdev = desc->bdev; 2715 bool do_unregister = false; 2716 2717 pthread_mutex_lock(&bdev->mutex); 2718 2719 TAILQ_REMOVE(&bdev->open_descs, desc, link); 2720 free(desc); 2721 2722 if (bdev->status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->open_descs)) { 2723 do_unregister = true; 2724 } 2725 pthread_mutex_unlock(&bdev->mutex); 2726 2727 if (do_unregister == true) { 2728 spdk_bdev_unregister(bdev, bdev->unregister_cb, bdev->unregister_ctx); 2729 } 2730 } 2731 2732 int 2733 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 2734 struct spdk_bdev_module *module) 2735 { 2736 if (bdev->claim_module != NULL) { 2737 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 2738 bdev->claim_module->name); 2739 return -EPERM; 2740 } 2741 2742 if (desc && !desc->write) { 2743 desc->write = true; 2744 } 2745 2746 bdev->claim_module = module; 2747 return 0; 2748 } 2749 2750 void 2751 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 2752 { 2753 assert(bdev->claim_module != NULL); 2754 bdev->claim_module = NULL; 2755 } 2756 2757 struct spdk_bdev * 2758 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 2759 { 2760 return desc->bdev; 2761 } 2762 2763 void 2764 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 2765 { 2766 struct iovec *iovs; 2767 int iovcnt; 2768 2769 if (bdev_io == NULL) { 2770 return; 2771 } 2772 2773 switch (bdev_io->type) { 2774 case SPDK_BDEV_IO_TYPE_READ: 2775 iovs = bdev_io->u.bdev.iovs; 2776 iovcnt = bdev_io->u.bdev.iovcnt; 2777 break; 2778 case SPDK_BDEV_IO_TYPE_WRITE: 2779 iovs = bdev_io->u.bdev.iovs; 2780 iovcnt = bdev_io->u.bdev.iovcnt; 2781 break; 2782 default: 2783 iovs = NULL; 2784 iovcnt = 0; 2785 break; 2786 } 2787 2788 if (iovp) { 2789 *iovp = iovs; 2790 } 2791 if (iovcntp) { 2792 *iovcntp = iovcnt; 2793 } 2794 } 2795 2796 void 2797 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 2798 { 2799 2800 if (spdk_bdev_module_list_find(bdev_module->name)) { 2801 fprintf(stderr, "ERROR: module '%s' already registered.\n", bdev_module->name); 2802 assert(false); 2803 } 2804 2805 if (bdev_module->async_init) { 2806 bdev_module->action_in_progress = 1; 2807 } 2808 2809 /* 2810 * Modules with 
examine callbacks must be initialized first, so they are 2811 * ready to handle examine callbacks from later modules that will 2812 * register physical bdevs. 2813 */ 2814 if (bdev_module->examine != NULL) { 2815 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, tailq); 2816 } else { 2817 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, tailq); 2818 } 2819 } 2820 2821 struct spdk_bdev_module * 2822 spdk_bdev_module_list_find(const char *name) 2823 { 2824 struct spdk_bdev_module *bdev_module; 2825 2826 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) { 2827 if (strcmp(name, bdev_module->name) == 0) { 2828 break; 2829 } 2830 } 2831 2832 return bdev_module; 2833 } 2834 2835 static void 2836 spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2837 { 2838 uint64_t len; 2839 2840 if (!success) { 2841 bdev_io->cb = bdev_io->u.bdev.stored_user_cb; 2842 _spdk_bdev_io_complete(bdev_io); 2843 return; 2844 } 2845 2846 /* no need to perform the error checking from write_zeroes_blocks because this request already passed those checks. */ 2847 len = spdk_min(spdk_bdev_get_block_size(bdev_io->bdev) * bdev_io->u.bdev.split_remaining_num_blocks, 2848 ZERO_BUFFER_SIZE); 2849 2850 bdev_io->u.bdev.offset_blocks = bdev_io->u.bdev.split_current_offset_blocks; 2851 bdev_io->u.bdev.iov.iov_len = len; 2852 bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev_io->bdev); 2853 bdev_io->u.bdev.split_remaining_num_blocks -= bdev_io->u.bdev.num_blocks; 2854 bdev_io->u.bdev.split_current_offset_blocks += bdev_io->u.bdev.num_blocks; 2855 2856 /* if this round completes the i/o, change the callback to be the original user callback */ 2857 if (bdev_io->u.bdev.split_remaining_num_blocks == 0) { 2858 spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, bdev_io->u.bdev.stored_user_cb); 2859 } else { 2860 spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, spdk_bdev_write_zeroes_split); 2861 } 2862 spdk_bdev_io_submit(bdev_io); 2863 } 2864 2865 SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV) 2866
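/*
 * Illustrative usage sketch (not part of the library, left disabled): it shows how
 * an application might locate a bdev, open it, submit a reset through this API, and
 * release resources from the completion callback. It assumes the
 * spdk_bdev_get_io_channel() helper defined earlier in this file and an SPDK
 * application thread; names such as reset_done() and my_bdev_reset_example() are
 * hypothetical.
 */
#if 0
static void
reset_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_io_channel *io_ch = cb_arg;

	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "reset %s\n", success ? "succeeded" : "failed");

	/* Every completed bdev_io must be returned to the pool. */
	spdk_bdev_free_io(bdev_io);
	spdk_put_io_channel(io_ch);
}

static int
my_bdev_reset_example(const char *name, struct spdk_bdev_desc **desc)
{
	struct spdk_bdev *bdev;
	struct spdk_io_channel *io_ch;
	int rc;

	bdev = spdk_bdev_get_by_name(name);
	if (bdev == NULL) {
		return -ENODEV;
	}

	/* Open read/write; write == true is also what permits NVMe passthru. */
	rc = spdk_bdev_open(bdev, true, NULL, NULL, desc);
	if (rc != 0) {
		return rc;
	}

	io_ch = spdk_bdev_get_io_channel(*desc);
	if (io_ch == NULL) {
		spdk_bdev_close(*desc);
		return -ENOMEM;
	}

	/* Queues a reset; reset_done() runs once all frozen channels are unfrozen. */
	rc = spdk_bdev_reset(*desc, io_ch, reset_done, io_ch);
	if (rc != 0) {
		spdk_put_io_channel(io_ch);
		spdk_bdev_close(*desc);
	}

	return rc;
}
#endif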