/*-
 *   BSD LICENSE
 *
 *   Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"
#include "spdk/conf.h"

#include "spdk/env.h"
#include "spdk/event.h"
#include "spdk/io_channel.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/util.h"

#include "spdk_internal/bdev.h"
#include "spdk_internal/log.h"
#include "spdk/string.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define BUF_SMALL_POOL_SIZE			8192
#define BUF_LARGE_POOL_SIZE			1024
#define NOMEM_THRESHOLD_COUNT			8
#define ZERO_BUFFER_SIZE			0x100000
#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_SEC_TO_USEC			1000000ULL
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		10000

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	struct spdk_mempool *buf_small_pool;
	struct spdk_mempool *buf_large_pool;

	void *zero_buffer;

	TAILQ_HEAD(, spdk_bdev_module) bdev_modules;

	TAILQ_HEAD(, spdk_bdev) bdevs;

	bool init_complete;
	bool module_init_complete;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.init_complete = false,
	.module_init_complete = false,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;
struct spdk_bdev_qos {
	/** Rate limit, in I/O per second */
	uint64_t rate_limit;

	/** The channel that all I/O are funneled through */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Maximum number of I/O allowed to be issued in one timeslice (e.g., 1ms).
	 *  Only valid for the master channel, which manages the outstanding I/O. */
	uint64_t max_ios_per_timeslice;

	/** Number of I/O submitted in the current timeslice (e.g., 1ms) */
	uint64_t io_submitted_this_timeslice;

	/** Poller that processes queued I/O commands each timeslice. */
	struct spdk_poller *poller;
};
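/*
 * Worked example of the rate-to-timeslice conversion performed by
 * spdk_bdev_qos_update_max_ios_per_timeslice() below: with the minimum
 * allowed rate of SPDK_BDEV_QOS_MIN_IOS_PER_SEC (10000 I/O per second) and
 * SPDK_BDEV_QOS_TIMESLICE_IN_USEC (1000 us) timeslices,
 *
 *	max_ios_per_timeslice = 10000 * 1000 / 1000000 = 10
 *
 * so the QoS poller releases at most 10 queued I/O per 1 ms timeslice.
 * Rates that would compute to zero are clamped up to
 * SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE.
 */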
struct spdk_bdev_mgmt_channel {
	bdev_io_stailq_t need_buf_small;
	bdev_io_stailq_t need_buf_large;

	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;

	TAILQ_HEAD(, spdk_bdev_module_channel) module_channels;
};

/*
 * Per-module (or per-io_device) channel. Multiple bdevs built on the same io_device
 * queue their I/O awaiting retry here, which makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_module_channel {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *module_ch;

	/* Refcount of bdev channels using this channel */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_module_channel) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Channel for the bdev module */
	struct spdk_bdev_module_channel *module_ch;

	struct spdk_bdev_io_stat stat;

	/*
	 * Count of I/O submitted through this channel and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	bdev_io_tailq_t queued_resets;

	uint32_t flags;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t start_tsc;
	uint64_t interval_tsc;
	__itt_string_handle *handle;
#endif

};

struct spdk_bdev_desc {
	struct spdk_bdev *bdev;
	spdk_bdev_remove_cb_t remove_cb;
	void *remove_ctx;
	bool write;
	TAILQ_ENTRY(spdk_bdev_desc) link;
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))

static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, link);
	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->claim_module == NULL) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, link));

	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_alias *tmp;
	struct spdk_bdev *bdev = spdk_bdev_first();

	while (bdev != NULL) {
		if (strcmp(bdev_name, bdev->name) == 0) {
			return bdev;
		}

		TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
			if (strcmp(bdev_name, tmp->alias) == 0) {
				return bdev;
			}
		}

		bdev = spdk_bdev_next(bdev);
	}

	return NULL;
}

static void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	assert(bdev_io->get_buf_cb != NULL);
	assert(buf != NULL);
	assert(bdev_io->u.bdev.iovs != NULL);

	bdev_io->buf = buf;
	bdev_io->u.bdev.iovs[0].iov_base = (void *)((unsigned long)((char *)buf + 512) & ~511UL);
	bdev_io->u.bdev.iovs[0].iov_len = bdev_io->buf_len;
	bdev_io->get_buf_cb(bdev_io->ch->channel, bdev_io);
}
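/*
 * Note on the alignment math above: the data buffer pools are created with
 * SPDK_BDEV_*_BUF_MAX_SIZE + 512 bytes per element, so advancing the raw
 * pointer by 512 and masking with ~511UL always yields a 512-byte aligned
 * address that still has buf_len usable bytes behind it.  For example
 * (hypothetical address), a pool buffer at 0x1234 is handed to the module at
 * 0x1400.
 */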
static void
spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_mempool *pool;
	struct spdk_bdev_io *tmp;
	void *buf;
	bdev_io_stailq_t *stailq;
	struct spdk_bdev_mgmt_channel *ch;

	assert(bdev_io->u.bdev.iovcnt == 1);

	buf = bdev_io->buf;
	ch = bdev_io->ch->module_ch->mgmt_ch;

	if (bdev_io->buf_len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &ch->need_buf_large;
	}

	if (STAILQ_EMPTY(stailq)) {
		spdk_mempool_put(pool, buf);
	} else {
		tmp = STAILQ_FIRST(stailq);
		STAILQ_REMOVE_HEAD(stailq, buf_link);
		spdk_bdev_io_set_buf(tmp, buf);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_mempool *pool;
	bdev_io_stailq_t *stailq;
	void *buf = NULL;
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	assert(cb != NULL);
	assert(bdev_io->u.bdev.iovs != NULL);

	if (spdk_unlikely(bdev_io->u.bdev.iovs[0].iov_base != NULL)) {
		/* Buffer already present */
		cb(bdev_io->ch->channel, bdev_io);
		return;
	}

	assert(len <= SPDK_BDEV_LARGE_BUF_MAX_SIZE);
	mgmt_ch = bdev_io->ch->module_ch->mgmt_ch;

	bdev_io->buf_len = len;
	bdev_io->get_buf_cb = cb;
	if (len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &mgmt_ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &mgmt_ch->need_buf_large;
	}

	buf = spdk_mempool_get(pool);

	if (!buf) {
		STAILQ_INSERT_TAIL(stailq, bdev_io, buf_link);
	} else {
		spdk_bdev_io_set_buf(bdev_io, buf);
	}
}
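/*
 * Minimal usage sketch for spdk_bdev_io_get_buf() (hypothetical module code,
 * not part of this file): a bdev module that needs a data buffer for a READ
 * asks the bdev layer for one and continues in the callback once it is
 * available.
 *
 *	static void
 *	my_bdev_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
 *	{
 *		// bdev_io->u.bdev.iovs[0] now points at a 512-byte aligned buffer
 *		my_bdev_do_read(ch, bdev_io);
 *	}
 *
 *	static void
 *	my_bdev_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
 *	{
 *		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
 *			spdk_bdev_io_get_buf(bdev_io, my_bdev_read_get_buf_cb,
 *					     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 *		}
 *	}
 *
 * If no buffer is available, the bdev_io is parked on need_buf_small/large
 * and the callback fires later from spdk_bdev_io_put_buf().
 */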
static int
spdk_bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

void
spdk_bdev_config_text(FILE *fp)
{
	struct spdk_bdev_module *bdev_module;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->config_text) {
			bdev_module->config_text(fp);
		}
	}
}

void
spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_module *bdev_module;
	struct spdk_bdev *bdev;

	assert(w != NULL);

	spdk_json_write_array_begin(w);

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->config_json) {
			bdev_module->config_json(w);
		}
	}

	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, link) {
		spdk_bdev_config_json(bdev, w);
	}

	spdk_json_write_array_end(w);
}

static int
spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;

	STAILQ_INIT(&ch->need_buf_small);
	STAILQ_INIT(&ch->need_buf_large);

	STAILQ_INIT(&ch->per_thread_cache);
	ch->per_thread_cache_count = 0;

	TAILQ_INIT(&ch->module_channels);

	return 0;
}

static void
spdk_bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;

	if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
		SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n");
	}

	if (!TAILQ_EMPTY(&ch->module_channels)) {
		SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n");
	}

	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, buf_link);
		ch->per_thread_cache_count--;
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}

	assert(ch->per_thread_cache_count == 0);
}

static void
spdk_bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, tailq) {
		if (m->init_complete) {
			m->init_complete();
		}
	}

	cb_fn(cb_arg, rc);
}

static void
spdk_bdev_module_action_complete(void)
{
	struct spdk_bdev_module *m;

	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, tailq) {
		if (m->action_in_progress > 0) {
			return;
		}
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
	 */
	spdk_bdev_init_complete(0);
}

static void
spdk_bdev_module_action_done(struct spdk_bdev_module *module)
{
	assert(module->action_in_progress > 0);
	module->action_in_progress--;
	spdk_bdev_module_action_complete();
}

void
spdk_bdev_module_init_done(struct spdk_bdev_module *module)
{
	spdk_bdev_module_action_done(module);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
{
	spdk_bdev_module_action_done(module);
}

static int
spdk_bdev_modules_init(void)
{
	struct spdk_bdev_module *module;
	int rc = 0;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) {
		rc = module->module_init();
		if (rc != 0) {
			break;
		}
	}

	g_bdev_mgr.module_init_complete = true;
	return rc;
}
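/*
 * Completion of subsystem init is therefore gated on two conditions: every
 * module's module_init() has returned (module_init_complete) and every
 * module's action_in_progress count has dropped back to zero.  A module that
 * performs asynchronous work during init or examine keeps its
 * action_in_progress elevated and later calls spdk_bdev_module_init_done()
 * or spdk_bdev_module_examine_done() to release it, which re-runs
 * spdk_bdev_module_action_complete().
 */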
void
spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
{
	int cache_size;
	int rc = 0;
	char mempool_name[32];

	assert(cb_fn != NULL);

	g_init_cb_fn = cb_fn;
	g_init_cb_arg = cb_arg;

	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());

	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
				  SPDK_BDEV_IO_POOL_SIZE,
				  sizeof(struct spdk_bdev_io) +
				  spdk_bdev_module_get_max_ctx_size(),
				  0,
				  SPDK_ENV_SOCKET_ID_ANY);

	if (g_bdev_mgr.bdev_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	/**
	 * Ensure no more than half of the total buffers end up in local caches, by
	 * using spdk_env_get_core_count() to determine how many local caches we need
	 * to account for.
	 */
	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());

	g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
				    BUF_SMALL_POOL_SIZE,
				    SPDK_BDEV_SMALL_BUF_MAX_SIZE + 512,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_small_pool) {
		SPDK_ERRLOG("create rbuf small pool failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());

	g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
				    BUF_LARGE_POOL_SIZE,
				    SPDK_BDEV_LARGE_BUF_MAX_SIZE + 512,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_large_pool) {
		SPDK_ERRLOG("create rbuf large pool failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
				 NULL);
	if (!g_bdev_mgr.zero_buffer) {
		SPDK_ERRLOG("create bdev zero buffer failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

#ifdef SPDK_CONFIG_VTUNE
	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
#endif

	spdk_io_device_register(&g_bdev_mgr, spdk_bdev_mgmt_channel_create,
				spdk_bdev_mgmt_channel_destroy,
				sizeof(struct spdk_bdev_mgmt_channel));

	rc = spdk_bdev_modules_init();
	if (rc != 0) {
		SPDK_ERRLOG("bdev modules init failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	spdk_bdev_module_action_complete();
}

static void
spdk_bdev_mgr_unregister_cb(void *io_device)
{
	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;

	if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != SPDK_BDEV_IO_POOL_SIZE) {
		SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
			    SPDK_BDEV_IO_POOL_SIZE);
	}

	if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) {
		SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
			    BUF_SMALL_POOL_SIZE);
		assert(false);
	}

	if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) {
		SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
			    BUF_LARGE_POOL_SIZE);
		assert(false);
	}

	spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
	spdk_mempool_free(g_bdev_mgr.buf_small_pool);
	spdk_mempool_free(g_bdev_mgr.buf_large_pool);
	spdk_dma_free(g_bdev_mgr.zero_buffer);

	cb_fn(g_fini_cb_arg);
	g_fini_cb_fn = NULL;
	g_fini_cb_arg = NULL;
}
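/*
 * The pool counts checked in spdk_bdev_mgr_unregister_cb() above must match
 * the sizes the pools were created with in spdk_bdev_initialize(); any
 * shortfall at shutdown means an spdk_bdev_io or data buffer is still
 * outstanding or was leaked by a bdev module or consumer.  As a sizing
 * example, with BUF_SMALL_POOL_SIZE = 8192 and 4 cores, each per-core cache
 * holds at most 8192 / (2 * 4) = 1024 buffers, so caches can never absorb
 * more than half of the pool.
 */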
static struct spdk_bdev_module *g_resume_bdev_module = NULL;

static void
spdk_bdev_module_finish_iter(void *arg)
{
	struct spdk_bdev_module *bdev_module;

	/* Start iterating from the last touched module */
	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_FIRST(&g_bdev_mgr.bdev_modules);
	} else {
		bdev_module = TAILQ_NEXT(g_resume_bdev_module, tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini) {
			/* Save our place so we can resume later. We must
			 * save the variable here, before calling module_fini()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_finish_done() and re-enter
			 * this function to continue iterating. */
			g_resume_bdev_module = bdev_module;
		}

		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}

		if (bdev_module->async_fini) {
			return;
		}

		bdev_module = TAILQ_NEXT(bdev_module, tailq);
	}

	g_resume_bdev_module = NULL;
	spdk_io_device_unregister(&g_bdev_mgr, spdk_bdev_mgr_unregister_cb);
}

void
spdk_bdev_module_finish_done(void)
{
	if (spdk_get_thread() != g_fini_thread) {
		spdk_thread_send_msg(g_fini_thread, spdk_bdev_module_finish_iter, NULL);
	} else {
		spdk_bdev_module_finish_iter(NULL);
	}
}
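/*
 * Sketch of how an asynchronous-teardown module (hypothetical names) plugs
 * into the iteration above: it sets .async_fini = true, starts its cleanup in
 * module_fini(), and calls spdk_bdev_module_finish_done() once that cleanup
 * completes, which resumes the iteration at the next module.
 *
 *	static void
 *	my_module_cleanup_done_cb(void *ctx)
 *	{
 *		spdk_bdev_module_finish_done();
 *	}
 *
 *	static void
 *	my_module_fini(void)
 *	{
 *		my_module_start_async_cleanup(my_module_cleanup_done_cb, NULL);
 *	}
 */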
static void
_spdk_bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
{
	struct spdk_bdev *bdev = cb_arg;

	if (bdeverrno && bdev) {
		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
			     bdev->name);

		/*
		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
		 * bdev; try to continue by manually removing this bdev from the list and continue
		 * with the next bdev in the list.
		 */
		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link);
	}

	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n");
		/*
		 * Bdev module finish needs to be deferred, since we might be in the middle of
		 * some context (like bdev part free) that will use this bdev (or private bdev
		 * driver ctx data) after returning.
		 */
		spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_module_finish_iter, NULL);
		return;
	}

	/*
	 * Unregister the first bdev in the list.
	 *
	 * spdk_bdev_unregister() will handle the case where the bdev has open descriptors by
	 * calling the remove_cb of the descriptors first.
	 *
	 * Once this bdev and all of its open descriptors have been cleaned up, this function
	 * will be called again via the unregister completion callback to continue the cleanup
	 * process with the next bdev.
	 */
	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name);
	spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev);
}

void
spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
{
	assert(cb_fn != NULL);

	g_fini_thread = spdk_get_thread();

	g_fini_cb_fn = cb_fn;
	g_fini_cb_arg = cb_arg;

	_spdk_bdev_finish_unregister_bdevs_iter(NULL, 0);
}

static struct spdk_bdev_io *
spdk_bdev_get_io(struct spdk_bdev_channel *channel)
{
	struct spdk_bdev_mgmt_channel *ch = channel->module_ch->mgmt_ch;
	struct spdk_bdev_io *bdev_io;

	if (ch->per_thread_cache_count > 0) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, buf_link);
		ch->per_thread_cache_count--;
	} else {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		if (!bdev_io) {
			SPDK_ERRLOG("Unable to get spdk_bdev_io\n");
			return NULL;
		}
	}

	return bdev_io;
}

static void
spdk_bdev_put_io(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_mgmt_channel *ch = bdev_io->ch->module_ch->mgmt_ch;

	if (bdev_io->buf != NULL) {
		spdk_bdev_io_put_buf(bdev_io);
	}

	if (ch->per_thread_cache_count < SPDK_BDEV_IO_CACHE_SIZE) {
		ch->per_thread_cache_count++;
		STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, buf_link);
	} else {
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}
}

static void
_spdk_bdev_qos_io_submit(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_io *bdev_io = NULL;
	struct spdk_bdev *bdev = ch->bdev;
	struct spdk_bdev_qos *qos = bdev->qos;
	struct spdk_bdev_module_channel *module_ch = ch->module_ch;

	while (!TAILQ_EMPTY(&qos->queued)) {
		if (qos->io_submitted_this_timeslice < qos->max_ios_per_timeslice) {
			bdev_io = TAILQ_FIRST(&qos->queued);
			TAILQ_REMOVE(&qos->queued, bdev_io, link);
			qos->io_submitted_this_timeslice++;
			ch->io_outstanding++;
			module_ch->io_outstanding++;
			bdev->fn_table->submit_request(ch->channel, bdev_io);
		} else {
			break;
		}
	}
}

static void
_spdk_bdev_io_submit(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
	struct spdk_io_channel *ch = bdev_ch->channel;
	struct spdk_bdev_module_channel *module_ch = bdev_ch->module_ch;

	bdev_io->submit_tsc = spdk_get_ticks();
	bdev_ch->io_outstanding++;
	module_ch->io_outstanding++;
	bdev_io->in_submit_request = true;
	if (spdk_likely(bdev_ch->flags == 0)) {
		if (spdk_likely(TAILQ_EMPTY(&module_ch->nomem_io))) {
			bdev->fn_table->submit_request(ch, bdev_io);
		} else {
			bdev_ch->io_outstanding--;
			module_ch->io_outstanding--;
			TAILQ_INSERT_TAIL(&module_ch->nomem_io, bdev_io, link);
		}
	} else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	} else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
		bdev_ch->io_outstanding--;
		module_ch->io_outstanding--;
		TAILQ_INSERT_TAIL(&bdev->qos->queued, bdev_io, link);
		_spdk_bdev_qos_io_submit(bdev_ch);
	} else {
		SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
	bdev_io->in_submit_request = false;
}
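/*
 * When nomem_io is non-empty, new I/O is parked behind it above rather than
 * submitted, so per-module-channel ordering is preserved while the module is
 * out of resources.  A module reports that condition by completing a request
 * with SPDK_BDEV_IO_STATUS_NOMEM, for example (hypothetical module code):
 *
 *	static void
 *	my_bdev_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
 *	{
 *		if (my_bdev_queue_full(ch)) {
 *			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
 *			return;
 *		}
 *		// ... normal submission ...
 *	}
 *
 * The bdev layer requeues such I/O on nomem_io and retries it from
 * _spdk_bdev_ch_retry_io() once io_outstanding drops back to nomem_threshold.
 */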
static void
spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);

	if (bdev_io->ch->flags & BDEV_CH_QOS_ENABLED) {
		bdev_io->io_submit_ch = bdev_io->ch;
		bdev_io->ch = bdev->qos->ch;
		spdk_thread_send_msg(bdev->qos->thread, _spdk_bdev_io_submit, bdev_io);
	} else {
		_spdk_bdev_io_submit(bdev_io);
	}
}

static void
spdk_bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
	struct spdk_io_channel *ch = bdev_ch->channel;

	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);

	bdev_io->in_submit_request = true;
	bdev->fn_table->submit_request(ch, bdev_io);
	bdev_io->in_submit_request = false;
}

static void
spdk_bdev_io_init(struct spdk_bdev_io *bdev_io,
		  struct spdk_bdev *bdev, void *cb_arg,
		  spdk_bdev_io_completion_cb cb)
{
	bdev_io->bdev = bdev;
	bdev_io->caller_ctx = cb_arg;
	bdev_io->cb = cb;
	bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
	bdev_io->in_submit_request = false;
	bdev_io->buf = NULL;
	bdev_io->io_submit_ch = NULL;
}

bool
spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
{
	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
}

int
spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	if (bdev->fn_table->dump_info_json) {
		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
	}

	return 0;
}

void
spdk_bdev_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	assert(bdev != NULL);
	assert(w != NULL);

	if (bdev->fn_table->write_config_json) {
		bdev->fn_table->write_config_json(bdev, w);
	} else {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "name", bdev->name);
		spdk_json_write_object_end(w);
	}
}

static void
spdk_bdev_qos_update_max_ios_per_timeslice(struct spdk_bdev_qos *qos)
{
	uint64_t max_ios_per_timeslice = 0;

	max_ios_per_timeslice = qos->rate_limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC /
				SPDK_BDEV_SEC_TO_USEC;
	qos->max_ios_per_timeslice = spdk_max(max_ios_per_timeslice,
					      SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE);
}

static int
spdk_bdev_channel_poll_qos(void *arg)
{
	struct spdk_bdev_qos *qos = arg;

	/* Reset for next round of rate limiting */
	qos->io_submitted_this_timeslice = 0;

	_spdk_bdev_qos_io_submit(qos->ch);

	return -1;
}
static int
_spdk_bdev_channel_create(struct spdk_bdev_channel *ch, void *io_device)
{
	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
	struct spdk_io_channel *mgmt_io_ch;
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	struct spdk_bdev_module_channel *module_ch;

	ch->bdev = bdev;
	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
	if (!ch->channel) {
		return -1;
	}

	mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr);
	if (!mgmt_io_ch) {
		return -1;
	}

	mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch);
	TAILQ_FOREACH(module_ch, &mgmt_ch->module_channels, link) {
		if (module_ch->module_ch == ch->channel) {
			spdk_put_io_channel(mgmt_io_ch);
			module_ch->ref++;
			break;
		}
	}

	if (module_ch == NULL) {
		module_ch = calloc(1, sizeof(*module_ch));
		if (module_ch == NULL) {
			spdk_put_io_channel(mgmt_io_ch);
			return -1;
		}

		module_ch->mgmt_ch = mgmt_ch;
		module_ch->io_outstanding = 0;
		TAILQ_INIT(&module_ch->nomem_io);
		module_ch->nomem_threshold = 0;
		module_ch->module_ch = ch->channel;
		module_ch->ref = 1;
		TAILQ_INSERT_TAIL(&mgmt_ch->module_channels, module_ch, link);
	}

	memset(&ch->stat, 0, sizeof(ch->stat));
	ch->io_outstanding = 0;
	TAILQ_INIT(&ch->queued_resets);
	ch->flags = 0;
	ch->module_ch = module_ch;

	return 0;
}

static void
_spdk_bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_module_channel *module_ch;

	if (!ch) {
		return;
	}

	if (ch->channel) {
		spdk_put_io_channel(ch->channel);
	}

	assert(ch->io_outstanding == 0);

	module_ch = ch->module_ch;
	if (module_ch) {
		assert(module_ch->ref > 0);
		module_ch->ref--;
		if (module_ch->ref == 0) {
			assert(module_ch->io_outstanding == 0);
			spdk_put_io_channel(spdk_io_channel_from_ctx(module_ch->mgmt_ch));
			TAILQ_REMOVE(&module_ch->mgmt_ch->module_channels, module_ch, link);
			free(module_ch);
		}
	}
}

/* Caller must hold bdev->mutex. */
static int
spdk_bdev_qos_channel_create(struct spdk_bdev *bdev)
{
	assert(bdev->qos->ch == NULL);
	assert(bdev->qos->thread == NULL);

	bdev->qos->ch = calloc(1, sizeof(struct spdk_bdev_channel));
	if (!bdev->qos->ch) {
		return -1;
	}

	bdev->qos->thread = spdk_get_thread();
	if (!bdev->qos->thread) {
		free(bdev->qos->ch);
		bdev->qos->ch = NULL;
		return -1;
	}

	if (_spdk_bdev_channel_create(bdev->qos->ch, __bdev_to_io_dev(bdev)) != 0) {
		free(bdev->qos->ch);
		bdev->qos->ch = NULL;
		bdev->qos->thread = NULL;
		return -1;
	}

	TAILQ_INIT(&bdev->qos->queued);

	bdev->qos->ch->flags |= BDEV_CH_QOS_ENABLED;
	spdk_bdev_qos_update_max_ios_per_timeslice(bdev->qos);

	bdev->qos->poller = spdk_poller_register(spdk_bdev_channel_poll_qos,
			    bdev->qos,
			    SPDK_BDEV_QOS_TIMESLICE_IN_USEC);

	return 0;
}

/* Caller must hold bdev->mutex */
static int
_spdk_bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch)
{
	/* Rate limiting on this bdev enabled */
	if (bdev->qos) {
		if (bdev->qos->ch == NULL) {
			if (spdk_bdev_qos_channel_create(bdev) != 0) {
				return -1;
			}
		}
		ch->flags |= BDEV_CH_QOS_ENABLED;
	}

	return 0;
}
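/*
 * Note on the QoS "master" channel created above: it is allocated lazily, on
 * whichever thread creates the first bdev channel after rate limiting is
 * enabled, and bdev->qos->thread records that thread.  I/O submitted on
 * QoS-enabled channels is later funneled to this thread by
 * spdk_bdev_io_submit(), so the per-timeslice accounting runs on a single
 * thread.
 */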
static int
spdk_bdev_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
	struct spdk_bdev_channel *ch = ctx_buf;

	if (_spdk_bdev_channel_create(ch, io_device) != 0) {
		_spdk_bdev_channel_destroy_resource(ch);
		return -1;
	}

#ifdef SPDK_CONFIG_VTUNE
	{
		char *name;
		__itt_init_ittlib(NULL, 0);
		name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
		if (!name) {
			_spdk_bdev_channel_destroy_resource(ch);
			return -1;
		}
		ch->handle = __itt_string_handle_create(name);
		free(name);
		ch->start_tsc = spdk_get_ticks();
		ch->interval_tsc = spdk_get_ticks_hz() / 100;
	}
#endif

	pthread_mutex_lock(&bdev->mutex);

	if (_spdk_bdev_enable_qos(bdev, ch)) {
		_spdk_bdev_channel_destroy_resource(ch);
		pthread_mutex_unlock(&bdev->mutex);
		return -1;
	}

	bdev->channel_count++;

	pthread_mutex_unlock(&bdev->mutex);

	return 0;
}

/*
 * Abort I/O that are waiting on a data buffer. These types of I/O are
 * linked using the spdk_bdev_io buf_link STAILQ_ENTRY.
 */
static void
_spdk_bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch)
{
	bdev_io_stailq_t tmp;
	struct spdk_bdev_io *bdev_io;

	STAILQ_INIT(&tmp);

	while (!STAILQ_EMPTY(queue)) {
		bdev_io = STAILQ_FIRST(queue);
		STAILQ_REMOVE_HEAD(queue, buf_link);
		if (bdev_io->ch == ch) {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		} else {
			STAILQ_INSERT_TAIL(&tmp, bdev_io, buf_link);
		}
	}

	STAILQ_SWAP(&tmp, queue, spdk_bdev_io);
}

/*
 * Abort I/O that are queued waiting for submission. These types of I/O are
 * linked using the spdk_bdev_io link TAILQ_ENTRY.
 */
static void
_spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_io *bdev_io, *tmp;

	TAILQ_FOREACH_SAFE(bdev_io, queue, link, tmp) {
		if (bdev_io->ch == ch) {
			TAILQ_REMOVE(queue, bdev_io, link);
			/*
			 * spdk_bdev_io_complete() assumes that the completed I/O had
			 * been submitted to the bdev module. Since in this case it
			 * hadn't, bump io_outstanding to account for the decrement
			 * that spdk_bdev_io_complete() will do.
			 */
			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
				ch->io_outstanding++;
				ch->module_ch->io_outstanding++;
			}
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}
}
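/*
 * The two abort helpers above intentionally differ: buffer waiters live on
 * per-mgmt-channel STAILQs (need_buf_small/need_buf_large) via buf_link,
 * while submission waiters (queued_resets, nomem_io, qos->queued) live on
 * TAILQs via link.  Both only fail I/O belonging to the channel being torn
 * down or reset; I/O from other bdev channels sharing the same queue is left
 * in place.
 */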
static void
_spdk_bdev_channel_destroy(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	struct spdk_bdev_module_channel *module_ch = ch->module_ch;

	mgmt_ch = module_ch->mgmt_ch;

	_spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
	_spdk_bdev_abort_queued_io(&module_ch->nomem_io, ch);
	_spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_small, ch);
	_spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_large, ch);

	_spdk_bdev_channel_destroy_resource(ch);
}

static void
spdk_bdev_qos_channel_destroy(void *cb_arg)
{
	struct spdk_bdev_qos *qos = cb_arg;

	_spdk_bdev_channel_destroy(qos->ch);

	spdk_poller_unregister(&qos->poller);

	free(qos->ch);
	free(qos);
}

static int
spdk_bdev_qos_destroy(struct spdk_bdev *bdev)
{
	/*
	 * Cleanly shutting down the QoS poller is tricky, because
	 * during the asynchronous operation the user could open a
	 * new channel, spawning a new QoS poller.
	 *
	 * The strategy is to create a new QoS structure here and swap it
	 * in. The shutdown path then continues to refer to the old one
	 * until it completes and then releases it.
	 */
	struct spdk_bdev_qos *new_qos, *old_qos;

	old_qos = bdev->qos;

	new_qos = calloc(1, sizeof(*new_qos));
	if (!new_qos) {
		SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n");
		return -ENOMEM;
	}

	/* Copy the old QoS data into the newly allocated structure */
	memcpy(new_qos, old_qos, sizeof(*new_qos));

	/* Zero out the key parts of the QoS structure */
	new_qos->ch = NULL;
	new_qos->thread = NULL;
	new_qos->max_ios_per_timeslice = 0;
	new_qos->io_submitted_this_timeslice = 0;
	new_qos->poller = NULL;
	TAILQ_INIT(&new_qos->queued);

	bdev->qos = new_qos;

	spdk_thread_send_msg(old_qos->thread, spdk_bdev_qos_channel_destroy,
			     old_qos);

	/* It is safe to continue with destroying the bdev even though the QoS channel hasn't
	 * been destroyed yet. The destruction path will end up waiting for the final
	 * channel to be put before it releases resources. */

	return 0;
}

static void
spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_channel *ch = ctx_buf;
	struct spdk_bdev *bdev = ch->bdev;

	_spdk_bdev_channel_destroy(ch);

	pthread_mutex_lock(&bdev->mutex);
	bdev->channel_count--;
	if (bdev->channel_count == 0 && bdev->qos && bdev->qos->ch != NULL) {
		if (spdk_bdev_qos_destroy(bdev)) {
			/* There isn't anything we can do to recover from here. Just let the
			 * old QoS poller keep running. The QoS handling won't change
			 * cores when the user allocates a new channel, but it won't break. */
			SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n");
		}
	}
	pthread_mutex_unlock(&bdev->mutex);
}

int
spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
{
	struct spdk_bdev_alias *tmp;

	if (alias == NULL) {
		SPDK_ERRLOG("Empty alias passed\n");
		return -EINVAL;
	}

	if (spdk_bdev_get_by_name(alias)) {
		SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias);
		return -EEXIST;
	}

	tmp = calloc(1, sizeof(*tmp));
	if (tmp == NULL) {
		SPDK_ERRLOG("Unable to allocate alias\n");
		return -ENOMEM;
	}

	tmp->alias = strdup(alias);
	if (tmp->alias == NULL) {
		free(tmp);
		SPDK_ERRLOG("Unable to allocate alias\n");
		return -ENOMEM;
	}

	TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);

	return 0;
}

int
spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
{
	struct spdk_bdev_alias *tmp;

	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (strcmp(alias, tmp->alias) == 0) {
			TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
			free(tmp->alias);
			free(tmp);
			return 0;
		}
	}

	SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exist\n", alias);

	return -ENOENT;
}

struct spdk_io_channel *
spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
{
	return spdk_get_io_channel(__bdev_to_io_dev(desc->bdev));
}
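/*
 * spdk_bdev_get_io_channel() must be called on the thread that will submit
 * the I/O; the returned channel is only valid on that thread and is released
 * with spdk_put_io_channel() when the caller is done with it.  A full
 * open/read/close flow is sketched after spdk_bdev_read_blocks() below.
 */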
const char *
spdk_bdev_get_name(const struct spdk_bdev *bdev)
{
	return bdev->name;
}

const char *
spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
{
	return bdev->product_name;
}

const struct spdk_bdev_aliases_list *
spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
{
	return &bdev->aliases;
}

uint32_t
spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
{
	return bdev->blocklen;
}

uint64_t
spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
{
	return bdev->blockcnt;
}

uint64_t
spdk_bdev_get_qos_ios_per_sec(struct spdk_bdev *bdev)
{
	uint64_t rate_limit = 0;

	pthread_mutex_lock(&bdev->mutex);
	if (bdev->qos) {
		rate_limit = bdev->qos->rate_limit;
	}
	pthread_mutex_unlock(&bdev->mutex);

	return rate_limit;
}

size_t
spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
{
	/* TODO: push this logic down to the bdev modules */
	if (bdev->need_aligned_buffer) {
		return bdev->blocklen;
	}

	return 1;
}

uint32_t
spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
{
	return bdev->optimal_io_boundary;
}

bool
spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
{
	return bdev->write_cache;
}

const struct spdk_uuid *
spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
{
	return &bdev->uuid;
}

int
spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
{
	int ret;

	pthread_mutex_lock(&bdev->mutex);

	/* bdev has open descriptors */
	if (!TAILQ_EMPTY(&bdev->open_descs) &&
	    bdev->blockcnt > size) {
		ret = -EBUSY;
	} else {
		bdev->blockcnt = size;
		ret = 0;
	}

	pthread_mutex_unlock(&bdev->mutex);

	return ret;
}
/*
 * Convert I/O offset and length from bytes to blocks.
 *
 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size.
 */
static uint64_t
spdk_bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks,
			  uint64_t num_bytes, uint64_t *num_blocks)
{
	uint32_t block_size = bdev->blocklen;

	*offset_blocks = offset_bytes / block_size;
	*num_blocks = num_bytes / block_size;

	return (offset_bytes % block_size) | (num_bytes % block_size);
}

static bool
spdk_bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
{
	/* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there
	 * has been an overflow and hence the offset has been wrapped around */
	if (offset_blocks + num_blocks < offset_blocks) {
		return false;
	}

	/* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
	if (offset_blocks + num_blocks > bdev->blockcnt) {
		return false;
	}

	return true;
}

int
spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
	       void *buf, uint64_t offset, uint64_t nbytes,
	       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		      void *buf, uint64_t offset_blocks, uint64_t num_blocks,
		      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during read\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.bdev.iov.iov_base = buf;
	bdev_io->u.bdev.iov.iov_len = num_blocks * bdev->blocklen;
	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}
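/*
 * Consumer-side usage sketch (hypothetical application code, not part of this
 * file) for the read path above: open a descriptor, get a per-thread I/O
 * channel, submit a one-block read, and free the I/O from the completion
 * callback.
 *
 *	static void
 *	read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		spdk_bdev_free_io(bdev_io);
 *		if (!success) {
 *			SPDK_ERRLOG("read failed\n");
 *		}
 *	}
 *
 *	static int
 *	read_first_block(struct spdk_bdev *bdev, void *buf)
 *	{
 *		struct spdk_bdev_desc *desc;
 *		struct spdk_io_channel *io_ch;
 *		int rc;
 *
 *		rc = spdk_bdev_open(bdev, false, NULL, NULL, &desc);
 *		if (rc != 0) {
 *			return rc;
 *		}
 *		io_ch = spdk_bdev_get_io_channel(desc);
 *		rc = spdk_bdev_read_blocks(desc, io_ch, buf, 0, 1, read_done, NULL);
 *		// ... later, after read_done has run:
 *		// spdk_put_io_channel(io_ch); spdk_bdev_close(desc);
 *		return rc;
 *	}
 */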
int
spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		struct iovec *iov, int iovcnt,
		uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       struct iovec *iov, int iovcnt,
		       uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during read\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.bdev.iovs = iov;
	bdev_io->u.bdev.iovcnt = iovcnt;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		void *buf, uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       void *buf, uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during write\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.bdev.iov.iov_base = buf;
	bdev_io->u.bdev.iov.iov_len = num_blocks * bdev->blocklen;
	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		 struct iovec *iov, int iovcnt,
		 uint64_t offset, uint64_t len,
		 spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			struct iovec *iov, int iovcnt,
			uint64_t offset_blocks, uint64_t num_blocks,
			spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during writev\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.bdev.iovs = iov;
	bdev_io->u.bdev.iovcnt = iovcnt;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}
int
spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset, uint64_t len,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      uint64_t offset_blocks, uint64_t num_blocks,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	uint64_t len;
	bool split_request = false;

	if (num_blocks > UINT64_MAX / spdk_bdev_get_block_size(bdev)) {
		SPDK_ERRLOG("length argument out of range in write_zeroes\n");
		return -ERANGE;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);

	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during write_zeroes\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->u.bdev.offset_blocks = offset_blocks;

	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
		bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
		bdev_io->u.bdev.num_blocks = num_blocks;
		bdev_io->u.bdev.iovs = NULL;
		bdev_io->u.bdev.iovcnt = 0;

	} else {
		assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE);

		len = spdk_bdev_get_block_size(bdev) * num_blocks;

		if (len > ZERO_BUFFER_SIZE) {
			split_request = true;
			len = ZERO_BUFFER_SIZE;
		}

		bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
		bdev_io->u.bdev.iov.iov_base = g_bdev_mgr.zero_buffer;
		bdev_io->u.bdev.iov.iov_len = len;
		bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
		bdev_io->u.bdev.iovcnt = 1;
		bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev);
		bdev_io->u.bdev.split_remaining_num_blocks = num_blocks - bdev_io->u.bdev.num_blocks;
		bdev_io->u.bdev.split_current_offset_blocks = offset_blocks + bdev_io->u.bdev.num_blocks;
	}

	if (split_request) {
		bdev_io->u.bdev.stored_user_cb = cb;
		spdk_bdev_io_init(bdev_io, bdev, cb_arg, spdk_bdev_write_zeroes_split);
	} else {
		spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
	}
	spdk_bdev_io_submit(bdev_io);
	return 0;
}
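/*
 * Worked example of the fallback path above, for a bdev without native
 * WRITE_ZEROES support: with a 512-byte block size, a request for 6144 blocks
 * (3 MiB) issues a first WRITE of ZERO_BUFFER_SIZE (1 MiB, i.e. 2048 blocks)
 * from the shared zero_buffer and records split_remaining_num_blocks = 4096
 * and split_current_offset_blocks = offset_blocks + 2048.  That bookkeeping,
 * together with stored_user_cb, is what spdk_bdev_write_zeroes_split() (the
 * completion callback installed for split requests) uses to issue the
 * remaining writes before invoking the caller's callback.
 */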
int
spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	if (num_blocks == 0) {
		SPDK_ERRLOG("Can't unmap 0 blocks\n");
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during unmap\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
	bdev_io->u.bdev.iov.iov_base = NULL;
	bdev_io->u.bdev.iov.iov_len = 0;
	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		uint64_t offset, uint64_t length,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, length, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during flush\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
	bdev_io->u.bdev.iovs = NULL;
	bdev_io->u.bdev.iovcnt = 0;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

static void
_spdk_bdev_reset_dev(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev_io *bdev_io;

	bdev_io = TAILQ_FIRST(&ch->queued_resets);
	TAILQ_REMOVE(&ch->queued_resets, bdev_io, link);
	spdk_bdev_io_submit_reset(bdev_io);
}

static void
_spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch;
	struct spdk_bdev_channel *channel;
	struct spdk_bdev_mgmt_channel *mgmt_channel;
	struct spdk_bdev_module_channel *module_ch;

	ch = spdk_io_channel_iter_get_channel(i);
	channel = spdk_io_channel_get_ctx(ch);
	module_ch = channel->module_ch;
	mgmt_channel = module_ch->mgmt_ch;

	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;

	_spdk_bdev_abort_queued_io(&module_ch->nomem_io, channel);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel);

	spdk_for_each_channel_continue(i, 0);
}
static void
_spdk_bdev_reset_freeze_qos_channel(void *ctx)
{
	struct spdk_bdev *bdev = ctx;
	struct spdk_bdev_mgmt_channel *mgmt_channel = NULL;
	struct spdk_bdev_channel *qos_channel = bdev->qos->ch;
	struct spdk_bdev_module_channel *module_ch = NULL;

	if (qos_channel) {
		module_ch = qos_channel->module_ch;
		mgmt_channel = module_ch->mgmt_ch;

		qos_channel->flags |= BDEV_CH_RESET_IN_PROGRESS;

		_spdk_bdev_abort_queued_io(&module_ch->nomem_io, qos_channel);
		_spdk_bdev_abort_queued_io(&bdev->qos->queued, qos_channel);
		_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, qos_channel);
		_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, qos_channel);
	}
}

static void
_spdk_bdev_start_reset(void *ctx)
{
	struct spdk_bdev_channel *ch = ctx;

	spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), _spdk_bdev_reset_freeze_channel,
			      ch, _spdk_bdev_reset_dev);
}

static void
_spdk_bdev_channel_start_reset(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev *bdev = ch->bdev;

	assert(!TAILQ_EMPTY(&ch->queued_resets));

	pthread_mutex_lock(&bdev->mutex);
	if (bdev->reset_in_progress == NULL) {
		bdev->reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
		/*
		 * Take a channel reference for the target bdev for the life of this
		 * reset.  This guards against the channel getting destroyed while
		 * spdk_for_each_channel() calls related to this reset IO are in
		 * progress.  We will release the reference when this reset is
		 * completed.
		 */
		bdev->reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
		_spdk_bdev_start_reset(ch);
	}
	pthread_mutex_unlock(&bdev->mutex);
}

int
spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during reset\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
	bdev_io->u.reset.ch_ref = NULL;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	pthread_mutex_lock(&bdev->mutex);
	TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, link);
	pthread_mutex_unlock(&bdev->mutex);

	_spdk_bdev_channel_start_reset(channel);

	/*
	 * The QoS bdev channel has no spdk_io_channel of its own, so it is not visited by
	 * spdk_for_each_channel(); freeze it explicitly here.
	 */
	if (bdev->qos && bdev->qos->thread) {
		spdk_thread_send_msg(bdev->qos->thread,
				     _spdk_bdev_reset_freeze_qos_channel, bdev);
	}

	return 0;
}
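/*
 * Reset ordering, as implemented above: only one reset per bdev is in
 * progress at a time (reset_in_progress).  The winning reset first freezes
 * every channel via spdk_for_each_channel(), setting
 * BDEV_CH_RESET_IN_PROGRESS and failing any of that channel's I/O still
 * waiting for buffers or parked on nomem_io, and only then is the reset
 * itself handed to the module through spdk_bdev_io_submit_reset() in
 * _spdk_bdev_reset_dev().
 */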

int
spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_admin_passthru\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;
	bdev_io->u.nvme_passthru.md_buf = NULL;
	bdev_io->u.nvme_passthru.md_len = 0;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			   const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
			   spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		/*
		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
		 * to easily determine if the command is a read or write, but for now just
		 * do not allow io_passthru with a read-only descriptor.
		 */
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_io_passthru\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;
	bdev_io->u.nvme_passthru.md_buf = NULL;
	bdev_io->u.nvme_passthru.md_len = 0;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
			      void *md_buf, size_t md_len,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		/*
		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
		 * to easily determine if the command is a read or write, but for now just
		 * do not allow io_passthru with a read-only descriptor.
		 */
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_io_passthru_md\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;
	bdev_io->u.nvme_passthru.md_buf = md_buf;
	bdev_io->u.nvme_passthru.md_len = md_len;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io is NULL\n");
		return -1;
	}

	if (bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING) {
		SPDK_ERRLOG("bdev_io is in pending state\n");
		assert(false);
		return -1;
	}

	spdk_bdev_put_io(bdev_io);

	return 0;
}

static void
_spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev *bdev = bdev_ch->bdev;
	struct spdk_bdev_module_channel *module_ch = bdev_ch->module_ch;
	struct spdk_bdev_io *bdev_io;

	if (module_ch->io_outstanding > module_ch->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller.  Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&module_ch->nomem_io)) {
		bdev_io = TAILQ_FIRST(&module_ch->nomem_io);
		TAILQ_REMOVE(&module_ch->nomem_io, bdev_io, link);
		bdev_io->ch->io_outstanding++;
		module_ch->io_outstanding++;
		bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev->fn_table->submit_request(bdev_io->ch->channel, bdev_io);
		if (bdev_io->status == SPDK_BDEV_IO_STATUS_NOMEM) {
			break;
		}
	}
}
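
/*
 * Final completion path for a bdev_io: update per-channel statistics and hand
 * the result to the user's completion callback.  If the completion is raised
 * while the submit path is still unwinding, or the I/O was routed through the
 * QoS channel, the completion is deferred via a thread message so that the
 * callback always runs on the thread that owns the I/O's channel.
 */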
static inline void
_spdk_bdev_io_complete(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (spdk_unlikely(bdev_io->in_submit_request || bdev_io->io_submit_ch)) {
		/*
		 * Send the completion to the thread that originally submitted the I/O,
		 * which may not be the current thread in the case of QoS.
		 */
		if (bdev_io->io_submit_ch) {
			bdev_io->ch = bdev_io->io_submit_ch;
			bdev_io->io_submit_ch = NULL;
		}

		/*
		 * Defer completion to avoid potential infinite recursion if the
		 * user's completion callback issues a new I/O.
		 */
		spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->ch->channel),
				     _spdk_bdev_io_complete, bdev_io);
		return;
	}

	if (bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_READ:
			bdev_io->ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
			bdev_io->ch->stat.num_read_ops++;
			bdev_io->ch->stat.read_latency_ticks += (spdk_get_ticks() - bdev_io->submit_tsc);
			break;
		case SPDK_BDEV_IO_TYPE_WRITE:
			bdev_io->ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
			bdev_io->ch->stat.num_write_ops++;
			bdev_io->ch->stat.write_latency_ticks += (spdk_get_ticks() - bdev_io->submit_tsc);
			break;
		default:
			break;
		}
	}

#ifdef SPDK_CONFIG_VTUNE
	uint64_t now_tsc = spdk_get_ticks();
	if (now_tsc > (bdev_io->ch->start_tsc + bdev_io->ch->interval_tsc)) {
		uint64_t data[5];

		data[0] = bdev_io->ch->stat.num_read_ops;
		data[1] = bdev_io->ch->stat.bytes_read;
		data[2] = bdev_io->ch->stat.num_write_ops;
		data[3] = bdev_io->ch->stat.bytes_written;
		data[4] = bdev_io->bdev->fn_table->get_spin_time ?
			  bdev_io->bdev->fn_table->get_spin_time(bdev_io->ch->channel) : 0;

		__itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->ch->handle,
				   __itt_metadata_u64, 5, data);

		memset(&bdev_io->ch->stat, 0, sizeof(bdev_io->ch->stat));
		bdev_io->ch->start_tsc = now_tsc;
	}
#endif

	assert(bdev_io->cb != NULL);
	assert(spdk_get_thread() == spdk_io_channel_get_thread(bdev_io->ch->channel));

	bdev_io->cb(bdev_io, bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS,
		    bdev_io->caller_ctx);
}

static void
_spdk_bdev_unfreeze_qos_channel(void *ctx)
{
	struct spdk_bdev *bdev = ctx;

	if (bdev->qos->ch) {
		bdev->qos->ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS;
		assert(TAILQ_EMPTY(&bdev->qos->ch->queued_resets));
	}
}

static void
_spdk_bdev_reset_complete(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i);

	if (bdev_io->u.reset.ch_ref != NULL) {
		spdk_put_io_channel(bdev_io->u.reset.ch_ref);
		bdev_io->u.reset.ch_ref = NULL;
	}

	_spdk_bdev_io_complete(bdev_io);
}

static void
_spdk_bdev_unfreeze_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);

	ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS;
	if (!TAILQ_EMPTY(&ch->queued_resets)) {
		_spdk_bdev_channel_start_reset(ch);
	}

	spdk_for_each_channel_continue(i, 0);
}
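
/*
 * Called by bdev modules to report the final status of an I/O.  Resets are
 * special-cased: completing the in-progress reset unfreezes every channel of
 * the bdev.  For all other I/O types, a NOMEM status re-queues the I/O on the
 * module channel's nomem_io list instead of completing it.
 */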
void
spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
	struct spdk_bdev_module_channel *module_ch = bdev_ch->module_ch;

	bdev_io->status = status;

	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
		bool unlock_channels = false;

		if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
			SPDK_ERRLOG("NOMEM returned for reset\n");
		}
		pthread_mutex_lock(&bdev->mutex);
		if (bdev_io == bdev->reset_in_progress) {
			bdev->reset_in_progress = NULL;
			unlock_channels = true;
		}
		pthread_mutex_unlock(&bdev->mutex);

		if (unlock_channels) {
			/* Explicitly handle the QoS bdev channel, since it has no associated IO channel */
			if (bdev->qos && bdev->qos->thread) {
				spdk_thread_send_msg(bdev->qos->thread,
						     _spdk_bdev_unfreeze_qos_channel, bdev);
			}

			spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_unfreeze_channel,
					      bdev_io, _spdk_bdev_reset_complete);
			return;
		}
	} else {
		assert(bdev_ch->io_outstanding > 0);
		assert(module_ch->io_outstanding > 0);
		bdev_ch->io_outstanding--;
		module_ch->io_outstanding--;

		if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) {
			TAILQ_INSERT_HEAD(&module_ch->nomem_io, bdev_io, link);
			/*
			 * Wait for some of the outstanding I/O to complete before we
			 * retry any of the nomem_io.  Normally we will wait for
			 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
			 * depth channels we will instead wait for half to complete.
			 */
			module_ch->nomem_threshold = spdk_max((int64_t)module_ch->io_outstanding / 2,
							      (int64_t)module_ch->io_outstanding - NOMEM_THRESHOLD_COUNT);
			return;
		}

		if (spdk_unlikely(!TAILQ_EMPTY(&module_ch->nomem_io))) {
			_spdk_bdev_ch_retry_io(bdev_ch);
		}
	}

	_spdk_bdev_io_complete(bdev_io);
}

void
spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
				  enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
{
	if (sc == SPDK_SCSI_STATUS_GOOD) {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
		bdev_io->error.scsi.sc = sc;
		bdev_io->error.scsi.sk = sk;
		bdev_io->error.scsi.asc = asc;
		bdev_io->error.scsi.ascq = ascq;
	}

	spdk_bdev_io_complete(bdev_io, bdev_io->status);
}

void
spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
			     int *sc, int *sk, int *asc, int *ascq)
{
	assert(sc != NULL);
	assert(sk != NULL);
	assert(asc != NULL);
	assert(ascq != NULL);

	switch (bdev_io->status) {
	case SPDK_BDEV_IO_STATUS_SUCCESS:
		*sc = SPDK_SCSI_STATUS_GOOD;
		*sk = SPDK_SCSI_SENSE_NO_SENSE;
		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
		break;
	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
		spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
		break;
	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
		*sc = bdev_io->error.scsi.sc;
		*sk = bdev_io->error.scsi.sk;
		*asc = bdev_io->error.scsi.asc;
		*ascq = bdev_io->error.scsi.ascq;
		break;
	default:
		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
		*sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
		break;
	}
}

void
spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc)
{
	if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) {
		bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else {
		bdev_io->error.nvme.sct = sct;
		bdev_io->error.nvme.sc = sc;
		bdev_io->status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
	}

	spdk_bdev_io_complete(bdev_io, bdev_io->status);
}

void
spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc)
{
	assert(sct != NULL);
	assert(sc != NULL);

	if (bdev_io->status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
		*sct = bdev_io->error.nvme.sct;
		*sc = bdev_io->error.nvme.sc;
	} else if (bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		*sct = SPDK_NVME_SCT_GENERIC;
		*sc = SPDK_NVME_SC_SUCCESS;
	} else {
		*sct = SPDK_NVME_SCT_GENERIC;
		*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
	}
}

struct spdk_thread *
spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io)
{
	return spdk_io_channel_get_thread(bdev_io->ch->channel);
}
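
/*
 * Read the per-bdev rate limit from the [QoS] section of the configuration
 * file, if one is present.  Each entry names a bdev and an IOPS limit, e.g.
 * (illustrative - "Malloc0" is just an example bdev name):
 *
 *   [QoS]
 *     Limit_IOPS  Malloc0  20000
 *
 * The limit must be a positive multiple of SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
 * otherwise QoS is left disabled for that bdev.
 */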
static void
_spdk_bdev_qos_config(struct spdk_bdev *bdev)
{
	struct spdk_conf_section *sp = NULL;
	const char *val = NULL;
	uint64_t ios_per_sec = 0;
	int i = 0;

	sp = spdk_conf_find_section(NULL, "QoS");
	if (!sp) {
		return;
	}

	while (true) {
		val = spdk_conf_section_get_nmval(sp, "Limit_IOPS", i, 0);
		if (!val) {
			break;
		}

		if (strcmp(bdev->name, val) != 0) {
			i++;
			continue;
		}

		val = spdk_conf_section_get_nmval(sp, "Limit_IOPS", i, 1);
		if (!val) {
			return;
		}

		ios_per_sec = strtoull(val, NULL, 10);
		if (ios_per_sec > 0) {
			if (ios_per_sec % SPDK_BDEV_QOS_MIN_IOS_PER_SEC) {
				SPDK_ERRLOG("Assigned IOPS %" PRIu64 " on bdev %s is not a multiple of %u\n",
					    ios_per_sec, bdev->name, SPDK_BDEV_QOS_MIN_IOS_PER_SEC);
				SPDK_ERRLOG("Failed to enable QoS on bdev %s\n", bdev->name);
			} else {
				bdev->qos = calloc(1, sizeof(*bdev->qos));
				if (!bdev->qos) {
					SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
					return;
				}
				bdev->qos->rate_limit = ios_per_sec;
				SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS:%" PRIu64 "\n",
					      bdev->name, bdev->qos->rate_limit);
			}
		}

		return;
	}
}

static int
spdk_bdev_init(struct spdk_bdev *bdev)
{
	assert(bdev->module != NULL);

	if (!bdev->name) {
		SPDK_ERRLOG("Bdev name is NULL\n");
		return -EINVAL;
	}

	if (spdk_bdev_get_by_name(bdev->name)) {
		SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name);
		return -EEXIST;
	}

	bdev->status = SPDK_BDEV_STATUS_READY;

	TAILQ_INIT(&bdev->open_descs);

	TAILQ_INIT(&bdev->aliases);

	bdev->reset_in_progress = NULL;

	_spdk_bdev_qos_config(bdev);

	spdk_io_device_register(__bdev_to_io_dev(bdev),
				spdk_bdev_channel_create, spdk_bdev_channel_destroy,
				sizeof(struct spdk_bdev_channel));

	pthread_mutex_init(&bdev->mutex, NULL);
	return 0;
}

static void
spdk_bdev_destroy_cb(void *io_device)
{
	int rc;
	struct spdk_bdev *bdev;
	spdk_bdev_unregister_cb cb_fn;
	void *cb_arg;

	bdev = __bdev_from_io_dev(io_device);
	cb_fn = bdev->unregister_cb;
	cb_arg = bdev->unregister_ctx;

	rc = bdev->fn_table->destruct(bdev->ctxt);
	if (rc < 0) {
		SPDK_ERRLOG("destruct failed\n");
	}
	if (rc <= 0 && cb_fn != NULL) {
		cb_fn(cb_arg, rc);
	}
}

static void
spdk_bdev_fini(struct spdk_bdev *bdev)
{
	pthread_mutex_destroy(&bdev->mutex);

	free(bdev->qos);

	spdk_io_device_unregister(__bdev_to_io_dev(bdev), spdk_bdev_destroy_cb);
}
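
/*
 * Make a newly initialized bdev visible: add it to the global bdev list and
 * give every registered module a chance to examine it (e.g. to build virtual
 * bdevs on top of it or to claim it).
 */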
static void
spdk_bdev_start(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;

	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name);
	TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, link);

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) {
		if (module->examine) {
			module->action_in_progress++;
			module->examine(bdev);
		}
	}
}

int
spdk_bdev_register(struct spdk_bdev *bdev)
{
	int rc = spdk_bdev_init(bdev);

	if (rc == 0) {
		spdk_bdev_start(bdev);
	}

	return rc;
}

static void
spdk_vbdev_remove_base_bdevs(struct spdk_bdev *vbdev)
{
	struct spdk_bdev **bdevs;
	struct spdk_bdev *base;
	size_t i, j, k;
	bool found;

	/* Iterate over base bdevs to remove vbdev from them. */
	for (i = 0; i < vbdev->base_bdevs_cnt; i++) {
		found = false;
		base = vbdev->base_bdevs[i];

		for (j = 0; j < base->vbdevs_cnt; j++) {
			if (base->vbdevs[j] != vbdev) {
				continue;
			}

			for (k = j; k + 1 < base->vbdevs_cnt; k++) {
				base->vbdevs[k] = base->vbdevs[k + 1];
			}

			base->vbdevs_cnt--;
			if (base->vbdevs_cnt > 0) {
				bdevs = realloc(base->vbdevs, base->vbdevs_cnt * sizeof(bdevs[0]));
				/* It would be odd if shrinking a memory block failed. */
				assert(bdevs);
				base->vbdevs = bdevs;
			} else {
				free(base->vbdevs);
				base->vbdevs = NULL;
			}

			found = true;
			break;
		}

		if (!found) {
			SPDK_WARNLOG("Bdev '%s' is not a base bdev of '%s'.\n", base->name, vbdev->name);
		}
	}

	free(vbdev->base_bdevs);
	vbdev->base_bdevs = NULL;
	vbdev->base_bdevs_cnt = 0;
}

static int
spdk_vbdev_set_base_bdevs(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, size_t cnt)
{
	struct spdk_bdev **vbdevs;
	struct spdk_bdev *base;
	size_t i;

	/* Adding base bdevs isn't supported (yet?). */
	assert(vbdev->base_bdevs_cnt == 0);

	vbdev->base_bdevs = malloc(cnt * sizeof(vbdev->base_bdevs[0]));
	if (!vbdev->base_bdevs) {
		SPDK_ERRLOG("%s - malloc() failed\n", vbdev->name);
		return -ENOMEM;
	}

	memcpy(vbdev->base_bdevs, base_bdevs, cnt * sizeof(vbdev->base_bdevs[0]));
	vbdev->base_bdevs_cnt = cnt;

	/* Iterate over base bdevs to add this vbdev to them. */
	for (i = 0; i < cnt; i++) {
		base = vbdev->base_bdevs[i];

		assert(base != NULL);
		assert(base->claim_module != NULL);

		vbdevs = realloc(base->vbdevs, (base->vbdevs_cnt + 1) * sizeof(vbdevs[0]));
		if (!vbdevs) {
			SPDK_ERRLOG("%s - realloc() failed\n", base->name);
			spdk_vbdev_remove_base_bdevs(vbdev);
			return -ENOMEM;
		}

		vbdevs[base->vbdevs_cnt] = vbdev;
		base->vbdevs = vbdevs;
		base->vbdevs_cnt++;
	}

	return 0;
}

int
spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count)
{
	int rc;

	rc = spdk_bdev_init(vbdev);
	if (rc) {
		return rc;
	}

	if (base_bdev_count == 0) {
		spdk_bdev_start(vbdev);
		return 0;
	}

	rc = spdk_vbdev_set_base_bdevs(vbdev, base_bdevs, base_bdev_count);
	if (rc) {
		spdk_bdev_fini(vbdev);
		return rc;
	}

	spdk_bdev_start(vbdev);
	return 0;
}

void
spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno)
{
	if (bdev->unregister_cb != NULL) {
		bdev->unregister_cb(bdev->unregister_ctx, bdeverrno);
	}
}

static void
_remove_notify(void *arg)
{
	struct spdk_bdev_desc *desc = arg;

	desc->remove_cb(desc->remove_ctx);
}

void
spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_desc *desc, *tmp;
	bool do_destruct = true;
	struct spdk_thread *thread;

	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name);

	thread = spdk_get_thread();
	if (!thread) {
		/* The user called this from a non-SPDK thread. */
		cb_fn(cb_arg, -ENOTSUP);
		return;
	}

	pthread_mutex_lock(&bdev->mutex);

	spdk_vbdev_remove_base_bdevs(bdev);

	bdev->status = SPDK_BDEV_STATUS_REMOVING;
	bdev->unregister_cb = cb_fn;
	bdev->unregister_ctx = cb_arg;

	TAILQ_FOREACH_SAFE(desc, &bdev->open_descs, link, tmp) {
		if (desc->remove_cb) {
			do_destruct = false;
			/*
			 * Defer invocation of the remove_cb to a separate message that will
			 * run later on this thread.  This ensures this context unwinds and
			 * we don't recursively unregister this bdev again if the remove_cb
			 * immediately closes its descriptor.
			 */
			spdk_thread_send_msg(thread, _remove_notify, desc);
		}
	}

	if (!do_destruct) {
		pthread_mutex_unlock(&bdev->mutex);
		return;
	}

	TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link);
	pthread_mutex_unlock(&bdev->mutex);

	spdk_bdev_fini(bdev);
}
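
/*
 * Open a descriptor on the bdev.  A write open is refused while a module
 * holds a claim on the bdev; remove_cb (optional) is invoked later if the
 * bdev is unregistered while the descriptor is still open.
 */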
2751 */ 2752 spdk_thread_send_msg(thread, _remove_notify, desc); 2753 } 2754 } 2755 2756 if (!do_destruct) { 2757 pthread_mutex_unlock(&bdev->mutex); 2758 return; 2759 } 2760 2761 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link); 2762 pthread_mutex_unlock(&bdev->mutex); 2763 2764 spdk_bdev_fini(bdev); 2765 } 2766 2767 int 2768 spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb, 2769 void *remove_ctx, struct spdk_bdev_desc **_desc) 2770 { 2771 struct spdk_bdev_desc *desc; 2772 2773 desc = calloc(1, sizeof(*desc)); 2774 if (desc == NULL) { 2775 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 2776 return -ENOMEM; 2777 } 2778 2779 pthread_mutex_lock(&bdev->mutex); 2780 2781 if (write && bdev->claim_module) { 2782 SPDK_INFOLOG(SPDK_LOG_BDEV, "Could not open %s - already claimed\n", bdev->name); 2783 free(desc); 2784 pthread_mutex_unlock(&bdev->mutex); 2785 return -EPERM; 2786 } 2787 2788 TAILQ_INSERT_TAIL(&bdev->open_descs, desc, link); 2789 2790 desc->bdev = bdev; 2791 desc->remove_cb = remove_cb; 2792 desc->remove_ctx = remove_ctx; 2793 desc->write = write; 2794 *_desc = desc; 2795 2796 pthread_mutex_unlock(&bdev->mutex); 2797 2798 return 0; 2799 } 2800 2801 void 2802 spdk_bdev_close(struct spdk_bdev_desc *desc) 2803 { 2804 struct spdk_bdev *bdev = desc->bdev; 2805 bool do_unregister = false; 2806 2807 pthread_mutex_lock(&bdev->mutex); 2808 2809 TAILQ_REMOVE(&bdev->open_descs, desc, link); 2810 free(desc); 2811 2812 if (bdev->status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->open_descs)) { 2813 do_unregister = true; 2814 } 2815 pthread_mutex_unlock(&bdev->mutex); 2816 2817 if (do_unregister == true) { 2818 spdk_bdev_unregister(bdev, bdev->unregister_cb, bdev->unregister_ctx); 2819 } 2820 } 2821 2822 int 2823 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 2824 struct spdk_bdev_module *module) 2825 { 2826 if (bdev->claim_module != NULL) { 2827 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 2828 bdev->claim_module->name); 2829 return -EPERM; 2830 } 2831 2832 if (desc && !desc->write) { 2833 desc->write = true; 2834 } 2835 2836 bdev->claim_module = module; 2837 return 0; 2838 } 2839 2840 void 2841 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 2842 { 2843 assert(bdev->claim_module != NULL); 2844 bdev->claim_module = NULL; 2845 } 2846 2847 struct spdk_bdev * 2848 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 2849 { 2850 return desc->bdev; 2851 } 2852 2853 void 2854 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 2855 { 2856 struct iovec *iovs; 2857 int iovcnt; 2858 2859 if (bdev_io == NULL) { 2860 return; 2861 } 2862 2863 switch (bdev_io->type) { 2864 case SPDK_BDEV_IO_TYPE_READ: 2865 iovs = bdev_io->u.bdev.iovs; 2866 iovcnt = bdev_io->u.bdev.iovcnt; 2867 break; 2868 case SPDK_BDEV_IO_TYPE_WRITE: 2869 iovs = bdev_io->u.bdev.iovs; 2870 iovcnt = bdev_io->u.bdev.iovcnt; 2871 break; 2872 default: 2873 iovs = NULL; 2874 iovcnt = 0; 2875 break; 2876 } 2877 2878 if (iovp) { 2879 *iovp = iovs; 2880 } 2881 if (iovcntp) { 2882 *iovcntp = iovcnt; 2883 } 2884 } 2885 2886 void 2887 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 2888 { 2889 2890 if (spdk_bdev_module_list_find(bdev_module->name)) { 2891 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 2892 assert(false); 2893 } 2894 2895 if (bdev_module->async_init) { 2896 bdev_module->action_in_progress = 1; 2897 } 2898 2899 /* 2900 * Modules with examine 
	 * ready to handle examine callbacks from later modules that will
	 * register physical bdevs.
	 */
	if (bdev_module->examine != NULL) {
		TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, tailq);
	} else {
		TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, tailq);
	}
}

struct spdk_bdev_module *
spdk_bdev_module_list_find(const char *name)
{
	struct spdk_bdev_module *bdev_module;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (strcmp(name, bdev_module->name) == 0) {
			break;
		}
	}

	return bdev_module;
}

static void
spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	uint64_t len;

	if (!success) {
		bdev_io->cb = bdev_io->u.bdev.stored_user_cb;
		_spdk_bdev_io_complete(bdev_io);
		return;
	}

	/*
	 * No need to perform the error checking from write_zeroes_blocks because this
	 * request already passed those checks.
	 */
	len = spdk_min(spdk_bdev_get_block_size(bdev_io->bdev) * bdev_io->u.bdev.split_remaining_num_blocks,
		       ZERO_BUFFER_SIZE);

	bdev_io->u.bdev.offset_blocks = bdev_io->u.bdev.split_current_offset_blocks;
	bdev_io->u.bdev.iov.iov_len = len;
	bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev_io->bdev);
	bdev_io->u.bdev.split_remaining_num_blocks -= bdev_io->u.bdev.num_blocks;
	bdev_io->u.bdev.split_current_offset_blocks += bdev_io->u.bdev.num_blocks;

	/* If this round completes the I/O, change the callback to be the original user callback. */
	if (bdev_io->u.bdev.split_remaining_num_blocks == 0) {
		spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, bdev_io->u.bdev.stored_user_cb);
	} else {
		spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, spdk_bdev_write_zeroes_split);
	}
	spdk_bdev_io_submit(bdev_io);
}

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

static void
_spdk_bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
{
	pthread_mutex_lock(&ctx->bdev->mutex);
	ctx->bdev->qos_mod_in_progress = false;
	pthread_mutex_unlock(&ctx->bdev->mutex);

	ctx->cb_fn(ctx->cb_arg, status);
	free(ctx);
}

static void
_spdk_bdev_disable_qos_done(void *cb_arg)
{
	struct set_qos_limit_ctx *ctx = cb_arg;
	struct spdk_bdev *bdev = ctx->bdev;
	struct spdk_bdev_qos *qos;

	pthread_mutex_lock(&bdev->mutex);
	qos = bdev->qos;
	bdev->qos = NULL;
	pthread_mutex_unlock(&bdev->mutex);

	_spdk_bdev_abort_queued_io(&qos->queued, qos->ch);
	_spdk_bdev_channel_destroy(qos->ch);
	spdk_poller_unregister(&qos->poller);

	free(qos->ch);
	free(qos);

	_spdk_bdev_set_qos_limit_done(ctx, 0);
}

static void
_spdk_bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status)
{
	void *io_device = spdk_io_channel_iter_get_io_device(i);
	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
	struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_thread *thread;

	pthread_mutex_lock(&bdev->mutex);
	thread = bdev->qos->thread;
	pthread_mutex_unlock(&bdev->mutex);

	spdk_thread_send_msg(thread, _spdk_bdev_disable_qos_done, ctx);
}
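
/*
 * Disabling QoS happens in two steps: first every channel of the bdev clears
 * BDEV_CH_QOS_ENABLED (the per-channel message below), then the completion
 * callback forwards control to the QoS thread, which aborts any queued I/O
 * and tears down the QoS channel (_spdk_bdev_disable_qos_done above).
 */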
static void
_spdk_bdev_disable_qos_msg(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);

	bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED;

	spdk_for_each_channel_continue(i, 0);
}

static void
_spdk_bdev_update_qos_limit_iops_msg(void *cb_arg)
{
	struct set_qos_limit_ctx *ctx = cb_arg;
	struct spdk_bdev *bdev = ctx->bdev;

	pthread_mutex_lock(&bdev->mutex);
	spdk_bdev_qos_update_max_ios_per_timeslice(bdev->qos);
	pthread_mutex_unlock(&bdev->mutex);

	_spdk_bdev_set_qos_limit_done(ctx, 0);
}

static void
_spdk_bdev_enable_qos_msg(struct spdk_io_channel_iter *i)
{
	void *io_device = spdk_io_channel_iter_get_io_device(i);
	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);
	int rc;

	pthread_mutex_lock(&bdev->mutex);
	rc = _spdk_bdev_enable_qos(bdev, bdev_ch);
	pthread_mutex_unlock(&bdev->mutex);
	spdk_for_each_channel_continue(i, rc);
}

static void
_spdk_bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status)
{
	struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	_spdk_bdev_set_qos_limit_done(ctx, status);
}

void
spdk_bdev_set_qos_limit_iops(struct spdk_bdev *bdev, uint64_t ios_per_sec,
			     void (*cb_fn)(void *cb_arg, int status), void *cb_arg)
{
	struct set_qos_limit_ctx *ctx;

	if (ios_per_sec > 0 && ios_per_sec % SPDK_BDEV_QOS_MIN_IOS_PER_SEC) {
		SPDK_ERRLOG("Requested ios_per_sec limit %" PRIu64 " is not a multiple of %u\n",
			    ios_per_sec, SPDK_BDEV_QOS_MIN_IOS_PER_SEC);
		cb_fn(cb_arg, -EINVAL);
		return;
	}

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->bdev = bdev;

	pthread_mutex_lock(&bdev->mutex);
	if (bdev->qos_mod_in_progress) {
		pthread_mutex_unlock(&bdev->mutex);
		free(ctx);
		cb_fn(cb_arg, -EAGAIN);
		return;
	}
	bdev->qos_mod_in_progress = true;

	if (ios_per_sec > 0) {
		if (bdev->qos == NULL) {
			/* Enabling */
			bdev->qos = calloc(1, sizeof(*bdev->qos));
			if (!bdev->qos) {
				pthread_mutex_unlock(&bdev->mutex);
				SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
				free(ctx);
				cb_fn(cb_arg, -ENOMEM);
				return;
			}

			bdev->qos->rate_limit = ios_per_sec;
			spdk_for_each_channel(__bdev_to_io_dev(bdev),
					      _spdk_bdev_enable_qos_msg, ctx,
					      _spdk_bdev_enable_qos_done);
		} else {
			/* Updating */
			bdev->qos->rate_limit = ios_per_sec;
			spdk_thread_send_msg(bdev->qos->thread, _spdk_bdev_update_qos_limit_iops_msg, ctx);
		}
	} else {
		if (bdev->qos != NULL) {
			/* Disabling */
			spdk_for_each_channel(__bdev_to_io_dev(bdev),
					      _spdk_bdev_disable_qos_msg, ctx,
					      _spdk_bdev_disable_qos_msg_done);
		} else {
			pthread_mutex_unlock(&bdev->mutex);
			_spdk_bdev_set_qos_limit_done(ctx, 0);
			return;
		}
	}

	pthread_mutex_unlock(&bdev->mutex);
}

SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV)