/*-
 *   BSD LICENSE
 *
 *   Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"
#include "spdk/conf.h"

#include "spdk/env.h"
#include "spdk/event.h"
#include "spdk/io_channel.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/util.h"

#include "spdk_internal/bdev.h"
#include "spdk_internal/log.h"
#include "spdk/string.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE (64 * 1024)
#define SPDK_BDEV_IO_CACHE_SIZE 256
#define BUF_SMALL_POOL_SIZE 8192
#define BUF_LARGE_POOL_SIZE 1024
#define NOMEM_THRESHOLD_COUNT 8
#define ZERO_BUFFER_SIZE 0x100000
#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000
#define SPDK_BDEV_SEC_TO_USEC 1000000ULL
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 10000

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	struct spdk_mempool *buf_small_pool;
	struct spdk_mempool *buf_large_pool;

	void *zero_buffer;

	TAILQ_HEAD(, spdk_bdev_module) bdev_modules;

	TAILQ_HEAD(, spdk_bdev) bdevs;

	bool init_complete;
	bool module_init_complete;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.init_complete = false,
	.module_init_complete = false,
};

static spdk_bdev_init_cb g_init_cb_fn = NULL;
static void *g_init_cb_arg = NULL;

static spdk_bdev_fini_cb g_fini_cb_fn = NULL;
static void *g_fini_cb_arg = NULL;
static struct spdk_thread *g_fini_thread = NULL;

struct spdk_bdev_qos {
	/** Rate limit, in I/O per second */
	uint64_t rate_limit;

	/** The channel that all I/O are funneled through */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Maximum allowed IOs to be issued in one timeslice (e.g., 1ms) and
	 *  only valid for the master channel, which manages the outstanding IOs. */
	uint64_t max_ios_per_timeslice;

	/** Submitted IO in one timeslice (e.g., 1ms) */
	uint64_t io_submitted_this_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	bdev_io_stailq_t need_buf_small;
	bdev_io_stailq_t need_buf_large;

	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
};

/*
 * Per-module (or per-io_device) data.  Multiple bdevs built on the same io_device
 * queue their I/O awaiting retry here.  This makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS (1 << 0)
#define BDEV_CH_QOS_ENABLED (1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat stat;

	/*
	 * Count of I/O submitted through this channel and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	bdev_io_tailq_t queued_resets;

	uint32_t flags;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t start_tsc;
	uint64_t interval_tsc;
	__itt_string_handle *handle;
#endif

};

struct spdk_bdev_desc {
	struct spdk_bdev *bdev;
	spdk_bdev_remove_cb_t remove_cb;
	void *remove_ctx;
	bool write;
	TAILQ_ENTRY(spdk_bdev_desc) link;
};

#define __bdev_to_io_dev(bdev) (((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1))

static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, link);
	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->claim_module == NULL) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, link));

	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_alias *tmp;
	struct spdk_bdev *bdev = spdk_bdev_first();

	while (bdev != NULL) {
		if (strcmp(bdev_name, bdev->name) == 0) {
			return bdev;
		}

		TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
			if (strcmp(bdev_name, tmp->alias) == 0) {
				return bdev;
			}
		}

		bdev = spdk_bdev_next(bdev);
	}

	return NULL;
}

static void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	assert(bdev_io->get_buf_cb != NULL);
	assert(buf != NULL);
	assert(bdev_io->u.bdev.iovs != NULL);

	bdev_io->buf = buf;
	bdev_io->u.bdev.iovs[0].iov_base = (void *)((unsigned long)((char *)buf + 512) & ~511UL);
	bdev_io->u.bdev.iovs[0].iov_len = bdev_io->buf_len;
	bdev_io->get_buf_cb(bdev_io->ch->channel, bdev_io);
}

static void
spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_mempool *pool;
	struct spdk_bdev_io *tmp;
	void *buf;
	bdev_io_stailq_t *stailq;
	struct spdk_bdev_mgmt_channel *ch;

	assert(bdev_io->u.bdev.iovcnt == 1);

	buf = bdev_io->buf;
	ch = bdev_io->ch->shared_resource->mgmt_ch;

	if (bdev_io->buf_len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &ch->need_buf_large;
	}

	if (STAILQ_EMPTY(stailq)) {
		spdk_mempool_put(pool, buf);
	} else {
		tmp = STAILQ_FIRST(stailq);
		STAILQ_REMOVE_HEAD(stailq, buf_link);
		spdk_bdev_io_set_buf(tmp, buf);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_mempool *pool;
	bdev_io_stailq_t *stailq;
	void *buf = NULL;
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	assert(cb != NULL);
	assert(bdev_io->u.bdev.iovs != NULL);

	if (spdk_unlikely(bdev_io->u.bdev.iovs[0].iov_base != NULL)) {
		/* Buffer already present */
		cb(bdev_io->ch->channel, bdev_io);
		return;
	}

	assert(len <= SPDK_BDEV_LARGE_BUF_MAX_SIZE);
	mgmt_ch = bdev_io->ch->shared_resource->mgmt_ch;

	bdev_io->buf_len = len;
	bdev_io->get_buf_cb = cb;
	if (len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &mgmt_ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &mgmt_ch->need_buf_large;
	}

	buf = spdk_mempool_get(pool);

	if (!buf) {
		STAILQ_INSERT_TAIL(stailq, bdev_io, buf_link);
	} else {
		spdk_bdev_io_set_buf(bdev_io, buf);
	}
}

static int
spdk_bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

void
spdk_bdev_config_text(FILE *fp)
{
	struct spdk_bdev_module *bdev_module;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->config_text) {
			bdev_module->config_text(fp);
		}
	}
}

void
spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_module *bdev_module;
	struct spdk_bdev *bdev;

	assert(w != NULL);

	spdk_json_write_array_begin(w);

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->config_json) {
			bdev_module->config_json(w);
		}
	}

	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, link) {
		spdk_bdev_config_json(bdev, w);
	}

	spdk_json_write_array_end(w);
}

static int
spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;

	STAILQ_INIT(&ch->need_buf_small);
	STAILQ_INIT(&ch->need_buf_large);

	STAILQ_INIT(&ch->per_thread_cache);
	ch->per_thread_cache_count = 0;

	TAILQ_INIT(&ch->shared_resources);

	return 0;
}

static void
spdk_bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;

	if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
		SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n");
	}

	if (!TAILQ_EMPTY(&ch->shared_resources)) {
		SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n");
	}

	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, buf_link);
		ch->per_thread_cache_count--;
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}

	assert(ch->per_thread_cache_count == 0);
}

static void
spdk_bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, tailq) {
		if (m->init_complete) {
			m->init_complete();
		}
	}

	cb_fn(cb_arg, rc);
}

static void
spdk_bdev_module_action_complete(void)
{
	struct spdk_bdev_module *m;

	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, tailq) {
		if (m->action_in_progress > 0) {
			return;
		}
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
	 */
	spdk_bdev_init_complete(0);
}

static void
spdk_bdev_module_action_done(struct spdk_bdev_module *module)
{
	assert(module->action_in_progress > 0);
	module->action_in_progress--;
	spdk_bdev_module_action_complete();
}

void
spdk_bdev_module_init_done(struct spdk_bdev_module *module)
{
	spdk_bdev_module_action_done(module);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
{
	spdk_bdev_module_action_done(module);
}

static int
spdk_bdev_modules_init(void)
{
	struct spdk_bdev_module *module;
	int rc = 0;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) {
		rc = module->module_init();
		if (rc != 0) {
			break;
		}
	}

	g_bdev_mgr.module_init_complete = true;
	return rc;
}

void
spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
{
	int cache_size;
	int rc = 0;
	char mempool_name[32];

	assert(cb_fn != NULL);

	g_init_cb_fn = cb_fn;
	g_init_cb_arg = cb_arg;

	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());

	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
				  SPDK_BDEV_IO_POOL_SIZE,
				  sizeof(struct spdk_bdev_io) +
				  spdk_bdev_module_get_max_ctx_size(),
				  0,
				  SPDK_ENV_SOCKET_ID_ANY);

	if (g_bdev_mgr.bdev_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	/**
	 * Ensure no more than half of the total buffers end up in local caches, by
	 * using spdk_env_get_core_count() to determine how many local caches we need
	 * to account for.
	 */
	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());

	g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
				    BUF_SMALL_POOL_SIZE,
				    SPDK_BDEV_SMALL_BUF_MAX_SIZE + 512,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_small_pool) {
		SPDK_ERRLOG("create rbuf small pool failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());

	g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
				    BUF_LARGE_POOL_SIZE,
				    SPDK_BDEV_LARGE_BUF_MAX_SIZE + 512,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_large_pool) {
		SPDK_ERRLOG("create rbuf large pool failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
				 NULL);
	if (!g_bdev_mgr.zero_buffer) {
		SPDK_ERRLOG("create bdev zero buffer failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

#ifdef SPDK_CONFIG_VTUNE
	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
#endif

	spdk_io_device_register(&g_bdev_mgr, spdk_bdev_mgmt_channel_create,
				spdk_bdev_mgmt_channel_destroy,
				sizeof(struct spdk_bdev_mgmt_channel));

	rc = spdk_bdev_modules_init();
	if (rc != 0) {
		SPDK_ERRLOG("bdev modules init failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	spdk_bdev_module_action_complete();
}

static void
spdk_bdev_mgr_unregister_cb(void *io_device)
{
	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;

	if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != SPDK_BDEV_IO_POOL_SIZE) {
		SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
			    SPDK_BDEV_IO_POOL_SIZE);
	}

	if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) {
		SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
			    BUF_SMALL_POOL_SIZE);
		assert(false);
	}

	if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) {
		SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
			    BUF_LARGE_POOL_SIZE);
		assert(false);
	}

	spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
	spdk_mempool_free(g_bdev_mgr.buf_small_pool);
	spdk_mempool_free(g_bdev_mgr.buf_large_pool);
	spdk_dma_free(g_bdev_mgr.zero_buffer);

	cb_fn(g_fini_cb_arg);
	g_fini_cb_fn = NULL;
	g_fini_cb_arg = NULL;
}

static struct spdk_bdev_module *g_resume_bdev_module = NULL;

static void
spdk_bdev_module_finish_iter(void *arg)
{
	struct spdk_bdev_module *bdev_module;

	/* Start iterating from the last touched module */
	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_FIRST(&g_bdev_mgr.bdev_modules);
	} else {
		bdev_module = TAILQ_NEXT(g_resume_bdev_module, tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini) {
			/* Save our place so we can resume later. We must
			 * save the variable here, before calling module_fini()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_finish_done() and re-enter
			 * this function to continue iterating. */
			g_resume_bdev_module = bdev_module;
		}

		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}

		if (bdev_module->async_fini) {
			return;
		}

		bdev_module = TAILQ_NEXT(bdev_module, tailq);
	}

	g_resume_bdev_module = NULL;
	spdk_io_device_unregister(&g_bdev_mgr, spdk_bdev_mgr_unregister_cb);
}

void
spdk_bdev_module_finish_done(void)
{
	if (spdk_get_thread() != g_fini_thread) {
		spdk_thread_send_msg(g_fini_thread, spdk_bdev_module_finish_iter, NULL);
	} else {
		spdk_bdev_module_finish_iter(NULL);
	}
}

static void
_spdk_bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
{
	struct spdk_bdev *bdev = cb_arg;

	if (bdeverrno && bdev) {
		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
			     bdev->name);

		/*
		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
		 * bdev; try to continue by manually removing this bdev from the list and continue
		 * with the next bdev in the list.
		 */
		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link);
	}

	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n");
		/*
		 * Bdev module finish needs to be deferred as we might be in the middle of some context
		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
		 * after returning.
		 */
		spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_module_finish_iter, NULL);
		return;
	}

	/*
	 * Unregister the first bdev in the list.
	 *
	 * spdk_bdev_unregister() will handle the case where the bdev has open descriptors by
	 * calling the remove_cb of the descriptors first.
	 *
	 * Once this bdev and all of its open descriptors have been cleaned up, this function
	 * will be called again via the unregister completion callback to continue the cleanup
	 * process with the next bdev.
	 */
	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name);
	spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev);
}

void
spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
{
	assert(cb_fn != NULL);

	g_fini_thread = spdk_get_thread();

	g_fini_cb_fn = cb_fn;
	g_fini_cb_arg = cb_arg;

	_spdk_bdev_finish_unregister_bdevs_iter(NULL, 0);
}

static struct spdk_bdev_io *
spdk_bdev_get_io(struct spdk_bdev_channel *channel)
{
	struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
	struct spdk_bdev_io *bdev_io;

	if (ch->per_thread_cache_count > 0) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, buf_link);
		ch->per_thread_cache_count--;
	} else {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		if (!bdev_io) {
			SPDK_ERRLOG("Unable to get spdk_bdev_io\n");
			return NULL;
		}
	}

	return bdev_io;
}

static void
spdk_bdev_put_io(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_mgmt_channel *ch = bdev_io->ch->shared_resource->mgmt_ch;

	if (bdev_io->buf != NULL) {
		spdk_bdev_io_put_buf(bdev_io);
	}

	if (ch->per_thread_cache_count < SPDK_BDEV_IO_CACHE_SIZE) {
		ch->per_thread_cache_count++;
		STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, buf_link);
	} else {
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}
}

static void
_spdk_bdev_qos_io_submit(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_io *bdev_io = NULL;
	struct spdk_bdev *bdev = ch->bdev;
	struct spdk_bdev_qos *qos = bdev->qos;
	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;

	while (!TAILQ_EMPTY(&qos->queued)) {
		if (qos->io_submitted_this_timeslice < qos->max_ios_per_timeslice) {
			bdev_io = TAILQ_FIRST(&qos->queued);
			TAILQ_REMOVE(&qos->queued, bdev_io, link);
			qos->io_submitted_this_timeslice++;
			ch->io_outstanding++;
			shared_resource->io_outstanding++;
			bdev->fn_table->submit_request(ch->channel, bdev_io);
		} else {
			break;
		}
	}
}

static void
_spdk_bdev_io_submit(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
	struct spdk_io_channel *ch = bdev_ch->channel;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	bdev_io->submit_tsc = spdk_get_ticks();
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
	bdev_io->in_submit_request = true;
	if (spdk_likely(bdev_ch->flags == 0)) {
		if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
			bdev->fn_table->submit_request(ch, bdev_io);
		} else {
			bdev_ch->io_outstanding--;
			shared_resource->io_outstanding--;
			TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, link);
		}
	} else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	} else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
		bdev_ch->io_outstanding--;
		shared_resource->io_outstanding--;
		TAILQ_INSERT_TAIL(&bdev->qos->queued, bdev_io, link);
		_spdk_bdev_qos_io_submit(bdev_ch);
	} else {
		SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
	bdev_io->in_submit_request = false;
}

static void
spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);

	if (bdev_io->ch->flags & BDEV_CH_QOS_ENABLED) {
		bdev_io->io_submit_ch = bdev_io->ch;
		bdev_io->ch = bdev->qos->ch;
		spdk_thread_send_msg(bdev->qos->thread, _spdk_bdev_io_submit, bdev_io);
	} else {
		_spdk_bdev_io_submit(bdev_io);
	}
}

static void
spdk_bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
	struct spdk_io_channel *ch = bdev_ch->channel;

	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);

	bdev_io->in_submit_request = true;
	bdev->fn_table->submit_request(ch, bdev_io);
	bdev_io->in_submit_request = false;
}

static void
spdk_bdev_io_init(struct spdk_bdev_io *bdev_io,
		  struct spdk_bdev *bdev, void *cb_arg,
		  spdk_bdev_io_completion_cb cb)
{
	bdev_io->bdev = bdev;
	bdev_io->caller_ctx = cb_arg;
	bdev_io->cb = cb;
	bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
	bdev_io->in_submit_request = false;
	bdev_io->buf = NULL;
	bdev_io->io_submit_ch = NULL;
}

bool
spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
{
	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
}

int
spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	if (bdev->fn_table->dump_info_json) {
		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
	}

	return 0;
}

void
spdk_bdev_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	assert(bdev != NULL);
	assert(w != NULL);

	if (bdev->fn_table->write_config_json) {
		bdev->fn_table->write_config_json(bdev, w);
	} else {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "name", bdev->name);
		spdk_json_write_object_end(w);
	}
}

static void
spdk_bdev_qos_update_max_ios_per_timeslice(struct spdk_bdev_qos *qos)
{
	uint64_t max_ios_per_timeslice = 0;

	max_ios_per_timeslice = qos->rate_limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC /
				SPDK_BDEV_SEC_TO_USEC;
	qos->max_ios_per_timeslice = spdk_max(max_ios_per_timeslice,
					      SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE);
}

static int
spdk_bdev_channel_poll_qos(void *arg)
{
	struct spdk_bdev_qos *qos = arg;

	/* Reset for next round of rate limiting */
	qos->io_submitted_this_timeslice = 0;

	_spdk_bdev_qos_io_submit(qos->ch);

	return -1;
}
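
/*
 * Worked example (illustrative, with an assumed rate_limit of 20000 I/O per second):
 *
 *   max_ios_per_timeslice = rate_limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_BDEV_SEC_TO_USEC
 *                         = 20000 * 1000 / 1000000
 *                         = 20 I/O per 1 ms timeslice
 *
 * spdk_bdev_channel_poll_qos() resets io_submitted_this_timeslice once per
 * timeslice, so _spdk_bdev_qos_io_submit() would drain at most 20 queued I/O
 * per poll in this example; the spdk_max() clamp guarantees at least
 * SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE (1) even for very low rate limits.
 */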

static int
_spdk_bdev_channel_create(struct spdk_bdev_channel *ch, void *io_device)
{
	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
	struct spdk_io_channel *mgmt_io_ch;
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	struct spdk_bdev_shared_resource *shared_resource;

	ch->bdev = bdev;
	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
	if (!ch->channel) {
		return -1;
	}

	mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr);
	if (!mgmt_io_ch) {
		return -1;
	}

	mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch);
	TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) {
		if (shared_resource->shared_ch == ch->channel) {
			spdk_put_io_channel(mgmt_io_ch);
			shared_resource->ref++;
			break;
		}
	}

	if (shared_resource == NULL) {
		shared_resource = calloc(1, sizeof(*shared_resource));
		if (shared_resource == NULL) {
			spdk_put_io_channel(mgmt_io_ch);
			return -1;
		}

		shared_resource->mgmt_ch = mgmt_ch;
		shared_resource->io_outstanding = 0;
		TAILQ_INIT(&shared_resource->nomem_io);
		shared_resource->nomem_threshold = 0;
		shared_resource->shared_ch = ch->channel;
		shared_resource->ref = 1;
		TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link);
	}

	memset(&ch->stat, 0, sizeof(ch->stat));
	ch->io_outstanding = 0;
	TAILQ_INIT(&ch->queued_resets);
	ch->flags = 0;
	ch->shared_resource = shared_resource;

	return 0;
}

static void
_spdk_bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_shared_resource *shared_resource;

	if (!ch) {
		return;
	}

	if (ch->channel) {
		spdk_put_io_channel(ch->channel);
	}

	assert(ch->io_outstanding == 0);

	shared_resource = ch->shared_resource;
	if (shared_resource) {
		assert(ch->io_outstanding == 0);
		assert(shared_resource->ref > 0);
		shared_resource->ref--;
		if (shared_resource->ref == 0) {
			assert(shared_resource->io_outstanding == 0);
			spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch));
			TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link);
			free(shared_resource);
		}
	}
}

/* Caller must hold bdev->mutex. */
static int
spdk_bdev_qos_channel_create(struct spdk_bdev *bdev)
{
	assert(bdev->qos->ch == NULL);
	assert(bdev->qos->thread == NULL);

	bdev->qos->ch = calloc(1, sizeof(struct spdk_bdev_channel));
	if (!bdev->qos->ch) {
		return -1;
	}

	bdev->qos->thread = spdk_get_thread();
	if (!bdev->qos->thread) {
		free(bdev->qos->ch);
		bdev->qos->ch = NULL;
		return -1;
	}

	if (_spdk_bdev_channel_create(bdev->qos->ch, __bdev_to_io_dev(bdev)) != 0) {
		free(bdev->qos->ch);
		bdev->qos->ch = NULL;
		bdev->qos->thread = NULL;
		return -1;
	}

	TAILQ_INIT(&bdev->qos->queued);

	bdev->qos->ch->flags |= BDEV_CH_QOS_ENABLED;
	spdk_bdev_qos_update_max_ios_per_timeslice(bdev->qos);

	bdev->qos->poller = spdk_poller_register(spdk_bdev_channel_poll_qos,
			    bdev->qos,
			    SPDK_BDEV_QOS_TIMESLICE_IN_USEC);

	return 0;
}

/* Caller must hold bdev->mutex */
static int
_spdk_bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch)
{
	/* Rate limiting on this bdev enabled */
	if (bdev->qos) {
		if (bdev->qos->ch == NULL) {
			if (spdk_bdev_qos_channel_create(bdev) != 0) {
				return -1;
			}
		}
		ch->flags |= BDEV_CH_QOS_ENABLED;
	}

	return 0;
}

static int
spdk_bdev_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
	struct spdk_bdev_channel *ch = ctx_buf;

	if (_spdk_bdev_channel_create(ch, io_device) != 0) {
		_spdk_bdev_channel_destroy_resource(ch);
		return -1;
	}

#ifdef SPDK_CONFIG_VTUNE
	{
		char *name;
		__itt_init_ittlib(NULL, 0);
		name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
		if (!name) {
			_spdk_bdev_channel_destroy_resource(ch);
			return -1;
		}
		ch->handle = __itt_string_handle_create(name);
		free(name);
		ch->start_tsc = spdk_get_ticks();
		ch->interval_tsc = spdk_get_ticks_hz() / 100;
	}
#endif

	pthread_mutex_lock(&bdev->mutex);

	if (_spdk_bdev_enable_qos(bdev, ch)) {
		_spdk_bdev_channel_destroy_resource(ch);
		pthread_mutex_unlock(&bdev->mutex);
		return -1;
	}

	bdev->channel_count++;

	pthread_mutex_unlock(&bdev->mutex);

	return 0;
}

/*
 * Abort I/O that are waiting on a data buffer.  These types of I/O are
 * linked using the spdk_bdev_io buf_link TAILQ_ENTRY.
 */
static void
_spdk_bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch)
{
	bdev_io_stailq_t tmp;
	struct spdk_bdev_io *bdev_io;

	STAILQ_INIT(&tmp);

	while (!STAILQ_EMPTY(queue)) {
		bdev_io = STAILQ_FIRST(queue);
		STAILQ_REMOVE_HEAD(queue, buf_link);
		if (bdev_io->ch == ch) {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		} else {
			STAILQ_INSERT_TAIL(&tmp, bdev_io, buf_link);
		}
	}

	STAILQ_SWAP(&tmp, queue, spdk_bdev_io);
}

/*
 * Abort I/O that are queued waiting for submission.  These types of I/O are
 * linked using the spdk_bdev_io link TAILQ_ENTRY.
 */
static void
_spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_io *bdev_io, *tmp;

	TAILQ_FOREACH_SAFE(bdev_io, queue, link, tmp) {
		if (bdev_io->ch == ch) {
			TAILQ_REMOVE(queue, bdev_io, link);
			/*
			 * spdk_bdev_io_complete() assumes that the completed I/O had
			 * been submitted to the bdev module.  Since in this case it
			 * hadn't, bump io_outstanding to account for the decrement
			 * that spdk_bdev_io_complete() will do.
			 */
			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
				ch->io_outstanding++;
				ch->shared_resource->io_outstanding++;
			}
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}
}

static void
_spdk_bdev_channel_destroy(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;

	mgmt_ch = shared_resource->mgmt_ch;

	_spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
	_spdk_bdev_abort_queued_io(&shared_resource->nomem_io, ch);
	_spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_small, ch);
	_spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_large, ch);

	_spdk_bdev_channel_destroy_resource(ch);
}

static void
spdk_bdev_qos_channel_destroy(void *cb_arg)
{
	struct spdk_bdev_qos *qos = cb_arg;

	_spdk_bdev_channel_destroy(qos->ch);

	spdk_poller_unregister(&qos->poller);

	free(qos->ch);
	free(qos);
}

static int
spdk_bdev_qos_destroy(struct spdk_bdev *bdev)
{
	/*
	 * Cleanly shutting down the QoS poller is tricky, because
	 * during the asynchronous operation the user could open a
	 * new channel, spawning a new QoS poller.
	 *
	 * The strategy is to create a new QoS structure here and swap it
	 * in. The shutdown path then continues to refer to the old one
	 * until it completes and then releases it.
	 */
	struct spdk_bdev_qos *new_qos, *old_qos;

	old_qos = bdev->qos;

	new_qos = calloc(1, sizeof(*new_qos));
	if (!new_qos) {
		SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n");
		return -ENOMEM;
	}

	/* Copy the old QoS data into the newly allocated structure */
	memcpy(new_qos, old_qos, sizeof(*new_qos));

	/* Zero out the key parts of the QoS structure */
	new_qos->ch = NULL;
	new_qos->thread = NULL;
	new_qos->max_ios_per_timeslice = 0;
	new_qos->io_submitted_this_timeslice = 0;
	new_qos->poller = NULL;
	TAILQ_INIT(&new_qos->queued);

	bdev->qos = new_qos;

	spdk_thread_send_msg(old_qos->thread, spdk_bdev_qos_channel_destroy,
			     old_qos);

	/* It is safe to continue with destroying the bdev even though the QoS channel hasn't
	 * been destroyed yet. The destruction path will end up waiting for the final
	 * channel to be put before it releases resources. */

	return 0;
}

static void
spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_channel *ch = ctx_buf;
	struct spdk_bdev *bdev = ch->bdev;

	_spdk_bdev_channel_destroy(ch);

	pthread_mutex_lock(&bdev->mutex);
	bdev->channel_count--;
	if (bdev->channel_count == 0 && bdev->qos && bdev->qos->ch != NULL) {
		if (spdk_bdev_qos_destroy(bdev)) {
			/* There isn't anything we can do to recover from here. Just let the
			 * old QoS poller keep running. The QoS handling won't change
			 * cores when the user allocates a new channel, but it won't break. */
			SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n");
		}
	}
	pthread_mutex_unlock(&bdev->mutex);
}

int
spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
{
	struct spdk_bdev_alias *tmp;

	if (alias == NULL) {
		SPDK_ERRLOG("Empty alias passed\n");
		return -EINVAL;
	}

	if (spdk_bdev_get_by_name(alias)) {
		SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias);
		return -EEXIST;
	}

	tmp = calloc(1, sizeof(*tmp));
	if (tmp == NULL) {
		SPDK_ERRLOG("Unable to allocate alias\n");
		return -ENOMEM;
	}

	tmp->alias = strdup(alias);
	if (tmp->alias == NULL) {
		free(tmp);
		SPDK_ERRLOG("Unable to allocate alias\n");
		return -ENOMEM;
	}

	TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);

	return 0;
}

int
spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
{
	struct spdk_bdev_alias *tmp;

	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (strcmp(alias, tmp->alias) == 0) {
			TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
			free(tmp->alias);
			free(tmp);
			return 0;
		}
	}

	SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exist\n", alias);

	return -ENOENT;
}

struct spdk_io_channel *
spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
{
	return spdk_get_io_channel(__bdev_to_io_dev(desc->bdev));
}

const char *
spdk_bdev_get_name(const struct spdk_bdev *bdev)
{
	return bdev->name;
}

const char *
spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
{
	return bdev->product_name;
}

const struct spdk_bdev_aliases_list *
spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
{
	return &bdev->aliases;
}

uint32_t
spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
{
	return bdev->blocklen;
}

uint64_t
spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
{
	return bdev->blockcnt;
}

uint64_t
spdk_bdev_get_qos_ios_per_sec(struct spdk_bdev *bdev)
{
	uint64_t rate_limit = 0;

	pthread_mutex_lock(&bdev->mutex);
	if (bdev->qos) {
		rate_limit = bdev->qos->rate_limit;
	}
	pthread_mutex_unlock(&bdev->mutex);

	return rate_limit;
}

size_t
spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
{
	/* TODO: push this logic down to the bdev modules */
	if (bdev->need_aligned_buffer) {
		return bdev->blocklen;
	}

	return 1;
}

uint32_t
spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
{
	return bdev->optimal_io_boundary;
}

bool
spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
{
	return bdev->write_cache;
}

const struct spdk_uuid *
spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
{
	return &bdev->uuid;
}

int
spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
{
	int ret;

	pthread_mutex_lock(&bdev->mutex);

	/* bdev has open descriptors */
	if (!TAILQ_EMPTY(&bdev->open_descs) &&
	    bdev->blockcnt > size) {
		ret = -EBUSY;
	} else {
		bdev->blockcnt = size;
		ret = 0;
	}

	pthread_mutex_unlock(&bdev->mutex);

	return ret;
}

/*
 * Convert I/O offset and length from bytes to blocks.
 *
 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size.
 */
static uint64_t
spdk_bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks,
			  uint64_t num_bytes, uint64_t *num_blocks)
{
	uint32_t block_size = bdev->blocklen;

	*offset_blocks = offset_bytes / block_size;
	*num_blocks = num_bytes / block_size;

	return (offset_bytes % block_size) | (num_bytes % block_size);
}
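
/*
 * Worked example (illustrative, assuming a 512-byte block size): a 4096-byte
 * request at byte offset 8192 yields *offset_blocks = 8192 / 512 = 16 and
 * *num_blocks = 4096 / 512 = 8, and the OR of the two remainders is 0, so the
 * conversion succeeds. A 1000-byte length would instead leave a remainder
 * (1000 % 512 = 488), the function would return non-zero, and the byte-based
 * wrappers below would reject the request with -EINVAL.
 */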

static bool
spdk_bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
{
	/* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there
	 * has been an overflow and hence the offset has been wrapped around */
	if (offset_blocks + num_blocks < offset_blocks) {
		return false;
	}

	/* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
	if (offset_blocks + num_blocks > bdev->blockcnt) {
		return false;
	}

	return true;
}

int
spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
	       void *buf, uint64_t offset, uint64_t nbytes,
	       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		      void *buf, uint64_t offset_blocks, uint64_t num_blocks,
		      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during read\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.bdev.iov.iov_base = buf;
	bdev_io->u.bdev.iov.iov_len = num_blocks * bdev->blocklen;
	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		struct iovec *iov, int iovcnt,
		uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       struct iovec *iov, int iovcnt,
		       uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during read\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.bdev.iovs = iov;
	bdev_io->u.bdev.iovcnt = iovcnt;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		void *buf, uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       void *buf, uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during write\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.bdev.iov.iov_base = buf;
	bdev_io->u.bdev.iov.iov_len = num_blocks * bdev->blocklen;
	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		 struct iovec *iov, int iovcnt,
		 uint64_t offset, uint64_t len,
		 spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			struct iovec *iov, int iovcnt,
			uint64_t offset_blocks, uint64_t num_blocks,
			spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during writev\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.bdev.iovs = iov;
	bdev_io->u.bdev.iovcnt = iovcnt;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset, uint64_t len,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      uint64_t offset_blocks, uint64_t num_blocks,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	uint64_t len;
	bool split_request = false;

	if (num_blocks > UINT64_MAX / spdk_bdev_get_block_size(bdev)) {
		SPDK_ERRLOG("length argument out of range in write_zeroes\n");
		return -ERANGE;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);

	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during write_zeroes\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->u.bdev.offset_blocks = offset_blocks;

	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
		bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
		bdev_io->u.bdev.num_blocks = num_blocks;
		bdev_io->u.bdev.iovs = NULL;
		bdev_io->u.bdev.iovcnt = 0;

	} else {
		assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE);

		len = spdk_bdev_get_block_size(bdev) * num_blocks;

		if (len > ZERO_BUFFER_SIZE) {
			split_request = true;
			len = ZERO_BUFFER_SIZE;
		}

		bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
		bdev_io->u.bdev.iov.iov_base = g_bdev_mgr.zero_buffer;
		bdev_io->u.bdev.iov.iov_len = len;
		bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
		bdev_io->u.bdev.iovcnt = 1;
		bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev);
		bdev_io->u.bdev.split_remaining_num_blocks = num_blocks - bdev_io->u.bdev.num_blocks;
		bdev_io->u.bdev.split_current_offset_blocks = offset_blocks + bdev_io->u.bdev.num_blocks;
	}

	if (split_request) {
		bdev_io->u.bdev.stored_user_cb = cb;
		spdk_bdev_io_init(bdev_io, bdev, cb_arg, spdk_bdev_write_zeroes_split);
	} else {
		spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
	}
	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	if (num_blocks == 0) {
		SPDK_ERRLOG("Can't unmap 0 bytes\n");
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during unmap\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
	bdev_io->u.bdev.iov.iov_base = NULL;
	bdev_io->u.bdev.iov.iov_len = 0;
	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		uint64_t offset, uint64_t length,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, length, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during flush\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
	bdev_io->u.bdev.iovs = NULL;
	bdev_io->u.bdev.iovcnt = 0;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

static void
_spdk_bdev_reset_dev(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev_io *bdev_io;

	bdev_io = TAILQ_FIRST(&ch->queued_resets);
	TAILQ_REMOVE(&ch->queued_resets, bdev_io, link);
	spdk_bdev_io_submit_reset(bdev_io);
}

static void
_spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch;
	struct spdk_bdev_channel *channel;
	struct spdk_bdev_mgmt_channel *mgmt_channel;
	struct spdk_bdev_shared_resource *shared_resource;

	ch = spdk_io_channel_iter_get_channel(i);
	channel = spdk_io_channel_get_ctx(ch);
	shared_resource = channel->shared_resource;
	mgmt_channel = shared_resource->mgmt_ch;

	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;

	_spdk_bdev_abort_queued_io(&shared_resource->nomem_io, channel);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel);

	spdk_for_each_channel_continue(i, 0);
}

static void
_spdk_bdev_reset_freeze_qos_channel(void *ctx)
{
	struct spdk_bdev *bdev = ctx;
	struct spdk_bdev_mgmt_channel *mgmt_channel = NULL;
	struct spdk_bdev_channel *qos_channel = bdev->qos->ch;
	struct spdk_bdev_shared_resource *shared_resource = NULL;

	if (qos_channel) {
		shared_resource = qos_channel->shared_resource;
		mgmt_channel = shared_resource->mgmt_ch;

		qos_channel->flags |= BDEV_CH_RESET_IN_PROGRESS;

		_spdk_bdev_abort_queued_io(&shared_resource->nomem_io, qos_channel);
		_spdk_bdev_abort_queued_io(&bdev->qos->queued, qos_channel);
		_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, qos_channel);
		_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, qos_channel);
	}
}

static void
_spdk_bdev_start_reset(void *ctx)
{
	struct spdk_bdev_channel *ch = ctx;

	spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), _spdk_bdev_reset_freeze_channel,
			      ch, _spdk_bdev_reset_dev);
}

static void
_spdk_bdev_channel_start_reset(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev *bdev = ch->bdev;

	assert(!TAILQ_EMPTY(&ch->queued_resets));

	pthread_mutex_lock(&bdev->mutex);
	if (bdev->reset_in_progress == NULL) {
		bdev->reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
		/*
		 * Take a channel reference for the target bdev for the life of this
		 * reset.  This guards against the channel getting destroyed while
		 * spdk_for_each_channel() calls related to this reset IO are in
		 * progress.  We will release the reference when this reset is
		 * completed.
1973 */ 1974 bdev->reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 1975 _spdk_bdev_start_reset(ch); 1976 } 1977 pthread_mutex_unlock(&bdev->mutex); 1978 } 1979 1980 int 1981 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1982 spdk_bdev_io_completion_cb cb, void *cb_arg) 1983 { 1984 struct spdk_bdev *bdev = desc->bdev; 1985 struct spdk_bdev_io *bdev_io; 1986 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 1987 1988 bdev_io = spdk_bdev_get_io(channel); 1989 if (!bdev_io) { 1990 SPDK_ERRLOG("bdev_io memory allocation failed during reset\n"); 1991 return -ENOMEM; 1992 } 1993 1994 bdev_io->ch = channel; 1995 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 1996 bdev_io->u.reset.ch_ref = NULL; 1997 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 1998 1999 pthread_mutex_lock(&bdev->mutex); 2000 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, link); 2001 pthread_mutex_unlock(&bdev->mutex); 2002 2003 _spdk_bdev_channel_start_reset(channel); 2004 2005 /* Explicitly handle the QoS bdev channel as no IO channel associated */ 2006 if (bdev->qos && bdev->qos->thread) { 2007 spdk_thread_send_msg(bdev->qos->thread, 2008 _spdk_bdev_reset_freeze_qos_channel, bdev); 2009 } 2010 2011 return 0; 2012 } 2013 2014 void 2015 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 2016 struct spdk_bdev_io_stat *stat) 2017 { 2018 #ifdef SPDK_CONFIG_VTUNE 2019 SPDK_ERRLOG("Calling spdk_bdev_get_io_stat is not allowed when VTune integration is enabled.\n"); 2020 memset(stat, 0, sizeof(*stat)); 2021 return; 2022 #endif 2023 2024 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2025 2026 channel->stat.ticks_rate = spdk_get_ticks_hz(); 2027 *stat = channel->stat; 2028 memset(&channel->stat, 0, sizeof(channel->stat)); 2029 } 2030 2031 int 2032 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2033 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 2034 spdk_bdev_io_completion_cb cb, void *cb_arg) 2035 { 2036 struct spdk_bdev *bdev = desc->bdev; 2037 struct spdk_bdev_io *bdev_io; 2038 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2039 2040 if (!desc->write) { 2041 return -EBADF; 2042 } 2043 2044 bdev_io = spdk_bdev_get_io(channel); 2045 if (!bdev_io) { 2046 SPDK_ERRLOG("bdev_io memory allocation failed during nvme_admin_passthru\n"); 2047 return -ENOMEM; 2048 } 2049 2050 bdev_io->ch = channel; 2051 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 2052 bdev_io->u.nvme_passthru.cmd = *cmd; 2053 bdev_io->u.nvme_passthru.buf = buf; 2054 bdev_io->u.nvme_passthru.nbytes = nbytes; 2055 bdev_io->u.nvme_passthru.md_buf = NULL; 2056 bdev_io->u.nvme_passthru.md_len = 0; 2057 2058 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2059 2060 spdk_bdev_io_submit(bdev_io); 2061 return 0; 2062 } 2063 2064 int 2065 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2066 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 2067 spdk_bdev_io_completion_cb cb, void *cb_arg) 2068 { 2069 struct spdk_bdev *bdev = desc->bdev; 2070 struct spdk_bdev_io *bdev_io; 2071 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2072 2073 if (!desc->write) { 2074 /* 2075 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 2076 * to easily determine if the command is a read or write, but for now just 2077 * do not allow io_passthru with a read-only descriptor.
*/ 2079 return -EBADF; 2080 } 2081 2082 bdev_io = spdk_bdev_get_io(channel); 2083 if (!bdev_io) { 2084 SPDK_ERRLOG("bdev_io memory allocation failed during nvme_io_passthru\n"); 2085 return -ENOMEM; 2086 } 2087 2088 bdev_io->ch = channel; 2089 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 2090 bdev_io->u.nvme_passthru.cmd = *cmd; 2091 bdev_io->u.nvme_passthru.buf = buf; 2092 bdev_io->u.nvme_passthru.nbytes = nbytes; 2093 bdev_io->u.nvme_passthru.md_buf = NULL; 2094 bdev_io->u.nvme_passthru.md_len = 0; 2095 2096 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2097 2098 spdk_bdev_io_submit(bdev_io); 2099 return 0; 2100 } 2101 2102 int 2103 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2104 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 2105 spdk_bdev_io_completion_cb cb, void *cb_arg) 2106 { 2107 struct spdk_bdev *bdev = desc->bdev; 2108 struct spdk_bdev_io *bdev_io; 2109 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2110 2111 if (!desc->write) { 2112 /* 2113 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 2114 * to easily determine if the command is a read or write, but for now just 2115 * do not allow io_passthru with a read-only descriptor. 2116 */ 2117 return -EBADF; 2118 } 2119 2120 bdev_io = spdk_bdev_get_io(channel); 2121 if (!bdev_io) { 2122 SPDK_ERRLOG("bdev_io memory allocation failed during nvme_io_passthru_md\n"); 2123 return -ENOMEM; 2124 } 2125 2126 bdev_io->ch = channel; 2127 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 2128 bdev_io->u.nvme_passthru.cmd = *cmd; 2129 bdev_io->u.nvme_passthru.buf = buf; 2130 bdev_io->u.nvme_passthru.nbytes = nbytes; 2131 bdev_io->u.nvme_passthru.md_buf = md_buf; 2132 bdev_io->u.nvme_passthru.md_len = md_len; 2133 2134 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2135 2136 spdk_bdev_io_submit(bdev_io); 2137 return 0; 2138 } 2139 2140 int 2141 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2142 { 2143 if (!bdev_io) { 2144 SPDK_ERRLOG("bdev_io is NULL\n"); 2145 return -1; 2146 } 2147 2148 if (bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING) { 2149 SPDK_ERRLOG("bdev_io is in pending state\n"); 2150 assert(false); 2151 return -1; 2152 } 2153 2154 spdk_bdev_put_io(bdev_io); 2155 2156 return 0; 2157 } 2158 2159 static void 2160 _spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 2161 { 2162 struct spdk_bdev *bdev = bdev_ch->bdev; 2163 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2164 struct spdk_bdev_io *bdev_io; 2165 2166 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 2167 /* 2168 * Allow some more I/O to complete before retrying the nomem_io queue. 2169 * Some drivers (such as nvme) cannot immediately take a new I/O in 2170 * the context of a completion, because the resources for the I/O are 2171 * not released until control returns to the bdev poller. Also, we 2172 * may require several small I/O to complete before a larger I/O 2173 * (that requires splitting) can be submitted.
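 * Retrying before io_outstanding has dropped below nomem_threshold would most likely just produce another NOMEM status, so wait for more completions first.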
2174 */ 2175 return; 2176 } 2177 2178 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 2179 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 2180 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, link); 2181 bdev_io->ch->io_outstanding++; 2182 shared_resource->io_outstanding++; 2183 bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING; 2184 bdev->fn_table->submit_request(bdev_io->ch->channel, bdev_io); 2185 if (bdev_io->status == SPDK_BDEV_IO_STATUS_NOMEM) { 2186 break; 2187 } 2188 } 2189 } 2190 2191 static inline void 2192 _spdk_bdev_io_complete(void *ctx) 2193 { 2194 struct spdk_bdev_io *bdev_io = ctx; 2195 2196 if (spdk_unlikely(bdev_io->in_submit_request || bdev_io->io_submit_ch)) { 2197 /* 2198 * Send the completion to the thread that originally submitted the I/O, 2199 * which may not be the current thread in the case of QoS. 2200 */ 2201 if (bdev_io->io_submit_ch) { 2202 bdev_io->ch = bdev_io->io_submit_ch; 2203 bdev_io->io_submit_ch = NULL; 2204 } 2205 2206 /* 2207 * Defer completion to avoid potential infinite recursion if the 2208 * user's completion callback issues a new I/O. 2209 */ 2210 spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->ch->channel), 2211 _spdk_bdev_io_complete, bdev_io); 2212 return; 2213 } 2214 2215 if (bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) { 2216 switch (bdev_io->type) { 2217 case SPDK_BDEV_IO_TYPE_READ: 2218 bdev_io->ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 2219 bdev_io->ch->stat.num_read_ops++; 2220 bdev_io->ch->stat.read_latency_ticks += (spdk_get_ticks() - bdev_io->submit_tsc); 2221 break; 2222 case SPDK_BDEV_IO_TYPE_WRITE: 2223 bdev_io->ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 2224 bdev_io->ch->stat.num_write_ops++; 2225 bdev_io->ch->stat.write_latency_ticks += (spdk_get_ticks() - bdev_io->submit_tsc); 2226 break; 2227 default: 2228 break; 2229 } 2230 } 2231 2232 #ifdef SPDK_CONFIG_VTUNE 2233 uint64_t now_tsc = spdk_get_ticks(); 2234 if (now_tsc > (bdev_io->ch->start_tsc + bdev_io->ch->interval_tsc)) { 2235 uint64_t data[5]; 2236 2237 data[0] = bdev_io->ch->stat.num_read_ops; 2238 data[1] = bdev_io->ch->stat.bytes_read; 2239 data[2] = bdev_io->ch->stat.num_write_ops; 2240 data[3] = bdev_io->ch->stat.bytes_written; 2241 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
2242 bdev_io->bdev->fn_table->get_spin_time(bdev_io->ch->channel) : 0; 2243 2244 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->ch->handle, 2245 __itt_metadata_u64, 5, data); 2246 2247 memset(&bdev_io->ch->stat, 0, sizeof(bdev_io->ch->stat)); 2248 bdev_io->ch->start_tsc = now_tsc; 2249 } 2250 #endif 2251 2252 assert(bdev_io->cb != NULL); 2253 assert(spdk_get_thread() == spdk_io_channel_get_thread(bdev_io->ch->channel)); 2254 2255 bdev_io->cb(bdev_io, bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS, 2256 bdev_io->caller_ctx); 2257 } 2258 2259 static void 2260 _spdk_bdev_unfreeze_qos_channel(void *ctx) 2261 { 2262 struct spdk_bdev *bdev = ctx; 2263 2264 if (bdev->qos->ch) { 2265 bdev->qos->ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 2266 assert(TAILQ_EMPTY(&bdev->qos->ch->queued_resets)); 2267 } 2268 } 2269 2270 static void 2271 _spdk_bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 2272 { 2273 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 2274 2275 if (bdev_io->u.reset.ch_ref != NULL) { 2276 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 2277 bdev_io->u.reset.ch_ref = NULL; 2278 } 2279 2280 _spdk_bdev_io_complete(bdev_io); 2281 } 2282 2283 static void 2284 _spdk_bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 2285 { 2286 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2287 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 2288 2289 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 2290 if (!TAILQ_EMPTY(&ch->queued_resets)) { 2291 _spdk_bdev_channel_start_reset(ch); 2292 } 2293 2294 spdk_for_each_channel_continue(i, 0); 2295 } 2296 2297 void 2298 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 2299 { 2300 struct spdk_bdev *bdev = bdev_io->bdev; 2301 struct spdk_bdev_channel *bdev_ch = bdev_io->ch; 2302 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2303 2304 bdev_io->status = status; 2305 2306 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 2307 bool unlock_channels = false; 2308 2309 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 2310 SPDK_ERRLOG("NOMEM returned for reset\n"); 2311 } 2312 pthread_mutex_lock(&bdev->mutex); 2313 if (bdev_io == bdev->reset_in_progress) { 2314 bdev->reset_in_progress = NULL; 2315 unlock_channels = true; 2316 } 2317 pthread_mutex_unlock(&bdev->mutex); 2318 2319 if (unlock_channels) { 2320 /* Explicitly handle the QoS bdev channel as no IO channel associated */ 2321 if (bdev->qos && bdev->qos->thread) { 2322 spdk_thread_send_msg(bdev->qos->thread, 2323 _spdk_bdev_unfreeze_qos_channel, bdev); 2324 } 2325 2326 spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_unfreeze_channel, 2327 bdev_io, _spdk_bdev_reset_complete); 2328 return; 2329 } 2330 } else { 2331 assert(bdev_ch->io_outstanding > 0); 2332 assert(shared_resource->io_outstanding > 0); 2333 bdev_ch->io_outstanding--; 2334 shared_resource->io_outstanding--; 2335 2336 if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) { 2337 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, link); 2338 /* 2339 * Wait for some of the outstanding I/O to complete before we 2340 * retry any of the nomem_io. Normally we will wait for 2341 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 2342 * depth channels we will instead wait for half to complete. 
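 * For example, with 100 I/O outstanding the threshold becomes 92 (100 - NOMEM_THRESHOLD_COUNT), while with only 10 outstanding it becomes 5 (half of 10).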
2343 */ 2344 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 2345 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 2346 return; 2347 } 2348 2349 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 2350 _spdk_bdev_ch_retry_io(bdev_ch); 2351 } 2352 } 2353 2354 _spdk_bdev_io_complete(bdev_io); 2355 } 2356 2357 void 2358 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 2359 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 2360 { 2361 if (sc == SPDK_SCSI_STATUS_GOOD) { 2362 bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS; 2363 } else { 2364 bdev_io->status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 2365 bdev_io->error.scsi.sc = sc; 2366 bdev_io->error.scsi.sk = sk; 2367 bdev_io->error.scsi.asc = asc; 2368 bdev_io->error.scsi.ascq = ascq; 2369 } 2370 2371 spdk_bdev_io_complete(bdev_io, bdev_io->status); 2372 } 2373 2374 void 2375 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 2376 int *sc, int *sk, int *asc, int *ascq) 2377 { 2378 assert(sc != NULL); 2379 assert(sk != NULL); 2380 assert(asc != NULL); 2381 assert(ascq != NULL); 2382 2383 switch (bdev_io->status) { 2384 case SPDK_BDEV_IO_STATUS_SUCCESS: 2385 *sc = SPDK_SCSI_STATUS_GOOD; 2386 *sk = SPDK_SCSI_SENSE_NO_SENSE; 2387 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 2388 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 2389 break; 2390 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 2391 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 2392 break; 2393 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 2394 *sc = bdev_io->error.scsi.sc; 2395 *sk = bdev_io->error.scsi.sk; 2396 *asc = bdev_io->error.scsi.asc; 2397 *ascq = bdev_io->error.scsi.ascq; 2398 break; 2399 default: 2400 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 2401 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 2402 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 2403 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 2404 break; 2405 } 2406 } 2407 2408 void 2409 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc) 2410 { 2411 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 2412 bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS; 2413 } else { 2414 bdev_io->error.nvme.sct = sct; 2415 bdev_io->error.nvme.sc = sc; 2416 bdev_io->status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 2417 } 2418 2419 spdk_bdev_io_complete(bdev_io, bdev_io->status); 2420 } 2421 2422 void 2423 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc) 2424 { 2425 assert(sct != NULL); 2426 assert(sc != NULL); 2427 2428 if (bdev_io->status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 2429 *sct = bdev_io->error.nvme.sct; 2430 *sc = bdev_io->error.nvme.sc; 2431 } else if (bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) { 2432 *sct = SPDK_NVME_SCT_GENERIC; 2433 *sc = SPDK_NVME_SC_SUCCESS; 2434 } else { 2435 *sct = SPDK_NVME_SCT_GENERIC; 2436 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2437 } 2438 } 2439 2440 struct spdk_thread * 2441 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 2442 { 2443 return spdk_io_channel_get_thread(bdev_io->ch->channel); 2444 } 2445 2446 static void 2447 _spdk_bdev_qos_config(struct spdk_bdev *bdev) 2448 { 2449 struct spdk_conf_section *sp = NULL; 2450 const char *val = NULL; 2451 uint64_t ios_per_sec = 0; 2452 int i = 0; 2453 2454 sp = spdk_conf_find_section(NULL, "QoS"); 2455 if (!sp) { 2456 return; 2457 } 2458 2459 while (true) { 2460 val = spdk_conf_section_get_nmval(sp, "Limit_IOPS", i, 0); 2461 if (!val) { 2462 break; 2463 } 2464 2465 if (strcmp(bdev->name, 
val) != 0) { 2466 i++; 2467 continue; 2468 } 2469 2470 val = spdk_conf_section_get_nmval(sp, "Limit_IOPS", i, 1); 2471 if (!val) { 2472 return; 2473 } 2474 2475 ios_per_sec = strtoull(val, NULL, 10); 2476 if (ios_per_sec > 0) { 2477 if (ios_per_sec % SPDK_BDEV_QOS_MIN_IOS_PER_SEC) { 2478 SPDK_ERRLOG("Assigned IOPS %" PRIu64 " on bdev %s is not a multiple of %u\n", 2479 ios_per_sec, bdev->name, SPDK_BDEV_QOS_MIN_IOS_PER_SEC); 2480 SPDK_ERRLOG("Failed to enable QoS on bdev %s\n", bdev->name); 2481 } else { 2482 bdev->qos = calloc(1, sizeof(*bdev->qos)); 2483 if (!bdev->qos) { 2484 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 2485 return; 2486 } 2487 bdev->qos->rate_limit = ios_per_sec; 2488 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS:%lu\n", 2489 bdev->name, bdev->qos->rate_limit); 2490 } 2491 } 2492 2493 return; 2494 } 2495 } 2496 2497 static int 2498 spdk_bdev_init(struct spdk_bdev *bdev) 2499 { 2500 assert(bdev->module != NULL); 2501 2502 if (!bdev->name) { 2503 SPDK_ERRLOG("Bdev name is NULL\n"); 2504 return -EINVAL; 2505 } 2506 2507 if (spdk_bdev_get_by_name(bdev->name)) { 2508 SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name); 2509 return -EEXIST; 2510 } 2511 2512 bdev->status = SPDK_BDEV_STATUS_READY; 2513 2514 TAILQ_INIT(&bdev->open_descs); 2515 2516 TAILQ_INIT(&bdev->aliases); 2517 2518 bdev->reset_in_progress = NULL; 2519 2520 _spdk_bdev_qos_config(bdev); 2521 2522 spdk_io_device_register(__bdev_to_io_dev(bdev), 2523 spdk_bdev_channel_create, spdk_bdev_channel_destroy, 2524 sizeof(struct spdk_bdev_channel)); 2525 2526 pthread_mutex_init(&bdev->mutex, NULL); 2527 return 0; 2528 } 2529 2530 static void 2531 spdk_bdev_destroy_cb(void *io_device) 2532 { 2533 int rc; 2534 struct spdk_bdev *bdev; 2535 spdk_bdev_unregister_cb cb_fn; 2536 void *cb_arg; 2537 2538 bdev = __bdev_from_io_dev(io_device); 2539 cb_fn = bdev->unregister_cb; 2540 cb_arg = bdev->unregister_ctx; 2541 2542 rc = bdev->fn_table->destruct(bdev->ctxt); 2543 if (rc < 0) { 2544 SPDK_ERRLOG("destruct failed\n"); 2545 } 2546 if (rc <= 0 && cb_fn != NULL) { 2547 cb_fn(cb_arg, rc); 2548 } 2549 } 2550 2551 2552 static void 2553 spdk_bdev_fini(struct spdk_bdev *bdev) 2554 { 2555 pthread_mutex_destroy(&bdev->mutex); 2556 2557 free(bdev->qos); 2558 2559 spdk_io_device_unregister(__bdev_to_io_dev(bdev), spdk_bdev_destroy_cb); 2560 } 2561 2562 static void 2563 spdk_bdev_start(struct spdk_bdev *bdev) 2564 { 2565 struct spdk_bdev_module *module; 2566 2567 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name); 2568 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, link); 2569 2570 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) { 2571 if (module->examine) { 2572 module->action_in_progress++; 2573 module->examine(bdev); 2574 } 2575 } 2576 } 2577 2578 int 2579 spdk_bdev_register(struct spdk_bdev *bdev) 2580 { 2581 int rc = spdk_bdev_init(bdev); 2582 2583 if (rc == 0) { 2584 spdk_bdev_start(bdev); 2585 } 2586 2587 return rc; 2588 } 2589 2590 static void 2591 spdk_vbdev_remove_base_bdevs(struct spdk_bdev *vbdev) 2592 { 2593 struct spdk_bdev **bdevs; 2594 struct spdk_bdev *base; 2595 size_t i, j, k; 2596 bool found; 2597 2598 /* Iterate over base bdevs to remove vbdev from them.
*/ 2599 for (i = 0; i < vbdev->base_bdevs_cnt; i++) { 2600 found = false; 2601 base = vbdev->base_bdevs[i]; 2602 2603 for (j = 0; j < base->vbdevs_cnt; j++) { 2604 if (base->vbdevs[j] != vbdev) { 2605 continue; 2606 } 2607 2608 for (k = j; k + 1 < base->vbdevs_cnt; k++) { 2609 base->vbdevs[k] = base->vbdevs[k + 1]; 2610 } 2611 2612 base->vbdevs_cnt--; 2613 if (base->vbdevs_cnt > 0) { 2614 bdevs = realloc(base->vbdevs, base->vbdevs_cnt * sizeof(bdevs[0])); 2615 /* It would be odd if shrinking a memory block failed. */ 2616 assert(bdevs); 2617 base->vbdevs = bdevs; 2618 } else { 2619 free(base->vbdevs); 2620 base->vbdevs = NULL; 2621 } 2622 2623 found = true; 2624 break; 2625 } 2626 2627 if (!found) { 2628 SPDK_WARNLOG("Bdev '%s' is not a base bdev of '%s'.\n", base->name, vbdev->name); 2629 } 2630 } 2631 2632 free(vbdev->base_bdevs); 2633 vbdev->base_bdevs = NULL; 2634 vbdev->base_bdevs_cnt = 0; 2635 } 2636 2637 static int 2638 spdk_vbdev_set_base_bdevs(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, size_t cnt) 2639 { 2640 struct spdk_bdev **vbdevs; 2641 struct spdk_bdev *base; 2642 size_t i; 2643 2644 /* Adding base bdevs isn't supported (yet?). */ 2645 assert(vbdev->base_bdevs_cnt == 0); 2646 2647 vbdev->base_bdevs = malloc(cnt * sizeof(vbdev->base_bdevs[0])); 2648 if (!vbdev->base_bdevs) { 2649 SPDK_ERRLOG("%s - malloc() failed\n", vbdev->name); 2650 return -ENOMEM; 2651 } 2652 2653 memcpy(vbdev->base_bdevs, base_bdevs, cnt * sizeof(vbdev->base_bdevs[0])); 2654 vbdev->base_bdevs_cnt = cnt; 2655 2656 /* Iterate over base bdevs to add this vbdev to them. */ 2657 for (i = 0; i < cnt; i++) { 2658 base = vbdev->base_bdevs[i]; 2659 2660 assert(base != NULL); 2661 assert(base->claim_module != NULL); 2662 2663 vbdevs = realloc(base->vbdevs, (base->vbdevs_cnt + 1) * sizeof(vbdevs[0])); 2664 if (!vbdevs) { 2665 SPDK_ERRLOG("%s - realloc() failed\n", base->name); 2666 spdk_vbdev_remove_base_bdevs(vbdev); 2667 return -ENOMEM; 2668 } 2669 2670 vbdevs[base->vbdevs_cnt] = vbdev; 2671 base->vbdevs = vbdevs; 2672 base->vbdevs_cnt++; 2673 } 2674 2675 return 0; 2676 } 2677 2678 int 2679 spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count) 2680 { 2681 int rc; 2682 2683 rc = spdk_bdev_init(vbdev); 2684 if (rc) { 2685 return rc; 2686 } 2687 2688 if (base_bdev_count == 0) { 2689 spdk_bdev_start(vbdev); 2690 return 0; 2691 } 2692 2693 rc = spdk_vbdev_set_base_bdevs(vbdev, base_bdevs, base_bdev_count); 2694 if (rc) { 2695 spdk_bdev_fini(vbdev); 2696 return rc; 2697 } 2698 2699 spdk_bdev_start(vbdev); 2700 return 0; 2701 2702 } 2703 2704 void 2705 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 2706 { 2707 if (bdev->unregister_cb != NULL) { 2708 bdev->unregister_cb(bdev->unregister_ctx, bdeverrno); 2709 } 2710 } 2711 2712 static void 2713 _remove_notify(void *arg) 2714 { 2715 struct spdk_bdev_desc *desc = arg; 2716 2717 desc->remove_cb(desc->remove_ctx); 2718 } 2719 2720 void 2721 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 2722 { 2723 struct spdk_bdev_desc *desc, *tmp; 2724 bool do_destruct = true; 2725 struct spdk_thread *thread; 2726 2727 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name); 2728 2729 thread = spdk_get_thread(); 2730 if (!thread) { 2731 /* The user called this from a non-SPDK thread.
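 * Unregistration has to send messages to SPDK threads, so it can only be started from one; report -ENOTSUP instead.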
*/ 2732 cb_fn(cb_arg, -ENOTSUP); 2733 return; 2734 } 2735 2736 pthread_mutex_lock(&bdev->mutex); 2737 2738 spdk_vbdev_remove_base_bdevs(bdev); 2739 2740 bdev->status = SPDK_BDEV_STATUS_REMOVING; 2741 bdev->unregister_cb = cb_fn; 2742 bdev->unregister_ctx = cb_arg; 2743 2744 TAILQ_FOREACH_SAFE(desc, &bdev->open_descs, link, tmp) { 2745 if (desc->remove_cb) { 2746 do_destruct = false; 2747 /* 2748 * Defer invocation of the remove_cb to a separate message that will 2749 * run later on this thread. This ensures this context unwinds and 2750 * we don't recursively unregister this bdev again if the remove_cb 2751 * immediately closes its descriptor. 2752 */ 2753 spdk_thread_send_msg(thread, _remove_notify, desc); 2754 } 2755 } 2756 2757 if (!do_destruct) { 2758 pthread_mutex_unlock(&bdev->mutex); 2759 return; 2760 } 2761 2762 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link); 2763 pthread_mutex_unlock(&bdev->mutex); 2764 2765 spdk_bdev_fini(bdev); 2766 } 2767 2768 int 2769 spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb, 2770 void *remove_ctx, struct spdk_bdev_desc **_desc) 2771 { 2772 struct spdk_bdev_desc *desc; 2773 2774 desc = calloc(1, sizeof(*desc)); 2775 if (desc == NULL) { 2776 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 2777 return -ENOMEM; 2778 } 2779 2780 pthread_mutex_lock(&bdev->mutex); 2781 2782 if (write && bdev->claim_module) { 2783 SPDK_INFOLOG(SPDK_LOG_BDEV, "Could not open %s - already claimed\n", bdev->name); 2784 free(desc); 2785 pthread_mutex_unlock(&bdev->mutex); 2786 return -EPERM; 2787 } 2788 2789 TAILQ_INSERT_TAIL(&bdev->open_descs, desc, link); 2790 2791 desc->bdev = bdev; 2792 desc->remove_cb = remove_cb; 2793 desc->remove_ctx = remove_ctx; 2794 desc->write = write; 2795 *_desc = desc; 2796 2797 pthread_mutex_unlock(&bdev->mutex); 2798 2799 return 0; 2800 } 2801 2802 void 2803 spdk_bdev_close(struct spdk_bdev_desc *desc) 2804 { 2805 struct spdk_bdev *bdev = desc->bdev; 2806 bool do_unregister = false; 2807 2808 pthread_mutex_lock(&bdev->mutex); 2809 2810 TAILQ_REMOVE(&bdev->open_descs, desc, link); 2811 free(desc); 2812 2813 if (bdev->status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->open_descs)) { 2814 do_unregister = true; 2815 } 2816 pthread_mutex_unlock(&bdev->mutex); 2817 2818 if (do_unregister == true) { 2819 spdk_bdev_unregister(bdev, bdev->unregister_cb, bdev->unregister_ctx); 2820 } 2821 } 2822 2823 int 2824 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 2825 struct spdk_bdev_module *module) 2826 { 2827 if (bdev->claim_module != NULL) { 2828 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 2829 bdev->claim_module->name); 2830 return -EPERM; 2831 } 2832 2833 if (desc && !desc->write) { 2834 desc->write = true; 2835 } 2836 2837 bdev->claim_module = module; 2838 return 0; 2839 } 2840 2841 void 2842 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 2843 { 2844 assert(bdev->claim_module != NULL); 2845 bdev->claim_module = NULL; 2846 } 2847 2848 struct spdk_bdev * 2849 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 2850 { 2851 return desc->bdev; 2852 } 2853 2854 void 2855 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 2856 { 2857 struct iovec *iovs; 2858 int iovcnt; 2859 2860 if (bdev_io == NULL) { 2861 return; 2862 } 2863 2864 switch (bdev_io->type) { 2865 case SPDK_BDEV_IO_TYPE_READ: 2866 iovs = bdev_io->u.bdev.iovs; 2867 iovcnt = bdev_io->u.bdev.iovcnt; 2868 break; 2869 case SPDK_BDEV_IO_TYPE_WRITE: 2870 
iovs = bdev_io->u.bdev.iovs; 2871 iovcnt = bdev_io->u.bdev.iovcnt; 2872 break; 2873 default: 2874 iovs = NULL; 2875 iovcnt = 0; 2876 break; 2877 } 2878 2879 if (iovp) { 2880 *iovp = iovs; 2881 } 2882 if (iovcntp) { 2883 *iovcntp = iovcnt; 2884 } 2885 } 2886 2887 void 2888 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 2889 { 2890 2891 if (spdk_bdev_module_list_find(bdev_module->name)) { 2892 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 2893 assert(false); 2894 } 2895 2896 if (bdev_module->async_init) { 2897 bdev_module->action_in_progress = 1; 2898 } 2899 2900 /* 2901 * Modules with examine callbacks must be initialized first, so they are 2902 * ready to handle examine callbacks from later modules that will 2903 * register physical bdevs. 2904 */ 2905 if (bdev_module->examine != NULL) { 2906 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, tailq); 2907 } else { 2908 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, tailq); 2909 } 2910 } 2911 2912 struct spdk_bdev_module * 2913 spdk_bdev_module_list_find(const char *name) 2914 { 2915 struct spdk_bdev_module *bdev_module; 2916 2917 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) { 2918 if (strcmp(name, bdev_module->name) == 0) { 2919 break; 2920 } 2921 } 2922 2923 return bdev_module; 2924 } 2925 2926 static void 2927 spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2928 { 2929 uint64_t len; 2930 2931 if (!success) { 2932 bdev_io->cb = bdev_io->u.bdev.stored_user_cb; 2933 _spdk_bdev_io_complete(bdev_io); 2934 return; 2935 } 2936 2937 /* no need to perform the error checking from write_zeroes_blocks because this request already passed those checks. */ 2938 len = spdk_min(spdk_bdev_get_block_size(bdev_io->bdev) * bdev_io->u.bdev.split_remaining_num_blocks, 2939 ZERO_BUFFER_SIZE); 2940 2941 bdev_io->u.bdev.offset_blocks = bdev_io->u.bdev.split_current_offset_blocks; 2942 bdev_io->u.bdev.iov.iov_len = len; 2943 bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev_io->bdev); 2944 bdev_io->u.bdev.split_remaining_num_blocks -= bdev_io->u.bdev.num_blocks; 2945 bdev_io->u.bdev.split_current_offset_blocks += bdev_io->u.bdev.num_blocks; 2946 2947 /* if this round completes the i/o, change the callback to be the original user callback */ 2948 if (bdev_io->u.bdev.split_remaining_num_blocks == 0) { 2949 spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, bdev_io->u.bdev.stored_user_cb); 2950 } else { 2951 spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, spdk_bdev_write_zeroes_split); 2952 } 2953 spdk_bdev_io_submit(bdev_io); 2954 } 2955 2956 struct set_qos_limit_ctx { 2957 void (*cb_fn)(void *cb_arg, int status); 2958 void *cb_arg; 2959 struct spdk_bdev *bdev; 2960 }; 2961 2962 static void 2963 _spdk_bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 2964 { 2965 pthread_mutex_lock(&ctx->bdev->mutex); 2966 ctx->bdev->qos_mod_in_progress = false; 2967 pthread_mutex_unlock(&ctx->bdev->mutex); 2968 2969 ctx->cb_fn(ctx->cb_arg, status); 2970 free(ctx); 2971 } 2972 2973 static void 2974 _spdk_bdev_disable_qos_done(void *cb_arg) 2975 { 2976 struct set_qos_limit_ctx *ctx = cb_arg; 2977 struct spdk_bdev *bdev = ctx->bdev; 2978 struct spdk_bdev_qos *qos; 2979 2980 pthread_mutex_lock(&bdev->mutex); 2981 qos = bdev->qos; 2982 bdev->qos = NULL; 2983 pthread_mutex_unlock(&bdev->mutex); 2984 2985 _spdk_bdev_abort_queued_io(&qos->queued, qos->ch); 2986 _spdk_bdev_channel_destroy(qos->ch); 2987 spdk_poller_unregister(&qos->poller); 
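/* The QoS bdev channel is not obtained through spdk_get_io_channel(), so it is freed directly together with the QoS structure itself. */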
2988 2989 free(qos->ch); 2990 free(qos); 2991 2992 _spdk_bdev_set_qos_limit_done(ctx, 0); 2993 } 2994 2995 static void 2996 _spdk_bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) 2997 { 2998 void *io_device = spdk_io_channel_iter_get_io_device(i); 2999 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3000 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3001 struct spdk_thread *thread; 3002 3003 pthread_mutex_lock(&bdev->mutex); 3004 thread = bdev->qos->thread; 3005 pthread_mutex_unlock(&bdev->mutex); 3006 3007 spdk_thread_send_msg(thread, _spdk_bdev_disable_qos_done, ctx); 3008 } 3009 3010 static void 3011 _spdk_bdev_disable_qos_msg(struct spdk_io_channel_iter *i) 3012 { 3013 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 3014 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 3015 3016 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 3017 3018 spdk_for_each_channel_continue(i, 0); 3019 } 3020 3021 static void 3022 _spdk_bdev_update_qos_limit_iops_msg(void *cb_arg) 3023 { 3024 struct set_qos_limit_ctx *ctx = cb_arg; 3025 struct spdk_bdev *bdev = ctx->bdev; 3026 3027 pthread_mutex_lock(&bdev->mutex); 3028 spdk_bdev_qos_update_max_ios_per_timeslice(bdev->qos); 3029 pthread_mutex_unlock(&bdev->mutex); 3030 3031 _spdk_bdev_set_qos_limit_done(ctx, 0); 3032 } 3033 3034 static void 3035 _spdk_bdev_enable_qos_msg(struct spdk_io_channel_iter *i) 3036 { 3037 void *io_device = spdk_io_channel_iter_get_io_device(i); 3038 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3039 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 3040 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 3041 int rc; 3042 3043 pthread_mutex_lock(&bdev->mutex); 3044 rc = _spdk_bdev_enable_qos(bdev, bdev_ch); 3045 pthread_mutex_unlock(&bdev->mutex); 3046 spdk_for_each_channel_continue(i, rc); 3047 } 3048 3049 static void 3050 _spdk_bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) 3051 { 3052 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3053 3054 _spdk_bdev_set_qos_limit_done(ctx, status); 3055 } 3056 3057 void 3058 spdk_bdev_set_qos_limit_iops(struct spdk_bdev *bdev, uint64_t ios_per_sec, 3059 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 3060 { 3061 struct set_qos_limit_ctx *ctx; 3062 3063 if (ios_per_sec > 0 && ios_per_sec % SPDK_BDEV_QOS_MIN_IOS_PER_SEC) { 3064 SPDK_ERRLOG("Requested ios_per_sec limit %" PRIu64 " is not a multiple of %u\n", 3065 ios_per_sec, SPDK_BDEV_QOS_MIN_IOS_PER_SEC); 3066 cb_fn(cb_arg, -EINVAL); 3067 return; 3068 } 3069 3070 ctx = calloc(1, sizeof(*ctx)); 3071 if (ctx == NULL) { 3072 cb_fn(cb_arg, -ENOMEM); 3073 return; 3074 } 3075 3076 ctx->cb_fn = cb_fn; 3077 ctx->cb_arg = cb_arg; 3078 ctx->bdev = bdev; 3079 3080 pthread_mutex_lock(&bdev->mutex); 3081 if (bdev->qos_mod_in_progress) { 3082 pthread_mutex_unlock(&bdev->mutex); 3083 free(ctx); 3084 cb_fn(cb_arg, -EAGAIN); 3085 return; 3086 } 3087 bdev->qos_mod_in_progress = true; 3088 3089 if (ios_per_sec > 0) { 3090 if (bdev->qos == NULL) { 3091 /* Enabling */ 3092 bdev->qos = calloc(1, sizeof(*bdev->qos)); 3093 if (!bdev->qos) { 3094 pthread_mutex_unlock(&bdev->mutex); 3095 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 3096 free(ctx); 3097 cb_fn(cb_arg, -ENOMEM); 3098 return; 3099 } 3100 3101 bdev->qos->rate_limit = ios_per_sec; 3102 spdk_for_each_channel(__bdev_to_io_dev(bdev), 3103 _spdk_bdev_enable_qos_msg, ctx, 3104 _spdk_bdev_enable_qos_done); 3105 } else { 3106 /* Updating */ 3107 
bdev->qos->rate_limit = ios_per_sec; 3108 spdk_thread_send_msg(bdev->qos->thread, _spdk_bdev_update_qos_limit_iops_msg, ctx); 3109 } 3110 } else { 3111 if (bdev->qos != NULL) { 3112 /* Disabling */ 3113 spdk_for_each_channel(__bdev_to_io_dev(bdev), 3114 _spdk_bdev_disable_qos_msg, ctx, 3115 _spdk_bdev_disable_qos_msg_done); 3116 } else { 3117 pthread_mutex_unlock(&bdev->mutex); 3118 _spdk_bdev_set_qos_limit_done(ctx, 0); 3119 return; 3120 } 3121 } 3122 3123 pthread_mutex_unlock(&bdev->mutex); 3124 } 3125 3126 SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV) 3127
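/*
 * Example (illustrative sketch only, not part of this library): enabling an IOPS rate
 * limit on a registered bdev from an SPDK application thread. The bdev name "Malloc0"
 * and the example function names below are hypothetical.
 *
 *	static void
 *	example_qos_limit_done(void *cb_arg, int status)
 *	{
 *		SPDK_NOTICELOG("QoS update finished with status %d\n", status);
 *	}
 *
 *	static void
 *	example_enable_qos(void)
 *	{
 *		struct spdk_bdev *bdev = spdk_bdev_get_by_name("Malloc0");
 *
 *		if (bdev != NULL) {
 *			// The limit must be a multiple of SPDK_BDEV_QOS_MIN_IOS_PER_SEC (10000).
 *			spdk_bdev_set_qos_limit_iops(bdev, 20000, example_qos_limit_done, NULL);
 *		}
 *	}
 *
 * The same limit can also be configured statically through the "QoS" configuration
 * section read by _spdk_bdev_qos_config() at registration time, e.g.:
 *
 *	[QoS]
 *	  Limit_IOPS Malloc0 20000
 */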