/*-
 *   BSD LICENSE
 *
 *   Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"
#include "spdk/conf.h"

#include "spdk/env.h"
#include "spdk/event.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/util.h"

#include "spdk/bdev_module.h"
#include "spdk_internal/log.h"
#include "spdk/string.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define BUF_SMALL_POOL_SIZE			8192
#define BUF_LARGE_POOL_SIZE			1024
#define NOMEM_THRESHOLD_COUNT			8
#define ZERO_BUFFER_SIZE			0x100000
#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_SEC_TO_USEC			1000000ULL
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		10000
#define SPDK_BDEV_QOS_MIN_BW_IN_MB_PER_SEC	10

enum spdk_bdev_qos_type {
	SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT = 0,
	SPDK_BDEV_QOS_RW_BYTEPS_RATE_LIMIT,
	SPDK_BDEV_QOS_NUM_TYPES /* Keep last */
};

static const char *qos_type_str[SPDK_BDEV_QOS_NUM_TYPES] = {"Limit_IOPS", "Limit_BWPS"};

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	struct spdk_mempool *buf_small_pool;
	struct spdk_mempool *buf_large_pool;

	void *zero_buffer;

	TAILQ_HEAD(, spdk_bdev_module) bdev_modules;

	TAILQ_HEAD(, spdk_bdev) bdevs;

	bool init_complete;
	bool module_init_complete;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain	*domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.init_complete = false,
	.module_init_complete = false,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos {
	/** Rate limit, in I/O per second */
	uint64_t iops_rate_limit;

	/** Rate limit, in bytes per second */
	uint64_t byte_rate_limit;

	/** The channel that all I/O are funneled through */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Maximum allowed IOs to be issued in one timeslice (e.g., 1ms) and
	 *  only valid for the master channel which manages the outstanding IOs. */
	uint64_t max_ios_per_timeslice;

	/** Submitted IO in one timeslice (e.g., 1ms) */
	uint64_t io_submitted_this_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	bdev_io_stailq_t need_buf_small;
	bdev_io_stailq_t need_buf_large;

	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t	per_thread_cache_count;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their I/O awaiting retry here. This makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t		nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t		nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel	*shared_ch;

	/* Refcount of bdev channels using this resource */
	uint32_t		ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};
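
/*
 * Note: while nomem_io is non-empty, _spdk_bdev_io_submit() below routes new
 * I/O for this io_device onto nomem_io rather than calling submit_request(),
 * which preserves submission ordering behind the I/O that hit NOMEM. Per the
 * comment on nomem_threshold above, the queued I/O are retried from the
 * completion path once io_outstanding has dropped back to that threshold.
 */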

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat stat;

	/*
	 * Count of I/O submitted through this channel and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

	bdev_io_tailq_t		queued_resets;

	uint32_t		flags;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t		start_tsc;
	uint64_t		interval_tsc;
	__itt_string_handle	*handle;
	struct spdk_bdev_io_stat prev_stat;
#endif

};

struct spdk_bdev_desc {
	struct spdk_bdev	*bdev;
	spdk_bdev_remove_cb_t	remove_cb;
	void			*remove_ctx;
	bool			write;
	TAILQ_ENTRY(spdk_bdev_desc) link;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
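
/*
 * The bdev layer needs its own io_device key for each bdev. Offsetting the
 * bdev pointer by one byte yields a key that is distinct from the bdev
 * pointer itself (which a bdev module may register as its own io_device).
 * The offset pointer is never dereferenced; it is only used as a key for
 * spdk_io_device_register()/spdk_get_io_channel().
 */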

static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, link);
	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->claim_module == NULL) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, link));

	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_alias *tmp;
	struct spdk_bdev *bdev = spdk_bdev_first();

	while (bdev != NULL) {
		if (strcmp(bdev_name, bdev->name) == 0) {
			return bdev;
		}

		TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
			if (strcmp(bdev_name, tmp->alias) == 0) {
				return bdev;
			}
		}

		bdev = spdk_bdev_next(bdev);
	}

	return NULL;
}

static void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	assert(bdev_io->get_buf_cb != NULL);
	assert(buf != NULL);
	assert(bdev_io->u.bdev.iovs != NULL);

	bdev_io->buf = buf;
	bdev_io->u.bdev.iovs[0].iov_base = (void *)((unsigned long)((char *)buf + 512) & ~511UL);
	bdev_io->u.bdev.iovs[0].iov_len = bdev_io->buf_len;
	bdev_io->get_buf_cb(bdev_io->ch->channel, bdev_io);
}
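
/*
 * Buffers in buf_small_pool/buf_large_pool are allocated with 512 bytes of
 * slack (see spdk_bdev_initialize() below), so the expression above simply
 * rounds the buffer start up to the next 512-byte boundary. Illustrative
 * example (address chosen for the example only):
 * buf == 0x10000207 -> (0x10000207 + 512) & ~511 == 0x10000400.
 */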

static void
spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_mempool *pool;
	struct spdk_bdev_io *tmp;
	void *buf;
	bdev_io_stailq_t *stailq;
	struct spdk_bdev_mgmt_channel *ch;

	assert(bdev_io->u.bdev.iovcnt == 1);

	buf = bdev_io->buf;
	ch = bdev_io->ch->shared_resource->mgmt_ch;

	if (bdev_io->buf_len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &ch->need_buf_large;
	}

	if (STAILQ_EMPTY(stailq)) {
		spdk_mempool_put(pool, buf);
	} else {
		tmp = STAILQ_FIRST(stailq);
		STAILQ_REMOVE_HEAD(stailq, internal.buf_link);
		spdk_bdev_io_set_buf(tmp, buf);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_mempool *pool;
	bdev_io_stailq_t *stailq;
	void *buf = NULL;
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	assert(cb != NULL);
	assert(bdev_io->u.bdev.iovs != NULL);

	if (spdk_unlikely(bdev_io->u.bdev.iovs[0].iov_base != NULL)) {
		/* Buffer already present */
		cb(bdev_io->ch->channel, bdev_io);
		return;
	}

	assert(len <= SPDK_BDEV_LARGE_BUF_MAX_SIZE);
	mgmt_ch = bdev_io->ch->shared_resource->mgmt_ch;

	bdev_io->buf_len = len;
	bdev_io->get_buf_cb = cb;
	if (len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &mgmt_ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &mgmt_ch->need_buf_large;
	}

	buf = spdk_mempool_get(pool);

	if (!buf) {
		STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link);
	} else {
		spdk_bdev_io_set_buf(bdev_io, buf);
	}
}
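
/*
 * Together these two functions implement a simple deferred-allocation scheme:
 * if the mempool is exhausted, spdk_bdev_io_get_buf() parks the I/O on the
 * per-thread need_buf_small/need_buf_large list, and spdk_bdev_io_put_buf()
 * hands a freed buffer straight to the next waiter instead of returning it
 * to the pool.
 */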

static int
spdk_bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

void
spdk_bdev_config_text(FILE *fp)
{
	struct spdk_bdev_module *bdev_module;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->config_text) {
			bdev_module->config_text(fp);
		}
	}
}

void
spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_module *bdev_module;
	struct spdk_bdev *bdev;

	assert(w != NULL);

	spdk_json_write_array_begin(w);

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
		if (bdev_module->config_json) {
			bdev_module->config_json(w);
		}
	}

	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, link) {
		spdk_bdev_config_json(bdev, w);
	}

	spdk_json_write_array_end(w);
}

static int
spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;

	STAILQ_INIT(&ch->need_buf_small);
	STAILQ_INIT(&ch->need_buf_large);

	STAILQ_INIT(&ch->per_thread_cache);
	ch->per_thread_cache_count = 0;

	TAILQ_INIT(&ch->shared_resources);

	return 0;
}

static void
spdk_bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;

	if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
		SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n");
	}

	if (!TAILQ_EMPTY(&ch->shared_resources)) {
		SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n");
	}

	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}

	assert(ch->per_thread_cache_count == 0);
}

static void
spdk_bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, tailq) {
		if (m->init_complete) {
			m->init_complete();
		}
	}

	cb_fn(cb_arg, rc);
}

static void
spdk_bdev_module_action_complete(void)
{
	struct spdk_bdev_module *m;

	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, tailq) {
		if (m->action_in_progress > 0) {
			return;
		}
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
	 */
	spdk_bdev_init_complete(0);
}

static void
spdk_bdev_module_action_done(struct spdk_bdev_module *module)
{
	assert(module->action_in_progress > 0);
	module->action_in_progress--;
	spdk_bdev_module_action_complete();
}

void
spdk_bdev_module_init_done(struct spdk_bdev_module *module)
{
	spdk_bdev_module_action_done(module);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
{
	spdk_bdev_module_action_done(module);
}

static int
spdk_bdev_modules_init(void)
{
	struct spdk_bdev_module *module;
	int rc = 0;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) {
		rc = module->module_init();
		if (rc != 0) {
			break;
		}
	}

	g_bdev_mgr.module_init_complete = true;
	return rc;
}

void
spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
{
	int cache_size;
	int rc = 0;
	char mempool_name[32];

	assert(cb_fn != NULL);

	g_init_cb_fn = cb_fn;
	g_init_cb_arg = cb_arg;

	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());

	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
				  SPDK_BDEV_IO_POOL_SIZE,
				  sizeof(struct spdk_bdev_io) +
				  spdk_bdev_module_get_max_ctx_size(),
				  0,
				  SPDK_ENV_SOCKET_ID_ANY);

	if (g_bdev_mgr.bdev_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	/**
	 * Ensure no more than half of the total buffers end up in local caches, by
	 * using spdk_env_get_core_count() to determine how many local caches we need
	 * to account for.
	 */
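	/* Illustrative arithmetic (core count assumed for the example only):
	 * with BUF_SMALL_POOL_SIZE == 8192 and 16 cores, cache_size is
	 * 8192 / (2 * 16) == 256 buffers per core, so at most half of the
	 * pool can sit in per-core caches at any time.
	 */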
	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());

	g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
				    BUF_SMALL_POOL_SIZE,
				    SPDK_BDEV_SMALL_BUF_MAX_SIZE + 512,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_small_pool) {
		SPDK_ERRLOG("create rbuf small pool failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());

	g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
				    BUF_LARGE_POOL_SIZE,
				    SPDK_BDEV_LARGE_BUF_MAX_SIZE + 512,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_large_pool) {
		SPDK_ERRLOG("create rbuf large pool failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
				 NULL);
	if (!g_bdev_mgr.zero_buffer) {
		SPDK_ERRLOG("create bdev zero buffer failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

#ifdef SPDK_CONFIG_VTUNE
	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
#endif

	spdk_io_device_register(&g_bdev_mgr, spdk_bdev_mgmt_channel_create,
				spdk_bdev_mgmt_channel_destroy,
				sizeof(struct spdk_bdev_mgmt_channel));

	rc = spdk_bdev_modules_init();
	if (rc != 0) {
		SPDK_ERRLOG("bdev modules init failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	spdk_bdev_module_action_complete();
}

static void
spdk_bdev_mgr_unregister_cb(void *io_device)
{
	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;

	if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != SPDK_BDEV_IO_POOL_SIZE) {
		SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
			    SPDK_BDEV_IO_POOL_SIZE);
	}

	if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) {
		SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
			    BUF_SMALL_POOL_SIZE);
		assert(false);
	}

	if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) {
		SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
			    BUF_LARGE_POOL_SIZE);
		assert(false);
	}

	spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
	spdk_mempool_free(g_bdev_mgr.buf_small_pool);
	spdk_mempool_free(g_bdev_mgr.buf_large_pool);
	spdk_dma_free(g_bdev_mgr.zero_buffer);

	cb_fn(g_fini_cb_arg);
	g_fini_cb_fn = NULL;
	g_fini_cb_arg = NULL;
}

static struct spdk_bdev_module *g_resume_bdev_module = NULL;

static void
spdk_bdev_module_finish_iter(void *arg)
{
	struct spdk_bdev_module *bdev_module;

	/* Start iterating from the last touched module */
	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_FIRST(&g_bdev_mgr.bdev_modules);
	} else {
		bdev_module = TAILQ_NEXT(g_resume_bdev_module, tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini) {
			/* Save our place so we can resume later. We must
			 * save the variable here, before calling module_fini()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_finish_done() and re-enter
			 * this function to continue iterating.
			 */
			g_resume_bdev_module = bdev_module;
		}

		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}

		if (bdev_module->async_fini) {
			return;
		}

		bdev_module = TAILQ_NEXT(bdev_module, tailq);
	}

	g_resume_bdev_module = NULL;
	spdk_io_device_unregister(&g_bdev_mgr, spdk_bdev_mgr_unregister_cb);
}

void
spdk_bdev_module_finish_done(void)
{
	if (spdk_get_thread() != g_fini_thread) {
		spdk_thread_send_msg(g_fini_thread, spdk_bdev_module_finish_iter, NULL);
	} else {
		spdk_bdev_module_finish_iter(NULL);
	}
}

static void
_spdk_bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
{
	struct spdk_bdev *bdev = cb_arg;

	if (bdeverrno && bdev) {
		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
			     bdev->name);

		/*
		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
		 * bdev; try to continue by manually removing this bdev from the list and continue
		 * with the next bdev in the list.
		 */
		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link);
	}

	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n");
		/*
		 * Bdev module finish needs to be deferred as we might be in the middle of some context
		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
		 * after returning.
		 */
		spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_module_finish_iter, NULL);
		return;
	}

	/*
	 * Unregister the first bdev in the list.
	 *
	 * spdk_bdev_unregister() will handle the case where the bdev has open descriptors by
	 * calling the remove_cb of the descriptors first.
	 *
	 * Once this bdev and all of its open descriptors have been cleaned up, this function
	 * will be called again via the unregister completion callback to continue the cleanup
	 * process with the next bdev.
	 */
	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name);
	spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev);
}

void
spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
{
	assert(cb_fn != NULL);

	g_fini_thread = spdk_get_thread();

	g_fini_cb_fn = cb_fn;
	g_fini_cb_arg = cb_arg;

	_spdk_bdev_finish_unregister_bdevs_iter(NULL, 0);
}

static struct spdk_bdev_io *
spdk_bdev_get_io(struct spdk_bdev_channel *channel)
{
	struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
	struct spdk_bdev_io *bdev_io;

	if (ch->per_thread_cache_count > 0) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
	} else {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		if (!bdev_io) {
			SPDK_ERRLOG("Unable to get spdk_bdev_io\n");
			return NULL;
		}
	}

	return bdev_io;
}

static void
spdk_bdev_put_io(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_mgmt_channel *ch = bdev_io->ch->shared_resource->mgmt_ch;

	if (bdev_io->buf != NULL) {
		spdk_bdev_io_put_buf(bdev_io);
	}

	if (ch->per_thread_cache_count < SPDK_BDEV_IO_CACHE_SIZE) {
		ch->per_thread_cache_count++;
		STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, internal.buf_link);
	} else {
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}
}

static void
_spdk_bdev_qos_io_submit(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_io *bdev_io = NULL;
	struct spdk_bdev *bdev = ch->bdev;
	struct spdk_bdev_qos *qos = bdev->qos;
	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;

	while (!TAILQ_EMPTY(&qos->queued)) {
		if (qos->io_submitted_this_timeslice < qos->max_ios_per_timeslice) {
			bdev_io = TAILQ_FIRST(&qos->queued);
			TAILQ_REMOVE(&qos->queued, bdev_io, link);
			qos->io_submitted_this_timeslice++;
			ch->io_outstanding++;
			shared_resource->io_outstanding++;
			bdev->fn_table->submit_request(ch->channel, bdev_io);
		} else {
			break;
		}
	}
}

static void
_spdk_bdev_io_submit(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
	struct spdk_io_channel *ch = bdev_ch->channel;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	bdev_io->submit_tsc = spdk_get_ticks();
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
	bdev_io->in_submit_request = true;
	if (spdk_likely(bdev_ch->flags == 0)) {
		if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
			bdev->fn_table->submit_request(ch, bdev_io);
		} else {
			bdev_ch->io_outstanding--;
			shared_resource->io_outstanding--;
			TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, link);
		}
	} else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	} else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
		bdev_ch->io_outstanding--;
		shared_resource->io_outstanding--;
		TAILQ_INSERT_TAIL(&bdev->qos->queued, bdev_io, link);
		_spdk_bdev_qos_io_submit(bdev_ch);
	} else {
		SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
	bdev_io->in_submit_request = false;
}

static void
spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_thread *thread = spdk_io_channel_get_thread(bdev_io->ch->channel);

	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);

	if (bdev_io->ch->flags & BDEV_CH_QOS_ENABLED) {
		if (thread == bdev->qos->thread) {
			_spdk_bdev_io_submit(bdev_io);
		} else {
			bdev_io->io_submit_ch = bdev_io->ch;
			bdev_io->ch = bdev->qos->ch;
			spdk_thread_send_msg(bdev->qos->thread, _spdk_bdev_io_submit, bdev_io);
		}
	} else {
		_spdk_bdev_io_submit(bdev_io);
	}
}
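
/*
 * Note on the QoS path above: when rate limiting is enabled, all I/O for the
 * bdev is funneled through the single channel recorded in bdev->qos->ch. An
 * I/O submitted from a different thread remembers its originating channel in
 * io_submit_ch and is forwarded to the QoS thread with spdk_thread_send_msg();
 * the completion path later in this file uses io_submit_ch to hand the
 * completion back to the submitting thread.
 */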

static void
spdk_bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
	struct spdk_io_channel *ch = bdev_ch->channel;

	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);

	bdev_io->in_submit_request = true;
	bdev->fn_table->submit_request(ch, bdev_io);
	bdev_io->in_submit_request = false;
}

static void
spdk_bdev_io_init(struct spdk_bdev_io *bdev_io,
		  struct spdk_bdev *bdev, void *cb_arg,
		  spdk_bdev_io_completion_cb cb)
{
	bdev_io->bdev = bdev;
	bdev_io->caller_ctx = cb_arg;
	bdev_io->cb = cb;
	bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
	bdev_io->in_submit_request = false;
	bdev_io->buf = NULL;
	bdev_io->io_submit_ch = NULL;
}

bool
spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
{
	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
}

int
spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	if (bdev->fn_table->dump_info_json) {
		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
	}

	return 0;
}

void
spdk_bdev_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	assert(bdev != NULL);
	assert(w != NULL);

	if (bdev->fn_table->write_config_json) {
		bdev->fn_table->write_config_json(bdev, w);
	} else {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "name", bdev->name);
		spdk_json_write_object_end(w);
	}
}

static void
spdk_bdev_qos_update_max_ios_per_timeslice(struct spdk_bdev_qos *qos)
{
	uint64_t max_ios_per_timeslice = 0;

	max_ios_per_timeslice = qos->iops_rate_limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC /
				SPDK_BDEV_SEC_TO_USEC;
	qos->max_ios_per_timeslice = spdk_max(max_ios_per_timeslice,
					      SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE);
}
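
/*
 * Illustrative arithmetic (rate chosen for the example only): with the
 * 1000 usec timeslice and iops_rate_limit == 20000, max_ios_per_timeslice is
 * 20000 * 1000 / 1000000 == 20 I/O per slice. The spdk_max() guards against a
 * computed value of 0 for very low rates by enforcing at least
 * SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE.
 */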

static int
spdk_bdev_channel_poll_qos(void *arg)
{
	struct spdk_bdev_qos *qos = arg;

	/* Reset for next round of rate limiting */
	qos->io_submitted_this_timeslice = 0;

	_spdk_bdev_qos_io_submit(qos->ch);

	return -1;
}

static void
_spdk_bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_shared_resource *shared_resource;

	if (!ch) {
		return;
	}

	if (ch->channel) {
		spdk_put_io_channel(ch->channel);
	}

	assert(ch->io_outstanding == 0);

	shared_resource = ch->shared_resource;
	if (shared_resource) {
		assert(ch->io_outstanding == 0);
		assert(shared_resource->ref > 0);
		shared_resource->ref--;
		if (shared_resource->ref == 0) {
			assert(shared_resource->io_outstanding == 0);
			spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch));
			TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link);
			free(shared_resource);
		}
	}
}

/* Caller must hold bdev->mutex. */
static int
_spdk_bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_qos *qos = bdev->qos;

	/* Rate limiting on this bdev enabled */
	if (qos) {
		if (qos->ch == NULL) {
			struct spdk_io_channel *io_ch;

			SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch,
				      bdev->name, spdk_get_thread());

			/* No qos channel has been selected, so set one up */

			/* Take another reference to ch */
			io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev));
			qos->ch = ch;

			qos->thread = spdk_io_channel_get_thread(io_ch);

			TAILQ_INIT(&qos->queued);
			spdk_bdev_qos_update_max_ios_per_timeslice(qos);
			qos->io_submitted_this_timeslice = 0;

			qos->poller = spdk_poller_register(spdk_bdev_channel_poll_qos,
							   qos,
							   SPDK_BDEV_QOS_TIMESLICE_IN_USEC);
		}

		ch->flags |= BDEV_CH_QOS_ENABLED;
	}

	return 0;
}
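
/*
 * In other words, the first bdev channel created while QoS is configured
 * becomes the QoS "master" channel: its thread runs the timeslice poller and
 * every rate-limited I/O is forwarded there by spdk_bdev_io_submit() above.
 * The extra spdk_get_io_channel() reference keeps that channel alive until
 * the QoS object is torn down in spdk_bdev_qos_channel_destroy() below.
 */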

static int
spdk_bdev_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev		*bdev = __bdev_from_io_dev(io_device);
	struct spdk_bdev_channel	*ch = ctx_buf;
	struct spdk_io_channel		*mgmt_io_ch;
	struct spdk_bdev_mgmt_channel	*mgmt_ch;
	struct spdk_bdev_shared_resource *shared_resource;

	ch->bdev = bdev;
	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
	if (!ch->channel) {
		return -1;
	}

	mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr);
	if (!mgmt_io_ch) {
		return -1;
	}

	mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch);
	TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) {
		if (shared_resource->shared_ch == ch->channel) {
			spdk_put_io_channel(mgmt_io_ch);
			shared_resource->ref++;
			break;
		}
	}

	if (shared_resource == NULL) {
		shared_resource = calloc(1, sizeof(*shared_resource));
		if (shared_resource == NULL) {
			spdk_put_io_channel(mgmt_io_ch);
			return -1;
		}

		shared_resource->mgmt_ch = mgmt_ch;
		shared_resource->io_outstanding = 0;
		TAILQ_INIT(&shared_resource->nomem_io);
		shared_resource->nomem_threshold = 0;
		shared_resource->shared_ch = ch->channel;
		shared_resource->ref = 1;
		TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link);
	}

	memset(&ch->stat, 0, sizeof(ch->stat));
	ch->stat.ticks_rate = spdk_get_ticks_hz();
	ch->io_outstanding = 0;
	TAILQ_INIT(&ch->queued_resets);
	ch->flags = 0;
	ch->shared_resource = shared_resource;

#ifdef SPDK_CONFIG_VTUNE
	{
		char *name;
		__itt_init_ittlib(NULL, 0);
		name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
		if (!name) {
			_spdk_bdev_channel_destroy_resource(ch);
			return -1;
		}
		ch->handle = __itt_string_handle_create(name);
		free(name);
		ch->start_tsc = spdk_get_ticks();
		ch->interval_tsc = spdk_get_ticks_hz() / 100;
		memset(&ch->prev_stat, 0, sizeof(ch->prev_stat));
	}
#endif

	pthread_mutex_lock(&bdev->mutex);

	if (_spdk_bdev_enable_qos(bdev, ch)) {
		_spdk_bdev_channel_destroy_resource(ch);
		pthread_mutex_unlock(&bdev->mutex);
		return -1;
	}

	pthread_mutex_unlock(&bdev->mutex);

	return 0;
}

/*
 * Abort I/O that are waiting on a data buffer. These types of I/O are
 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY.
 */
static void
_spdk_bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch)
{
	bdev_io_stailq_t tmp;
	struct spdk_bdev_io *bdev_io;

	STAILQ_INIT(&tmp);

	while (!STAILQ_EMPTY(queue)) {
		bdev_io = STAILQ_FIRST(queue);
		STAILQ_REMOVE_HEAD(queue, internal.buf_link);
		if (bdev_io->ch == ch) {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		} else {
			STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link);
		}
	}

	STAILQ_SWAP(&tmp, queue, spdk_bdev_io);
}

/*
 * Abort I/O that are queued waiting for submission. These types of I/O are
 * linked using the spdk_bdev_io link TAILQ_ENTRY.
 */
static void
_spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_io *bdev_io, *tmp;

	TAILQ_FOREACH_SAFE(bdev_io, queue, link, tmp) {
		if (bdev_io->ch == ch) {
			TAILQ_REMOVE(queue, bdev_io, link);
			/*
			 * spdk_bdev_io_complete() assumes that the completed I/O had
			 * been submitted to the bdev module. Since in this case it
			 * hadn't, bump io_outstanding to account for the decrement
			 * that spdk_bdev_io_complete() will do.
			 */
			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
				ch->io_outstanding++;
				ch->shared_resource->io_outstanding++;
			}
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}
}

static void
spdk_bdev_qos_channel_destroy(void *cb_arg)
{
	struct spdk_bdev_qos *qos = cb_arg;

	spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
	spdk_poller_unregister(&qos->poller);

	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Free QoS %p.\n", qos);

	free(qos);
}

static int
spdk_bdev_qos_destroy(struct spdk_bdev *bdev)
{
	/*
	 * Cleanly shutting down the QoS poller is tricky, because
	 * during the asynchronous operation the user could open
	 * a new descriptor and create a new channel, spawning
	 * a new QoS poller.
	 *
	 * The strategy is to create a new QoS structure here and swap it
	 * in. The shutdown path then continues to refer to the old one
	 * until it completes and then releases it.
	 */
	struct spdk_bdev_qos *new_qos, *old_qos;

	old_qos = bdev->qos;

	new_qos = calloc(1, sizeof(*new_qos));
	if (!new_qos) {
		SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n");
		return -ENOMEM;
	}

	/* Copy the old QoS data into the newly allocated structure */
	memcpy(new_qos, old_qos, sizeof(*new_qos));

	/* Zero out the key parts of the QoS structure */
	new_qos->ch = NULL;
	new_qos->thread = NULL;
	new_qos->max_ios_per_timeslice = 0;
	new_qos->io_submitted_this_timeslice = 0;
	new_qos->poller = NULL;
	TAILQ_INIT(&new_qos->queued);

	bdev->qos = new_qos;

	spdk_thread_send_msg(old_qos->thread, spdk_bdev_qos_channel_destroy,
			     old_qos);

	/* It is safe to continue with destroying the bdev even though the QoS channel hasn't
	 * been destroyed yet. The destruction path will end up waiting for the final
	 * channel to be put before it releases resources. */

	return 0;
}

static void
spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_channel	*ch = ctx_buf;
	struct spdk_bdev_mgmt_channel	*mgmt_ch;
	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;

	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
		      spdk_get_thread());

	mgmt_ch = shared_resource->mgmt_ch;

	_spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
	_spdk_bdev_abort_queued_io(&shared_resource->nomem_io, ch);
	_spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_small, ch);
	_spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_large, ch);

	_spdk_bdev_channel_destroy_resource(ch);
}

int
spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
{
	struct spdk_bdev_alias *tmp;

	if (alias == NULL) {
		SPDK_ERRLOG("Empty alias passed\n");
		return -EINVAL;
	}

	if (spdk_bdev_get_by_name(alias)) {
		SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias);
		return -EEXIST;
	}

	tmp = calloc(1, sizeof(*tmp));
	if (tmp == NULL) {
		SPDK_ERRLOG("Unable to allocate alias\n");
		return -ENOMEM;
	}

	tmp->alias = strdup(alias);
	if (tmp->alias == NULL) {
		free(tmp);
		SPDK_ERRLOG("Unable to allocate alias\n");
		return -ENOMEM;
	}

	TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);

	return 0;
}

int
spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
{
	struct spdk_bdev_alias *tmp;

	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (strcmp(alias, tmp->alias) == 0) {
			TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
			free(tmp->alias);
			free(tmp);
			return 0;
		}
	}

	SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exist\n", alias);

	return -ENOENT;
}

struct spdk_io_channel *
spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
{
	return spdk_get_io_channel(__bdev_to_io_dev(desc->bdev));
}

const char *
spdk_bdev_get_name(const struct spdk_bdev *bdev)
{
	return bdev->name;
}

const char *
spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
{
	return bdev->product_name;
}

const struct spdk_bdev_aliases_list *
spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
{
	return &bdev->aliases;
}

uint32_t
spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
{
	return bdev->blocklen;
}

uint64_t
spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
{
	return bdev->blockcnt;
}

uint64_t
spdk_bdev_get_qos_ios_per_sec(struct spdk_bdev *bdev)
{
	uint64_t iops_rate_limit = 0;

	pthread_mutex_lock(&bdev->mutex);
	if (bdev->qos) {
		iops_rate_limit = bdev->qos->iops_rate_limit;
	}
	pthread_mutex_unlock(&bdev->mutex);

	return iops_rate_limit;
}

size_t
spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
{
	/* TODO: push this logic down to the bdev modules */
	if (bdev->need_aligned_buffer) {
		return bdev->blocklen;
	}

	return 1;
}

uint32_t
spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
{
	return bdev->optimal_io_boundary;
}

bool
spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
{
	return bdev->write_cache;
}

const struct spdk_uuid *
spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
{
	return &bdev->uuid;
}

int
spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
{
	int ret;

	pthread_mutex_lock(&bdev->mutex);

	/* bdev has open descriptors */
	if (!TAILQ_EMPTY(&bdev->open_descs) &&
	    bdev->blockcnt > size) {
		ret = -EBUSY;
	} else {
		bdev->blockcnt = size;
		ret = 0;
	}

	pthread_mutex_unlock(&bdev->mutex);

	return ret;
}

/*
 * Convert I/O offset and length from bytes to blocks.
 *
 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size.
 */
static uint64_t
spdk_bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks,
			  uint64_t num_bytes, uint64_t *num_blocks)
{
	uint32_t block_size = bdev->blocklen;

	*offset_blocks = offset_bytes / block_size;
	*num_blocks = num_bytes / block_size;

	return (offset_bytes % block_size) | (num_bytes % block_size);
}
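
/*
 * The bitwise OR above is a compact validity check: the result is non-zero if
 * and only if either the offset or the length has a remainder modulo the
 * block size, i.e. either value is not block-aligned.
 */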

static bool
spdk_bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
{
	/* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there
	 * has been an overflow and hence the offset has been wrapped around */
	if (offset_blocks + num_blocks < offset_blocks) {
		return false;
	}

	/* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
	if (offset_blocks + num_blocks > bdev->blockcnt) {
		return false;
	}

	return true;
}

int
spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
	       void *buf, uint64_t offset, uint64_t nbytes,
	       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		      void *buf, uint64_t offset_blocks, uint64_t num_blocks,
		      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during read\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.bdev.iov.iov_base = buf;
	bdev_io->u.bdev.iov.iov_len = num_blocks * bdev->blocklen;
	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}
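
/*
 * Minimal usage sketch for the read path (hypothetical caller, not part of
 * this file; "my_bdev", read_done() and buf are made-up names, and buf is an
 * application buffer sized and aligned per spdk_bdev_get_buf_align()). A
 * caller opens a descriptor, takes a per-thread I/O channel, issues the read,
 * and releases the spdk_bdev_io from its completion callback:
 *
 *	static void
 *	read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	struct spdk_bdev *bdev = spdk_bdev_get_by_name("my_bdev");
 *	struct spdk_bdev_desc *desc;
 *	struct spdk_io_channel *io_ch;
 *
 *	spdk_bdev_open(bdev, false, NULL, NULL, &desc);
 *	io_ch = spdk_bdev_get_io_channel(desc);
 *	spdk_bdev_read_blocks(desc, io_ch, buf, 0, 1, read_done, NULL);
 */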

int
spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		struct iovec *iov, int iovcnt,
		uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       struct iovec *iov, int iovcnt,
		       uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during read\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.bdev.iovs = iov;
	bdev_io->u.bdev.iovcnt = iovcnt;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		void *buf, uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       void *buf, uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during write\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.bdev.iov.iov_base = buf;
	bdev_io->u.bdev.iov.iov_len = num_blocks * bdev->blocklen;
	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		 struct iovec *iov, int iovcnt,
		 uint64_t offset, uint64_t len,
		 spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			struct iovec *iov, int iovcnt,
			uint64_t offset_blocks, uint64_t num_blocks,
			spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during writev\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.bdev.iovs = iov;
	bdev_io->u.bdev.iovcnt = iovcnt;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}
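
/*
 * Write-zeroes handling below: if the module supports
 * SPDK_BDEV_IO_TYPE_WRITE_ZEROES the request is passed straight through.
 * Otherwise it is emulated with ordinary writes of the shared zero buffer,
 * at most ZERO_BUFFER_SIZE bytes per child I/O; larger requests record the
 * remaining blocks in the bdev_io and continue from
 * spdk_bdev_write_zeroes_split() as each child completes.
 */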

int
spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset, uint64_t len,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      uint64_t offset_blocks, uint64_t num_blocks,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	uint64_t len;
	bool split_request = false;

	if (num_blocks > UINT64_MAX / spdk_bdev_get_block_size(bdev)) {
		SPDK_ERRLOG("length argument out of range in write_zeroes\n");
		return -ERANGE;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);

	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during write_zeroes\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->u.bdev.offset_blocks = offset_blocks;

	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
		bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
		bdev_io->u.bdev.num_blocks = num_blocks;
		bdev_io->u.bdev.iovs = NULL;
		bdev_io->u.bdev.iovcnt = 0;

	} else {
		assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE);

		len = spdk_bdev_get_block_size(bdev) * num_blocks;

		if (len > ZERO_BUFFER_SIZE) {
			split_request = true;
			len = ZERO_BUFFER_SIZE;
		}

		bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
		bdev_io->u.bdev.iov.iov_base = g_bdev_mgr.zero_buffer;
		bdev_io->u.bdev.iov.iov_len = len;
		bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
		bdev_io->u.bdev.iovcnt = 1;
		bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev);
		bdev_io->u.bdev.split_remaining_num_blocks = num_blocks - bdev_io->u.bdev.num_blocks;
		bdev_io->u.bdev.split_current_offset_blocks = offset_blocks + bdev_io->u.bdev.num_blocks;
	}

	if (split_request) {
		bdev_io->u.bdev.stored_user_cb = cb;
		spdk_bdev_io_init(bdev_io, bdev, cb_arg, spdk_bdev_write_zeroes_split);
	} else {
		spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
	}
	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		uint64_t offset, uint64_t nbytes,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	if (num_blocks == 0) {
		SPDK_ERRLOG("Can't unmap 0 bytes\n");
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during unmap\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
	bdev_io->u.bdev.iov.iov_base = NULL;
	bdev_io->u.bdev.iov.iov_len = 0;
	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		uint64_t offset, uint64_t length,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, length, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during flush\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
	bdev_io->u.bdev.iovs = NULL;
	bdev_io->u.bdev.iovcnt = 0;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}
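
/*
 * Reset flow: spdk_bdev_reset() queues the reset on the submitting channel;
 * _spdk_bdev_channel_start_reset() claims bdev->reset_in_progress for the
 * first such reset; _spdk_bdev_reset_freeze_channel() then runs on every
 * channel of the bdev, marking it as being reset and failing any I/O that is
 * still queued locally; finally _spdk_bdev_reset_dev() submits the reset
 * itself to the module.
 */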

static void
_spdk_bdev_reset_dev(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev_io *bdev_io;

	bdev_io = TAILQ_FIRST(&ch->queued_resets);
	TAILQ_REMOVE(&ch->queued_resets, bdev_io, link);
	spdk_bdev_io_submit_reset(bdev_io);
}

static void
_spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel		*ch;
	struct spdk_bdev_channel	*channel;
	struct spdk_bdev_mgmt_channel	*mgmt_channel;
	struct spdk_bdev_shared_resource *shared_resource;
	bdev_io_tailq_t			tmp_queued;

	TAILQ_INIT(&tmp_queued);

	ch = spdk_io_channel_iter_get_channel(i);
	channel = spdk_io_channel_get_ctx(ch);
	shared_resource = channel->shared_resource;
	mgmt_channel = shared_resource->mgmt_ch;

	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;

	if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) {
		/* The QoS object is always valid and readable while
		 * the channel flag is set, so the lock here should not
		 * be necessary. We're not in the fast path though, so
		 * just take it anyway. */
		pthread_mutex_lock(&channel->bdev->mutex);
		if (channel->bdev->qos->ch == channel) {
			TAILQ_SWAP(&channel->bdev->qos->queued, &tmp_queued, spdk_bdev_io, link);
		}
		pthread_mutex_unlock(&channel->bdev->mutex);
	}

	_spdk_bdev_abort_queued_io(&shared_resource->nomem_io, channel);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel);
	_spdk_bdev_abort_queued_io(&tmp_queued, channel);

	spdk_for_each_channel_continue(i, 0);
}

static void
_spdk_bdev_start_reset(void *ctx)
{
	struct spdk_bdev_channel *ch = ctx;

	spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), _spdk_bdev_reset_freeze_channel,
			      ch, _spdk_bdev_reset_dev);
}

static void
_spdk_bdev_channel_start_reset(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev *bdev = ch->bdev;

	assert(!TAILQ_EMPTY(&ch->queued_resets));

	pthread_mutex_lock(&bdev->mutex);
	if (bdev->reset_in_progress == NULL) {
		bdev->reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
		/*
		 * Take a channel reference for the target bdev for the life of this
		 * reset.  This guards against the channel getting destroyed while
		 * spdk_for_each_channel() calls related to this reset IO are in
		 * progress.  We will release the reference when this reset is
		 * completed.
		 */
		bdev->reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
		_spdk_bdev_start_reset(ch);
	}
	pthread_mutex_unlock(&bdev->mutex);
}
1945 */ 1946 bdev->reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 1947 _spdk_bdev_start_reset(ch); 1948 } 1949 pthread_mutex_unlock(&bdev->mutex); 1950 } 1951 1952 int 1953 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1954 spdk_bdev_io_completion_cb cb, void *cb_arg) 1955 { 1956 struct spdk_bdev *bdev = desc->bdev; 1957 struct spdk_bdev_io *bdev_io; 1958 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 1959 1960 bdev_io = spdk_bdev_get_io(channel); 1961 if (!bdev_io) { 1962 SPDK_ERRLOG("bdev_io memory allocation failed during reset\n"); 1963 return -ENOMEM; 1964 } 1965 1966 bdev_io->ch = channel; 1967 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 1968 bdev_io->u.reset.ch_ref = NULL; 1969 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 1970 1971 pthread_mutex_lock(&bdev->mutex); 1972 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, link); 1973 pthread_mutex_unlock(&bdev->mutex); 1974 1975 _spdk_bdev_channel_start_reset(channel); 1976 1977 return 0; 1978 } 1979 1980 void 1981 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 1982 struct spdk_bdev_io_stat *stat) 1983 { 1984 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 1985 1986 *stat = channel->stat; 1987 } 1988 1989 static void 1990 _spdk_bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 1991 { 1992 void *io_device = spdk_io_channel_iter_get_io_device(i); 1993 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 1994 1995 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 1996 bdev_iostat_ctx->cb_arg, 0); 1997 free(bdev_iostat_ctx); 1998 } 1999 2000 static void 2001 _spdk_bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 2002 { 2003 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 2004 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2005 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2006 2007 bdev_iostat_ctx->stat->bytes_read += channel->stat.bytes_read; 2008 bdev_iostat_ctx->stat->num_read_ops += channel->stat.num_read_ops; 2009 bdev_iostat_ctx->stat->bytes_written += channel->stat.bytes_written; 2010 bdev_iostat_ctx->stat->num_write_ops += channel->stat.num_write_ops; 2011 2012 spdk_for_each_channel_continue(i, 0); 2013 } 2014 2015 void 2016 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 2017 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 2018 { 2019 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 2020 2021 assert(bdev != NULL); 2022 assert(stat != NULL); 2023 assert(cb != NULL); 2024 2025 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 2026 if (bdev_iostat_ctx == NULL) { 2027 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 2028 cb(bdev, stat, cb_arg, -ENOMEM); 2029 return; 2030 } 2031 2032 bdev_iostat_ctx->stat = stat; 2033 bdev_iostat_ctx->cb = cb; 2034 bdev_iostat_ctx->cb_arg = cb_arg; 2035 2036 spdk_for_each_channel(__bdev_to_io_dev(bdev), 2037 _spdk_bdev_get_each_channel_stat, 2038 bdev_iostat_ctx, 2039 _spdk_bdev_get_device_stat_done); 2040 } 2041 2042 int 2043 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2044 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 2045 spdk_bdev_io_completion_cb cb, void *cb_arg) 2046 { 2047 struct spdk_bdev *bdev = desc->bdev; 2048 struct spdk_bdev_io *bdev_io; 2049 struct spdk_bdev_channel *channel =
spdk_io_channel_get_ctx(ch); 2050 2051 if (!desc->write) { 2052 return -EBADF; 2053 } 2054 2055 bdev_io = spdk_bdev_get_io(channel); 2056 if (!bdev_io) { 2057 SPDK_ERRLOG("bdev_io memory allocation failed during nvme_admin_passthru\n"); 2058 return -ENOMEM; 2059 } 2060 2061 bdev_io->ch = channel; 2062 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 2063 bdev_io->u.nvme_passthru.cmd = *cmd; 2064 bdev_io->u.nvme_passthru.buf = buf; 2065 bdev_io->u.nvme_passthru.nbytes = nbytes; 2066 bdev_io->u.nvme_passthru.md_buf = NULL; 2067 bdev_io->u.nvme_passthru.md_len = 0; 2068 2069 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2070 2071 spdk_bdev_io_submit(bdev_io); 2072 return 0; 2073 } 2074 2075 int 2076 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2077 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 2078 spdk_bdev_io_completion_cb cb, void *cb_arg) 2079 { 2080 struct spdk_bdev *bdev = desc->bdev; 2081 struct spdk_bdev_io *bdev_io; 2082 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2083 2084 if (!desc->write) { 2085 /* 2086 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 2087 * to easily determine if the command is a read or write, but for now just 2088 * do not allow io_passthru with a read-only descriptor. 2089 */ 2090 return -EBADF; 2091 } 2092 2093 bdev_io = spdk_bdev_get_io(channel); 2094 if (!bdev_io) { 2095 SPDK_ERRLOG("bdev_io memory allocation failed during nvme_io_passthru\n"); 2096 return -ENOMEM; 2097 } 2098 2099 bdev_io->ch = channel; 2100 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 2101 bdev_io->u.nvme_passthru.cmd = *cmd; 2102 bdev_io->u.nvme_passthru.buf = buf; 2103 bdev_io->u.nvme_passthru.nbytes = nbytes; 2104 bdev_io->u.nvme_passthru.md_buf = NULL; 2105 bdev_io->u.nvme_passthru.md_len = 0; 2106 2107 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2108 2109 spdk_bdev_io_submit(bdev_io); 2110 return 0; 2111 } 2112 2113 int 2114 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2115 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 2116 spdk_bdev_io_completion_cb cb, void *cb_arg) 2117 { 2118 struct spdk_bdev *bdev = desc->bdev; 2119 struct spdk_bdev_io *bdev_io; 2120 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2121 2122 if (!desc->write) { 2123 /* 2124 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 2125 * to easily determine if the command is a read or write, but for now just 2126 * do not allow io_passthru with a read-only descriptor.
2127 */ 2128 return -EBADF; 2129 } 2130 2131 bdev_io = spdk_bdev_get_io(channel); 2132 if (!bdev_io) { 2133 SPDK_ERRLOG("bdev_io memory allocation failed during nvme_io_passthru_md\n"); 2134 return -ENOMEM; 2135 } 2136 2137 bdev_io->ch = channel; 2138 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 2139 bdev_io->u.nvme_passthru.cmd = *cmd; 2140 bdev_io->u.nvme_passthru.buf = buf; 2141 bdev_io->u.nvme_passthru.nbytes = nbytes; 2142 bdev_io->u.nvme_passthru.md_buf = md_buf; 2143 bdev_io->u.nvme_passthru.md_len = md_len; 2144 2145 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2146 2147 spdk_bdev_io_submit(bdev_io); 2148 return 0; 2149 } 2150 2151 int 2152 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2153 { 2154 if (!bdev_io) { 2155 SPDK_ERRLOG("bdev_io is NULL\n"); 2156 return -1; 2157 } 2158 2159 if (bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING) { 2160 SPDK_ERRLOG("bdev_io is in pending state\n"); 2161 assert(false); 2162 return -1; 2163 } 2164 2165 spdk_bdev_put_io(bdev_io); 2166 2167 return 0; 2168 } 2169 2170 static void 2171 _spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 2172 { 2173 struct spdk_bdev *bdev = bdev_ch->bdev; 2174 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2175 struct spdk_bdev_io *bdev_io; 2176 2177 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 2178 /* 2179 * Allow some more I/O to complete before retrying the nomem_io queue. 2180 * Some drivers (such as nvme) cannot immediately take a new I/O in 2181 * the context of a completion, because the resources for the I/O are 2182 * not released until control returns to the bdev poller. Also, we 2183 * may require several small I/O to complete before a larger I/O 2184 * (that requires splitting) can be submitted. 2185 */ 2186 return; 2187 } 2188 2189 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 2190 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 2191 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, link); 2192 bdev_io->ch->io_outstanding++; 2193 shared_resource->io_outstanding++; 2194 bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING; 2195 bdev->fn_table->submit_request(bdev_io->ch->channel, bdev_io); 2196 if (bdev_io->status == SPDK_BDEV_IO_STATUS_NOMEM) { 2197 break; 2198 } 2199 } 2200 } 2201 2202 static inline void 2203 _spdk_bdev_io_complete(void *ctx) 2204 { 2205 struct spdk_bdev_io *bdev_io = ctx; 2206 2207 if (spdk_unlikely(bdev_io->in_submit_request || bdev_io->io_submit_ch)) { 2208 /* 2209 * Send the completion to the thread that originally submitted the I/O, 2210 * which may not be the current thread in the case of QoS. 2211 */ 2212 if (bdev_io->io_submit_ch) { 2213 bdev_io->ch = bdev_io->io_submit_ch; 2214 bdev_io->io_submit_ch = NULL; 2215 } 2216 2217 /* 2218 * Defer completion to avoid potential infinite recursion if the 2219 * user's completion callback issues a new I/O.
2220 */ 2221 spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->ch->channel), 2222 _spdk_bdev_io_complete, bdev_io); 2223 return; 2224 } 2225 2226 if (bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) { 2227 switch (bdev_io->type) { 2228 case SPDK_BDEV_IO_TYPE_READ: 2229 bdev_io->ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 2230 bdev_io->ch->stat.num_read_ops++; 2231 bdev_io->ch->stat.read_latency_ticks += (spdk_get_ticks() - bdev_io->submit_tsc); 2232 break; 2233 case SPDK_BDEV_IO_TYPE_WRITE: 2234 bdev_io->ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 2235 bdev_io->ch->stat.num_write_ops++; 2236 bdev_io->ch->stat.write_latency_ticks += (spdk_get_ticks() - bdev_io->submit_tsc); 2237 break; 2238 default: 2239 break; 2240 } 2241 } 2242 2243 #ifdef SPDK_CONFIG_VTUNE 2244 uint64_t now_tsc = spdk_get_ticks(); 2245 if (now_tsc > (bdev_io->ch->start_tsc + bdev_io->ch->interval_tsc)) { 2246 uint64_t data[5]; 2247 2248 data[0] = bdev_io->ch->stat.num_read_ops - bdev_io->ch->prev_stat.num_read_ops; 2249 data[1] = bdev_io->ch->stat.bytes_read - bdev_io->ch->prev_stat.bytes_read; 2250 data[2] = bdev_io->ch->stat.num_write_ops - bdev_io->ch->prev_stat.num_write_ops; 2251 data[3] = bdev_io->ch->stat.bytes_written - bdev_io->ch->prev_stat.bytes_written; 2252 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 2253 bdev_io->bdev->fn_table->get_spin_time(bdev_io->ch->channel) : 0; 2254 2255 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->ch->handle, 2256 __itt_metadata_u64, 5, data); 2257 2258 bdev_io->ch->prev_stat = bdev_io->ch->stat; 2259 bdev_io->ch->start_tsc = now_tsc; 2260 } 2261 #endif 2262 2263 assert(bdev_io->cb != NULL); 2264 assert(spdk_get_thread() == spdk_io_channel_get_thread(bdev_io->ch->channel)); 2265 2266 bdev_io->cb(bdev_io, bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS, 2267 bdev_io->caller_ctx); 2268 } 2269 2270 static void 2271 _spdk_bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 2272 { 2273 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 2274 2275 if (bdev_io->u.reset.ch_ref != NULL) { 2276 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 2277 bdev_io->u.reset.ch_ref = NULL; 2278 } 2279 2280 _spdk_bdev_io_complete(bdev_io); 2281 } 2282 2283 static void 2284 _spdk_bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 2285 { 2286 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2287 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 2288 2289 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 2290 if (!TAILQ_EMPTY(&ch->queued_resets)) { 2291 _spdk_bdev_channel_start_reset(ch); 2292 } 2293 2294 spdk_for_each_channel_continue(i, 0); 2295 } 2296 2297 void 2298 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 2299 { 2300 struct spdk_bdev *bdev = bdev_io->bdev; 2301 struct spdk_bdev_channel *bdev_ch = bdev_io->ch; 2302 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2303 2304 bdev_io->status = status; 2305 2306 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 2307 bool unlock_channels = false; 2308 2309 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 2310 SPDK_ERRLOG("NOMEM returned for reset\n"); 2311 } 2312 pthread_mutex_lock(&bdev->mutex); 2313 if (bdev_io == bdev->reset_in_progress) { 2314 bdev->reset_in_progress = NULL; 2315 unlock_channels = true; 2316 } 2317 pthread_mutex_unlock(&bdev->mutex); 2318 2319 if (unlock_channels) { 2320 spdk_for_each_channel(__bdev_to_io_dev(bdev), 
_spdk_bdev_unfreeze_channel, 2321 bdev_io, _spdk_bdev_reset_complete); 2322 return; 2323 } 2324 } else { 2325 assert(bdev_ch->io_outstanding > 0); 2326 assert(shared_resource->io_outstanding > 0); 2327 bdev_ch->io_outstanding--; 2328 shared_resource->io_outstanding--; 2329 2330 if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) { 2331 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, link); 2332 /* 2333 * Wait for some of the outstanding I/O to complete before we 2334 * retry any of the nomem_io. Normally we will wait for 2335 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 2336 * depth channels we will instead wait for half to complete. 2337 */ 2338 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 2339 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 2340 return; 2341 } 2342 2343 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 2344 _spdk_bdev_ch_retry_io(bdev_ch); 2345 } 2346 } 2347 2348 _spdk_bdev_io_complete(bdev_io); 2349 } 2350 2351 void 2352 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 2353 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 2354 { 2355 if (sc == SPDK_SCSI_STATUS_GOOD) { 2356 bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS; 2357 } else { 2358 bdev_io->status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 2359 bdev_io->error.scsi.sc = sc; 2360 bdev_io->error.scsi.sk = sk; 2361 bdev_io->error.scsi.asc = asc; 2362 bdev_io->error.scsi.ascq = ascq; 2363 } 2364 2365 spdk_bdev_io_complete(bdev_io, bdev_io->status); 2366 } 2367 2368 void 2369 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 2370 int *sc, int *sk, int *asc, int *ascq) 2371 { 2372 assert(sc != NULL); 2373 assert(sk != NULL); 2374 assert(asc != NULL); 2375 assert(ascq != NULL); 2376 2377 switch (bdev_io->status) { 2378 case SPDK_BDEV_IO_STATUS_SUCCESS: 2379 *sc = SPDK_SCSI_STATUS_GOOD; 2380 *sk = SPDK_SCSI_SENSE_NO_SENSE; 2381 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 2382 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 2383 break; 2384 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 2385 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 2386 break; 2387 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 2388 *sc = bdev_io->error.scsi.sc; 2389 *sk = bdev_io->error.scsi.sk; 2390 *asc = bdev_io->error.scsi.asc; 2391 *ascq = bdev_io->error.scsi.ascq; 2392 break; 2393 default: 2394 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 2395 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 2396 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 2397 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 2398 break; 2399 } 2400 } 2401 2402 void 2403 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc) 2404 { 2405 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 2406 bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS; 2407 } else { 2408 bdev_io->error.nvme.sct = sct; 2409 bdev_io->error.nvme.sc = sc; 2410 bdev_io->status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 2411 } 2412 2413 spdk_bdev_io_complete(bdev_io, bdev_io->status); 2414 } 2415 2416 void 2417 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc) 2418 { 2419 assert(sct != NULL); 2420 assert(sc != NULL); 2421 2422 if (bdev_io->status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 2423 *sct = bdev_io->error.nvme.sct; 2424 *sc = bdev_io->error.nvme.sc; 2425 } else if (bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) { 2426 *sct = SPDK_NVME_SCT_GENERIC; 2427 *sc = SPDK_NVME_SC_SUCCESS; 2428 } else { 2429 *sct = SPDK_NVME_SCT_GENERIC; 2430 
*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2431 } 2432 } 2433 2434 struct spdk_thread * 2435 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 2436 { 2437 return spdk_io_channel_get_thread(bdev_io->ch->channel); 2438 } 2439 2440 static void 2441 _spdk_bdev_qos_config_type(struct spdk_bdev *bdev, uint64_t qos_set, 2442 enum spdk_bdev_qos_type qos_type) 2443 { 2444 uint64_t min_qos_set = 0; 2445 2446 switch (qos_type) { 2447 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2448 min_qos_set = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 2449 break; 2450 case SPDK_BDEV_QOS_RW_BYTEPS_RATE_LIMIT: 2451 min_qos_set = SPDK_BDEV_QOS_MIN_BW_IN_MB_PER_SEC; 2452 break; 2453 default: 2454 SPDK_ERRLOG("Unsupported QoS type.\n"); 2455 return; 2456 } 2457 2458 if (qos_set % min_qos_set) { 2459 SPDK_ERRLOG("Assigned QoS %" PRIu64 " on bdev %s is not a multiple of %" PRIu64 "\n", 2460 qos_set, bdev->name, min_qos_set); 2461 SPDK_ERRLOG("Failed to enable QoS on bdev %s\n", bdev->name); 2462 return; 2463 } 2464 2465 if (!bdev->qos) { 2466 bdev->qos = calloc(1, sizeof(*bdev->qos)); 2467 if (!bdev->qos) { 2468 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 2469 return; 2470 } 2471 } 2472 2473 switch (qos_type) { 2474 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2475 bdev->qos->iops_rate_limit = qos_set; 2476 break; 2477 case SPDK_BDEV_QOS_RW_BYTEPS_RATE_LIMIT: 2478 bdev->qos->byte_rate_limit = qos_set * 1024 * 1024; 2479 break; 2480 default: 2481 break; 2482 } 2483 2484 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS type:%d set:%" PRIu64 "\n", 2485 bdev->name, qos_type, qos_set); 2486 2487 return; 2488 } 2489 2490 static void 2491 _spdk_bdev_qos_config(struct spdk_bdev *bdev) 2492 { 2493 struct spdk_conf_section *sp = NULL; 2494 const char *val = NULL; 2495 uint64_t qos_set = 0; 2496 int i = 0, j = 0; 2497 2498 sp = spdk_conf_find_section(NULL, "QoS"); 2499 if (!sp) { 2500 return; 2501 } 2502 2503 while (j < SPDK_BDEV_QOS_NUM_TYPES) { 2504 i = 0; 2505 while (true) { 2506 val = spdk_conf_section_get_nmval(sp, qos_type_str[j], i, 0); 2507 if (!val) { 2508 break; 2509 } 2510 2511 if (strcmp(bdev->name, val) != 0) { 2512 i++; 2513 continue; 2514 } 2515 2516 val = spdk_conf_section_get_nmval(sp, qos_type_str[j], i, 1); 2517 if (val) { 2518 qos_set = strtoull(val, NULL, 10); 2519 _spdk_bdev_qos_config_type(bdev, qos_set, j); 2520 } 2521 2522 break; 2523 } 2524 2525 j++; 2526 } 2527 2528 return; 2529 } 2530 2531 static int 2532 spdk_bdev_init(struct spdk_bdev *bdev) 2533 { 2534 assert(bdev->module != NULL); 2535 2536 if (!bdev->name) { 2537 SPDK_ERRLOG("Bdev name is NULL\n"); 2538 return -EINVAL; 2539 } 2540 2541 if (spdk_bdev_get_by_name(bdev->name)) { 2542 SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name); 2543 return -EEXIST; 2544 } 2545 2546 bdev->status = SPDK_BDEV_STATUS_READY; 2547 2548 TAILQ_INIT(&bdev->open_descs); 2549 2550 TAILQ_INIT(&bdev->aliases); 2551 2552 bdev->reset_in_progress = NULL; 2553 2554 _spdk_bdev_qos_config(bdev); 2555 2556 spdk_io_device_register(__bdev_to_io_dev(bdev), 2557 spdk_bdev_channel_create, spdk_bdev_channel_destroy, 2558 sizeof(struct spdk_bdev_channel)); 2559 2560 pthread_mutex_init(&bdev->mutex, NULL); 2561 return 0; 2562 } 2563 2564 static void 2565 spdk_bdev_destroy_cb(void *io_device) 2566 { 2567 int rc; 2568 struct spdk_bdev *bdev; 2569 spdk_bdev_unregister_cb cb_fn; 2570 void *cb_arg; 2571 2572 bdev = __bdev_from_io_dev(io_device); 2573 cb_fn = bdev->unregister_cb; 2574 cb_arg = bdev->unregister_ctx; 2575 2576 rc = bdev->fn_table->destruct(bdev->ctxt); 2577 if (rc < 0) { 2578 SPDK_ERRLOG("destruct
failed\n"); 2579 } 2580 if (rc <= 0 && cb_fn != NULL) { 2581 cb_fn(cb_arg, rc); 2582 } 2583 } 2584 2585 2586 static void 2587 spdk_bdev_fini(struct spdk_bdev *bdev) 2588 { 2589 pthread_mutex_destroy(&bdev->mutex); 2590 2591 free(bdev->qos); 2592 2593 spdk_io_device_unregister(__bdev_to_io_dev(bdev), spdk_bdev_destroy_cb); 2594 } 2595 2596 static void 2597 spdk_bdev_start(struct spdk_bdev *bdev) 2598 { 2599 struct spdk_bdev_module *module; 2600 2601 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name); 2602 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, link); 2603 2604 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) { 2605 if (module->examine) { 2606 module->action_in_progress++; 2607 module->examine(bdev); 2608 } 2609 } 2610 } 2611 2612 int 2613 spdk_bdev_register(struct spdk_bdev *bdev) 2614 { 2615 int rc = spdk_bdev_init(bdev); 2616 2617 if (rc == 0) { 2618 spdk_bdev_start(bdev); 2619 } 2620 2621 return rc; 2622 } 2623 2624 static void 2625 spdk_vbdev_remove_base_bdevs(struct spdk_bdev *vbdev) 2626 { 2627 struct spdk_bdev **bdevs; 2628 struct spdk_bdev *base; 2629 size_t i, j, k; 2630 bool found; 2631 2632 /* Iterate over base bdevs to remove vbdev from them. */ 2633 for (i = 0; i < vbdev->base_bdevs_cnt; i++) { 2634 found = false; 2635 base = vbdev->base_bdevs[i]; 2636 2637 for (j = 0; j < base->vbdevs_cnt; j++) { 2638 if (base->vbdevs[j] != vbdev) { 2639 continue; 2640 } 2641 2642 for (k = j; k + 1 < base->vbdevs_cnt; k++) { 2643 base->vbdevs[k] = base->vbdevs[k + 1]; 2644 } 2645 2646 base->vbdevs_cnt--; 2647 if (base->vbdevs_cnt > 0) { 2648 bdevs = realloc(base->vbdevs, base->vbdevs_cnt * sizeof(bdevs[0])); 2649 /* It would be odd if shrinking memory block fail. */ 2650 assert(bdevs); 2651 base->vbdevs = bdevs; 2652 } else { 2653 free(base->vbdevs); 2654 base->vbdevs = NULL; 2655 } 2656 2657 found = true; 2658 break; 2659 } 2660 2661 if (!found) { 2662 SPDK_WARNLOG("Bdev '%s' is not base bdev of '%s'.\n", base->name, vbdev->name); 2663 } 2664 } 2665 2666 free(vbdev->base_bdevs); 2667 vbdev->base_bdevs = NULL; 2668 vbdev->base_bdevs_cnt = 0; 2669 } 2670 2671 static int 2672 spdk_vbdev_set_base_bdevs(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, size_t cnt) 2673 { 2674 struct spdk_bdev **vbdevs; 2675 struct spdk_bdev *base; 2676 size_t i; 2677 2678 /* Adding base bdevs isn't supported (yet?). */ 2679 assert(vbdev->base_bdevs_cnt == 0); 2680 2681 vbdev->base_bdevs = malloc(cnt * sizeof(vbdev->base_bdevs[0])); 2682 if (!vbdev->base_bdevs) { 2683 SPDK_ERRLOG("%s - realloc() failed\n", vbdev->name); 2684 return -ENOMEM; 2685 } 2686 2687 memcpy(vbdev->base_bdevs, base_bdevs, cnt * sizeof(vbdev->base_bdevs[0])); 2688 vbdev->base_bdevs_cnt = cnt; 2689 2690 /* Iterate over base bdevs to add this vbdev to them. 
*/ 2691 for (i = 0; i < cnt; i++) { 2692 base = vbdev->base_bdevs[i]; 2693 2694 assert(base != NULL); 2695 assert(base->claim_module != NULL); 2696 2697 vbdevs = realloc(base->vbdevs, (base->vbdevs_cnt + 1) * sizeof(vbdevs[0])); 2698 if (!vbdevs) { 2699 SPDK_ERRLOG("%s - realloc() failed\n", base->name); 2700 spdk_vbdev_remove_base_bdevs(vbdev); 2701 return -ENOMEM; 2702 } 2703 2704 vbdevs[base->vbdevs_cnt] = vbdev; 2705 base->vbdevs = vbdevs; 2706 base->vbdevs_cnt++; 2707 } 2708 2709 return 0; 2710 } 2711 2712 int 2713 spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count) 2714 { 2715 int rc; 2716 2717 rc = spdk_bdev_init(vbdev); 2718 if (rc) { 2719 return rc; 2720 } 2721 2722 if (base_bdev_count == 0) { 2723 spdk_bdev_start(vbdev); 2724 return 0; 2725 } 2726 2727 rc = spdk_vbdev_set_base_bdevs(vbdev, base_bdevs, base_bdev_count); 2728 if (rc) { 2729 spdk_bdev_fini(vbdev); 2730 return rc; 2731 } 2732 2733 spdk_bdev_start(vbdev); 2734 return 0; 2735 2736 } 2737 2738 void 2739 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 2740 { 2741 if (bdev->unregister_cb != NULL) { 2742 bdev->unregister_cb(bdev->unregister_ctx, bdeverrno); 2743 } 2744 } 2745 2746 static void 2747 _remove_notify(void *arg) 2748 { 2749 struct spdk_bdev_desc *desc = arg; 2750 2751 desc->remove_cb(desc->remove_ctx); 2752 } 2753 2754 void 2755 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 2756 { 2757 struct spdk_bdev_desc *desc, *tmp; 2758 bool do_destruct = true; 2759 struct spdk_thread *thread; 2760 2761 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name); 2762 2763 thread = spdk_get_thread(); 2764 if (!thread) { 2765 /* The user called this from a non-SPDK thread. */ 2766 cb_fn(cb_arg, -ENOTSUP); 2767 return; 2768 } 2769 2770 pthread_mutex_lock(&bdev->mutex); 2771 2772 spdk_vbdev_remove_base_bdevs(bdev); 2773 2774 bdev->status = SPDK_BDEV_STATUS_REMOVING; 2775 bdev->unregister_cb = cb_fn; 2776 bdev->unregister_ctx = cb_arg; 2777 2778 TAILQ_FOREACH_SAFE(desc, &bdev->open_descs, link, tmp) { 2779 if (desc->remove_cb) { 2780 do_destruct = false; 2781 /* 2782 * Defer invocation of the remove_cb to a separate message that will 2783 * run later on this thread. This ensures this context unwinds and 2784 * we don't recursively unregister this bdev again if the remove_cb 2785 * immediately closes its descriptor. 
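			 * Because a descriptor is still open, do_destruct stays false
			 * and the actual teardown happens later, when the last
			 * descriptor is closed via spdk_bdev_close().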
2786 */ 2787 spdk_thread_send_msg(thread, _remove_notify, desc); 2788 } 2789 } 2790 2791 if (!do_destruct) { 2792 pthread_mutex_unlock(&bdev->mutex); 2793 return; 2794 } 2795 2796 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link); 2797 pthread_mutex_unlock(&bdev->mutex); 2798 2799 spdk_bdev_fini(bdev); 2800 } 2801 2802 int 2803 spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb, 2804 void *remove_ctx, struct spdk_bdev_desc **_desc) 2805 { 2806 struct spdk_bdev_desc *desc; 2807 2808 desc = calloc(1, sizeof(*desc)); 2809 if (desc == NULL) { 2810 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 2811 return -ENOMEM; 2812 } 2813 2814 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 2815 spdk_get_thread()); 2816 2817 pthread_mutex_lock(&bdev->mutex); 2818 2819 if (write && bdev->claim_module) { 2820 SPDK_ERRLOG("Could not open %s - already claimed\n", bdev->name); 2821 free(desc); 2822 pthread_mutex_unlock(&bdev->mutex); 2823 return -EPERM; 2824 } 2825 2826 TAILQ_INSERT_TAIL(&bdev->open_descs, desc, link); 2827 2828 desc->bdev = bdev; 2829 desc->remove_cb = remove_cb; 2830 desc->remove_ctx = remove_ctx; 2831 desc->write = write; 2832 *_desc = desc; 2833 2834 pthread_mutex_unlock(&bdev->mutex); 2835 2836 return 0; 2837 } 2838 2839 void 2840 spdk_bdev_close(struct spdk_bdev_desc *desc) 2841 { 2842 struct spdk_bdev *bdev = desc->bdev; 2843 bool do_unregister = false; 2844 2845 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 2846 spdk_get_thread()); 2847 2848 pthread_mutex_lock(&bdev->mutex); 2849 2850 TAILQ_REMOVE(&bdev->open_descs, desc, link); 2851 free(desc); 2852 2853 /* If no more descriptors, kill QoS channel */ 2854 if (bdev->qos && TAILQ_EMPTY(&bdev->open_descs)) { 2855 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 2856 bdev->name, spdk_get_thread()); 2857 2858 if (spdk_bdev_qos_destroy(bdev)) { 2859 /* There isn't anything we can do to recover here. Just let the 2860 * old QoS poller keep running. The QoS handling won't change 2861 * cores when the user allocates a new channel, but it won't break. */ 2862 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 2863 } 2864 } 2865 2866 if (bdev->status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->open_descs)) { 2867 do_unregister = true; 2868 } 2869 pthread_mutex_unlock(&bdev->mutex); 2870 2871 if (do_unregister == true) { 2872 spdk_bdev_unregister(bdev, bdev->unregister_cb, bdev->unregister_ctx); 2873 } 2874 } 2875 2876 int 2877 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 2878 struct spdk_bdev_module *module) 2879 { 2880 if (bdev->claim_module != NULL) { 2881 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 2882 bdev->claim_module->name); 2883 return -EPERM; 2884 } 2885 2886 if (desc && !desc->write) { 2887 desc->write = true; 2888 } 2889 2890 bdev->claim_module = module; 2891 return 0; 2892 } 2893 2894 void 2895 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 2896 { 2897 assert(bdev->claim_module != NULL); 2898 bdev->claim_module = NULL; 2899 } 2900 2901 struct spdk_bdev * 2902 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 2903 { 2904 return desc->bdev; 2905 } 2906 2907 void 2908 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 2909 { 2910 struct iovec *iovs; 2911 int iovcnt; 2912 2913 if (bdev_io == NULL) { 2914 return; 2915 } 2916 2917 switch (bdev_io->type) { 2918 case SPDK_BDEV_IO_TYPE_READ: 2919 iovs = bdev_io->u.bdev.iovs; 2920 iovcnt = bdev_io->u.bdev.iovcnt; 2921 break; 2922 case SPDK_BDEV_IO_TYPE_WRITE: 2923 iovs = bdev_io->u.bdev.iovs; 2924 iovcnt = bdev_io->u.bdev.iovcnt; 2925 break; 2926 default: 2927 iovs = NULL; 2928 iovcnt = 0; 2929 break; 2930 } 2931 2932 if (iovp) { 2933 *iovp = iovs; 2934 } 2935 if (iovcntp) { 2936 *iovcntp = iovcnt; 2937 } 2938 } 2939 2940 void 2941 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 2942 { 2943 2944 if (spdk_bdev_module_list_find(bdev_module->name)) { 2945 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 2946 assert(false); 2947 } 2948 2949 if (bdev_module->async_init) { 2950 bdev_module->action_in_progress = 1; 2951 } 2952 2953 /* 2954 * Modules with examine callbacks must be initialized first, so they are 2955 * ready to handle examine callbacks from later modules that will 2956 * register physical bdevs. 2957 */ 2958 if (bdev_module->examine != NULL) { 2959 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, tailq); 2960 } else { 2961 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, tailq); 2962 } 2963 } 2964 2965 struct spdk_bdev_module * 2966 spdk_bdev_module_list_find(const char *name) 2967 { 2968 struct spdk_bdev_module *bdev_module; 2969 2970 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) { 2971 if (strcmp(name, bdev_module->name) == 0) { 2972 break; 2973 } 2974 } 2975 2976 return bdev_module; 2977 } 2978 2979 static void 2980 spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2981 { 2982 uint64_t len; 2983 2984 if (!success) { 2985 bdev_io->cb = bdev_io->u.bdev.stored_user_cb; 2986 _spdk_bdev_io_complete(bdev_io); 2987 return; 2988 } 2989 2990 /* no need to perform the error checking from write_zeroes_blocks because this request already passed those checks. 
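 * Each round writes at most ZERO_BUFFER_SIZE bytes from the shared zero buffer
 * and advances split_current_offset_blocks; once split_remaining_num_blocks
 * reaches zero, the stored user callback is restored so the final completion
 * goes to the caller.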
*/ 2991 len = spdk_min(spdk_bdev_get_block_size(bdev_io->bdev) * bdev_io->u.bdev.split_remaining_num_blocks, 2992 ZERO_BUFFER_SIZE); 2993 2994 bdev_io->u.bdev.offset_blocks = bdev_io->u.bdev.split_current_offset_blocks; 2995 bdev_io->u.bdev.iov.iov_len = len; 2996 bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev_io->bdev); 2997 bdev_io->u.bdev.split_remaining_num_blocks -= bdev_io->u.bdev.num_blocks; 2998 bdev_io->u.bdev.split_current_offset_blocks += bdev_io->u.bdev.num_blocks; 2999 3000 /* if this round completes the i/o, change the callback to be the original user callback */ 3001 if (bdev_io->u.bdev.split_remaining_num_blocks == 0) { 3002 spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, bdev_io->u.bdev.stored_user_cb); 3003 } else { 3004 spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, spdk_bdev_write_zeroes_split); 3005 } 3006 spdk_bdev_io_submit(bdev_io); 3007 } 3008 3009 struct set_qos_limit_ctx { 3010 void (*cb_fn)(void *cb_arg, int status); 3011 void *cb_arg; 3012 struct spdk_bdev *bdev; 3013 }; 3014 3015 static void 3016 _spdk_bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 3017 { 3018 pthread_mutex_lock(&ctx->bdev->mutex); 3019 ctx->bdev->qos_mod_in_progress = false; 3020 pthread_mutex_unlock(&ctx->bdev->mutex); 3021 3022 ctx->cb_fn(ctx->cb_arg, status); 3023 free(ctx); 3024 } 3025 3026 static void 3027 _spdk_bdev_disable_qos_done(void *cb_arg) 3028 { 3029 struct set_qos_limit_ctx *ctx = cb_arg; 3030 struct spdk_bdev *bdev = ctx->bdev; 3031 struct spdk_bdev_qos *qos; 3032 3033 pthread_mutex_lock(&bdev->mutex); 3034 qos = bdev->qos; 3035 bdev->qos = NULL; 3036 pthread_mutex_unlock(&bdev->mutex); 3037 3038 _spdk_bdev_abort_queued_io(&qos->queued, qos->ch); 3039 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3040 spdk_poller_unregister(&qos->poller); 3041 3042 free(qos); 3043 3044 _spdk_bdev_set_qos_limit_done(ctx, 0); 3045 } 3046 3047 static void 3048 _spdk_bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) 3049 { 3050 void *io_device = spdk_io_channel_iter_get_io_device(i); 3051 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3052 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3053 struct spdk_thread *thread; 3054 3055 pthread_mutex_lock(&bdev->mutex); 3056 thread = bdev->qos->thread; 3057 pthread_mutex_unlock(&bdev->mutex); 3058 3059 spdk_thread_send_msg(thread, _spdk_bdev_disable_qos_done, ctx); 3060 } 3061 3062 static void 3063 _spdk_bdev_disable_qos_msg(struct spdk_io_channel_iter *i) 3064 { 3065 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 3066 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 3067 3068 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 3069 3070 spdk_for_each_channel_continue(i, 0); 3071 } 3072 3073 static void 3074 _spdk_bdev_update_qos_limit_iops_msg(void *cb_arg) 3075 { 3076 struct set_qos_limit_ctx *ctx = cb_arg; 3077 struct spdk_bdev *bdev = ctx->bdev; 3078 3079 pthread_mutex_lock(&bdev->mutex); 3080 spdk_bdev_qos_update_max_ios_per_timeslice(bdev->qos); 3081 pthread_mutex_unlock(&bdev->mutex); 3082 3083 _spdk_bdev_set_qos_limit_done(ctx, 0); 3084 } 3085 3086 static void 3087 _spdk_bdev_enable_qos_msg(struct spdk_io_channel_iter *i) 3088 { 3089 void *io_device = spdk_io_channel_iter_get_io_device(i); 3090 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3091 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 3092 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 3093 int rc; 3094 3095 
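	/*
	 * Enable QoS for this channel under the bdev mutex. The result is
	 * propagated to the channel iteration below, so a failure on any
	 * channel is reported through the iteration's completion status.
	 */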
pthread_mutex_lock(&bdev->mutex); 3096 rc = _spdk_bdev_enable_qos(bdev, bdev_ch); 3097 pthread_mutex_unlock(&bdev->mutex); 3098 spdk_for_each_channel_continue(i, rc); 3099 } 3100 3101 static void 3102 _spdk_bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) 3103 { 3104 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3105 3106 _spdk_bdev_set_qos_limit_done(ctx, status); 3107 } 3108 3109 void 3110 spdk_bdev_set_qos_limit_iops(struct spdk_bdev *bdev, uint64_t ios_per_sec, 3111 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 3112 { 3113 struct set_qos_limit_ctx *ctx; 3114 3115 if (ios_per_sec > 0 && ios_per_sec % SPDK_BDEV_QOS_MIN_IOS_PER_SEC) { 3116 SPDK_ERRLOG("Requested ios_per_sec limit %" PRIu64 " is not a multiple of %u\n", 3117 ios_per_sec, SPDK_BDEV_QOS_MIN_IOS_PER_SEC); 3118 cb_fn(cb_arg, -EINVAL); 3119 return; 3120 } 3121 3122 ctx = calloc(1, sizeof(*ctx)); 3123 if (ctx == NULL) { 3124 cb_fn(cb_arg, -ENOMEM); 3125 return; 3126 } 3127 3128 ctx->cb_fn = cb_fn; 3129 ctx->cb_arg = cb_arg; 3130 ctx->bdev = bdev; 3131 3132 pthread_mutex_lock(&bdev->mutex); 3133 if (bdev->qos_mod_in_progress) { 3134 pthread_mutex_unlock(&bdev->mutex); 3135 free(ctx); 3136 cb_fn(cb_arg, -EAGAIN); 3137 return; 3138 } 3139 bdev->qos_mod_in_progress = true; 3140 3141 if (ios_per_sec > 0) { 3142 if (bdev->qos == NULL) { 3143 /* Enabling */ 3144 bdev->qos = calloc(1, sizeof(*bdev->qos)); 3145 if (!bdev->qos) { 3146 pthread_mutex_unlock(&bdev->mutex); 3147 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 3148 free(ctx); 3149 cb_fn(cb_arg, -ENOMEM); 3150 return; 3151 } 3152 3153 bdev->qos->iops_rate_limit = ios_per_sec; 3154 spdk_for_each_channel(__bdev_to_io_dev(bdev), 3155 _spdk_bdev_enable_qos_msg, ctx, 3156 _spdk_bdev_enable_qos_done); 3157 } else { 3158 /* Updating */ 3159 bdev->qos->iops_rate_limit = ios_per_sec; 3160 spdk_thread_send_msg(bdev->qos->thread, _spdk_bdev_update_qos_limit_iops_msg, ctx); 3161 } 3162 } else { 3163 if (bdev->qos != NULL) { 3164 /* Disabling */ 3165 spdk_for_each_channel(__bdev_to_io_dev(bdev), 3166 _spdk_bdev_disable_qos_msg, ctx, 3167 _spdk_bdev_disable_qos_msg_done); 3168 } else { 3169 pthread_mutex_unlock(&bdev->mutex); 3170 _spdk_bdev_set_qos_limit_done(ctx, 0); 3171 return; 3172 } 3173 } 3174 3175 pthread_mutex_unlock(&bdev->mutex); 3176 } 3177 3178 SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV) 3179
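#ifdef SPDK_BDEV_USAGE_EXAMPLE
/*
 * Minimal usage sketch, compiled only when SPDK_BDEV_USAGE_EXAMPLE is defined.
 * The guard macro, the bdev name "Malloc0", and the callbacks below are
 * illustrative assumptions rather than part of the bdev library; the sketch
 * only shows how spdk_bdev_set_qos_limit_iops() defined above is expected to
 * be called from an SPDK thread.
 */
static void
bdev_example_qos_done(void *cb_arg, int status)
{
	SPDK_NOTICELOG("QoS limit update finished with status %d\n", status);
}

static void
bdev_example_set_qos(void)
{
	struct spdk_bdev *bdev = spdk_bdev_get_by_name("Malloc0");

	if (bdev == NULL) {
		return;
	}

	/* ios_per_sec must be a multiple of SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 0 disables QoS. */
	spdk_bdev_set_qos_limit_iops(bdev, 2 * SPDK_BDEV_QOS_MIN_IOS_PER_SEC,
				     bdev_example_qos_done, NULL);
}
#endif /* SPDK_BDEV_USAGE_EXAMPLE */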