1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. 5 * Copyright (c) Intel Corporation. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * * Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * * Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * * Neither the name of Intel Corporation nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 26 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 #include "spdk/stdinc.h" 36 37 #include "spdk/bdev.h" 38 #include "spdk/conf.h" 39 40 #include "spdk/env.h" 41 #include "spdk/event.h" 42 #include "spdk/thread.h" 43 #include "spdk/likely.h" 44 #include "spdk/queue.h" 45 #include "spdk/nvme_spec.h" 46 #include "spdk/scsi_spec.h" 47 #include "spdk/util.h" 48 49 #include "spdk/bdev_module.h" 50 #include "spdk_internal/log.h" 51 #include "spdk/string.h" 52 53 #ifdef SPDK_CONFIG_VTUNE 54 #include "ittnotify.h" 55 #include "ittnotify_types.h" 56 int __itt_init_ittlib(const char *, __itt_group_id); 57 #endif 58 59 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024) 60 #define SPDK_BDEV_IO_CACHE_SIZE 256 61 #define BUF_SMALL_POOL_SIZE 8192 62 #define BUF_LARGE_POOL_SIZE 1024 63 #define NOMEM_THRESHOLD_COUNT 8 64 #define ZERO_BUFFER_SIZE 0x100000 65 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 66 #define SPDK_BDEV_SEC_TO_USEC 1000000ULL 67 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 68 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 69 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 10000 70 #define SPDK_BDEV_QOS_MIN_BW_IN_MB_PER_SEC 10 71 72 enum spdk_bdev_qos_type { 73 SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT = 0, 74 SPDK_BDEV_QOS_RW_BYTEPS_RATE_LIMIT, 75 SPDK_BDEV_QOS_NUM_TYPES /* Keep last */ 76 }; 77 78 static const char *qos_type_str[SPDK_BDEV_QOS_NUM_TYPES] = {"Limit_IOPS", "Limit_BWPS"}; 79 80 struct spdk_bdev_mgr { 81 struct spdk_mempool *bdev_io_pool; 82 83 struct spdk_mempool *buf_small_pool; 84 struct spdk_mempool *buf_large_pool; 85 86 void *zero_buffer; 87 88 TAILQ_HEAD(, spdk_bdev_module) bdev_modules; 89 90 TAILQ_HEAD(, spdk_bdev) bdevs; 91 92 bool init_complete; 93 bool module_init_complete; 94 95 #ifdef SPDK_CONFIG_VTUNE 96 __itt_domain *domain; 97 #endif 98 }; 99 100 static struct spdk_bdev_mgr g_bdev_mgr = { 101 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 
102 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 103 .init_complete = false, 104 .module_init_complete = false, 105 }; 106 107 static spdk_bdev_init_cb g_init_cb_fn = NULL; 108 static void *g_init_cb_arg = NULL; 109 110 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 111 static void *g_fini_cb_arg = NULL; 112 static struct spdk_thread *g_fini_thread = NULL; 113 114 struct spdk_bdev_qos { 115 /** Rate limit, in I/O per second */ 116 uint64_t iops_rate_limit; 117 118 /** Rate limit, in byte per second */ 119 uint64_t byte_rate_limit; 120 121 /** The channel that all I/O are funneled through */ 122 struct spdk_bdev_channel *ch; 123 124 /** The thread on which the poller is running. */ 125 struct spdk_thread *thread; 126 127 /** Queue of I/O waiting to be issued. */ 128 bdev_io_tailq_t queued; 129 130 /** Maximum allowed IOs to be issued in one timeslice (e.g., 1ms) and 131 * only valid for the master channel which manages the outstanding IOs. */ 132 uint64_t max_ios_per_timeslice; 133 134 /** Maximum allowed bytes to be issued in one timeslice (e.g., 1ms) and 135 * only valid for the master channel which manages the outstanding IOs. */ 136 uint64_t max_byte_per_timeslice; 137 138 /** Submitted IO in one timeslice (e.g., 1ms) */ 139 uint64_t io_submitted_this_timeslice; 140 141 /** Submitted byte in one timeslice (e.g., 1ms) */ 142 uint64_t byte_submitted_this_timeslice; 143 144 /** Polller that processes queued I/O commands each time slice. */ 145 struct spdk_poller *poller; 146 }; 147 148 struct spdk_bdev_mgmt_channel { 149 bdev_io_stailq_t need_buf_small; 150 bdev_io_stailq_t need_buf_large; 151 152 /* 153 * Each thread keeps a cache of bdev_io - this allows 154 * bdev threads which are *not* DPDK threads to still 155 * benefit from a per-thread bdev_io cache. Without 156 * this, non-DPDK threads fetching from the mempool 157 * incur a cmpxchg on get and put. 158 */ 159 bdev_io_stailq_t per_thread_cache; 160 uint32_t per_thread_cache_count; 161 162 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 163 }; 164 165 /* 166 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 167 * will queue here their IO that awaits retry. It makes it posible to retry sending 168 * IO to one bdev after IO from other bdev completes. 169 */ 170 struct spdk_bdev_shared_resource { 171 /* The bdev management channel */ 172 struct spdk_bdev_mgmt_channel *mgmt_ch; 173 174 /* 175 * Count of I/O submitted to bdev module and waiting for completion. 176 * Incremented before submit_request() is called on an spdk_bdev_io. 177 */ 178 uint64_t io_outstanding; 179 180 /* 181 * Queue of IO awaiting retry because of a previous NOMEM status returned 182 * on this channel. 183 */ 184 bdev_io_tailq_t nomem_io; 185 186 /* 187 * Threshold which io_outstanding must drop to before retrying nomem_io. 
188 */ 189 uint64_t nomem_threshold; 190 191 /* I/O channel allocated by a bdev module */ 192 struct spdk_io_channel *shared_ch; 193 194 /* Refcount of bdev channels using this resource */ 195 uint32_t ref; 196 197 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 198 }; 199 200 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 201 #define BDEV_CH_QOS_ENABLED (1 << 1) 202 203 struct spdk_bdev_channel { 204 struct spdk_bdev *bdev; 205 206 /* The channel for the underlying device */ 207 struct spdk_io_channel *channel; 208 209 /* Per io_device per thread data */ 210 struct spdk_bdev_shared_resource *shared_resource; 211 212 struct spdk_bdev_io_stat stat; 213 214 /* 215 * Count of I/O submitted through this channel and waiting for completion. 216 * Incremented before submit_request() is called on an spdk_bdev_io. 217 */ 218 uint64_t io_outstanding; 219 220 bdev_io_tailq_t queued_resets; 221 222 uint32_t flags; 223 224 #ifdef SPDK_CONFIG_VTUNE 225 uint64_t start_tsc; 226 uint64_t interval_tsc; 227 __itt_string_handle *handle; 228 struct spdk_bdev_io_stat prev_stat; 229 #endif 230 231 }; 232 233 struct spdk_bdev_desc { 234 struct spdk_bdev *bdev; 235 spdk_bdev_remove_cb_t remove_cb; 236 void *remove_ctx; 237 bool write; 238 TAILQ_ENTRY(spdk_bdev_desc) link; 239 }; 240 241 struct spdk_bdev_iostat_ctx { 242 struct spdk_bdev_io_stat *stat; 243 spdk_bdev_get_device_stat_cb cb; 244 void *cb_arg; 245 }; 246 247 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 248 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 249 250 static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 251 252 struct spdk_bdev * 253 spdk_bdev_first(void) 254 { 255 struct spdk_bdev *bdev; 256 257 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 258 if (bdev) { 259 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name); 260 } 261 262 return bdev; 263 } 264 265 struct spdk_bdev * 266 spdk_bdev_next(struct spdk_bdev *prev) 267 { 268 struct spdk_bdev *bdev; 269 270 bdev = TAILQ_NEXT(prev, link); 271 if (bdev) { 272 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name); 273 } 274 275 return bdev; 276 } 277 278 static struct spdk_bdev * 279 _bdev_next_leaf(struct spdk_bdev *bdev) 280 { 281 while (bdev != NULL) { 282 if (bdev->claim_module == NULL) { 283 return bdev; 284 } else { 285 bdev = TAILQ_NEXT(bdev, link); 286 } 287 } 288 289 return bdev; 290 } 291 292 struct spdk_bdev * 293 spdk_bdev_first_leaf(void) 294 { 295 struct spdk_bdev *bdev; 296 297 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 298 299 if (bdev) { 300 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name); 301 } 302 303 return bdev; 304 } 305 306 struct spdk_bdev * 307 spdk_bdev_next_leaf(struct spdk_bdev *prev) 308 { 309 struct spdk_bdev *bdev; 310 311 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, link)); 312 313 if (bdev) { 314 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name); 315 } 316 317 return bdev; 318 } 319 320 struct spdk_bdev * 321 spdk_bdev_get_by_name(const char *bdev_name) 322 { 323 struct spdk_bdev_alias *tmp; 324 struct spdk_bdev *bdev = spdk_bdev_first(); 325 326 while (bdev != NULL) { 327 if (strcmp(bdev_name, bdev->name) == 0) { 328 return bdev; 329 } 330 331 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 332 if (strcmp(bdev_name, tmp->alias) == 0) { 333 return bdev; 334 } 335 } 336 337 bdev = spdk_bdev_next(bdev); 338 } 339 340 return NULL; 341 } 342 343 static void 344 spdk_bdev_io_set_buf(struct 
spdk_bdev_io *bdev_io, void *buf) 345 { 346 assert(bdev_io->get_buf_cb != NULL); 347 assert(buf != NULL); 348 assert(bdev_io->u.bdev.iovs != NULL); 349 350 bdev_io->buf = buf; 351 bdev_io->u.bdev.iovs[0].iov_base = (void *)((unsigned long)((char *)buf + 512) & ~511UL); 352 bdev_io->u.bdev.iovs[0].iov_len = bdev_io->buf_len; 353 bdev_io->get_buf_cb(bdev_io->ch->channel, bdev_io); 354 } 355 356 static void 357 spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 358 { 359 struct spdk_mempool *pool; 360 struct spdk_bdev_io *tmp; 361 void *buf; 362 bdev_io_stailq_t *stailq; 363 struct spdk_bdev_mgmt_channel *ch; 364 365 assert(bdev_io->u.bdev.iovcnt == 1); 366 367 buf = bdev_io->buf; 368 ch = bdev_io->ch->shared_resource->mgmt_ch; 369 370 if (bdev_io->buf_len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) { 371 pool = g_bdev_mgr.buf_small_pool; 372 stailq = &ch->need_buf_small; 373 } else { 374 pool = g_bdev_mgr.buf_large_pool; 375 stailq = &ch->need_buf_large; 376 } 377 378 if (STAILQ_EMPTY(stailq)) { 379 spdk_mempool_put(pool, buf); 380 } else { 381 tmp = STAILQ_FIRST(stailq); 382 STAILQ_REMOVE_HEAD(stailq, internal.buf_link); 383 spdk_bdev_io_set_buf(tmp, buf); 384 } 385 } 386 387 void 388 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 389 { 390 struct spdk_mempool *pool; 391 bdev_io_stailq_t *stailq; 392 void *buf = NULL; 393 struct spdk_bdev_mgmt_channel *mgmt_ch; 394 395 assert(cb != NULL); 396 assert(bdev_io->u.bdev.iovs != NULL); 397 398 if (spdk_unlikely(bdev_io->u.bdev.iovs[0].iov_base != NULL)) { 399 /* Buffer already present */ 400 cb(bdev_io->ch->channel, bdev_io); 401 return; 402 } 403 404 assert(len <= SPDK_BDEV_LARGE_BUF_MAX_SIZE); 405 mgmt_ch = bdev_io->ch->shared_resource->mgmt_ch; 406 407 bdev_io->buf_len = len; 408 bdev_io->get_buf_cb = cb; 409 if (len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) { 410 pool = g_bdev_mgr.buf_small_pool; 411 stailq = &mgmt_ch->need_buf_small; 412 } else { 413 pool = g_bdev_mgr.buf_large_pool; 414 stailq = &mgmt_ch->need_buf_large; 415 } 416 417 buf = spdk_mempool_get(pool); 418 419 if (!buf) { 420 STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link); 421 } else { 422 spdk_bdev_io_set_buf(bdev_io, buf); 423 } 424 } 425 426 static int 427 spdk_bdev_module_get_max_ctx_size(void) 428 { 429 struct spdk_bdev_module *bdev_module; 430 int max_bdev_module_size = 0; 431 432 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) { 433 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 434 max_bdev_module_size = bdev_module->get_ctx_size(); 435 } 436 } 437 438 return max_bdev_module_size; 439 } 440 441 void 442 spdk_bdev_config_text(FILE *fp) 443 { 444 struct spdk_bdev_module *bdev_module; 445 446 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) { 447 if (bdev_module->config_text) { 448 bdev_module->config_text(fp); 449 } 450 } 451 } 452 453 void 454 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 455 { 456 struct spdk_bdev_module *bdev_module; 457 struct spdk_bdev *bdev; 458 459 assert(w != NULL); 460 461 spdk_json_write_array_begin(w); 462 463 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) { 464 if (bdev_module->config_json) { 465 bdev_module->config_json(w); 466 } 467 } 468 469 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, link) { 470 spdk_bdev_config_json(bdev, w); 471 } 472 473 spdk_json_write_array_end(w); 474 } 475 476 static int 477 spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 478 { 479 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 480 481 
STAILQ_INIT(&ch->need_buf_small); 482 STAILQ_INIT(&ch->need_buf_large); 483 484 STAILQ_INIT(&ch->per_thread_cache); 485 ch->per_thread_cache_count = 0; 486 487 TAILQ_INIT(&ch->shared_resources); 488 489 return 0; 490 } 491 492 static void 493 spdk_bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 494 { 495 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 496 struct spdk_bdev_io *bdev_io; 497 498 if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) { 499 SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n"); 500 } 501 502 if (!TAILQ_EMPTY(&ch->shared_resources)) { 503 SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n"); 504 } 505 506 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 507 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 508 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 509 ch->per_thread_cache_count--; 510 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 511 } 512 513 assert(ch->per_thread_cache_count == 0); 514 } 515 516 static void 517 spdk_bdev_init_complete(int rc) 518 { 519 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 520 void *cb_arg = g_init_cb_arg; 521 struct spdk_bdev_module *m; 522 523 g_bdev_mgr.init_complete = true; 524 g_init_cb_fn = NULL; 525 g_init_cb_arg = NULL; 526 527 /* 528 * For modules that need to know when subsystem init is complete, 529 * inform them now. 530 */ 531 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, tailq) { 532 if (m->init_complete) { 533 m->init_complete(); 534 } 535 } 536 537 cb_fn(cb_arg, rc); 538 } 539 540 static void 541 spdk_bdev_module_action_complete(void) 542 { 543 struct spdk_bdev_module *m; 544 545 /* 546 * Don't finish bdev subsystem initialization if 547 * module pre-initialization is still in progress, or 548 * the subsystem been already initialized. 549 */ 550 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 551 return; 552 } 553 554 /* 555 * Check all bdev modules for inits/examinations in progress. If any 556 * exist, return immediately since we cannot finish bdev subsystem 557 * initialization until all are completed. 558 */ 559 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, tailq) { 560 if (m->action_in_progress > 0) { 561 return; 562 } 563 } 564 565 /* 566 * Modules already finished initialization - now that all 567 * the bdev modules have finished their asynchronous I/O 568 * processing, the entire bdev layer can be marked as complete. 
569 */ 570 spdk_bdev_init_complete(0); 571 } 572 573 static void 574 spdk_bdev_module_action_done(struct spdk_bdev_module *module) 575 { 576 assert(module->action_in_progress > 0); 577 module->action_in_progress--; 578 spdk_bdev_module_action_complete(); 579 } 580 581 void 582 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 583 { 584 spdk_bdev_module_action_done(module); 585 } 586 587 void 588 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 589 { 590 spdk_bdev_module_action_done(module); 591 } 592 593 static int 594 spdk_bdev_modules_init(void) 595 { 596 struct spdk_bdev_module *module; 597 int rc = 0; 598 599 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) { 600 rc = module->module_init(); 601 if (rc != 0) { 602 break; 603 } 604 } 605 606 g_bdev_mgr.module_init_complete = true; 607 return rc; 608 } 609 void 610 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 611 { 612 int cache_size; 613 int rc = 0; 614 char mempool_name[32]; 615 616 assert(cb_fn != NULL); 617 618 g_init_cb_fn = cb_fn; 619 g_init_cb_arg = cb_arg; 620 621 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 622 623 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 624 SPDK_BDEV_IO_POOL_SIZE, 625 sizeof(struct spdk_bdev_io) + 626 spdk_bdev_module_get_max_ctx_size(), 627 0, 628 SPDK_ENV_SOCKET_ID_ANY); 629 630 if (g_bdev_mgr.bdev_io_pool == NULL) { 631 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 632 spdk_bdev_init_complete(-1); 633 return; 634 } 635 636 /** 637 * Ensure no more than half of the total buffers end up local caches, by 638 * using spdk_env_get_core_count() to determine how many local caches we need 639 * to account for. 640 */ 641 cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count()); 642 snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid()); 643 644 g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name, 645 BUF_SMALL_POOL_SIZE, 646 SPDK_BDEV_SMALL_BUF_MAX_SIZE + 512, 647 cache_size, 648 SPDK_ENV_SOCKET_ID_ANY); 649 if (!g_bdev_mgr.buf_small_pool) { 650 SPDK_ERRLOG("create rbuf small pool failed\n"); 651 spdk_bdev_init_complete(-1); 652 return; 653 } 654 655 cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count()); 656 snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid()); 657 658 g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name, 659 BUF_LARGE_POOL_SIZE, 660 SPDK_BDEV_LARGE_BUF_MAX_SIZE + 512, 661 cache_size, 662 SPDK_ENV_SOCKET_ID_ANY); 663 if (!g_bdev_mgr.buf_large_pool) { 664 SPDK_ERRLOG("create rbuf large pool failed\n"); 665 spdk_bdev_init_complete(-1); 666 return; 667 } 668 669 g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 670 NULL); 671 if (!g_bdev_mgr.zero_buffer) { 672 SPDK_ERRLOG("create bdev zero buffer failed\n"); 673 spdk_bdev_init_complete(-1); 674 return; 675 } 676 677 #ifdef SPDK_CONFIG_VTUNE 678 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 679 #endif 680 681 spdk_io_device_register(&g_bdev_mgr, spdk_bdev_mgmt_channel_create, 682 spdk_bdev_mgmt_channel_destroy, 683 sizeof(struct spdk_bdev_mgmt_channel)); 684 685 rc = spdk_bdev_modules_init(); 686 if (rc != 0) { 687 SPDK_ERRLOG("bdev modules init failed\n"); 688 spdk_bdev_init_complete(-1); 689 return; 690 } 691 692 spdk_bdev_module_action_complete(); 693 } 694 695 static void 696 spdk_bdev_mgr_unregister_cb(void *io_device) 697 { 698 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 699 700 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != 
SPDK_BDEV_IO_POOL_SIZE) { 701 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 702 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 703 SPDK_BDEV_IO_POOL_SIZE); 704 } 705 706 if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) { 707 SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n", 708 spdk_mempool_count(g_bdev_mgr.buf_small_pool), 709 BUF_SMALL_POOL_SIZE); 710 assert(false); 711 } 712 713 if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) { 714 SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n", 715 spdk_mempool_count(g_bdev_mgr.buf_large_pool), 716 BUF_LARGE_POOL_SIZE); 717 assert(false); 718 } 719 720 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 721 spdk_mempool_free(g_bdev_mgr.buf_small_pool); 722 spdk_mempool_free(g_bdev_mgr.buf_large_pool); 723 spdk_dma_free(g_bdev_mgr.zero_buffer); 724 725 cb_fn(g_fini_cb_arg); 726 g_fini_cb_fn = NULL; 727 g_fini_cb_arg = NULL; 728 } 729 730 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 731 732 static void 733 spdk_bdev_module_finish_iter(void *arg) 734 { 735 struct spdk_bdev_module *bdev_module; 736 737 /* Start iterating from the last touched module */ 738 if (!g_resume_bdev_module) { 739 bdev_module = TAILQ_FIRST(&g_bdev_mgr.bdev_modules); 740 } else { 741 bdev_module = TAILQ_NEXT(g_resume_bdev_module, tailq); 742 } 743 744 while (bdev_module) { 745 if (bdev_module->async_fini) { 746 /* Save our place so we can resume later. We must 747 * save the variable here, before calling module_fini() 748 * below, because in some cases the module may immediately 749 * call spdk_bdev_module_finish_done() and re-enter 750 * this function to continue iterating. */ 751 g_resume_bdev_module = bdev_module; 752 } 753 754 if (bdev_module->module_fini) { 755 bdev_module->module_fini(); 756 } 757 758 if (bdev_module->async_fini) { 759 return; 760 } 761 762 bdev_module = TAILQ_NEXT(bdev_module, tailq); 763 } 764 765 g_resume_bdev_module = NULL; 766 spdk_io_device_unregister(&g_bdev_mgr, spdk_bdev_mgr_unregister_cb); 767 } 768 769 void 770 spdk_bdev_module_finish_done(void) 771 { 772 if (spdk_get_thread() != g_fini_thread) { 773 spdk_thread_send_msg(g_fini_thread, spdk_bdev_module_finish_iter, NULL); 774 } else { 775 spdk_bdev_module_finish_iter(NULL); 776 } 777 } 778 779 static void 780 _spdk_bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 781 { 782 struct spdk_bdev *bdev = cb_arg; 783 784 if (bdeverrno && bdev) { 785 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 786 bdev->name); 787 788 /* 789 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 790 * bdev; try to continue by manually removing this bdev from the list and continue 791 * with the next bdev in the list. 792 */ 793 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link); 794 } 795 796 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 797 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n"); 798 /* 799 * Bdev module finish need to be deffered as we might be in the middle of some context 800 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 801 * after returning. 802 */ 803 spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_module_finish_iter, NULL); 804 return; 805 } 806 807 /* 808 * Unregister the first bdev in the list. 809 * 810 * spdk_bdev_unregister() will handle the case where the bdev has open descriptors by 811 * calling the remove_cb of the descriptors first. 
812 * 813 * Once this bdev and all of its open descriptors have been cleaned up, this function 814 * will be called again via the unregister completion callback to continue the cleanup 815 * process with the next bdev. 816 */ 817 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 818 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name); 819 spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev); 820 } 821 822 void 823 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 824 { 825 assert(cb_fn != NULL); 826 827 g_fini_thread = spdk_get_thread(); 828 829 g_fini_cb_fn = cb_fn; 830 g_fini_cb_arg = cb_arg; 831 832 _spdk_bdev_finish_unregister_bdevs_iter(NULL, 0); 833 } 834 835 static struct spdk_bdev_io * 836 spdk_bdev_get_io(struct spdk_bdev_channel *channel) 837 { 838 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 839 struct spdk_bdev_io *bdev_io; 840 841 if (ch->per_thread_cache_count > 0) { 842 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 843 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 844 ch->per_thread_cache_count--; 845 } else { 846 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 847 if (!bdev_io) { 848 SPDK_ERRLOG("Unable to get spdk_bdev_io\n"); 849 return NULL; 850 } 851 } 852 853 return bdev_io; 854 } 855 856 static void 857 spdk_bdev_put_io(struct spdk_bdev_io *bdev_io) 858 { 859 struct spdk_bdev_mgmt_channel *ch = bdev_io->ch->shared_resource->mgmt_ch; 860 861 if (bdev_io->buf != NULL) { 862 spdk_bdev_io_put_buf(bdev_io); 863 } 864 865 if (ch->per_thread_cache_count < SPDK_BDEV_IO_CACHE_SIZE) { 866 ch->per_thread_cache_count++; 867 STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, internal.buf_link); 868 } else { 869 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 870 } 871 } 872 873 static uint64_t 874 _spdk_bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 875 { 876 struct spdk_bdev *bdev = bdev_io->bdev; 877 878 switch (bdev_io->type) { 879 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 880 case SPDK_BDEV_IO_TYPE_NVME_IO: 881 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 882 return bdev_io->u.nvme_passthru.nbytes; 883 case SPDK_BDEV_IO_TYPE_READ: 884 case SPDK_BDEV_IO_TYPE_WRITE: 885 case SPDK_BDEV_IO_TYPE_UNMAP: 886 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 887 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 888 default: 889 return 0; 890 } 891 } 892 893 static void 894 _spdk_bdev_qos_io_submit(struct spdk_bdev_channel *ch) 895 { 896 struct spdk_bdev_io *bdev_io = NULL; 897 struct spdk_bdev *bdev = ch->bdev; 898 struct spdk_bdev_qos *qos = bdev->qos; 899 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 900 901 while (!TAILQ_EMPTY(&qos->queued)) { 902 if (qos->max_ios_per_timeslice > 0 && 903 qos->io_submitted_this_timeslice >= qos->max_ios_per_timeslice) { 904 break; 905 } 906 907 if (qos->max_byte_per_timeslice > 0 && 908 qos->byte_submitted_this_timeslice >= qos->max_byte_per_timeslice) { 909 break; 910 } 911 912 bdev_io = TAILQ_FIRST(&qos->queued); 913 TAILQ_REMOVE(&qos->queued, bdev_io, link); 914 qos->io_submitted_this_timeslice++; 915 qos->byte_submitted_this_timeslice += _spdk_bdev_get_io_size_in_byte(bdev_io); 916 ch->io_outstanding++; 917 shared_resource->io_outstanding++; 918 bdev->fn_table->submit_request(ch->channel, bdev_io); 919 } 920 } 921 922 static void 923 _spdk_bdev_io_submit(void *ctx) 924 { 925 struct spdk_bdev_io *bdev_io = ctx; 926 struct spdk_bdev *bdev = bdev_io->bdev; 927 struct spdk_bdev_channel *bdev_ch = bdev_io->ch; 928 struct spdk_io_channel *ch = 
bdev_ch->channel; 929 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 930 931 bdev_io->submit_tsc = spdk_get_ticks(); 932 bdev_ch->io_outstanding++; 933 shared_resource->io_outstanding++; 934 bdev_io->in_submit_request = true; 935 if (spdk_likely(bdev_ch->flags == 0)) { 936 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 937 bdev->fn_table->submit_request(ch, bdev_io); 938 } else { 939 bdev_ch->io_outstanding--; 940 shared_resource->io_outstanding--; 941 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, link); 942 } 943 } else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 944 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 945 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 946 bdev_ch->io_outstanding--; 947 shared_resource->io_outstanding--; 948 TAILQ_INSERT_TAIL(&bdev->qos->queued, bdev_io, link); 949 _spdk_bdev_qos_io_submit(bdev_ch); 950 } else { 951 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 952 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 953 } 954 bdev_io->in_submit_request = false; 955 } 956 957 static void 958 spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io) 959 { 960 struct spdk_bdev *bdev = bdev_io->bdev; 961 struct spdk_thread *thread = spdk_io_channel_get_thread(bdev_io->ch->channel); 962 963 assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING); 964 965 if (bdev_io->ch->flags & BDEV_CH_QOS_ENABLED) { 966 if (thread == bdev->qos->thread) { 967 _spdk_bdev_io_submit(bdev_io); 968 } else { 969 bdev_io->io_submit_ch = bdev_io->ch; 970 bdev_io->ch = bdev->qos->ch; 971 spdk_thread_send_msg(bdev->qos->thread, _spdk_bdev_io_submit, bdev_io); 972 } 973 } else { 974 _spdk_bdev_io_submit(bdev_io); 975 } 976 } 977 978 static void 979 spdk_bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 980 { 981 struct spdk_bdev *bdev = bdev_io->bdev; 982 struct spdk_bdev_channel *bdev_ch = bdev_io->ch; 983 struct spdk_io_channel *ch = bdev_ch->channel; 984 985 assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING); 986 987 bdev_io->in_submit_request = true; 988 bdev->fn_table->submit_request(ch, bdev_io); 989 bdev_io->in_submit_request = false; 990 } 991 992 static void 993 spdk_bdev_io_init(struct spdk_bdev_io *bdev_io, 994 struct spdk_bdev *bdev, void *cb_arg, 995 spdk_bdev_io_completion_cb cb) 996 { 997 bdev_io->bdev = bdev; 998 bdev_io->caller_ctx = cb_arg; 999 bdev_io->cb = cb; 1000 bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING; 1001 bdev_io->in_submit_request = false; 1002 bdev_io->buf = NULL; 1003 bdev_io->io_submit_ch = NULL; 1004 } 1005 1006 bool 1007 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 1008 { 1009 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 1010 } 1011 1012 int 1013 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1014 { 1015 if (bdev->fn_table->dump_info_json) { 1016 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 1017 } 1018 1019 return 0; 1020 } 1021 1022 void 1023 spdk_bdev_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1024 { 1025 assert(bdev != NULL); 1026 assert(w != NULL); 1027 1028 if (bdev->fn_table->write_config_json) { 1029 bdev->fn_table->write_config_json(bdev, w); 1030 } else { 1031 spdk_json_write_object_begin(w); 1032 spdk_json_write_named_string(w, "name", bdev->name); 1033 spdk_json_write_object_end(w); 1034 } 1035 } 1036 1037 static void 1038 spdk_bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 1039 { 1040 uint64_t 
max_ios_per_timeslice = 0, max_byte_per_timeslice = 0; 1041 1042 if (qos->iops_rate_limit > 0) { 1043 max_ios_per_timeslice = qos->iops_rate_limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC / 1044 SPDK_BDEV_SEC_TO_USEC; 1045 qos->max_ios_per_timeslice = spdk_max(max_ios_per_timeslice, 1046 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE); 1047 } 1048 1049 if (qos->byte_rate_limit > 0) { 1050 max_byte_per_timeslice = qos->byte_rate_limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC / 1051 SPDK_BDEV_SEC_TO_USEC; 1052 qos->max_byte_per_timeslice = spdk_max(max_byte_per_timeslice, 1053 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE); 1054 } 1055 } 1056 1057 static int 1058 spdk_bdev_channel_poll_qos(void *arg) 1059 { 1060 struct spdk_bdev_qos *qos = arg; 1061 1062 /* Reset for next round of rate limiting */ 1063 qos->io_submitted_this_timeslice = 0; 1064 qos->byte_submitted_this_timeslice = 0; 1065 1066 _spdk_bdev_qos_io_submit(qos->ch); 1067 1068 return -1; 1069 } 1070 1071 static void 1072 _spdk_bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 1073 { 1074 struct spdk_bdev_shared_resource *shared_resource; 1075 1076 if (!ch) { 1077 return; 1078 } 1079 1080 if (ch->channel) { 1081 spdk_put_io_channel(ch->channel); 1082 } 1083 1084 assert(ch->io_outstanding == 0); 1085 1086 shared_resource = ch->shared_resource; 1087 if (shared_resource) { 1088 assert(ch->io_outstanding == 0); 1089 assert(shared_resource->ref > 0); 1090 shared_resource->ref--; 1091 if (shared_resource->ref == 0) { 1092 assert(shared_resource->io_outstanding == 0); 1093 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 1094 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 1095 free(shared_resource); 1096 } 1097 } 1098 } 1099 1100 /* Caller must hold bdev->mutex. */ 1101 static int 1102 _spdk_bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 1103 { 1104 struct spdk_bdev_qos *qos = bdev->qos; 1105 1106 /* Rate limiting on this bdev enabled */ 1107 if (qos) { 1108 if (qos->ch == NULL) { 1109 struct spdk_io_channel *io_ch; 1110 1111 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 1112 bdev->name, spdk_get_thread()); 1113 1114 /* No qos channel has been selected, so set one up */ 1115 1116 /* Take another reference to ch */ 1117 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 1118 qos->ch = ch; 1119 1120 qos->thread = spdk_io_channel_get_thread(io_ch); 1121 1122 TAILQ_INIT(&qos->queued); 1123 spdk_bdev_qos_update_max_quota_per_timeslice(qos); 1124 qos->io_submitted_this_timeslice = 0; 1125 qos->byte_submitted_this_timeslice = 0; 1126 1127 qos->poller = spdk_poller_register(spdk_bdev_channel_poll_qos, 1128 qos, 1129 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 1130 } 1131 1132 ch->flags |= BDEV_CH_QOS_ENABLED; 1133 } 1134 1135 return 0; 1136 } 1137 1138 static int 1139 spdk_bdev_channel_create(void *io_device, void *ctx_buf) 1140 { 1141 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 1142 struct spdk_bdev_channel *ch = ctx_buf; 1143 struct spdk_io_channel *mgmt_io_ch; 1144 struct spdk_bdev_mgmt_channel *mgmt_ch; 1145 struct spdk_bdev_shared_resource *shared_resource; 1146 1147 ch->bdev = bdev; 1148 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 1149 if (!ch->channel) { 1150 return -1; 1151 } 1152 1153 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 1154 if (!mgmt_io_ch) { 1155 return -1; 1156 } 1157 1158 mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); 1159 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 1160 if 
(shared_resource->shared_ch == ch->channel) { 1161 spdk_put_io_channel(mgmt_io_ch); 1162 shared_resource->ref++; 1163 break; 1164 } 1165 } 1166 1167 if (shared_resource == NULL) { 1168 shared_resource = calloc(1, sizeof(*shared_resource)); 1169 if (shared_resource == NULL) { 1170 spdk_put_io_channel(mgmt_io_ch); 1171 return -1; 1172 } 1173 1174 shared_resource->mgmt_ch = mgmt_ch; 1175 shared_resource->io_outstanding = 0; 1176 TAILQ_INIT(&shared_resource->nomem_io); 1177 shared_resource->nomem_threshold = 0; 1178 shared_resource->shared_ch = ch->channel; 1179 shared_resource->ref = 1; 1180 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 1181 } 1182 1183 memset(&ch->stat, 0, sizeof(ch->stat)); 1184 ch->stat.ticks_rate = spdk_get_ticks_hz(); 1185 ch->io_outstanding = 0; 1186 TAILQ_INIT(&ch->queued_resets); 1187 ch->flags = 0; 1188 ch->shared_resource = shared_resource; 1189 1190 #ifdef SPDK_CONFIG_VTUNE 1191 { 1192 char *name; 1193 __itt_init_ittlib(NULL, 0); 1194 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 1195 if (!name) { 1196 _spdk_bdev_channel_destroy_resource(ch); 1197 return -1; 1198 } 1199 ch->handle = __itt_string_handle_create(name); 1200 free(name); 1201 ch->start_tsc = spdk_get_ticks(); 1202 ch->interval_tsc = spdk_get_ticks_hz() / 100; 1203 memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); 1204 } 1205 #endif 1206 1207 pthread_mutex_lock(&bdev->mutex); 1208 1209 if (_spdk_bdev_enable_qos(bdev, ch)) { 1210 _spdk_bdev_channel_destroy_resource(ch); 1211 pthread_mutex_unlock(&bdev->mutex); 1212 return -1; 1213 } 1214 1215 pthread_mutex_unlock(&bdev->mutex); 1216 1217 return 0; 1218 } 1219 1220 /* 1221 * Abort I/O that are waiting on a data buffer. These types of I/O are 1222 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. 1223 */ 1224 static void 1225 _spdk_bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 1226 { 1227 bdev_io_stailq_t tmp; 1228 struct spdk_bdev_io *bdev_io; 1229 1230 STAILQ_INIT(&tmp); 1231 1232 while (!STAILQ_EMPTY(queue)) { 1233 bdev_io = STAILQ_FIRST(queue); 1234 STAILQ_REMOVE_HEAD(queue, internal.buf_link); 1235 if (bdev_io->ch == ch) { 1236 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 1237 } else { 1238 STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); 1239 } 1240 } 1241 1242 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 1243 } 1244 1245 /* 1246 * Abort I/O that are queued waiting for submission. These types of I/O are 1247 * linked using the spdk_bdev_io link TAILQ_ENTRY. 1248 */ 1249 static void 1250 _spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 1251 { 1252 struct spdk_bdev_io *bdev_io, *tmp; 1253 1254 TAILQ_FOREACH_SAFE(bdev_io, queue, link, tmp) { 1255 if (bdev_io->ch == ch) { 1256 TAILQ_REMOVE(queue, bdev_io, link); 1257 /* 1258 * spdk_bdev_io_complete() assumes that the completed I/O had 1259 * been submitted to the bdev module. Since in this case it 1260 * hadn't, bump io_outstanding to account for the decrement 1261 * that spdk_bdev_io_complete() will do. 
1262 */ 1263 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 1264 ch->io_outstanding++; 1265 ch->shared_resource->io_outstanding++; 1266 } 1267 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 1268 } 1269 } 1270 } 1271 1272 static void 1273 spdk_bdev_qos_channel_destroy(void *cb_arg) 1274 { 1275 struct spdk_bdev_qos *qos = cb_arg; 1276 1277 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 1278 spdk_poller_unregister(&qos->poller); 1279 1280 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Free QoS %p.\n", qos); 1281 1282 free(qos); 1283 } 1284 1285 static int 1286 spdk_bdev_qos_destroy(struct spdk_bdev *bdev) 1287 { 1288 /* 1289 * Cleanly shutting down the QoS poller is tricky, because 1290 * during the asynchronous operation the user could open 1291 * a new descriptor and create a new channel, spawning 1292 * a new QoS poller. 1293 * 1294 * The strategy is to create a new QoS structure here and swap it 1295 * in. The shutdown path then continues to refer to the old one 1296 * until it completes and then releases it. 1297 */ 1298 struct spdk_bdev_qos *new_qos, *old_qos; 1299 1300 old_qos = bdev->qos; 1301 1302 new_qos = calloc(1, sizeof(*new_qos)); 1303 if (!new_qos) { 1304 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 1305 return -ENOMEM; 1306 } 1307 1308 /* Copy the old QoS data into the newly allocated structure */ 1309 memcpy(new_qos, old_qos, sizeof(*new_qos)); 1310 1311 /* Zero out the key parts of the QoS structure */ 1312 new_qos->ch = NULL; 1313 new_qos->thread = NULL; 1314 new_qos->max_ios_per_timeslice = 0; 1315 new_qos->max_byte_per_timeslice = 0; 1316 new_qos->io_submitted_this_timeslice = 0; 1317 new_qos->byte_submitted_this_timeslice = 0; 1318 new_qos->poller = NULL; 1319 TAILQ_INIT(&new_qos->queued); 1320 1321 bdev->qos = new_qos; 1322 1323 spdk_thread_send_msg(old_qos->thread, spdk_bdev_qos_channel_destroy, 1324 old_qos); 1325 1326 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 1327 * been destroyed yet. The destruction path will end up waiting for the final 1328 * channel to be put before it releases resources. 
*/ 1329 1330 return 0; 1331 } 1332 1333 static void 1334 spdk_bdev_channel_destroy(void *io_device, void *ctx_buf) 1335 { 1336 struct spdk_bdev_channel *ch = ctx_buf; 1337 struct spdk_bdev_mgmt_channel *mgmt_ch; 1338 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 1339 1340 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 1341 spdk_get_thread()); 1342 1343 mgmt_ch = shared_resource->mgmt_ch; 1344 1345 _spdk_bdev_abort_queued_io(&ch->queued_resets, ch); 1346 _spdk_bdev_abort_queued_io(&shared_resource->nomem_io, ch); 1347 _spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_small, ch); 1348 _spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_large, ch); 1349 1350 _spdk_bdev_channel_destroy_resource(ch); 1351 } 1352 1353 int 1354 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 1355 { 1356 struct spdk_bdev_alias *tmp; 1357 1358 if (alias == NULL) { 1359 SPDK_ERRLOG("Empty alias passed\n"); 1360 return -EINVAL; 1361 } 1362 1363 if (spdk_bdev_get_by_name(alias)) { 1364 SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias); 1365 return -EEXIST; 1366 } 1367 1368 tmp = calloc(1, sizeof(*tmp)); 1369 if (tmp == NULL) { 1370 SPDK_ERRLOG("Unable to allocate alias\n"); 1371 return -ENOMEM; 1372 } 1373 1374 tmp->alias = strdup(alias); 1375 if (tmp->alias == NULL) { 1376 free(tmp); 1377 SPDK_ERRLOG("Unable to allocate alias\n"); 1378 return -ENOMEM; 1379 } 1380 1381 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 1382 1383 return 0; 1384 } 1385 1386 int 1387 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 1388 { 1389 struct spdk_bdev_alias *tmp; 1390 1391 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 1392 if (strcmp(alias, tmp->alias) == 0) { 1393 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 1394 free(tmp->alias); 1395 free(tmp); 1396 return 0; 1397 } 1398 } 1399 1400 SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exists\n", alias); 1401 1402 return -ENOENT; 1403 } 1404 1405 struct spdk_io_channel * 1406 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 1407 { 1408 return spdk_get_io_channel(__bdev_to_io_dev(desc->bdev)); 1409 } 1410 1411 const char * 1412 spdk_bdev_get_name(const struct spdk_bdev *bdev) 1413 { 1414 return bdev->name; 1415 } 1416 1417 const char * 1418 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 1419 { 1420 return bdev->product_name; 1421 } 1422 1423 const struct spdk_bdev_aliases_list * 1424 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 1425 { 1426 return &bdev->aliases; 1427 } 1428 1429 uint32_t 1430 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 1431 { 1432 return bdev->blocklen; 1433 } 1434 1435 uint64_t 1436 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 1437 { 1438 return bdev->blockcnt; 1439 } 1440 1441 uint64_t 1442 spdk_bdev_get_qos_ios_per_sec(struct spdk_bdev *bdev) 1443 { 1444 uint64_t iops_rate_limit = 0; 1445 1446 pthread_mutex_lock(&bdev->mutex); 1447 if (bdev->qos) { 1448 iops_rate_limit = bdev->qos->iops_rate_limit; 1449 } 1450 pthread_mutex_unlock(&bdev->mutex); 1451 1452 return iops_rate_limit; 1453 } 1454 1455 size_t 1456 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 1457 { 1458 /* TODO: push this logic down to the bdev modules */ 1459 if (bdev->need_aligned_buffer) { 1460 return bdev->blocklen; 1461 } 1462 1463 return 1; 1464 } 1465 1466 uint32_t 1467 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 1468 { 1469 return bdev->optimal_io_boundary; 1470 } 1471 1472 bool 1473 spdk_bdev_has_write_cache(const struct spdk_bdev 
*bdev) 1474 { 1475 return bdev->write_cache; 1476 } 1477 1478 const struct spdk_uuid * 1479 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 1480 { 1481 return &bdev->uuid; 1482 } 1483 1484 int 1485 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 1486 { 1487 int ret; 1488 1489 pthread_mutex_lock(&bdev->mutex); 1490 1491 /* bdev has open descriptors */ 1492 if (!TAILQ_EMPTY(&bdev->open_descs) && 1493 bdev->blockcnt > size) { 1494 ret = -EBUSY; 1495 } else { 1496 bdev->blockcnt = size; 1497 ret = 0; 1498 } 1499 1500 pthread_mutex_unlock(&bdev->mutex); 1501 1502 return ret; 1503 } 1504 1505 /* 1506 * Convert I/O offset and length from bytes to blocks. 1507 * 1508 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 1509 */ 1510 static uint64_t 1511 spdk_bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 1512 uint64_t num_bytes, uint64_t *num_blocks) 1513 { 1514 uint32_t block_size = bdev->blocklen; 1515 1516 *offset_blocks = offset_bytes / block_size; 1517 *num_blocks = num_bytes / block_size; 1518 1519 return (offset_bytes % block_size) | (num_bytes % block_size); 1520 } 1521 1522 static bool 1523 spdk_bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 1524 { 1525 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 1526 * has been an overflow and hence the offset has been wrapped around */ 1527 if (offset_blocks + num_blocks < offset_blocks) { 1528 return false; 1529 } 1530 1531 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 1532 if (offset_blocks + num_blocks > bdev->blockcnt) { 1533 return false; 1534 } 1535 1536 return true; 1537 } 1538 1539 int 1540 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1541 void *buf, uint64_t offset, uint64_t nbytes, 1542 spdk_bdev_io_completion_cb cb, void *cb_arg) 1543 { 1544 uint64_t offset_blocks, num_blocks; 1545 1546 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 1547 return -EINVAL; 1548 } 1549 1550 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 1551 } 1552 1553 int 1554 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1555 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 1556 spdk_bdev_io_completion_cb cb, void *cb_arg) 1557 { 1558 struct spdk_bdev *bdev = desc->bdev; 1559 struct spdk_bdev_io *bdev_io; 1560 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 1561 1562 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 1563 return -EINVAL; 1564 } 1565 1566 bdev_io = spdk_bdev_get_io(channel); 1567 if (!bdev_io) { 1568 SPDK_ERRLOG("spdk_bdev_io memory allocation failed duing read\n"); 1569 return -ENOMEM; 1570 } 1571 1572 bdev_io->ch = channel; 1573 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 1574 bdev_io->u.bdev.iov.iov_base = buf; 1575 bdev_io->u.bdev.iov.iov_len = num_blocks * bdev->blocklen; 1576 bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov; 1577 bdev_io->u.bdev.iovcnt = 1; 1578 bdev_io->u.bdev.num_blocks = num_blocks; 1579 bdev_io->u.bdev.offset_blocks = offset_blocks; 1580 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 1581 1582 spdk_bdev_io_submit(bdev_io); 1583 return 0; 1584 } 1585 1586 int 1587 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1588 struct iovec *iov, int iovcnt, 1589 uint64_t offset, uint64_t nbytes, 1590 
spdk_bdev_io_completion_cb cb, void *cb_arg) 1591 { 1592 uint64_t offset_blocks, num_blocks; 1593 1594 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 1595 return -EINVAL; 1596 } 1597 1598 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 1599 } 1600 1601 int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1602 struct iovec *iov, int iovcnt, 1603 uint64_t offset_blocks, uint64_t num_blocks, 1604 spdk_bdev_io_completion_cb cb, void *cb_arg) 1605 { 1606 struct spdk_bdev *bdev = desc->bdev; 1607 struct spdk_bdev_io *bdev_io; 1608 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 1609 1610 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 1611 return -EINVAL; 1612 } 1613 1614 bdev_io = spdk_bdev_get_io(channel); 1615 if (!bdev_io) { 1616 SPDK_ERRLOG("spdk_bdev_io memory allocation failed duing read\n"); 1617 return -ENOMEM; 1618 } 1619 1620 bdev_io->ch = channel; 1621 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 1622 bdev_io->u.bdev.iovs = iov; 1623 bdev_io->u.bdev.iovcnt = iovcnt; 1624 bdev_io->u.bdev.num_blocks = num_blocks; 1625 bdev_io->u.bdev.offset_blocks = offset_blocks; 1626 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 1627 1628 spdk_bdev_io_submit(bdev_io); 1629 return 0; 1630 } 1631 1632 int 1633 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1634 void *buf, uint64_t offset, uint64_t nbytes, 1635 spdk_bdev_io_completion_cb cb, void *cb_arg) 1636 { 1637 uint64_t offset_blocks, num_blocks; 1638 1639 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 1640 return -EINVAL; 1641 } 1642 1643 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 1644 } 1645 1646 int 1647 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1648 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 1649 spdk_bdev_io_completion_cb cb, void *cb_arg) 1650 { 1651 struct spdk_bdev *bdev = desc->bdev; 1652 struct spdk_bdev_io *bdev_io; 1653 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 1654 1655 if (!desc->write) { 1656 return -EBADF; 1657 } 1658 1659 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 1660 return -EINVAL; 1661 } 1662 1663 bdev_io = spdk_bdev_get_io(channel); 1664 if (!bdev_io) { 1665 SPDK_ERRLOG("bdev_io memory allocation failed duing write\n"); 1666 return -ENOMEM; 1667 } 1668 1669 bdev_io->ch = channel; 1670 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 1671 bdev_io->u.bdev.iov.iov_base = buf; 1672 bdev_io->u.bdev.iov.iov_len = num_blocks * bdev->blocklen; 1673 bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov; 1674 bdev_io->u.bdev.iovcnt = 1; 1675 bdev_io->u.bdev.num_blocks = num_blocks; 1676 bdev_io->u.bdev.offset_blocks = offset_blocks; 1677 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 1678 1679 spdk_bdev_io_submit(bdev_io); 1680 return 0; 1681 } 1682 1683 int 1684 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1685 struct iovec *iov, int iovcnt, 1686 uint64_t offset, uint64_t len, 1687 spdk_bdev_io_completion_cb cb, void *cb_arg) 1688 { 1689 uint64_t offset_blocks, num_blocks; 1690 1691 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) { 1692 return -EINVAL; 1693 } 1694 1695 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 1696 } 1697 1698 int 1699 spdk_bdev_writev_blocks(struct 
spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1700 struct iovec *iov, int iovcnt, 1701 uint64_t offset_blocks, uint64_t num_blocks, 1702 spdk_bdev_io_completion_cb cb, void *cb_arg) 1703 { 1704 struct spdk_bdev *bdev = desc->bdev; 1705 struct spdk_bdev_io *bdev_io; 1706 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 1707 1708 if (!desc->write) { 1709 return -EBADF; 1710 } 1711 1712 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 1713 return -EINVAL; 1714 } 1715 1716 bdev_io = spdk_bdev_get_io(channel); 1717 if (!bdev_io) { 1718 SPDK_ERRLOG("bdev_io memory allocation failed duing writev\n"); 1719 return -ENOMEM; 1720 } 1721 1722 bdev_io->ch = channel; 1723 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 1724 bdev_io->u.bdev.iovs = iov; 1725 bdev_io->u.bdev.iovcnt = iovcnt; 1726 bdev_io->u.bdev.num_blocks = num_blocks; 1727 bdev_io->u.bdev.offset_blocks = offset_blocks; 1728 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 1729 1730 spdk_bdev_io_submit(bdev_io); 1731 return 0; 1732 } 1733 1734 int 1735 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1736 uint64_t offset, uint64_t len, 1737 spdk_bdev_io_completion_cb cb, void *cb_arg) 1738 { 1739 uint64_t offset_blocks, num_blocks; 1740 1741 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) { 1742 return -EINVAL; 1743 } 1744 1745 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 1746 } 1747 1748 int 1749 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1750 uint64_t offset_blocks, uint64_t num_blocks, 1751 spdk_bdev_io_completion_cb cb, void *cb_arg) 1752 { 1753 struct spdk_bdev *bdev = desc->bdev; 1754 struct spdk_bdev_io *bdev_io; 1755 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 1756 uint64_t len; 1757 bool split_request = false; 1758 1759 if (num_blocks > UINT64_MAX / spdk_bdev_get_block_size(bdev)) { 1760 SPDK_ERRLOG("length argument out of range in write_zeroes\n"); 1761 return -ERANGE; 1762 } 1763 1764 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 1765 return -EINVAL; 1766 } 1767 1768 bdev_io = spdk_bdev_get_io(channel); 1769 1770 if (!bdev_io) { 1771 SPDK_ERRLOG("bdev_io memory allocation failed duing write_zeroes\n"); 1772 return -ENOMEM; 1773 } 1774 1775 bdev_io->ch = channel; 1776 bdev_io->u.bdev.offset_blocks = offset_blocks; 1777 1778 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 1779 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 1780 bdev_io->u.bdev.num_blocks = num_blocks; 1781 bdev_io->u.bdev.iovs = NULL; 1782 bdev_io->u.bdev.iovcnt = 0; 1783 1784 } else { 1785 assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE); 1786 1787 len = spdk_bdev_get_block_size(bdev) * num_blocks; 1788 1789 if (len > ZERO_BUFFER_SIZE) { 1790 split_request = true; 1791 len = ZERO_BUFFER_SIZE; 1792 } 1793 1794 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 1795 bdev_io->u.bdev.iov.iov_base = g_bdev_mgr.zero_buffer; 1796 bdev_io->u.bdev.iov.iov_len = len; 1797 bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov; 1798 bdev_io->u.bdev.iovcnt = 1; 1799 bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev); 1800 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks - bdev_io->u.bdev.num_blocks; 1801 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks + bdev_io->u.bdev.num_blocks; 1802 } 1803 1804 if (split_request) { 1805 bdev_io->u.bdev.stored_user_cb = cb; 1806 spdk_bdev_io_init(bdev_io, bdev, 
cb_arg, spdk_bdev_write_zeroes_split); 1807 } else { 1808 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 1809 } 1810 spdk_bdev_io_submit(bdev_io); 1811 return 0; 1812 } 1813 1814 int 1815 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1816 uint64_t offset, uint64_t nbytes, 1817 spdk_bdev_io_completion_cb cb, void *cb_arg) 1818 { 1819 uint64_t offset_blocks, num_blocks; 1820 1821 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 1822 return -EINVAL; 1823 } 1824 1825 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 1826 } 1827 1828 int 1829 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1830 uint64_t offset_blocks, uint64_t num_blocks, 1831 spdk_bdev_io_completion_cb cb, void *cb_arg) 1832 { 1833 struct spdk_bdev *bdev = desc->bdev; 1834 struct spdk_bdev_io *bdev_io; 1835 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 1836 1837 if (!desc->write) { 1838 return -EBADF; 1839 } 1840 1841 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 1842 return -EINVAL; 1843 } 1844 1845 if (num_blocks == 0) { 1846 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 1847 return -EINVAL; 1848 } 1849 1850 bdev_io = spdk_bdev_get_io(channel); 1851 if (!bdev_io) { 1852 SPDK_ERRLOG("bdev_io memory allocation failed duing unmap\n"); 1853 return -ENOMEM; 1854 } 1855 1856 bdev_io->ch = channel; 1857 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 1858 bdev_io->u.bdev.iov.iov_base = NULL; 1859 bdev_io->u.bdev.iov.iov_len = 0; 1860 bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov; 1861 bdev_io->u.bdev.iovcnt = 1; 1862 bdev_io->u.bdev.offset_blocks = offset_blocks; 1863 bdev_io->u.bdev.num_blocks = num_blocks; 1864 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 1865 1866 spdk_bdev_io_submit(bdev_io); 1867 return 0; 1868 } 1869 1870 int 1871 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1872 uint64_t offset, uint64_t length, 1873 spdk_bdev_io_completion_cb cb, void *cb_arg) 1874 { 1875 uint64_t offset_blocks, num_blocks; 1876 1877 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, length, &num_blocks) != 0) { 1878 return -EINVAL; 1879 } 1880 1881 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 1882 } 1883 1884 int 1885 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 1886 uint64_t offset_blocks, uint64_t num_blocks, 1887 spdk_bdev_io_completion_cb cb, void *cb_arg) 1888 { 1889 struct spdk_bdev *bdev = desc->bdev; 1890 struct spdk_bdev_io *bdev_io; 1891 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 1892 1893 if (!desc->write) { 1894 return -EBADF; 1895 } 1896 1897 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 1898 return -EINVAL; 1899 } 1900 1901 bdev_io = spdk_bdev_get_io(channel); 1902 if (!bdev_io) { 1903 SPDK_ERRLOG("bdev_io memory allocation failed duing flush\n"); 1904 return -ENOMEM; 1905 } 1906 1907 bdev_io->ch = channel; 1908 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 1909 bdev_io->u.bdev.iovs = NULL; 1910 bdev_io->u.bdev.iovcnt = 0; 1911 bdev_io->u.bdev.offset_blocks = offset_blocks; 1912 bdev_io->u.bdev.num_blocks = num_blocks; 1913 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 1914 1915 spdk_bdev_io_submit(bdev_io); 1916 return 0; 1917 } 1918 1919 static void 1920 _spdk_bdev_reset_dev(struct spdk_io_channel_iter *i, int status) 1921 { 1922 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 1923 struct spdk_bdev_io 
*bdev_io; 1924 1925 bdev_io = TAILQ_FIRST(&ch->queued_resets); 1926 TAILQ_REMOVE(&ch->queued_resets, bdev_io, link); 1927 spdk_bdev_io_submit_reset(bdev_io); 1928 } 1929 1930 static void 1931 _spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 1932 { 1933 struct spdk_io_channel *ch; 1934 struct spdk_bdev_channel *channel; 1935 struct spdk_bdev_mgmt_channel *mgmt_channel; 1936 struct spdk_bdev_shared_resource *shared_resource; 1937 bdev_io_tailq_t tmp_queued; 1938 1939 TAILQ_INIT(&tmp_queued); 1940 1941 ch = spdk_io_channel_iter_get_channel(i); 1942 channel = spdk_io_channel_get_ctx(ch); 1943 shared_resource = channel->shared_resource; 1944 mgmt_channel = shared_resource->mgmt_ch; 1945 1946 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 1947 1948 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 1949 /* The QoS object is always valid and readable while 1950 * the channel flag is set, so the lock here should not 1951 * be necessary. We're not in the fast path though, so 1952 * just take it anyway. */ 1953 pthread_mutex_lock(&channel->bdev->mutex); 1954 if (channel->bdev->qos->ch == channel) { 1955 TAILQ_SWAP(&channel->bdev->qos->queued, &tmp_queued, spdk_bdev_io, link); 1956 } 1957 pthread_mutex_unlock(&channel->bdev->mutex); 1958 } 1959 1960 _spdk_bdev_abort_queued_io(&shared_resource->nomem_io, channel); 1961 _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel); 1962 _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel); 1963 _spdk_bdev_abort_queued_io(&tmp_queued, channel); 1964 1965 spdk_for_each_channel_continue(i, 0); 1966 } 1967 1968 static void 1969 _spdk_bdev_start_reset(void *ctx) 1970 { 1971 struct spdk_bdev_channel *ch = ctx; 1972 1973 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), _spdk_bdev_reset_freeze_channel, 1974 ch, _spdk_bdev_reset_dev); 1975 } 1976 1977 static void 1978 _spdk_bdev_channel_start_reset(struct spdk_bdev_channel *ch) 1979 { 1980 struct spdk_bdev *bdev = ch->bdev; 1981 1982 assert(!TAILQ_EMPTY(&ch->queued_resets)); 1983 1984 pthread_mutex_lock(&bdev->mutex); 1985 if (bdev->reset_in_progress == NULL) { 1986 bdev->reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 1987 /* 1988 * Take a channel reference for the target bdev for the life of this 1989 * reset. This guards against the channel getting destroyed while 1990 * spdk_for_each_channel() calls related to this reset IO are in 1991 * progress. We will release the reference when this reset is 1992 * completed. 
		 */
		bdev->reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
		_spdk_bdev_start_reset(ch);
	}
	pthread_mutex_unlock(&bdev->mutex);
}

int
spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during reset\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
	bdev_io->u.reset.ch_ref = NULL;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	pthread_mutex_lock(&bdev->mutex);
	TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, link);
	pthread_mutex_unlock(&bdev->mutex);

	_spdk_bdev_channel_start_reset(channel);

	return 0;
}

void
spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
		      struct spdk_bdev_io_stat *stat)
{
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	*stat = channel->stat;
}

static void
_spdk_bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status)
{
	void *io_device = spdk_io_channel_iter_get_io_device(i);
	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i);

	bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat,
			    bdev_iostat_ctx->cb_arg, 0);
	free(bdev_iostat_ctx);
}

static void
_spdk_bdev_get_each_channel_stat(struct spdk_io_channel_iter *i)
{
	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	bdev_iostat_ctx->stat->bytes_read += channel->stat.bytes_read;
	bdev_iostat_ctx->stat->num_read_ops += channel->stat.num_read_ops;
	bdev_iostat_ctx->stat->bytes_written += channel->stat.bytes_written;
	bdev_iostat_ctx->stat->num_write_ops += channel->stat.num_write_ops;

	spdk_for_each_channel_continue(i, 0);
}

void
spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
			  spdk_bdev_get_device_stat_cb cb, void *cb_arg)
{
	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx;

	assert(bdev != NULL);
	assert(stat != NULL);
	assert(cb != NULL);

	bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx));
	if (bdev_iostat_ctx == NULL) {
		SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n");
		cb(bdev, stat, cb_arg, -ENOMEM);
		return;
	}

	bdev_iostat_ctx->stat = stat;
	bdev_iostat_ctx->cb = cb;
	bdev_iostat_ctx->cb_arg = cb_arg;

	spdk_for_each_channel(__bdev_to_io_dev(bdev),
			      _spdk_bdev_get_each_channel_stat,
			      bdev_iostat_ctx,
			      _spdk_bdev_get_device_stat_done);
}

int
spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_admin_passthru\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;
	bdev_io->u.nvme_passthru.md_buf = NULL;
	bdev_io->u.nvme_passthru.md_len = 0;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			   const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
			   spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		/*
		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
		 * to easily determine if the command is a read or write, but for now just
		 * do not allow io_passthru with a read-only descriptor.
		 */
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_io_passthru\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;
	bdev_io->u.nvme_passthru.md_buf = NULL;
	bdev_io->u.nvme_passthru.md_len = 0;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		/*
		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
		 * to easily determine if the command is a read or write, but for now just
		 * do not allow io_passthru with a read-only descriptor.
		 */
		return -EBADF;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_io_passthru_md\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD;
	bdev_io->u.nvme_passthru.cmd = *cmd;
	bdev_io->u.nvme_passthru.buf = buf;
	bdev_io->u.nvme_passthru.nbytes = nbytes;
	bdev_io->u.nvme_passthru.md_buf = md_buf;
	bdev_io->u.nvme_passthru.md_len = md_len;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io is NULL\n");
		return -1;
	}

	if (bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING) {
		SPDK_ERRLOG("bdev_io is in pending state\n");
		assert(false);
		return -1;
	}

	spdk_bdev_put_io(bdev_io);

	return 0;
}

static void
_spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev *bdev = bdev_ch->bdev;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller. Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, link);
		bdev_io->ch->io_outstanding++;
		shared_resource->io_outstanding++;
		bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev->fn_table->submit_request(bdev_io->ch->channel, bdev_io);
		if (bdev_io->status == SPDK_BDEV_IO_STATUS_NOMEM) {
			break;
		}
	}
}

static inline void
_spdk_bdev_io_complete(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (spdk_unlikely(bdev_io->in_submit_request || bdev_io->io_submit_ch)) {
		/*
		 * Send the completion to the thread that originally submitted the I/O,
		 * which may not be the current thread in the case of QoS.
		 */
		if (bdev_io->io_submit_ch) {
			bdev_io->ch = bdev_io->io_submit_ch;
			bdev_io->io_submit_ch = NULL;
		}

		/*
		 * Defer completion to avoid potential infinite recursion if the
		 * user's completion callback issues a new I/O.
2268 */ 2269 spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->ch->channel), 2270 _spdk_bdev_io_complete, bdev_io); 2271 return; 2272 } 2273 2274 if (bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) { 2275 switch (bdev_io->type) { 2276 case SPDK_BDEV_IO_TYPE_READ: 2277 bdev_io->ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 2278 bdev_io->ch->stat.num_read_ops++; 2279 bdev_io->ch->stat.read_latency_ticks += (spdk_get_ticks() - bdev_io->submit_tsc); 2280 break; 2281 case SPDK_BDEV_IO_TYPE_WRITE: 2282 bdev_io->ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 2283 bdev_io->ch->stat.num_write_ops++; 2284 bdev_io->ch->stat.write_latency_ticks += (spdk_get_ticks() - bdev_io->submit_tsc); 2285 break; 2286 default: 2287 break; 2288 } 2289 } 2290 2291 #ifdef SPDK_CONFIG_VTUNE 2292 uint64_t now_tsc = spdk_get_ticks(); 2293 if (now_tsc > (bdev_io->ch->start_tsc + bdev_io->ch->interval_tsc)) { 2294 uint64_t data[5]; 2295 2296 data[0] = bdev_io->ch->stat.num_read_ops - bdev_io->ch->prev_stat.num_read_ops; 2297 data[1] = bdev_io->ch->stat.bytes_read - bdev_io->ch->prev_stat.bytes_read; 2298 data[2] = bdev_io->ch->stat.num_write_ops - bdev_io->ch->prev_stat.num_write_ops; 2299 data[3] = bdev_io->ch->stat.bytes_written - bdev_io->ch->prev_stat.bytes_written; 2300 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 2301 bdev_io->bdev->fn_table->get_spin_time(bdev_io->ch->channel) : 0; 2302 2303 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->ch->handle, 2304 __itt_metadata_u64, 5, data); 2305 2306 bdev_io->ch->prev_stat = bdev_io->ch->stat; 2307 bdev_io->ch->start_tsc = now_tsc; 2308 } 2309 #endif 2310 2311 assert(bdev_io->cb != NULL); 2312 assert(spdk_get_thread() == spdk_io_channel_get_thread(bdev_io->ch->channel)); 2313 2314 bdev_io->cb(bdev_io, bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS, 2315 bdev_io->caller_ctx); 2316 } 2317 2318 static void 2319 _spdk_bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 2320 { 2321 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 2322 2323 if (bdev_io->u.reset.ch_ref != NULL) { 2324 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 2325 bdev_io->u.reset.ch_ref = NULL; 2326 } 2327 2328 _spdk_bdev_io_complete(bdev_io); 2329 } 2330 2331 static void 2332 _spdk_bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 2333 { 2334 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2335 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 2336 2337 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 2338 if (!TAILQ_EMPTY(&ch->queued_resets)) { 2339 _spdk_bdev_channel_start_reset(ch); 2340 } 2341 2342 spdk_for_each_channel_continue(i, 0); 2343 } 2344 2345 void 2346 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 2347 { 2348 struct spdk_bdev *bdev = bdev_io->bdev; 2349 struct spdk_bdev_channel *bdev_ch = bdev_io->ch; 2350 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2351 2352 bdev_io->status = status; 2353 2354 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 2355 bool unlock_channels = false; 2356 2357 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 2358 SPDK_ERRLOG("NOMEM returned for reset\n"); 2359 } 2360 pthread_mutex_lock(&bdev->mutex); 2361 if (bdev_io == bdev->reset_in_progress) { 2362 bdev->reset_in_progress = NULL; 2363 unlock_channels = true; 2364 } 2365 pthread_mutex_unlock(&bdev->mutex); 2366 2367 if (unlock_channels) { 2368 spdk_for_each_channel(__bdev_to_io_dev(bdev), 
_spdk_bdev_unfreeze_channel, 2369 bdev_io, _spdk_bdev_reset_complete); 2370 return; 2371 } 2372 } else { 2373 assert(bdev_ch->io_outstanding > 0); 2374 assert(shared_resource->io_outstanding > 0); 2375 bdev_ch->io_outstanding--; 2376 shared_resource->io_outstanding--; 2377 2378 if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) { 2379 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, link); 2380 /* 2381 * Wait for some of the outstanding I/O to complete before we 2382 * retry any of the nomem_io. Normally we will wait for 2383 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 2384 * depth channels we will instead wait for half to complete. 2385 */ 2386 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 2387 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 2388 return; 2389 } 2390 2391 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 2392 _spdk_bdev_ch_retry_io(bdev_ch); 2393 } 2394 } 2395 2396 _spdk_bdev_io_complete(bdev_io); 2397 } 2398 2399 void 2400 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 2401 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 2402 { 2403 if (sc == SPDK_SCSI_STATUS_GOOD) { 2404 bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS; 2405 } else { 2406 bdev_io->status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 2407 bdev_io->error.scsi.sc = sc; 2408 bdev_io->error.scsi.sk = sk; 2409 bdev_io->error.scsi.asc = asc; 2410 bdev_io->error.scsi.ascq = ascq; 2411 } 2412 2413 spdk_bdev_io_complete(bdev_io, bdev_io->status); 2414 } 2415 2416 void 2417 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 2418 int *sc, int *sk, int *asc, int *ascq) 2419 { 2420 assert(sc != NULL); 2421 assert(sk != NULL); 2422 assert(asc != NULL); 2423 assert(ascq != NULL); 2424 2425 switch (bdev_io->status) { 2426 case SPDK_BDEV_IO_STATUS_SUCCESS: 2427 *sc = SPDK_SCSI_STATUS_GOOD; 2428 *sk = SPDK_SCSI_SENSE_NO_SENSE; 2429 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 2430 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 2431 break; 2432 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 2433 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 2434 break; 2435 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 2436 *sc = bdev_io->error.scsi.sc; 2437 *sk = bdev_io->error.scsi.sk; 2438 *asc = bdev_io->error.scsi.asc; 2439 *ascq = bdev_io->error.scsi.ascq; 2440 break; 2441 default: 2442 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 2443 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 2444 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 2445 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 2446 break; 2447 } 2448 } 2449 2450 void 2451 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc) 2452 { 2453 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 2454 bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS; 2455 } else { 2456 bdev_io->error.nvme.sct = sct; 2457 bdev_io->error.nvme.sc = sc; 2458 bdev_io->status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 2459 } 2460 2461 spdk_bdev_io_complete(bdev_io, bdev_io->status); 2462 } 2463 2464 void 2465 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc) 2466 { 2467 assert(sct != NULL); 2468 assert(sc != NULL); 2469 2470 if (bdev_io->status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 2471 *sct = bdev_io->error.nvme.sct; 2472 *sc = bdev_io->error.nvme.sc; 2473 } else if (bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) { 2474 *sct = SPDK_NVME_SCT_GENERIC; 2475 *sc = SPDK_NVME_SC_SUCCESS; 2476 } else { 2477 *sct = SPDK_NVME_SCT_GENERIC; 2478 
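		/* Any status other than success or an explicit NVMe error is reported
		 * as a generic internal device error. */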
*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 2479 } 2480 } 2481 2482 struct spdk_thread * 2483 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 2484 { 2485 return spdk_io_channel_get_thread(bdev_io->ch->channel); 2486 } 2487 2488 static void 2489 _spdk_bdev_qos_config_type(struct spdk_bdev *bdev, uint64_t qos_set, 2490 enum spdk_bdev_qos_type qos_type) 2491 { 2492 uint64_t min_qos_set = 0; 2493 2494 switch (qos_type) { 2495 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2496 min_qos_set = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 2497 break; 2498 case SPDK_BDEV_QOS_RW_BYTEPS_RATE_LIMIT: 2499 min_qos_set = SPDK_BDEV_QOS_MIN_BW_IN_MB_PER_SEC; 2500 break; 2501 default: 2502 SPDK_ERRLOG("Unsupported QoS type.\n"); 2503 return; 2504 } 2505 2506 if (qos_set % min_qos_set) { 2507 SPDK_ERRLOG("Assigned QoS %" PRIu64 " on bdev %s is not multiple of %lu\n", 2508 qos_set, bdev->name, min_qos_set); 2509 SPDK_ERRLOG("Failed to enable QoS on this bdev %s\n", bdev->name); 2510 return; 2511 } 2512 2513 if (!bdev->qos) { 2514 bdev->qos = calloc(1, sizeof(*bdev->qos)); 2515 if (!bdev->qos) { 2516 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 2517 return; 2518 } 2519 } 2520 2521 switch (qos_type) { 2522 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2523 bdev->qos->iops_rate_limit = qos_set; 2524 break; 2525 case SPDK_BDEV_QOS_RW_BYTEPS_RATE_LIMIT: 2526 bdev->qos->byte_rate_limit = qos_set * 1024 * 1024; 2527 break; 2528 default: 2529 break; 2530 } 2531 2532 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS type:%d set:%lu\n", 2533 bdev->name, qos_type, qos_set); 2534 2535 return; 2536 } 2537 2538 static void 2539 _spdk_bdev_qos_config(struct spdk_bdev *bdev) 2540 { 2541 struct spdk_conf_section *sp = NULL; 2542 const char *val = NULL; 2543 uint64_t qos_set = 0; 2544 int i = 0, j = 0; 2545 2546 sp = spdk_conf_find_section(NULL, "QoS"); 2547 if (!sp) { 2548 return; 2549 } 2550 2551 while (j < SPDK_BDEV_QOS_NUM_TYPES) { 2552 i = 0; 2553 while (true) { 2554 val = spdk_conf_section_get_nmval(sp, qos_type_str[j], i, 0); 2555 if (!val) { 2556 break; 2557 } 2558 2559 if (strcmp(bdev->name, val) != 0) { 2560 i++; 2561 continue; 2562 } 2563 2564 val = spdk_conf_section_get_nmval(sp, qos_type_str[j], i, 1); 2565 if (val) { 2566 qos_set = strtoull(val, NULL, 10); 2567 _spdk_bdev_qos_config_type(bdev, qos_set, j); 2568 } 2569 2570 break; 2571 } 2572 2573 j++; 2574 } 2575 2576 return; 2577 } 2578 2579 static int 2580 spdk_bdev_init(struct spdk_bdev *bdev) 2581 { 2582 assert(bdev->module != NULL); 2583 2584 if (!bdev->name) { 2585 SPDK_ERRLOG("Bdev name is NULL\n"); 2586 return -EINVAL; 2587 } 2588 2589 if (spdk_bdev_get_by_name(bdev->name)) { 2590 SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name); 2591 return -EEXIST; 2592 } 2593 2594 bdev->status = SPDK_BDEV_STATUS_READY; 2595 2596 TAILQ_INIT(&bdev->open_descs); 2597 2598 TAILQ_INIT(&bdev->aliases); 2599 2600 bdev->reset_in_progress = NULL; 2601 2602 _spdk_bdev_qos_config(bdev); 2603 2604 spdk_io_device_register(__bdev_to_io_dev(bdev), 2605 spdk_bdev_channel_create, spdk_bdev_channel_destroy, 2606 sizeof(struct spdk_bdev_channel)); 2607 2608 pthread_mutex_init(&bdev->mutex, NULL); 2609 return 0; 2610 } 2611 2612 static void 2613 spdk_bdev_destroy_cb(void *io_device) 2614 { 2615 int rc; 2616 struct spdk_bdev *bdev; 2617 spdk_bdev_unregister_cb cb_fn; 2618 void *cb_arg; 2619 2620 bdev = __bdev_from_io_dev(io_device); 2621 cb_fn = bdev->unregister_cb; 2622 cb_arg = bdev->unregister_ctx; 2623 2624 rc = bdev->fn_table->destruct(bdev->ctxt); 2625 if (rc < 0) { 2626 SPDK_ERRLOG("destruct 
failed\n"); 2627 } 2628 if (rc <= 0 && cb_fn != NULL) { 2629 cb_fn(cb_arg, rc); 2630 } 2631 } 2632 2633 2634 static void 2635 spdk_bdev_fini(struct spdk_bdev *bdev) 2636 { 2637 pthread_mutex_destroy(&bdev->mutex); 2638 2639 free(bdev->qos); 2640 2641 spdk_io_device_unregister(__bdev_to_io_dev(bdev), spdk_bdev_destroy_cb); 2642 } 2643 2644 static void 2645 spdk_bdev_start(struct spdk_bdev *bdev) 2646 { 2647 struct spdk_bdev_module *module; 2648 2649 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name); 2650 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, link); 2651 2652 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) { 2653 if (module->examine) { 2654 module->action_in_progress++; 2655 module->examine(bdev); 2656 } 2657 } 2658 } 2659 2660 int 2661 spdk_bdev_register(struct spdk_bdev *bdev) 2662 { 2663 int rc = spdk_bdev_init(bdev); 2664 2665 if (rc == 0) { 2666 spdk_bdev_start(bdev); 2667 } 2668 2669 return rc; 2670 } 2671 2672 static void 2673 spdk_vbdev_remove_base_bdevs(struct spdk_bdev *vbdev) 2674 { 2675 struct spdk_bdev **bdevs; 2676 struct spdk_bdev *base; 2677 size_t i, j, k; 2678 bool found; 2679 2680 /* Iterate over base bdevs to remove vbdev from them. */ 2681 for (i = 0; i < vbdev->base_bdevs_cnt; i++) { 2682 found = false; 2683 base = vbdev->base_bdevs[i]; 2684 2685 for (j = 0; j < base->vbdevs_cnt; j++) { 2686 if (base->vbdevs[j] != vbdev) { 2687 continue; 2688 } 2689 2690 for (k = j; k + 1 < base->vbdevs_cnt; k++) { 2691 base->vbdevs[k] = base->vbdevs[k + 1]; 2692 } 2693 2694 base->vbdevs_cnt--; 2695 if (base->vbdevs_cnt > 0) { 2696 bdevs = realloc(base->vbdevs, base->vbdevs_cnt * sizeof(bdevs[0])); 2697 /* It would be odd if shrinking memory block fail. */ 2698 assert(bdevs); 2699 base->vbdevs = bdevs; 2700 } else { 2701 free(base->vbdevs); 2702 base->vbdevs = NULL; 2703 } 2704 2705 found = true; 2706 break; 2707 } 2708 2709 if (!found) { 2710 SPDK_WARNLOG("Bdev '%s' is not base bdev of '%s'.\n", base->name, vbdev->name); 2711 } 2712 } 2713 2714 free(vbdev->base_bdevs); 2715 vbdev->base_bdevs = NULL; 2716 vbdev->base_bdevs_cnt = 0; 2717 } 2718 2719 static int 2720 spdk_vbdev_set_base_bdevs(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, size_t cnt) 2721 { 2722 struct spdk_bdev **vbdevs; 2723 struct spdk_bdev *base; 2724 size_t i; 2725 2726 /* Adding base bdevs isn't supported (yet?). */ 2727 assert(vbdev->base_bdevs_cnt == 0); 2728 2729 vbdev->base_bdevs = malloc(cnt * sizeof(vbdev->base_bdevs[0])); 2730 if (!vbdev->base_bdevs) { 2731 SPDK_ERRLOG("%s - realloc() failed\n", vbdev->name); 2732 return -ENOMEM; 2733 } 2734 2735 memcpy(vbdev->base_bdevs, base_bdevs, cnt * sizeof(vbdev->base_bdevs[0])); 2736 vbdev->base_bdevs_cnt = cnt; 2737 2738 /* Iterate over base bdevs to add this vbdev to them. 
*/ 2739 for (i = 0; i < cnt; i++) { 2740 base = vbdev->base_bdevs[i]; 2741 2742 assert(base != NULL); 2743 assert(base->claim_module != NULL); 2744 2745 vbdevs = realloc(base->vbdevs, (base->vbdevs_cnt + 1) * sizeof(vbdevs[0])); 2746 if (!vbdevs) { 2747 SPDK_ERRLOG("%s - realloc() failed\n", base->name); 2748 spdk_vbdev_remove_base_bdevs(vbdev); 2749 return -ENOMEM; 2750 } 2751 2752 vbdevs[base->vbdevs_cnt] = vbdev; 2753 base->vbdevs = vbdevs; 2754 base->vbdevs_cnt++; 2755 } 2756 2757 return 0; 2758 } 2759 2760 int 2761 spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count) 2762 { 2763 int rc; 2764 2765 rc = spdk_bdev_init(vbdev); 2766 if (rc) { 2767 return rc; 2768 } 2769 2770 if (base_bdev_count == 0) { 2771 spdk_bdev_start(vbdev); 2772 return 0; 2773 } 2774 2775 rc = spdk_vbdev_set_base_bdevs(vbdev, base_bdevs, base_bdev_count); 2776 if (rc) { 2777 spdk_bdev_fini(vbdev); 2778 return rc; 2779 } 2780 2781 spdk_bdev_start(vbdev); 2782 return 0; 2783 2784 } 2785 2786 void 2787 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 2788 { 2789 if (bdev->unregister_cb != NULL) { 2790 bdev->unregister_cb(bdev->unregister_ctx, bdeverrno); 2791 } 2792 } 2793 2794 static void 2795 _remove_notify(void *arg) 2796 { 2797 struct spdk_bdev_desc *desc = arg; 2798 2799 desc->remove_cb(desc->remove_ctx); 2800 } 2801 2802 void 2803 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 2804 { 2805 struct spdk_bdev_desc *desc, *tmp; 2806 bool do_destruct = true; 2807 struct spdk_thread *thread; 2808 2809 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name); 2810 2811 thread = spdk_get_thread(); 2812 if (!thread) { 2813 /* The user called this from a non-SPDK thread. */ 2814 cb_fn(cb_arg, -ENOTSUP); 2815 return; 2816 } 2817 2818 pthread_mutex_lock(&bdev->mutex); 2819 2820 spdk_vbdev_remove_base_bdevs(bdev); 2821 2822 bdev->status = SPDK_BDEV_STATUS_REMOVING; 2823 bdev->unregister_cb = cb_fn; 2824 bdev->unregister_ctx = cb_arg; 2825 2826 TAILQ_FOREACH_SAFE(desc, &bdev->open_descs, link, tmp) { 2827 if (desc->remove_cb) { 2828 do_destruct = false; 2829 /* 2830 * Defer invocation of the remove_cb to a separate message that will 2831 * run later on this thread. This ensures this context unwinds and 2832 * we don't recursively unregister this bdev again if the remove_cb 2833 * immediately closes its descriptor. 
2834 */ 2835 spdk_thread_send_msg(thread, _remove_notify, desc); 2836 } 2837 } 2838 2839 if (!do_destruct) { 2840 pthread_mutex_unlock(&bdev->mutex); 2841 return; 2842 } 2843 2844 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link); 2845 pthread_mutex_unlock(&bdev->mutex); 2846 2847 spdk_bdev_fini(bdev); 2848 } 2849 2850 int 2851 spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb, 2852 void *remove_ctx, struct spdk_bdev_desc **_desc) 2853 { 2854 struct spdk_bdev_desc *desc; 2855 2856 desc = calloc(1, sizeof(*desc)); 2857 if (desc == NULL) { 2858 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 2859 return -ENOMEM; 2860 } 2861 2862 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 2863 spdk_get_thread()); 2864 2865 pthread_mutex_lock(&bdev->mutex); 2866 2867 if (write && bdev->claim_module) { 2868 SPDK_ERRLOG("Could not open %s - already claimed\n", bdev->name); 2869 free(desc); 2870 pthread_mutex_unlock(&bdev->mutex); 2871 return -EPERM; 2872 } 2873 2874 TAILQ_INSERT_TAIL(&bdev->open_descs, desc, link); 2875 2876 desc->bdev = bdev; 2877 desc->remove_cb = remove_cb; 2878 desc->remove_ctx = remove_ctx; 2879 desc->write = write; 2880 *_desc = desc; 2881 2882 pthread_mutex_unlock(&bdev->mutex); 2883 2884 return 0; 2885 } 2886 2887 void 2888 spdk_bdev_close(struct spdk_bdev_desc *desc) 2889 { 2890 struct spdk_bdev *bdev = desc->bdev; 2891 bool do_unregister = false; 2892 2893 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 2894 spdk_get_thread()); 2895 2896 pthread_mutex_lock(&bdev->mutex); 2897 2898 TAILQ_REMOVE(&bdev->open_descs, desc, link); 2899 free(desc); 2900 2901 /* If no more descriptors, kill QoS channel */ 2902 if (bdev->qos && TAILQ_EMPTY(&bdev->open_descs)) { 2903 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 2904 bdev->name, spdk_get_thread()); 2905 2906 if (spdk_bdev_qos_destroy(bdev)) { 2907 /* There isn't anything we can do to recover here. Just let the 2908 * old QoS poller keep running. The QoS handling won't change 2909 * cores when the user allocates a new channel, but it won't break. */ 2910 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 2911 } 2912 } 2913 2914 if (bdev->status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->open_descs)) { 2915 do_unregister = true; 2916 } 2917 pthread_mutex_unlock(&bdev->mutex); 2918 2919 if (do_unregister == true) { 2920 spdk_bdev_unregister(bdev, bdev->unregister_cb, bdev->unregister_ctx); 2921 } 2922 } 2923 2924 int 2925 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 2926 struct spdk_bdev_module *module) 2927 { 2928 if (bdev->claim_module != NULL) { 2929 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 2930 bdev->claim_module->name); 2931 return -EPERM; 2932 } 2933 2934 if (desc && !desc->write) { 2935 desc->write = true; 2936 } 2937 2938 bdev->claim_module = module; 2939 return 0; 2940 } 2941 2942 void 2943 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 2944 { 2945 assert(bdev->claim_module != NULL); 2946 bdev->claim_module = NULL; 2947 } 2948 2949 struct spdk_bdev * 2950 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 2951 { 2952 return desc->bdev; 2953 } 2954 2955 void 2956 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 2957 { 2958 struct iovec *iovs; 2959 int iovcnt; 2960 2961 if (bdev_io == NULL) { 2962 return; 2963 } 2964 2965 switch (bdev_io->type) { 2966 case SPDK_BDEV_IO_TYPE_READ: 2967 iovs = bdev_io->u.bdev.iovs; 2968 iovcnt = bdev_io->u.bdev.iovcnt; 2969 break; 2970 case SPDK_BDEV_IO_TYPE_WRITE: 2971 iovs = bdev_io->u.bdev.iovs; 2972 iovcnt = bdev_io->u.bdev.iovcnt; 2973 break; 2974 default: 2975 iovs = NULL; 2976 iovcnt = 0; 2977 break; 2978 } 2979 2980 if (iovp) { 2981 *iovp = iovs; 2982 } 2983 if (iovcntp) { 2984 *iovcntp = iovcnt; 2985 } 2986 } 2987 2988 void 2989 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 2990 { 2991 2992 if (spdk_bdev_module_list_find(bdev_module->name)) { 2993 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 2994 assert(false); 2995 } 2996 2997 if (bdev_module->async_init) { 2998 bdev_module->action_in_progress = 1; 2999 } 3000 3001 /* 3002 * Modules with examine callbacks must be initialized first, so they are 3003 * ready to handle examine callbacks from later modules that will 3004 * register physical bdevs. 3005 */ 3006 if (bdev_module->examine != NULL) { 3007 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, tailq); 3008 } else { 3009 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, tailq); 3010 } 3011 } 3012 3013 struct spdk_bdev_module * 3014 spdk_bdev_module_list_find(const char *name) 3015 { 3016 struct spdk_bdev_module *bdev_module; 3017 3018 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) { 3019 if (strcmp(name, bdev_module->name) == 0) { 3020 break; 3021 } 3022 } 3023 3024 return bdev_module; 3025 } 3026 3027 static void 3028 spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3029 { 3030 uint64_t len; 3031 3032 if (!success) { 3033 bdev_io->cb = bdev_io->u.bdev.stored_user_cb; 3034 _spdk_bdev_io_complete(bdev_io); 3035 return; 3036 } 3037 3038 /* no need to perform the error checking from write_zeroes_blocks because this request already passed those checks. 
*/ 3039 len = spdk_min(spdk_bdev_get_block_size(bdev_io->bdev) * bdev_io->u.bdev.split_remaining_num_blocks, 3040 ZERO_BUFFER_SIZE); 3041 3042 bdev_io->u.bdev.offset_blocks = bdev_io->u.bdev.split_current_offset_blocks; 3043 bdev_io->u.bdev.iov.iov_len = len; 3044 bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev_io->bdev); 3045 bdev_io->u.bdev.split_remaining_num_blocks -= bdev_io->u.bdev.num_blocks; 3046 bdev_io->u.bdev.split_current_offset_blocks += bdev_io->u.bdev.num_blocks; 3047 3048 /* if this round completes the i/o, change the callback to be the original user callback */ 3049 if (bdev_io->u.bdev.split_remaining_num_blocks == 0) { 3050 spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, bdev_io->u.bdev.stored_user_cb); 3051 } else { 3052 spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, spdk_bdev_write_zeroes_split); 3053 } 3054 spdk_bdev_io_submit(bdev_io); 3055 } 3056 3057 struct set_qos_limit_ctx { 3058 void (*cb_fn)(void *cb_arg, int status); 3059 void *cb_arg; 3060 struct spdk_bdev *bdev; 3061 }; 3062 3063 static void 3064 _spdk_bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 3065 { 3066 pthread_mutex_lock(&ctx->bdev->mutex); 3067 ctx->bdev->qos_mod_in_progress = false; 3068 pthread_mutex_unlock(&ctx->bdev->mutex); 3069 3070 ctx->cb_fn(ctx->cb_arg, status); 3071 free(ctx); 3072 } 3073 3074 static void 3075 _spdk_bdev_disable_qos_done(void *cb_arg) 3076 { 3077 struct set_qos_limit_ctx *ctx = cb_arg; 3078 struct spdk_bdev *bdev = ctx->bdev; 3079 struct spdk_bdev_qos *qos; 3080 3081 pthread_mutex_lock(&bdev->mutex); 3082 qos = bdev->qos; 3083 bdev->qos = NULL; 3084 pthread_mutex_unlock(&bdev->mutex); 3085 3086 _spdk_bdev_abort_queued_io(&qos->queued, qos->ch); 3087 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3088 spdk_poller_unregister(&qos->poller); 3089 3090 free(qos); 3091 3092 _spdk_bdev_set_qos_limit_done(ctx, 0); 3093 } 3094 3095 static void 3096 _spdk_bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) 3097 { 3098 void *io_device = spdk_io_channel_iter_get_io_device(i); 3099 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3100 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3101 struct spdk_thread *thread; 3102 3103 pthread_mutex_lock(&bdev->mutex); 3104 thread = bdev->qos->thread; 3105 pthread_mutex_unlock(&bdev->mutex); 3106 3107 spdk_thread_send_msg(thread, _spdk_bdev_disable_qos_done, ctx); 3108 } 3109 3110 static void 3111 _spdk_bdev_disable_qos_msg(struct spdk_io_channel_iter *i) 3112 { 3113 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 3114 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 3115 3116 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 3117 3118 spdk_for_each_channel_continue(i, 0); 3119 } 3120 3121 static void 3122 _spdk_bdev_update_qos_limit_iops_msg(void *cb_arg) 3123 { 3124 struct set_qos_limit_ctx *ctx = cb_arg; 3125 struct spdk_bdev *bdev = ctx->bdev; 3126 3127 pthread_mutex_lock(&bdev->mutex); 3128 spdk_bdev_qos_update_max_quota_per_timeslice(bdev->qos); 3129 pthread_mutex_unlock(&bdev->mutex); 3130 3131 _spdk_bdev_set_qos_limit_done(ctx, 0); 3132 } 3133 3134 static void 3135 _spdk_bdev_enable_qos_msg(struct spdk_io_channel_iter *i) 3136 { 3137 void *io_device = spdk_io_channel_iter_get_io_device(i); 3138 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3139 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 3140 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 3141 int rc; 3142 3143 
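	/* Enable QoS for this channel while holding the bdev lock; any failure is
	 * passed to the channel iterator and surfaces in _spdk_bdev_enable_qos_done(). */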
pthread_mutex_lock(&bdev->mutex); 3144 rc = _spdk_bdev_enable_qos(bdev, bdev_ch); 3145 pthread_mutex_unlock(&bdev->mutex); 3146 spdk_for_each_channel_continue(i, rc); 3147 } 3148 3149 static void 3150 _spdk_bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) 3151 { 3152 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3153 3154 _spdk_bdev_set_qos_limit_done(ctx, status); 3155 } 3156 3157 void 3158 spdk_bdev_set_qos_limit_iops(struct spdk_bdev *bdev, uint64_t ios_per_sec, 3159 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 3160 { 3161 struct set_qos_limit_ctx *ctx; 3162 3163 if (ios_per_sec > 0 && ios_per_sec % SPDK_BDEV_QOS_MIN_IOS_PER_SEC) { 3164 SPDK_ERRLOG("Requested ios_per_sec limit %" PRIu64 " is not a multiple of %u\n", 3165 ios_per_sec, SPDK_BDEV_QOS_MIN_IOS_PER_SEC); 3166 cb_fn(cb_arg, -EINVAL); 3167 return; 3168 } 3169 3170 ctx = calloc(1, sizeof(*ctx)); 3171 if (ctx == NULL) { 3172 cb_fn(cb_arg, -ENOMEM); 3173 return; 3174 } 3175 3176 ctx->cb_fn = cb_fn; 3177 ctx->cb_arg = cb_arg; 3178 ctx->bdev = bdev; 3179 3180 pthread_mutex_lock(&bdev->mutex); 3181 if (bdev->qos_mod_in_progress) { 3182 pthread_mutex_unlock(&bdev->mutex); 3183 free(ctx); 3184 cb_fn(cb_arg, -EAGAIN); 3185 return; 3186 } 3187 bdev->qos_mod_in_progress = true; 3188 3189 if (ios_per_sec > 0) { 3190 if (bdev->qos == NULL) { 3191 /* Enabling */ 3192 bdev->qos = calloc(1, sizeof(*bdev->qos)); 3193 if (!bdev->qos) { 3194 pthread_mutex_unlock(&bdev->mutex); 3195 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 3196 free(ctx); 3197 cb_fn(cb_arg, -ENOMEM); 3198 return; 3199 } 3200 3201 bdev->qos->iops_rate_limit = ios_per_sec; 3202 spdk_for_each_channel(__bdev_to_io_dev(bdev), 3203 _spdk_bdev_enable_qos_msg, ctx, 3204 _spdk_bdev_enable_qos_done); 3205 } else { 3206 /* Updating */ 3207 bdev->qos->iops_rate_limit = ios_per_sec; 3208 spdk_thread_send_msg(bdev->qos->thread, _spdk_bdev_update_qos_limit_iops_msg, ctx); 3209 } 3210 } else { 3211 if (bdev->qos != NULL) { 3212 /* Disabling */ 3213 spdk_for_each_channel(__bdev_to_io_dev(bdev), 3214 _spdk_bdev_disable_qos_msg, ctx, 3215 _spdk_bdev_disable_qos_msg_done); 3216 } else { 3217 pthread_mutex_unlock(&bdev->mutex); 3218 _spdk_bdev_set_qos_limit_done(ctx, 0); 3219 return; 3220 } 3221 } 3222 3223 pthread_mutex_unlock(&bdev->mutex); 3224 } 3225 3226 SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV) 3227
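
/*
 * Usage sketch (illustrative only, not compiled into the library): one way an
 * application thread might drive the QoS rate-limit API implemented above. The
 * bdev name "Malloc0" and the my_qos_done()/my_enable_qos() helpers are
 * hypothetical placeholders, not part of this file.
 *
 *	static void
 *	my_qos_done(void *cb_arg, int status)
 *	{
 *		if (status != 0) {
 *			printf("Failed to update QoS limit: %d\n", status);
 *		}
 *	}
 *
 *	static void
 *	my_enable_qos(void)
 *	{
 *		struct spdk_bdev *bdev = spdk_bdev_get_by_name("Malloc0");
 *
 *		if (bdev != NULL) {
 *			// 0 disables QoS; a non-zero limit must be a multiple of
 *			// SPDK_BDEV_QOS_MIN_IOS_PER_SEC.
 *			spdk_bdev_set_qos_limit_iops(bdev, 20000, my_qos_done, NULL);
 *		}
 *	}
 */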