/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"
#include "spdk/conf.h"

#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/event.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/util.h"
#include "spdk/trace.h"

#include "spdk/bdev_module.h"
#include "spdk_internal/log.h"
#include "spdk/string.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define BUF_SMALL_POOL_SIZE			8192
#define BUF_LARGE_POOL_SIZE			1024
#define NOMEM_THRESHOLD_COUNT			8
#define ZERO_BUFFER_SIZE			0x100000

#define OWNER_BDEV		0x2

#define OBJECT_BDEV_IO		0x2

#define TRACE_GROUP_BDEV	0x3
#define TRACE_BDEV_IO_START	SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x0)
#define TRACE_BDEV_IO_DONE	SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x1)

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		10000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(10 * 1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX

#define SPDK_BDEV_POOL_ALIGNMENT 512

static const char *qos_conf_type[] = {"Limit_IOPS",
				      "Limit_BPS", "Limit_Read_BPS", "Limit_Write_BPS"
				     };
static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	struct spdk_mempool *buf_small_pool;
	struct spdk_mempool *buf_large_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;

	bool init_complete;
	bool module_init_complete;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain	*domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.init_complete = false,
	.module_init_complete = false,
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 *  For remaining bytes, allowed to run negative if an I/O is submitted when
	 *  some bytes are remaining, but the I/O is bigger than that amount. The
	 *  excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO. */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update for the submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limits, one entry per rate limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	bdev_io_stailq_t need_buf_small;
	bdev_io_stailq_t need_buf_large;

	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t	per_thread_cache_count;
	uint32_t	bdev_io_cache_size;

	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their I/O awaiting retry here. This makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t		nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t		nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel	*shared_ch;

	/* Refcount of bdev channels using this resource */
	uint32_t		ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat stat;

	/*
	 * Count of I/O submitted through this channel and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

	bdev_io_tailq_t		queued_resets;

	uint32_t		flags;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t		start_tsc;
	uint64_t		interval_tsc;
	__itt_string_handle	*handle;
	struct spdk_bdev_io_stat prev_stat;
#endif

};

struct spdk_bdev_desc {
	struct spdk_bdev		*bdev;
	struct spdk_thread		*thread;
	spdk_bdev_remove_cb_t		remove_cb;
	void				*remove_ctx;
	bool				remove_scheduled;
	bool				closed;
	bool				write;
	TAILQ_ENTRY(spdk_bdev_desc)	link;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))

static void _spdk_bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success,
		void *cb_arg);
static void _spdk_bdev_write_zero_buffer_next(void *_bdev_io);

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts)
{
	*opts = g_bdev_opts;
}

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

	g_bdev_opts = *opts;
	return 0;
}
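
/*
 * Illustrative sketch (not part of the upstream code): how an application might
 * tune the global bdev_io pool before calling spdk_bdev_initialize(). The values
 * below are arbitrary examples; the only real constraint enforced above is
 * bdev_io_pool_size >= bdev_io_cache_size * (spdk_thread_get_count() + 1).
 *
 *	struct spdk_bdev_opts opts;
 *
 *	spdk_bdev_get_opts(&opts);
 *	opts.bdev_io_pool_size = 16 * 1024;
 *	opts.bdev_io_cache_size = 128;	// with 4 threads this needs a pool of at least 128 * 5 = 640
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		// requested pool is too small for the current thread count
 *	}
 */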

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_module == NULL) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_alias *tmp;
	struct spdk_bdev *bdev = spdk_bdev_first();

	while (bdev != NULL) {
		if (strcmp(bdev_name, bdev->name) == 0) {
			return bdev;
		}

		TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
			if (strcmp(bdev_name, tmp->alias) == 0) {
				return bdev;
			}
		}

		bdev = spdk_bdev_next(bdev);
	}

	return NULL;
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

static bool
_is_buf_allocated(struct iovec *iovs)
{
	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}

static void
_copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt)
{
	int i;
	size_t len;

	for (i = 0; i < iovcnt; i++) {
		len = spdk_min(iovs[i].iov_len, buf_len);
		memcpy(buf, iovs[i].iov_base, len);
		buf += len;
		buf_len -= len;
	}
}

static void
_copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len)
{
	int i;
	size_t len;

	for (i = 0; i < iovcnt; i++) {
		len = spdk_min(iovs[i].iov_len, buf_len);
		memcpy(iovs[i].iov_base, buf, len);
		buf += len;
		buf_len -= len;
	}
}

static void
_bdev_io_set_bounce_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	/* save original iovec */
	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;
	/* if this is write path, copy data from original buffer to bounce buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt);
	}
}

static void
spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_mempool *pool;
	struct spdk_bdev_io *tmp;
	void *buf, *aligned_buf;
	bdev_io_stailq_t *stailq;
	struct spdk_bdev_mgmt_channel *ch;
	uint64_t buf_len;
	uint64_t alignment;
	bool buf_allocated;

	buf = bdev_io->internal.buf;
	buf_len = bdev_io->internal.buf_len;
	alignment = spdk_bdev_get_buf_align(bdev_io->bdev);
	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	bdev_io->internal.buf = NULL;

	if (buf_len + alignment <= SPDK_BDEV_SMALL_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &ch->need_buf_large;
	}

	if (STAILQ_EMPTY(stailq)) {
		spdk_mempool_put(pool, buf);
	} else {
		tmp = STAILQ_FIRST(stailq);

		alignment = spdk_bdev_get_buf_align(tmp->bdev);
		buf_allocated = _is_buf_allocated(tmp->u.bdev.iovs);

		aligned_buf = (void *)(((uintptr_t)buf +
					(alignment - 1)) & ~(alignment - 1));
		if (buf_allocated) {
			_bdev_io_set_bounce_buf(tmp, aligned_buf, tmp->internal.buf_len);
		} else {
			spdk_bdev_io_set_buf(tmp, aligned_buf, tmp->internal.buf_len);
		}

		STAILQ_REMOVE_HEAD(stailq, internal.buf_link);
		tmp->internal.buf = buf;
		tmp->internal.get_buf_cb(tmp->internal.ch->channel, tmp);
	}
}

static void
_bdev_io_unset_bounce_buf(struct spdk_bdev_io *bdev_io)
{
	/* if this is read path, copy data from bounce buffer to original buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
	    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		_copy_buf_to_iovs(bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt,
				  bdev_io->internal.bounce_iov.iov_base, bdev_io->internal.bounce_iov.iov_len);
	}
	/* set original buffer for this io */
	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
	/* disable bounce buffer for this io */
	bdev_io->internal.orig_iovcnt = 0;
	bdev_io->internal.orig_iovs = NULL;
	/* return bounce buffer to the pool */
	spdk_bdev_io_put_buf(bdev_io);
}
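
/*
 * Illustrative sketch (assumption, not upstream code): typical use of the
 * spdk_bdev_io_get_buf() path defined below from a bdev module's read handler.
 * The "example_" names are hypothetical; the real contract is only that the
 * callback runs once bdev_io->u.bdev.iovs points at an aligned buffer, either
 * immediately or later when a buffer is returned to the pool.
 *
 *	static void
 *	example_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
 *	{
 *		// buffer is now present and aligned to spdk_bdev_get_buf_align()
 *		example_submit_read(ch, bdev_io);
 *	}
 *
 *	...
 *	spdk_bdev_io_get_buf(bdev_io, example_read_get_buf_cb,
 *			     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 *
 * The rounding used here and in spdk_bdev_io_put_buf() is the usual power-of-two
 * round-up: aligned_buf = (buf + (alignment - 1)) & ~(alignment - 1).
 */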

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_mempool *pool;
	bdev_io_stailq_t *stailq;
	void *buf, *aligned_buf;
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	uint64_t alignment;
	bool buf_allocated;

	assert(cb != NULL);
	assert(bdev_io->u.bdev.iovs != NULL);

	alignment = spdk_bdev_get_buf_align(bdev_io->bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);

	if (buf_allocated &&
	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
		/* Buffer already present and aligned */
		cb(bdev_io->internal.ch->channel, bdev_io);
		return;
	}

	assert(len + alignment <= SPDK_BDEV_LARGE_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT);
	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	bdev_io->internal.buf_len = len;
	bdev_io->internal.get_buf_cb = cb;

	if (len + alignment <= SPDK_BDEV_SMALL_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &mgmt_ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &mgmt_ch->need_buf_large;
	}

	buf = spdk_mempool_get(pool);

	if (!buf) {
		STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link);
	} else {
		aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

		if (buf_allocated) {
			_bdev_io_set_bounce_buf(bdev_io, aligned_buf, len);
		} else {
			spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
		}
		bdev_io->internal.buf = buf;
		bdev_io->internal.get_buf_cb(bdev_io->internal.ch->channel, bdev_io);
	}
}

static int
spdk_bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

void
spdk_bdev_config_text(FILE *fp)
{
	struct spdk_bdev_module *bdev_module;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->config_text) {
			bdev_module->config_text(fp);
		}
	}
}

static void
spdk_bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	int i;
	struct spdk_bdev_qos *qos = bdev->internal.qos;
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	if (!qos) {
		return;
	}

	spdk_bdev_get_qos_rate_limits(bdev, limits);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "set_bdev_qos_limit");
	spdk_json_write_name(w, "params");

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "name", bdev->name);
	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] > 0) {
			spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
		}
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

void
spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_module *bdev_module;
	struct spdk_bdev *bdev;

	assert(w != NULL);

	spdk_json_write_array_begin(w);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "set_bdev_options");
	spdk_json_write_name(w, "params");
	spdk_json_write_object_begin(w);
	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
	spdk_json_write_object_end(w);
	spdk_json_write_object_end(w);

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->config_json) {
			bdev_module->config_json(w);
		}
	}

	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
		spdk_bdev_qos_config_json(bdev, w);

		if (bdev->fn_table->write_config_json) {
			bdev->fn_table->write_config_json(bdev, w);
		}
	}

	spdk_json_write_array_end(w);
}

static int
spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;
	uint32_t i;

	STAILQ_INIT(&ch->need_buf_small);
	STAILQ_INIT(&ch->need_buf_large);

	STAILQ_INIT(&ch->per_thread_cache);
	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;

	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
	ch->per_thread_cache_count = 0;
	for (i = 0; i < ch->bdev_io_cache_size; i++) {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		assert(bdev_io != NULL);
		ch->per_thread_cache_count++;
		STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, internal.buf_link);
	}

	TAILQ_INIT(&ch->shared_resources);
	TAILQ_INIT(&ch->io_wait_queue);

	return 0;
}

static void
spdk_bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;

	if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
		SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n");
	}

	if (!TAILQ_EMPTY(&ch->shared_resources)) {
		SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n");
	}

	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}

	assert(ch->per_thread_cache_count == 0);
}

static void
spdk_bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	if (rc == 0) {
		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (m->init_complete) {
				m->init_complete();
			}
		}
	}

	cb_fn(cb_arg, rc);
}

static void
spdk_bdev_module_action_complete(void)
{
	struct spdk_bdev_module *m;

	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->internal.action_in_progress > 0) {
			return;
		}
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
	 */
	spdk_bdev_init_complete(0);
}

static void
spdk_bdev_module_action_done(struct spdk_bdev_module *module)
{
	assert(module->internal.action_in_progress > 0);
	module->internal.action_in_progress--;
	spdk_bdev_module_action_complete();
}

void
spdk_bdev_module_init_done(struct spdk_bdev_module *module)
{
	spdk_bdev_module_action_done(module);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
{
	spdk_bdev_module_action_done(module);
}

/** The last initialized bdev module */
static struct spdk_bdev_module *g_resume_bdev_module = NULL;

static int
spdk_bdev_modules_init(void)
{
	struct spdk_bdev_module *module;
	int rc = 0;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		g_resume_bdev_module = module;
		rc = module->module_init();
		if (rc != 0) {
			return rc;
		}
	}

	g_resume_bdev_module = NULL;
	return 0;
}


static void
spdk_bdev_init_failed_complete(void *cb_arg)
{
	spdk_bdev_init_complete(-1);
}

static void
spdk_bdev_init_failed(void *cb_arg)
{
	spdk_bdev_finish(spdk_bdev_init_failed_complete, NULL);
}

void
spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
{
	struct spdk_conf_section *sp;
	struct spdk_bdev_opts bdev_opts;
	int32_t bdev_io_pool_size, bdev_io_cache_size;
	int cache_size;
	int rc = 0;
	char mempool_name[32];

	assert(cb_fn != NULL);

	sp = spdk_conf_find_section(NULL, "Bdev");
	if (sp != NULL) {
		spdk_bdev_get_opts(&bdev_opts);

		bdev_io_pool_size = spdk_conf_section_get_intval(sp, "BdevIoPoolSize");
		if (bdev_io_pool_size >= 0) {
			bdev_opts.bdev_io_pool_size = bdev_io_pool_size;
		}

		bdev_io_cache_size = spdk_conf_section_get_intval(sp, "BdevIoCacheSize");
		if (bdev_io_cache_size >= 0) {
			bdev_opts.bdev_io_cache_size = bdev_io_cache_size;
		}

		if (spdk_bdev_set_opts(&bdev_opts)) {
			spdk_bdev_init_complete(-1);
			return;
		}

		assert(memcmp(&bdev_opts, &g_bdev_opts, sizeof(bdev_opts)) == 0);
	}

	g_init_cb_fn = cb_fn;
	g_init_cb_arg = cb_arg;

	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());

	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
				  g_bdev_opts.bdev_io_pool_size,
				  sizeof(struct spdk_bdev_io) +
				  spdk_bdev_module_get_max_ctx_size(),
				  0,
				  SPDK_ENV_SOCKET_ID_ANY);

	if (g_bdev_mgr.bdev_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	/**
	 * Ensure no more than half of the total buffers end up in local caches, by
	 * using spdk_thread_get_count() to determine how many local caches we need
	 * to account for.
	 */
	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_thread_get_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());

	g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
				    BUF_SMALL_POOL_SIZE,
				    SPDK_BDEV_SMALL_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_small_pool) {
		SPDK_ERRLOG("create rbuf small pool failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_thread_get_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());

	g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
				    BUF_LARGE_POOL_SIZE,
				    SPDK_BDEV_LARGE_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_large_pool) {
		SPDK_ERRLOG("create rbuf large pool failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

	g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
				 NULL);
	if (!g_bdev_mgr.zero_buffer) {
		SPDK_ERRLOG("create bdev zero buffer failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

#ifdef SPDK_CONFIG_VTUNE
	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
#endif

	spdk_io_device_register(&g_bdev_mgr, spdk_bdev_mgmt_channel_create,
				spdk_bdev_mgmt_channel_destroy,
				sizeof(struct spdk_bdev_mgmt_channel),
				"bdev_mgr");

	rc = spdk_bdev_modules_init();
	g_bdev_mgr.module_init_complete = true;
	if (rc != 0) {
		SPDK_ERRLOG("bdev modules init failed\n");
		spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_init_failed, NULL);
		return;
	}

	spdk_bdev_module_action_complete();
}
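
/*
 * Illustrative sketch (assumption about typical usage, not upstream code): the
 * legacy INI-style configuration parsed above would look roughly like this,
 * with the values below being arbitrary examples that still satisfy the
 * spdk_bdev_set_opts() check:
 *
 *	[Bdev]
 *	  BdevIoPoolSize 65536
 *	  BdevIoCacheSize 256
 *
 * Both keys are optional; anything missing keeps the defaults from g_bdev_opts.
 */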

static void
spdk_bdev_mgr_unregister_cb(void *io_device)
{
	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;

	if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
		SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
			    g_bdev_opts.bdev_io_pool_size);
	}

	if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) {
		SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
			    BUF_SMALL_POOL_SIZE);
		assert(false);
	}

	if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) {
		SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
			    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
			    BUF_LARGE_POOL_SIZE);
		assert(false);
	}

	spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
	spdk_mempool_free(g_bdev_mgr.buf_small_pool);
	spdk_mempool_free(g_bdev_mgr.buf_large_pool);
	spdk_dma_free(g_bdev_mgr.zero_buffer);

	cb_fn(g_fini_cb_arg);
	g_fini_cb_fn = NULL;
	g_fini_cb_arg = NULL;
	g_bdev_mgr.init_complete = false;
	g_bdev_mgr.module_init_complete = false;
}

static void
spdk_bdev_module_finish_iter(void *arg)
{
	struct spdk_bdev_module *bdev_module;

	/* Start iterating from the last touched module */
	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
	} else {
		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
					 internal.tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini) {
			/* Save our place so we can resume later. We must
			 * save the variable here, before calling module_fini()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_finish_done() and re-enter
			 * this function to continue iterating. */
			g_resume_bdev_module = bdev_module;
		}

		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}

		if (bdev_module->async_fini) {
			return;
		}

		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
					 internal.tailq);
	}

	g_resume_bdev_module = NULL;
	spdk_io_device_unregister(&g_bdev_mgr, spdk_bdev_mgr_unregister_cb);
}

void
spdk_bdev_module_finish_done(void)
{
	if (spdk_get_thread() != g_fini_thread) {
		spdk_thread_send_msg(g_fini_thread, spdk_bdev_module_finish_iter, NULL);
	} else {
		spdk_bdev_module_finish_iter(NULL);
	}
}

static void
_spdk_bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
{
	struct spdk_bdev *bdev = cb_arg;

	if (bdeverrno && bdev) {
		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
			     bdev->name);

		/*
		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
		 * bdev; try to continue by manually removing this bdev from the list and continuing
		 * with the next bdev in the list.
		 */
		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
	}

	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n");
		/*
		 * Bdev module finish needs to be deferred as we might be in the middle of some context
		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
		 * after returning.
		 */
		spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_module_finish_iter, NULL);
		return;
	}

	/*
	 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem
	 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
	 * to detect clean shutdown as opposed to run-time hot removal of the underlying
	 * base bdevs.
	 *
	 * Also, walk the list in the reverse order.
	 */
	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
		if (bdev->internal.claim_module != NULL) {
			SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Skipping claimed bdev '%s'(<-'%s').\n",
				      bdev->name, bdev->internal.claim_module->name);
			continue;
		}

		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name);
		spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev);
		return;
	}

	/*
	 * If any bdev fails to unclaim underlying bdev properly, we may face the
	 * case of bdev list consisting of claimed bdevs only (if claims are managed
	 * correctly, this would mean there's a loop in the claims graph which is
	 * clearly impossible). Warn and unregister last bdev on the list then.
	 */
	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
		SPDK_ERRLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
		spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev);
		return;
	}
}

void
spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_module *m;

	assert(cb_fn != NULL);

	g_fini_thread = spdk_get_thread();

	g_fini_cb_fn = cb_fn;
	g_fini_cb_arg = cb_arg;

	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->fini_start) {
			m->fini_start();
		}
	}

	_spdk_bdev_finish_unregister_bdevs_iter(NULL, 0);
}

static struct spdk_bdev_io *
spdk_bdev_get_io(struct spdk_bdev_channel *channel)
{
	struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
	struct spdk_bdev_io *bdev_io;

	if (ch->per_thread_cache_count > 0) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
	} else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
		/*
		 * Don't try to look for bdev_ios in the global pool if there are
		 * waiters on bdev_ios - we don't want this caller to jump the line.
		 */
		bdev_io = NULL;
	} else {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
	}

	return bdev_io;
}

void
spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_mgmt_channel *ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	assert(bdev_io != NULL);
	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);

	if (bdev_io->internal.buf != NULL) {
		spdk_bdev_io_put_buf(bdev_io);
	}

	if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
		ch->per_thread_cache_count++;
		STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, internal.buf_link);
		while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
			struct spdk_bdev_io_wait_entry *entry;

			entry = TAILQ_FIRST(&ch->io_wait_queue);
			TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
			entry->cb_fn(entry->cb_arg);
		}
	} else {
		/* We should never have a full cache with entries on the io wait queue. */
		assert(TAILQ_EMPTY(&ch->io_wait_queue));
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}
}
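
/*
 * Illustrative sketch (assumption, not upstream code): the io_wait_queue drained
 * in spdk_bdev_free_io() above is what makes the standard -ENOMEM retry pattern
 * work for callers that temporarily exhaust the bdev_io pool. The names prefixed
 * with "example_" are hypothetical:
 *
 *	static void
 *	example_retry_read(void *arg)
 *	{
 *		struct example_ctx *ctx = arg;
 *
 *		example_submit_read(ctx);	// re-issue the I/O that previously got -ENOMEM
 *	}
 *
 *	static void
 *	example_submit_read(struct example_ctx *ctx)
 *	{
 *		int rc;
 *
 *		rc = spdk_bdev_read_blocks(ctx->desc, ctx->ch, ctx->buf, ctx->offset_blocks,
 *					   ctx->num_blocks, example_read_done, ctx);
 *		if (rc == -ENOMEM) {
 *			ctx->wait_entry.bdev = spdk_bdev_desc_get_bdev(ctx->desc);
 *			ctx->wait_entry.cb_fn = example_retry_read;
 *			ctx->wait_entry.cb_arg = ctx;
 *			spdk_bdev_queue_io_wait(ctx->wait_entry.bdev, ctx->ch, &ctx->wait_entry);
 *		}
 *	}
 */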

static bool
_spdk_bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
{
	assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);

	switch (limit) {
	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
		return true;
	case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
	case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
	case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
		return false;
	case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
	default:
		return false;
	}
}

static bool
_spdk_bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_UNMAP:
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		return true;
	default:
		return false;
	}
}

static bool
_spdk_bdev_is_read_io(struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		/* Bit 1 (0x2) set for read operation */
		if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
			return true;
		} else {
			return false;
		}
	case SPDK_BDEV_IO_TYPE_READ:
		return true;
	default:
		return false;
	}
}

static uint64_t
_spdk_bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		return bdev_io->u.nvme_passthru.nbytes;
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_UNMAP:
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		return bdev_io->u.bdev.num_blocks * bdev->blocklen;
	default:
		return 0;
	}
}

static bool
_spdk_bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) {
		return true;
	} else {
		return false;
	}
}

static bool
_spdk_bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (_spdk_bdev_is_read_io(io) == false) {
		return false;
	}

	return _spdk_bdev_qos_rw_queue_io(limit, io);
}

static bool
_spdk_bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (_spdk_bdev_is_read_io(io) == true) {
		return false;
	}

	return _spdk_bdev_qos_rw_queue_io(limit, io);
}

static void
_spdk_bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	limit->remaining_this_timeslice--;
}

static void
_spdk_bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	limit->remaining_this_timeslice -= _spdk_bdev_get_io_size_in_byte(io);
}

static void
_spdk_bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (_spdk_bdev_is_read_io(io) == false) {
		return;
	}

	return _spdk_bdev_qos_rw_bps_update_quota(limit, io);
}

static void
_spdk_bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (_spdk_bdev_is_read_io(io) == true) {
		return;
	}

	return _spdk_bdev_qos_rw_bps_update_quota(limit, io);
}

static void
_spdk_bdev_qos_set_ops(struct spdk_bdev_qos *qos)
{
	int i;

	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
			qos->rate_limits[i].queue_io = NULL;
			qos->rate_limits[i].update_quota = NULL;
			continue;
		}

		switch (i) {
		case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
			qos->rate_limits[i].queue_io = _spdk_bdev_qos_rw_queue_io;
			qos->rate_limits[i].update_quota = _spdk_bdev_qos_rw_iops_update_quota;
			break;
		case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
			qos->rate_limits[i].queue_io = _spdk_bdev_qos_rw_queue_io;
			qos->rate_limits[i].update_quota = _spdk_bdev_qos_rw_bps_update_quota;
			break;
		case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
			qos->rate_limits[i].queue_io = _spdk_bdev_qos_r_queue_io;
			qos->rate_limits[i].update_quota = _spdk_bdev_qos_r_bps_update_quota;
			break;
		case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
			qos->rate_limits[i].queue_io = _spdk_bdev_qos_w_queue_io;
			qos->rate_limits[i].update_quota = _spdk_bdev_qos_w_bps_update_quota;
			break;
		default:
			break;
		}
	}
}

static int
_spdk_bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos)
{
	struct spdk_bdev_io		*bdev_io = NULL, *tmp = NULL;
	struct spdk_bdev		*bdev = ch->bdev;
	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
	int				i, submitted_ios = 0;

	TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) {
		if (_spdk_bdev_qos_io_to_limit(bdev_io) == true) {
			for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
				if (!qos->rate_limits[i].queue_io) {
					continue;
				}

				if (qos->rate_limits[i].queue_io(&qos->rate_limits[i],
								 bdev_io) == true) {
					return submitted_ios;
				}
			}
			for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
				if (!qos->rate_limits[i].update_quota) {
					continue;
				}

				qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io);
			}
		}

		TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
		ch->io_outstanding++;
		shared_resource->io_outstanding++;
		bdev->fn_table->submit_request(ch->channel, bdev_io);
		submitted_ios++;
	}

	return submitted_ios;
}

static void
_spdk_bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn)
{
	int rc;

	bdev_io->internal.waitq_entry.bdev = bdev_io->bdev;
	bdev_io->internal.waitq_entry.cb_fn = cb_fn;
	bdev_io->internal.waitq_entry.cb_arg = bdev_io;
	rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch),
				     &bdev_io->internal.waitq_entry);
	if (rc != 0) {
		SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
	}
}

static bool
_spdk_bdev_io_type_can_split(uint8_t type)
{
	assert(type != SPDK_BDEV_IO_TYPE_INVALID);
	assert(type < SPDK_BDEV_NUM_IO_TYPES);

	/* Only split READ and WRITE I/O.  Theoretically other types of I/O like
	 * UNMAP could be split, but these types of I/O are typically much larger
	 * in size (sometimes the size of the entire block device), and the bdev
	 * module can more efficiently split these types of I/O.  Plus those types
	 * of I/O do not have a payload, which makes the splitting process simpler.
	 */
	if (type == SPDK_BDEV_IO_TYPE_READ || type == SPDK_BDEV_IO_TYPE_WRITE) {
		return true;
	} else {
		return false;
	}
}

static bool
_spdk_bdev_io_should_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t start_stripe, end_stripe;
	uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary;

	if (io_boundary == 0) {
		return false;
	}

	if (!_spdk_bdev_io_type_can_split(bdev_io->type)) {
		return false;
	}

	start_stripe = bdev_io->u.bdev.offset_blocks;
	end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1;
	/* Avoid expensive div operations if possible.  These spdk_u32 functions are very cheap. */
	if (spdk_likely(spdk_u32_is_pow2(io_boundary))) {
		start_stripe >>= spdk_u32log2(io_boundary);
		end_stripe >>= spdk_u32log2(io_boundary);
	} else {
		start_stripe /= io_boundary;
		end_stripe /= io_boundary;
	}
	return (start_stripe != end_stripe);
}

static uint32_t
_to_next_boundary(uint64_t offset, uint32_t boundary)
{
	return (boundary - (offset % boundary));
}

static void
_spdk_bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);

static void
_spdk_bdev_io_split_with_payload(void *_bdev_io)
{
	struct spdk_bdev_io *bdev_io = _bdev_io;
	uint64_t current_offset, remaining;
	uint32_t blocklen, to_next_boundary, to_next_boundary_bytes;
	struct iovec *parent_iov, *iov;
	uint64_t parent_iov_offset, iov_len;
	uint32_t parent_iovpos, parent_iovcnt, child_iovcnt, iovcnt;
	int rc;

	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
	current_offset = bdev_io->u.bdev.split_current_offset_blocks;
	blocklen = bdev_io->bdev->blocklen;
	parent_iov_offset = (current_offset - bdev_io->u.bdev.offset_blocks) * blocklen;
	parent_iovcnt = bdev_io->u.bdev.iovcnt;

	for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) {
		parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
		if (parent_iov_offset < parent_iov->iov_len) {
			break;
		}
		parent_iov_offset -= parent_iov->iov_len;
	}

	child_iovcnt = 0;
	while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) {
		to_next_boundary = _to_next_boundary(current_offset, bdev_io->bdev->optimal_io_boundary);
		to_next_boundary = spdk_min(remaining, to_next_boundary);
		to_next_boundary_bytes = to_next_boundary * blocklen;
		iov = &bdev_io->child_iov[child_iovcnt];
		iovcnt = 0;
		while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt &&
		       child_iovcnt < BDEV_IO_NUM_CHILD_IOV) {
			parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
			iov_len = spdk_min(to_next_boundary_bytes, parent_iov->iov_len - parent_iov_offset);
			to_next_boundary_bytes -= iov_len;

			bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset;
			bdev_io->child_iov[child_iovcnt].iov_len = iov_len;

			if (iov_len < parent_iov->iov_len - parent_iov_offset) {
				parent_iov_offset += iov_len;
			} else {
				parent_iovpos++;
				parent_iov_offset = 0;
			}
			child_iovcnt++;
			iovcnt++;
		}

		if (to_next_boundary_bytes > 0) {
			/* We had to stop this child I/O early because we ran out of
			 *  child_iov space.  Make sure the iovs collected are valid and
			 *  then adjust to_next_boundary before starting the child I/O.
			 */
			if ((to_next_boundary_bytes % blocklen) != 0) {
				SPDK_ERRLOG("Remaining %" PRIu32 " is not multiple of block size %" PRIu32 "\n",
					    to_next_boundary_bytes, blocklen);
				bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
				if (bdev_io->u.bdev.split_outstanding == 0) {
					bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
				}
				return;
			}
			to_next_boundary -= to_next_boundary_bytes / blocklen;
		}

		bdev_io->u.bdev.split_outstanding++;

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
			rc = spdk_bdev_readv_blocks(bdev_io->internal.desc,
						    spdk_io_channel_from_ctx(bdev_io->internal.ch),
						    iov, iovcnt, current_offset, to_next_boundary,
						    _spdk_bdev_io_split_done, bdev_io);
		} else {
			rc = spdk_bdev_writev_blocks(bdev_io->internal.desc,
						     spdk_io_channel_from_ctx(bdev_io->internal.ch),
						     iov, iovcnt, current_offset, to_next_boundary,
						     _spdk_bdev_io_split_done, bdev_io);
		}

		if (rc == 0) {
			current_offset += to_next_boundary;
			remaining -= to_next_boundary;
			bdev_io->u.bdev.split_current_offset_blocks = current_offset;
			bdev_io->u.bdev.split_remaining_num_blocks = remaining;
		} else {
			bdev_io->u.bdev.split_outstanding--;
			if (rc == -ENOMEM) {
				if (bdev_io->u.bdev.split_outstanding == 0) {
					/* No I/O is outstanding. Hence we should wait here. */
					_spdk_bdev_queue_io_wait_with_cb(bdev_io,
									 _spdk_bdev_io_split_with_payload);
				}
			} else {
				bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
				if (bdev_io->u.bdev.split_outstanding == 0) {
					bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
				}
			}

			return;
		}
	}
}

static void
_spdk_bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *parent_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (!success) {
		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	parent_io->u.bdev.split_outstanding--;
	if (parent_io->u.bdev.split_outstanding != 0) {
		return;
	}

	/*
	 * Parent I/O finishes when all blocks are consumed or there is any failure of
	 * child I/O and no outstanding child I/O.
	 */
	if (parent_io->u.bdev.split_remaining_num_blocks == 0 ||
	    parent_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS) {
		parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
				       parent_io->internal.caller_ctx);
		return;
	}

	/*
	 * Continue with the splitting process.  This function will complete the parent I/O if the
	 * splitting is done.
	 */
	_spdk_bdev_io_split_with_payload(parent_io);
}
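
/*
 * Worked example for the splitting logic above (numbers chosen only for
 * illustration): with optimal_io_boundary = 128 blocks, a request for 96 blocks
 * starting at offset_blocks = 96 has start_stripe = 0 and end_stripe = 1, so it
 * is split. The first child covers _to_next_boundary(96, 128) = 32 blocks
 * (offsets 96..127) and the second child covers the remaining 64 blocks starting
 * at the 128-block boundary.
 */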

static void
_spdk_bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	assert(_spdk_bdev_io_type_can_split(bdev_io->type));

	bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks;
	bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks;
	bdev_io->u.bdev.split_outstanding = 0;
	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;

	_spdk_bdev_io_split_with_payload(bdev_io);
}

static void
_spdk_bdev_io_submit(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_io_channel *ch = bdev_ch->channel;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
	uint64_t tsc;

	tsc = spdk_get_ticks();
	bdev_io->internal.submit_tsc = tsc;
	spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, bdev_io->type);
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
	bdev_io->internal.in_submit_request = true;
	if (spdk_likely(bdev_ch->flags == 0)) {
		if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
			bdev->fn_table->submit_request(ch, bdev_io);
		} else {
			bdev_ch->io_outstanding--;
			shared_resource->io_outstanding--;
			TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
		}
	} else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	} else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
		bdev_ch->io_outstanding--;
		shared_resource->io_outstanding--;
		TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link);
		_spdk_bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
	} else {
		SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
	bdev_io->internal.in_submit_request = false;
}

static void
spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_thread *thread = spdk_io_channel_get_thread(bdev_io->internal.ch->channel);

	assert(thread != NULL);
	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);

	if (bdev->split_on_optimal_io_boundary && _spdk_bdev_io_should_split(bdev_io)) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
			spdk_bdev_io_get_buf(bdev_io, _spdk_bdev_io_split,
					     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
		} else {
			_spdk_bdev_io_split(NULL, bdev_io);
		}
		return;
	}

	if (bdev_io->internal.ch->flags & BDEV_CH_QOS_ENABLED) {
		if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) {
			_spdk_bdev_io_submit(bdev_io);
		} else {
			bdev_io->internal.io_submit_ch = bdev_io->internal.ch;
			bdev_io->internal.ch = bdev->internal.qos->ch;
			spdk_thread_send_msg(bdev->internal.qos->thread, _spdk_bdev_io_submit, bdev_io);
		}
	} else {
		_spdk_bdev_io_submit(bdev_io);
	}
}

static void
spdk_bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_io_channel *ch = bdev_ch->channel;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);

	bdev_io->internal.in_submit_request = true;
	bdev->fn_table->submit_request(ch, bdev_io);
	bdev_io->internal.in_submit_request = false;
}

static void
spdk_bdev_io_init(struct spdk_bdev_io *bdev_io,
		  struct spdk_bdev *bdev, void *cb_arg,
		  spdk_bdev_io_completion_cb cb)
{
	bdev_io->bdev = bdev;
	bdev_io->internal.caller_ctx = cb_arg;
	bdev_io->internal.cb = cb;
	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
	bdev_io->internal.in_submit_request = false;
	bdev_io->internal.buf = NULL;
	bdev_io->internal.io_submit_ch = NULL;
	bdev_io->internal.orig_iovs = NULL;
	bdev_io->internal.orig_iovcnt = 0;
}

static bool
_spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
{
	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
}

bool
spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
{
	bool supported;

	supported = _spdk_bdev_io_type_supported(bdev, io_type);

	if (!supported) {
		switch (io_type) {
		case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
			/* The bdev layer will emulate write zeroes as long as write is supported. */
			supported = _spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
			break;
		default:
			break;
		}
	}

	return supported;
}

int
spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	if (bdev->fn_table->dump_info_json) {
		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
	}

	return 0;
}

static void
spdk_bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
{
	uint32_t max_per_timeslice = 0;
	int i;

	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
			qos->rate_limits[i].max_per_timeslice = 0;
			continue;
		}

		max_per_timeslice = qos->rate_limits[i].limit *
				    SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC;

		qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice,
							qos->rate_limits[i].min_per_timeslice);

		qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice;
	}

	_spdk_bdev_qos_set_ops(qos);
}
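
/*
 * Worked example for the quota calculation above (example limits only): with a
 * 1000 us timeslice, an IOPS limit of 10000 allows
 * 10000 * 1000 / 1000000 = 10 I/O per timeslice, and a bandwidth limit of
 * 10 MiB/s (10485760 bytes/s) allows 10485760 * 1000 / 1000000 = 10485 bytes
 * per timeslice. Both results are clamped from below by min_per_timeslice
 * (1 I/O or 512 bytes), so very small limits still make forward progress.
 */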
1828 */ 1829 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 1830 qos->rate_limits[i].remaining_this_timeslice = 0; 1831 } 1832 } 1833 1834 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 1835 qos->last_timeslice += qos->timeslice_size; 1836 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1837 qos->rate_limits[i].remaining_this_timeslice += 1838 qos->rate_limits[i].max_per_timeslice; 1839 } 1840 } 1841 1842 return _spdk_bdev_qos_io_submit(qos->ch, qos); 1843 } 1844 1845 static void 1846 _spdk_bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 1847 { 1848 struct spdk_bdev_shared_resource *shared_resource; 1849 1850 spdk_put_io_channel(ch->channel); 1851 1852 shared_resource = ch->shared_resource; 1853 1854 assert(ch->io_outstanding == 0); 1855 assert(shared_resource->ref > 0); 1856 shared_resource->ref--; 1857 if (shared_resource->ref == 0) { 1858 assert(shared_resource->io_outstanding == 0); 1859 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 1860 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 1861 free(shared_resource); 1862 } 1863 } 1864 1865 /* Caller must hold bdev->internal.mutex. */ 1866 static void 1867 _spdk_bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 1868 { 1869 struct spdk_bdev_qos *qos = bdev->internal.qos; 1870 int i; 1871 1872 /* Rate limiting on this bdev enabled */ 1873 if (qos) { 1874 if (qos->ch == NULL) { 1875 struct spdk_io_channel *io_ch; 1876 1877 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 1878 bdev->name, spdk_get_thread()); 1879 1880 /* No qos channel has been selected, so set one up */ 1881 1882 /* Take another reference to ch */ 1883 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 1884 assert(io_ch != NULL); 1885 qos->ch = ch; 1886 1887 qos->thread = spdk_io_channel_get_thread(io_ch); 1888 1889 TAILQ_INIT(&qos->queued); 1890 1891 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1892 if (_spdk_bdev_qos_is_iops_rate_limit(i) == true) { 1893 qos->rate_limits[i].min_per_timeslice = 1894 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 1895 } else { 1896 qos->rate_limits[i].min_per_timeslice = 1897 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 1898 } 1899 1900 if (qos->rate_limits[i].limit == 0) { 1901 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 1902 } 1903 } 1904 spdk_bdev_qos_update_max_quota_per_timeslice(qos); 1905 qos->timeslice_size = 1906 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 1907 qos->last_timeslice = spdk_get_ticks(); 1908 qos->poller = spdk_poller_register(spdk_bdev_channel_poll_qos, 1909 qos, 1910 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 1911 } 1912 1913 ch->flags |= BDEV_CH_QOS_ENABLED; 1914 } 1915 } 1916 1917 static int 1918 spdk_bdev_channel_create(void *io_device, void *ctx_buf) 1919 { 1920 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 1921 struct spdk_bdev_channel *ch = ctx_buf; 1922 struct spdk_io_channel *mgmt_io_ch; 1923 struct spdk_bdev_mgmt_channel *mgmt_ch; 1924 struct spdk_bdev_shared_resource *shared_resource; 1925 1926 ch->bdev = bdev; 1927 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 1928 if (!ch->channel) { 1929 return -1; 1930 } 1931 1932 assert(ch->histogram == NULL); 1933 if (bdev->internal.histogram_enabled) { 1934 ch->histogram = spdk_histogram_data_alloc(); 1935 if (ch->histogram == NULL) { 1936 SPDK_ERRLOG("Could not allocate histogram\n"); 1937 } 1938 } 1939 1940 mgmt_io_ch = 
spdk_get_io_channel(&g_bdev_mgr); 1941 if (!mgmt_io_ch) { 1942 spdk_put_io_channel(ch->channel); 1943 return -1; 1944 } 1945 1946 mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); 1947 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 1948 if (shared_resource->shared_ch == ch->channel) { 1949 spdk_put_io_channel(mgmt_io_ch); 1950 shared_resource->ref++; 1951 break; 1952 } 1953 } 1954 1955 if (shared_resource == NULL) { 1956 shared_resource = calloc(1, sizeof(*shared_resource)); 1957 if (shared_resource == NULL) { 1958 spdk_put_io_channel(ch->channel); 1959 spdk_put_io_channel(mgmt_io_ch); 1960 return -1; 1961 } 1962 1963 shared_resource->mgmt_ch = mgmt_ch; 1964 shared_resource->io_outstanding = 0; 1965 TAILQ_INIT(&shared_resource->nomem_io); 1966 shared_resource->nomem_threshold = 0; 1967 shared_resource->shared_ch = ch->channel; 1968 shared_resource->ref = 1; 1969 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 1970 } 1971 1972 memset(&ch->stat, 0, sizeof(ch->stat)); 1973 ch->stat.ticks_rate = spdk_get_ticks_hz(); 1974 ch->io_outstanding = 0; 1975 TAILQ_INIT(&ch->queued_resets); 1976 ch->flags = 0; 1977 ch->shared_resource = shared_resource; 1978 1979 #ifdef SPDK_CONFIG_VTUNE 1980 { 1981 char *name; 1982 __itt_init_ittlib(NULL, 0); 1983 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 1984 if (!name) { 1985 _spdk_bdev_channel_destroy_resource(ch); 1986 return -1; 1987 } 1988 ch->handle = __itt_string_handle_create(name); 1989 free(name); 1990 ch->start_tsc = spdk_get_ticks(); 1991 ch->interval_tsc = spdk_get_ticks_hz() / 100; 1992 memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); 1993 } 1994 #endif 1995 1996 pthread_mutex_lock(&bdev->internal.mutex); 1997 _spdk_bdev_enable_qos(bdev, ch); 1998 pthread_mutex_unlock(&bdev->internal.mutex); 1999 2000 return 0; 2001 } 2002 2003 /* 2004 * Abort I/O that are waiting on a data buffer. These types of I/O are 2005 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. 2006 */ 2007 static void 2008 _spdk_bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 2009 { 2010 bdev_io_stailq_t tmp; 2011 struct spdk_bdev_io *bdev_io; 2012 2013 STAILQ_INIT(&tmp); 2014 2015 while (!STAILQ_EMPTY(queue)) { 2016 bdev_io = STAILQ_FIRST(queue); 2017 STAILQ_REMOVE_HEAD(queue, internal.buf_link); 2018 if (bdev_io->internal.ch == ch) { 2019 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2020 } else { 2021 STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); 2022 } 2023 } 2024 2025 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 2026 } 2027 2028 /* 2029 * Abort I/O that are queued waiting for submission. These types of I/O are 2030 * linked using the spdk_bdev_io link TAILQ_ENTRY. 2031 */ 2032 static void 2033 _spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 2034 { 2035 struct spdk_bdev_io *bdev_io, *tmp; 2036 2037 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 2038 if (bdev_io->internal.ch == ch) { 2039 TAILQ_REMOVE(queue, bdev_io, internal.link); 2040 /* 2041 * spdk_bdev_io_complete() assumes that the completed I/O had 2042 * been submitted to the bdev module. Since in this case it 2043 * hadn't, bump io_outstanding to account for the decrement 2044 * that spdk_bdev_io_complete() will do. 
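 * Reset I/O is the exception: spdk_bdev_io_complete() never counts resets
 * against io_outstanding, which is why the type check below skips them.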
2045 */ 2046 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 2047 ch->io_outstanding++; 2048 ch->shared_resource->io_outstanding++; 2049 } 2050 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2051 } 2052 } 2053 } 2054 2055 static void 2056 spdk_bdev_qos_channel_destroy(void *cb_arg) 2057 { 2058 struct spdk_bdev_qos *qos = cb_arg; 2059 2060 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 2061 spdk_poller_unregister(&qos->poller); 2062 2063 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Free QoS %p.\n", qos); 2064 2065 free(qos); 2066 } 2067 2068 static int 2069 spdk_bdev_qos_destroy(struct spdk_bdev *bdev) 2070 { 2071 int i; 2072 2073 /* 2074 * Cleanly shutting down the QoS poller is tricky, because 2075 * during the asynchronous operation the user could open 2076 * a new descriptor and create a new channel, spawning 2077 * a new QoS poller. 2078 * 2079 * The strategy is to create a new QoS structure here and swap it 2080 * in. The shutdown path then continues to refer to the old one 2081 * until it completes and then releases it. 2082 */ 2083 struct spdk_bdev_qos *new_qos, *old_qos; 2084 2085 old_qos = bdev->internal.qos; 2086 2087 new_qos = calloc(1, sizeof(*new_qos)); 2088 if (!new_qos) { 2089 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 2090 return -ENOMEM; 2091 } 2092 2093 /* Copy the old QoS data into the newly allocated structure */ 2094 memcpy(new_qos, old_qos, sizeof(*new_qos)); 2095 2096 /* Zero out the key parts of the QoS structure */ 2097 new_qos->ch = NULL; 2098 new_qos->thread = NULL; 2099 new_qos->poller = NULL; 2100 TAILQ_INIT(&new_qos->queued); 2101 /* 2102 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 2103 * It will be used later for the new QoS structure. 2104 */ 2105 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2106 new_qos->rate_limits[i].remaining_this_timeslice = 0; 2107 new_qos->rate_limits[i].min_per_timeslice = 0; 2108 new_qos->rate_limits[i].max_per_timeslice = 0; 2109 } 2110 2111 bdev->internal.qos = new_qos; 2112 2113 if (old_qos->thread == NULL) { 2114 free(old_qos); 2115 } else { 2116 spdk_thread_send_msg(old_qos->thread, spdk_bdev_qos_channel_destroy, 2117 old_qos); 2118 } 2119 2120 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 2121 * been destroyed yet. The destruction path will end up waiting for the final 2122 * channel to be put before it releases resources. 
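	 * The message sent above lands in spdk_bdev_qos_channel_destroy(), which
	 * drops the QoS channel reference, unregisters the poller and frees the
	 * old structure on the QoS thread.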
*/ 2123 2124 return 0; 2125 } 2126 2127 static void 2128 _spdk_bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 2129 { 2130 total->bytes_read += add->bytes_read; 2131 total->num_read_ops += add->num_read_ops; 2132 total->bytes_written += add->bytes_written; 2133 total->num_write_ops += add->num_write_ops; 2134 total->bytes_unmapped += add->bytes_unmapped; 2135 total->num_unmap_ops += add->num_unmap_ops; 2136 total->read_latency_ticks += add->read_latency_ticks; 2137 total->write_latency_ticks += add->write_latency_ticks; 2138 total->unmap_latency_ticks += add->unmap_latency_ticks; 2139 } 2140 2141 static void 2142 spdk_bdev_channel_destroy(void *io_device, void *ctx_buf) 2143 { 2144 struct spdk_bdev_channel *ch = ctx_buf; 2145 struct spdk_bdev_mgmt_channel *mgmt_ch; 2146 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 2147 2148 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 2149 spdk_get_thread()); 2150 2151 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 2152 pthread_mutex_lock(&ch->bdev->internal.mutex); 2153 _spdk_bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); 2154 pthread_mutex_unlock(&ch->bdev->internal.mutex); 2155 2156 mgmt_ch = shared_resource->mgmt_ch; 2157 2158 _spdk_bdev_abort_queued_io(&ch->queued_resets, ch); 2159 _spdk_bdev_abort_queued_io(&shared_resource->nomem_io, ch); 2160 _spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_small, ch); 2161 _spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_large, ch); 2162 2163 if (ch->histogram) { 2164 spdk_histogram_data_free(ch->histogram); 2165 } 2166 2167 _spdk_bdev_channel_destroy_resource(ch); 2168 } 2169 2170 int 2171 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 2172 { 2173 struct spdk_bdev_alias *tmp; 2174 2175 if (alias == NULL) { 2176 SPDK_ERRLOG("Empty alias passed\n"); 2177 return -EINVAL; 2178 } 2179 2180 if (spdk_bdev_get_by_name(alias)) { 2181 SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias); 2182 return -EEXIST; 2183 } 2184 2185 tmp = calloc(1, sizeof(*tmp)); 2186 if (tmp == NULL) { 2187 SPDK_ERRLOG("Unable to allocate alias\n"); 2188 return -ENOMEM; 2189 } 2190 2191 tmp->alias = strdup(alias); 2192 if (tmp->alias == NULL) { 2193 free(tmp); 2194 SPDK_ERRLOG("Unable to allocate alias\n"); 2195 return -ENOMEM; 2196 } 2197 2198 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 2199 2200 return 0; 2201 } 2202 2203 int 2204 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 2205 { 2206 struct spdk_bdev_alias *tmp; 2207 2208 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 2209 if (strcmp(alias, tmp->alias) == 0) { 2210 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 2211 free(tmp->alias); 2212 free(tmp); 2213 return 0; 2214 } 2215 } 2216 2217 SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exists\n", alias); 2218 2219 return -ENOENT; 2220 } 2221 2222 void 2223 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 2224 { 2225 struct spdk_bdev_alias *p, *tmp; 2226 2227 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 2228 TAILQ_REMOVE(&bdev->aliases, p, tailq); 2229 free(p->alias); 2230 free(p); 2231 } 2232 } 2233 2234 struct spdk_io_channel * 2235 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 2236 { 2237 return spdk_get_io_channel(__bdev_to_io_dev(desc->bdev)); 2238 } 2239 2240 const char * 2241 spdk_bdev_get_name(const struct spdk_bdev *bdev) 2242 { 2243 return bdev->name; 2244 } 2245 2246 const char * 2247 spdk_bdev_get_product_name(const struct 
spdk_bdev *bdev) 2248 { 2249 return bdev->product_name; 2250 } 2251 2252 const struct spdk_bdev_aliases_list * 2253 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 2254 { 2255 return &bdev->aliases; 2256 } 2257 2258 uint32_t 2259 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 2260 { 2261 return bdev->blocklen; 2262 } 2263 2264 uint64_t 2265 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 2266 { 2267 return bdev->blockcnt; 2268 } 2269 2270 const char * 2271 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 2272 { 2273 return qos_rpc_type[type]; 2274 } 2275 2276 void 2277 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 2278 { 2279 int i; 2280 2281 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2282 2283 pthread_mutex_lock(&bdev->internal.mutex); 2284 if (bdev->internal.qos) { 2285 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2286 if (bdev->internal.qos->rate_limits[i].limit != 2287 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2288 limits[i] = bdev->internal.qos->rate_limits[i].limit; 2289 if (_spdk_bdev_qos_is_iops_rate_limit(i) == false) { 2290 /* Change from Byte to Megabyte which is user visible. */ 2291 limits[i] = limits[i] / 1024 / 1024; 2292 } 2293 } 2294 } 2295 } 2296 pthread_mutex_unlock(&bdev->internal.mutex); 2297 } 2298 2299 size_t 2300 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 2301 { 2302 return 1 << bdev->required_alignment; 2303 } 2304 2305 uint32_t 2306 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 2307 { 2308 return bdev->optimal_io_boundary; 2309 } 2310 2311 bool 2312 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 2313 { 2314 return bdev->write_cache; 2315 } 2316 2317 const struct spdk_uuid * 2318 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 2319 { 2320 return &bdev->uuid; 2321 } 2322 2323 uint64_t 2324 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 2325 { 2326 return bdev->internal.measured_queue_depth; 2327 } 2328 2329 uint64_t 2330 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 2331 { 2332 return bdev->internal.period; 2333 } 2334 2335 uint64_t 2336 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 2337 { 2338 return bdev->internal.weighted_io_time; 2339 } 2340 2341 uint64_t 2342 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 2343 { 2344 return bdev->internal.io_time; 2345 } 2346 2347 static void 2348 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) 2349 { 2350 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 2351 2352 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 2353 2354 if (bdev->internal.measured_queue_depth) { 2355 bdev->internal.io_time += bdev->internal.period; 2356 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 2357 } 2358 } 2359 2360 static void 2361 _calculate_measured_qd(struct spdk_io_channel_iter *i) 2362 { 2363 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 2364 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 2365 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); 2366 2367 bdev->internal.temporary_queue_depth += ch->io_outstanding; 2368 spdk_for_each_channel_continue(i, 0); 2369 } 2370 2371 static int 2372 spdk_bdev_calculate_measured_queue_depth(void *ctx) 2373 { 2374 struct spdk_bdev *bdev = ctx; 2375 bdev->internal.temporary_queue_depth = 0; 2376 spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, 2377 
_calculate_measured_qd_cpl); 2378 return 0; 2379 } 2380 2381 void 2382 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 2383 { 2384 bdev->internal.period = period; 2385 2386 if (bdev->internal.qd_poller != NULL) { 2387 spdk_poller_unregister(&bdev->internal.qd_poller); 2388 bdev->internal.measured_queue_depth = UINT64_MAX; 2389 } 2390 2391 if (period != 0) { 2392 bdev->internal.qd_poller = spdk_poller_register(spdk_bdev_calculate_measured_queue_depth, bdev, 2393 period); 2394 } 2395 } 2396 2397 int 2398 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 2399 { 2400 int ret; 2401 2402 pthread_mutex_lock(&bdev->internal.mutex); 2403 2404 /* bdev has open descriptors */ 2405 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 2406 bdev->blockcnt > size) { 2407 ret = -EBUSY; 2408 } else { 2409 bdev->blockcnt = size; 2410 ret = 0; 2411 } 2412 2413 pthread_mutex_unlock(&bdev->internal.mutex); 2414 2415 return ret; 2416 } 2417 2418 /* 2419 * Convert I/O offset and length from bytes to blocks. 2420 * 2421 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 2422 */ 2423 static uint64_t 2424 spdk_bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 2425 uint64_t num_bytes, uint64_t *num_blocks) 2426 { 2427 uint32_t block_size = bdev->blocklen; 2428 uint8_t shift_cnt; 2429 2430 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2431 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 2432 shift_cnt = spdk_u32log2(block_size); 2433 *offset_blocks = offset_bytes >> shift_cnt; 2434 *num_blocks = num_bytes >> shift_cnt; 2435 return (offset_bytes - (*offset_blocks << shift_cnt)) | 2436 (num_bytes - (*num_blocks << shift_cnt)); 2437 } else { 2438 *offset_blocks = offset_bytes / block_size; 2439 *num_blocks = num_bytes / block_size; 2440 return (offset_bytes % block_size) | (num_bytes % block_size); 2441 } 2442 } 2443 2444 static bool 2445 spdk_bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 2446 { 2447 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 2448 * has been an overflow and hence the offset has been wrapped around */ 2449 if (offset_blocks + num_blocks < offset_blocks) { 2450 return false; 2451 } 2452 2453 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 2454 if (offset_blocks + num_blocks > bdev->blockcnt) { 2455 return false; 2456 } 2457 2458 return true; 2459 } 2460 2461 int 2462 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2463 void *buf, uint64_t offset, uint64_t nbytes, 2464 spdk_bdev_io_completion_cb cb, void *cb_arg) 2465 { 2466 uint64_t offset_blocks, num_blocks; 2467 2468 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 2469 return -EINVAL; 2470 } 2471 2472 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 2473 } 2474 2475 int 2476 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2477 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 2478 spdk_bdev_io_completion_cb cb, void *cb_arg) 2479 { 2480 struct spdk_bdev *bdev = desc->bdev; 2481 struct spdk_bdev_io *bdev_io; 2482 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2483 2484 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2485 return -EINVAL; 2486 } 2487 2488 bdev_io = 
spdk_bdev_get_io(channel); 2489 if (!bdev_io) { 2490 return -ENOMEM; 2491 } 2492 2493 bdev_io->internal.ch = channel; 2494 bdev_io->internal.desc = desc; 2495 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 2496 bdev_io->u.bdev.iovs = &bdev_io->iov; 2497 bdev_io->u.bdev.iovs[0].iov_base = buf; 2498 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 2499 bdev_io->u.bdev.iovcnt = 1; 2500 bdev_io->u.bdev.num_blocks = num_blocks; 2501 bdev_io->u.bdev.offset_blocks = offset_blocks; 2502 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2503 2504 spdk_bdev_io_submit(bdev_io); 2505 return 0; 2506 } 2507 2508 int 2509 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2510 struct iovec *iov, int iovcnt, 2511 uint64_t offset, uint64_t nbytes, 2512 spdk_bdev_io_completion_cb cb, void *cb_arg) 2513 { 2514 uint64_t offset_blocks, num_blocks; 2515 2516 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 2517 return -EINVAL; 2518 } 2519 2520 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 2521 } 2522 2523 int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2524 struct iovec *iov, int iovcnt, 2525 uint64_t offset_blocks, uint64_t num_blocks, 2526 spdk_bdev_io_completion_cb cb, void *cb_arg) 2527 { 2528 struct spdk_bdev *bdev = desc->bdev; 2529 struct spdk_bdev_io *bdev_io; 2530 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2531 2532 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2533 return -EINVAL; 2534 } 2535 2536 bdev_io = spdk_bdev_get_io(channel); 2537 if (!bdev_io) { 2538 return -ENOMEM; 2539 } 2540 2541 bdev_io->internal.ch = channel; 2542 bdev_io->internal.desc = desc; 2543 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 2544 bdev_io->u.bdev.iovs = iov; 2545 bdev_io->u.bdev.iovcnt = iovcnt; 2546 bdev_io->u.bdev.num_blocks = num_blocks; 2547 bdev_io->u.bdev.offset_blocks = offset_blocks; 2548 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2549 2550 spdk_bdev_io_submit(bdev_io); 2551 return 0; 2552 } 2553 2554 int 2555 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2556 void *buf, uint64_t offset, uint64_t nbytes, 2557 spdk_bdev_io_completion_cb cb, void *cb_arg) 2558 { 2559 uint64_t offset_blocks, num_blocks; 2560 2561 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 2562 return -EINVAL; 2563 } 2564 2565 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 2566 } 2567 2568 int 2569 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2570 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 2571 spdk_bdev_io_completion_cb cb, void *cb_arg) 2572 { 2573 struct spdk_bdev *bdev = desc->bdev; 2574 struct spdk_bdev_io *bdev_io; 2575 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2576 2577 if (!desc->write) { 2578 return -EBADF; 2579 } 2580 2581 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2582 return -EINVAL; 2583 } 2584 2585 bdev_io = spdk_bdev_get_io(channel); 2586 if (!bdev_io) { 2587 return -ENOMEM; 2588 } 2589 2590 bdev_io->internal.ch = channel; 2591 bdev_io->internal.desc = desc; 2592 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 2593 bdev_io->u.bdev.iovs = &bdev_io->iov; 2594 bdev_io->u.bdev.iovs[0].iov_base = buf; 2595 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 2596 bdev_io->u.bdev.iovcnt = 1; 2597 bdev_io->u.bdev.num_blocks = num_blocks; 
2598 bdev_io->u.bdev.offset_blocks = offset_blocks; 2599 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2600 2601 spdk_bdev_io_submit(bdev_io); 2602 return 0; 2603 } 2604 2605 int 2606 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2607 struct iovec *iov, int iovcnt, 2608 uint64_t offset, uint64_t len, 2609 spdk_bdev_io_completion_cb cb, void *cb_arg) 2610 { 2611 uint64_t offset_blocks, num_blocks; 2612 2613 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) { 2614 return -EINVAL; 2615 } 2616 2617 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 2618 } 2619 2620 int 2621 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2622 struct iovec *iov, int iovcnt, 2623 uint64_t offset_blocks, uint64_t num_blocks, 2624 spdk_bdev_io_completion_cb cb, void *cb_arg) 2625 { 2626 struct spdk_bdev *bdev = desc->bdev; 2627 struct spdk_bdev_io *bdev_io; 2628 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2629 2630 if (!desc->write) { 2631 return -EBADF; 2632 } 2633 2634 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2635 return -EINVAL; 2636 } 2637 2638 bdev_io = spdk_bdev_get_io(channel); 2639 if (!bdev_io) { 2640 return -ENOMEM; 2641 } 2642 2643 bdev_io->internal.ch = channel; 2644 bdev_io->internal.desc = desc; 2645 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 2646 bdev_io->u.bdev.iovs = iov; 2647 bdev_io->u.bdev.iovcnt = iovcnt; 2648 bdev_io->u.bdev.num_blocks = num_blocks; 2649 bdev_io->u.bdev.offset_blocks = offset_blocks; 2650 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2651 2652 spdk_bdev_io_submit(bdev_io); 2653 return 0; 2654 } 2655 2656 int 2657 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2658 uint64_t offset, uint64_t len, 2659 spdk_bdev_io_completion_cb cb, void *cb_arg) 2660 { 2661 uint64_t offset_blocks, num_blocks; 2662 2663 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) { 2664 return -EINVAL; 2665 } 2666 2667 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 2668 } 2669 2670 int 2671 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2672 uint64_t offset_blocks, uint64_t num_blocks, 2673 spdk_bdev_io_completion_cb cb, void *cb_arg) 2674 { 2675 struct spdk_bdev *bdev = desc->bdev; 2676 struct spdk_bdev_io *bdev_io; 2677 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2678 2679 if (!desc->write) { 2680 return -EBADF; 2681 } 2682 2683 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2684 return -EINVAL; 2685 } 2686 2687 bdev_io = spdk_bdev_get_io(channel); 2688 2689 if (!bdev_io) { 2690 return -ENOMEM; 2691 } 2692 2693 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 2694 bdev_io->internal.ch = channel; 2695 bdev_io->internal.desc = desc; 2696 bdev_io->u.bdev.offset_blocks = offset_blocks; 2697 bdev_io->u.bdev.num_blocks = num_blocks; 2698 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2699 2700 if (_spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 2701 spdk_bdev_io_submit(bdev_io); 2702 return 0; 2703 } else if (_spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 2704 assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE); 2705 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 2706 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 2707 _spdk_bdev_write_zero_buffer_next(bdev_io); 2708 
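		/* The zero-buffer writes now proceed asynchronously; the original
		 * write_zeroes I/O is completed from _spdk_bdev_write_zero_buffer_done()
		 * once all blocks have been written or an error occurs.
		 */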
return 0; 2709 } else { 2710 spdk_bdev_free_io(bdev_io); 2711 return -ENOTSUP; 2712 } 2713 } 2714 2715 int 2716 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2717 uint64_t offset, uint64_t nbytes, 2718 spdk_bdev_io_completion_cb cb, void *cb_arg) 2719 { 2720 uint64_t offset_blocks, num_blocks; 2721 2722 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 2723 return -EINVAL; 2724 } 2725 2726 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 2727 } 2728 2729 int 2730 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2731 uint64_t offset_blocks, uint64_t num_blocks, 2732 spdk_bdev_io_completion_cb cb, void *cb_arg) 2733 { 2734 struct spdk_bdev *bdev = desc->bdev; 2735 struct spdk_bdev_io *bdev_io; 2736 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2737 2738 if (!desc->write) { 2739 return -EBADF; 2740 } 2741 2742 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2743 return -EINVAL; 2744 } 2745 2746 if (num_blocks == 0) { 2747 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 2748 return -EINVAL; 2749 } 2750 2751 bdev_io = spdk_bdev_get_io(channel); 2752 if (!bdev_io) { 2753 return -ENOMEM; 2754 } 2755 2756 bdev_io->internal.ch = channel; 2757 bdev_io->internal.desc = desc; 2758 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 2759 2760 bdev_io->u.bdev.iovs = &bdev_io->iov; 2761 bdev_io->u.bdev.iovs[0].iov_base = NULL; 2762 bdev_io->u.bdev.iovs[0].iov_len = 0; 2763 bdev_io->u.bdev.iovcnt = 1; 2764 2765 bdev_io->u.bdev.offset_blocks = offset_blocks; 2766 bdev_io->u.bdev.num_blocks = num_blocks; 2767 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2768 2769 spdk_bdev_io_submit(bdev_io); 2770 return 0; 2771 } 2772 2773 int 2774 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2775 uint64_t offset, uint64_t length, 2776 spdk_bdev_io_completion_cb cb, void *cb_arg) 2777 { 2778 uint64_t offset_blocks, num_blocks; 2779 2780 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, length, &num_blocks) != 0) { 2781 return -EINVAL; 2782 } 2783 2784 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 2785 } 2786 2787 int 2788 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2789 uint64_t offset_blocks, uint64_t num_blocks, 2790 spdk_bdev_io_completion_cb cb, void *cb_arg) 2791 { 2792 struct spdk_bdev *bdev = desc->bdev; 2793 struct spdk_bdev_io *bdev_io; 2794 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2795 2796 if (!desc->write) { 2797 return -EBADF; 2798 } 2799 2800 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2801 return -EINVAL; 2802 } 2803 2804 bdev_io = spdk_bdev_get_io(channel); 2805 if (!bdev_io) { 2806 return -ENOMEM; 2807 } 2808 2809 bdev_io->internal.ch = channel; 2810 bdev_io->internal.desc = desc; 2811 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 2812 bdev_io->u.bdev.iovs = NULL; 2813 bdev_io->u.bdev.iovcnt = 0; 2814 bdev_io->u.bdev.offset_blocks = offset_blocks; 2815 bdev_io->u.bdev.num_blocks = num_blocks; 2816 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2817 2818 spdk_bdev_io_submit(bdev_io); 2819 return 0; 2820 } 2821 2822 static void 2823 _spdk_bdev_reset_dev(struct spdk_io_channel_iter *i, int status) 2824 { 2825 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 2826 struct spdk_bdev_io *bdev_io; 2827 2828 bdev_io = TAILQ_FIRST(&ch->queued_resets); 2829 TAILQ_REMOVE(&ch->queued_resets, bdev_io, 
internal.link); 2830 spdk_bdev_io_submit_reset(bdev_io); 2831 } 2832 2833 static void 2834 _spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 2835 { 2836 struct spdk_io_channel *ch; 2837 struct spdk_bdev_channel *channel; 2838 struct spdk_bdev_mgmt_channel *mgmt_channel; 2839 struct spdk_bdev_shared_resource *shared_resource; 2840 bdev_io_tailq_t tmp_queued; 2841 2842 TAILQ_INIT(&tmp_queued); 2843 2844 ch = spdk_io_channel_iter_get_channel(i); 2845 channel = spdk_io_channel_get_ctx(ch); 2846 shared_resource = channel->shared_resource; 2847 mgmt_channel = shared_resource->mgmt_ch; 2848 2849 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 2850 2851 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 2852 /* The QoS object is always valid and readable while 2853 * the channel flag is set, so the lock here should not 2854 * be necessary. We're not in the fast path though, so 2855 * just take it anyway. */ 2856 pthread_mutex_lock(&channel->bdev->internal.mutex); 2857 if (channel->bdev->internal.qos->ch == channel) { 2858 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 2859 } 2860 pthread_mutex_unlock(&channel->bdev->internal.mutex); 2861 } 2862 2863 _spdk_bdev_abort_queued_io(&shared_resource->nomem_io, channel); 2864 _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel); 2865 _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel); 2866 _spdk_bdev_abort_queued_io(&tmp_queued, channel); 2867 2868 spdk_for_each_channel_continue(i, 0); 2869 } 2870 2871 static void 2872 _spdk_bdev_start_reset(void *ctx) 2873 { 2874 struct spdk_bdev_channel *ch = ctx; 2875 2876 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), _spdk_bdev_reset_freeze_channel, 2877 ch, _spdk_bdev_reset_dev); 2878 } 2879 2880 static void 2881 _spdk_bdev_channel_start_reset(struct spdk_bdev_channel *ch) 2882 { 2883 struct spdk_bdev *bdev = ch->bdev; 2884 2885 assert(!TAILQ_EMPTY(&ch->queued_resets)); 2886 2887 pthread_mutex_lock(&bdev->internal.mutex); 2888 if (bdev->internal.reset_in_progress == NULL) { 2889 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 2890 /* 2891 * Take a channel reference for the target bdev for the life of this 2892 * reset. This guards against the channel getting destroyed while 2893 * spdk_for_each_channel() calls related to this reset IO are in 2894 * progress. We will release the reference when this reset is 2895 * completed. 
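	 * The reference taken here is released in _spdk_bdev_reset_complete()
	 * after the unfreeze iteration for this reset has finished.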
2896 */ 2897 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 2898 _spdk_bdev_start_reset(ch); 2899 } 2900 pthread_mutex_unlock(&bdev->internal.mutex); 2901 } 2902 2903 int 2904 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2905 spdk_bdev_io_completion_cb cb, void *cb_arg) 2906 { 2907 struct spdk_bdev *bdev = desc->bdev; 2908 struct spdk_bdev_io *bdev_io; 2909 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2910 2911 bdev_io = spdk_bdev_get_io(channel); 2912 if (!bdev_io) { 2913 return -ENOMEM; 2914 } 2915 2916 bdev_io->internal.ch = channel; 2917 bdev_io->internal.desc = desc; 2918 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 2919 bdev_io->u.reset.ch_ref = NULL; 2920 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2921 2922 pthread_mutex_lock(&bdev->internal.mutex); 2923 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 2924 pthread_mutex_unlock(&bdev->internal.mutex); 2925 2926 _spdk_bdev_channel_start_reset(channel); 2927 2928 return 0; 2929 } 2930 2931 void 2932 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 2933 struct spdk_bdev_io_stat *stat) 2934 { 2935 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2936 2937 *stat = channel->stat; 2938 } 2939 2940 static void 2941 _spdk_bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 2942 { 2943 void *io_device = spdk_io_channel_iter_get_io_device(i); 2944 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 2945 2946 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 2947 bdev_iostat_ctx->cb_arg, 0); 2948 free(bdev_iostat_ctx); 2949 } 2950 2951 static void 2952 _spdk_bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 2953 { 2954 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 2955 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2956 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2957 2958 _spdk_bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); 2959 spdk_for_each_channel_continue(i, 0); 2960 } 2961 2962 void 2963 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 2964 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 2965 { 2966 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 2967 2968 assert(bdev != NULL); 2969 assert(stat != NULL); 2970 assert(cb != NULL); 2971 2972 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 2973 if (bdev_iostat_ctx == NULL) { 2974 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 2975 cb(bdev, stat, cb_arg, -ENOMEM); 2976 return; 2977 } 2978 2979 bdev_iostat_ctx->stat = stat; 2980 bdev_iostat_ctx->cb = cb; 2981 bdev_iostat_ctx->cb_arg = cb_arg; 2982 2983 /* Start with the statistics from previously deleted channels. */ 2984 pthread_mutex_lock(&bdev->internal.mutex); 2985 _spdk_bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); 2986 pthread_mutex_unlock(&bdev->internal.mutex); 2987 2988 /* Then iterate and add the statistics from each existing channel. 
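	 *
	 * A minimal caller sketch (hypothetical names, for illustration only);
	 * the stat structure must stay valid until the callback fires:
	 *
	 *   static void
	 *   my_stat_done(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
	 *                void *cb_arg, int rc)
	 *   {
	 *           if (rc == 0) {
	 *                   printf("%s: %" PRIu64 " read ops\n",
	 *                          spdk_bdev_get_name(bdev), stat->num_read_ops);
	 *           }
	 *           free(stat);
	 *   }
	 *
	 *   struct spdk_bdev_io_stat *stat = calloc(1, sizeof(*stat));
	 *   if (stat != NULL) {
	 *           spdk_bdev_get_device_stat(bdev, stat, my_stat_done, NULL);
	 *   }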
*/ 2989 spdk_for_each_channel(__bdev_to_io_dev(bdev), 2990 _spdk_bdev_get_each_channel_stat, 2991 bdev_iostat_ctx, 2992 _spdk_bdev_get_device_stat_done); 2993 } 2994 2995 int 2996 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2997 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 2998 spdk_bdev_io_completion_cb cb, void *cb_arg) 2999 { 3000 struct spdk_bdev *bdev = desc->bdev; 3001 struct spdk_bdev_io *bdev_io; 3002 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3003 3004 if (!desc->write) { 3005 return -EBADF; 3006 } 3007 3008 bdev_io = spdk_bdev_get_io(channel); 3009 if (!bdev_io) { 3010 return -ENOMEM; 3011 } 3012 3013 bdev_io->internal.ch = channel; 3014 bdev_io->internal.desc = desc; 3015 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 3016 bdev_io->u.nvme_passthru.cmd = *cmd; 3017 bdev_io->u.nvme_passthru.buf = buf; 3018 bdev_io->u.nvme_passthru.nbytes = nbytes; 3019 bdev_io->u.nvme_passthru.md_buf = NULL; 3020 bdev_io->u.nvme_passthru.md_len = 0; 3021 3022 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 3023 3024 spdk_bdev_io_submit(bdev_io); 3025 return 0; 3026 } 3027 3028 int 3029 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3030 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 3031 spdk_bdev_io_completion_cb cb, void *cb_arg) 3032 { 3033 struct spdk_bdev *bdev = desc->bdev; 3034 struct spdk_bdev_io *bdev_io; 3035 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3036 3037 if (!desc->write) { 3038 /* 3039 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 3040 * to easily determine if the command is a read or write, but for now just 3041 * do not allow io_passthru with a read-only descriptor. 3042 */ 3043 return -EBADF; 3044 } 3045 3046 bdev_io = spdk_bdev_get_io(channel); 3047 if (!bdev_io) { 3048 return -ENOMEM; 3049 } 3050 3051 bdev_io->internal.ch = channel; 3052 bdev_io->internal.desc = desc; 3053 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 3054 bdev_io->u.nvme_passthru.cmd = *cmd; 3055 bdev_io->u.nvme_passthru.buf = buf; 3056 bdev_io->u.nvme_passthru.nbytes = nbytes; 3057 bdev_io->u.nvme_passthru.md_buf = NULL; 3058 bdev_io->u.nvme_passthru.md_len = 0; 3059 3060 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 3061 3062 spdk_bdev_io_submit(bdev_io); 3063 return 0; 3064 } 3065 3066 int 3067 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3068 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 3069 spdk_bdev_io_completion_cb cb, void *cb_arg) 3070 { 3071 struct spdk_bdev *bdev = desc->bdev; 3072 struct spdk_bdev_io *bdev_io; 3073 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3074 3075 if (!desc->write) { 3076 /* 3077 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 3078 * to easily determine if the command is a read or write, but for now just 3079 * do not allow io_passthru with a read-only descriptor. 
3080 */ 3081 return -EBADF; 3082 } 3083 3084 bdev_io = spdk_bdev_get_io(channel); 3085 if (!bdev_io) { 3086 return -ENOMEM; 3087 } 3088 3089 bdev_io->internal.ch = channel; 3090 bdev_io->internal.desc = desc; 3091 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 3092 bdev_io->u.nvme_passthru.cmd = *cmd; 3093 bdev_io->u.nvme_passthru.buf = buf; 3094 bdev_io->u.nvme_passthru.nbytes = nbytes; 3095 bdev_io->u.nvme_passthru.md_buf = md_buf; 3096 bdev_io->u.nvme_passthru.md_len = md_len; 3097 3098 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 3099 3100 spdk_bdev_io_submit(bdev_io); 3101 return 0; 3102 } 3103 3104 int 3105 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 3106 struct spdk_bdev_io_wait_entry *entry) 3107 { 3108 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3109 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 3110 3111 if (bdev != entry->bdev) { 3112 SPDK_ERRLOG("bdevs do not match\n"); 3113 return -EINVAL; 3114 } 3115 3116 if (mgmt_ch->per_thread_cache_count > 0) { 3117 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 3118 return -EINVAL; 3119 } 3120 3121 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 3122 return 0; 3123 } 3124 3125 static void 3126 _spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 3127 { 3128 struct spdk_bdev *bdev = bdev_ch->bdev; 3129 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 3130 struct spdk_bdev_io *bdev_io; 3131 3132 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 3133 /* 3134 * Allow some more I/O to complete before retrying the nomem_io queue. 3135 * Some drivers (such as nvme) cannot immediately take a new I/O in 3136 * the context of a completion, because the resources for the I/O are 3137 * not released until control returns to the bdev poller. Also, we 3138 * may require several small I/O to complete before a larger I/O 3139 * (that requires splitting) can be submitted. 3140 */ 3141 return; 3142 } 3143 3144 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 3145 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 3146 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 3147 bdev_io->internal.ch->io_outstanding++; 3148 shared_resource->io_outstanding++; 3149 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3150 bdev->fn_table->submit_request(bdev_io->internal.ch->channel, bdev_io); 3151 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 3152 break; 3153 } 3154 } 3155 } 3156 3157 static inline void 3158 _spdk_bdev_io_complete(void *ctx) 3159 { 3160 struct spdk_bdev_io *bdev_io = ctx; 3161 uint64_t tsc, tsc_diff; 3162 3163 if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { 3164 /* 3165 * Send the completion to the thread that originally submitted the I/O, 3166 * which may not be the current thread in the case of QoS. 3167 */ 3168 if (bdev_io->internal.io_submit_ch) { 3169 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 3170 bdev_io->internal.io_submit_ch = NULL; 3171 } 3172 3173 /* 3174 * Defer completion to avoid potential infinite recursion if the 3175 * user's completion callback issues a new I/O. 
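		 * For example, a driver that completes I/O inline from
		 * submit_request(), paired with a caller that resubmits from its
		 * completion callback, would otherwise add a stack frame per I/O.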
3176 */ 3177 spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->internal.ch->channel), 3178 _spdk_bdev_io_complete, bdev_io); 3179 return; 3180 } 3181 3182 tsc = spdk_get_ticks(); 3183 tsc_diff = tsc - bdev_io->internal.submit_tsc; 3184 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 0); 3185 3186 if (bdev_io->internal.ch->histogram) { 3187 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 3188 } 3189 3190 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 3191 switch (bdev_io->type) { 3192 case SPDK_BDEV_IO_TYPE_READ: 3193 bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 3194 bdev_io->internal.ch->stat.num_read_ops++; 3195 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 3196 break; 3197 case SPDK_BDEV_IO_TYPE_WRITE: 3198 bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 3199 bdev_io->internal.ch->stat.num_write_ops++; 3200 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 3201 break; 3202 case SPDK_BDEV_IO_TYPE_UNMAP: 3203 bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 3204 bdev_io->internal.ch->stat.num_unmap_ops++; 3205 bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff; 3206 default: 3207 break; 3208 } 3209 } 3210 3211 #ifdef SPDK_CONFIG_VTUNE 3212 uint64_t now_tsc = spdk_get_ticks(); 3213 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 3214 uint64_t data[5]; 3215 3216 data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; 3217 data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; 3218 data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; 3219 data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; 3220 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
3221 bdev_io->bdev->fn_table->get_spin_time(bdev_io->internal.ch->channel) : 0; 3222 3223 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 3224 __itt_metadata_u64, 5, data); 3225 3226 bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; 3227 bdev_io->internal.ch->start_tsc = now_tsc; 3228 } 3229 #endif 3230 3231 assert(bdev_io->internal.cb != NULL); 3232 assert(spdk_get_thread() == spdk_io_channel_get_thread(bdev_io->internal.ch->channel)); 3233 3234 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3235 bdev_io->internal.caller_ctx); 3236 } 3237 3238 static void 3239 _spdk_bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 3240 { 3241 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 3242 3243 if (bdev_io->u.reset.ch_ref != NULL) { 3244 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 3245 bdev_io->u.reset.ch_ref = NULL; 3246 } 3247 3248 _spdk_bdev_io_complete(bdev_io); 3249 } 3250 3251 static void 3252 _spdk_bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 3253 { 3254 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3255 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 3256 3257 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 3258 if (!TAILQ_EMPTY(&ch->queued_resets)) { 3259 _spdk_bdev_channel_start_reset(ch); 3260 } 3261 3262 spdk_for_each_channel_continue(i, 0); 3263 } 3264 3265 void 3266 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 3267 { 3268 struct spdk_bdev *bdev = bdev_io->bdev; 3269 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3270 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 3271 3272 bdev_io->internal.status = status; 3273 3274 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 3275 bool unlock_channels = false; 3276 3277 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 3278 SPDK_ERRLOG("NOMEM returned for reset\n"); 3279 } 3280 pthread_mutex_lock(&bdev->internal.mutex); 3281 if (bdev_io == bdev->internal.reset_in_progress) { 3282 bdev->internal.reset_in_progress = NULL; 3283 unlock_channels = true; 3284 } 3285 pthread_mutex_unlock(&bdev->internal.mutex); 3286 3287 if (unlock_channels) { 3288 spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_unfreeze_channel, 3289 bdev_io, _spdk_bdev_reset_complete); 3290 return; 3291 } 3292 } else { 3293 if (spdk_unlikely(bdev_io->internal.orig_iovcnt > 0)) { 3294 _bdev_io_unset_bounce_buf(bdev_io); 3295 } 3296 3297 assert(bdev_ch->io_outstanding > 0); 3298 assert(shared_resource->io_outstanding > 0); 3299 bdev_ch->io_outstanding--; 3300 shared_resource->io_outstanding--; 3301 3302 if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) { 3303 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 3304 /* 3305 * Wait for some of the outstanding I/O to complete before we 3306 * retry any of the nomem_io. Normally we will wait for 3307 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 3308 * depth channels we will instead wait for half to complete. 
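			 * For example, with NOMEM_THRESHOLD_COUNT of 8: at 100
			 * outstanding I/O the threshold below is max(50, 92) = 92,
			 * i.e. retry after 8 completions; at 10 outstanding it is
			 * max(5, 2) = 5, i.e. retry after half have completed.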
3309 */ 3310 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 3311 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 3312 return; 3313 } 3314 3315 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 3316 _spdk_bdev_ch_retry_io(bdev_ch); 3317 } 3318 } 3319 3320 _spdk_bdev_io_complete(bdev_io); 3321 } 3322 3323 void 3324 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 3325 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 3326 { 3327 if (sc == SPDK_SCSI_STATUS_GOOD) { 3328 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3329 } else { 3330 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 3331 bdev_io->internal.error.scsi.sc = sc; 3332 bdev_io->internal.error.scsi.sk = sk; 3333 bdev_io->internal.error.scsi.asc = asc; 3334 bdev_io->internal.error.scsi.ascq = ascq; 3335 } 3336 3337 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 3338 } 3339 3340 void 3341 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 3342 int *sc, int *sk, int *asc, int *ascq) 3343 { 3344 assert(sc != NULL); 3345 assert(sk != NULL); 3346 assert(asc != NULL); 3347 assert(ascq != NULL); 3348 3349 switch (bdev_io->internal.status) { 3350 case SPDK_BDEV_IO_STATUS_SUCCESS: 3351 *sc = SPDK_SCSI_STATUS_GOOD; 3352 *sk = SPDK_SCSI_SENSE_NO_SENSE; 3353 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 3354 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 3355 break; 3356 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 3357 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 3358 break; 3359 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 3360 *sc = bdev_io->internal.error.scsi.sc; 3361 *sk = bdev_io->internal.error.scsi.sk; 3362 *asc = bdev_io->internal.error.scsi.asc; 3363 *ascq = bdev_io->internal.error.scsi.ascq; 3364 break; 3365 default: 3366 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 3367 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 3368 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 3369 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 3370 break; 3371 } 3372 } 3373 3374 void 3375 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc) 3376 { 3377 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 3378 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3379 } else { 3380 bdev_io->internal.error.nvme.sct = sct; 3381 bdev_io->internal.error.nvme.sc = sc; 3382 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 3383 } 3384 3385 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 3386 } 3387 3388 void 3389 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc) 3390 { 3391 assert(sct != NULL); 3392 assert(sc != NULL); 3393 3394 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 3395 *sct = bdev_io->internal.error.nvme.sct; 3396 *sc = bdev_io->internal.error.nvme.sc; 3397 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 3398 *sct = SPDK_NVME_SCT_GENERIC; 3399 *sc = SPDK_NVME_SC_SUCCESS; 3400 } else { 3401 *sct = SPDK_NVME_SCT_GENERIC; 3402 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 3403 } 3404 } 3405 3406 struct spdk_thread * 3407 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 3408 { 3409 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 3410 } 3411 3412 static void 3413 _spdk_bdev_qos_config_limit(struct spdk_bdev *bdev, uint64_t *limits) 3414 { 3415 uint64_t min_qos_set; 3416 int i; 3417 3418 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3419 if (limits[i] != 
SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3420 break; 3421 } 3422 } 3423 3424 if (i == SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) { 3425 SPDK_ERRLOG("Invalid rate limits set.\n"); 3426 return; 3427 } 3428 3429 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3430 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3431 continue; 3432 } 3433 3434 if (_spdk_bdev_qos_is_iops_rate_limit(i) == true) { 3435 min_qos_set = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 3436 } else { 3437 min_qos_set = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 3438 } 3439 3440 if (limits[i] == 0 || limits[i] % min_qos_set) { 3441 SPDK_ERRLOG("Assigned limit %" PRIu64 " on bdev %s is not multiple of %" PRIu64 "\n", 3442 limits[i], bdev->name, min_qos_set); 3443 SPDK_ERRLOG("Failed to enable QoS on this bdev %s\n", bdev->name); 3444 return; 3445 } 3446 } 3447 3448 if (!bdev->internal.qos) { 3449 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 3450 if (!bdev->internal.qos) { 3451 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 3452 return; 3453 } 3454 } 3455 3456 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3457 bdev->internal.qos->rate_limits[i].limit = limits[i]; 3458 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS type:%d set:%lu\n", 3459 bdev->name, i, limits[i]); 3460 } 3461 3462 return; 3463 } 3464 3465 static void 3466 _spdk_bdev_qos_config(struct spdk_bdev *bdev) 3467 { 3468 struct spdk_conf_section *sp = NULL; 3469 const char *val = NULL; 3470 int i = 0, j = 0; 3471 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {}; 3472 bool config_qos = false; 3473 3474 sp = spdk_conf_find_section(NULL, "QoS"); 3475 if (!sp) { 3476 return; 3477 } 3478 3479 while (j < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) { 3480 limits[j] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3481 3482 i = 0; 3483 while (true) { 3484 val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 0); 3485 if (!val) { 3486 break; 3487 } 3488 3489 if (strcmp(bdev->name, val) != 0) { 3490 i++; 3491 continue; 3492 } 3493 3494 val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 1); 3495 if (val) { 3496 if (_spdk_bdev_qos_is_iops_rate_limit(j) == true) { 3497 limits[j] = strtoull(val, NULL, 10); 3498 } else { 3499 limits[j] = strtoull(val, NULL, 10) * 1024 * 1024; 3500 } 3501 config_qos = true; 3502 } 3503 3504 break; 3505 } 3506 3507 j++; 3508 } 3509 3510 if (config_qos == true) { 3511 _spdk_bdev_qos_config_limit(bdev, limits); 3512 } 3513 3514 return; 3515 } 3516 3517 static int 3518 spdk_bdev_init(struct spdk_bdev *bdev) 3519 { 3520 char *bdev_name; 3521 3522 assert(bdev->module != NULL); 3523 3524 if (!bdev->name) { 3525 SPDK_ERRLOG("Bdev name is NULL\n"); 3526 return -EINVAL; 3527 } 3528 3529 if (spdk_bdev_get_by_name(bdev->name)) { 3530 SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name); 3531 return -EEXIST; 3532 } 3533 3534 /* Users often register their own I/O devices using the bdev name. In 3535 * order to avoid conflicts, prepend bdev_. 
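	 * For example, a bdev named "Malloc0" registers its io_device under the
	 * name "bdev_Malloc0" below.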
*/ 3536 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 3537 if (!bdev_name) { 3538 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 3539 return -ENOMEM; 3540 } 3541 3542 bdev->internal.status = SPDK_BDEV_STATUS_READY; 3543 bdev->internal.measured_queue_depth = UINT64_MAX; 3544 bdev->internal.claim_module = NULL; 3545 bdev->internal.qd_poller = NULL; 3546 bdev->internal.qos = NULL; 3547 3548 if (spdk_bdev_get_buf_align(bdev) > 1) { 3549 if (bdev->split_on_optimal_io_boundary) { 3550 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 3551 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 3552 } else { 3553 bdev->split_on_optimal_io_boundary = true; 3554 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 3555 } 3556 } 3557 3558 TAILQ_INIT(&bdev->internal.open_descs); 3559 3560 TAILQ_INIT(&bdev->aliases); 3561 3562 bdev->internal.reset_in_progress = NULL; 3563 3564 _spdk_bdev_qos_config(bdev); 3565 3566 spdk_io_device_register(__bdev_to_io_dev(bdev), 3567 spdk_bdev_channel_create, spdk_bdev_channel_destroy, 3568 sizeof(struct spdk_bdev_channel), 3569 bdev_name); 3570 3571 free(bdev_name); 3572 3573 pthread_mutex_init(&bdev->internal.mutex, NULL); 3574 return 0; 3575 } 3576 3577 static void 3578 spdk_bdev_destroy_cb(void *io_device) 3579 { 3580 int rc; 3581 struct spdk_bdev *bdev; 3582 spdk_bdev_unregister_cb cb_fn; 3583 void *cb_arg; 3584 3585 bdev = __bdev_from_io_dev(io_device); 3586 cb_fn = bdev->internal.unregister_cb; 3587 cb_arg = bdev->internal.unregister_ctx; 3588 3589 rc = bdev->fn_table->destruct(bdev->ctxt); 3590 if (rc < 0) { 3591 SPDK_ERRLOG("destruct failed\n"); 3592 } 3593 if (rc <= 0 && cb_fn != NULL) { 3594 cb_fn(cb_arg, rc); 3595 } 3596 } 3597 3598 3599 static void 3600 spdk_bdev_fini(struct spdk_bdev *bdev) 3601 { 3602 pthread_mutex_destroy(&bdev->internal.mutex); 3603 3604 free(bdev->internal.qos); 3605 3606 spdk_io_device_unregister(__bdev_to_io_dev(bdev), spdk_bdev_destroy_cb); 3607 } 3608 3609 static void 3610 spdk_bdev_start(struct spdk_bdev *bdev) 3611 { 3612 struct spdk_bdev_module *module; 3613 uint32_t action; 3614 3615 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name); 3616 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 3617 3618 /* Examine configuration before initializing I/O */ 3619 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 3620 if (module->examine_config) { 3621 action = module->internal.action_in_progress; 3622 module->internal.action_in_progress++; 3623 module->examine_config(bdev); 3624 if (action != module->internal.action_in_progress) { 3625 SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", 3626 module->name); 3627 } 3628 } 3629 } 3630 3631 if (bdev->internal.claim_module) { 3632 return; 3633 } 3634 3635 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 3636 if (module->examine_disk) { 3637 module->internal.action_in_progress++; 3638 module->examine_disk(bdev); 3639 } 3640 } 3641 } 3642 3643 int 3644 spdk_bdev_register(struct spdk_bdev *bdev) 3645 { 3646 int rc = spdk_bdev_init(bdev); 3647 3648 if (rc == 0) { 3649 spdk_bdev_start(bdev); 3650 } 3651 3652 return rc; 3653 } 3654 3655 int 3656 spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count) 3657 { 3658 int rc; 3659 3660 rc = spdk_bdev_init(vbdev); 3661 if (rc) { 3662 return rc; 3663 } 3664 3665 spdk_bdev_start(vbdev); 3666 return 0; 3667 } 3668 3669 void 3670 
spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 3671 { 3672 if (bdev->internal.unregister_cb != NULL) { 3673 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 3674 } 3675 } 3676 3677 static void 3678 _remove_notify(void *arg) 3679 { 3680 struct spdk_bdev_desc *desc = arg; 3681 3682 desc->remove_scheduled = false; 3683 3684 if (desc->closed) { 3685 free(desc); 3686 } else { 3687 desc->remove_cb(desc->remove_ctx); 3688 } 3689 } 3690 3691 void 3692 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 3693 { 3694 struct spdk_bdev_desc *desc, *tmp; 3695 bool do_destruct = true; 3696 struct spdk_thread *thread; 3697 3698 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name); 3699 3700 thread = spdk_get_thread(); 3701 if (!thread) { 3702 /* The user called this from a non-SPDK thread. */ 3703 if (cb_fn != NULL) { 3704 cb_fn(cb_arg, -ENOTSUP); 3705 } 3706 return; 3707 } 3708 3709 pthread_mutex_lock(&bdev->internal.mutex); 3710 3711 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 3712 bdev->internal.unregister_cb = cb_fn; 3713 bdev->internal.unregister_ctx = cb_arg; 3714 3715 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 3716 if (desc->remove_cb) { 3717 do_destruct = false; 3718 /* 3719 * Defer invocation of the remove_cb to a separate message that will 3720 * run later on its thread. This ensures this context unwinds and 3721 * we don't recursively unregister this bdev again if the remove_cb 3722 * immediately closes its descriptor. 3723 */ 3724 if (!desc->remove_scheduled) { 3725 /* Avoid scheduling removal of the same descriptor multiple times. */ 3726 desc->remove_scheduled = true; 3727 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 3728 } 3729 } 3730 } 3731 3732 if (!do_destruct) { 3733 pthread_mutex_unlock(&bdev->internal.mutex); 3734 return; 3735 } 3736 3737 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 3738 pthread_mutex_unlock(&bdev->internal.mutex); 3739 3740 spdk_bdev_fini(bdev); 3741 } 3742 3743 int 3744 spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb, 3745 void *remove_ctx, struct spdk_bdev_desc **_desc) 3746 { 3747 struct spdk_bdev_desc *desc; 3748 struct spdk_thread *thread; 3749 3750 thread = spdk_get_thread(); 3751 if (!thread) { 3752 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 3753 return -ENOTSUP; 3754 } 3755 3756 desc = calloc(1, sizeof(*desc)); 3757 if (desc == NULL) { 3758 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 3759 return -ENOMEM; 3760 } 3761 3762 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 3763 spdk_get_thread()); 3764 3765 desc->bdev = bdev; 3766 desc->thread = thread; 3767 desc->remove_cb = remove_cb; 3768 desc->remove_ctx = remove_ctx; 3769 desc->write = write; 3770 *_desc = desc; 3771 3772 pthread_mutex_lock(&bdev->internal.mutex); 3773 3774 if (write && bdev->internal.claim_module) { 3775 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 3776 bdev->name, bdev->internal.claim_module->name); 3777 pthread_mutex_unlock(&bdev->internal.mutex); 3778 free(desc); 3779 *_desc = NULL; 3780 return -EPERM; 3781 } 3782 3783 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 3784 3785 pthread_mutex_unlock(&bdev->internal.mutex); 3786 3787 return 0; 3788 } 3789 3790 void 3791 spdk_bdev_close(struct spdk_bdev_desc *desc) 3792 { 3793 struct spdk_bdev *bdev = desc->bdev; 3794 bool do_unregister = false; 
void
spdk_bdev_close(struct spdk_bdev_desc *desc)
{
	struct spdk_bdev *bdev = desc->bdev;
	bool do_unregister = false;

	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
		      spdk_get_thread());

	assert(desc->thread == spdk_get_thread());

	pthread_mutex_lock(&bdev->internal.mutex);

	TAILQ_REMOVE(&bdev->internal.open_descs, desc, link);

	desc->closed = true;

	if (!desc->remove_scheduled) {
		free(desc);
	}

	/* If no more descriptors, kill QoS channel */
	if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n",
			      bdev->name, spdk_get_thread());

		if (spdk_bdev_qos_destroy(bdev)) {
			/* There isn't anything we can do to recover here. Just let the
			 * old QoS poller keep running. The QoS handling won't change
			 * cores when the user allocates a new channel, but it won't break. */
			SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n");
		}
	}

	spdk_bdev_set_qd_sampling_period(bdev, 0);

	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) {
		do_unregister = true;
	}
	pthread_mutex_unlock(&bdev->internal.mutex);

	if (do_unregister == true) {
		spdk_bdev_unregister(bdev, bdev->internal.unregister_cb, bdev->internal.unregister_ctx);
	}
}

int
spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
			    struct spdk_bdev_module *module)
{
	if (bdev->internal.claim_module != NULL) {
		SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name,
			    bdev->internal.claim_module->name);
		return -EPERM;
	}

	if (desc && !desc->write) {
		desc->write = true;
	}

	bdev->internal.claim_module = module;
	return 0;
}

void
spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
{
	assert(bdev->internal.claim_module != NULL);
	bdev->internal.claim_module = NULL;
}

struct spdk_bdev *
spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
{
	return desc->bdev;
}

void
spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
{
	struct iovec *iovs;
	int iovcnt;

	if (bdev_io == NULL) {
		return;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		iovs = bdev_io->u.bdev.iovs;
		iovcnt = bdev_io->u.bdev.iovcnt;
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		iovs = bdev_io->u.bdev.iovs;
		iovcnt = bdev_io->u.bdev.iovcnt;
		break;
	default:
		iovs = NULL;
		iovcnt = 0;
		break;
	}

	if (iovp) {
		*iovp = iovs;
	}
	if (iovcntp) {
		*iovcntp = iovcnt;
	}
}
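/*
 * Illustrative sketch (excluded from the build): inspecting the iovec of a
 * completed I/O from a completion callback.  The callback name is
 * hypothetical.
 */
#if 0
static void
example_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct iovec *iovs;
	int iovcnt, i;
	size_t total = 0;

	if (success) {
		/* Only READ and WRITE I/O carry an iovec; other types yield NULL/0. */
		spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
		for (i = 0; i < iovcnt; i++) {
			total += iovs[i].iov_len;
		}
		printf("read transferred %zu bytes\n", total);
	}

	spdk_bdev_free_io(bdev_io);
}
#endif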
void
spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module)
{
	if (spdk_bdev_module_list_find(bdev_module->name)) {
		SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name);
		assert(false);
	}

	if (bdev_module->async_init) {
		bdev_module->internal.action_in_progress = 1;
	}

	/*
	 * Modules with examine callbacks must be initialized first, so they are
	 * ready to handle examine callbacks from later modules that will
	 * register physical bdevs.
	 */
	if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) {
		TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
	} else {
		TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
	}
}

struct spdk_bdev_module *
spdk_bdev_module_list_find(const char *name)
{
	struct spdk_bdev_module *bdev_module;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (strcmp(name, bdev_module->name) == 0) {
			break;
		}
	}

	return bdev_module;
}
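/*
 * Illustrative sketch (excluded from the build): a minimal bdev module
 * definition.  Real modules register through the SPDK_BDEV_MODULE_REGISTER
 * macro, which ultimately calls spdk_bdev_module_list_add() above; the module
 * name and the example_* callbacks here are hypothetical.
 */
#if 0
static struct spdk_bdev_module example_if = {
	.name = "example",
	.module_init = example_module_init,
	.module_fini = example_module_fini,
	.get_ctx_size = example_get_ctx_size,
	/* Supplying examine callbacks places the module at the head of the
	 * module list so it can react to bdevs registered by later modules. */
	.examine_config = example_examine_config,
	.examine_disk = example_examine_disk,
};
#endif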
static void
_spdk_bdev_write_zero_buffer_next(void *_bdev_io)
{
	struct spdk_bdev_io *bdev_io = _bdev_io;
	uint64_t num_bytes, num_blocks;
	int rc;

	num_bytes = spdk_min(spdk_bdev_get_block_size(bdev_io->bdev) *
			     bdev_io->u.bdev.split_remaining_num_blocks,
			     ZERO_BUFFER_SIZE);
	num_blocks = num_bytes / spdk_bdev_get_block_size(bdev_io->bdev);

	rc = spdk_bdev_write_blocks(bdev_io->internal.desc,
				    spdk_io_channel_from_ctx(bdev_io->internal.ch),
				    g_bdev_mgr.zero_buffer,
				    bdev_io->u.bdev.split_current_offset_blocks, num_blocks,
				    _spdk_bdev_write_zero_buffer_done, bdev_io);
	if (rc == 0) {
		bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks;
		bdev_io->u.bdev.split_current_offset_blocks += num_blocks;
	} else if (rc == -ENOMEM) {
		_spdk_bdev_queue_io_wait_with_cb(bdev_io, _spdk_bdev_write_zero_buffer_next);
	} else {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
	}
}

static void
_spdk_bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *parent_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (!success) {
		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
		return;
	}

	if (parent_io->u.bdev.split_remaining_num_blocks == 0) {
		parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
		parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx);
		return;
	}

	_spdk_bdev_write_zero_buffer_next(parent_io);
}
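/*
 * Illustrative sketch (excluded from the build): the same -ENOMEM handling
 * pattern as above, from the point of view of an application submitting its
 * own I/O via the public spdk_bdev_queue_io_wait() API.  The example_io_ctx
 * structure and callback names are hypothetical.
 */
#if 0
static void example_retry_write(void *arg);

static void
example_submit_write(struct example_io_ctx *ctx)
{
	int rc;

	rc = spdk_bdev_write_blocks(ctx->desc, ctx->ch, ctx->buf,
				    ctx->offset_blocks, ctx->num_blocks,
				    example_write_done, ctx);
	if (rc == -ENOMEM) {
		/* Out of bdev_io objects: ask to be called back when one frees up. */
		ctx->wait_entry.bdev = spdk_bdev_desc_get_bdev(ctx->desc);
		ctx->wait_entry.cb_fn = example_retry_write;
		ctx->wait_entry.cb_arg = ctx;
		spdk_bdev_queue_io_wait(ctx->wait_entry.bdev, ctx->ch, &ctx->wait_entry);
	}
}

static void
example_retry_write(void *arg)
{
	example_submit_write(arg);
}
#endif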
struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

static void
_spdk_bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
{
	pthread_mutex_lock(&ctx->bdev->internal.mutex);
	ctx->bdev->internal.qos_mod_in_progress = false;
	pthread_mutex_unlock(&ctx->bdev->internal.mutex);

	ctx->cb_fn(ctx->cb_arg, status);
	free(ctx);
}

static void
_spdk_bdev_disable_qos_done(void *cb_arg)
{
	struct set_qos_limit_ctx *ctx = cb_arg;
	struct spdk_bdev *bdev = ctx->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_qos *qos;

	pthread_mutex_lock(&bdev->internal.mutex);
	qos = bdev->internal.qos;
	bdev->internal.qos = NULL;
	pthread_mutex_unlock(&bdev->internal.mutex);

	while (!TAILQ_EMPTY(&qos->queued)) {
		/* Send queued I/O back to their original thread for resubmission. */
		bdev_io = TAILQ_FIRST(&qos->queued);
		TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);

		if (bdev_io->internal.io_submit_ch) {
			/*
			 * Channel was changed when sending it to the QoS thread - change it back
			 * before sending it back to the original thread.
			 */
			bdev_io->internal.ch = bdev_io->internal.io_submit_ch;
			bdev_io->internal.io_submit_ch = NULL;
		}

		spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->internal.ch->channel),
				     _spdk_bdev_io_submit, bdev_io);
	}

	if (qos->thread != NULL) {
		spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
		spdk_poller_unregister(&qos->poller);
	}

	free(qos);

	_spdk_bdev_set_qos_limit_done(ctx, 0);
}

static void
_spdk_bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status)
{
	void *io_device = spdk_io_channel_iter_get_io_device(i);
	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
	struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_thread *thread;

	pthread_mutex_lock(&bdev->internal.mutex);
	thread = bdev->internal.qos->thread;
	pthread_mutex_unlock(&bdev->internal.mutex);

	if (thread != NULL) {
		spdk_thread_send_msg(thread, _spdk_bdev_disable_qos_done, ctx);
	} else {
		_spdk_bdev_disable_qos_done(ctx);
	}
}

static void
_spdk_bdev_disable_qos_msg(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);

	bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED;

	spdk_for_each_channel_continue(i, 0);
}

static void
_spdk_bdev_update_qos_rate_limit_msg(void *cb_arg)
{
	struct set_qos_limit_ctx *ctx = cb_arg;
	struct spdk_bdev *bdev = ctx->bdev;

	pthread_mutex_lock(&bdev->internal.mutex);
	spdk_bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos);
	pthread_mutex_unlock(&bdev->internal.mutex);

	_spdk_bdev_set_qos_limit_done(ctx, 0);
}

static void
_spdk_bdev_enable_qos_msg(struct spdk_io_channel_iter *i)
{
	void *io_device = spdk_io_channel_iter_get_io_device(i);
	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);

	pthread_mutex_lock(&bdev->internal.mutex);
	_spdk_bdev_enable_qos(bdev, bdev_ch);
	pthread_mutex_unlock(&bdev->internal.mutex);
	spdk_for_each_channel_continue(i, 0);
}

static void
_spdk_bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status)
{
	struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	_spdk_bdev_set_qos_limit_done(ctx, status);
}

static void
_spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
{
	int i;

	assert(bdev->internal.qos != NULL);

	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
			bdev->internal.qos->rate_limits[i].limit = limits[i];

			if (limits[i] == 0) {
				bdev->internal.qos->rate_limits[i].limit =
					SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
			}
		}
	}
}

void
spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits,
			      void (*cb_fn)(void *cb_arg, int status), void *cb_arg)
{
	struct set_qos_limit_ctx *ctx;
	uint32_t limit_set_complement;
	uint64_t min_limit_per_sec;
	int i;
	bool disable_rate_limit = true;

	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
			continue;
		}

		if (limits[i] > 0) {
			disable_rate_limit = false;
		}

		if (_spdk_bdev_qos_is_iops_rate_limit(i) == true) {
			min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
		} else {
			/* Change from megabyte to byte rate limit */
			limits[i] = limits[i] * 1024 * 1024;
			min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC;
		}

		limit_set_complement = limits[i] % min_limit_per_sec;
		if (limit_set_complement) {
			SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n",
				    limits[i], min_limit_per_sec);
			limits[i] += min_limit_per_sec - limit_set_complement;
			SPDK_ERRLOG("Rounding up the rate limit to %" PRIu64 "\n", limits[i]);
		}
	}

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->bdev = bdev;

	pthread_mutex_lock(&bdev->internal.mutex);
	if (bdev->internal.qos_mod_in_progress) {
		pthread_mutex_unlock(&bdev->internal.mutex);
		free(ctx);
		cb_fn(cb_arg, -EAGAIN);
		return;
	}
	bdev->internal.qos_mod_in_progress = true;

	if (disable_rate_limit == true && bdev->internal.qos) {
		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
			if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED &&
			    (bdev->internal.qos->rate_limits[i].limit > 0 &&
			     bdev->internal.qos->rate_limits[i].limit !=
			     SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) {
				disable_rate_limit = false;
				break;
			}
		}
	}

	if (disable_rate_limit == false) {
		if (bdev->internal.qos == NULL) {
			bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
			if (!bdev->internal.qos) {
				pthread_mutex_unlock(&bdev->internal.mutex);
				SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
				free(ctx);
				cb_fn(cb_arg, -ENOMEM);
				return;
			}
		}

		if (bdev->internal.qos->thread == NULL) {
			/* Enabling */
			_spdk_bdev_set_qos_rate_limits(bdev, limits);

			spdk_for_each_channel(__bdev_to_io_dev(bdev),
					      _spdk_bdev_enable_qos_msg, ctx,
					      _spdk_bdev_enable_qos_done);
		} else {
			/* Updating */
			_spdk_bdev_set_qos_rate_limits(bdev, limits);

			spdk_thread_send_msg(bdev->internal.qos->thread,
					     _spdk_bdev_update_qos_rate_limit_msg, ctx);
		}
	} else {
		if (bdev->internal.qos != NULL) {
			_spdk_bdev_set_qos_rate_limits(bdev, limits);

			/* Disabling */
			spdk_for_each_channel(__bdev_to_io_dev(bdev),
					      _spdk_bdev_disable_qos_msg, ctx,
					      _spdk_bdev_disable_qos_msg_done);
		} else {
			pthread_mutex_unlock(&bdev->internal.mutex);
			_spdk_bdev_set_qos_limit_done(ctx, 0);
			return;
		}
	}

	pthread_mutex_unlock(&bdev->internal.mutex);
}
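/*
 * Illustrative sketch (excluded from the build): capping a bdev at 10000 IO/s
 * and 100 MB/s while leaving the per-direction byte limits untouched.  The
 * completion callback example_set_qos_done is hypothetical.  Byte limits are
 * passed in megabytes; values that are not a multiple of the minimum
 * granularity are rounded up by spdk_bdev_set_qos_rate_limits() above.
 */
#if 0
static void
example_set_qos(struct spdk_bdev *bdev)
{
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
	int i;

	/* NOT_DEFINED means "leave this limit as it is". */
	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		limits[i] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
	}

	limits[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT] = 10000;	/* IO/s */
	limits[SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT] = 100;		/* MB/s */

	spdk_bdev_set_qos_rate_limits(bdev, limits, example_set_qos_done, NULL);
}
#endif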
struct spdk_bdev_histogram_ctx {
	spdk_bdev_histogram_status_cb cb_fn;
	void *cb_arg;
	struct spdk_bdev *bdev;
	int status;
};

static void
_spdk_bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	pthread_mutex_lock(&ctx->bdev->internal.mutex);
	ctx->bdev->internal.histogram_in_progress = false;
	pthread_mutex_unlock(&ctx->bdev->internal.mutex);
	ctx->cb_fn(ctx->cb_arg, ctx->status);
	free(ctx);
}

static void
_spdk_bdev_histogram_disable_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);

	if (ch->histogram != NULL) {
		spdk_histogram_data_free(ch->histogram);
		ch->histogram = NULL;
	}
	spdk_for_each_channel_continue(i, 0);
}

static void
_spdk_bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	if (status != 0) {
		ctx->status = status;
		ctx->bdev->internal.histogram_enabled = false;
		spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), _spdk_bdev_histogram_disable_channel, ctx,
				      _spdk_bdev_histogram_disable_channel_cb);
	} else {
		pthread_mutex_lock(&ctx->bdev->internal.mutex);
		ctx->bdev->internal.histogram_in_progress = false;
		pthread_mutex_unlock(&ctx->bdev->internal.mutex);
		ctx->cb_fn(ctx->cb_arg, ctx->status);
		free(ctx);
	}
}

static void
_spdk_bdev_histogram_enable_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	int status = 0;

	if (ch->histogram == NULL) {
		ch->histogram = spdk_histogram_data_alloc();
		if (ch->histogram == NULL) {
			status = -ENOMEM;
		}
	}

	spdk_for_each_channel_continue(i, status);
}

void
spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn,
			   void *cb_arg, bool enable)
{
	struct spdk_bdev_histogram_ctx *ctx;

	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx));
	if (ctx == NULL) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->bdev = bdev;
	ctx->status = 0;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	pthread_mutex_lock(&bdev->internal.mutex);
	if (bdev->internal.histogram_in_progress) {
		pthread_mutex_unlock(&bdev->internal.mutex);
		free(ctx);
		cb_fn(cb_arg, -EAGAIN);
		return;
	}

	bdev->internal.histogram_in_progress = true;
	pthread_mutex_unlock(&bdev->internal.mutex);

	bdev->internal.histogram_enabled = enable;

	if (enable) {
		/* Allocate histogram for each channel */
		spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_histogram_enable_channel, ctx,
				      _spdk_bdev_histogram_enable_channel_cb);
	} else {
		spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_histogram_disable_channel, ctx,
				      _spdk_bdev_histogram_disable_channel_cb);
	}
}
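/*
 * Illustrative sketch (excluded from the build): turning latency histograms on
 * for a bdev.  The completion callback name is hypothetical.
 */
#if 0
static void
example_histogram_enable_done(void *cb_arg, int status)
{
	if (status != 0) {
		SPDK_ERRLOG("Failed to enable histogram: %d\n", status);
	}
}

static void
example_enable_histogram(struct spdk_bdev *bdev)
{
	/* Allocates a histogram on every channel; completion runs on this thread. */
	spdk_bdev_histogram_enable(bdev, example_histogram_enable_done, NULL, true);
}
#endif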
struct spdk_bdev_histogram_data_ctx {
	spdk_bdev_histogram_data_cb cb_fn;
	void *cb_arg;
	struct spdk_bdev *bdev;
	/** merged histogram data from all channels */
	struct spdk_histogram_data *histogram;
};

static void
_spdk_bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	ctx->cb_fn(ctx->cb_arg, status, ctx->histogram);
	free(ctx);
}

static void
_spdk_bdev_histogram_get_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	int status = 0;

	if (ch->histogram == NULL) {
		status = -EFAULT;
	} else {
		spdk_histogram_data_merge(ctx->histogram, ch->histogram);
	}

	spdk_for_each_channel_continue(i, status);
}

void
spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram,
			spdk_bdev_histogram_data_cb cb_fn,
			void *cb_arg)
{
	struct spdk_bdev_histogram_data_ctx *ctx;

	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx));
	if (ctx == NULL) {
		cb_fn(cb_arg, -ENOMEM, NULL);
		return;
	}

	ctx->bdev = bdev;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	ctx->histogram = histogram;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_histogram_get_channel, ctx,
			      _spdk_bdev_histogram_get_channel_cb);
}

SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV)

SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
	spdk_trace_register_owner(OWNER_BDEV, 'b');
	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
	spdk_trace_register_description("BDEV_IO_START", "", TRACE_BDEV_IO_START, OWNER_BDEV,
					OBJECT_BDEV_IO, 1, 0, "type: ");
	spdk_trace_register_description("BDEV_IO_DONE", "", TRACE_BDEV_IO_DONE, OWNER_BDEV,
					OBJECT_BDEV_IO, 0, 0, "");
}
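/*
 * Illustrative sketch (excluded from the build): collecting the merged latency
 * histogram exposed by spdk_bdev_histogram_get() above.  The callback and the
 * bucket-walk helper example_print_buckets are hypothetical.
 */
#if 0
static void
example_histogram_done(void *cb_arg, int status, struct spdk_histogram_data *histogram)
{
	if (status == 0) {
		/* histogram now holds data merged from every channel. */
		example_print_buckets(histogram);
	}

	spdk_histogram_data_free(histogram);
}

static void
example_get_histogram(struct spdk_bdev *bdev)
{
	struct spdk_histogram_data *histogram;

	histogram = spdk_histogram_data_alloc();
	if (histogram == NULL) {
		return;
	}

	spdk_bdev_histogram_get(bdev, histogram, example_histogram_done, NULL);
}
#endif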