1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "spdk/stdinc.h" 35 36 #include "spdk/bdev.h" 37 #include "spdk/conf.h" 38 39 #include "spdk/config.h" 40 #include "spdk/env.h" 41 #include "spdk/event.h" 42 #include "spdk/thread.h" 43 #include "spdk/likely.h" 44 #include "spdk/queue.h" 45 #include "spdk/nvme_spec.h" 46 #include "spdk/scsi_spec.h" 47 #include "spdk/util.h" 48 #include "spdk/trace.h" 49 50 #include "spdk/bdev_module.h" 51 #include "spdk_internal/log.h" 52 #include "spdk/string.h" 53 54 #ifdef SPDK_CONFIG_VTUNE 55 #include "ittnotify.h" 56 #include "ittnotify_types.h" 57 int __itt_init_ittlib(const char *, __itt_group_id); 58 #endif 59 60 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024) 61 #define SPDK_BDEV_IO_CACHE_SIZE 256 62 #define BUF_SMALL_POOL_SIZE 8192 63 #define BUF_LARGE_POOL_SIZE 1024 64 #define NOMEM_THRESHOLD_COUNT 8 65 #define ZERO_BUFFER_SIZE 0x100000 66 67 #define OWNER_BDEV 0x2 68 69 #define OBJECT_BDEV_IO 0x2 70 71 #define TRACE_GROUP_BDEV 0x3 72 #define TRACE_BDEV_IO_START SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x0) 73 #define TRACE_BDEV_IO_DONE SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x1) 74 75 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 76 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 77 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 78 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 10000 79 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (10 * 1024 * 1024) 80 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 81 82 #define SPDK_BDEV_POOL_ALIGNMENT 512 83 84 static const char *qos_conf_type[] = {"Limit_IOPS", 85 "Limit_BPS", "Limit_Read_BPS", "Limit_Write_BPS" 86 }; 87 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 88 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 89 }; 90 91 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 92 93 struct spdk_bdev_mgr { 94 struct spdk_mempool *bdev_io_pool; 95 96 struct spdk_mempool *buf_small_pool; 97 struct spdk_mempool *buf_large_pool; 98 99 void *zero_buffer; 100 101 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 102 103 struct spdk_bdev_list bdevs; 104 105 bool init_complete; 106 bool module_init_complete; 107 108 #ifdef SPDK_CONFIG_VTUNE 109 __itt_domain *domain; 110 #endif 111 }; 112 113 static struct spdk_bdev_mgr g_bdev_mgr = { 114 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 115 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 116 .init_complete = false, 117 .module_init_complete = false, 118 }; 119 120 static struct spdk_bdev_opts g_bdev_opts = { 121 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 122 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 123 }; 124 125 static spdk_bdev_init_cb g_init_cb_fn = NULL; 126 static void *g_init_cb_arg = NULL; 127 128 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 129 static void *g_fini_cb_arg = NULL; 130 static struct spdk_thread *g_fini_thread = NULL; 131 132 struct spdk_bdev_qos_limit { 133 /** IOs or bytes allowed per second (i.e., 1s). */ 134 uint64_t limit; 135 136 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 137 * For remaining bytes, allowed to run negative if an I/O is submitted when 138 * some bytes are remaining, but the I/O is bigger than that amount. The 139 * excess will be deducted from the next timeslice. 140 */ 141 int64_t remaining_this_timeslice; 142 143 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 144 uint32_t min_per_timeslice; 145 146 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 147 uint32_t max_per_timeslice; 148 149 /** Function to check whether to queue the IO. */ 150 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 151 152 /** Function to update for the submitted IO. */ 153 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 154 }; 155 156 struct spdk_bdev_qos { 157 /** Types of structure of rate limits. */ 158 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 159 160 /** The channel that all I/O are funneled through. */ 161 struct spdk_bdev_channel *ch; 162 163 /** The thread on which the poller is running. */ 164 struct spdk_thread *thread; 165 166 /** Queue of I/O waiting to be issued. */ 167 bdev_io_tailq_t queued; 168 169 /** Size of a timeslice in tsc ticks. */ 170 uint64_t timeslice_size; 171 172 /** Timestamp of start of last timeslice. */ 173 uint64_t last_timeslice; 174 175 /** Poller that processes queued I/O commands each time slice. */ 176 struct spdk_poller *poller; 177 }; 178 179 struct spdk_bdev_mgmt_channel { 180 bdev_io_stailq_t need_buf_small; 181 bdev_io_stailq_t need_buf_large; 182 183 /* 184 * Each thread keeps a cache of bdev_io - this allows 185 * bdev threads which are *not* DPDK threads to still 186 * benefit from a per-thread bdev_io cache. Without 187 * this, non-DPDK threads fetching from the mempool 188 * incur a cmpxchg on get and put. 189 */ 190 bdev_io_stailq_t per_thread_cache; 191 uint32_t per_thread_cache_count; 192 uint32_t bdev_io_cache_size; 193 194 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 195 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 196 }; 197 198 /* 199 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 200 * will queue here their IO that awaits retry. It makes it possible to retry sending 201 * IO to one bdev after IO from other bdev completes. 202 */ 203 struct spdk_bdev_shared_resource { 204 /* The bdev management channel */ 205 struct spdk_bdev_mgmt_channel *mgmt_ch; 206 207 /* 208 * Count of I/O submitted to bdev module and waiting for completion. 209 * Incremented before submit_request() is called on an spdk_bdev_io. 210 */ 211 uint64_t io_outstanding; 212 213 /* 214 * Queue of IO awaiting retry because of a previous NOMEM status returned 215 * on this channel. 216 */ 217 bdev_io_tailq_t nomem_io; 218 219 /* 220 * Threshold which io_outstanding must drop to before retrying nomem_io. 221 */ 222 uint64_t nomem_threshold; 223 224 /* I/O channel allocated by a bdev module */ 225 struct spdk_io_channel *shared_ch; 226 227 /* Refcount of bdev channels using this resource */ 228 uint32_t ref; 229 230 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 231 }; 232 233 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 234 #define BDEV_CH_QOS_ENABLED (1 << 1) 235 236 struct spdk_bdev_channel { 237 struct spdk_bdev *bdev; 238 239 /* The channel for the underlying device */ 240 struct spdk_io_channel *channel; 241 242 /* Per io_device per thread data */ 243 struct spdk_bdev_shared_resource *shared_resource; 244 245 struct spdk_bdev_io_stat stat; 246 247 /* 248 * Count of I/O submitted through this channel and waiting for completion. 249 * Incremented before submit_request() is called on an spdk_bdev_io. 250 */ 251 uint64_t io_outstanding; 252 253 bdev_io_tailq_t queued_resets; 254 255 uint32_t flags; 256 257 struct spdk_histogram_data *histogram; 258 259 #ifdef SPDK_CONFIG_VTUNE 260 uint64_t start_tsc; 261 uint64_t interval_tsc; 262 __itt_string_handle *handle; 263 struct spdk_bdev_io_stat prev_stat; 264 #endif 265 266 }; 267 268 struct spdk_bdev_desc { 269 struct spdk_bdev *bdev; 270 struct spdk_thread *thread; 271 spdk_bdev_remove_cb_t remove_cb; 272 void *remove_ctx; 273 bool remove_scheduled; 274 bool closed; 275 bool write; 276 TAILQ_ENTRY(spdk_bdev_desc) link; 277 }; 278 279 struct spdk_bdev_iostat_ctx { 280 struct spdk_bdev_io_stat *stat; 281 spdk_bdev_get_device_stat_cb cb; 282 void *cb_arg; 283 }; 284 285 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 286 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 287 288 static void _spdk_bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, 289 void *cb_arg); 290 static void _spdk_bdev_write_zero_buffer_next(void *_bdev_io); 291 292 void 293 spdk_bdev_get_opts(struct spdk_bdev_opts *opts) 294 { 295 *opts = g_bdev_opts; 296 } 297 298 int 299 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 300 { 301 uint32_t min_pool_size; 302 303 /* 304 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 305 * initialization. A second mgmt_ch will be created on the same thread when the application starts 306 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 307 */ 308 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 309 if (opts->bdev_io_pool_size < min_pool_size) { 310 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 311 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 312 spdk_thread_get_count()); 313 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 314 return -1; 315 } 316 317 g_bdev_opts = *opts; 318 return 0; 319 } 320 321 struct spdk_bdev * 322 spdk_bdev_first(void) 323 { 324 struct spdk_bdev *bdev; 325 326 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 327 if (bdev) { 328 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name); 329 } 330 331 return bdev; 332 } 333 334 struct spdk_bdev * 335 spdk_bdev_next(struct spdk_bdev *prev) 336 { 337 struct spdk_bdev *bdev; 338 339 bdev = TAILQ_NEXT(prev, internal.link); 340 if (bdev) { 341 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name); 342 } 343 344 return bdev; 345 } 346 347 static struct spdk_bdev * 348 _bdev_next_leaf(struct spdk_bdev *bdev) 349 { 350 while (bdev != NULL) { 351 if (bdev->internal.claim_module == NULL) { 352 return bdev; 353 } else { 354 bdev = TAILQ_NEXT(bdev, internal.link); 355 } 356 } 357 358 return bdev; 359 } 360 361 struct spdk_bdev * 362 spdk_bdev_first_leaf(void) 363 { 364 struct spdk_bdev *bdev; 365 366 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 367 368 if (bdev) { 369 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name); 370 } 371 372 return bdev; 373 } 374 375 struct spdk_bdev * 376 spdk_bdev_next_leaf(struct spdk_bdev *prev) 377 { 378 struct spdk_bdev *bdev; 379 380 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 381 382 if (bdev) { 383 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name); 384 } 385 386 return bdev; 387 } 388 389 struct spdk_bdev * 390 spdk_bdev_get_by_name(const char *bdev_name) 391 { 392 struct spdk_bdev_alias *tmp; 393 struct spdk_bdev *bdev = spdk_bdev_first(); 394 395 while (bdev != NULL) { 396 if (strcmp(bdev_name, bdev->name) == 0) { 397 return bdev; 398 } 399 400 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 401 if (strcmp(bdev_name, tmp->alias) == 0) { 402 return bdev; 403 } 404 } 405 406 bdev = spdk_bdev_next(bdev); 407 } 408 409 return NULL; 410 } 411 412 void 413 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 414 { 415 struct iovec *iovs; 416 417 iovs = bdev_io->u.bdev.iovs; 418 419 assert(iovs != NULL); 420 assert(bdev_io->u.bdev.iovcnt >= 1); 421 422 iovs[0].iov_base = buf; 423 iovs[0].iov_len = len; 424 } 425 426 static bool 427 _is_buf_allocated(struct iovec *iovs) 428 { 429 return iovs[0].iov_base != NULL; 430 } 431 432 static bool 433 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 434 { 435 int i; 436 uintptr_t iov_base; 437 438 if (spdk_likely(alignment == 1)) { 439 return true; 440 } 441 442 for (i = 0; i < iovcnt; i++) { 443 iov_base = (uintptr_t)iovs[i].iov_base; 444 if ((iov_base & (alignment - 1)) != 0) { 445 return false; 446 } 447 } 448 449 return true; 450 } 451 452 static void 453 _copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt) 454 { 455 int i; 456 size_t len; 457 458 for (i = 0; i < iovcnt; i++) { 459 len = spdk_min(iovs[i].iov_len, buf_len); 460 memcpy(buf, iovs[i].iov_base, len); 461 buf += len; 462 buf_len -= len; 463 } 464 } 465 466 static void 467 _copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len) 468 { 469 int i; 470 size_t len; 471 472 for (i = 0; i < iovcnt; i++) { 473 len = spdk_min(iovs[i].iov_len, buf_len); 474 memcpy(iovs[i].iov_base, buf, len); 475 buf += len; 476 buf_len -= len; 477 } 478 } 479 480 static void 481 _bdev_io_set_bounce_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 482 { 483 /* save original iovec */ 484 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 485 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 486 /* set bounce iov */ 487 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 488 bdev_io->u.bdev.iovcnt = 1; 489 /* set bounce buffer for this operation */ 490 bdev_io->u.bdev.iovs[0].iov_base = buf; 491 bdev_io->u.bdev.iovs[0].iov_len = len; 492 /* if this is write path, copy data from original buffer to bounce buffer */ 493 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 494 _copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 495 } 496 } 497 498 static void 499 spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 500 { 501 struct spdk_mempool *pool; 502 struct spdk_bdev_io *tmp; 503 void *buf, *aligned_buf; 504 bdev_io_stailq_t *stailq; 505 struct spdk_bdev_mgmt_channel *ch; 506 uint64_t buf_len; 507 uint64_t alignment; 508 bool buf_allocated; 509 510 buf = bdev_io->internal.buf; 511 buf_len = bdev_io->internal.buf_len; 512 alignment = spdk_bdev_get_buf_align(bdev_io->bdev); 513 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 514 515 bdev_io->internal.buf = NULL; 516 517 if (buf_len + alignment <= SPDK_BDEV_SMALL_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT) { 518 pool = g_bdev_mgr.buf_small_pool; 519 stailq = &ch->need_buf_small; 520 } else { 521 pool = g_bdev_mgr.buf_large_pool; 522 stailq = &ch->need_buf_large; 523 } 524 525 if (STAILQ_EMPTY(stailq)) { 526 spdk_mempool_put(pool, buf); 527 } else { 528 tmp = STAILQ_FIRST(stailq); 529 530 alignment = spdk_bdev_get_buf_align(tmp->bdev); 531 buf_allocated = _is_buf_allocated(tmp->u.bdev.iovs); 532 533 aligned_buf = (void *)(((uintptr_t)buf + 534 (alignment - 1)) & ~(alignment - 1)); 535 if (buf_allocated) { 536 _bdev_io_set_bounce_buf(tmp, aligned_buf, tmp->internal.buf_len); 537 } else { 538 spdk_bdev_io_set_buf(tmp, aligned_buf, tmp->internal.buf_len); 539 } 540 541 STAILQ_REMOVE_HEAD(stailq, internal.buf_link); 542 tmp->internal.buf = buf; 543 tmp->internal.get_buf_cb(tmp->internal.ch->channel, tmp); 544 } 545 } 546 547 static void 548 _bdev_io_unset_bounce_buf(struct spdk_bdev_io *bdev_io) 549 { 550 /* if this is read path, copy data from bounce buffer to original buffer */ 551 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 552 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 553 _copy_buf_to_iovs(bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt, 554 bdev_io->internal.bounce_iov.iov_base, bdev_io->internal.bounce_iov.iov_len); 555 } 556 /* set orignal buffer for this io */ 557 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 558 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 559 /* disable bouncing buffer for this io */ 560 bdev_io->internal.orig_iovcnt = 0; 561 bdev_io->internal.orig_iovs = NULL; 562 /* return bounce buffer to the pool */ 563 spdk_bdev_io_put_buf(bdev_io); 564 } 565 566 void 567 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 568 { 569 struct spdk_mempool *pool; 570 bdev_io_stailq_t *stailq; 571 void *buf, *aligned_buf; 572 struct spdk_bdev_mgmt_channel *mgmt_ch; 573 uint64_t alignment; 574 bool buf_allocated; 575 576 assert(cb != NULL); 577 assert(bdev_io->u.bdev.iovs != NULL); 578 579 alignment = spdk_bdev_get_buf_align(bdev_io->bdev); 580 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 581 582 if (buf_allocated && 583 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 584 /* Buffer already present and aligned */ 585 cb(bdev_io->internal.ch->channel, bdev_io); 586 return; 587 } 588 589 assert(len + alignment <= SPDK_BDEV_LARGE_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT); 590 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 591 592 bdev_io->internal.buf_len = len; 593 bdev_io->internal.get_buf_cb = cb; 594 595 if (len + alignment <= SPDK_BDEV_SMALL_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT) { 596 pool = g_bdev_mgr.buf_small_pool; 597 stailq = &mgmt_ch->need_buf_small; 598 } else { 599 pool = g_bdev_mgr.buf_large_pool; 600 stailq = &mgmt_ch->need_buf_large; 601 } 602 603 buf = spdk_mempool_get(pool); 604 605 if (!buf) { 606 STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link); 607 } else { 608 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 609 610 if (buf_allocated) { 611 _bdev_io_set_bounce_buf(bdev_io, aligned_buf, len); 612 } else { 613 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 614 } 615 bdev_io->internal.buf = buf; 616 bdev_io->internal.get_buf_cb(bdev_io->internal.ch->channel, bdev_io); 617 } 618 } 619 620 static int 621 spdk_bdev_module_get_max_ctx_size(void) 622 { 623 struct spdk_bdev_module *bdev_module; 624 int max_bdev_module_size = 0; 625 626 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 627 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 628 max_bdev_module_size = bdev_module->get_ctx_size(); 629 } 630 } 631 632 return max_bdev_module_size; 633 } 634 635 void 636 spdk_bdev_config_text(FILE *fp) 637 { 638 struct spdk_bdev_module *bdev_module; 639 640 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 641 if (bdev_module->config_text) { 642 bdev_module->config_text(fp); 643 } 644 } 645 } 646 647 static void 648 spdk_bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 649 { 650 int i; 651 struct spdk_bdev_qos *qos = bdev->internal.qos; 652 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 653 654 if (!qos) { 655 return; 656 } 657 658 spdk_bdev_get_qos_rate_limits(bdev, limits); 659 660 spdk_json_write_object_begin(w); 661 spdk_json_write_named_string(w, "method", "set_bdev_qos_limit"); 662 spdk_json_write_name(w, "params"); 663 664 spdk_json_write_object_begin(w); 665 spdk_json_write_named_string(w, "name", bdev->name); 666 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 667 if (limits[i] > 0) { 668 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 669 } 670 } 671 spdk_json_write_object_end(w); 672 673 spdk_json_write_object_end(w); 674 } 675 676 void 677 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 678 { 679 struct spdk_bdev_module *bdev_module; 680 struct spdk_bdev *bdev; 681 682 assert(w != NULL); 683 684 spdk_json_write_array_begin(w); 685 686 spdk_json_write_object_begin(w); 687 spdk_json_write_named_string(w, "method", "set_bdev_options"); 688 spdk_json_write_name(w, "params"); 689 spdk_json_write_object_begin(w); 690 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 691 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 692 spdk_json_write_object_end(w); 693 spdk_json_write_object_end(w); 694 695 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 696 if (bdev_module->config_json) { 697 bdev_module->config_json(w); 698 } 699 } 700 701 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 702 spdk_bdev_qos_config_json(bdev, w); 703 704 if (bdev->fn_table->write_config_json) { 705 bdev->fn_table->write_config_json(bdev, w); 706 } 707 } 708 709 spdk_json_write_array_end(w); 710 } 711 712 static int 713 spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 714 { 715 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 716 struct spdk_bdev_io *bdev_io; 717 uint32_t i; 718 719 STAILQ_INIT(&ch->need_buf_small); 720 STAILQ_INIT(&ch->need_buf_large); 721 722 STAILQ_INIT(&ch->per_thread_cache); 723 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 724 725 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 726 ch->per_thread_cache_count = 0; 727 for (i = 0; i < ch->bdev_io_cache_size; i++) { 728 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 729 assert(bdev_io != NULL); 730 ch->per_thread_cache_count++; 731 STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, internal.buf_link); 732 } 733 734 TAILQ_INIT(&ch->shared_resources); 735 TAILQ_INIT(&ch->io_wait_queue); 736 737 return 0; 738 } 739 740 static void 741 spdk_bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 742 { 743 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 744 struct spdk_bdev_io *bdev_io; 745 746 if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) { 747 SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n"); 748 } 749 750 if (!TAILQ_EMPTY(&ch->shared_resources)) { 751 SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n"); 752 } 753 754 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 755 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 756 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 757 ch->per_thread_cache_count--; 758 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 759 } 760 761 assert(ch->per_thread_cache_count == 0); 762 } 763 764 static void 765 spdk_bdev_init_complete(int rc) 766 { 767 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 768 void *cb_arg = g_init_cb_arg; 769 struct spdk_bdev_module *m; 770 771 g_bdev_mgr.init_complete = true; 772 g_init_cb_fn = NULL; 773 g_init_cb_arg = NULL; 774 775 /* 776 * For modules that need to know when subsystem init is complete, 777 * inform them now. 778 */ 779 if (rc == 0) { 780 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 781 if (m->init_complete) { 782 m->init_complete(); 783 } 784 } 785 } 786 787 cb_fn(cb_arg, rc); 788 } 789 790 static void 791 spdk_bdev_module_action_complete(void) 792 { 793 struct spdk_bdev_module *m; 794 795 /* 796 * Don't finish bdev subsystem initialization if 797 * module pre-initialization is still in progress, or 798 * the subsystem been already initialized. 799 */ 800 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 801 return; 802 } 803 804 /* 805 * Check all bdev modules for inits/examinations in progress. If any 806 * exist, return immediately since we cannot finish bdev subsystem 807 * initialization until all are completed. 808 */ 809 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 810 if (m->internal.action_in_progress > 0) { 811 return; 812 } 813 } 814 815 /* 816 * Modules already finished initialization - now that all 817 * the bdev modules have finished their asynchronous I/O 818 * processing, the entire bdev layer can be marked as complete. 819 */ 820 spdk_bdev_init_complete(0); 821 } 822 823 static void 824 spdk_bdev_module_action_done(struct spdk_bdev_module *module) 825 { 826 assert(module->internal.action_in_progress > 0); 827 module->internal.action_in_progress--; 828 spdk_bdev_module_action_complete(); 829 } 830 831 void 832 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 833 { 834 spdk_bdev_module_action_done(module); 835 } 836 837 void 838 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 839 { 840 spdk_bdev_module_action_done(module); 841 } 842 843 /** The last initialized bdev module */ 844 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 845 846 static int 847 spdk_bdev_modules_init(void) 848 { 849 struct spdk_bdev_module *module; 850 int rc = 0; 851 852 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 853 g_resume_bdev_module = module; 854 rc = module->module_init(); 855 if (rc != 0) { 856 return rc; 857 } 858 } 859 860 g_resume_bdev_module = NULL; 861 return 0; 862 } 863 864 865 static void 866 spdk_bdev_init_failed_complete(void *cb_arg) 867 { 868 spdk_bdev_init_complete(-1); 869 } 870 871 static void 872 spdk_bdev_init_failed(void *cb_arg) 873 { 874 spdk_bdev_finish(spdk_bdev_init_failed_complete, NULL); 875 } 876 877 void 878 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 879 { 880 struct spdk_conf_section *sp; 881 struct spdk_bdev_opts bdev_opts; 882 int32_t bdev_io_pool_size, bdev_io_cache_size; 883 int cache_size; 884 int rc = 0; 885 char mempool_name[32]; 886 887 assert(cb_fn != NULL); 888 889 sp = spdk_conf_find_section(NULL, "Bdev"); 890 if (sp != NULL) { 891 spdk_bdev_get_opts(&bdev_opts); 892 893 bdev_io_pool_size = spdk_conf_section_get_intval(sp, "BdevIoPoolSize"); 894 if (bdev_io_pool_size >= 0) { 895 bdev_opts.bdev_io_pool_size = bdev_io_pool_size; 896 } 897 898 bdev_io_cache_size = spdk_conf_section_get_intval(sp, "BdevIoCacheSize"); 899 if (bdev_io_cache_size >= 0) { 900 bdev_opts.bdev_io_cache_size = bdev_io_cache_size; 901 } 902 903 if (spdk_bdev_set_opts(&bdev_opts)) { 904 spdk_bdev_init_complete(-1); 905 return; 906 } 907 908 assert(memcmp(&bdev_opts, &g_bdev_opts, sizeof(bdev_opts)) == 0); 909 } 910 911 g_init_cb_fn = cb_fn; 912 g_init_cb_arg = cb_arg; 913 914 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 915 916 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 917 g_bdev_opts.bdev_io_pool_size, 918 sizeof(struct spdk_bdev_io) + 919 spdk_bdev_module_get_max_ctx_size(), 920 0, 921 SPDK_ENV_SOCKET_ID_ANY); 922 923 if (g_bdev_mgr.bdev_io_pool == NULL) { 924 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 925 spdk_bdev_init_complete(-1); 926 return; 927 } 928 929 /** 930 * Ensure no more than half of the total buffers end up local caches, by 931 * using spdk_thread_get_count() to determine how many local caches we need 932 * to account for. 933 */ 934 cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_thread_get_count()); 935 snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid()); 936 937 g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name, 938 BUF_SMALL_POOL_SIZE, 939 SPDK_BDEV_SMALL_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT, 940 cache_size, 941 SPDK_ENV_SOCKET_ID_ANY); 942 if (!g_bdev_mgr.buf_small_pool) { 943 SPDK_ERRLOG("create rbuf small pool failed\n"); 944 spdk_bdev_init_complete(-1); 945 return; 946 } 947 948 cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_thread_get_count()); 949 snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid()); 950 951 g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name, 952 BUF_LARGE_POOL_SIZE, 953 SPDK_BDEV_LARGE_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT, 954 cache_size, 955 SPDK_ENV_SOCKET_ID_ANY); 956 if (!g_bdev_mgr.buf_large_pool) { 957 SPDK_ERRLOG("create rbuf large pool failed\n"); 958 spdk_bdev_init_complete(-1); 959 return; 960 } 961 962 g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 963 NULL); 964 if (!g_bdev_mgr.zero_buffer) { 965 SPDK_ERRLOG("create bdev zero buffer failed\n"); 966 spdk_bdev_init_complete(-1); 967 return; 968 } 969 970 #ifdef SPDK_CONFIG_VTUNE 971 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 972 #endif 973 974 spdk_io_device_register(&g_bdev_mgr, spdk_bdev_mgmt_channel_create, 975 spdk_bdev_mgmt_channel_destroy, 976 sizeof(struct spdk_bdev_mgmt_channel), 977 "bdev_mgr"); 978 979 rc = spdk_bdev_modules_init(); 980 g_bdev_mgr.module_init_complete = true; 981 if (rc != 0) { 982 SPDK_ERRLOG("bdev modules init failed\n"); 983 spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_init_failed, NULL); 984 return; 985 } 986 987 spdk_bdev_module_action_complete(); 988 } 989 990 static void 991 spdk_bdev_mgr_unregister_cb(void *io_device) 992 { 993 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 994 995 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 996 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 997 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 998 g_bdev_opts.bdev_io_pool_size); 999 } 1000 1001 if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) { 1002 SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n", 1003 spdk_mempool_count(g_bdev_mgr.buf_small_pool), 1004 BUF_SMALL_POOL_SIZE); 1005 assert(false); 1006 } 1007 1008 if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) { 1009 SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n", 1010 spdk_mempool_count(g_bdev_mgr.buf_large_pool), 1011 BUF_LARGE_POOL_SIZE); 1012 assert(false); 1013 } 1014 1015 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1016 spdk_mempool_free(g_bdev_mgr.buf_small_pool); 1017 spdk_mempool_free(g_bdev_mgr.buf_large_pool); 1018 spdk_dma_free(g_bdev_mgr.zero_buffer); 1019 1020 cb_fn(g_fini_cb_arg); 1021 g_fini_cb_fn = NULL; 1022 g_fini_cb_arg = NULL; 1023 g_bdev_mgr.init_complete = false; 1024 g_bdev_mgr.module_init_complete = false; 1025 } 1026 1027 static void 1028 spdk_bdev_module_finish_iter(void *arg) 1029 { 1030 struct spdk_bdev_module *bdev_module; 1031 1032 /* Start iterating from the last touched module */ 1033 if (!g_resume_bdev_module) { 1034 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1035 } else { 1036 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 1037 internal.tailq); 1038 } 1039 1040 while (bdev_module) { 1041 if (bdev_module->async_fini) { 1042 /* Save our place so we can resume later. We must 1043 * save the variable here, before calling module_fini() 1044 * below, because in some cases the module may immediately 1045 * call spdk_bdev_module_finish_done() and re-enter 1046 * this function to continue iterating. */ 1047 g_resume_bdev_module = bdev_module; 1048 } 1049 1050 if (bdev_module->module_fini) { 1051 bdev_module->module_fini(); 1052 } 1053 1054 if (bdev_module->async_fini) { 1055 return; 1056 } 1057 1058 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 1059 internal.tailq); 1060 } 1061 1062 g_resume_bdev_module = NULL; 1063 spdk_io_device_unregister(&g_bdev_mgr, spdk_bdev_mgr_unregister_cb); 1064 } 1065 1066 void 1067 spdk_bdev_module_finish_done(void) 1068 { 1069 if (spdk_get_thread() != g_fini_thread) { 1070 spdk_thread_send_msg(g_fini_thread, spdk_bdev_module_finish_iter, NULL); 1071 } else { 1072 spdk_bdev_module_finish_iter(NULL); 1073 } 1074 } 1075 1076 static void 1077 _spdk_bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 1078 { 1079 struct spdk_bdev *bdev = cb_arg; 1080 1081 if (bdeverrno && bdev) { 1082 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 1083 bdev->name); 1084 1085 /* 1086 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 1087 * bdev; try to continue by manually removing this bdev from the list and continue 1088 * with the next bdev in the list. 1089 */ 1090 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 1091 } 1092 1093 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 1094 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n"); 1095 /* 1096 * Bdev module finish need to be deferred as we might be in the middle of some context 1097 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 1098 * after returning. 1099 */ 1100 spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_module_finish_iter, NULL); 1101 return; 1102 } 1103 1104 /* 1105 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 1106 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 1107 * to detect clean shutdown as opposed to run-time hot removal of the underlying 1108 * base bdevs. 1109 * 1110 * Also, walk the list in the reverse order. 1111 */ 1112 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1113 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1114 if (bdev->internal.claim_module != NULL) { 1115 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Skipping claimed bdev '%s'(<-'%s').\n", 1116 bdev->name, bdev->internal.claim_module->name); 1117 continue; 1118 } 1119 1120 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name); 1121 spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev); 1122 return; 1123 } 1124 1125 /* 1126 * If any bdev fails to unclaim underlying bdev properly, we may face the 1127 * case of bdev list consisting of claimed bdevs only (if claims are managed 1128 * correctly, this would mean there's a loop in the claims graph which is 1129 * clearly impossible). Warn and unregister last bdev on the list then. 1130 */ 1131 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1132 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1133 SPDK_ERRLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 1134 spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev); 1135 return; 1136 } 1137 } 1138 1139 void 1140 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1141 { 1142 struct spdk_bdev_module *m; 1143 1144 assert(cb_fn != NULL); 1145 1146 g_fini_thread = spdk_get_thread(); 1147 1148 g_fini_cb_fn = cb_fn; 1149 g_fini_cb_arg = cb_arg; 1150 1151 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1152 if (m->fini_start) { 1153 m->fini_start(); 1154 } 1155 } 1156 1157 _spdk_bdev_finish_unregister_bdevs_iter(NULL, 0); 1158 } 1159 1160 static struct spdk_bdev_io * 1161 spdk_bdev_get_io(struct spdk_bdev_channel *channel) 1162 { 1163 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 1164 struct spdk_bdev_io *bdev_io; 1165 1166 if (ch->per_thread_cache_count > 0) { 1167 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1168 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1169 ch->per_thread_cache_count--; 1170 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 1171 /* 1172 * Don't try to look for bdev_ios in the global pool if there are 1173 * waiters on bdev_ios - we don't want this caller to jump the line. 1174 */ 1175 bdev_io = NULL; 1176 } else { 1177 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1178 } 1179 1180 return bdev_io; 1181 } 1182 1183 void 1184 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 1185 { 1186 struct spdk_bdev_mgmt_channel *ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1187 1188 assert(bdev_io != NULL); 1189 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 1190 1191 if (bdev_io->internal.buf != NULL) { 1192 spdk_bdev_io_put_buf(bdev_io); 1193 } 1194 1195 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 1196 ch->per_thread_cache_count++; 1197 STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, internal.buf_link); 1198 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 1199 struct spdk_bdev_io_wait_entry *entry; 1200 1201 entry = TAILQ_FIRST(&ch->io_wait_queue); 1202 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 1203 entry->cb_fn(entry->cb_arg); 1204 } 1205 } else { 1206 /* We should never have a full cache with entries on the io wait queue. */ 1207 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 1208 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1209 } 1210 } 1211 1212 static bool 1213 _spdk_bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 1214 { 1215 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 1216 1217 switch (limit) { 1218 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1219 return true; 1220 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1221 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1222 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1223 return false; 1224 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 1225 default: 1226 return false; 1227 } 1228 } 1229 1230 static bool 1231 _spdk_bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 1232 { 1233 switch (bdev_io->type) { 1234 case SPDK_BDEV_IO_TYPE_NVME_IO: 1235 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1236 case SPDK_BDEV_IO_TYPE_READ: 1237 case SPDK_BDEV_IO_TYPE_WRITE: 1238 case SPDK_BDEV_IO_TYPE_UNMAP: 1239 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 1240 return true; 1241 default: 1242 return false; 1243 } 1244 } 1245 1246 static bool 1247 _spdk_bdev_is_read_io(struct spdk_bdev_io *bdev_io) 1248 { 1249 switch (bdev_io->type) { 1250 case SPDK_BDEV_IO_TYPE_NVME_IO: 1251 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1252 /* Bit 1 (0x2) set for read operation */ 1253 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 1254 return true; 1255 } else { 1256 return false; 1257 } 1258 case SPDK_BDEV_IO_TYPE_READ: 1259 return true; 1260 default: 1261 return false; 1262 } 1263 } 1264 1265 static uint64_t 1266 _spdk_bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 1267 { 1268 struct spdk_bdev *bdev = bdev_io->bdev; 1269 1270 switch (bdev_io->type) { 1271 case SPDK_BDEV_IO_TYPE_NVME_IO: 1272 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1273 return bdev_io->u.nvme_passthru.nbytes; 1274 case SPDK_BDEV_IO_TYPE_READ: 1275 case SPDK_BDEV_IO_TYPE_WRITE: 1276 case SPDK_BDEV_IO_TYPE_UNMAP: 1277 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 1278 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 1279 default: 1280 return 0; 1281 } 1282 } 1283 1284 static bool 1285 _spdk_bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1286 { 1287 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 1288 return true; 1289 } else { 1290 return false; 1291 } 1292 } 1293 1294 static bool 1295 _spdk_bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1296 { 1297 if (_spdk_bdev_is_read_io(io) == false) { 1298 return false; 1299 } 1300 1301 return _spdk_bdev_qos_rw_queue_io(limit, io); 1302 } 1303 1304 static bool 1305 _spdk_bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1306 { 1307 if (_spdk_bdev_is_read_io(io) == true) { 1308 return false; 1309 } 1310 1311 return _spdk_bdev_qos_rw_queue_io(limit, io); 1312 } 1313 1314 static void 1315 _spdk_bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1316 { 1317 limit->remaining_this_timeslice--; 1318 } 1319 1320 static void 1321 _spdk_bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1322 { 1323 limit->remaining_this_timeslice -= _spdk_bdev_get_io_size_in_byte(io); 1324 } 1325 1326 static void 1327 _spdk_bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1328 { 1329 if (_spdk_bdev_is_read_io(io) == false) { 1330 return; 1331 } 1332 1333 return _spdk_bdev_qos_rw_bps_update_quota(limit, io); 1334 } 1335 1336 static void 1337 _spdk_bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1338 { 1339 if (_spdk_bdev_is_read_io(io) == true) { 1340 return; 1341 } 1342 1343 return _spdk_bdev_qos_rw_bps_update_quota(limit, io); 1344 } 1345 1346 static void 1347 _spdk_bdev_qos_set_ops(struct spdk_bdev_qos *qos) 1348 { 1349 int i; 1350 1351 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1352 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 1353 qos->rate_limits[i].queue_io = NULL; 1354 qos->rate_limits[i].update_quota = NULL; 1355 continue; 1356 } 1357 1358 switch (i) { 1359 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1360 qos->rate_limits[i].queue_io = _spdk_bdev_qos_rw_queue_io; 1361 qos->rate_limits[i].update_quota = _spdk_bdev_qos_rw_iops_update_quota; 1362 break; 1363 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1364 qos->rate_limits[i].queue_io = _spdk_bdev_qos_rw_queue_io; 1365 qos->rate_limits[i].update_quota = _spdk_bdev_qos_rw_bps_update_quota; 1366 break; 1367 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1368 qos->rate_limits[i].queue_io = _spdk_bdev_qos_r_queue_io; 1369 qos->rate_limits[i].update_quota = _spdk_bdev_qos_r_bps_update_quota; 1370 break; 1371 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1372 qos->rate_limits[i].queue_io = _spdk_bdev_qos_w_queue_io; 1373 qos->rate_limits[i].update_quota = _spdk_bdev_qos_w_bps_update_quota; 1374 break; 1375 default: 1376 break; 1377 } 1378 } 1379 } 1380 1381 static int 1382 _spdk_bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 1383 { 1384 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 1385 struct spdk_bdev *bdev = ch->bdev; 1386 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 1387 int i, submitted_ios = 0; 1388 1389 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 1390 if (_spdk_bdev_qos_io_to_limit(bdev_io) == true) { 1391 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1392 if (!qos->rate_limits[i].queue_io) { 1393 continue; 1394 } 1395 1396 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 1397 bdev_io) == true) { 1398 return submitted_ios; 1399 } 1400 } 1401 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1402 if (!qos->rate_limits[i].update_quota) { 1403 continue; 1404 } 1405 1406 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 1407 } 1408 } 1409 1410 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 1411 ch->io_outstanding++; 1412 shared_resource->io_outstanding++; 1413 bdev_io->internal.in_submit_request = true; 1414 bdev->fn_table->submit_request(ch->channel, bdev_io); 1415 bdev_io->internal.in_submit_request = false; 1416 submitted_ios++; 1417 } 1418 1419 return submitted_ios; 1420 } 1421 1422 static void 1423 _spdk_bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 1424 { 1425 int rc; 1426 1427 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 1428 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 1429 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 1430 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 1431 &bdev_io->internal.waitq_entry); 1432 if (rc != 0) { 1433 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 1434 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1435 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 1436 } 1437 } 1438 1439 static bool 1440 _spdk_bdev_io_type_can_split(uint8_t type) 1441 { 1442 assert(type != SPDK_BDEV_IO_TYPE_INVALID); 1443 assert(type < SPDK_BDEV_NUM_IO_TYPES); 1444 1445 /* Only split READ and WRITE I/O. Theoretically other types of I/O like 1446 * UNMAP could be split, but these types of I/O are typically much larger 1447 * in size (sometimes the size of the entire block device), and the bdev 1448 * module can more efficiently split these types of I/O. Plus those types 1449 * of I/O do not have a payload, which makes the splitting process simpler. 1450 */ 1451 if (type == SPDK_BDEV_IO_TYPE_READ || type == SPDK_BDEV_IO_TYPE_WRITE) { 1452 return true; 1453 } else { 1454 return false; 1455 } 1456 } 1457 1458 static bool 1459 _spdk_bdev_io_should_split(struct spdk_bdev_io *bdev_io) 1460 { 1461 uint64_t start_stripe, end_stripe; 1462 uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary; 1463 1464 if (io_boundary == 0) { 1465 return false; 1466 } 1467 1468 if (!_spdk_bdev_io_type_can_split(bdev_io->type)) { 1469 return false; 1470 } 1471 1472 start_stripe = bdev_io->u.bdev.offset_blocks; 1473 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 1474 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 1475 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 1476 start_stripe >>= spdk_u32log2(io_boundary); 1477 end_stripe >>= spdk_u32log2(io_boundary); 1478 } else { 1479 start_stripe /= io_boundary; 1480 end_stripe /= io_boundary; 1481 } 1482 return (start_stripe != end_stripe); 1483 } 1484 1485 static uint32_t 1486 _to_next_boundary(uint64_t offset, uint32_t boundary) 1487 { 1488 return (boundary - (offset % boundary)); 1489 } 1490 1491 static void 1492 _spdk_bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 1493 1494 static void 1495 _spdk_bdev_io_split_with_payload(void *_bdev_io) 1496 { 1497 struct spdk_bdev_io *bdev_io = _bdev_io; 1498 uint64_t current_offset, remaining; 1499 uint32_t blocklen, to_next_boundary, to_next_boundary_bytes; 1500 struct iovec *parent_iov, *iov; 1501 uint64_t parent_iov_offset, iov_len; 1502 uint32_t parent_iovpos, parent_iovcnt, child_iovcnt, iovcnt; 1503 int rc; 1504 1505 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 1506 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 1507 blocklen = bdev_io->bdev->blocklen; 1508 parent_iov_offset = (current_offset - bdev_io->u.bdev.offset_blocks) * blocklen; 1509 parent_iovcnt = bdev_io->u.bdev.iovcnt; 1510 1511 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 1512 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 1513 if (parent_iov_offset < parent_iov->iov_len) { 1514 break; 1515 } 1516 parent_iov_offset -= parent_iov->iov_len; 1517 } 1518 1519 child_iovcnt = 0; 1520 while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { 1521 to_next_boundary = _to_next_boundary(current_offset, bdev_io->bdev->optimal_io_boundary); 1522 to_next_boundary = spdk_min(remaining, to_next_boundary); 1523 to_next_boundary_bytes = to_next_boundary * blocklen; 1524 iov = &bdev_io->child_iov[child_iovcnt]; 1525 iovcnt = 0; 1526 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 1527 child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { 1528 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 1529 iov_len = spdk_min(to_next_boundary_bytes, parent_iov->iov_len - parent_iov_offset); 1530 to_next_boundary_bytes -= iov_len; 1531 1532 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 1533 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 1534 1535 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 1536 parent_iov_offset += iov_len; 1537 } else { 1538 parent_iovpos++; 1539 parent_iov_offset = 0; 1540 } 1541 child_iovcnt++; 1542 iovcnt++; 1543 } 1544 1545 if (to_next_boundary_bytes > 0) { 1546 /* We had to stop this child I/O early because we ran out of 1547 * child_iov space. Make sure the iovs collected are valid and 1548 * then adjust to_next_boundary before starting the child I/O. 1549 */ 1550 if ((to_next_boundary_bytes % blocklen) != 0) { 1551 SPDK_ERRLOG("Remaining %" PRIu32 " is not multiple of block size %" PRIu32 "\n", 1552 to_next_boundary_bytes, blocklen); 1553 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1554 if (bdev_io->u.bdev.split_outstanding == 0) { 1555 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 1556 } 1557 return; 1558 } 1559 to_next_boundary -= to_next_boundary_bytes / blocklen; 1560 } 1561 1562 bdev_io->u.bdev.split_outstanding++; 1563 1564 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1565 rc = spdk_bdev_readv_blocks(bdev_io->internal.desc, 1566 spdk_io_channel_from_ctx(bdev_io->internal.ch), 1567 iov, iovcnt, current_offset, to_next_boundary, 1568 _spdk_bdev_io_split_done, bdev_io); 1569 } else { 1570 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 1571 spdk_io_channel_from_ctx(bdev_io->internal.ch), 1572 iov, iovcnt, current_offset, to_next_boundary, 1573 _spdk_bdev_io_split_done, bdev_io); 1574 } 1575 1576 if (rc == 0) { 1577 current_offset += to_next_boundary; 1578 remaining -= to_next_boundary; 1579 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 1580 bdev_io->u.bdev.split_remaining_num_blocks = remaining; 1581 } else { 1582 bdev_io->u.bdev.split_outstanding--; 1583 if (rc == -ENOMEM) { 1584 if (bdev_io->u.bdev.split_outstanding == 0) { 1585 /* No I/O is outstanding. Hence we should wait here. */ 1586 _spdk_bdev_queue_io_wait_with_cb(bdev_io, 1587 _spdk_bdev_io_split_with_payload); 1588 } 1589 } else { 1590 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1591 if (bdev_io->u.bdev.split_outstanding == 0) { 1592 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 1593 } 1594 } 1595 1596 return; 1597 } 1598 } 1599 } 1600 1601 static void 1602 _spdk_bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 1603 { 1604 struct spdk_bdev_io *parent_io = cb_arg; 1605 1606 spdk_bdev_free_io(bdev_io); 1607 1608 if (!success) { 1609 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1610 } 1611 parent_io->u.bdev.split_outstanding--; 1612 if (parent_io->u.bdev.split_outstanding != 0) { 1613 return; 1614 } 1615 1616 /* 1617 * Parent I/O finishes when all blocks are consumed or there is any failure of 1618 * child I/O and no outstanding child I/O. 1619 */ 1620 if (parent_io->u.bdev.split_remaining_num_blocks == 0 || 1621 parent_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS) { 1622 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 1623 parent_io->internal.caller_ctx); 1624 return; 1625 } 1626 1627 /* 1628 * Continue with the splitting process. This function will complete the parent I/O if the 1629 * splitting is done. 1630 */ 1631 _spdk_bdev_io_split_with_payload(parent_io); 1632 } 1633 1634 static void 1635 _spdk_bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 1636 { 1637 assert(_spdk_bdev_io_type_can_split(bdev_io->type)); 1638 1639 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 1640 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 1641 bdev_io->u.bdev.split_outstanding = 0; 1642 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 1643 1644 _spdk_bdev_io_split_with_payload(bdev_io); 1645 } 1646 1647 static void 1648 _spdk_bdev_io_submit(void *ctx) 1649 { 1650 struct spdk_bdev_io *bdev_io = ctx; 1651 struct spdk_bdev *bdev = bdev_io->bdev; 1652 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1653 struct spdk_io_channel *ch = bdev_ch->channel; 1654 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1655 uint64_t tsc; 1656 1657 tsc = spdk_get_ticks(); 1658 bdev_io->internal.submit_tsc = tsc; 1659 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, bdev_io->type); 1660 bdev_ch->io_outstanding++; 1661 shared_resource->io_outstanding++; 1662 bdev_io->internal.in_submit_request = true; 1663 if (spdk_likely(bdev_ch->flags == 0)) { 1664 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 1665 bdev->fn_table->submit_request(ch, bdev_io); 1666 } else { 1667 bdev_ch->io_outstanding--; 1668 shared_resource->io_outstanding--; 1669 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 1670 } 1671 } else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 1672 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 1673 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 1674 bdev_ch->io_outstanding--; 1675 shared_resource->io_outstanding--; 1676 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 1677 _spdk_bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 1678 } else { 1679 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 1680 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 1681 } 1682 bdev_io->internal.in_submit_request = false; 1683 } 1684 1685 static void 1686 spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io) 1687 { 1688 struct spdk_bdev *bdev = bdev_io->bdev; 1689 struct spdk_thread *thread = spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 1690 1691 assert(thread != NULL); 1692 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 1693 1694 if (bdev->split_on_optimal_io_boundary && _spdk_bdev_io_should_split(bdev_io)) { 1695 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1696 spdk_bdev_io_get_buf(bdev_io, _spdk_bdev_io_split, 1697 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 1698 } else { 1699 _spdk_bdev_io_split(NULL, bdev_io); 1700 } 1701 return; 1702 } 1703 1704 if (bdev_io->internal.ch->flags & BDEV_CH_QOS_ENABLED) { 1705 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 1706 _spdk_bdev_io_submit(bdev_io); 1707 } else { 1708 bdev_io->internal.io_submit_ch = bdev_io->internal.ch; 1709 bdev_io->internal.ch = bdev->internal.qos->ch; 1710 spdk_thread_send_msg(bdev->internal.qos->thread, _spdk_bdev_io_submit, bdev_io); 1711 } 1712 } else { 1713 _spdk_bdev_io_submit(bdev_io); 1714 } 1715 } 1716 1717 static void 1718 spdk_bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 1719 { 1720 struct spdk_bdev *bdev = bdev_io->bdev; 1721 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1722 struct spdk_io_channel *ch = bdev_ch->channel; 1723 1724 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 1725 1726 bdev_io->internal.in_submit_request = true; 1727 bdev->fn_table->submit_request(ch, bdev_io); 1728 bdev_io->internal.in_submit_request = false; 1729 } 1730 1731 static void 1732 spdk_bdev_io_init(struct spdk_bdev_io *bdev_io, 1733 struct spdk_bdev *bdev, void *cb_arg, 1734 spdk_bdev_io_completion_cb cb) 1735 { 1736 bdev_io->bdev = bdev; 1737 bdev_io->internal.caller_ctx = cb_arg; 1738 bdev_io->internal.cb = cb; 1739 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1740 bdev_io->internal.in_submit_request = false; 1741 bdev_io->internal.buf = NULL; 1742 bdev_io->internal.io_submit_ch = NULL; 1743 bdev_io->internal.orig_iovs = NULL; 1744 bdev_io->internal.orig_iovcnt = 0; 1745 } 1746 1747 static bool 1748 _spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 1749 { 1750 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 1751 } 1752 1753 bool 1754 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 1755 { 1756 bool supported; 1757 1758 supported = _spdk_bdev_io_type_supported(bdev, io_type); 1759 1760 if (!supported) { 1761 switch (io_type) { 1762 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 1763 /* The bdev layer will emulate write zeroes as long as write is supported. */ 1764 supported = _spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 1765 break; 1766 default: 1767 break; 1768 } 1769 } 1770 1771 return supported; 1772 } 1773 1774 int 1775 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1776 { 1777 if (bdev->fn_table->dump_info_json) { 1778 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 1779 } 1780 1781 return 0; 1782 } 1783 1784 static void 1785 spdk_bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 1786 { 1787 uint32_t max_per_timeslice = 0; 1788 int i; 1789 1790 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1791 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 1792 qos->rate_limits[i].max_per_timeslice = 0; 1793 continue; 1794 } 1795 1796 max_per_timeslice = qos->rate_limits[i].limit * 1797 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 1798 1799 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 1800 qos->rate_limits[i].min_per_timeslice); 1801 1802 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 1803 } 1804 1805 _spdk_bdev_qos_set_ops(qos); 1806 } 1807 1808 static int 1809 spdk_bdev_channel_poll_qos(void *arg) 1810 { 1811 struct spdk_bdev_qos *qos = arg; 1812 uint64_t now = spdk_get_ticks(); 1813 int i; 1814 1815 if (now < (qos->last_timeslice + qos->timeslice_size)) { 1816 /* We received our callback earlier than expected - return 1817 * immediately and wait to do accounting until at least one 1818 * timeslice has actually expired. This should never happen 1819 * with a well-behaved timer implementation. 1820 */ 1821 return 0; 1822 } 1823 1824 /* Reset for next round of rate limiting */ 1825 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1826 /* We may have allowed the IOs or bytes to slightly overrun in the last 1827 * timeslice. remaining_this_timeslice is signed, so if it's negative 1828 * here, we'll account for the overrun so that the next timeslice will 1829 * be appropriately reduced. 1830 */ 1831 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 1832 qos->rate_limits[i].remaining_this_timeslice = 0; 1833 } 1834 } 1835 1836 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 1837 qos->last_timeslice += qos->timeslice_size; 1838 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1839 qos->rate_limits[i].remaining_this_timeslice += 1840 qos->rate_limits[i].max_per_timeslice; 1841 } 1842 } 1843 1844 return _spdk_bdev_qos_io_submit(qos->ch, qos); 1845 } 1846 1847 static void 1848 _spdk_bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 1849 { 1850 struct spdk_bdev_shared_resource *shared_resource; 1851 1852 spdk_put_io_channel(ch->channel); 1853 1854 shared_resource = ch->shared_resource; 1855 1856 assert(ch->io_outstanding == 0); 1857 assert(shared_resource->ref > 0); 1858 shared_resource->ref--; 1859 if (shared_resource->ref == 0) { 1860 assert(shared_resource->io_outstanding == 0); 1861 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 1862 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 1863 free(shared_resource); 1864 } 1865 } 1866 1867 /* Caller must hold bdev->internal.mutex. */ 1868 static void 1869 _spdk_bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 1870 { 1871 struct spdk_bdev_qos *qos = bdev->internal.qos; 1872 int i; 1873 1874 /* Rate limiting on this bdev enabled */ 1875 if (qos) { 1876 if (qos->ch == NULL) { 1877 struct spdk_io_channel *io_ch; 1878 1879 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 1880 bdev->name, spdk_get_thread()); 1881 1882 /* No qos channel has been selected, so set one up */ 1883 1884 /* Take another reference to ch */ 1885 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 1886 assert(io_ch != NULL); 1887 qos->ch = ch; 1888 1889 qos->thread = spdk_io_channel_get_thread(io_ch); 1890 1891 TAILQ_INIT(&qos->queued); 1892 1893 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1894 if (_spdk_bdev_qos_is_iops_rate_limit(i) == true) { 1895 qos->rate_limits[i].min_per_timeslice = 1896 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 1897 } else { 1898 qos->rate_limits[i].min_per_timeslice = 1899 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 1900 } 1901 1902 if (qos->rate_limits[i].limit == 0) { 1903 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 1904 } 1905 } 1906 spdk_bdev_qos_update_max_quota_per_timeslice(qos); 1907 qos->timeslice_size = 1908 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 1909 qos->last_timeslice = spdk_get_ticks(); 1910 qos->poller = spdk_poller_register(spdk_bdev_channel_poll_qos, 1911 qos, 1912 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 1913 } 1914 1915 ch->flags |= BDEV_CH_QOS_ENABLED; 1916 } 1917 } 1918 1919 static int 1920 spdk_bdev_channel_create(void *io_device, void *ctx_buf) 1921 { 1922 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 1923 struct spdk_bdev_channel *ch = ctx_buf; 1924 struct spdk_io_channel *mgmt_io_ch; 1925 struct spdk_bdev_mgmt_channel *mgmt_ch; 1926 struct spdk_bdev_shared_resource *shared_resource; 1927 1928 ch->bdev = bdev; 1929 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 1930 if (!ch->channel) { 1931 return -1; 1932 } 1933 1934 assert(ch->histogram == NULL); 1935 if (bdev->internal.histogram_enabled) { 1936 ch->histogram = spdk_histogram_data_alloc(); 1937 if (ch->histogram == NULL) { 1938 SPDK_ERRLOG("Could not allocate histogram\n"); 1939 } 1940 } 1941 1942 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 1943 if (!mgmt_io_ch) { 1944 spdk_put_io_channel(ch->channel); 1945 return -1; 1946 } 1947 1948 mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); 1949 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 1950 if (shared_resource->shared_ch == ch->channel) { 1951 spdk_put_io_channel(mgmt_io_ch); 1952 shared_resource->ref++; 1953 break; 1954 } 1955 } 1956 1957 if (shared_resource == NULL) { 1958 shared_resource = calloc(1, sizeof(*shared_resource)); 1959 if (shared_resource == NULL) { 1960 spdk_put_io_channel(ch->channel); 1961 spdk_put_io_channel(mgmt_io_ch); 1962 return -1; 1963 } 1964 1965 shared_resource->mgmt_ch = mgmt_ch; 1966 shared_resource->io_outstanding = 0; 1967 TAILQ_INIT(&shared_resource->nomem_io); 1968 shared_resource->nomem_threshold = 0; 1969 shared_resource->shared_ch = ch->channel; 1970 shared_resource->ref = 1; 1971 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 1972 } 1973 1974 memset(&ch->stat, 0, sizeof(ch->stat)); 1975 ch->stat.ticks_rate = spdk_get_ticks_hz(); 1976 ch->io_outstanding = 0; 1977 TAILQ_INIT(&ch->queued_resets); 1978 ch->flags = 0; 1979 ch->shared_resource = shared_resource; 1980 1981 #ifdef SPDK_CONFIG_VTUNE 1982 { 1983 char *name; 1984 __itt_init_ittlib(NULL, 0); 1985 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 1986 if (!name) { 1987 _spdk_bdev_channel_destroy_resource(ch); 1988 return -1; 1989 } 1990 ch->handle = __itt_string_handle_create(name); 1991 free(name); 1992 ch->start_tsc = spdk_get_ticks(); 1993 ch->interval_tsc = spdk_get_ticks_hz() / 100; 1994 memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); 1995 } 1996 #endif 1997 1998 pthread_mutex_lock(&bdev->internal.mutex); 1999 _spdk_bdev_enable_qos(bdev, ch); 2000 pthread_mutex_unlock(&bdev->internal.mutex); 2001 2002 return 0; 2003 } 2004 2005 /* 2006 * Abort I/O that are waiting on a data buffer. These types of I/O are 2007 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. 2008 */ 2009 static void 2010 _spdk_bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 2011 { 2012 bdev_io_stailq_t tmp; 2013 struct spdk_bdev_io *bdev_io; 2014 2015 STAILQ_INIT(&tmp); 2016 2017 while (!STAILQ_EMPTY(queue)) { 2018 bdev_io = STAILQ_FIRST(queue); 2019 STAILQ_REMOVE_HEAD(queue, internal.buf_link); 2020 if (bdev_io->internal.ch == ch) { 2021 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2022 } else { 2023 STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); 2024 } 2025 } 2026 2027 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 2028 } 2029 2030 /* 2031 * Abort I/O that are queued waiting for submission. These types of I/O are 2032 * linked using the spdk_bdev_io link TAILQ_ENTRY. 2033 */ 2034 static void 2035 _spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 2036 { 2037 struct spdk_bdev_io *bdev_io, *tmp; 2038 2039 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 2040 if (bdev_io->internal.ch == ch) { 2041 TAILQ_REMOVE(queue, bdev_io, internal.link); 2042 /* 2043 * spdk_bdev_io_complete() assumes that the completed I/O had 2044 * been submitted to the bdev module. Since in this case it 2045 * hadn't, bump io_outstanding to account for the decrement 2046 * that spdk_bdev_io_complete() will do. 2047 */ 2048 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 2049 ch->io_outstanding++; 2050 ch->shared_resource->io_outstanding++; 2051 } 2052 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2053 } 2054 } 2055 } 2056 2057 static void 2058 spdk_bdev_qos_channel_destroy(void *cb_arg) 2059 { 2060 struct spdk_bdev_qos *qos = cb_arg; 2061 2062 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 2063 spdk_poller_unregister(&qos->poller); 2064 2065 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Free QoS %p.\n", qos); 2066 2067 free(qos); 2068 } 2069 2070 static int 2071 spdk_bdev_qos_destroy(struct spdk_bdev *bdev) 2072 { 2073 int i; 2074 2075 /* 2076 * Cleanly shutting down the QoS poller is tricky, because 2077 * during the asynchronous operation the user could open 2078 * a new descriptor and create a new channel, spawning 2079 * a new QoS poller. 2080 * 2081 * The strategy is to create a new QoS structure here and swap it 2082 * in. The shutdown path then continues to refer to the old one 2083 * until it completes and then releases it. 2084 */ 2085 struct spdk_bdev_qos *new_qos, *old_qos; 2086 2087 old_qos = bdev->internal.qos; 2088 2089 new_qos = calloc(1, sizeof(*new_qos)); 2090 if (!new_qos) { 2091 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 2092 return -ENOMEM; 2093 } 2094 2095 /* Copy the old QoS data into the newly allocated structure */ 2096 memcpy(new_qos, old_qos, sizeof(*new_qos)); 2097 2098 /* Zero out the key parts of the QoS structure */ 2099 new_qos->ch = NULL; 2100 new_qos->thread = NULL; 2101 new_qos->poller = NULL; 2102 TAILQ_INIT(&new_qos->queued); 2103 /* 2104 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 2105 * It will be used later for the new QoS structure. 2106 */ 2107 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2108 new_qos->rate_limits[i].remaining_this_timeslice = 0; 2109 new_qos->rate_limits[i].min_per_timeslice = 0; 2110 new_qos->rate_limits[i].max_per_timeslice = 0; 2111 } 2112 2113 bdev->internal.qos = new_qos; 2114 2115 if (old_qos->thread == NULL) { 2116 free(old_qos); 2117 } else { 2118 spdk_thread_send_msg(old_qos->thread, spdk_bdev_qos_channel_destroy, 2119 old_qos); 2120 } 2121 2122 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 2123 * been destroyed yet. The destruction path will end up waiting for the final 2124 * channel to be put before it releases resources. */ 2125 2126 return 0; 2127 } 2128 2129 static void 2130 _spdk_bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 2131 { 2132 total->bytes_read += add->bytes_read; 2133 total->num_read_ops += add->num_read_ops; 2134 total->bytes_written += add->bytes_written; 2135 total->num_write_ops += add->num_write_ops; 2136 total->bytes_unmapped += add->bytes_unmapped; 2137 total->num_unmap_ops += add->num_unmap_ops; 2138 total->read_latency_ticks += add->read_latency_ticks; 2139 total->write_latency_ticks += add->write_latency_ticks; 2140 total->unmap_latency_ticks += add->unmap_latency_ticks; 2141 } 2142 2143 static void 2144 spdk_bdev_channel_destroy(void *io_device, void *ctx_buf) 2145 { 2146 struct spdk_bdev_channel *ch = ctx_buf; 2147 struct spdk_bdev_mgmt_channel *mgmt_ch; 2148 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 2149 2150 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 2151 spdk_get_thread()); 2152 2153 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 2154 pthread_mutex_lock(&ch->bdev->internal.mutex); 2155 _spdk_bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); 2156 pthread_mutex_unlock(&ch->bdev->internal.mutex); 2157 2158 mgmt_ch = shared_resource->mgmt_ch; 2159 2160 _spdk_bdev_abort_queued_io(&ch->queued_resets, ch); 2161 _spdk_bdev_abort_queued_io(&shared_resource->nomem_io, ch); 2162 _spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_small, ch); 2163 _spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_large, ch); 2164 2165 if (ch->histogram) { 2166 spdk_histogram_data_free(ch->histogram); 2167 } 2168 2169 _spdk_bdev_channel_destroy_resource(ch); 2170 } 2171 2172 int 2173 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 2174 { 2175 struct spdk_bdev_alias *tmp; 2176 2177 if (alias == NULL) { 2178 SPDK_ERRLOG("Empty alias passed\n"); 2179 return -EINVAL; 2180 } 2181 2182 if (spdk_bdev_get_by_name(alias)) { 2183 SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias); 2184 return -EEXIST; 2185 } 2186 2187 tmp = calloc(1, sizeof(*tmp)); 2188 if (tmp == NULL) { 2189 SPDK_ERRLOG("Unable to allocate alias\n"); 2190 return -ENOMEM; 2191 } 2192 2193 tmp->alias = strdup(alias); 2194 if (tmp->alias == NULL) { 2195 free(tmp); 2196 SPDK_ERRLOG("Unable to allocate alias\n"); 2197 return -ENOMEM; 2198 } 2199 2200 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 2201 2202 return 0; 2203 } 2204 2205 int 2206 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 2207 { 2208 struct spdk_bdev_alias *tmp; 2209 2210 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 2211 if (strcmp(alias, tmp->alias) == 0) { 2212 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 2213 free(tmp->alias); 2214 free(tmp); 2215 return 0; 2216 } 2217 } 2218 2219 SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exists\n", alias); 2220 2221 return -ENOENT; 2222 } 2223 2224 void 2225 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 2226 { 2227 struct spdk_bdev_alias *p, *tmp; 2228 2229 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 2230 TAILQ_REMOVE(&bdev->aliases, p, tailq); 2231 free(p->alias); 2232 free(p); 2233 } 2234 } 2235 2236 struct spdk_io_channel * 2237 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 2238 { 2239 return spdk_get_io_channel(__bdev_to_io_dev(desc->bdev)); 2240 } 2241 2242 const char * 2243 spdk_bdev_get_name(const struct spdk_bdev *bdev) 2244 { 2245 return bdev->name; 2246 } 2247 2248 const char * 2249 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 2250 { 2251 return bdev->product_name; 2252 } 2253 2254 const struct spdk_bdev_aliases_list * 2255 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 2256 { 2257 return &bdev->aliases; 2258 } 2259 2260 uint32_t 2261 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 2262 { 2263 return bdev->blocklen; 2264 } 2265 2266 uint64_t 2267 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 2268 { 2269 return bdev->blockcnt; 2270 } 2271 2272 const char * 2273 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 2274 { 2275 return qos_rpc_type[type]; 2276 } 2277 2278 void 2279 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 2280 { 2281 int i; 2282 2283 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2284 2285 pthread_mutex_lock(&bdev->internal.mutex); 2286 if (bdev->internal.qos) { 2287 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2288 if (bdev->internal.qos->rate_limits[i].limit != 2289 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2290 limits[i] = bdev->internal.qos->rate_limits[i].limit; 2291 if (_spdk_bdev_qos_is_iops_rate_limit(i) == false) { 2292 /* Change from Byte to Megabyte which is user visible. */ 2293 limits[i] = limits[i] / 1024 / 1024; 2294 } 2295 } 2296 } 2297 } 2298 pthread_mutex_unlock(&bdev->internal.mutex); 2299 } 2300 2301 size_t 2302 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 2303 { 2304 return 1 << bdev->required_alignment; 2305 } 2306 2307 uint32_t 2308 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 2309 { 2310 return bdev->optimal_io_boundary; 2311 } 2312 2313 bool 2314 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 2315 { 2316 return bdev->write_cache; 2317 } 2318 2319 const struct spdk_uuid * 2320 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 2321 { 2322 return &bdev->uuid; 2323 } 2324 2325 uint64_t 2326 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 2327 { 2328 return bdev->internal.measured_queue_depth; 2329 } 2330 2331 uint64_t 2332 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 2333 { 2334 return bdev->internal.period; 2335 } 2336 2337 uint64_t 2338 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 2339 { 2340 return bdev->internal.weighted_io_time; 2341 } 2342 2343 uint64_t 2344 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 2345 { 2346 return bdev->internal.io_time; 2347 } 2348 2349 static void 2350 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) 2351 { 2352 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 2353 2354 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 2355 2356 if (bdev->internal.measured_queue_depth) { 2357 bdev->internal.io_time += bdev->internal.period; 2358 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 2359 } 2360 } 2361 2362 static void 2363 _calculate_measured_qd(struct spdk_io_channel_iter *i) 2364 { 2365 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 2366 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 2367 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); 2368 2369 bdev->internal.temporary_queue_depth += ch->io_outstanding; 2370 spdk_for_each_channel_continue(i, 0); 2371 } 2372 2373 static int 2374 spdk_bdev_calculate_measured_queue_depth(void *ctx) 2375 { 2376 struct spdk_bdev *bdev = ctx; 2377 bdev->internal.temporary_queue_depth = 0; 2378 spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, 2379 _calculate_measured_qd_cpl); 2380 return 0; 2381 } 2382 2383 void 2384 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 2385 { 2386 bdev->internal.period = period; 2387 2388 if (bdev->internal.qd_poller != NULL) { 2389 spdk_poller_unregister(&bdev->internal.qd_poller); 2390 bdev->internal.measured_queue_depth = UINT64_MAX; 2391 } 2392 2393 if (period != 0) { 2394 bdev->internal.qd_poller = spdk_poller_register(spdk_bdev_calculate_measured_queue_depth, bdev, 2395 period); 2396 } 2397 } 2398 2399 int 2400 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 2401 { 2402 int ret; 2403 2404 pthread_mutex_lock(&bdev->internal.mutex); 2405 2406 /* bdev has open descriptors */ 2407 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 2408 bdev->blockcnt > size) { 2409 ret = -EBUSY; 2410 } else { 2411 bdev->blockcnt = size; 2412 ret = 0; 2413 } 2414 2415 pthread_mutex_unlock(&bdev->internal.mutex); 2416 2417 return ret; 2418 } 2419 2420 /* 2421 * Convert I/O offset and length from bytes to blocks. 2422 * 2423 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 2424 */ 2425 static uint64_t 2426 spdk_bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 2427 uint64_t num_bytes, uint64_t *num_blocks) 2428 { 2429 uint32_t block_size = bdev->blocklen; 2430 uint8_t shift_cnt; 2431 2432 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2433 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 2434 shift_cnt = spdk_u32log2(block_size); 2435 *offset_blocks = offset_bytes >> shift_cnt; 2436 *num_blocks = num_bytes >> shift_cnt; 2437 return (offset_bytes - (*offset_blocks << shift_cnt)) | 2438 (num_bytes - (*num_blocks << shift_cnt)); 2439 } else { 2440 *offset_blocks = offset_bytes / block_size; 2441 *num_blocks = num_bytes / block_size; 2442 return (offset_bytes % block_size) | (num_bytes % block_size); 2443 } 2444 } 2445 2446 static bool 2447 spdk_bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 2448 { 2449 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 2450 * has been an overflow and hence the offset has been wrapped around */ 2451 if (offset_blocks + num_blocks < offset_blocks) { 2452 return false; 2453 } 2454 2455 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 2456 if (offset_blocks + num_blocks > bdev->blockcnt) { 2457 return false; 2458 } 2459 2460 return true; 2461 } 2462 2463 int 2464 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2465 void *buf, uint64_t offset, uint64_t nbytes, 2466 spdk_bdev_io_completion_cb cb, void *cb_arg) 2467 { 2468 uint64_t offset_blocks, num_blocks; 2469 2470 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 2471 return -EINVAL; 2472 } 2473 2474 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 2475 } 2476 2477 int 2478 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2479 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 2480 spdk_bdev_io_completion_cb cb, void *cb_arg) 2481 { 2482 struct spdk_bdev *bdev = desc->bdev; 2483 struct spdk_bdev_io *bdev_io; 2484 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2485 2486 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2487 return -EINVAL; 2488 } 2489 2490 bdev_io = spdk_bdev_get_io(channel); 2491 if (!bdev_io) { 2492 return -ENOMEM; 2493 } 2494 2495 bdev_io->internal.ch = channel; 2496 bdev_io->internal.desc = desc; 2497 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 2498 bdev_io->u.bdev.iovs = &bdev_io->iov; 2499 bdev_io->u.bdev.iovs[0].iov_base = buf; 2500 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 2501 bdev_io->u.bdev.iovcnt = 1; 2502 bdev_io->u.bdev.num_blocks = num_blocks; 2503 bdev_io->u.bdev.offset_blocks = offset_blocks; 2504 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2505 2506 spdk_bdev_io_submit(bdev_io); 2507 return 0; 2508 } 2509 2510 int 2511 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2512 struct iovec *iov, int iovcnt, 2513 uint64_t offset, uint64_t nbytes, 2514 spdk_bdev_io_completion_cb cb, void *cb_arg) 2515 { 2516 uint64_t offset_blocks, num_blocks; 2517 2518 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 2519 return -EINVAL; 2520 } 2521 2522 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 2523 } 2524 2525 int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2526 struct iovec *iov, int iovcnt, 2527 uint64_t offset_blocks, uint64_t num_blocks, 2528 spdk_bdev_io_completion_cb cb, void *cb_arg) 2529 { 2530 struct spdk_bdev *bdev = desc->bdev; 2531 struct spdk_bdev_io *bdev_io; 2532 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2533 2534 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2535 return -EINVAL; 2536 } 2537 2538 bdev_io = spdk_bdev_get_io(channel); 2539 if (!bdev_io) { 2540 return -ENOMEM; 2541 } 2542 2543 bdev_io->internal.ch = channel; 2544 bdev_io->internal.desc = desc; 2545 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 2546 bdev_io->u.bdev.iovs = iov; 2547 bdev_io->u.bdev.iovcnt = iovcnt; 2548 bdev_io->u.bdev.num_blocks = num_blocks; 2549 bdev_io->u.bdev.offset_blocks = offset_blocks; 2550 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2551 2552 spdk_bdev_io_submit(bdev_io); 2553 return 0; 2554 } 2555 2556 int 2557 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2558 void *buf, uint64_t offset, uint64_t nbytes, 2559 spdk_bdev_io_completion_cb cb, void *cb_arg) 2560 { 2561 uint64_t offset_blocks, num_blocks; 2562 2563 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 2564 return -EINVAL; 2565 } 2566 2567 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 2568 } 2569 2570 int 2571 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2572 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 2573 spdk_bdev_io_completion_cb cb, void *cb_arg) 2574 { 2575 struct spdk_bdev *bdev = desc->bdev; 2576 struct spdk_bdev_io *bdev_io; 2577 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2578 2579 if (!desc->write) { 2580 return -EBADF; 2581 } 2582 2583 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2584 return -EINVAL; 2585 } 2586 2587 bdev_io = spdk_bdev_get_io(channel); 2588 if (!bdev_io) { 2589 return -ENOMEM; 2590 } 2591 2592 bdev_io->internal.ch = channel; 2593 bdev_io->internal.desc = desc; 2594 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 2595 bdev_io->u.bdev.iovs = &bdev_io->iov; 2596 bdev_io->u.bdev.iovs[0].iov_base = buf; 2597 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 2598 bdev_io->u.bdev.iovcnt = 1; 2599 bdev_io->u.bdev.num_blocks = num_blocks; 2600 bdev_io->u.bdev.offset_blocks = offset_blocks; 2601 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2602 2603 spdk_bdev_io_submit(bdev_io); 2604 return 0; 2605 } 2606 2607 int 2608 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2609 struct iovec *iov, int iovcnt, 2610 uint64_t offset, uint64_t len, 2611 spdk_bdev_io_completion_cb cb, void *cb_arg) 2612 { 2613 uint64_t offset_blocks, num_blocks; 2614 2615 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) { 2616 return -EINVAL; 2617 } 2618 2619 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 2620 } 2621 2622 int 2623 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2624 struct iovec *iov, int iovcnt, 2625 uint64_t offset_blocks, uint64_t num_blocks, 2626 spdk_bdev_io_completion_cb cb, void *cb_arg) 2627 { 2628 struct spdk_bdev *bdev = desc->bdev; 2629 struct spdk_bdev_io *bdev_io; 2630 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2631 2632 if (!desc->write) { 2633 return -EBADF; 2634 } 2635 2636 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2637 return -EINVAL; 2638 } 2639 2640 bdev_io = spdk_bdev_get_io(channel); 2641 if (!bdev_io) { 2642 return -ENOMEM; 2643 } 2644 2645 bdev_io->internal.ch = channel; 2646 bdev_io->internal.desc = desc; 2647 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 2648 bdev_io->u.bdev.iovs = iov; 2649 bdev_io->u.bdev.iovcnt = iovcnt; 2650 bdev_io->u.bdev.num_blocks = num_blocks; 2651 bdev_io->u.bdev.offset_blocks = offset_blocks; 2652 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2653 2654 spdk_bdev_io_submit(bdev_io); 2655 return 0; 2656 } 2657 2658 int 2659 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2660 uint64_t offset, uint64_t len, 2661 spdk_bdev_io_completion_cb cb, void *cb_arg) 2662 { 2663 uint64_t offset_blocks, num_blocks; 2664 2665 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) { 2666 return -EINVAL; 2667 } 2668 2669 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 2670 } 2671 2672 int 2673 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2674 uint64_t offset_blocks, uint64_t num_blocks, 2675 spdk_bdev_io_completion_cb cb, void *cb_arg) 2676 { 2677 struct spdk_bdev *bdev = desc->bdev; 2678 struct spdk_bdev_io *bdev_io; 2679 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2680 2681 if (!desc->write) { 2682 return -EBADF; 2683 } 2684 2685 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2686 return -EINVAL; 2687 } 2688 2689 bdev_io = spdk_bdev_get_io(channel); 2690 2691 if (!bdev_io) { 2692 return -ENOMEM; 2693 } 2694 2695 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 2696 bdev_io->internal.ch = channel; 2697 bdev_io->internal.desc = desc; 2698 bdev_io->u.bdev.offset_blocks = offset_blocks; 2699 bdev_io->u.bdev.num_blocks = num_blocks; 2700 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2701 2702 if (_spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 2703 spdk_bdev_io_submit(bdev_io); 2704 return 0; 2705 } else if (_spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 2706 assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE); 2707 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 2708 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 2709 _spdk_bdev_write_zero_buffer_next(bdev_io); 2710 return 0; 2711 } else { 2712 spdk_bdev_free_io(bdev_io); 2713 return -ENOTSUP; 2714 } 2715 } 2716 2717 int 2718 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2719 uint64_t offset, uint64_t nbytes, 2720 spdk_bdev_io_completion_cb cb, void *cb_arg) 2721 { 2722 uint64_t offset_blocks, num_blocks; 2723 2724 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 2725 return -EINVAL; 2726 } 2727 2728 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 2729 } 2730 2731 int 2732 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2733 uint64_t offset_blocks, uint64_t num_blocks, 2734 spdk_bdev_io_completion_cb cb, void *cb_arg) 2735 { 2736 struct spdk_bdev *bdev = desc->bdev; 2737 struct spdk_bdev_io *bdev_io; 2738 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2739 2740 if (!desc->write) { 2741 return -EBADF; 2742 } 2743 2744 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2745 return -EINVAL; 2746 } 2747 2748 if (num_blocks == 0) { 2749 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 2750 return -EINVAL; 2751 } 2752 2753 bdev_io = spdk_bdev_get_io(channel); 2754 if (!bdev_io) { 2755 return -ENOMEM; 2756 } 2757 2758 bdev_io->internal.ch = channel; 2759 bdev_io->internal.desc = desc; 2760 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 2761 2762 bdev_io->u.bdev.iovs = &bdev_io->iov; 2763 bdev_io->u.bdev.iovs[0].iov_base = NULL; 2764 bdev_io->u.bdev.iovs[0].iov_len = 0; 2765 bdev_io->u.bdev.iovcnt = 1; 2766 2767 bdev_io->u.bdev.offset_blocks = offset_blocks; 2768 bdev_io->u.bdev.num_blocks = num_blocks; 2769 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2770 2771 spdk_bdev_io_submit(bdev_io); 2772 return 0; 2773 } 2774 2775 int 2776 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2777 uint64_t offset, uint64_t length, 2778 spdk_bdev_io_completion_cb cb, void *cb_arg) 2779 { 2780 uint64_t offset_blocks, num_blocks; 2781 2782 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, length, &num_blocks) != 0) { 2783 return -EINVAL; 2784 } 2785 2786 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 2787 } 2788 2789 int 2790 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2791 uint64_t offset_blocks, uint64_t num_blocks, 2792 spdk_bdev_io_completion_cb cb, void *cb_arg) 2793 { 2794 struct spdk_bdev *bdev = desc->bdev; 2795 struct spdk_bdev_io *bdev_io; 2796 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2797 2798 if (!desc->write) { 2799 return -EBADF; 2800 } 2801 2802 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2803 return -EINVAL; 2804 } 2805 2806 bdev_io = spdk_bdev_get_io(channel); 2807 if (!bdev_io) { 2808 return -ENOMEM; 2809 } 2810 2811 bdev_io->internal.ch = channel; 2812 bdev_io->internal.desc = desc; 2813 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 2814 bdev_io->u.bdev.iovs = NULL; 2815 bdev_io->u.bdev.iovcnt = 0; 2816 bdev_io->u.bdev.offset_blocks = offset_blocks; 2817 bdev_io->u.bdev.num_blocks = num_blocks; 2818 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2819 2820 spdk_bdev_io_submit(bdev_io); 2821 return 0; 2822 } 2823 2824 static void 2825 _spdk_bdev_reset_dev(struct spdk_io_channel_iter *i, int status) 2826 { 2827 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 2828 struct spdk_bdev_io *bdev_io; 2829 2830 bdev_io = TAILQ_FIRST(&ch->queued_resets); 2831 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 2832 spdk_bdev_io_submit_reset(bdev_io); 2833 } 2834 2835 static void 2836 _spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 2837 { 2838 struct spdk_io_channel *ch; 2839 struct spdk_bdev_channel *channel; 2840 struct spdk_bdev_mgmt_channel *mgmt_channel; 2841 struct spdk_bdev_shared_resource *shared_resource; 2842 bdev_io_tailq_t tmp_queued; 2843 2844 TAILQ_INIT(&tmp_queued); 2845 2846 ch = spdk_io_channel_iter_get_channel(i); 2847 channel = spdk_io_channel_get_ctx(ch); 2848 shared_resource = channel->shared_resource; 2849 mgmt_channel = shared_resource->mgmt_ch; 2850 2851 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 2852 2853 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 2854 /* The QoS object is always valid and readable while 2855 * the channel flag is set, so the lock here should not 2856 * be necessary. We're not in the fast path though, so 2857 * just take it anyway. */ 2858 pthread_mutex_lock(&channel->bdev->internal.mutex); 2859 if (channel->bdev->internal.qos->ch == channel) { 2860 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 2861 } 2862 pthread_mutex_unlock(&channel->bdev->internal.mutex); 2863 } 2864 2865 _spdk_bdev_abort_queued_io(&shared_resource->nomem_io, channel); 2866 _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel); 2867 _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel); 2868 _spdk_bdev_abort_queued_io(&tmp_queued, channel); 2869 2870 spdk_for_each_channel_continue(i, 0); 2871 } 2872 2873 static void 2874 _spdk_bdev_start_reset(void *ctx) 2875 { 2876 struct spdk_bdev_channel *ch = ctx; 2877 2878 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), _spdk_bdev_reset_freeze_channel, 2879 ch, _spdk_bdev_reset_dev); 2880 } 2881 2882 static void 2883 _spdk_bdev_channel_start_reset(struct spdk_bdev_channel *ch) 2884 { 2885 struct spdk_bdev *bdev = ch->bdev; 2886 2887 assert(!TAILQ_EMPTY(&ch->queued_resets)); 2888 2889 pthread_mutex_lock(&bdev->internal.mutex); 2890 if (bdev->internal.reset_in_progress == NULL) { 2891 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 2892 /* 2893 * Take a channel reference for the target bdev for the life of this 2894 * reset. This guards against the channel getting destroyed while 2895 * spdk_for_each_channel() calls related to this reset IO are in 2896 * progress. We will release the reference when this reset is 2897 * completed. 2898 */ 2899 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 2900 _spdk_bdev_start_reset(ch); 2901 } 2902 pthread_mutex_unlock(&bdev->internal.mutex); 2903 } 2904 2905 int 2906 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2907 spdk_bdev_io_completion_cb cb, void *cb_arg) 2908 { 2909 struct spdk_bdev *bdev = desc->bdev; 2910 struct spdk_bdev_io *bdev_io; 2911 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2912 2913 bdev_io = spdk_bdev_get_io(channel); 2914 if (!bdev_io) { 2915 return -ENOMEM; 2916 } 2917 2918 bdev_io->internal.ch = channel; 2919 bdev_io->internal.desc = desc; 2920 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 2921 bdev_io->u.reset.ch_ref = NULL; 2922 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2923 2924 pthread_mutex_lock(&bdev->internal.mutex); 2925 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 2926 pthread_mutex_unlock(&bdev->internal.mutex); 2927 2928 _spdk_bdev_channel_start_reset(channel); 2929 2930 return 0; 2931 } 2932 2933 void 2934 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 2935 struct spdk_bdev_io_stat *stat) 2936 { 2937 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2938 2939 *stat = channel->stat; 2940 } 2941 2942 static void 2943 _spdk_bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 2944 { 2945 void *io_device = spdk_io_channel_iter_get_io_device(i); 2946 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 2947 2948 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 2949 bdev_iostat_ctx->cb_arg, 0); 2950 free(bdev_iostat_ctx); 2951 } 2952 2953 static void 2954 _spdk_bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 2955 { 2956 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 2957 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2958 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2959 2960 _spdk_bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); 2961 spdk_for_each_channel_continue(i, 0); 2962 } 2963 2964 void 2965 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 2966 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 2967 { 2968 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 2969 2970 assert(bdev != NULL); 2971 assert(stat != NULL); 2972 assert(cb != NULL); 2973 2974 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 2975 if (bdev_iostat_ctx == NULL) { 2976 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 2977 cb(bdev, stat, cb_arg, -ENOMEM); 2978 return; 2979 } 2980 2981 bdev_iostat_ctx->stat = stat; 2982 bdev_iostat_ctx->cb = cb; 2983 bdev_iostat_ctx->cb_arg = cb_arg; 2984 2985 /* Start with the statistics from previously deleted channels. */ 2986 pthread_mutex_lock(&bdev->internal.mutex); 2987 _spdk_bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); 2988 pthread_mutex_unlock(&bdev->internal.mutex); 2989 2990 /* Then iterate and add the statistics from each existing channel. */ 2991 spdk_for_each_channel(__bdev_to_io_dev(bdev), 2992 _spdk_bdev_get_each_channel_stat, 2993 bdev_iostat_ctx, 2994 _spdk_bdev_get_device_stat_done); 2995 } 2996 2997 int 2998 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2999 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 3000 spdk_bdev_io_completion_cb cb, void *cb_arg) 3001 { 3002 struct spdk_bdev *bdev = desc->bdev; 3003 struct spdk_bdev_io *bdev_io; 3004 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3005 3006 if (!desc->write) { 3007 return -EBADF; 3008 } 3009 3010 bdev_io = spdk_bdev_get_io(channel); 3011 if (!bdev_io) { 3012 return -ENOMEM; 3013 } 3014 3015 bdev_io->internal.ch = channel; 3016 bdev_io->internal.desc = desc; 3017 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 3018 bdev_io->u.nvme_passthru.cmd = *cmd; 3019 bdev_io->u.nvme_passthru.buf = buf; 3020 bdev_io->u.nvme_passthru.nbytes = nbytes; 3021 bdev_io->u.nvme_passthru.md_buf = NULL; 3022 bdev_io->u.nvme_passthru.md_len = 0; 3023 3024 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 3025 3026 spdk_bdev_io_submit(bdev_io); 3027 return 0; 3028 } 3029 3030 int 3031 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3032 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 3033 spdk_bdev_io_completion_cb cb, void *cb_arg) 3034 { 3035 struct spdk_bdev *bdev = desc->bdev; 3036 struct spdk_bdev_io *bdev_io; 3037 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3038 3039 if (!desc->write) { 3040 /* 3041 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 3042 * to easily determine if the command is a read or write, but for now just 3043 * do not allow io_passthru with a read-only descriptor. 3044 */ 3045 return -EBADF; 3046 } 3047 3048 bdev_io = spdk_bdev_get_io(channel); 3049 if (!bdev_io) { 3050 return -ENOMEM; 3051 } 3052 3053 bdev_io->internal.ch = channel; 3054 bdev_io->internal.desc = desc; 3055 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 3056 bdev_io->u.nvme_passthru.cmd = *cmd; 3057 bdev_io->u.nvme_passthru.buf = buf; 3058 bdev_io->u.nvme_passthru.nbytes = nbytes; 3059 bdev_io->u.nvme_passthru.md_buf = NULL; 3060 bdev_io->u.nvme_passthru.md_len = 0; 3061 3062 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 3063 3064 spdk_bdev_io_submit(bdev_io); 3065 return 0; 3066 } 3067 3068 int 3069 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3070 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 3071 spdk_bdev_io_completion_cb cb, void *cb_arg) 3072 { 3073 struct spdk_bdev *bdev = desc->bdev; 3074 struct spdk_bdev_io *bdev_io; 3075 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3076 3077 if (!desc->write) { 3078 /* 3079 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 3080 * to easily determine if the command is a read or write, but for now just 3081 * do not allow io_passthru with a read-only descriptor. 3082 */ 3083 return -EBADF; 3084 } 3085 3086 bdev_io = spdk_bdev_get_io(channel); 3087 if (!bdev_io) { 3088 return -ENOMEM; 3089 } 3090 3091 bdev_io->internal.ch = channel; 3092 bdev_io->internal.desc = desc; 3093 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 3094 bdev_io->u.nvme_passthru.cmd = *cmd; 3095 bdev_io->u.nvme_passthru.buf = buf; 3096 bdev_io->u.nvme_passthru.nbytes = nbytes; 3097 bdev_io->u.nvme_passthru.md_buf = md_buf; 3098 bdev_io->u.nvme_passthru.md_len = md_len; 3099 3100 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 3101 3102 spdk_bdev_io_submit(bdev_io); 3103 return 0; 3104 } 3105 3106 int 3107 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 3108 struct spdk_bdev_io_wait_entry *entry) 3109 { 3110 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3111 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 3112 3113 if (bdev != entry->bdev) { 3114 SPDK_ERRLOG("bdevs do not match\n"); 3115 return -EINVAL; 3116 } 3117 3118 if (mgmt_ch->per_thread_cache_count > 0) { 3119 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 3120 return -EINVAL; 3121 } 3122 3123 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 3124 return 0; 3125 } 3126 3127 static void 3128 _spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 3129 { 3130 struct spdk_bdev *bdev = bdev_ch->bdev; 3131 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 3132 struct spdk_bdev_io *bdev_io; 3133 3134 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 3135 /* 3136 * Allow some more I/O to complete before retrying the nomem_io queue. 3137 * Some drivers (such as nvme) cannot immediately take a new I/O in 3138 * the context of a completion, because the resources for the I/O are 3139 * not released until control returns to the bdev poller. Also, we 3140 * may require several small I/O to complete before a larger I/O 3141 * (that requires splitting) can be submitted. 3142 */ 3143 return; 3144 } 3145 3146 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 3147 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 3148 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 3149 bdev_io->internal.ch->io_outstanding++; 3150 shared_resource->io_outstanding++; 3151 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3152 bdev->fn_table->submit_request(bdev_io->internal.ch->channel, bdev_io); 3153 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 3154 break; 3155 } 3156 } 3157 } 3158 3159 static inline void 3160 _spdk_bdev_io_complete(void *ctx) 3161 { 3162 struct spdk_bdev_io *bdev_io = ctx; 3163 uint64_t tsc, tsc_diff; 3164 3165 if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { 3166 /* 3167 * Send the completion to the thread that originally submitted the I/O, 3168 * which may not be the current thread in the case of QoS. 3169 */ 3170 if (bdev_io->internal.io_submit_ch) { 3171 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 3172 bdev_io->internal.io_submit_ch = NULL; 3173 } 3174 3175 /* 3176 * Defer completion to avoid potential infinite recursion if the 3177 * user's completion callback issues a new I/O. 3178 */ 3179 spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->internal.ch->channel), 3180 _spdk_bdev_io_complete, bdev_io); 3181 return; 3182 } 3183 3184 tsc = spdk_get_ticks(); 3185 tsc_diff = tsc - bdev_io->internal.submit_tsc; 3186 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 0); 3187 3188 if (bdev_io->internal.ch->histogram) { 3189 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 3190 } 3191 3192 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 3193 switch (bdev_io->type) { 3194 case SPDK_BDEV_IO_TYPE_READ: 3195 bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 3196 bdev_io->internal.ch->stat.num_read_ops++; 3197 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 3198 break; 3199 case SPDK_BDEV_IO_TYPE_WRITE: 3200 bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 3201 bdev_io->internal.ch->stat.num_write_ops++; 3202 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 3203 break; 3204 case SPDK_BDEV_IO_TYPE_UNMAP: 3205 bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 3206 bdev_io->internal.ch->stat.num_unmap_ops++; 3207 bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff; 3208 default: 3209 break; 3210 } 3211 } 3212 3213 #ifdef SPDK_CONFIG_VTUNE 3214 uint64_t now_tsc = spdk_get_ticks(); 3215 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 3216 uint64_t data[5]; 3217 3218 data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; 3219 data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; 3220 data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; 3221 data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; 3222 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 3223 bdev_io->bdev->fn_table->get_spin_time(bdev_io->internal.ch->channel) : 0; 3224 3225 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 3226 __itt_metadata_u64, 5, data); 3227 3228 bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; 3229 bdev_io->internal.ch->start_tsc = now_tsc; 3230 } 3231 #endif 3232 3233 assert(bdev_io->internal.cb != NULL); 3234 assert(spdk_get_thread() == spdk_io_channel_get_thread(bdev_io->internal.ch->channel)); 3235 3236 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3237 bdev_io->internal.caller_ctx); 3238 } 3239 3240 static void 3241 _spdk_bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 3242 { 3243 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 3244 3245 if (bdev_io->u.reset.ch_ref != NULL) { 3246 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 3247 bdev_io->u.reset.ch_ref = NULL; 3248 } 3249 3250 _spdk_bdev_io_complete(bdev_io); 3251 } 3252 3253 static void 3254 _spdk_bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 3255 { 3256 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3257 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 3258 3259 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 3260 if (!TAILQ_EMPTY(&ch->queued_resets)) { 3261 _spdk_bdev_channel_start_reset(ch); 3262 } 3263 3264 spdk_for_each_channel_continue(i, 0); 3265 } 3266 3267 void 3268 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 3269 { 3270 struct spdk_bdev *bdev = bdev_io->bdev; 3271 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3272 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 3273 3274 bdev_io->internal.status = status; 3275 3276 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 3277 bool unlock_channels = false; 3278 3279 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 3280 SPDK_ERRLOG("NOMEM returned for reset\n"); 3281 } 3282 pthread_mutex_lock(&bdev->internal.mutex); 3283 if (bdev_io == bdev->internal.reset_in_progress) { 3284 bdev->internal.reset_in_progress = NULL; 3285 unlock_channels = true; 3286 } 3287 pthread_mutex_unlock(&bdev->internal.mutex); 3288 3289 if (unlock_channels) { 3290 spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_unfreeze_channel, 3291 bdev_io, _spdk_bdev_reset_complete); 3292 return; 3293 } 3294 } else { 3295 if (spdk_unlikely(bdev_io->internal.orig_iovcnt > 0)) { 3296 _bdev_io_unset_bounce_buf(bdev_io); 3297 } 3298 3299 assert(bdev_ch->io_outstanding > 0); 3300 assert(shared_resource->io_outstanding > 0); 3301 bdev_ch->io_outstanding--; 3302 shared_resource->io_outstanding--; 3303 3304 if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) { 3305 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 3306 /* 3307 * Wait for some of the outstanding I/O to complete before we 3308 * retry any of the nomem_io. Normally we will wait for 3309 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 3310 * depth channels we will instead wait for half to complete. 3311 */ 3312 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 3313 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 3314 return; 3315 } 3316 3317 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 3318 _spdk_bdev_ch_retry_io(bdev_ch); 3319 } 3320 } 3321 3322 _spdk_bdev_io_complete(bdev_io); 3323 } 3324 3325 void 3326 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 3327 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 3328 { 3329 if (sc == SPDK_SCSI_STATUS_GOOD) { 3330 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3331 } else { 3332 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 3333 bdev_io->internal.error.scsi.sc = sc; 3334 bdev_io->internal.error.scsi.sk = sk; 3335 bdev_io->internal.error.scsi.asc = asc; 3336 bdev_io->internal.error.scsi.ascq = ascq; 3337 } 3338 3339 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 3340 } 3341 3342 void 3343 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 3344 int *sc, int *sk, int *asc, int *ascq) 3345 { 3346 assert(sc != NULL); 3347 assert(sk != NULL); 3348 assert(asc != NULL); 3349 assert(ascq != NULL); 3350 3351 switch (bdev_io->internal.status) { 3352 case SPDK_BDEV_IO_STATUS_SUCCESS: 3353 *sc = SPDK_SCSI_STATUS_GOOD; 3354 *sk = SPDK_SCSI_SENSE_NO_SENSE; 3355 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 3356 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 3357 break; 3358 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 3359 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 3360 break; 3361 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 3362 *sc = bdev_io->internal.error.scsi.sc; 3363 *sk = bdev_io->internal.error.scsi.sk; 3364 *asc = bdev_io->internal.error.scsi.asc; 3365 *ascq = bdev_io->internal.error.scsi.ascq; 3366 break; 3367 default: 3368 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 3369 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 3370 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 3371 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 3372 break; 3373 } 3374 } 3375 3376 void 3377 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc) 3378 { 3379 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 3380 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3381 } else { 3382 bdev_io->internal.error.nvme.sct = sct; 3383 bdev_io->internal.error.nvme.sc = sc; 3384 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 3385 } 3386 3387 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 3388 } 3389 3390 void 3391 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc) 3392 { 3393 assert(sct != NULL); 3394 assert(sc != NULL); 3395 3396 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 3397 *sct = bdev_io->internal.error.nvme.sct; 3398 *sc = bdev_io->internal.error.nvme.sc; 3399 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 3400 *sct = SPDK_NVME_SCT_GENERIC; 3401 *sc = SPDK_NVME_SC_SUCCESS; 3402 } else { 3403 *sct = SPDK_NVME_SCT_GENERIC; 3404 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 3405 } 3406 } 3407 3408 struct spdk_thread * 3409 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 3410 { 3411 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 3412 } 3413 3414 static void 3415 _spdk_bdev_qos_config_limit(struct spdk_bdev *bdev, uint64_t *limits) 3416 { 3417 uint64_t min_qos_set; 3418 int i; 3419 3420 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3421 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3422 break; 3423 } 3424 } 3425 3426 if (i == SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) { 3427 SPDK_ERRLOG("Invalid rate limits set.\n"); 3428 return; 3429 } 3430 3431 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3432 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3433 continue; 3434 } 3435 3436 if (_spdk_bdev_qos_is_iops_rate_limit(i) == true) { 3437 min_qos_set = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 3438 } else { 3439 min_qos_set = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 3440 } 3441 3442 if (limits[i] == 0 || limits[i] % min_qos_set) { 3443 SPDK_ERRLOG("Assigned limit %" PRIu64 " on bdev %s is not multiple of %" PRIu64 "\n", 3444 limits[i], bdev->name, min_qos_set); 3445 SPDK_ERRLOG("Failed to enable QoS on this bdev %s\n", bdev->name); 3446 return; 3447 } 3448 } 3449 3450 if (!bdev->internal.qos) { 3451 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 3452 if (!bdev->internal.qos) { 3453 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 3454 return; 3455 } 3456 } 3457 3458 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3459 bdev->internal.qos->rate_limits[i].limit = limits[i]; 3460 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS type:%d set:%lu\n", 3461 bdev->name, i, limits[i]); 3462 } 3463 3464 return; 3465 } 3466 3467 static void 3468 _spdk_bdev_qos_config(struct spdk_bdev *bdev) 3469 { 3470 struct spdk_conf_section *sp = NULL; 3471 const char *val = NULL; 3472 int i = 0, j = 0; 3473 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {}; 3474 bool config_qos = false; 3475 3476 sp = spdk_conf_find_section(NULL, "QoS"); 3477 if (!sp) { 3478 return; 3479 } 3480 3481 while (j < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) { 3482 limits[j] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3483 3484 i = 0; 3485 while (true) { 3486 val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 0); 3487 if (!val) { 3488 break; 3489 } 3490 3491 if (strcmp(bdev->name, val) != 0) { 3492 i++; 3493 continue; 3494 } 3495 3496 val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 1); 3497 if (val) { 3498 if (_spdk_bdev_qos_is_iops_rate_limit(j) == true) { 3499 limits[j] = strtoull(val, NULL, 10); 3500 } else { 3501 limits[j] = strtoull(val, NULL, 10) * 1024 * 1024; 3502 } 3503 config_qos = true; 3504 } 3505 3506 break; 3507 } 3508 3509 j++; 3510 } 3511 3512 if (config_qos == true) { 3513 _spdk_bdev_qos_config_limit(bdev, limits); 3514 } 3515 3516 return; 3517 } 3518 3519 static int 3520 spdk_bdev_init(struct spdk_bdev *bdev) 3521 { 3522 char *bdev_name; 3523 3524 assert(bdev->module != NULL); 3525 3526 if (!bdev->name) { 3527 SPDK_ERRLOG("Bdev name is NULL\n"); 3528 return -EINVAL; 3529 } 3530 3531 if (spdk_bdev_get_by_name(bdev->name)) { 3532 SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name); 3533 return -EEXIST; 3534 } 3535 3536 /* Users often register their own I/O devices using the bdev name. In 3537 * order to avoid conflicts, prepend bdev_. */ 3538 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 3539 if (!bdev_name) { 3540 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 3541 return -ENOMEM; 3542 } 3543 3544 bdev->internal.status = SPDK_BDEV_STATUS_READY; 3545 bdev->internal.measured_queue_depth = UINT64_MAX; 3546 bdev->internal.claim_module = NULL; 3547 bdev->internal.qd_poller = NULL; 3548 bdev->internal.qos = NULL; 3549 3550 if (spdk_bdev_get_buf_align(bdev) > 1) { 3551 if (bdev->split_on_optimal_io_boundary) { 3552 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 3553 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 3554 } else { 3555 bdev->split_on_optimal_io_boundary = true; 3556 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 3557 } 3558 } 3559 3560 TAILQ_INIT(&bdev->internal.open_descs); 3561 3562 TAILQ_INIT(&bdev->aliases); 3563 3564 bdev->internal.reset_in_progress = NULL; 3565 3566 _spdk_bdev_qos_config(bdev); 3567 3568 spdk_io_device_register(__bdev_to_io_dev(bdev), 3569 spdk_bdev_channel_create, spdk_bdev_channel_destroy, 3570 sizeof(struct spdk_bdev_channel), 3571 bdev_name); 3572 3573 free(bdev_name); 3574 3575 pthread_mutex_init(&bdev->internal.mutex, NULL); 3576 return 0; 3577 } 3578 3579 static void 3580 spdk_bdev_destroy_cb(void *io_device) 3581 { 3582 int rc; 3583 struct spdk_bdev *bdev; 3584 spdk_bdev_unregister_cb cb_fn; 3585 void *cb_arg; 3586 3587 bdev = __bdev_from_io_dev(io_device); 3588 cb_fn = bdev->internal.unregister_cb; 3589 cb_arg = bdev->internal.unregister_ctx; 3590 3591 rc = bdev->fn_table->destruct(bdev->ctxt); 3592 if (rc < 0) { 3593 SPDK_ERRLOG("destruct failed\n"); 3594 } 3595 if (rc <= 0 && cb_fn != NULL) { 3596 cb_fn(cb_arg, rc); 3597 } 3598 } 3599 3600 3601 static void 3602 spdk_bdev_fini(struct spdk_bdev *bdev) 3603 { 3604 pthread_mutex_destroy(&bdev->internal.mutex); 3605 3606 free(bdev->internal.qos); 3607 3608 spdk_io_device_unregister(__bdev_to_io_dev(bdev), spdk_bdev_destroy_cb); 3609 } 3610 3611 static void 3612 spdk_bdev_start(struct spdk_bdev *bdev) 3613 { 3614 struct spdk_bdev_module *module; 3615 uint32_t action; 3616 3617 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name); 3618 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 3619 3620 /* Examine configuration before initializing I/O */ 3621 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 3622 if (module->examine_config) { 3623 action = module->internal.action_in_progress; 3624 module->internal.action_in_progress++; 3625 module->examine_config(bdev); 3626 if (action != module->internal.action_in_progress) { 3627 SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", 3628 module->name); 3629 } 3630 } 3631 } 3632 3633 if (bdev->internal.claim_module) { 3634 return; 3635 } 3636 3637 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 3638 if (module->examine_disk) { 3639 module->internal.action_in_progress++; 3640 module->examine_disk(bdev); 3641 } 3642 } 3643 } 3644 3645 int 3646 spdk_bdev_register(struct spdk_bdev *bdev) 3647 { 3648 int rc = spdk_bdev_init(bdev); 3649 3650 if (rc == 0) { 3651 spdk_bdev_start(bdev); 3652 } 3653 3654 return rc; 3655 } 3656 3657 int 3658 spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count) 3659 { 3660 int rc; 3661 3662 rc = spdk_bdev_init(vbdev); 3663 if (rc) { 3664 return rc; 3665 } 3666 3667 spdk_bdev_start(vbdev); 3668 return 0; 3669 } 3670 3671 void 3672 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 3673 { 3674 if (bdev->internal.unregister_cb != NULL) { 3675 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 3676 } 3677 } 3678 3679 static void 3680 _remove_notify(void *arg) 3681 { 3682 struct spdk_bdev_desc *desc = arg; 3683 3684 desc->remove_scheduled = false; 3685 3686 if (desc->closed) { 3687 free(desc); 3688 } else { 3689 desc->remove_cb(desc->remove_ctx); 3690 } 3691 } 3692 3693 void 3694 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 3695 { 3696 struct spdk_bdev_desc *desc, *tmp; 3697 bool do_destruct = true; 3698 struct spdk_thread *thread; 3699 3700 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name); 3701 3702 thread = spdk_get_thread(); 3703 if (!thread) { 3704 /* The user called this from a non-SPDK thread. */ 3705 if (cb_fn != NULL) { 3706 cb_fn(cb_arg, -ENOTSUP); 3707 } 3708 return; 3709 } 3710 3711 pthread_mutex_lock(&bdev->internal.mutex); 3712 3713 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 3714 bdev->internal.unregister_cb = cb_fn; 3715 bdev->internal.unregister_ctx = cb_arg; 3716 3717 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 3718 if (desc->remove_cb) { 3719 do_destruct = false; 3720 /* 3721 * Defer invocation of the remove_cb to a separate message that will 3722 * run later on its thread. This ensures this context unwinds and 3723 * we don't recursively unregister this bdev again if the remove_cb 3724 * immediately closes its descriptor. 3725 */ 3726 if (!desc->remove_scheduled) { 3727 /* Avoid scheduling removal of the same descriptor multiple times. */ 3728 desc->remove_scheduled = true; 3729 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 3730 } 3731 } 3732 } 3733 3734 if (!do_destruct) { 3735 pthread_mutex_unlock(&bdev->internal.mutex); 3736 return; 3737 } 3738 3739 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 3740 pthread_mutex_unlock(&bdev->internal.mutex); 3741 3742 spdk_bdev_fini(bdev); 3743 } 3744 3745 int 3746 spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb, 3747 void *remove_ctx, struct spdk_bdev_desc **_desc) 3748 { 3749 struct spdk_bdev_desc *desc; 3750 struct spdk_thread *thread; 3751 3752 thread = spdk_get_thread(); 3753 if (!thread) { 3754 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 3755 return -ENOTSUP; 3756 } 3757 3758 desc = calloc(1, sizeof(*desc)); 3759 if (desc == NULL) { 3760 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 3761 return -ENOMEM; 3762 } 3763 3764 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 3765 spdk_get_thread()); 3766 3767 desc->bdev = bdev; 3768 desc->thread = thread; 3769 desc->remove_cb = remove_cb; 3770 desc->remove_ctx = remove_ctx; 3771 desc->write = write; 3772 *_desc = desc; 3773 3774 pthread_mutex_lock(&bdev->internal.mutex); 3775 3776 if (write && bdev->internal.claim_module) { 3777 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 3778 bdev->name, bdev->internal.claim_module->name); 3779 pthread_mutex_unlock(&bdev->internal.mutex); 3780 free(desc); 3781 *_desc = NULL; 3782 return -EPERM; 3783 } 3784 3785 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 3786 3787 pthread_mutex_unlock(&bdev->internal.mutex); 3788 3789 return 0; 3790 } 3791 3792 void 3793 spdk_bdev_close(struct spdk_bdev_desc *desc) 3794 { 3795 struct spdk_bdev *bdev = desc->bdev; 3796 bool do_unregister = false; 3797 3798 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 3799 spdk_get_thread()); 3800 3801 assert(desc->thread == spdk_get_thread()); 3802 3803 pthread_mutex_lock(&bdev->internal.mutex); 3804 3805 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 3806 3807 desc->closed = true; 3808 3809 if (!desc->remove_scheduled) { 3810 free(desc); 3811 } 3812 3813 /* If no more descriptors, kill QoS channel */ 3814 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 3815 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 3816 bdev->name, spdk_get_thread()); 3817 3818 if (spdk_bdev_qos_destroy(bdev)) { 3819 /* There isn't anything we can do to recover here. Just let the 3820 * old QoS poller keep running. The QoS handling won't change 3821 * cores when the user allocates a new channel, but it won't break. */ 3822 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 3823 } 3824 } 3825 3826 spdk_bdev_set_qd_sampling_period(bdev, 0); 3827 3828 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 3829 do_unregister = true; 3830 } 3831 pthread_mutex_unlock(&bdev->internal.mutex); 3832 3833 if (do_unregister == true) { 3834 spdk_bdev_unregister(bdev, bdev->internal.unregister_cb, bdev->internal.unregister_ctx); 3835 } 3836 } 3837 3838 int 3839 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 3840 struct spdk_bdev_module *module) 3841 { 3842 if (bdev->internal.claim_module != NULL) { 3843 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 3844 bdev->internal.claim_module->name); 3845 return -EPERM; 3846 } 3847 3848 if (desc && !desc->write) { 3849 desc->write = true; 3850 } 3851 3852 bdev->internal.claim_module = module; 3853 return 0; 3854 } 3855 3856 void 3857 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 3858 { 3859 assert(bdev->internal.claim_module != NULL); 3860 bdev->internal.claim_module = NULL; 3861 } 3862 3863 struct spdk_bdev * 3864 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 3865 { 3866 return desc->bdev; 3867 } 3868 3869 void 3870 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 3871 { 3872 struct iovec *iovs; 3873 int iovcnt; 3874 3875 if (bdev_io == NULL) { 3876 return; 3877 } 3878 3879 switch (bdev_io->type) { 3880 case SPDK_BDEV_IO_TYPE_READ: 3881 iovs = bdev_io->u.bdev.iovs; 3882 iovcnt = bdev_io->u.bdev.iovcnt; 3883 break; 3884 case SPDK_BDEV_IO_TYPE_WRITE: 3885 iovs = bdev_io->u.bdev.iovs; 3886 iovcnt = bdev_io->u.bdev.iovcnt; 3887 break; 3888 default: 3889 iovs = NULL; 3890 iovcnt = 0; 3891 break; 3892 } 3893 3894 if (iovp) { 3895 *iovp = iovs; 3896 } 3897 if (iovcntp) { 3898 *iovcntp = iovcnt; 3899 } 3900 } 3901 3902 void 3903 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 3904 { 3905 3906 if (spdk_bdev_module_list_find(bdev_module->name)) { 3907 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 3908 assert(false); 3909 } 3910 3911 if (bdev_module->async_init) { 3912 bdev_module->internal.action_in_progress = 1; 3913 } 3914 3915 /* 3916 * Modules with examine callbacks must be initialized first, so they are 3917 * ready to handle examine callbacks from later modules that will 3918 * register physical bdevs. 3919 */ 3920 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 3921 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 3922 } else { 3923 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 3924 } 3925 } 3926 3927 struct spdk_bdev_module * 3928 spdk_bdev_module_list_find(const char *name) 3929 { 3930 struct spdk_bdev_module *bdev_module; 3931 3932 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 3933 if (strcmp(name, bdev_module->name) == 0) { 3934 break; 3935 } 3936 } 3937 3938 return bdev_module; 3939 } 3940 3941 static void 3942 _spdk_bdev_write_zero_buffer_next(void *_bdev_io) 3943 { 3944 struct spdk_bdev_io *bdev_io = _bdev_io; 3945 uint64_t num_bytes, num_blocks; 3946 int rc; 3947 3948 num_bytes = spdk_min(spdk_bdev_get_block_size(bdev_io->bdev) * 3949 bdev_io->u.bdev.split_remaining_num_blocks, 3950 ZERO_BUFFER_SIZE); 3951 num_blocks = num_bytes / spdk_bdev_get_block_size(bdev_io->bdev); 3952 3953 rc = spdk_bdev_write_blocks(bdev_io->internal.desc, 3954 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3955 g_bdev_mgr.zero_buffer, 3956 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 3957 _spdk_bdev_write_zero_buffer_done, bdev_io); 3958 if (rc == 0) { 3959 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 3960 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 3961 } else if (rc == -ENOMEM) { 3962 _spdk_bdev_queue_io_wait_with_cb(bdev_io, _spdk_bdev_write_zero_buffer_next); 3963 } else { 3964 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3965 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3966 } 3967 } 3968 3969 static void 3970 _spdk_bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3971 { 3972 struct spdk_bdev_io *parent_io = cb_arg; 3973 3974 spdk_bdev_free_io(bdev_io); 3975 3976 if (!success) { 3977 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3978 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 3979 return; 3980 } 3981 3982 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3983 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3984 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 3985 return; 3986 } 3987 3988 _spdk_bdev_write_zero_buffer_next(parent_io); 3989 } 3990 3991 struct set_qos_limit_ctx { 3992 void (*cb_fn)(void *cb_arg, int status); 3993 void *cb_arg; 3994 struct spdk_bdev *bdev; 3995 }; 3996 3997 static void 3998 _spdk_bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 3999 { 4000 pthread_mutex_lock(&ctx->bdev->internal.mutex); 4001 ctx->bdev->internal.qos_mod_in_progress = false; 4002 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 4003 4004 ctx->cb_fn(ctx->cb_arg, status); 4005 free(ctx); 4006 } 4007 4008 static void 4009 _spdk_bdev_disable_qos_done(void *cb_arg) 4010 { 4011 struct set_qos_limit_ctx *ctx = cb_arg; 4012 struct spdk_bdev *bdev = ctx->bdev; 4013 struct spdk_bdev_io *bdev_io; 4014 struct spdk_bdev_qos *qos; 4015 4016 pthread_mutex_lock(&bdev->internal.mutex); 4017 qos = bdev->internal.qos; 4018 bdev->internal.qos = NULL; 4019 pthread_mutex_unlock(&bdev->internal.mutex); 4020 4021 while (!TAILQ_EMPTY(&qos->queued)) { 4022 /* Send queued I/O back to their original thread for resubmission. */ 4023 bdev_io = TAILQ_FIRST(&qos->queued); 4024 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 4025 4026 if (bdev_io->internal.io_submit_ch) { 4027 /* 4028 * Channel was changed when sending it to the QoS thread - change it back 4029 * before sending it back to the original thread. 4030 */ 4031 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 4032 bdev_io->internal.io_submit_ch = NULL; 4033 } 4034 4035 spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->internal.ch->channel), 4036 _spdk_bdev_io_submit, bdev_io); 4037 } 4038 4039 if (qos->thread != NULL) { 4040 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4041 spdk_poller_unregister(&qos->poller); 4042 } 4043 4044 free(qos); 4045 4046 _spdk_bdev_set_qos_limit_done(ctx, 0); 4047 } 4048 4049 static void 4050 _spdk_bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) 4051 { 4052 void *io_device = spdk_io_channel_iter_get_io_device(i); 4053 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4054 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4055 struct spdk_thread *thread; 4056 4057 pthread_mutex_lock(&bdev->internal.mutex); 4058 thread = bdev->internal.qos->thread; 4059 pthread_mutex_unlock(&bdev->internal.mutex); 4060 4061 if (thread != NULL) { 4062 spdk_thread_send_msg(thread, _spdk_bdev_disable_qos_done, ctx); 4063 } else { 4064 _spdk_bdev_disable_qos_done(ctx); 4065 } 4066 } 4067 4068 static void 4069 _spdk_bdev_disable_qos_msg(struct spdk_io_channel_iter *i) 4070 { 4071 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 4072 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 4073 4074 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 4075 4076 spdk_for_each_channel_continue(i, 0); 4077 } 4078 4079 static void 4080 _spdk_bdev_update_qos_rate_limit_msg(void *cb_arg) 4081 { 4082 struct set_qos_limit_ctx *ctx = cb_arg; 4083 struct spdk_bdev *bdev = ctx->bdev; 4084 4085 pthread_mutex_lock(&bdev->internal.mutex); 4086 spdk_bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 4087 pthread_mutex_unlock(&bdev->internal.mutex); 4088 4089 _spdk_bdev_set_qos_limit_done(ctx, 0); 4090 } 4091 4092 static void 4093 _spdk_bdev_enable_qos_msg(struct spdk_io_channel_iter *i) 4094 { 4095 void *io_device = spdk_io_channel_iter_get_io_device(i); 4096 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4097 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 4098 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 4099 4100 pthread_mutex_lock(&bdev->internal.mutex); 4101 _spdk_bdev_enable_qos(bdev, bdev_ch); 4102 pthread_mutex_unlock(&bdev->internal.mutex); 4103 spdk_for_each_channel_continue(i, 0); 4104 } 4105 4106 static void 4107 _spdk_bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) 4108 { 4109 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4110 4111 _spdk_bdev_set_qos_limit_done(ctx, status); 4112 } 4113 4114 static void 4115 _spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4116 { 4117 int i; 4118 4119 assert(bdev->internal.qos != NULL); 4120 4121 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4122 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4123 bdev->internal.qos->rate_limits[i].limit = limits[i]; 4124 4125 if (limits[i] == 0) { 4126 bdev->internal.qos->rate_limits[i].limit = 4127 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 4128 } 4129 } 4130 } 4131 } 4132 4133 void 4134 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 4135 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 4136 { 4137 struct set_qos_limit_ctx *ctx; 4138 uint32_t limit_set_complement; 4139 uint64_t min_limit_per_sec; 4140 int i; 4141 bool disable_rate_limit = true; 4142 4143 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4144 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4145 continue; 4146 } 4147 4148 if (limits[i] > 0) { 4149 disable_rate_limit = false; 4150 } 4151 4152 if (_spdk_bdev_qos_is_iops_rate_limit(i) == true) { 4153 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 4154 } else { 4155 /* Change from megabyte to byte rate limit */ 4156 limits[i] = limits[i] * 1024 * 1024; 4157 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 4158 } 4159 4160 limit_set_complement = limits[i] % min_limit_per_sec; 4161 if (limit_set_complement) { 4162 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 4163 limits[i], min_limit_per_sec); 4164 limits[i] += min_limit_per_sec - limit_set_complement; 4165 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 4166 } 4167 } 4168 4169 ctx = calloc(1, sizeof(*ctx)); 4170 if (ctx == NULL) { 4171 cb_fn(cb_arg, -ENOMEM); 4172 return; 4173 } 4174 4175 ctx->cb_fn = cb_fn; 4176 ctx->cb_arg = cb_arg; 4177 ctx->bdev = bdev; 4178 4179 pthread_mutex_lock(&bdev->internal.mutex); 4180 if (bdev->internal.qos_mod_in_progress) { 4181 pthread_mutex_unlock(&bdev->internal.mutex); 4182 free(ctx); 4183 cb_fn(cb_arg, -EAGAIN); 4184 return; 4185 } 4186 bdev->internal.qos_mod_in_progress = true; 4187 4188 if (disable_rate_limit == true && bdev->internal.qos) { 4189 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4190 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 4191 (bdev->internal.qos->rate_limits[i].limit > 0 && 4192 bdev->internal.qos->rate_limits[i].limit != 4193 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 4194 disable_rate_limit = false; 4195 break; 4196 } 4197 } 4198 } 4199 4200 if (disable_rate_limit == false) { 4201 if (bdev->internal.qos == NULL) { 4202 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 4203 if (!bdev->internal.qos) { 4204 pthread_mutex_unlock(&bdev->internal.mutex); 4205 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 4206 free(ctx); 4207 cb_fn(cb_arg, -ENOMEM); 4208 return; 4209 } 4210 } 4211 4212 if (bdev->internal.qos->thread == NULL) { 4213 /* Enabling */ 4214 _spdk_bdev_set_qos_rate_limits(bdev, limits); 4215 4216 spdk_for_each_channel(__bdev_to_io_dev(bdev), 4217 _spdk_bdev_enable_qos_msg, ctx, 4218 _spdk_bdev_enable_qos_done); 4219 } else { 4220 /* Updating */ 4221 _spdk_bdev_set_qos_rate_limits(bdev, limits); 4222 4223 spdk_thread_send_msg(bdev->internal.qos->thread, 4224 _spdk_bdev_update_qos_rate_limit_msg, ctx); 4225 } 4226 } else { 4227 if (bdev->internal.qos != NULL) { 4228 _spdk_bdev_set_qos_rate_limits(bdev, limits); 4229 4230 /* Disabling */ 4231 spdk_for_each_channel(__bdev_to_io_dev(bdev), 4232 _spdk_bdev_disable_qos_msg, ctx, 4233 _spdk_bdev_disable_qos_msg_done); 4234 } else { 4235 pthread_mutex_unlock(&bdev->internal.mutex); 4236 _spdk_bdev_set_qos_limit_done(ctx, 0); 4237 return; 4238 } 4239 } 4240 4241 pthread_mutex_unlock(&bdev->internal.mutex); 4242 } 4243 4244 struct spdk_bdev_histogram_ctx { 4245 spdk_bdev_histogram_status_cb cb_fn; 4246 void *cb_arg; 4247 struct spdk_bdev *bdev; 4248 int status; 4249 }; 4250 4251 static void 4252 _spdk_bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status) 4253 { 4254 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4255 4256 pthread_mutex_lock(&ctx->bdev->internal.mutex); 4257 ctx->bdev->internal.histogram_in_progress = false; 4258 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 4259 ctx->cb_fn(ctx->cb_arg, ctx->status); 4260 free(ctx); 4261 } 4262 4263 static void 4264 _spdk_bdev_histogram_disable_channel(struct spdk_io_channel_iter *i) 4265 { 4266 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4267 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 4268 4269 if (ch->histogram != NULL) { 4270 spdk_histogram_data_free(ch->histogram); 4271 ch->histogram = NULL; 4272 } 4273 spdk_for_each_channel_continue(i, 0); 4274 } 4275 4276 static void 4277 _spdk_bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status) 4278 { 4279 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4280 4281 if (status != 0) { 4282 ctx->status = status; 4283 ctx->bdev->internal.histogram_enabled = false; 4284 spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), _spdk_bdev_histogram_disable_channel, ctx, 4285 _spdk_bdev_histogram_disable_channel_cb); 4286 } else { 4287 pthread_mutex_lock(&ctx->bdev->internal.mutex); 4288 ctx->bdev->internal.histogram_in_progress = false; 4289 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 4290 ctx->cb_fn(ctx->cb_arg, ctx->status); 4291 free(ctx); 4292 } 4293 } 4294 4295 static void 4296 _spdk_bdev_histogram_enable_channel(struct spdk_io_channel_iter *i) 4297 { 4298 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4299 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 4300 int status = 0; 4301 4302 if (ch->histogram == NULL) { 4303 ch->histogram = spdk_histogram_data_alloc(); 4304 if (ch->histogram == NULL) { 4305 status = -ENOMEM; 4306 } 4307 } 4308 4309 spdk_for_each_channel_continue(i, status); 4310 } 4311 4312 void 4313 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 4314 void *cb_arg, bool enable) 4315 { 4316 struct spdk_bdev_histogram_ctx *ctx; 4317 4318 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 4319 if (ctx == NULL) { 4320 cb_fn(cb_arg, -ENOMEM); 4321 return; 4322 } 4323 4324 ctx->bdev = bdev; 4325 ctx->status = 0; 4326 ctx->cb_fn = cb_fn; 4327 ctx->cb_arg = cb_arg; 4328 4329 pthread_mutex_lock(&bdev->internal.mutex); 4330 if (bdev->internal.histogram_in_progress) { 4331 pthread_mutex_unlock(&bdev->internal.mutex); 4332 free(ctx); 4333 cb_fn(cb_arg, -EAGAIN); 4334 return; 4335 } 4336 4337 bdev->internal.histogram_in_progress = true; 4338 pthread_mutex_unlock(&bdev->internal.mutex); 4339 4340 bdev->internal.histogram_enabled = enable; 4341 4342 if (enable) { 4343 /* Allocate histogram for each channel */ 4344 spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_histogram_enable_channel, ctx, 4345 _spdk_bdev_histogram_enable_channel_cb); 4346 } else { 4347 spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_histogram_disable_channel, ctx, 4348 _spdk_bdev_histogram_disable_channel_cb); 4349 } 4350 } 4351 4352 struct spdk_bdev_histogram_data_ctx { 4353 spdk_bdev_histogram_data_cb cb_fn; 4354 void *cb_arg; 4355 struct spdk_bdev *bdev; 4356 /** merged histogram data from all channels */ 4357 struct spdk_histogram_data *histogram; 4358 }; 4359 4360 static void 4361 _spdk_bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status) 4362 { 4363 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4364 4365 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 4366 free(ctx); 4367 } 4368 4369 static void 4370 _spdk_bdev_histogram_get_channel(struct spdk_io_channel_iter *i) 4371 { 4372 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4373 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 4374 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4375 int status = 0; 4376 4377 if (ch->histogram == NULL) { 4378 status = -EFAULT; 4379 } else { 4380 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 4381 } 4382 4383 spdk_for_each_channel_continue(i, status); 4384 } 4385 4386 void 4387 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 4388 spdk_bdev_histogram_data_cb cb_fn, 4389 void *cb_arg) 4390 { 4391 struct spdk_bdev_histogram_data_ctx *ctx; 4392 4393 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 4394 if (ctx == NULL) { 4395 cb_fn(cb_arg, -ENOMEM, NULL); 4396 return; 4397 } 4398 4399 ctx->bdev = bdev; 4400 ctx->cb_fn = cb_fn; 4401 ctx->cb_arg = cb_arg; 4402 4403 ctx->histogram = histogram; 4404 4405 spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_histogram_get_channel, ctx, 4406 _spdk_bdev_histogram_get_channel_cb); 4407 } 4408 4409 SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV) 4410 4411 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 4412 { 4413 spdk_trace_register_owner(OWNER_BDEV, 'b'); 4414 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 4415 spdk_trace_register_description("BDEV_IO_START", "", TRACE_BDEV_IO_START, OWNER_BDEV, 4416 OBJECT_BDEV_IO, 1, 0, "type: "); 4417 spdk_trace_register_description("BDEV_IO_DONE", "", TRACE_BDEV_IO_DONE, OWNER_BDEV, 4418 OBJECT_BDEV_IO, 0, 0, ""); 4419 } 4420