1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "spdk/stdinc.h" 35 36 #include "spdk/bdev.h" 37 #include "spdk/conf.h" 38 39 #include "spdk/config.h" 40 #include "spdk/env.h" 41 #include "spdk/event.h" 42 #include "spdk/thread.h" 43 #include "spdk/likely.h" 44 #include "spdk/queue.h" 45 #include "spdk/nvme_spec.h" 46 #include "spdk/scsi_spec.h" 47 #include "spdk/util.h" 48 #include "spdk/trace.h" 49 50 #include "spdk/bdev_module.h" 51 #include "spdk_internal/log.h" 52 #include "spdk/string.h" 53 54 #ifdef SPDK_CONFIG_VTUNE 55 #include "ittnotify.h" 56 #include "ittnotify_types.h" 57 int __itt_init_ittlib(const char *, __itt_group_id); 58 #endif 59 60 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024) 61 #define SPDK_BDEV_IO_CACHE_SIZE 256 62 #define BUF_SMALL_POOL_SIZE 8192 63 #define BUF_LARGE_POOL_SIZE 1024 64 #define NOMEM_THRESHOLD_COUNT 8 65 #define ZERO_BUFFER_SIZE 0x100000 66 67 #define OWNER_BDEV 0x2 68 69 #define OBJECT_BDEV_IO 0x2 70 71 #define TRACE_GROUP_BDEV 0x3 72 #define TRACE_BDEV_IO_START SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x0) 73 #define TRACE_BDEV_IO_DONE SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x1) 74 75 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 76 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 77 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 78 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 10000 79 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (10 * 1024 * 1024) 80 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 81 82 #define SPDK_BDEV_POOL_ALIGNMENT 512 83 84 static const char *qos_conf_type[] = {"Limit_IOPS", 85 "Limit_BPS", "Limit_Read_BPS", "Limit_Write_BPS" 86 }; 87 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 88 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 89 }; 90 91 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 92 93 struct spdk_bdev_mgr { 94 struct spdk_mempool *bdev_io_pool; 95 96 struct spdk_mempool *buf_small_pool; 97 struct spdk_mempool *buf_large_pool; 98 99 void *zero_buffer; 100 101 
TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 102 103 struct spdk_bdev_list bdevs; 104 105 bool init_complete; 106 bool module_init_complete; 107 108 #ifdef SPDK_CONFIG_VTUNE 109 __itt_domain *domain; 110 #endif 111 }; 112 113 static struct spdk_bdev_mgr g_bdev_mgr = { 114 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 115 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 116 .init_complete = false, 117 .module_init_complete = false, 118 }; 119 120 static struct spdk_bdev_opts g_bdev_opts = { 121 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 122 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 123 }; 124 125 static spdk_bdev_init_cb g_init_cb_fn = NULL; 126 static void *g_init_cb_arg = NULL; 127 128 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 129 static void *g_fini_cb_arg = NULL; 130 static struct spdk_thread *g_fini_thread = NULL; 131 132 struct spdk_bdev_qos_limit { 133 /** IOs or bytes allowed per second (i.e., 1s). */ 134 uint64_t limit; 135 136 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 137 * For remaining bytes, allowed to run negative if an I/O is submitted when 138 * some bytes are remaining, but the I/O is bigger than that amount. The 139 * excess will be deducted from the next timeslice. 140 */ 141 int64_t remaining_this_timeslice; 142 143 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 144 uint32_t min_per_timeslice; 145 146 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 147 uint32_t max_per_timeslice; 148 149 /** Function to check whether to queue the IO. */ 150 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 151 152 /** Function to update for the submitted IO. */ 153 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 154 }; 155 156 struct spdk_bdev_qos { 157 /** Types of structure of rate limits. */ 158 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 159 160 /** The channel that all I/O are funneled through. */ 161 struct spdk_bdev_channel *ch; 162 163 /** The thread on which the poller is running. */ 164 struct spdk_thread *thread; 165 166 /** Queue of I/O waiting to be issued. */ 167 bdev_io_tailq_t queued; 168 169 /** Size of a timeslice in tsc ticks. */ 170 uint64_t timeslice_size; 171 172 /** Timestamp of start of last timeslice. */ 173 uint64_t last_timeslice; 174 175 /** Poller that processes queued I/O commands each time slice. */ 176 struct spdk_poller *poller; 177 }; 178 179 struct spdk_bdev_mgmt_channel { 180 bdev_io_stailq_t need_buf_small; 181 bdev_io_stailq_t need_buf_large; 182 183 /* 184 * Each thread keeps a cache of bdev_io - this allows 185 * bdev threads which are *not* DPDK threads to still 186 * benefit from a per-thread bdev_io cache. Without 187 * this, non-DPDK threads fetching from the mempool 188 * incur a cmpxchg on get and put. 189 */ 190 bdev_io_stailq_t per_thread_cache; 191 uint32_t per_thread_cache_count; 192 uint32_t bdev_io_cache_size; 193 194 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 195 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 196 }; 197 198 /* 199 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 200 * will queue here their IO that awaits retry. It makes it possible to retry sending 201 * IO to one bdev after IO from other bdev completes. 
202 */ 203 struct spdk_bdev_shared_resource { 204 /* The bdev management channel */ 205 struct spdk_bdev_mgmt_channel *mgmt_ch; 206 207 /* 208 * Count of I/O submitted to bdev module and waiting for completion. 209 * Incremented before submit_request() is called on an spdk_bdev_io. 210 */ 211 uint64_t io_outstanding; 212 213 /* 214 * Queue of IO awaiting retry because of a previous NOMEM status returned 215 * on this channel. 216 */ 217 bdev_io_tailq_t nomem_io; 218 219 /* 220 * Threshold which io_outstanding must drop to before retrying nomem_io. 221 */ 222 uint64_t nomem_threshold; 223 224 /* I/O channel allocated by a bdev module */ 225 struct spdk_io_channel *shared_ch; 226 227 /* Refcount of bdev channels using this resource */ 228 uint32_t ref; 229 230 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 231 }; 232 233 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 234 #define BDEV_CH_QOS_ENABLED (1 << 1) 235 236 struct spdk_bdev_channel { 237 struct spdk_bdev *bdev; 238 239 /* The channel for the underlying device */ 240 struct spdk_io_channel *channel; 241 242 /* Per io_device per thread data */ 243 struct spdk_bdev_shared_resource *shared_resource; 244 245 struct spdk_bdev_io_stat stat; 246 247 /* 248 * Count of I/O submitted through this channel and waiting for completion. 249 * Incremented before submit_request() is called on an spdk_bdev_io. 250 */ 251 uint64_t io_outstanding; 252 253 bdev_io_tailq_t queued_resets; 254 255 uint32_t flags; 256 257 struct spdk_histogram_data *histogram; 258 259 #ifdef SPDK_CONFIG_VTUNE 260 uint64_t start_tsc; 261 uint64_t interval_tsc; 262 __itt_string_handle *handle; 263 struct spdk_bdev_io_stat prev_stat; 264 #endif 265 266 }; 267 268 struct spdk_bdev_desc { 269 struct spdk_bdev *bdev; 270 struct spdk_thread *thread; 271 spdk_bdev_remove_cb_t remove_cb; 272 void *remove_ctx; 273 bool remove_scheduled; 274 bool closed; 275 bool write; 276 TAILQ_ENTRY(spdk_bdev_desc) link; 277 }; 278 279 struct spdk_bdev_iostat_ctx { 280 struct spdk_bdev_io_stat *stat; 281 spdk_bdev_get_device_stat_cb cb; 282 void *cb_arg; 283 }; 284 285 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 286 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 287 288 static void _spdk_bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, 289 void *cb_arg); 290 static void _spdk_bdev_write_zero_buffer_next(void *_bdev_io); 291 292 void 293 spdk_bdev_get_opts(struct spdk_bdev_opts *opts) 294 { 295 *opts = g_bdev_opts; 296 } 297 298 int 299 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 300 { 301 uint32_t min_pool_size; 302 303 /* 304 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 305 * initialization. A second mgmt_ch will be created on the same thread when the application starts 306 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 
307 */ 308 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 309 if (opts->bdev_io_pool_size < min_pool_size) { 310 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 311 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 312 spdk_thread_get_count()); 313 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 314 return -1; 315 } 316 317 g_bdev_opts = *opts; 318 return 0; 319 } 320 321 struct spdk_bdev * 322 spdk_bdev_first(void) 323 { 324 struct spdk_bdev *bdev; 325 326 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 327 if (bdev) { 328 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name); 329 } 330 331 return bdev; 332 } 333 334 struct spdk_bdev * 335 spdk_bdev_next(struct spdk_bdev *prev) 336 { 337 struct spdk_bdev *bdev; 338 339 bdev = TAILQ_NEXT(prev, internal.link); 340 if (bdev) { 341 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name); 342 } 343 344 return bdev; 345 } 346 347 static struct spdk_bdev * 348 _bdev_next_leaf(struct spdk_bdev *bdev) 349 { 350 while (bdev != NULL) { 351 if (bdev->internal.claim_module == NULL) { 352 return bdev; 353 } else { 354 bdev = TAILQ_NEXT(bdev, internal.link); 355 } 356 } 357 358 return bdev; 359 } 360 361 struct spdk_bdev * 362 spdk_bdev_first_leaf(void) 363 { 364 struct spdk_bdev *bdev; 365 366 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 367 368 if (bdev) { 369 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name); 370 } 371 372 return bdev; 373 } 374 375 struct spdk_bdev * 376 spdk_bdev_next_leaf(struct spdk_bdev *prev) 377 { 378 struct spdk_bdev *bdev; 379 380 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 381 382 if (bdev) { 383 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name); 384 } 385 386 return bdev; 387 } 388 389 struct spdk_bdev * 390 spdk_bdev_get_by_name(const char *bdev_name) 391 { 392 struct spdk_bdev_alias *tmp; 393 struct spdk_bdev *bdev = spdk_bdev_first(); 394 395 while (bdev != NULL) { 396 if (strcmp(bdev_name, bdev->name) == 0) { 397 return bdev; 398 } 399 400 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 401 if (strcmp(bdev_name, tmp->alias) == 0) { 402 return bdev; 403 } 404 } 405 406 bdev = spdk_bdev_next(bdev); 407 } 408 409 return NULL; 410 } 411 412 void 413 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 414 { 415 struct iovec *iovs; 416 417 iovs = bdev_io->u.bdev.iovs; 418 419 assert(iovs != NULL); 420 assert(bdev_io->u.bdev.iovcnt >= 1); 421 422 iovs[0].iov_base = buf; 423 iovs[0].iov_len = len; 424 } 425 426 static bool 427 _is_buf_allocated(struct iovec *iovs) 428 { 429 return iovs[0].iov_base != NULL; 430 } 431 432 static bool 433 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 434 { 435 int i; 436 uintptr_t iov_base; 437 438 if (spdk_likely(alignment == 1)) { 439 return true; 440 } 441 442 for (i = 0; i < iovcnt; i++) { 443 iov_base = (uintptr_t)iovs[i].iov_base; 444 if ((iov_base & (alignment - 1)) != 0) { 445 return false; 446 } 447 } 448 449 return true; 450 } 451 452 static void 453 _copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt) 454 { 455 int i; 456 size_t len; 457 458 for (i = 0; i < iovcnt; i++) { 459 len = spdk_min(iovs[i].iov_len, buf_len); 460 memcpy(buf, iovs[i].iov_base, len); 461 buf += len; 462 buf_len -= len; 463 } 464 } 465 466 static void 467 _copy_buf_to_iovs(struct iovec *iovs, int iovcnt, 
void *buf, size_t buf_len) 468 { 469 int i; 470 size_t len; 471 472 for (i = 0; i < iovcnt; i++) { 473 len = spdk_min(iovs[i].iov_len, buf_len); 474 memcpy(iovs[i].iov_base, buf, len); 475 buf += len; 476 buf_len -= len; 477 } 478 } 479 480 static void 481 _bdev_io_set_bounce_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 482 { 483 /* save original iovec */ 484 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 485 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 486 /* set bounce iov */ 487 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 488 bdev_io->u.bdev.iovcnt = 1; 489 /* set bounce buffer for this operation */ 490 bdev_io->u.bdev.iovs[0].iov_base = buf; 491 bdev_io->u.bdev.iovs[0].iov_len = len; 492 /* if this is write path, copy data from original buffer to bounce buffer */ 493 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 494 _copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 495 } 496 } 497 498 static void 499 spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 500 { 501 struct spdk_mempool *pool; 502 struct spdk_bdev_io *tmp; 503 void *buf, *aligned_buf; 504 bdev_io_stailq_t *stailq; 505 struct spdk_bdev_mgmt_channel *ch; 506 uint64_t buf_len; 507 uint64_t alignment; 508 bool buf_allocated; 509 510 buf = bdev_io->internal.buf; 511 buf_len = bdev_io->internal.buf_len; 512 alignment = spdk_bdev_get_buf_align(bdev_io->bdev); 513 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 514 515 bdev_io->internal.buf = NULL; 516 517 if (buf_len + alignment <= SPDK_BDEV_SMALL_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT) { 518 pool = g_bdev_mgr.buf_small_pool; 519 stailq = &ch->need_buf_small; 520 } else { 521 pool = g_bdev_mgr.buf_large_pool; 522 stailq = &ch->need_buf_large; 523 } 524 525 if (STAILQ_EMPTY(stailq)) { 526 spdk_mempool_put(pool, buf); 527 } else { 528 tmp = STAILQ_FIRST(stailq); 529 530 alignment = spdk_bdev_get_buf_align(tmp->bdev); 531 buf_allocated = _is_buf_allocated(tmp->u.bdev.iovs); 532 533 aligned_buf = (void *)(((uintptr_t)buf + 534 (alignment - 1)) & ~(alignment - 1)); 535 if (buf_allocated) { 536 _bdev_io_set_bounce_buf(tmp, aligned_buf, tmp->internal.buf_len); 537 } else { 538 spdk_bdev_io_set_buf(tmp, aligned_buf, tmp->internal.buf_len); 539 } 540 541 STAILQ_REMOVE_HEAD(stailq, internal.buf_link); 542 tmp->internal.buf = buf; 543 tmp->internal.get_buf_cb(tmp->internal.ch->channel, tmp); 544 } 545 } 546 547 static void 548 _bdev_io_unset_bounce_buf(struct spdk_bdev_io *bdev_io) 549 { 550 /* if this is read path, copy data from bounce buffer to original buffer */ 551 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 552 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 553 _copy_buf_to_iovs(bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt, 554 bdev_io->internal.bounce_iov.iov_base, bdev_io->internal.bounce_iov.iov_len); 555 } 556 /* set orignal buffer for this io */ 557 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 558 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 559 /* disable bouncing buffer for this io */ 560 bdev_io->internal.orig_iovcnt = 0; 561 bdev_io->internal.orig_iovs = NULL; 562 /* return bounce buffer to the pool */ 563 spdk_bdev_io_put_buf(bdev_io); 564 } 565 566 void 567 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 568 { 569 struct spdk_mempool *pool; 570 bdev_io_stailq_t *stailq; 571 void *buf, *aligned_buf; 572 struct spdk_bdev_mgmt_channel *mgmt_ch; 573 uint64_t alignment; 574 bool buf_allocated; 575 576 
assert(cb != NULL); 577 assert(bdev_io->u.bdev.iovs != NULL); 578 579 alignment = spdk_bdev_get_buf_align(bdev_io->bdev); 580 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 581 582 if (buf_allocated && 583 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 584 /* Buffer already present and aligned */ 585 cb(bdev_io->internal.ch->channel, bdev_io); 586 return; 587 } 588 589 assert(len + alignment <= SPDK_BDEV_LARGE_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT); 590 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 591 592 bdev_io->internal.buf_len = len; 593 bdev_io->internal.get_buf_cb = cb; 594 595 if (len + alignment <= SPDK_BDEV_SMALL_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT) { 596 pool = g_bdev_mgr.buf_small_pool; 597 stailq = &mgmt_ch->need_buf_small; 598 } else { 599 pool = g_bdev_mgr.buf_large_pool; 600 stailq = &mgmt_ch->need_buf_large; 601 } 602 603 buf = spdk_mempool_get(pool); 604 605 if (!buf) { 606 STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link); 607 } else { 608 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 609 610 if (buf_allocated) { 611 _bdev_io_set_bounce_buf(bdev_io, aligned_buf, len); 612 } else { 613 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 614 } 615 bdev_io->internal.buf = buf; 616 bdev_io->internal.get_buf_cb(bdev_io->internal.ch->channel, bdev_io); 617 } 618 } 619 620 static int 621 spdk_bdev_module_get_max_ctx_size(void) 622 { 623 struct spdk_bdev_module *bdev_module; 624 int max_bdev_module_size = 0; 625 626 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 627 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 628 max_bdev_module_size = bdev_module->get_ctx_size(); 629 } 630 } 631 632 return max_bdev_module_size; 633 } 634 635 void 636 spdk_bdev_config_text(FILE *fp) 637 { 638 struct spdk_bdev_module *bdev_module; 639 640 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 641 if (bdev_module->config_text) { 642 bdev_module->config_text(fp); 643 } 644 } 645 } 646 647 static void 648 spdk_bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 649 { 650 int i; 651 struct spdk_bdev_qos *qos = bdev->internal.qos; 652 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 653 654 if (!qos) { 655 return; 656 } 657 658 spdk_bdev_get_qos_rate_limits(bdev, limits); 659 660 spdk_json_write_object_begin(w); 661 spdk_json_write_named_string(w, "method", "set_bdev_qos_limit"); 662 663 spdk_json_write_named_object_begin(w, "params"); 664 spdk_json_write_named_string(w, "name", bdev->name); 665 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 666 if (limits[i] > 0) { 667 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 668 } 669 } 670 spdk_json_write_object_end(w); 671 672 spdk_json_write_object_end(w); 673 } 674 675 void 676 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 677 { 678 struct spdk_bdev_module *bdev_module; 679 struct spdk_bdev *bdev; 680 681 assert(w != NULL); 682 683 spdk_json_write_array_begin(w); 684 685 spdk_json_write_object_begin(w); 686 spdk_json_write_named_string(w, "method", "set_bdev_options"); 687 spdk_json_write_named_object_begin(w, "params"); 688 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 689 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 690 spdk_json_write_object_end(w); 691 spdk_json_write_object_end(w); 692 693 TAILQ_FOREACH(bdev_module, 
&g_bdev_mgr.bdev_modules, internal.tailq) { 694 if (bdev_module->config_json) { 695 bdev_module->config_json(w); 696 } 697 } 698 699 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 700 spdk_bdev_qos_config_json(bdev, w); 701 702 if (bdev->fn_table->write_config_json) { 703 bdev->fn_table->write_config_json(bdev, w); 704 } 705 } 706 707 spdk_json_write_array_end(w); 708 } 709 710 static int 711 spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 712 { 713 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 714 struct spdk_bdev_io *bdev_io; 715 uint32_t i; 716 717 STAILQ_INIT(&ch->need_buf_small); 718 STAILQ_INIT(&ch->need_buf_large); 719 720 STAILQ_INIT(&ch->per_thread_cache); 721 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 722 723 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 724 ch->per_thread_cache_count = 0; 725 for (i = 0; i < ch->bdev_io_cache_size; i++) { 726 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 727 assert(bdev_io != NULL); 728 ch->per_thread_cache_count++; 729 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 730 } 731 732 TAILQ_INIT(&ch->shared_resources); 733 TAILQ_INIT(&ch->io_wait_queue); 734 735 return 0; 736 } 737 738 static void 739 spdk_bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 740 { 741 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 742 struct spdk_bdev_io *bdev_io; 743 744 if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) { 745 SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n"); 746 } 747 748 if (!TAILQ_EMPTY(&ch->shared_resources)) { 749 SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n"); 750 } 751 752 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 753 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 754 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 755 ch->per_thread_cache_count--; 756 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 757 } 758 759 assert(ch->per_thread_cache_count == 0); 760 } 761 762 static void 763 spdk_bdev_init_complete(int rc) 764 { 765 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 766 void *cb_arg = g_init_cb_arg; 767 struct spdk_bdev_module *m; 768 769 g_bdev_mgr.init_complete = true; 770 g_init_cb_fn = NULL; 771 g_init_cb_arg = NULL; 772 773 /* 774 * For modules that need to know when subsystem init is complete, 775 * inform them now. 776 */ 777 if (rc == 0) { 778 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 779 if (m->init_complete) { 780 m->init_complete(); 781 } 782 } 783 } 784 785 cb_fn(cb_arg, rc); 786 } 787 788 static void 789 spdk_bdev_module_action_complete(void) 790 { 791 struct spdk_bdev_module *m; 792 793 /* 794 * Don't finish bdev subsystem initialization if 795 * module pre-initialization is still in progress, or 796 * the subsystem been already initialized. 797 */ 798 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 799 return; 800 } 801 802 /* 803 * Check all bdev modules for inits/examinations in progress. If any 804 * exist, return immediately since we cannot finish bdev subsystem 805 * initialization until all are completed. 806 */ 807 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 808 if (m->internal.action_in_progress > 0) { 809 return; 810 } 811 } 812 813 /* 814 * Modules already finished initialization - now that all 815 * the bdev modules have finished their asynchronous I/O 816 * processing, the entire bdev layer can be marked as complete. 
817 */ 818 spdk_bdev_init_complete(0); 819 } 820 821 static void 822 spdk_bdev_module_action_done(struct spdk_bdev_module *module) 823 { 824 assert(module->internal.action_in_progress > 0); 825 module->internal.action_in_progress--; 826 spdk_bdev_module_action_complete(); 827 } 828 829 void 830 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 831 { 832 spdk_bdev_module_action_done(module); 833 } 834 835 void 836 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 837 { 838 spdk_bdev_module_action_done(module); 839 } 840 841 /** The last initialized bdev module */ 842 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 843 844 static int 845 spdk_bdev_modules_init(void) 846 { 847 struct spdk_bdev_module *module; 848 int rc = 0; 849 850 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 851 g_resume_bdev_module = module; 852 rc = module->module_init(); 853 if (rc != 0) { 854 return rc; 855 } 856 } 857 858 g_resume_bdev_module = NULL; 859 return 0; 860 } 861 862 863 static void 864 spdk_bdev_init_failed_complete(void *cb_arg) 865 { 866 spdk_bdev_init_complete(-1); 867 } 868 869 static void 870 spdk_bdev_init_failed(void *cb_arg) 871 { 872 spdk_bdev_finish(spdk_bdev_init_failed_complete, NULL); 873 } 874 875 void 876 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 877 { 878 struct spdk_conf_section *sp; 879 struct spdk_bdev_opts bdev_opts; 880 int32_t bdev_io_pool_size, bdev_io_cache_size; 881 int cache_size; 882 int rc = 0; 883 char mempool_name[32]; 884 885 assert(cb_fn != NULL); 886 887 sp = spdk_conf_find_section(NULL, "Bdev"); 888 if (sp != NULL) { 889 spdk_bdev_get_opts(&bdev_opts); 890 891 bdev_io_pool_size = spdk_conf_section_get_intval(sp, "BdevIoPoolSize"); 892 if (bdev_io_pool_size >= 0) { 893 bdev_opts.bdev_io_pool_size = bdev_io_pool_size; 894 } 895 896 bdev_io_cache_size = spdk_conf_section_get_intval(sp, "BdevIoCacheSize"); 897 if (bdev_io_cache_size >= 0) { 898 bdev_opts.bdev_io_cache_size = bdev_io_cache_size; 899 } 900 901 if (spdk_bdev_set_opts(&bdev_opts)) { 902 spdk_bdev_init_complete(-1); 903 return; 904 } 905 906 assert(memcmp(&bdev_opts, &g_bdev_opts, sizeof(bdev_opts)) == 0); 907 } 908 909 g_init_cb_fn = cb_fn; 910 g_init_cb_arg = cb_arg; 911 912 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 913 914 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 915 g_bdev_opts.bdev_io_pool_size, 916 sizeof(struct spdk_bdev_io) + 917 spdk_bdev_module_get_max_ctx_size(), 918 0, 919 SPDK_ENV_SOCKET_ID_ANY); 920 921 if (g_bdev_mgr.bdev_io_pool == NULL) { 922 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 923 spdk_bdev_init_complete(-1); 924 return; 925 } 926 927 /** 928 * Ensure no more than half of the total buffers end up local caches, by 929 * using spdk_thread_get_count() to determine how many local caches we need 930 * to account for. 
931 */ 932 cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_thread_get_count()); 933 snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid()); 934 935 g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name, 936 BUF_SMALL_POOL_SIZE, 937 SPDK_BDEV_SMALL_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT, 938 cache_size, 939 SPDK_ENV_SOCKET_ID_ANY); 940 if (!g_bdev_mgr.buf_small_pool) { 941 SPDK_ERRLOG("create rbuf small pool failed\n"); 942 spdk_bdev_init_complete(-1); 943 return; 944 } 945 946 cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_thread_get_count()); 947 snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid()); 948 949 g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name, 950 BUF_LARGE_POOL_SIZE, 951 SPDK_BDEV_LARGE_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT, 952 cache_size, 953 SPDK_ENV_SOCKET_ID_ANY); 954 if (!g_bdev_mgr.buf_large_pool) { 955 SPDK_ERRLOG("create rbuf large pool failed\n"); 956 spdk_bdev_init_complete(-1); 957 return; 958 } 959 960 g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 961 NULL); 962 if (!g_bdev_mgr.zero_buffer) { 963 SPDK_ERRLOG("create bdev zero buffer failed\n"); 964 spdk_bdev_init_complete(-1); 965 return; 966 } 967 968 #ifdef SPDK_CONFIG_VTUNE 969 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 970 #endif 971 972 spdk_io_device_register(&g_bdev_mgr, spdk_bdev_mgmt_channel_create, 973 spdk_bdev_mgmt_channel_destroy, 974 sizeof(struct spdk_bdev_mgmt_channel), 975 "bdev_mgr"); 976 977 rc = spdk_bdev_modules_init(); 978 g_bdev_mgr.module_init_complete = true; 979 if (rc != 0) { 980 SPDK_ERRLOG("bdev modules init failed\n"); 981 spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_init_failed, NULL); 982 return; 983 } 984 985 spdk_bdev_module_action_complete(); 986 } 987 988 static void 989 spdk_bdev_mgr_unregister_cb(void *io_device) 990 { 991 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 992 993 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 994 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 995 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 996 g_bdev_opts.bdev_io_pool_size); 997 } 998 999 if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) { 1000 SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n", 1001 spdk_mempool_count(g_bdev_mgr.buf_small_pool), 1002 BUF_SMALL_POOL_SIZE); 1003 assert(false); 1004 } 1005 1006 if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) { 1007 SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n", 1008 spdk_mempool_count(g_bdev_mgr.buf_large_pool), 1009 BUF_LARGE_POOL_SIZE); 1010 assert(false); 1011 } 1012 1013 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1014 spdk_mempool_free(g_bdev_mgr.buf_small_pool); 1015 spdk_mempool_free(g_bdev_mgr.buf_large_pool); 1016 spdk_dma_free(g_bdev_mgr.zero_buffer); 1017 1018 cb_fn(g_fini_cb_arg); 1019 g_fini_cb_fn = NULL; 1020 g_fini_cb_arg = NULL; 1021 g_bdev_mgr.init_complete = false; 1022 g_bdev_mgr.module_init_complete = false; 1023 } 1024 1025 static void 1026 spdk_bdev_module_finish_iter(void *arg) 1027 { 1028 struct spdk_bdev_module *bdev_module; 1029 1030 /* Start iterating from the last touched module */ 1031 if (!g_resume_bdev_module) { 1032 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1033 } else { 1034 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 1035 internal.tailq); 1036 } 1037 1038 while (bdev_module) { 1039 if (bdev_module->async_fini) { 1040 /* Save our place 
so we can resume later. We must 1041 * save the variable here, before calling module_fini() 1042 * below, because in some cases the module may immediately 1043 * call spdk_bdev_module_finish_done() and re-enter 1044 * this function to continue iterating. */ 1045 g_resume_bdev_module = bdev_module; 1046 } 1047 1048 if (bdev_module->module_fini) { 1049 bdev_module->module_fini(); 1050 } 1051 1052 if (bdev_module->async_fini) { 1053 return; 1054 } 1055 1056 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 1057 internal.tailq); 1058 } 1059 1060 g_resume_bdev_module = NULL; 1061 spdk_io_device_unregister(&g_bdev_mgr, spdk_bdev_mgr_unregister_cb); 1062 } 1063 1064 void 1065 spdk_bdev_module_finish_done(void) 1066 { 1067 if (spdk_get_thread() != g_fini_thread) { 1068 spdk_thread_send_msg(g_fini_thread, spdk_bdev_module_finish_iter, NULL); 1069 } else { 1070 spdk_bdev_module_finish_iter(NULL); 1071 } 1072 } 1073 1074 static void 1075 _spdk_bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 1076 { 1077 struct spdk_bdev *bdev = cb_arg; 1078 1079 if (bdeverrno && bdev) { 1080 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 1081 bdev->name); 1082 1083 /* 1084 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 1085 * bdev; try to continue by manually removing this bdev from the list and continue 1086 * with the next bdev in the list. 1087 */ 1088 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 1089 } 1090 1091 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 1092 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n"); 1093 /* 1094 * Bdev module finish need to be deferred as we might be in the middle of some context 1095 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 1096 * after returning. 1097 */ 1098 spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_module_finish_iter, NULL); 1099 return; 1100 } 1101 1102 /* 1103 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 1104 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 1105 * to detect clean shutdown as opposed to run-time hot removal of the underlying 1106 * base bdevs. 1107 * 1108 * Also, walk the list in the reverse order. 1109 */ 1110 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1111 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1112 if (bdev->internal.claim_module != NULL) { 1113 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Skipping claimed bdev '%s'(<-'%s').\n", 1114 bdev->name, bdev->internal.claim_module->name); 1115 continue; 1116 } 1117 1118 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name); 1119 spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev); 1120 return; 1121 } 1122 1123 /* 1124 * If any bdev fails to unclaim underlying bdev properly, we may face the 1125 * case of bdev list consisting of claimed bdevs only (if claims are managed 1126 * correctly, this would mean there's a loop in the claims graph which is 1127 * clearly impossible). Warn and unregister last bdev on the list then. 
1128 */ 1129 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1130 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1131 SPDK_ERRLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 1132 spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev); 1133 return; 1134 } 1135 } 1136 1137 void 1138 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1139 { 1140 struct spdk_bdev_module *m; 1141 1142 assert(cb_fn != NULL); 1143 1144 g_fini_thread = spdk_get_thread(); 1145 1146 g_fini_cb_fn = cb_fn; 1147 g_fini_cb_arg = cb_arg; 1148 1149 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1150 if (m->fini_start) { 1151 m->fini_start(); 1152 } 1153 } 1154 1155 _spdk_bdev_finish_unregister_bdevs_iter(NULL, 0); 1156 } 1157 1158 static struct spdk_bdev_io * 1159 spdk_bdev_get_io(struct spdk_bdev_channel *channel) 1160 { 1161 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 1162 struct spdk_bdev_io *bdev_io; 1163 1164 if (ch->per_thread_cache_count > 0) { 1165 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1166 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1167 ch->per_thread_cache_count--; 1168 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 1169 /* 1170 * Don't try to look for bdev_ios in the global pool if there are 1171 * waiters on bdev_ios - we don't want this caller to jump the line. 1172 */ 1173 bdev_io = NULL; 1174 } else { 1175 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1176 } 1177 1178 return bdev_io; 1179 } 1180 1181 void 1182 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 1183 { 1184 struct spdk_bdev_mgmt_channel *ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1185 1186 assert(bdev_io != NULL); 1187 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 1188 1189 if (bdev_io->internal.buf != NULL) { 1190 spdk_bdev_io_put_buf(bdev_io); 1191 } 1192 1193 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 1194 ch->per_thread_cache_count++; 1195 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1196 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 1197 struct spdk_bdev_io_wait_entry *entry; 1198 1199 entry = TAILQ_FIRST(&ch->io_wait_queue); 1200 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 1201 entry->cb_fn(entry->cb_arg); 1202 } 1203 } else { 1204 /* We should never have a full cache with entries on the io wait queue. 
*/ 1205 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 1206 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1207 } 1208 } 1209 1210 static bool 1211 _spdk_bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 1212 { 1213 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 1214 1215 switch (limit) { 1216 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1217 return true; 1218 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1219 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1220 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1221 return false; 1222 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 1223 default: 1224 return false; 1225 } 1226 } 1227 1228 static bool 1229 _spdk_bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 1230 { 1231 switch (bdev_io->type) { 1232 case SPDK_BDEV_IO_TYPE_NVME_IO: 1233 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1234 case SPDK_BDEV_IO_TYPE_READ: 1235 case SPDK_BDEV_IO_TYPE_WRITE: 1236 return true; 1237 default: 1238 return false; 1239 } 1240 } 1241 1242 static bool 1243 _spdk_bdev_is_read_io(struct spdk_bdev_io *bdev_io) 1244 { 1245 switch (bdev_io->type) { 1246 case SPDK_BDEV_IO_TYPE_NVME_IO: 1247 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1248 /* Bit 1 (0x2) set for read operation */ 1249 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 1250 return true; 1251 } else { 1252 return false; 1253 } 1254 case SPDK_BDEV_IO_TYPE_READ: 1255 return true; 1256 default: 1257 return false; 1258 } 1259 } 1260 1261 static uint64_t 1262 _spdk_bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 1263 { 1264 struct spdk_bdev *bdev = bdev_io->bdev; 1265 1266 switch (bdev_io->type) { 1267 case SPDK_BDEV_IO_TYPE_NVME_IO: 1268 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1269 return bdev_io->u.nvme_passthru.nbytes; 1270 case SPDK_BDEV_IO_TYPE_READ: 1271 case SPDK_BDEV_IO_TYPE_WRITE: 1272 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 1273 default: 1274 return 0; 1275 } 1276 } 1277 1278 static bool 1279 _spdk_bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1280 { 1281 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 1282 return true; 1283 } else { 1284 return false; 1285 } 1286 } 1287 1288 static bool 1289 _spdk_bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1290 { 1291 if (_spdk_bdev_is_read_io(io) == false) { 1292 return false; 1293 } 1294 1295 return _spdk_bdev_qos_rw_queue_io(limit, io); 1296 } 1297 1298 static bool 1299 _spdk_bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1300 { 1301 if (_spdk_bdev_is_read_io(io) == true) { 1302 return false; 1303 } 1304 1305 return _spdk_bdev_qos_rw_queue_io(limit, io); 1306 } 1307 1308 static void 1309 _spdk_bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1310 { 1311 limit->remaining_this_timeslice--; 1312 } 1313 1314 static void 1315 _spdk_bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1316 { 1317 limit->remaining_this_timeslice -= _spdk_bdev_get_io_size_in_byte(io); 1318 } 1319 1320 static void 1321 _spdk_bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1322 { 1323 if (_spdk_bdev_is_read_io(io) == false) { 1324 return; 1325 } 1326 1327 return _spdk_bdev_qos_rw_bps_update_quota(limit, io); 1328 } 1329 1330 static void 1331 _spdk_bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1332 { 1333 if (_spdk_bdev_is_read_io(io) == true) { 1334 return; 1335 } 1336 1337 return 
_spdk_bdev_qos_rw_bps_update_quota(limit, io); 1338 } 1339 1340 static void 1341 _spdk_bdev_qos_set_ops(struct spdk_bdev_qos *qos) 1342 { 1343 int i; 1344 1345 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1346 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 1347 qos->rate_limits[i].queue_io = NULL; 1348 qos->rate_limits[i].update_quota = NULL; 1349 continue; 1350 } 1351 1352 switch (i) { 1353 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1354 qos->rate_limits[i].queue_io = _spdk_bdev_qos_rw_queue_io; 1355 qos->rate_limits[i].update_quota = _spdk_bdev_qos_rw_iops_update_quota; 1356 break; 1357 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1358 qos->rate_limits[i].queue_io = _spdk_bdev_qos_rw_queue_io; 1359 qos->rate_limits[i].update_quota = _spdk_bdev_qos_rw_bps_update_quota; 1360 break; 1361 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1362 qos->rate_limits[i].queue_io = _spdk_bdev_qos_r_queue_io; 1363 qos->rate_limits[i].update_quota = _spdk_bdev_qos_r_bps_update_quota; 1364 break; 1365 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1366 qos->rate_limits[i].queue_io = _spdk_bdev_qos_w_queue_io; 1367 qos->rate_limits[i].update_quota = _spdk_bdev_qos_w_bps_update_quota; 1368 break; 1369 default: 1370 break; 1371 } 1372 } 1373 } 1374 1375 static int 1376 _spdk_bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 1377 { 1378 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 1379 struct spdk_bdev *bdev = ch->bdev; 1380 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 1381 int i, submitted_ios = 0; 1382 1383 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 1384 if (_spdk_bdev_qos_io_to_limit(bdev_io) == true) { 1385 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1386 if (!qos->rate_limits[i].queue_io) { 1387 continue; 1388 } 1389 1390 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 1391 bdev_io) == true) { 1392 return submitted_ios; 1393 } 1394 } 1395 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1396 if (!qos->rate_limits[i].update_quota) { 1397 continue; 1398 } 1399 1400 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 1401 } 1402 } 1403 1404 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 1405 ch->io_outstanding++; 1406 shared_resource->io_outstanding++; 1407 bdev_io->internal.in_submit_request = true; 1408 bdev->fn_table->submit_request(ch->channel, bdev_io); 1409 bdev_io->internal.in_submit_request = false; 1410 submitted_ios++; 1411 } 1412 1413 return submitted_ios; 1414 } 1415 1416 static void 1417 _spdk_bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 1418 { 1419 int rc; 1420 1421 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 1422 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 1423 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 1424 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 1425 &bdev_io->internal.waitq_entry); 1426 if (rc != 0) { 1427 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 1428 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1429 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 1430 } 1431 } 1432 1433 static bool 1434 _spdk_bdev_io_type_can_split(uint8_t type) 1435 { 1436 assert(type != SPDK_BDEV_IO_TYPE_INVALID); 1437 assert(type < SPDK_BDEV_NUM_IO_TYPES); 1438 1439 /* Only split READ and WRITE I/O. 
Theoretically other types of I/O like 1440 * UNMAP could be split, but these types of I/O are typically much larger 1441 * in size (sometimes the size of the entire block device), and the bdev 1442 * module can more efficiently split these types of I/O. Plus those types 1443 * of I/O do not have a payload, which makes the splitting process simpler. 1444 */ 1445 if (type == SPDK_BDEV_IO_TYPE_READ || type == SPDK_BDEV_IO_TYPE_WRITE) { 1446 return true; 1447 } else { 1448 return false; 1449 } 1450 } 1451 1452 static bool 1453 _spdk_bdev_io_should_split(struct spdk_bdev_io *bdev_io) 1454 { 1455 uint64_t start_stripe, end_stripe; 1456 uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary; 1457 1458 if (io_boundary == 0) { 1459 return false; 1460 } 1461 1462 if (!_spdk_bdev_io_type_can_split(bdev_io->type)) { 1463 return false; 1464 } 1465 1466 start_stripe = bdev_io->u.bdev.offset_blocks; 1467 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 1468 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 1469 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 1470 start_stripe >>= spdk_u32log2(io_boundary); 1471 end_stripe >>= spdk_u32log2(io_boundary); 1472 } else { 1473 start_stripe /= io_boundary; 1474 end_stripe /= io_boundary; 1475 } 1476 return (start_stripe != end_stripe); 1477 } 1478 1479 static uint32_t 1480 _to_next_boundary(uint64_t offset, uint32_t boundary) 1481 { 1482 return (boundary - (offset % boundary)); 1483 } 1484 1485 static void 1486 _spdk_bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 1487 1488 static void 1489 _spdk_bdev_io_split_with_payload(void *_bdev_io) 1490 { 1491 struct spdk_bdev_io *bdev_io = _bdev_io; 1492 uint64_t current_offset, remaining; 1493 uint32_t blocklen, to_next_boundary, to_next_boundary_bytes; 1494 struct iovec *parent_iov, *iov; 1495 uint64_t parent_iov_offset, iov_len; 1496 uint32_t parent_iovpos, parent_iovcnt, child_iovcnt, iovcnt; 1497 int rc; 1498 1499 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 1500 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 1501 blocklen = bdev_io->bdev->blocklen; 1502 parent_iov_offset = (current_offset - bdev_io->u.bdev.offset_blocks) * blocklen; 1503 parent_iovcnt = bdev_io->u.bdev.iovcnt; 1504 1505 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 1506 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 1507 if (parent_iov_offset < parent_iov->iov_len) { 1508 break; 1509 } 1510 parent_iov_offset -= parent_iov->iov_len; 1511 } 1512 1513 child_iovcnt = 0; 1514 while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { 1515 to_next_boundary = _to_next_boundary(current_offset, bdev_io->bdev->optimal_io_boundary); 1516 to_next_boundary = spdk_min(remaining, to_next_boundary); 1517 to_next_boundary_bytes = to_next_boundary * blocklen; 1518 iov = &bdev_io->child_iov[child_iovcnt]; 1519 iovcnt = 0; 1520 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 1521 child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { 1522 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 1523 iov_len = spdk_min(to_next_boundary_bytes, parent_iov->iov_len - parent_iov_offset); 1524 to_next_boundary_bytes -= iov_len; 1525 1526 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 1527 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 1528 1529 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 1530 parent_iov_offset += iov_len; 1531 } else { 
1532 parent_iovpos++; 1533 parent_iov_offset = 0; 1534 } 1535 child_iovcnt++; 1536 iovcnt++; 1537 } 1538 1539 if (to_next_boundary_bytes > 0) { 1540 /* We had to stop this child I/O early because we ran out of 1541 * child_iov space. Make sure the iovs collected are valid and 1542 * then adjust to_next_boundary before starting the child I/O. 1543 */ 1544 if ((to_next_boundary_bytes % blocklen) != 0) { 1545 SPDK_ERRLOG("Remaining %" PRIu32 " is not multiple of block size %" PRIu32 "\n", 1546 to_next_boundary_bytes, blocklen); 1547 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1548 if (bdev_io->u.bdev.split_outstanding == 0) { 1549 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 1550 } 1551 return; 1552 } 1553 to_next_boundary -= to_next_boundary_bytes / blocklen; 1554 } 1555 1556 bdev_io->u.bdev.split_outstanding++; 1557 1558 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1559 rc = spdk_bdev_readv_blocks(bdev_io->internal.desc, 1560 spdk_io_channel_from_ctx(bdev_io->internal.ch), 1561 iov, iovcnt, current_offset, to_next_boundary, 1562 _spdk_bdev_io_split_done, bdev_io); 1563 } else { 1564 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 1565 spdk_io_channel_from_ctx(bdev_io->internal.ch), 1566 iov, iovcnt, current_offset, to_next_boundary, 1567 _spdk_bdev_io_split_done, bdev_io); 1568 } 1569 1570 if (rc == 0) { 1571 current_offset += to_next_boundary; 1572 remaining -= to_next_boundary; 1573 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 1574 bdev_io->u.bdev.split_remaining_num_blocks = remaining; 1575 } else { 1576 bdev_io->u.bdev.split_outstanding--; 1577 if (rc == -ENOMEM) { 1578 if (bdev_io->u.bdev.split_outstanding == 0) { 1579 /* No I/O is outstanding. Hence we should wait here. */ 1580 _spdk_bdev_queue_io_wait_with_cb(bdev_io, 1581 _spdk_bdev_io_split_with_payload); 1582 } 1583 } else { 1584 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1585 if (bdev_io->u.bdev.split_outstanding == 0) { 1586 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 1587 } 1588 } 1589 1590 return; 1591 } 1592 } 1593 } 1594 1595 static void 1596 _spdk_bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 1597 { 1598 struct spdk_bdev_io *parent_io = cb_arg; 1599 1600 spdk_bdev_free_io(bdev_io); 1601 1602 if (!success) { 1603 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1604 } 1605 parent_io->u.bdev.split_outstanding--; 1606 if (parent_io->u.bdev.split_outstanding != 0) { 1607 return; 1608 } 1609 1610 /* 1611 * Parent I/O finishes when all blocks are consumed or there is any failure of 1612 * child I/O and no outstanding child I/O. 1613 */ 1614 if (parent_io->u.bdev.split_remaining_num_blocks == 0 || 1615 parent_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS) { 1616 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 1617 parent_io->internal.caller_ctx); 1618 return; 1619 } 1620 1621 /* 1622 * Continue with the splitting process. This function will complete the parent I/O if the 1623 * splitting is done. 
1624 */ 1625 _spdk_bdev_io_split_with_payload(parent_io); 1626 } 1627 1628 static void 1629 _spdk_bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 1630 { 1631 assert(_spdk_bdev_io_type_can_split(bdev_io->type)); 1632 1633 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 1634 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 1635 bdev_io->u.bdev.split_outstanding = 0; 1636 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 1637 1638 _spdk_bdev_io_split_with_payload(bdev_io); 1639 } 1640 1641 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 1642 * be inlined, at least on some compilers. 1643 */ 1644 static inline void 1645 _spdk_bdev_io_submit(void *ctx) 1646 { 1647 struct spdk_bdev_io *bdev_io = ctx; 1648 struct spdk_bdev *bdev = bdev_io->bdev; 1649 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1650 struct spdk_io_channel *ch = bdev_ch->channel; 1651 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1652 uint64_t tsc; 1653 1654 tsc = spdk_get_ticks(); 1655 bdev_io->internal.submit_tsc = tsc; 1656 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, bdev_io->type); 1657 bdev_ch->io_outstanding++; 1658 shared_resource->io_outstanding++; 1659 bdev_io->internal.in_submit_request = true; 1660 if (spdk_likely(bdev_ch->flags == 0)) { 1661 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 1662 bdev->fn_table->submit_request(ch, bdev_io); 1663 } else { 1664 bdev_ch->io_outstanding--; 1665 shared_resource->io_outstanding--; 1666 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 1667 } 1668 } else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 1669 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 1670 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 1671 bdev_ch->io_outstanding--; 1672 shared_resource->io_outstanding--; 1673 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 1674 _spdk_bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 1675 } else { 1676 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 1677 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 1678 } 1679 bdev_io->internal.in_submit_request = false; 1680 } 1681 1682 static void 1683 spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io) 1684 { 1685 struct spdk_bdev *bdev = bdev_io->bdev; 1686 struct spdk_thread *thread = spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 1687 1688 assert(thread != NULL); 1689 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 1690 1691 if (bdev->split_on_optimal_io_boundary && _spdk_bdev_io_should_split(bdev_io)) { 1692 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1693 spdk_bdev_io_get_buf(bdev_io, _spdk_bdev_io_split, 1694 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 1695 } else { 1696 _spdk_bdev_io_split(NULL, bdev_io); 1697 } 1698 return; 1699 } 1700 1701 if (bdev_io->internal.ch->flags & BDEV_CH_QOS_ENABLED) { 1702 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 1703 _spdk_bdev_io_submit(bdev_io); 1704 } else { 1705 bdev_io->internal.io_submit_ch = bdev_io->internal.ch; 1706 bdev_io->internal.ch = bdev->internal.qos->ch; 1707 spdk_thread_send_msg(bdev->internal.qos->thread, _spdk_bdev_io_submit, bdev_io); 1708 } 1709 } else { 1710 _spdk_bdev_io_submit(bdev_io); 1711 } 1712 } 1713 1714 static void 1715 spdk_bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 1716 { 1717 struct 
spdk_bdev *bdev = bdev_io->bdev; 1718 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1719 struct spdk_io_channel *ch = bdev_ch->channel; 1720 1721 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 1722 1723 bdev_io->internal.in_submit_request = true; 1724 bdev->fn_table->submit_request(ch, bdev_io); 1725 bdev_io->internal.in_submit_request = false; 1726 } 1727 1728 static void 1729 spdk_bdev_io_init(struct spdk_bdev_io *bdev_io, 1730 struct spdk_bdev *bdev, void *cb_arg, 1731 spdk_bdev_io_completion_cb cb) 1732 { 1733 bdev_io->bdev = bdev; 1734 bdev_io->internal.caller_ctx = cb_arg; 1735 bdev_io->internal.cb = cb; 1736 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1737 bdev_io->internal.in_submit_request = false; 1738 bdev_io->internal.buf = NULL; 1739 bdev_io->internal.io_submit_ch = NULL; 1740 bdev_io->internal.orig_iovs = NULL; 1741 bdev_io->internal.orig_iovcnt = 0; 1742 } 1743 1744 static bool 1745 _spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 1746 { 1747 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 1748 } 1749 1750 bool 1751 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 1752 { 1753 bool supported; 1754 1755 supported = _spdk_bdev_io_type_supported(bdev, io_type); 1756 1757 if (!supported) { 1758 switch (io_type) { 1759 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 1760 /* The bdev layer will emulate write zeroes as long as write is supported. */ 1761 supported = _spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 1762 break; 1763 default: 1764 break; 1765 } 1766 } 1767 1768 return supported; 1769 } 1770 1771 int 1772 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1773 { 1774 if (bdev->fn_table->dump_info_json) { 1775 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 1776 } 1777 1778 return 0; 1779 } 1780 1781 static void 1782 spdk_bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 1783 { 1784 uint32_t max_per_timeslice = 0; 1785 int i; 1786 1787 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1788 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 1789 qos->rate_limits[i].max_per_timeslice = 0; 1790 continue; 1791 } 1792 1793 max_per_timeslice = qos->rate_limits[i].limit * 1794 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 1795 1796 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 1797 qos->rate_limits[i].min_per_timeslice); 1798 1799 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 1800 } 1801 1802 _spdk_bdev_qos_set_ops(qos); 1803 } 1804 1805 static int 1806 spdk_bdev_channel_poll_qos(void *arg) 1807 { 1808 struct spdk_bdev_qos *qos = arg; 1809 uint64_t now = spdk_get_ticks(); 1810 int i; 1811 1812 if (now < (qos->last_timeslice + qos->timeslice_size)) { 1813 /* We received our callback earlier than expected - return 1814 * immediately and wait to do accounting until at least one 1815 * timeslice has actually expired. This should never happen 1816 * with a well-behaved timer implementation. 1817 */ 1818 return 0; 1819 } 1820 1821 /* Reset for next round of rate limiting */ 1822 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1823 /* We may have allowed the IOs or bytes to slightly overrun in the last 1824 * timeslice. remaining_this_timeslice is signed, so if it's negative 1825 * here, we'll account for the overrun so that the next timeslice will 1826 * be appropriately reduced. 
1827 */ 1828 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 1829 qos->rate_limits[i].remaining_this_timeslice = 0; 1830 } 1831 } 1832 1833 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 1834 qos->last_timeslice += qos->timeslice_size; 1835 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1836 qos->rate_limits[i].remaining_this_timeslice += 1837 qos->rate_limits[i].max_per_timeslice; 1838 } 1839 } 1840 1841 return _spdk_bdev_qos_io_submit(qos->ch, qos); 1842 } 1843 1844 static void 1845 _spdk_bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 1846 { 1847 struct spdk_bdev_shared_resource *shared_resource; 1848 1849 spdk_put_io_channel(ch->channel); 1850 1851 shared_resource = ch->shared_resource; 1852 1853 assert(ch->io_outstanding == 0); 1854 assert(shared_resource->ref > 0); 1855 shared_resource->ref--; 1856 if (shared_resource->ref == 0) { 1857 assert(shared_resource->io_outstanding == 0); 1858 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 1859 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 1860 free(shared_resource); 1861 } 1862 } 1863 1864 /* Caller must hold bdev->internal.mutex. */ 1865 static void 1866 _spdk_bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 1867 { 1868 struct spdk_bdev_qos *qos = bdev->internal.qos; 1869 int i; 1870 1871 /* Rate limiting on this bdev enabled */ 1872 if (qos) { 1873 if (qos->ch == NULL) { 1874 struct spdk_io_channel *io_ch; 1875 1876 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 1877 bdev->name, spdk_get_thread()); 1878 1879 /* No qos channel has been selected, so set one up */ 1880 1881 /* Take another reference to ch */ 1882 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 1883 assert(io_ch != NULL); 1884 qos->ch = ch; 1885 1886 qos->thread = spdk_io_channel_get_thread(io_ch); 1887 1888 TAILQ_INIT(&qos->queued); 1889 1890 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1891 if (_spdk_bdev_qos_is_iops_rate_limit(i) == true) { 1892 qos->rate_limits[i].min_per_timeslice = 1893 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 1894 } else { 1895 qos->rate_limits[i].min_per_timeslice = 1896 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 1897 } 1898 1899 if (qos->rate_limits[i].limit == 0) { 1900 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 1901 } 1902 } 1903 spdk_bdev_qos_update_max_quota_per_timeslice(qos); 1904 qos->timeslice_size = 1905 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 1906 qos->last_timeslice = spdk_get_ticks(); 1907 qos->poller = spdk_poller_register(spdk_bdev_channel_poll_qos, 1908 qos, 1909 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 1910 } 1911 1912 ch->flags |= BDEV_CH_QOS_ENABLED; 1913 } 1914 } 1915 1916 static int 1917 spdk_bdev_channel_create(void *io_device, void *ctx_buf) 1918 { 1919 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 1920 struct spdk_bdev_channel *ch = ctx_buf; 1921 struct spdk_io_channel *mgmt_io_ch; 1922 struct spdk_bdev_mgmt_channel *mgmt_ch; 1923 struct spdk_bdev_shared_resource *shared_resource; 1924 1925 ch->bdev = bdev; 1926 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 1927 if (!ch->channel) { 1928 return -1; 1929 } 1930 1931 assert(ch->histogram == NULL); 1932 if (bdev->internal.histogram_enabled) { 1933 ch->histogram = spdk_histogram_data_alloc(); 1934 if (ch->histogram == NULL) { 1935 SPDK_ERRLOG("Could not allocate histogram\n"); 1936 } 1937 } 1938 1939 mgmt_io_ch = 
spdk_get_io_channel(&g_bdev_mgr); 1940 if (!mgmt_io_ch) { 1941 spdk_put_io_channel(ch->channel); 1942 return -1; 1943 } 1944 1945 mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); 1946 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 1947 if (shared_resource->shared_ch == ch->channel) { 1948 spdk_put_io_channel(mgmt_io_ch); 1949 shared_resource->ref++; 1950 break; 1951 } 1952 } 1953 1954 if (shared_resource == NULL) { 1955 shared_resource = calloc(1, sizeof(*shared_resource)); 1956 if (shared_resource == NULL) { 1957 spdk_put_io_channel(ch->channel); 1958 spdk_put_io_channel(mgmt_io_ch); 1959 return -1; 1960 } 1961 1962 shared_resource->mgmt_ch = mgmt_ch; 1963 shared_resource->io_outstanding = 0; 1964 TAILQ_INIT(&shared_resource->nomem_io); 1965 shared_resource->nomem_threshold = 0; 1966 shared_resource->shared_ch = ch->channel; 1967 shared_resource->ref = 1; 1968 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 1969 } 1970 1971 memset(&ch->stat, 0, sizeof(ch->stat)); 1972 ch->stat.ticks_rate = spdk_get_ticks_hz(); 1973 ch->io_outstanding = 0; 1974 TAILQ_INIT(&ch->queued_resets); 1975 ch->flags = 0; 1976 ch->shared_resource = shared_resource; 1977 1978 #ifdef SPDK_CONFIG_VTUNE 1979 { 1980 char *name; 1981 __itt_init_ittlib(NULL, 0); 1982 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 1983 if (!name) { 1984 _spdk_bdev_channel_destroy_resource(ch); 1985 return -1; 1986 } 1987 ch->handle = __itt_string_handle_create(name); 1988 free(name); 1989 ch->start_tsc = spdk_get_ticks(); 1990 ch->interval_tsc = spdk_get_ticks_hz() / 100; 1991 memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); 1992 } 1993 #endif 1994 1995 pthread_mutex_lock(&bdev->internal.mutex); 1996 _spdk_bdev_enable_qos(bdev, ch); 1997 pthread_mutex_unlock(&bdev->internal.mutex); 1998 1999 return 0; 2000 } 2001 2002 /* 2003 * Abort I/O that are waiting on a data buffer. These types of I/O are 2004 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. 2005 */ 2006 static void 2007 _spdk_bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 2008 { 2009 bdev_io_stailq_t tmp; 2010 struct spdk_bdev_io *bdev_io; 2011 2012 STAILQ_INIT(&tmp); 2013 2014 while (!STAILQ_EMPTY(queue)) { 2015 bdev_io = STAILQ_FIRST(queue); 2016 STAILQ_REMOVE_HEAD(queue, internal.buf_link); 2017 if (bdev_io->internal.ch == ch) { 2018 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2019 } else { 2020 STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); 2021 } 2022 } 2023 2024 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 2025 } 2026 2027 /* 2028 * Abort I/O that are queued waiting for submission. These types of I/O are 2029 * linked using the spdk_bdev_io link TAILQ_ENTRY. 2030 */ 2031 static void 2032 _spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 2033 { 2034 struct spdk_bdev_io *bdev_io, *tmp; 2035 2036 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 2037 if (bdev_io->internal.ch == ch) { 2038 TAILQ_REMOVE(queue, bdev_io, internal.link); 2039 /* 2040 * spdk_bdev_io_complete() assumes that the completed I/O had 2041 * been submitted to the bdev module. Since in this case it 2042 * hadn't, bump io_outstanding to account for the decrement 2043 * that spdk_bdev_io_complete() will do. 
2044 */ 2045 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 2046 ch->io_outstanding++; 2047 ch->shared_resource->io_outstanding++; 2048 } 2049 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2050 } 2051 } 2052 } 2053 2054 static void 2055 spdk_bdev_qos_channel_destroy(void *cb_arg) 2056 { 2057 struct spdk_bdev_qos *qos = cb_arg; 2058 2059 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 2060 spdk_poller_unregister(&qos->poller); 2061 2062 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Free QoS %p.\n", qos); 2063 2064 free(qos); 2065 } 2066 2067 static int 2068 spdk_bdev_qos_destroy(struct spdk_bdev *bdev) 2069 { 2070 int i; 2071 2072 /* 2073 * Cleanly shutting down the QoS poller is tricky, because 2074 * during the asynchronous operation the user could open 2075 * a new descriptor and create a new channel, spawning 2076 * a new QoS poller. 2077 * 2078 * The strategy is to create a new QoS structure here and swap it 2079 * in. The shutdown path then continues to refer to the old one 2080 * until it completes and then releases it. 2081 */ 2082 struct spdk_bdev_qos *new_qos, *old_qos; 2083 2084 old_qos = bdev->internal.qos; 2085 2086 new_qos = calloc(1, sizeof(*new_qos)); 2087 if (!new_qos) { 2088 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 2089 return -ENOMEM; 2090 } 2091 2092 /* Copy the old QoS data into the newly allocated structure */ 2093 memcpy(new_qos, old_qos, sizeof(*new_qos)); 2094 2095 /* Zero out the key parts of the QoS structure */ 2096 new_qos->ch = NULL; 2097 new_qos->thread = NULL; 2098 new_qos->poller = NULL; 2099 TAILQ_INIT(&new_qos->queued); 2100 /* 2101 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 2102 * It will be used later for the new QoS structure. 2103 */ 2104 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2105 new_qos->rate_limits[i].remaining_this_timeslice = 0; 2106 new_qos->rate_limits[i].min_per_timeslice = 0; 2107 new_qos->rate_limits[i].max_per_timeslice = 0; 2108 } 2109 2110 bdev->internal.qos = new_qos; 2111 2112 if (old_qos->thread == NULL) { 2113 free(old_qos); 2114 } else { 2115 spdk_thread_send_msg(old_qos->thread, spdk_bdev_qos_channel_destroy, 2116 old_qos); 2117 } 2118 2119 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 2120 * been destroyed yet. The destruction path will end up waiting for the final 2121 * channel to be put before it releases resources. 
 */

	return 0;
}

static void
_spdk_bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add)
{
	total->bytes_read += add->bytes_read;
	total->num_read_ops += add->num_read_ops;
	total->bytes_written += add->bytes_written;
	total->num_write_ops += add->num_write_ops;
	total->bytes_unmapped += add->bytes_unmapped;
	total->num_unmap_ops += add->num_unmap_ops;
	total->read_latency_ticks += add->read_latency_ticks;
	total->write_latency_ticks += add->write_latency_ticks;
	total->unmap_latency_ticks += add->unmap_latency_ticks;
}

static void
spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_channel *ch = ctx_buf;
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;

	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
		      spdk_get_thread());

	/* This channel is going away, so add its statistics into the bdev so that they don't get lost. */
	pthread_mutex_lock(&ch->bdev->internal.mutex);
	_spdk_bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat);
	pthread_mutex_unlock(&ch->bdev->internal.mutex);

	mgmt_ch = shared_resource->mgmt_ch;

	_spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
	_spdk_bdev_abort_queued_io(&shared_resource->nomem_io, ch);
	_spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_small, ch);
	_spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_large, ch);

	if (ch->histogram) {
		spdk_histogram_data_free(ch->histogram);
	}

	_spdk_bdev_channel_destroy_resource(ch);
}

int
spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
{
	struct spdk_bdev_alias *tmp;

	if (alias == NULL) {
		SPDK_ERRLOG("NULL alias passed\n");
		return -EINVAL;
	}

	if (spdk_bdev_get_by_name(alias)) {
		SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias);
		return -EEXIST;
	}

	tmp = calloc(1, sizeof(*tmp));
	if (tmp == NULL) {
		SPDK_ERRLOG("Unable to allocate alias\n");
		return -ENOMEM;
	}

	tmp->alias = strdup(alias);
	if (tmp->alias == NULL) {
		free(tmp);
		SPDK_ERRLOG("Unable to allocate alias\n");
		return -ENOMEM;
	}

	TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);

	return 0;
}

int
spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
{
	struct spdk_bdev_alias *tmp;

	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (strcmp(alias, tmp->alias) == 0) {
			TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
			free(tmp->alias);
			free(tmp);
			return 0;
		}
	}

	SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exist\n", alias);

	return -ENOENT;
}

void
spdk_bdev_alias_del_all(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *p, *tmp;

	TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) {
		TAILQ_REMOVE(&bdev->aliases, p, tailq);
		free(p->alias);
		free(p);
	}
}

struct spdk_io_channel *
spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
{
	return spdk_get_io_channel(__bdev_to_io_dev(desc->bdev));
}

const char *
spdk_bdev_get_name(const struct spdk_bdev *bdev)
{
	return bdev->name;
}

const char *
spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
{
	return bdev->product_name;
}

const struct spdk_bdev_aliases_list *
spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
{
	return &bdev->aliases;
}

uint32_t
spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
{
	return bdev->blocklen;
}

uint64_t
spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
{
	return bdev->blockcnt;
}

const char *
spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type)
{
	return qos_rpc_type[type];
}

void
spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
{
	int i;

	memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);

	pthread_mutex_lock(&bdev->internal.mutex);
	if (bdev->internal.qos) {
		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
			if (bdev->internal.qos->rate_limits[i].limit !=
			    SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
				limits[i] = bdev->internal.qos->rate_limits[i].limit;
				if (_spdk_bdev_qos_is_iops_rate_limit(i) == false) {
					/* Convert from bytes to megabytes, which is the unit reported to the user. */
					limits[i] = limits[i] / 1024 / 1024;
				}
			}
		}
	}
	pthread_mutex_unlock(&bdev->internal.mutex);
}

size_t
spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
{
	return 1 << bdev->required_alignment;
}

uint32_t
spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
{
	return bdev->optimal_io_boundary;
}

bool
spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
{
	return bdev->write_cache;
}

const struct spdk_uuid *
spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
{
	return &bdev->uuid;
}

uint32_t
spdk_bdev_get_md_size(const struct spdk_bdev *bdev)
{
	return bdev->md_len;
}

bool
spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev)
{
	return (bdev->md_len != 0) && bdev->md_interleave;
}

enum spdk_dif_type spdk_bdev_get_dif_type(const struct spdk_bdev *bdev)
{
	if (bdev->md_len != 0) {
		return bdev->dif_type;
	} else {
		return SPDK_DIF_DISABLE;
	}
}

bool
spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev)
{
	if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) {
		return bdev->dif_is_head_of_md;
	} else {
		return false;
	}
}

bool
spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev,
			       enum spdk_dif_check_type check_type)
{
	if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) {
		return false;
	}

	switch (check_type) {
	case SPDK_DIF_CHECK_TYPE_REFTAG:
		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0;
	case SPDK_DIF_CHECK_TYPE_APPTAG:
		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0;
	case SPDK_DIF_CHECK_TYPE_GUARD:
		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0;
	default:
		return false;
	}
}

uint64_t
spdk_bdev_get_qd(const struct spdk_bdev *bdev)
{
	return bdev->internal.measured_queue_depth;
}

uint64_t
spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev)
{
	return bdev->internal.period;
}

uint64_t
spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev)
{
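/*
 * Illustrative sketch, not part of the upstream file: how a consumer might
 * use the getters defined above to report a bdev's geometry and any QoS
 * rate limits.  The helper name my_dump_bdev_info is hypothetical; the
 * spdk_bdev_get_* calls are the APIs implemented in this file.
 *
 *	static void
 *	my_dump_bdev_info(struct spdk_bdev *bdev)
 *	{
 *		uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
 *		int i;
 *
 *		printf("%s: %" PRIu64 " blocks of %" PRIu32 " bytes, buffer alignment %zu\n",
 *		       spdk_bdev_get_name(bdev), spdk_bdev_get_num_blocks(bdev),
 *		       spdk_bdev_get_block_size(bdev), spdk_bdev_get_buf_align(bdev));
 *
 *		spdk_bdev_get_qos_rate_limits(bdev, limits);
 *		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
 *			if (limits[i] != 0) {
 *				printf("  QoS %s: %" PRIu64 "\n",
 *				       spdk_bdev_get_qos_rpc_type(i), limits[i]);
 *			}
 *		}
 *	}
 */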
return bdev->internal.weighted_io_time; 2389 } 2390 2391 uint64_t 2392 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 2393 { 2394 return bdev->internal.io_time; 2395 } 2396 2397 static void 2398 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) 2399 { 2400 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 2401 2402 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 2403 2404 if (bdev->internal.measured_queue_depth) { 2405 bdev->internal.io_time += bdev->internal.period; 2406 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 2407 } 2408 } 2409 2410 static void 2411 _calculate_measured_qd(struct spdk_io_channel_iter *i) 2412 { 2413 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 2414 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 2415 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); 2416 2417 bdev->internal.temporary_queue_depth += ch->io_outstanding; 2418 spdk_for_each_channel_continue(i, 0); 2419 } 2420 2421 static int 2422 spdk_bdev_calculate_measured_queue_depth(void *ctx) 2423 { 2424 struct spdk_bdev *bdev = ctx; 2425 bdev->internal.temporary_queue_depth = 0; 2426 spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, 2427 _calculate_measured_qd_cpl); 2428 return 0; 2429 } 2430 2431 void 2432 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 2433 { 2434 bdev->internal.period = period; 2435 2436 if (bdev->internal.qd_poller != NULL) { 2437 spdk_poller_unregister(&bdev->internal.qd_poller); 2438 bdev->internal.measured_queue_depth = UINT64_MAX; 2439 } 2440 2441 if (period != 0) { 2442 bdev->internal.qd_poller = spdk_poller_register(spdk_bdev_calculate_measured_queue_depth, bdev, 2443 period); 2444 } 2445 } 2446 2447 int 2448 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 2449 { 2450 int ret; 2451 2452 pthread_mutex_lock(&bdev->internal.mutex); 2453 2454 /* bdev has open descriptors */ 2455 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 2456 bdev->blockcnt > size) { 2457 ret = -EBUSY; 2458 } else { 2459 bdev->blockcnt = size; 2460 ret = 0; 2461 } 2462 2463 pthread_mutex_unlock(&bdev->internal.mutex); 2464 2465 return ret; 2466 } 2467 2468 /* 2469 * Convert I/O offset and length from bytes to blocks. 2470 * 2471 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 2472 */ 2473 static uint64_t 2474 spdk_bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 2475 uint64_t num_bytes, uint64_t *num_blocks) 2476 { 2477 uint32_t block_size = bdev->blocklen; 2478 uint8_t shift_cnt; 2479 2480 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
*/ 2481 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 2482 shift_cnt = spdk_u32log2(block_size); 2483 *offset_blocks = offset_bytes >> shift_cnt; 2484 *num_blocks = num_bytes >> shift_cnt; 2485 return (offset_bytes - (*offset_blocks << shift_cnt)) | 2486 (num_bytes - (*num_blocks << shift_cnt)); 2487 } else { 2488 *offset_blocks = offset_bytes / block_size; 2489 *num_blocks = num_bytes / block_size; 2490 return (offset_bytes % block_size) | (num_bytes % block_size); 2491 } 2492 } 2493 2494 static bool 2495 spdk_bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 2496 { 2497 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 2498 * has been an overflow and hence the offset has been wrapped around */ 2499 if (offset_blocks + num_blocks < offset_blocks) { 2500 return false; 2501 } 2502 2503 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 2504 if (offset_blocks + num_blocks > bdev->blockcnt) { 2505 return false; 2506 } 2507 2508 return true; 2509 } 2510 2511 int 2512 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2513 void *buf, uint64_t offset, uint64_t nbytes, 2514 spdk_bdev_io_completion_cb cb, void *cb_arg) 2515 { 2516 uint64_t offset_blocks, num_blocks; 2517 2518 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 2519 return -EINVAL; 2520 } 2521 2522 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 2523 } 2524 2525 int 2526 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2527 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 2528 spdk_bdev_io_completion_cb cb, void *cb_arg) 2529 { 2530 struct spdk_bdev *bdev = desc->bdev; 2531 struct spdk_bdev_io *bdev_io; 2532 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2533 2534 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2535 return -EINVAL; 2536 } 2537 2538 bdev_io = spdk_bdev_get_io(channel); 2539 if (!bdev_io) { 2540 return -ENOMEM; 2541 } 2542 2543 bdev_io->internal.ch = channel; 2544 bdev_io->internal.desc = desc; 2545 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 2546 bdev_io->u.bdev.iovs = &bdev_io->iov; 2547 bdev_io->u.bdev.iovs[0].iov_base = buf; 2548 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 2549 bdev_io->u.bdev.iovcnt = 1; 2550 bdev_io->u.bdev.num_blocks = num_blocks; 2551 bdev_io->u.bdev.offset_blocks = offset_blocks; 2552 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2553 2554 spdk_bdev_io_submit(bdev_io); 2555 return 0; 2556 } 2557 2558 int 2559 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2560 struct iovec *iov, int iovcnt, 2561 uint64_t offset, uint64_t nbytes, 2562 spdk_bdev_io_completion_cb cb, void *cb_arg) 2563 { 2564 uint64_t offset_blocks, num_blocks; 2565 2566 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 2567 return -EINVAL; 2568 } 2569 2570 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 2571 } 2572 2573 int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2574 struct iovec *iov, int iovcnt, 2575 uint64_t offset_blocks, uint64_t num_blocks, 2576 spdk_bdev_io_completion_cb cb, void *cb_arg) 2577 { 2578 struct spdk_bdev *bdev = desc->bdev; 2579 struct spdk_bdev_io *bdev_io; 2580 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2581 2582 if 
(!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2583 return -EINVAL; 2584 } 2585 2586 bdev_io = spdk_bdev_get_io(channel); 2587 if (!bdev_io) { 2588 return -ENOMEM; 2589 } 2590 2591 bdev_io->internal.ch = channel; 2592 bdev_io->internal.desc = desc; 2593 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 2594 bdev_io->u.bdev.iovs = iov; 2595 bdev_io->u.bdev.iovcnt = iovcnt; 2596 bdev_io->u.bdev.num_blocks = num_blocks; 2597 bdev_io->u.bdev.offset_blocks = offset_blocks; 2598 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2599 2600 spdk_bdev_io_submit(bdev_io); 2601 return 0; 2602 } 2603 2604 int 2605 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2606 void *buf, uint64_t offset, uint64_t nbytes, 2607 spdk_bdev_io_completion_cb cb, void *cb_arg) 2608 { 2609 uint64_t offset_blocks, num_blocks; 2610 2611 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 2612 return -EINVAL; 2613 } 2614 2615 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 2616 } 2617 2618 int 2619 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2620 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 2621 spdk_bdev_io_completion_cb cb, void *cb_arg) 2622 { 2623 struct spdk_bdev *bdev = desc->bdev; 2624 struct spdk_bdev_io *bdev_io; 2625 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2626 2627 if (!desc->write) { 2628 return -EBADF; 2629 } 2630 2631 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2632 return -EINVAL; 2633 } 2634 2635 bdev_io = spdk_bdev_get_io(channel); 2636 if (!bdev_io) { 2637 return -ENOMEM; 2638 } 2639 2640 bdev_io->internal.ch = channel; 2641 bdev_io->internal.desc = desc; 2642 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 2643 bdev_io->u.bdev.iovs = &bdev_io->iov; 2644 bdev_io->u.bdev.iovs[0].iov_base = buf; 2645 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 2646 bdev_io->u.bdev.iovcnt = 1; 2647 bdev_io->u.bdev.num_blocks = num_blocks; 2648 bdev_io->u.bdev.offset_blocks = offset_blocks; 2649 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2650 2651 spdk_bdev_io_submit(bdev_io); 2652 return 0; 2653 } 2654 2655 int 2656 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2657 struct iovec *iov, int iovcnt, 2658 uint64_t offset, uint64_t len, 2659 spdk_bdev_io_completion_cb cb, void *cb_arg) 2660 { 2661 uint64_t offset_blocks, num_blocks; 2662 2663 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) { 2664 return -EINVAL; 2665 } 2666 2667 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 2668 } 2669 2670 int 2671 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2672 struct iovec *iov, int iovcnt, 2673 uint64_t offset_blocks, uint64_t num_blocks, 2674 spdk_bdev_io_completion_cb cb, void *cb_arg) 2675 { 2676 struct spdk_bdev *bdev = desc->bdev; 2677 struct spdk_bdev_io *bdev_io; 2678 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2679 2680 if (!desc->write) { 2681 return -EBADF; 2682 } 2683 2684 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2685 return -EINVAL; 2686 } 2687 2688 bdev_io = spdk_bdev_get_io(channel); 2689 if (!bdev_io) { 2690 return -ENOMEM; 2691 } 2692 2693 bdev_io->internal.ch = channel; 2694 bdev_io->internal.desc = desc; 2695 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 2696 bdev_io->u.bdev.iovs = iov; 2697 bdev_io->u.bdev.iovcnt = 
iovcnt; 2698 bdev_io->u.bdev.num_blocks = num_blocks; 2699 bdev_io->u.bdev.offset_blocks = offset_blocks; 2700 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2701 2702 spdk_bdev_io_submit(bdev_io); 2703 return 0; 2704 } 2705 2706 int 2707 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2708 uint64_t offset, uint64_t len, 2709 spdk_bdev_io_completion_cb cb, void *cb_arg) 2710 { 2711 uint64_t offset_blocks, num_blocks; 2712 2713 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) { 2714 return -EINVAL; 2715 } 2716 2717 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 2718 } 2719 2720 int 2721 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2722 uint64_t offset_blocks, uint64_t num_blocks, 2723 spdk_bdev_io_completion_cb cb, void *cb_arg) 2724 { 2725 struct spdk_bdev *bdev = desc->bdev; 2726 struct spdk_bdev_io *bdev_io; 2727 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2728 2729 if (!desc->write) { 2730 return -EBADF; 2731 } 2732 2733 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2734 return -EINVAL; 2735 } 2736 2737 bdev_io = spdk_bdev_get_io(channel); 2738 2739 if (!bdev_io) { 2740 return -ENOMEM; 2741 } 2742 2743 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 2744 bdev_io->internal.ch = channel; 2745 bdev_io->internal.desc = desc; 2746 bdev_io->u.bdev.offset_blocks = offset_blocks; 2747 bdev_io->u.bdev.num_blocks = num_blocks; 2748 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2749 2750 if (_spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 2751 spdk_bdev_io_submit(bdev_io); 2752 return 0; 2753 } else if (_spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 2754 assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE); 2755 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 2756 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 2757 _spdk_bdev_write_zero_buffer_next(bdev_io); 2758 return 0; 2759 } else { 2760 spdk_bdev_free_io(bdev_io); 2761 return -ENOTSUP; 2762 } 2763 } 2764 2765 int 2766 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2767 uint64_t offset, uint64_t nbytes, 2768 spdk_bdev_io_completion_cb cb, void *cb_arg) 2769 { 2770 uint64_t offset_blocks, num_blocks; 2771 2772 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 2773 return -EINVAL; 2774 } 2775 2776 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 2777 } 2778 2779 int 2780 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2781 uint64_t offset_blocks, uint64_t num_blocks, 2782 spdk_bdev_io_completion_cb cb, void *cb_arg) 2783 { 2784 struct spdk_bdev *bdev = desc->bdev; 2785 struct spdk_bdev_io *bdev_io; 2786 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2787 2788 if (!desc->write) { 2789 return -EBADF; 2790 } 2791 2792 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2793 return -EINVAL; 2794 } 2795 2796 if (num_blocks == 0) { 2797 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 2798 return -EINVAL; 2799 } 2800 2801 bdev_io = spdk_bdev_get_io(channel); 2802 if (!bdev_io) { 2803 return -ENOMEM; 2804 } 2805 2806 bdev_io->internal.ch = channel; 2807 bdev_io->internal.desc = desc; 2808 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 2809 2810 bdev_io->u.bdev.iovs = &bdev_io->iov; 2811 bdev_io->u.bdev.iovs[0].iov_base = NULL; 2812 
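/*
 * Illustrative sketch, not part of the upstream file: the -ENOMEM retry
 * contract shared by the submit functions above.  When a call such as
 * spdk_bdev_unmap_blocks() returns -ENOMEM (no spdk_bdev_io is available),
 * the caller registers a spdk_bdev_io_wait_entry and is called back once a
 * bdev_io frees up.  The names my_io_ctx, my_retry_unmap and my_unmap_done
 * are hypothetical caller-side identifiers.
 *
 *	static void
 *	my_retry_unmap(void *arg)
 *	{
 *		struct my_io_ctx *ctx = arg;
 *		int rc;
 *
 *		rc = spdk_bdev_unmap_blocks(ctx->desc, ctx->ch, ctx->offset_blocks,
 *					    ctx->num_blocks, my_unmap_done, ctx);
 *		if (rc == -ENOMEM) {
 *			ctx->wait_entry.bdev = spdk_bdev_desc_get_bdev(ctx->desc);
 *			ctx->wait_entry.cb_fn = my_retry_unmap;
 *			ctx->wait_entry.cb_arg = ctx;
 *			spdk_bdev_queue_io_wait(ctx->wait_entry.bdev, ctx->ch,
 *						&ctx->wait_entry);
 *		}
 *	}
 */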
bdev_io->u.bdev.iovs[0].iov_len = 0; 2813 bdev_io->u.bdev.iovcnt = 1; 2814 2815 bdev_io->u.bdev.offset_blocks = offset_blocks; 2816 bdev_io->u.bdev.num_blocks = num_blocks; 2817 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2818 2819 spdk_bdev_io_submit(bdev_io); 2820 return 0; 2821 } 2822 2823 int 2824 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2825 uint64_t offset, uint64_t length, 2826 spdk_bdev_io_completion_cb cb, void *cb_arg) 2827 { 2828 uint64_t offset_blocks, num_blocks; 2829 2830 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, length, &num_blocks) != 0) { 2831 return -EINVAL; 2832 } 2833 2834 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 2835 } 2836 2837 int 2838 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2839 uint64_t offset_blocks, uint64_t num_blocks, 2840 spdk_bdev_io_completion_cb cb, void *cb_arg) 2841 { 2842 struct spdk_bdev *bdev = desc->bdev; 2843 struct spdk_bdev_io *bdev_io; 2844 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2845 2846 if (!desc->write) { 2847 return -EBADF; 2848 } 2849 2850 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2851 return -EINVAL; 2852 } 2853 2854 bdev_io = spdk_bdev_get_io(channel); 2855 if (!bdev_io) { 2856 return -ENOMEM; 2857 } 2858 2859 bdev_io->internal.ch = channel; 2860 bdev_io->internal.desc = desc; 2861 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 2862 bdev_io->u.bdev.iovs = NULL; 2863 bdev_io->u.bdev.iovcnt = 0; 2864 bdev_io->u.bdev.offset_blocks = offset_blocks; 2865 bdev_io->u.bdev.num_blocks = num_blocks; 2866 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2867 2868 spdk_bdev_io_submit(bdev_io); 2869 return 0; 2870 } 2871 2872 static void 2873 _spdk_bdev_reset_dev(struct spdk_io_channel_iter *i, int status) 2874 { 2875 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 2876 struct spdk_bdev_io *bdev_io; 2877 2878 bdev_io = TAILQ_FIRST(&ch->queued_resets); 2879 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 2880 spdk_bdev_io_submit_reset(bdev_io); 2881 } 2882 2883 static void 2884 _spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 2885 { 2886 struct spdk_io_channel *ch; 2887 struct spdk_bdev_channel *channel; 2888 struct spdk_bdev_mgmt_channel *mgmt_channel; 2889 struct spdk_bdev_shared_resource *shared_resource; 2890 bdev_io_tailq_t tmp_queued; 2891 2892 TAILQ_INIT(&tmp_queued); 2893 2894 ch = spdk_io_channel_iter_get_channel(i); 2895 channel = spdk_io_channel_get_ctx(ch); 2896 shared_resource = channel->shared_resource; 2897 mgmt_channel = shared_resource->mgmt_ch; 2898 2899 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 2900 2901 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 2902 /* The QoS object is always valid and readable while 2903 * the channel flag is set, so the lock here should not 2904 * be necessary. We're not in the fast path though, so 2905 * just take it anyway. 
*/ 2906 pthread_mutex_lock(&channel->bdev->internal.mutex); 2907 if (channel->bdev->internal.qos->ch == channel) { 2908 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 2909 } 2910 pthread_mutex_unlock(&channel->bdev->internal.mutex); 2911 } 2912 2913 _spdk_bdev_abort_queued_io(&shared_resource->nomem_io, channel); 2914 _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel); 2915 _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel); 2916 _spdk_bdev_abort_queued_io(&tmp_queued, channel); 2917 2918 spdk_for_each_channel_continue(i, 0); 2919 } 2920 2921 static void 2922 _spdk_bdev_start_reset(void *ctx) 2923 { 2924 struct spdk_bdev_channel *ch = ctx; 2925 2926 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), _spdk_bdev_reset_freeze_channel, 2927 ch, _spdk_bdev_reset_dev); 2928 } 2929 2930 static void 2931 _spdk_bdev_channel_start_reset(struct spdk_bdev_channel *ch) 2932 { 2933 struct spdk_bdev *bdev = ch->bdev; 2934 2935 assert(!TAILQ_EMPTY(&ch->queued_resets)); 2936 2937 pthread_mutex_lock(&bdev->internal.mutex); 2938 if (bdev->internal.reset_in_progress == NULL) { 2939 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 2940 /* 2941 * Take a channel reference for the target bdev for the life of this 2942 * reset. This guards against the channel getting destroyed while 2943 * spdk_for_each_channel() calls related to this reset IO are in 2944 * progress. We will release the reference when this reset is 2945 * completed. 2946 */ 2947 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 2948 _spdk_bdev_start_reset(ch); 2949 } 2950 pthread_mutex_unlock(&bdev->internal.mutex); 2951 } 2952 2953 int 2954 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2955 spdk_bdev_io_completion_cb cb, void *cb_arg) 2956 { 2957 struct spdk_bdev *bdev = desc->bdev; 2958 struct spdk_bdev_io *bdev_io; 2959 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2960 2961 bdev_io = spdk_bdev_get_io(channel); 2962 if (!bdev_io) { 2963 return -ENOMEM; 2964 } 2965 2966 bdev_io->internal.ch = channel; 2967 bdev_io->internal.desc = desc; 2968 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 2969 bdev_io->u.reset.ch_ref = NULL; 2970 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2971 2972 pthread_mutex_lock(&bdev->internal.mutex); 2973 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 2974 pthread_mutex_unlock(&bdev->internal.mutex); 2975 2976 _spdk_bdev_channel_start_reset(channel); 2977 2978 return 0; 2979 } 2980 2981 void 2982 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 2983 struct spdk_bdev_io_stat *stat) 2984 { 2985 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2986 2987 *stat = channel->stat; 2988 } 2989 2990 static void 2991 _spdk_bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 2992 { 2993 void *io_device = spdk_io_channel_iter_get_io_device(i); 2994 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 2995 2996 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 2997 bdev_iostat_ctx->cb_arg, 0); 2998 free(bdev_iostat_ctx); 2999 } 3000 3001 static void 3002 _spdk_bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 3003 { 3004 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 3005 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 3006 struct spdk_bdev_channel *channel = 
spdk_io_channel_get_ctx(ch); 3007 3008 _spdk_bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); 3009 spdk_for_each_channel_continue(i, 0); 3010 } 3011 3012 void 3013 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 3014 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 3015 { 3016 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 3017 3018 assert(bdev != NULL); 3019 assert(stat != NULL); 3020 assert(cb != NULL); 3021 3022 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 3023 if (bdev_iostat_ctx == NULL) { 3024 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 3025 cb(bdev, stat, cb_arg, -ENOMEM); 3026 return; 3027 } 3028 3029 bdev_iostat_ctx->stat = stat; 3030 bdev_iostat_ctx->cb = cb; 3031 bdev_iostat_ctx->cb_arg = cb_arg; 3032 3033 /* Start with the statistics from previously deleted channels. */ 3034 pthread_mutex_lock(&bdev->internal.mutex); 3035 _spdk_bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); 3036 pthread_mutex_unlock(&bdev->internal.mutex); 3037 3038 /* Then iterate and add the statistics from each existing channel. */ 3039 spdk_for_each_channel(__bdev_to_io_dev(bdev), 3040 _spdk_bdev_get_each_channel_stat, 3041 bdev_iostat_ctx, 3042 _spdk_bdev_get_device_stat_done); 3043 } 3044 3045 int 3046 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3047 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 3048 spdk_bdev_io_completion_cb cb, void *cb_arg) 3049 { 3050 struct spdk_bdev *bdev = desc->bdev; 3051 struct spdk_bdev_io *bdev_io; 3052 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3053 3054 if (!desc->write) { 3055 return -EBADF; 3056 } 3057 3058 bdev_io = spdk_bdev_get_io(channel); 3059 if (!bdev_io) { 3060 return -ENOMEM; 3061 } 3062 3063 bdev_io->internal.ch = channel; 3064 bdev_io->internal.desc = desc; 3065 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 3066 bdev_io->u.nvme_passthru.cmd = *cmd; 3067 bdev_io->u.nvme_passthru.buf = buf; 3068 bdev_io->u.nvme_passthru.nbytes = nbytes; 3069 bdev_io->u.nvme_passthru.md_buf = NULL; 3070 bdev_io->u.nvme_passthru.md_len = 0; 3071 3072 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 3073 3074 spdk_bdev_io_submit(bdev_io); 3075 return 0; 3076 } 3077 3078 int 3079 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3080 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 3081 spdk_bdev_io_completion_cb cb, void *cb_arg) 3082 { 3083 struct spdk_bdev *bdev = desc->bdev; 3084 struct spdk_bdev_io *bdev_io; 3085 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3086 3087 if (!desc->write) { 3088 /* 3089 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 3090 * to easily determine if the command is a read or write, but for now just 3091 * do not allow io_passthru with a read-only descriptor. 
3092 */ 3093 return -EBADF; 3094 } 3095 3096 bdev_io = spdk_bdev_get_io(channel); 3097 if (!bdev_io) { 3098 return -ENOMEM; 3099 } 3100 3101 bdev_io->internal.ch = channel; 3102 bdev_io->internal.desc = desc; 3103 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 3104 bdev_io->u.nvme_passthru.cmd = *cmd; 3105 bdev_io->u.nvme_passthru.buf = buf; 3106 bdev_io->u.nvme_passthru.nbytes = nbytes; 3107 bdev_io->u.nvme_passthru.md_buf = NULL; 3108 bdev_io->u.nvme_passthru.md_len = 0; 3109 3110 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 3111 3112 spdk_bdev_io_submit(bdev_io); 3113 return 0; 3114 } 3115 3116 int 3117 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3118 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 3119 spdk_bdev_io_completion_cb cb, void *cb_arg) 3120 { 3121 struct spdk_bdev *bdev = desc->bdev; 3122 struct spdk_bdev_io *bdev_io; 3123 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3124 3125 if (!desc->write) { 3126 /* 3127 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 3128 * to easily determine if the command is a read or write, but for now just 3129 * do not allow io_passthru with a read-only descriptor. 3130 */ 3131 return -EBADF; 3132 } 3133 3134 bdev_io = spdk_bdev_get_io(channel); 3135 if (!bdev_io) { 3136 return -ENOMEM; 3137 } 3138 3139 bdev_io->internal.ch = channel; 3140 bdev_io->internal.desc = desc; 3141 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 3142 bdev_io->u.nvme_passthru.cmd = *cmd; 3143 bdev_io->u.nvme_passthru.buf = buf; 3144 bdev_io->u.nvme_passthru.nbytes = nbytes; 3145 bdev_io->u.nvme_passthru.md_buf = md_buf; 3146 bdev_io->u.nvme_passthru.md_len = md_len; 3147 3148 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 3149 3150 spdk_bdev_io_submit(bdev_io); 3151 return 0; 3152 } 3153 3154 int 3155 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 3156 struct spdk_bdev_io_wait_entry *entry) 3157 { 3158 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3159 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 3160 3161 if (bdev != entry->bdev) { 3162 SPDK_ERRLOG("bdevs do not match\n"); 3163 return -EINVAL; 3164 } 3165 3166 if (mgmt_ch->per_thread_cache_count > 0) { 3167 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 3168 return -EINVAL; 3169 } 3170 3171 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 3172 return 0; 3173 } 3174 3175 static void 3176 _spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 3177 { 3178 struct spdk_bdev *bdev = bdev_ch->bdev; 3179 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 3180 struct spdk_bdev_io *bdev_io; 3181 3182 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 3183 /* 3184 * Allow some more I/O to complete before retrying the nomem_io queue. 3185 * Some drivers (such as nvme) cannot immediately take a new I/O in 3186 * the context of a completion, because the resources for the I/O are 3187 * not released until control returns to the bdev poller. Also, we 3188 * may require several small I/O to complete before a larger I/O 3189 * (that requires splitting) can be submitted. 
 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
		bdev_io->internal.ch->io_outstanding++;
		shared_resource->io_outstanding++;
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev->fn_table->submit_request(bdev_io->internal.ch->channel, bdev_io);
		if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			break;
		}
	}
}

static inline void
_spdk_bdev_io_complete(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;
	uint64_t tsc, tsc_diff;

	if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) {
		/*
		 * Send the completion to the thread that originally submitted the I/O,
		 * which may not be the current thread in the case of QoS.
		 */
		if (bdev_io->internal.io_submit_ch) {
			bdev_io->internal.ch = bdev_io->internal.io_submit_ch;
			bdev_io->internal.io_submit_ch = NULL;
		}

		/*
		 * Defer completion to avoid potential infinite recursion if the
		 * user's completion callback issues a new I/O.
		 */
		spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->internal.ch->channel),
				     _spdk_bdev_io_complete, bdev_io);
		return;
	}

	tsc = spdk_get_ticks();
	tsc_diff = tsc - bdev_io->internal.submit_tsc;
	spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 0);

	if (bdev_io->internal.ch->histogram) {
		spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff);
	}

	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_READ:
			bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
			bdev_io->internal.ch->stat.num_read_ops++;
			bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff;
			break;
		case SPDK_BDEV_IO_TYPE_WRITE:
			bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
			bdev_io->internal.ch->stat.num_write_ops++;
			bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff;
			break;
		case SPDK_BDEV_IO_TYPE_UNMAP:
			bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
			bdev_io->internal.ch->stat.num_unmap_ops++;
			bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff;
			break;
		default:
			break;
		}
	}

#ifdef SPDK_CONFIG_VTUNE
	uint64_t now_tsc = spdk_get_ticks();
	if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) {
		uint64_t data[5];

		data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops;
		data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read;
		data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops;
		data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written;
		data[4] = bdev_io->bdev->fn_table->get_spin_time ?
3271 bdev_io->bdev->fn_table->get_spin_time(bdev_io->internal.ch->channel) : 0; 3272 3273 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 3274 __itt_metadata_u64, 5, data); 3275 3276 bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; 3277 bdev_io->internal.ch->start_tsc = now_tsc; 3278 } 3279 #endif 3280 3281 assert(bdev_io->internal.cb != NULL); 3282 assert(spdk_get_thread() == spdk_io_channel_get_thread(bdev_io->internal.ch->channel)); 3283 3284 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3285 bdev_io->internal.caller_ctx); 3286 } 3287 3288 static void 3289 _spdk_bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 3290 { 3291 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 3292 3293 if (bdev_io->u.reset.ch_ref != NULL) { 3294 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 3295 bdev_io->u.reset.ch_ref = NULL; 3296 } 3297 3298 _spdk_bdev_io_complete(bdev_io); 3299 } 3300 3301 static void 3302 _spdk_bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 3303 { 3304 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3305 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 3306 3307 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 3308 if (!TAILQ_EMPTY(&ch->queued_resets)) { 3309 _spdk_bdev_channel_start_reset(ch); 3310 } 3311 3312 spdk_for_each_channel_continue(i, 0); 3313 } 3314 3315 void 3316 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 3317 { 3318 struct spdk_bdev *bdev = bdev_io->bdev; 3319 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3320 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 3321 3322 bdev_io->internal.status = status; 3323 3324 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 3325 bool unlock_channels = false; 3326 3327 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 3328 SPDK_ERRLOG("NOMEM returned for reset\n"); 3329 } 3330 pthread_mutex_lock(&bdev->internal.mutex); 3331 if (bdev_io == bdev->internal.reset_in_progress) { 3332 bdev->internal.reset_in_progress = NULL; 3333 unlock_channels = true; 3334 } 3335 pthread_mutex_unlock(&bdev->internal.mutex); 3336 3337 if (unlock_channels) { 3338 spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_unfreeze_channel, 3339 bdev_io, _spdk_bdev_reset_complete); 3340 return; 3341 } 3342 } else { 3343 if (spdk_unlikely(bdev_io->internal.orig_iovcnt > 0)) { 3344 _bdev_io_unset_bounce_buf(bdev_io); 3345 } 3346 3347 assert(bdev_ch->io_outstanding > 0); 3348 assert(shared_resource->io_outstanding > 0); 3349 bdev_ch->io_outstanding--; 3350 shared_resource->io_outstanding--; 3351 3352 if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) { 3353 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 3354 /* 3355 * Wait for some of the outstanding I/O to complete before we 3356 * retry any of the nomem_io. Normally we will wait for 3357 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 3358 * depth channels we will instead wait for half to complete. 
3359 */ 3360 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 3361 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 3362 return; 3363 } 3364 3365 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 3366 _spdk_bdev_ch_retry_io(bdev_ch); 3367 } 3368 } 3369 3370 _spdk_bdev_io_complete(bdev_io); 3371 } 3372 3373 void 3374 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 3375 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 3376 { 3377 if (sc == SPDK_SCSI_STATUS_GOOD) { 3378 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3379 } else { 3380 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 3381 bdev_io->internal.error.scsi.sc = sc; 3382 bdev_io->internal.error.scsi.sk = sk; 3383 bdev_io->internal.error.scsi.asc = asc; 3384 bdev_io->internal.error.scsi.ascq = ascq; 3385 } 3386 3387 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 3388 } 3389 3390 void 3391 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 3392 int *sc, int *sk, int *asc, int *ascq) 3393 { 3394 assert(sc != NULL); 3395 assert(sk != NULL); 3396 assert(asc != NULL); 3397 assert(ascq != NULL); 3398 3399 switch (bdev_io->internal.status) { 3400 case SPDK_BDEV_IO_STATUS_SUCCESS: 3401 *sc = SPDK_SCSI_STATUS_GOOD; 3402 *sk = SPDK_SCSI_SENSE_NO_SENSE; 3403 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 3404 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 3405 break; 3406 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 3407 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 3408 break; 3409 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 3410 *sc = bdev_io->internal.error.scsi.sc; 3411 *sk = bdev_io->internal.error.scsi.sk; 3412 *asc = bdev_io->internal.error.scsi.asc; 3413 *ascq = bdev_io->internal.error.scsi.ascq; 3414 break; 3415 default: 3416 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 3417 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 3418 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 3419 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 3420 break; 3421 } 3422 } 3423 3424 void 3425 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc) 3426 { 3427 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 3428 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3429 } else { 3430 bdev_io->internal.error.nvme.sct = sct; 3431 bdev_io->internal.error.nvme.sc = sc; 3432 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 3433 } 3434 3435 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 3436 } 3437 3438 void 3439 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc) 3440 { 3441 assert(sct != NULL); 3442 assert(sc != NULL); 3443 3444 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 3445 *sct = bdev_io->internal.error.nvme.sct; 3446 *sc = bdev_io->internal.error.nvme.sc; 3447 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 3448 *sct = SPDK_NVME_SCT_GENERIC; 3449 *sc = SPDK_NVME_SC_SUCCESS; 3450 } else { 3451 *sct = SPDK_NVME_SCT_GENERIC; 3452 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 3453 } 3454 } 3455 3456 struct spdk_thread * 3457 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 3458 { 3459 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 3460 } 3461 3462 struct spdk_io_channel * 3463 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 3464 { 3465 return bdev_io->internal.ch->channel; 3466 } 3467 3468 static void 3469 _spdk_bdev_qos_config_limit(struct spdk_bdev *bdev, uint64_t 
*limits)
{
	uint64_t min_qos_set;
	int i;

	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
			break;
		}
	}

	if (i == SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) {
		SPDK_ERRLOG("No rate limits specified.\n");
		return;
	}

	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
			continue;
		}

		if (_spdk_bdev_qos_is_iops_rate_limit(i) == true) {
			min_qos_set = SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
		} else {
			min_qos_set = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC;
		}

		if (limits[i] == 0 || limits[i] % min_qos_set) {
			SPDK_ERRLOG("Assigned limit %" PRIu64 " on bdev %s is not a multiple of %" PRIu64 "\n",
				    limits[i], bdev->name, min_qos_set);
			SPDK_ERRLOG("Failed to enable QoS on bdev %s\n", bdev->name);
			return;
		}
	}

	if (!bdev->internal.qos) {
		bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
		if (!bdev->internal.qos) {
			SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
			return;
		}
	}

	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		bdev->internal.qos->rate_limits[i].limit = limits[i];
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS type:%d set:%" PRIu64 "\n",
			      bdev->name, i, limits[i]);
	}

	return;
}

static void
_spdk_bdev_qos_config(struct spdk_bdev *bdev)
{
	struct spdk_conf_section *sp = NULL;
	const char *val = NULL;
	int i = 0, j = 0;
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {};
	bool config_qos = false;

	sp = spdk_conf_find_section(NULL, "QoS");
	if (!sp) {
		return;
	}

	while (j < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) {
		limits[j] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;

		i = 0;
		while (true) {
			val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 0);
			if (!val) {
				break;
			}

			if (strcmp(bdev->name, val) != 0) {
				i++;
				continue;
			}

			val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 1);
			if (val) {
				if (_spdk_bdev_qos_is_iops_rate_limit(j) == true) {
					limits[j] = strtoull(val, NULL, 10);
				} else {
					limits[j] = strtoull(val, NULL, 10) * 1024 * 1024;
				}
				config_qos = true;
			}

			break;
		}

		j++;
	}

	if (config_qos == true) {
		_spdk_bdev_qos_config_limit(bdev, limits);
	}

	return;
}

static int
spdk_bdev_init(struct spdk_bdev *bdev)
{
	char *bdev_name;

	assert(bdev->module != NULL);

	if (!bdev->name) {
		SPDK_ERRLOG("Bdev name is NULL\n");
		return -EINVAL;
	}

	if (spdk_bdev_get_by_name(bdev->name)) {
		SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name);
		return -EEXIST;
	}

	/* Users often register their own I/O devices using the bdev name. In
	 * order to avoid conflicts, prepend bdev_.
*/ 3592 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 3593 if (!bdev_name) { 3594 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 3595 return -ENOMEM; 3596 } 3597 3598 bdev->internal.status = SPDK_BDEV_STATUS_READY; 3599 bdev->internal.measured_queue_depth = UINT64_MAX; 3600 bdev->internal.claim_module = NULL; 3601 bdev->internal.qd_poller = NULL; 3602 bdev->internal.qos = NULL; 3603 3604 if (spdk_bdev_get_buf_align(bdev) > 1) { 3605 if (bdev->split_on_optimal_io_boundary) { 3606 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 3607 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 3608 } else { 3609 bdev->split_on_optimal_io_boundary = true; 3610 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 3611 } 3612 } 3613 3614 TAILQ_INIT(&bdev->internal.open_descs); 3615 3616 TAILQ_INIT(&bdev->aliases); 3617 3618 bdev->internal.reset_in_progress = NULL; 3619 3620 _spdk_bdev_qos_config(bdev); 3621 3622 spdk_io_device_register(__bdev_to_io_dev(bdev), 3623 spdk_bdev_channel_create, spdk_bdev_channel_destroy, 3624 sizeof(struct spdk_bdev_channel), 3625 bdev_name); 3626 3627 free(bdev_name); 3628 3629 pthread_mutex_init(&bdev->internal.mutex, NULL); 3630 return 0; 3631 } 3632 3633 static void 3634 spdk_bdev_destroy_cb(void *io_device) 3635 { 3636 int rc; 3637 struct spdk_bdev *bdev; 3638 spdk_bdev_unregister_cb cb_fn; 3639 void *cb_arg; 3640 3641 bdev = __bdev_from_io_dev(io_device); 3642 cb_fn = bdev->internal.unregister_cb; 3643 cb_arg = bdev->internal.unregister_ctx; 3644 3645 rc = bdev->fn_table->destruct(bdev->ctxt); 3646 if (rc < 0) { 3647 SPDK_ERRLOG("destruct failed\n"); 3648 } 3649 if (rc <= 0 && cb_fn != NULL) { 3650 cb_fn(cb_arg, rc); 3651 } 3652 } 3653 3654 3655 static void 3656 spdk_bdev_fini(struct spdk_bdev *bdev) 3657 { 3658 pthread_mutex_destroy(&bdev->internal.mutex); 3659 3660 free(bdev->internal.qos); 3661 3662 spdk_io_device_unregister(__bdev_to_io_dev(bdev), spdk_bdev_destroy_cb); 3663 } 3664 3665 static void 3666 spdk_bdev_start(struct spdk_bdev *bdev) 3667 { 3668 struct spdk_bdev_module *module; 3669 uint32_t action; 3670 3671 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name); 3672 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 3673 3674 /* Examine configuration before initializing I/O */ 3675 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 3676 if (module->examine_config) { 3677 action = module->internal.action_in_progress; 3678 module->internal.action_in_progress++; 3679 module->examine_config(bdev); 3680 if (action != module->internal.action_in_progress) { 3681 SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", 3682 module->name); 3683 } 3684 } 3685 } 3686 3687 if (bdev->internal.claim_module) { 3688 return; 3689 } 3690 3691 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 3692 if (module->examine_disk) { 3693 module->internal.action_in_progress++; 3694 module->examine_disk(bdev); 3695 } 3696 } 3697 } 3698 3699 int 3700 spdk_bdev_register(struct spdk_bdev *bdev) 3701 { 3702 int rc = spdk_bdev_init(bdev); 3703 3704 if (rc == 0) { 3705 spdk_bdev_start(bdev); 3706 } 3707 3708 return rc; 3709 } 3710 3711 int 3712 spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count) 3713 { 3714 int rc; 3715 3716 rc = spdk_bdev_init(vbdev); 3717 if (rc) { 3718 return rc; 3719 } 3720 3721 spdk_bdev_start(vbdev); 3722 return 0; 3723 } 3724 3725 void 3726 
spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 3727 { 3728 if (bdev->internal.unregister_cb != NULL) { 3729 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 3730 } 3731 } 3732 3733 static void 3734 _remove_notify(void *arg) 3735 { 3736 struct spdk_bdev_desc *desc = arg; 3737 3738 desc->remove_scheduled = false; 3739 3740 if (desc->closed) { 3741 free(desc); 3742 } else { 3743 desc->remove_cb(desc->remove_ctx); 3744 } 3745 } 3746 3747 void 3748 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 3749 { 3750 struct spdk_bdev_desc *desc, *tmp; 3751 bool do_destruct = true; 3752 struct spdk_thread *thread; 3753 3754 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name); 3755 3756 thread = spdk_get_thread(); 3757 if (!thread) { 3758 /* The user called this from a non-SPDK thread. */ 3759 if (cb_fn != NULL) { 3760 cb_fn(cb_arg, -ENOTSUP); 3761 } 3762 return; 3763 } 3764 3765 pthread_mutex_lock(&bdev->internal.mutex); 3766 3767 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 3768 bdev->internal.unregister_cb = cb_fn; 3769 bdev->internal.unregister_ctx = cb_arg; 3770 3771 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 3772 if (desc->remove_cb) { 3773 do_destruct = false; 3774 /* 3775 * Defer invocation of the remove_cb to a separate message that will 3776 * run later on its thread. This ensures this context unwinds and 3777 * we don't recursively unregister this bdev again if the remove_cb 3778 * immediately closes its descriptor. 3779 */ 3780 if (!desc->remove_scheduled) { 3781 /* Avoid scheduling removal of the same descriptor multiple times. */ 3782 desc->remove_scheduled = true; 3783 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 3784 } 3785 } 3786 } 3787 3788 if (!do_destruct) { 3789 pthread_mutex_unlock(&bdev->internal.mutex); 3790 return; 3791 } 3792 3793 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 3794 pthread_mutex_unlock(&bdev->internal.mutex); 3795 3796 spdk_bdev_fini(bdev); 3797 } 3798 3799 int 3800 spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb, 3801 void *remove_ctx, struct spdk_bdev_desc **_desc) 3802 { 3803 struct spdk_bdev_desc *desc; 3804 struct spdk_thread *thread; 3805 3806 thread = spdk_get_thread(); 3807 if (!thread) { 3808 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 3809 return -ENOTSUP; 3810 } 3811 3812 desc = calloc(1, sizeof(*desc)); 3813 if (desc == NULL) { 3814 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 3815 return -ENOMEM; 3816 } 3817 3818 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 3819 spdk_get_thread()); 3820 3821 desc->bdev = bdev; 3822 desc->thread = thread; 3823 desc->remove_cb = remove_cb; 3824 desc->remove_ctx = remove_ctx; 3825 desc->write = write; 3826 *_desc = desc; 3827 3828 pthread_mutex_lock(&bdev->internal.mutex); 3829 3830 if (write && bdev->internal.claim_module) { 3831 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 3832 bdev->name, bdev->internal.claim_module->name); 3833 pthread_mutex_unlock(&bdev->internal.mutex); 3834 free(desc); 3835 *_desc = NULL; 3836 return -EPERM; 3837 } 3838 3839 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 3840 3841 pthread_mutex_unlock(&bdev->internal.mutex); 3842 3843 return 0; 3844 } 3845 3846 void 3847 spdk_bdev_close(struct spdk_bdev_desc *desc) 3848 { 3849 struct spdk_bdev *bdev = desc->bdev; 3850 bool do_unregister = false; 
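/*
 * Illustrative sketch, not part of the upstream file: the descriptor
 * lifecycle around spdk_bdev_open()/spdk_bdev_close() implemented above.
 * The hot-remove callback is delivered on the thread that opened the
 * descriptor, and a typical reaction is simply to close that descriptor.
 * The names g_my_desc, my_remove_cb and my_open are hypothetical.
 *
 *	static struct spdk_bdev_desc *g_my_desc;
 *
 *	static void
 *	my_remove_cb(void *remove_ctx)
 *	{
 *		spdk_bdev_close(g_my_desc);
 *		g_my_desc = NULL;
 *	}
 *
 *	static int
 *	my_open(const char *name)
 *	{
 *		struct spdk_bdev *bdev = spdk_bdev_get_by_name(name);
 *
 *		if (bdev == NULL) {
 *			return -ENODEV;
 *		}
 *		return spdk_bdev_open(bdev, true, my_remove_cb, NULL, &g_my_desc);
 *	}
 */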

void
spdk_bdev_close(struct spdk_bdev_desc *desc)
{
	struct spdk_bdev *bdev = desc->bdev;
	bool do_unregister = false;

	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
		      spdk_get_thread());

	assert(desc->thread == spdk_get_thread());

	pthread_mutex_lock(&bdev->internal.mutex);

	TAILQ_REMOVE(&bdev->internal.open_descs, desc, link);

	desc->closed = true;

	if (!desc->remove_scheduled) {
		free(desc);
	}

	/* If no more descriptors, kill QoS channel */
	if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n",
			      bdev->name, spdk_get_thread());

		if (spdk_bdev_qos_destroy(bdev)) {
			/* There isn't anything we can do to recover here. Just let the
			 * old QoS poller keep running. The QoS handling won't change
			 * cores when the user allocates a new channel, but it won't break. */
			SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n");
		}
	}

	spdk_bdev_set_qd_sampling_period(bdev, 0);

	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) {
		do_unregister = true;
	}
	pthread_mutex_unlock(&bdev->internal.mutex);

	if (do_unregister == true) {
		spdk_bdev_unregister(bdev, bdev->internal.unregister_cb, bdev->internal.unregister_ctx);
	}
}

int
spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
			    struct spdk_bdev_module *module)
{
	if (bdev->internal.claim_module != NULL) {
		SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name,
			    bdev->internal.claim_module->name);
		return -EPERM;
	}

	if (desc && !desc->write) {
		desc->write = true;
	}

	bdev->internal.claim_module = module;
	return 0;
}

void
spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
{
	assert(bdev->internal.claim_module != NULL);
	bdev->internal.claim_module = NULL;
}

struct spdk_bdev *
spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
{
	return desc->bdev;
}
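
/*
 * Hedged usage sketch (not part of the library): a virtual bdev module taking a
 * claim on a base bdev and releasing it later. Only one module may hold a claim
 * at a time; spdk_bdev_module_claim_bdev() returns -EPERM otherwise. The helper
 * names and variables here are hypothetical.
 *
 *	static int
 *	claim_base(struct spdk_bdev *base, struct spdk_bdev_desc *desc,
 *		   struct spdk_bdev_module *my_module)
 *	{
 *		int rc;
 *
 *		rc = spdk_bdev_module_claim_bdev(base, desc, my_module);
 *		if (rc != 0) {
 *			return rc;	// already claimed by another module
 *		}
 *		// ... build the virtual bdev on top of 'base' ...
 *		return 0;
 *	}
 *
 *	static void
 *	release_base(struct spdk_bdev *base)
 *	{
 *		spdk_bdev_module_release_bdev(base);
 *	}
 */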
3973 */ 3974 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 3975 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 3976 } else { 3977 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 3978 } 3979 } 3980 3981 struct spdk_bdev_module * 3982 spdk_bdev_module_list_find(const char *name) 3983 { 3984 struct spdk_bdev_module *bdev_module; 3985 3986 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 3987 if (strcmp(name, bdev_module->name) == 0) { 3988 break; 3989 } 3990 } 3991 3992 return bdev_module; 3993 } 3994 3995 static void 3996 _spdk_bdev_write_zero_buffer_next(void *_bdev_io) 3997 { 3998 struct spdk_bdev_io *bdev_io = _bdev_io; 3999 uint64_t num_bytes, num_blocks; 4000 int rc; 4001 4002 num_bytes = spdk_min(spdk_bdev_get_block_size(bdev_io->bdev) * 4003 bdev_io->u.bdev.split_remaining_num_blocks, 4004 ZERO_BUFFER_SIZE); 4005 num_blocks = num_bytes / spdk_bdev_get_block_size(bdev_io->bdev); 4006 4007 rc = spdk_bdev_write_blocks(bdev_io->internal.desc, 4008 spdk_io_channel_from_ctx(bdev_io->internal.ch), 4009 g_bdev_mgr.zero_buffer, 4010 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 4011 _spdk_bdev_write_zero_buffer_done, bdev_io); 4012 if (rc == 0) { 4013 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 4014 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 4015 } else if (rc == -ENOMEM) { 4016 _spdk_bdev_queue_io_wait_with_cb(bdev_io, _spdk_bdev_write_zero_buffer_next); 4017 } else { 4018 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4019 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4020 } 4021 } 4022 4023 static void 4024 _spdk_bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4025 { 4026 struct spdk_bdev_io *parent_io = cb_arg; 4027 4028 spdk_bdev_free_io(bdev_io); 4029 4030 if (!success) { 4031 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4032 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4033 return; 4034 } 4035 4036 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 4037 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4038 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 4039 return; 4040 } 4041 4042 _spdk_bdev_write_zero_buffer_next(parent_io); 4043 } 4044 4045 struct set_qos_limit_ctx { 4046 void (*cb_fn)(void *cb_arg, int status); 4047 void *cb_arg; 4048 struct spdk_bdev *bdev; 4049 }; 4050 4051 static void 4052 _spdk_bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 4053 { 4054 pthread_mutex_lock(&ctx->bdev->internal.mutex); 4055 ctx->bdev->internal.qos_mod_in_progress = false; 4056 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 4057 4058 ctx->cb_fn(ctx->cb_arg, status); 4059 free(ctx); 4060 } 4061 4062 static void 4063 _spdk_bdev_disable_qos_done(void *cb_arg) 4064 { 4065 struct set_qos_limit_ctx *ctx = cb_arg; 4066 struct spdk_bdev *bdev = ctx->bdev; 4067 struct spdk_bdev_io *bdev_io; 4068 struct spdk_bdev_qos *qos; 4069 4070 pthread_mutex_lock(&bdev->internal.mutex); 4071 qos = bdev->internal.qos; 4072 bdev->internal.qos = NULL; 4073 pthread_mutex_unlock(&bdev->internal.mutex); 4074 4075 while (!TAILQ_EMPTY(&qos->queued)) { 4076 /* Send queued I/O back to their original thread for resubmission. 

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

static void
_spdk_bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
{
	pthread_mutex_lock(&ctx->bdev->internal.mutex);
	ctx->bdev->internal.qos_mod_in_progress = false;
	pthread_mutex_unlock(&ctx->bdev->internal.mutex);

	ctx->cb_fn(ctx->cb_arg, status);
	free(ctx);
}

static void
_spdk_bdev_disable_qos_done(void *cb_arg)
{
	struct set_qos_limit_ctx *ctx = cb_arg;
	struct spdk_bdev *bdev = ctx->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_qos *qos;

	pthread_mutex_lock(&bdev->internal.mutex);
	qos = bdev->internal.qos;
	bdev->internal.qos = NULL;
	pthread_mutex_unlock(&bdev->internal.mutex);

	while (!TAILQ_EMPTY(&qos->queued)) {
		/* Send queued I/O back to their original thread for resubmission. */
		bdev_io = TAILQ_FIRST(&qos->queued);
		TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);

		if (bdev_io->internal.io_submit_ch) {
			/*
			 * Channel was changed when sending it to the QoS thread - change it back
			 * before sending it back to the original thread.
			 */
			bdev_io->internal.ch = bdev_io->internal.io_submit_ch;
			bdev_io->internal.io_submit_ch = NULL;
		}

		spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->internal.ch->channel),
				     _spdk_bdev_io_submit, bdev_io);
	}

	if (qos->thread != NULL) {
		spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
		spdk_poller_unregister(&qos->poller);
	}

	free(qos);

	_spdk_bdev_set_qos_limit_done(ctx, 0);
}

static void
_spdk_bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status)
{
	void *io_device = spdk_io_channel_iter_get_io_device(i);
	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
	struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_thread *thread;

	pthread_mutex_lock(&bdev->internal.mutex);
	thread = bdev->internal.qos->thread;
	pthread_mutex_unlock(&bdev->internal.mutex);

	if (thread != NULL) {
		spdk_thread_send_msg(thread, _spdk_bdev_disable_qos_done, ctx);
	} else {
		_spdk_bdev_disable_qos_done(ctx);
	}
}

static void
_spdk_bdev_disable_qos_msg(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);

	bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED;

	spdk_for_each_channel_continue(i, 0);
}

static void
_spdk_bdev_update_qos_rate_limit_msg(void *cb_arg)
{
	struct set_qos_limit_ctx *ctx = cb_arg;
	struct spdk_bdev *bdev = ctx->bdev;

	pthread_mutex_lock(&bdev->internal.mutex);
	spdk_bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos);
	pthread_mutex_unlock(&bdev->internal.mutex);

	_spdk_bdev_set_qos_limit_done(ctx, 0);
}

static void
_spdk_bdev_enable_qos_msg(struct spdk_io_channel_iter *i)
{
	void *io_device = spdk_io_channel_iter_get_io_device(i);
	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);

	pthread_mutex_lock(&bdev->internal.mutex);
	_spdk_bdev_enable_qos(bdev, bdev_ch);
	pthread_mutex_unlock(&bdev->internal.mutex);
	spdk_for_each_channel_continue(i, 0);
}

static void
_spdk_bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status)
{
	struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	_spdk_bdev_set_qos_limit_done(ctx, status);
}

static void
_spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
{
	int i;

	assert(bdev->internal.qos != NULL);

	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
			bdev->internal.qos->rate_limits[i].limit = limits[i];

			if (limits[i] == 0) {
				bdev->internal.qos->rate_limits[i].limit =
					SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
			}
		}
	}
}
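
/*
 * Summary comment (added for clarity): spdk_bdev_set_qos_rate_limits() below
 * picks one of three paths under bdev->internal.mutex. Enabling QoS stores the
 * new limits and fans enable messages out to every channel with
 * spdk_for_each_channel(); updating sends a single message to the existing QoS
 * thread; disabling clears BDEV_CH_QOS_ENABLED on each channel and then tears
 * down the QoS object on its thread via _spdk_bdev_disable_qos_done().
 */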

void
spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits,
			      void (*cb_fn)(void *cb_arg, int status), void *cb_arg)
{
	struct set_qos_limit_ctx *ctx;
	uint32_t limit_set_complement;
	uint64_t min_limit_per_sec;
	int i;
	bool disable_rate_limit = true;

	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
			continue;
		}

		if (limits[i] > 0) {
			disable_rate_limit = false;
		}

		if (_spdk_bdev_qos_is_iops_rate_limit(i) == true) {
			min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
		} else {
			/* Change from megabyte to byte rate limit */
			limits[i] = limits[i] * 1024 * 1024;
			min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC;
		}

		limit_set_complement = limits[i] % min_limit_per_sec;
		if (limit_set_complement) {
			SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n",
				    limits[i], min_limit_per_sec);
			limits[i] += min_limit_per_sec - limit_set_complement;
			SPDK_ERRLOG("Rounding up the rate limit to %" PRIu64 "\n", limits[i]);
		}
	}

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->bdev = bdev;

	pthread_mutex_lock(&bdev->internal.mutex);
	if (bdev->internal.qos_mod_in_progress) {
		pthread_mutex_unlock(&bdev->internal.mutex);
		free(ctx);
		cb_fn(cb_arg, -EAGAIN);
		return;
	}
	bdev->internal.qos_mod_in_progress = true;

	if (disable_rate_limit == true && bdev->internal.qos) {
		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
			if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED &&
			    (bdev->internal.qos->rate_limits[i].limit > 0 &&
			     bdev->internal.qos->rate_limits[i].limit !=
			     SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) {
				disable_rate_limit = false;
				break;
			}
		}
	}

	if (disable_rate_limit == false) {
		if (bdev->internal.qos == NULL) {
			bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
			if (!bdev->internal.qos) {
				pthread_mutex_unlock(&bdev->internal.mutex);
				SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
				free(ctx);
				cb_fn(cb_arg, -ENOMEM);
				return;
			}
		}

		if (bdev->internal.qos->thread == NULL) {
			/* Enabling */
			_spdk_bdev_set_qos_rate_limits(bdev, limits);

			spdk_for_each_channel(__bdev_to_io_dev(bdev),
					      _spdk_bdev_enable_qos_msg, ctx,
					      _spdk_bdev_enable_qos_done);
		} else {
			/* Updating */
			_spdk_bdev_set_qos_rate_limits(bdev, limits);

			spdk_thread_send_msg(bdev->internal.qos->thread,
					     _spdk_bdev_update_qos_rate_limit_msg, ctx);
		}
	} else {
		if (bdev->internal.qos != NULL) {
			_spdk_bdev_set_qos_rate_limits(bdev, limits);

			/* Disabling */
			spdk_for_each_channel(__bdev_to_io_dev(bdev),
					      _spdk_bdev_disable_qos_msg, ctx,
					      _spdk_bdev_disable_qos_msg_done);
		} else {
			pthread_mutex_unlock(&bdev->internal.mutex);
			_spdk_bdev_set_qos_limit_done(ctx, 0);
			return;
		}
	}

	pthread_mutex_unlock(&bdev->internal.mutex);
}
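
/*
 * Hedged usage sketch (not part of the library): setting a 10k IOPS read/write
 * limit and a 100 MB/s aggregate bandwidth limit, leaving the per-direction
 * bandwidth limits untouched. Bandwidth values are passed in megabytes per
 * second and converted to bytes above; the array index order follows the
 * qos_rpc_type table at the top of this file. qos_done() is a hypothetical
 * callback.
 *
 *	static void
 *	qos_done(void *cb_arg, int status)
 *	{
 *		SPDK_NOTICELOG("QoS update completed: %d\n", status);
 *	}
 *
 *	static void
 *	set_limits(struct spdk_bdev *bdev)
 *	{
 *		uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {
 *			10000,					// rw_ios_per_sec
 *			100,					// rw_mbytes_per_sec
 *			SPDK_BDEV_QOS_LIMIT_NOT_DEFINED,	// r_mbytes_per_sec (unchanged)
 *			SPDK_BDEV_QOS_LIMIT_NOT_DEFINED,	// w_mbytes_per_sec (unchanged)
 *		};
 *
 *		spdk_bdev_set_qos_rate_limits(bdev, limits, qos_done, NULL);
 *	}
 */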

struct spdk_bdev_histogram_ctx {
	spdk_bdev_histogram_status_cb cb_fn;
	void *cb_arg;
	struct spdk_bdev *bdev;
	int status;
};

static void
_spdk_bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	pthread_mutex_lock(&ctx->bdev->internal.mutex);
	ctx->bdev->internal.histogram_in_progress = false;
	pthread_mutex_unlock(&ctx->bdev->internal.mutex);
	ctx->cb_fn(ctx->cb_arg, ctx->status);
	free(ctx);
}

static void
_spdk_bdev_histogram_disable_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);

	if (ch->histogram != NULL) {
		spdk_histogram_data_free(ch->histogram);
		ch->histogram = NULL;
	}
	spdk_for_each_channel_continue(i, 0);
}

static void
_spdk_bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	if (status != 0) {
		ctx->status = status;
		ctx->bdev->internal.histogram_enabled = false;
		spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), _spdk_bdev_histogram_disable_channel, ctx,
				      _spdk_bdev_histogram_disable_channel_cb);
	} else {
		pthread_mutex_lock(&ctx->bdev->internal.mutex);
		ctx->bdev->internal.histogram_in_progress = false;
		pthread_mutex_unlock(&ctx->bdev->internal.mutex);
		ctx->cb_fn(ctx->cb_arg, ctx->status);
		free(ctx);
	}
}

static void
_spdk_bdev_histogram_enable_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	int status = 0;

	if (ch->histogram == NULL) {
		ch->histogram = spdk_histogram_data_alloc();
		if (ch->histogram == NULL) {
			status = -ENOMEM;
		}
	}

	spdk_for_each_channel_continue(i, status);
}

void
spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn,
			   void *cb_arg, bool enable)
{
	struct spdk_bdev_histogram_ctx *ctx;

	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx));
	if (ctx == NULL) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->bdev = bdev;
	ctx->status = 0;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	pthread_mutex_lock(&bdev->internal.mutex);
	if (bdev->internal.histogram_in_progress) {
		pthread_mutex_unlock(&bdev->internal.mutex);
		free(ctx);
		cb_fn(cb_arg, -EAGAIN);
		return;
	}

	bdev->internal.histogram_in_progress = true;
	pthread_mutex_unlock(&bdev->internal.mutex);

	bdev->internal.histogram_enabled = enable;

	if (enable) {
		/* Allocate histogram for each channel */
		spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_histogram_enable_channel, ctx,
				      _spdk_bdev_histogram_enable_channel_cb);
	} else {
		spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_histogram_disable_channel, ctx,
				      _spdk_bdev_histogram_disable_channel_cb);
	}
}
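
/*
 * Hedged usage sketch (not part of the library): turning on per-channel
 * histograms and later collecting the merged result. histogram_status_cb() and
 * histogram_data_cb() are hypothetical callbacks; spdk_histogram_data_alloc()
 * comes from SPDK's histogram_data helpers, which this file already uses.
 *
 *	static void
 *	histogram_status_cb(void *cb_arg, int status)
 *	{
 *		// status == 0 once every channel has allocated its histogram.
 *	}
 *
 *	static void
 *	histogram_data_cb(void *cb_arg, int status, struct spdk_histogram_data *histogram)
 *	{
 *		// 'histogram' holds data merged from all channels; the caller owns it.
 *	}
 *
 *	static void
 *	collect(struct spdk_bdev *bdev)
 *	{
 *		struct spdk_histogram_data *h;
 *
 *		spdk_bdev_histogram_enable(bdev, histogram_status_cb, NULL, true);
 *		// ... run I/O for a while ...
 *		h = spdk_histogram_data_alloc();
 *		spdk_bdev_histogram_get(bdev, h, histogram_data_cb, NULL);
 *	}
 */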

struct spdk_bdev_histogram_data_ctx {
	spdk_bdev_histogram_data_cb cb_fn;
	void *cb_arg;
	struct spdk_bdev *bdev;
	/** merged histogram data from all channels */
	struct spdk_histogram_data *histogram;
};

static void
_spdk_bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	ctx->cb_fn(ctx->cb_arg, status, ctx->histogram);
	free(ctx);
}

static void
_spdk_bdev_histogram_get_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	int status = 0;

	if (ch->histogram == NULL) {
		status = -EFAULT;
	} else {
		spdk_histogram_data_merge(ctx->histogram, ch->histogram);
	}

	spdk_for_each_channel_continue(i, status);
}

void
spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram,
			spdk_bdev_histogram_data_cb cb_fn,
			void *cb_arg)
{
	struct spdk_bdev_histogram_data_ctx *ctx;

	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx));
	if (ctx == NULL) {
		cb_fn(cb_arg, -ENOMEM, NULL);
		return;
	}

	ctx->bdev = bdev;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	ctx->histogram = histogram;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_histogram_get_channel, ctx,
			      _spdk_bdev_histogram_get_channel_cb);
}

SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV)

SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
	spdk_trace_register_owner(OWNER_BDEV, 'b');
	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
	spdk_trace_register_description("BDEV_IO_START", "", TRACE_BDEV_IO_START, OWNER_BDEV,
					OBJECT_BDEV_IO, 1, 0, "type: ");
	spdk_trace_register_description("BDEV_IO_DONE", "", TRACE_BDEV_IO_DONE, OWNER_BDEV,
					OBJECT_BDEV_IO, 0, 0, "");
}