1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "spdk/stdinc.h" 35 36 #include "spdk/bdev.h" 37 #include "spdk/conf.h" 38 39 #include "spdk/config.h" 40 #include "spdk/env.h" 41 #include "spdk/event.h" 42 #include "spdk/thread.h" 43 #include "spdk/likely.h" 44 #include "spdk/queue.h" 45 #include "spdk/nvme_spec.h" 46 #include "spdk/scsi_spec.h" 47 #include "spdk/util.h" 48 #include "spdk/trace.h" 49 50 #include "spdk/bdev_module.h" 51 #include "spdk_internal/log.h" 52 #include "spdk/string.h" 53 54 #ifdef SPDK_CONFIG_VTUNE 55 #include "ittnotify.h" 56 #include "ittnotify_types.h" 57 int __itt_init_ittlib(const char *, __itt_group_id); 58 #endif 59 60 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024) 61 #define SPDK_BDEV_IO_CACHE_SIZE 256 62 #define BUF_SMALL_POOL_SIZE 8192 63 #define BUF_LARGE_POOL_SIZE 1024 64 #define NOMEM_THRESHOLD_COUNT 8 65 #define ZERO_BUFFER_SIZE 0x100000 66 67 #define OWNER_BDEV 0x2 68 69 #define OBJECT_BDEV_IO 0x2 70 71 #define TRACE_GROUP_BDEV 0x3 72 #define TRACE_BDEV_IO_START SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x0) 73 #define TRACE_BDEV_IO_DONE SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x1) 74 75 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 76 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 77 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 78 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 10000 79 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (10 * 1024 * 1024) 80 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 81 82 #define SPDK_BDEV_POOL_ALIGNMENT 512 83 84 static const char *qos_conf_type[] = {"Limit_IOPS", 85 "Limit_BPS", "Limit_Read_BPS", "Limit_Write_BPS" 86 }; 87 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 88 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 89 }; 90 91 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 92 93 struct spdk_bdev_mgr { 94 struct spdk_mempool *bdev_io_pool; 95 96 struct spdk_mempool *buf_small_pool; 97 struct spdk_mempool *buf_large_pool; 98 99 void *zero_buffer; 100 101 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 102 103 struct spdk_bdev_list bdevs; 104 105 bool init_complete; 106 bool module_init_complete; 107 108 #ifdef SPDK_CONFIG_VTUNE 109 __itt_domain *domain; 110 #endif 111 }; 112 113 static struct spdk_bdev_mgr g_bdev_mgr = { 114 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 115 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 116 .init_complete = false, 117 .module_init_complete = false, 118 }; 119 120 static struct spdk_bdev_opts g_bdev_opts = { 121 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 122 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 123 }; 124 125 static spdk_bdev_init_cb g_init_cb_fn = NULL; 126 static void *g_init_cb_arg = NULL; 127 128 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 129 static void *g_fini_cb_arg = NULL; 130 static struct spdk_thread *g_fini_thread = NULL; 131 132 struct spdk_bdev_qos_limit { 133 /** IOs or bytes allowed per second (i.e., 1s). */ 134 uint64_t limit; 135 136 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 137 * For remaining bytes, allowed to run negative if an I/O is submitted when 138 * some bytes are remaining, but the I/O is bigger than that amount. The 139 * excess will be deducted from the next timeslice. 140 */ 141 int64_t remaining_this_timeslice; 142 143 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 144 uint32_t min_per_timeslice; 145 146 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 147 uint32_t max_per_timeslice; 148 149 /** Function to check whether to queue the IO. */ 150 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 151 152 /** Function to update for the submitted IO. */ 153 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 154 }; 155 156 struct spdk_bdev_qos { 157 /** Types of structure of rate limits. */ 158 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 159 160 /** The channel that all I/O are funneled through. */ 161 struct spdk_bdev_channel *ch; 162 163 /** The thread on which the poller is running. */ 164 struct spdk_thread *thread; 165 166 /** Queue of I/O waiting to be issued. */ 167 bdev_io_tailq_t queued; 168 169 /** Size of a timeslice in tsc ticks. */ 170 uint64_t timeslice_size; 171 172 /** Timestamp of start of last timeslice. */ 173 uint64_t last_timeslice; 174 175 /** Poller that processes queued I/O commands each time slice. */ 176 struct spdk_poller *poller; 177 }; 178 179 struct spdk_bdev_mgmt_channel { 180 bdev_io_stailq_t need_buf_small; 181 bdev_io_stailq_t need_buf_large; 182 183 /* 184 * Each thread keeps a cache of bdev_io - this allows 185 * bdev threads which are *not* DPDK threads to still 186 * benefit from a per-thread bdev_io cache. Without 187 * this, non-DPDK threads fetching from the mempool 188 * incur a cmpxchg on get and put. 189 */ 190 bdev_io_stailq_t per_thread_cache; 191 uint32_t per_thread_cache_count; 192 uint32_t bdev_io_cache_size; 193 194 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 195 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 196 }; 197 198 /* 199 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 200 * will queue here their IO that awaits retry. It makes it possible to retry sending 201 * IO to one bdev after IO from other bdev completes. 202 */ 203 struct spdk_bdev_shared_resource { 204 /* The bdev management channel */ 205 struct spdk_bdev_mgmt_channel *mgmt_ch; 206 207 /* 208 * Count of I/O submitted to bdev module and waiting for completion. 209 * Incremented before submit_request() is called on an spdk_bdev_io. 210 */ 211 uint64_t io_outstanding; 212 213 /* 214 * Queue of IO awaiting retry because of a previous NOMEM status returned 215 * on this channel. 216 */ 217 bdev_io_tailq_t nomem_io; 218 219 /* 220 * Threshold which io_outstanding must drop to before retrying nomem_io. 221 */ 222 uint64_t nomem_threshold; 223 224 /* I/O channel allocated by a bdev module */ 225 struct spdk_io_channel *shared_ch; 226 227 /* Refcount of bdev channels using this resource */ 228 uint32_t ref; 229 230 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 231 }; 232 233 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 234 #define BDEV_CH_QOS_ENABLED (1 << 1) 235 236 struct spdk_bdev_channel { 237 struct spdk_bdev *bdev; 238 239 /* The channel for the underlying device */ 240 struct spdk_io_channel *channel; 241 242 /* Per io_device per thread data */ 243 struct spdk_bdev_shared_resource *shared_resource; 244 245 struct spdk_bdev_io_stat stat; 246 247 /* 248 * Count of I/O submitted through this channel and waiting for completion. 249 * Incremented before submit_request() is called on an spdk_bdev_io. 250 */ 251 uint64_t io_outstanding; 252 253 bdev_io_tailq_t queued_resets; 254 255 uint32_t flags; 256 257 struct spdk_histogram_data *histogram; 258 259 #ifdef SPDK_CONFIG_VTUNE 260 uint64_t start_tsc; 261 uint64_t interval_tsc; 262 __itt_string_handle *handle; 263 struct spdk_bdev_io_stat prev_stat; 264 #endif 265 266 }; 267 268 struct spdk_bdev_desc { 269 struct spdk_bdev *bdev; 270 struct spdk_thread *thread; 271 spdk_bdev_remove_cb_t remove_cb; 272 void *remove_ctx; 273 bool remove_scheduled; 274 bool closed; 275 bool write; 276 TAILQ_ENTRY(spdk_bdev_desc) link; 277 }; 278 279 struct spdk_bdev_iostat_ctx { 280 struct spdk_bdev_io_stat *stat; 281 spdk_bdev_get_device_stat_cb cb; 282 void *cb_arg; 283 }; 284 285 struct set_qos_limit_ctx { 286 void (*cb_fn)(void *cb_arg, int status); 287 void *cb_arg; 288 struct spdk_bdev *bdev; 289 }; 290 291 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 292 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 293 294 static void _spdk_bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, 295 void *cb_arg); 296 static void _spdk_bdev_write_zero_buffer_next(void *_bdev_io); 297 298 static void _spdk_bdev_enable_qos_msg(struct spdk_io_channel_iter *i); 299 static void _spdk_bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status); 300 301 void 302 spdk_bdev_get_opts(struct spdk_bdev_opts *opts) 303 { 304 *opts = g_bdev_opts; 305 } 306 307 int 308 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 309 { 310 uint32_t min_pool_size; 311 312 /* 313 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 314 * initialization. A second mgmt_ch will be created on the same thread when the application starts 315 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 316 */ 317 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 318 if (opts->bdev_io_pool_size < min_pool_size) { 319 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 320 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 321 spdk_thread_get_count()); 322 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 323 return -1; 324 } 325 326 g_bdev_opts = *opts; 327 return 0; 328 } 329 330 struct spdk_bdev * 331 spdk_bdev_first(void) 332 { 333 struct spdk_bdev *bdev; 334 335 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 336 if (bdev) { 337 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name); 338 } 339 340 return bdev; 341 } 342 343 struct spdk_bdev * 344 spdk_bdev_next(struct spdk_bdev *prev) 345 { 346 struct spdk_bdev *bdev; 347 348 bdev = TAILQ_NEXT(prev, internal.link); 349 if (bdev) { 350 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name); 351 } 352 353 return bdev; 354 } 355 356 static struct spdk_bdev * 357 _bdev_next_leaf(struct spdk_bdev *bdev) 358 { 359 while (bdev != NULL) { 360 if (bdev->internal.claim_module == NULL) { 361 return bdev; 362 } else { 363 bdev = TAILQ_NEXT(bdev, internal.link); 364 } 365 } 366 367 return bdev; 368 } 369 370 struct spdk_bdev * 371 spdk_bdev_first_leaf(void) 372 { 373 struct spdk_bdev *bdev; 374 375 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 376 377 if (bdev) { 378 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name); 379 } 380 381 return bdev; 382 } 383 384 struct spdk_bdev * 385 spdk_bdev_next_leaf(struct spdk_bdev *prev) 386 { 387 struct spdk_bdev *bdev; 388 389 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 390 391 if (bdev) { 392 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name); 393 } 394 395 return bdev; 396 } 397 398 struct spdk_bdev * 399 spdk_bdev_get_by_name(const char *bdev_name) 400 { 401 struct spdk_bdev_alias *tmp; 402 struct spdk_bdev *bdev = spdk_bdev_first(); 403 404 while (bdev != NULL) { 405 if (strcmp(bdev_name, bdev->name) == 0) { 406 return bdev; 407 } 408 409 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 410 if (strcmp(bdev_name, tmp->alias) == 0) { 411 return bdev; 412 } 413 } 414 415 bdev = spdk_bdev_next(bdev); 416 } 417 418 return NULL; 419 } 420 421 void 422 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 423 { 424 struct iovec *iovs; 425 426 iovs = bdev_io->u.bdev.iovs; 427 428 assert(iovs != NULL); 429 assert(bdev_io->u.bdev.iovcnt >= 1); 430 431 iovs[0].iov_base = buf; 432 iovs[0].iov_len = len; 433 } 434 435 static bool 436 _is_buf_allocated(struct iovec *iovs) 437 { 438 return iovs[0].iov_base != NULL; 439 } 440 441 static bool 442 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 443 { 444 int i; 445 uintptr_t iov_base; 446 447 if (spdk_likely(alignment == 1)) { 448 return true; 449 } 450 451 for (i = 0; i < iovcnt; i++) { 452 iov_base = (uintptr_t)iovs[i].iov_base; 453 if ((iov_base & (alignment - 1)) != 0) { 454 return false; 455 } 456 } 457 458 return true; 459 } 460 461 static void 462 _copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt) 463 { 464 int i; 465 size_t len; 466 467 for (i = 0; i < iovcnt; i++) { 468 len = spdk_min(iovs[i].iov_len, buf_len); 469 memcpy(buf, iovs[i].iov_base, len); 470 buf += len; 471 buf_len -= len; 472 } 473 } 474 475 static void 476 _copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len) 477 { 478 int i; 479 size_t len; 480 481 for (i = 0; i < iovcnt; i++) { 482 len = spdk_min(iovs[i].iov_len, buf_len); 483 memcpy(iovs[i].iov_base, buf, len); 484 buf += len; 485 buf_len -= len; 486 } 487 } 488 489 static void 490 _bdev_io_set_bounce_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 491 { 492 /* save original iovec */ 493 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 494 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 495 /* set bounce iov */ 496 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 497 bdev_io->u.bdev.iovcnt = 1; 498 /* set bounce buffer for this operation */ 499 bdev_io->u.bdev.iovs[0].iov_base = buf; 500 bdev_io->u.bdev.iovs[0].iov_len = len; 501 /* if this is write path, copy data from original buffer to bounce buffer */ 502 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 503 _copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 504 } 505 } 506 507 static void 508 spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 509 { 510 struct spdk_mempool *pool; 511 struct spdk_bdev_io *tmp; 512 void *buf, *aligned_buf; 513 bdev_io_stailq_t *stailq; 514 struct spdk_bdev_mgmt_channel *ch; 515 uint64_t buf_len; 516 uint64_t alignment; 517 bool buf_allocated; 518 519 buf = bdev_io->internal.buf; 520 buf_len = bdev_io->internal.buf_len; 521 alignment = spdk_bdev_get_buf_align(bdev_io->bdev); 522 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 523 524 bdev_io->internal.buf = NULL; 525 526 if (buf_len + alignment <= SPDK_BDEV_SMALL_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT) { 527 pool = g_bdev_mgr.buf_small_pool; 528 stailq = &ch->need_buf_small; 529 } else { 530 pool = g_bdev_mgr.buf_large_pool; 531 stailq = &ch->need_buf_large; 532 } 533 534 if (STAILQ_EMPTY(stailq)) { 535 spdk_mempool_put(pool, buf); 536 } else { 537 tmp = STAILQ_FIRST(stailq); 538 539 alignment = spdk_bdev_get_buf_align(tmp->bdev); 540 buf_allocated = _is_buf_allocated(tmp->u.bdev.iovs); 541 542 aligned_buf = (void *)(((uintptr_t)buf + 543 (alignment - 1)) & ~(alignment - 1)); 544 if (buf_allocated) { 545 _bdev_io_set_bounce_buf(tmp, aligned_buf, tmp->internal.buf_len); 546 } else { 547 spdk_bdev_io_set_buf(tmp, aligned_buf, tmp->internal.buf_len); 548 } 549 550 STAILQ_REMOVE_HEAD(stailq, internal.buf_link); 551 tmp->internal.buf = buf; 552 tmp->internal.get_buf_cb(tmp->internal.ch->channel, tmp); 553 } 554 } 555 556 static void 557 _bdev_io_unset_bounce_buf(struct spdk_bdev_io *bdev_io) 558 { 559 /* if this is read path, copy data from bounce buffer to original buffer */ 560 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 561 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 562 _copy_buf_to_iovs(bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt, 563 bdev_io->internal.bounce_iov.iov_base, bdev_io->internal.bounce_iov.iov_len); 564 } 565 /* set orignal buffer for this io */ 566 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 567 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 568 /* disable bouncing buffer for this io */ 569 bdev_io->internal.orig_iovcnt = 0; 570 bdev_io->internal.orig_iovs = NULL; 571 /* return bounce buffer to the pool */ 572 spdk_bdev_io_put_buf(bdev_io); 573 } 574 575 void 576 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 577 { 578 struct spdk_mempool *pool; 579 bdev_io_stailq_t *stailq; 580 void *buf, *aligned_buf; 581 struct spdk_bdev_mgmt_channel *mgmt_ch; 582 uint64_t alignment; 583 bool buf_allocated; 584 585 assert(cb != NULL); 586 assert(bdev_io->u.bdev.iovs != NULL); 587 588 alignment = spdk_bdev_get_buf_align(bdev_io->bdev); 589 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 590 591 if (buf_allocated && 592 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 593 /* Buffer already present and aligned */ 594 cb(bdev_io->internal.ch->channel, bdev_io); 595 return; 596 } 597 598 assert(len + alignment <= SPDK_BDEV_LARGE_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT); 599 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 600 601 bdev_io->internal.buf_len = len; 602 bdev_io->internal.get_buf_cb = cb; 603 604 if (len + alignment <= SPDK_BDEV_SMALL_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT) { 605 pool = g_bdev_mgr.buf_small_pool; 606 stailq = &mgmt_ch->need_buf_small; 607 } else { 608 pool = g_bdev_mgr.buf_large_pool; 609 stailq = &mgmt_ch->need_buf_large; 610 } 611 612 buf = spdk_mempool_get(pool); 613 614 if (!buf) { 615 STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link); 616 } else { 617 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 618 619 if (buf_allocated) { 620 _bdev_io_set_bounce_buf(bdev_io, aligned_buf, len); 621 } else { 622 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 623 } 624 bdev_io->internal.buf = buf; 625 bdev_io->internal.get_buf_cb(bdev_io->internal.ch->channel, bdev_io); 626 } 627 } 628 629 static int 630 spdk_bdev_module_get_max_ctx_size(void) 631 { 632 struct spdk_bdev_module *bdev_module; 633 int max_bdev_module_size = 0; 634 635 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 636 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 637 max_bdev_module_size = bdev_module->get_ctx_size(); 638 } 639 } 640 641 return max_bdev_module_size; 642 } 643 644 void 645 spdk_bdev_config_text(FILE *fp) 646 { 647 struct spdk_bdev_module *bdev_module; 648 649 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 650 if (bdev_module->config_text) { 651 bdev_module->config_text(fp); 652 } 653 } 654 } 655 656 static void 657 spdk_bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 658 { 659 int i; 660 struct spdk_bdev_qos *qos = bdev->internal.qos; 661 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 662 663 if (!qos) { 664 return; 665 } 666 667 spdk_bdev_get_qos_rate_limits(bdev, limits); 668 669 spdk_json_write_object_begin(w); 670 spdk_json_write_named_string(w, "method", "set_bdev_qos_limit"); 671 672 spdk_json_write_named_object_begin(w, "params"); 673 spdk_json_write_named_string(w, "name", bdev->name); 674 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 675 if (limits[i] > 0) { 676 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 677 } 678 } 679 spdk_json_write_object_end(w); 680 681 spdk_json_write_object_end(w); 682 } 683 684 void 685 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 686 { 687 struct spdk_bdev_module *bdev_module; 688 struct spdk_bdev *bdev; 689 690 assert(w != NULL); 691 692 spdk_json_write_array_begin(w); 693 694 spdk_json_write_object_begin(w); 695 spdk_json_write_named_string(w, "method", "set_bdev_options"); 696 spdk_json_write_named_object_begin(w, "params"); 697 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 698 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 699 spdk_json_write_object_end(w); 700 spdk_json_write_object_end(w); 701 702 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 703 if (bdev_module->config_json) { 704 bdev_module->config_json(w); 705 } 706 } 707 708 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 709 spdk_bdev_qos_config_json(bdev, w); 710 711 if (bdev->fn_table->write_config_json) { 712 bdev->fn_table->write_config_json(bdev, w); 713 } 714 } 715 716 spdk_json_write_array_end(w); 717 } 718 719 static int 720 spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 721 { 722 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 723 struct spdk_bdev_io *bdev_io; 724 uint32_t i; 725 726 STAILQ_INIT(&ch->need_buf_small); 727 STAILQ_INIT(&ch->need_buf_large); 728 729 STAILQ_INIT(&ch->per_thread_cache); 730 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 731 732 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 733 ch->per_thread_cache_count = 0; 734 for (i = 0; i < ch->bdev_io_cache_size; i++) { 735 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 736 assert(bdev_io != NULL); 737 ch->per_thread_cache_count++; 738 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 739 } 740 741 TAILQ_INIT(&ch->shared_resources); 742 TAILQ_INIT(&ch->io_wait_queue); 743 744 return 0; 745 } 746 747 static void 748 spdk_bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 749 { 750 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 751 struct spdk_bdev_io *bdev_io; 752 753 if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) { 754 SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n"); 755 } 756 757 if (!TAILQ_EMPTY(&ch->shared_resources)) { 758 SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n"); 759 } 760 761 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 762 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 763 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 764 ch->per_thread_cache_count--; 765 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 766 } 767 768 assert(ch->per_thread_cache_count == 0); 769 } 770 771 static void 772 spdk_bdev_init_complete(int rc) 773 { 774 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 775 void *cb_arg = g_init_cb_arg; 776 struct spdk_bdev_module *m; 777 778 g_bdev_mgr.init_complete = true; 779 g_init_cb_fn = NULL; 780 g_init_cb_arg = NULL; 781 782 /* 783 * For modules that need to know when subsystem init is complete, 784 * inform them now. 785 */ 786 if (rc == 0) { 787 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 788 if (m->init_complete) { 789 m->init_complete(); 790 } 791 } 792 } 793 794 cb_fn(cb_arg, rc); 795 } 796 797 static void 798 spdk_bdev_module_action_complete(void) 799 { 800 struct spdk_bdev_module *m; 801 802 /* 803 * Don't finish bdev subsystem initialization if 804 * module pre-initialization is still in progress, or 805 * the subsystem been already initialized. 806 */ 807 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 808 return; 809 } 810 811 /* 812 * Check all bdev modules for inits/examinations in progress. If any 813 * exist, return immediately since we cannot finish bdev subsystem 814 * initialization until all are completed. 815 */ 816 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 817 if (m->internal.action_in_progress > 0) { 818 return; 819 } 820 } 821 822 /* 823 * Modules already finished initialization - now that all 824 * the bdev modules have finished their asynchronous I/O 825 * processing, the entire bdev layer can be marked as complete. 826 */ 827 spdk_bdev_init_complete(0); 828 } 829 830 static void 831 spdk_bdev_module_action_done(struct spdk_bdev_module *module) 832 { 833 assert(module->internal.action_in_progress > 0); 834 module->internal.action_in_progress--; 835 spdk_bdev_module_action_complete(); 836 } 837 838 void 839 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 840 { 841 spdk_bdev_module_action_done(module); 842 } 843 844 void 845 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 846 { 847 spdk_bdev_module_action_done(module); 848 } 849 850 /** The last initialized bdev module */ 851 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 852 853 static int 854 spdk_bdev_modules_init(void) 855 { 856 struct spdk_bdev_module *module; 857 int rc = 0; 858 859 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 860 g_resume_bdev_module = module; 861 rc = module->module_init(); 862 if (rc != 0) { 863 return rc; 864 } 865 } 866 867 g_resume_bdev_module = NULL; 868 return 0; 869 } 870 871 872 static void 873 spdk_bdev_init_failed_complete(void *cb_arg) 874 { 875 spdk_bdev_init_complete(-1); 876 } 877 878 static void 879 spdk_bdev_init_failed(void *cb_arg) 880 { 881 spdk_bdev_finish(spdk_bdev_init_failed_complete, NULL); 882 } 883 884 void 885 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 886 { 887 struct spdk_conf_section *sp; 888 struct spdk_bdev_opts bdev_opts; 889 int32_t bdev_io_pool_size, bdev_io_cache_size; 890 int cache_size; 891 int rc = 0; 892 char mempool_name[32]; 893 894 assert(cb_fn != NULL); 895 896 sp = spdk_conf_find_section(NULL, "Bdev"); 897 if (sp != NULL) { 898 spdk_bdev_get_opts(&bdev_opts); 899 900 bdev_io_pool_size = spdk_conf_section_get_intval(sp, "BdevIoPoolSize"); 901 if (bdev_io_pool_size >= 0) { 902 bdev_opts.bdev_io_pool_size = bdev_io_pool_size; 903 } 904 905 bdev_io_cache_size = spdk_conf_section_get_intval(sp, "BdevIoCacheSize"); 906 if (bdev_io_cache_size >= 0) { 907 bdev_opts.bdev_io_cache_size = bdev_io_cache_size; 908 } 909 910 if (spdk_bdev_set_opts(&bdev_opts)) { 911 spdk_bdev_init_complete(-1); 912 return; 913 } 914 915 assert(memcmp(&bdev_opts, &g_bdev_opts, sizeof(bdev_opts)) == 0); 916 } 917 918 g_init_cb_fn = cb_fn; 919 g_init_cb_arg = cb_arg; 920 921 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 922 923 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 924 g_bdev_opts.bdev_io_pool_size, 925 sizeof(struct spdk_bdev_io) + 926 spdk_bdev_module_get_max_ctx_size(), 927 0, 928 SPDK_ENV_SOCKET_ID_ANY); 929 930 if (g_bdev_mgr.bdev_io_pool == NULL) { 931 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 932 spdk_bdev_init_complete(-1); 933 return; 934 } 935 936 /** 937 * Ensure no more than half of the total buffers end up local caches, by 938 * using spdk_thread_get_count() to determine how many local caches we need 939 * to account for. 940 */ 941 cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_thread_get_count()); 942 snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid()); 943 944 g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name, 945 BUF_SMALL_POOL_SIZE, 946 SPDK_BDEV_SMALL_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT, 947 cache_size, 948 SPDK_ENV_SOCKET_ID_ANY); 949 if (!g_bdev_mgr.buf_small_pool) { 950 SPDK_ERRLOG("create rbuf small pool failed\n"); 951 spdk_bdev_init_complete(-1); 952 return; 953 } 954 955 cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_thread_get_count()); 956 snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid()); 957 958 g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name, 959 BUF_LARGE_POOL_SIZE, 960 SPDK_BDEV_LARGE_BUF_MAX_SIZE + SPDK_BDEV_POOL_ALIGNMENT, 961 cache_size, 962 SPDK_ENV_SOCKET_ID_ANY); 963 if (!g_bdev_mgr.buf_large_pool) { 964 SPDK_ERRLOG("create rbuf large pool failed\n"); 965 spdk_bdev_init_complete(-1); 966 return; 967 } 968 969 g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 970 NULL); 971 if (!g_bdev_mgr.zero_buffer) { 972 SPDK_ERRLOG("create bdev zero buffer failed\n"); 973 spdk_bdev_init_complete(-1); 974 return; 975 } 976 977 #ifdef SPDK_CONFIG_VTUNE 978 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 979 #endif 980 981 spdk_io_device_register(&g_bdev_mgr, spdk_bdev_mgmt_channel_create, 982 spdk_bdev_mgmt_channel_destroy, 983 sizeof(struct spdk_bdev_mgmt_channel), 984 "bdev_mgr"); 985 986 rc = spdk_bdev_modules_init(); 987 g_bdev_mgr.module_init_complete = true; 988 if (rc != 0) { 989 SPDK_ERRLOG("bdev modules init failed\n"); 990 spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_init_failed, NULL); 991 return; 992 } 993 994 spdk_bdev_module_action_complete(); 995 } 996 997 static void 998 spdk_bdev_mgr_unregister_cb(void *io_device) 999 { 1000 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1001 1002 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1003 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1004 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1005 g_bdev_opts.bdev_io_pool_size); 1006 } 1007 1008 if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) { 1009 SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n", 1010 spdk_mempool_count(g_bdev_mgr.buf_small_pool), 1011 BUF_SMALL_POOL_SIZE); 1012 assert(false); 1013 } 1014 1015 if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) { 1016 SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n", 1017 spdk_mempool_count(g_bdev_mgr.buf_large_pool), 1018 BUF_LARGE_POOL_SIZE); 1019 assert(false); 1020 } 1021 1022 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1023 spdk_mempool_free(g_bdev_mgr.buf_small_pool); 1024 spdk_mempool_free(g_bdev_mgr.buf_large_pool); 1025 spdk_dma_free(g_bdev_mgr.zero_buffer); 1026 1027 cb_fn(g_fini_cb_arg); 1028 g_fini_cb_fn = NULL; 1029 g_fini_cb_arg = NULL; 1030 g_bdev_mgr.init_complete = false; 1031 g_bdev_mgr.module_init_complete = false; 1032 } 1033 1034 static void 1035 spdk_bdev_module_finish_iter(void *arg) 1036 { 1037 struct spdk_bdev_module *bdev_module; 1038 1039 /* Start iterating from the last touched module */ 1040 if (!g_resume_bdev_module) { 1041 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1042 } else { 1043 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 1044 internal.tailq); 1045 } 1046 1047 while (bdev_module) { 1048 if (bdev_module->async_fini) { 1049 /* Save our place so we can resume later. We must 1050 * save the variable here, before calling module_fini() 1051 * below, because in some cases the module may immediately 1052 * call spdk_bdev_module_finish_done() and re-enter 1053 * this function to continue iterating. */ 1054 g_resume_bdev_module = bdev_module; 1055 } 1056 1057 if (bdev_module->module_fini) { 1058 bdev_module->module_fini(); 1059 } 1060 1061 if (bdev_module->async_fini) { 1062 return; 1063 } 1064 1065 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 1066 internal.tailq); 1067 } 1068 1069 g_resume_bdev_module = NULL; 1070 spdk_io_device_unregister(&g_bdev_mgr, spdk_bdev_mgr_unregister_cb); 1071 } 1072 1073 void 1074 spdk_bdev_module_finish_done(void) 1075 { 1076 if (spdk_get_thread() != g_fini_thread) { 1077 spdk_thread_send_msg(g_fini_thread, spdk_bdev_module_finish_iter, NULL); 1078 } else { 1079 spdk_bdev_module_finish_iter(NULL); 1080 } 1081 } 1082 1083 static void 1084 _spdk_bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 1085 { 1086 struct spdk_bdev *bdev = cb_arg; 1087 1088 if (bdeverrno && bdev) { 1089 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 1090 bdev->name); 1091 1092 /* 1093 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 1094 * bdev; try to continue by manually removing this bdev from the list and continue 1095 * with the next bdev in the list. 1096 */ 1097 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 1098 } 1099 1100 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 1101 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n"); 1102 /* 1103 * Bdev module finish need to be deferred as we might be in the middle of some context 1104 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 1105 * after returning. 1106 */ 1107 spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_module_finish_iter, NULL); 1108 return; 1109 } 1110 1111 /* 1112 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 1113 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 1114 * to detect clean shutdown as opposed to run-time hot removal of the underlying 1115 * base bdevs. 1116 * 1117 * Also, walk the list in the reverse order. 1118 */ 1119 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1120 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1121 if (bdev->internal.claim_module != NULL) { 1122 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Skipping claimed bdev '%s'(<-'%s').\n", 1123 bdev->name, bdev->internal.claim_module->name); 1124 continue; 1125 } 1126 1127 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name); 1128 spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev); 1129 return; 1130 } 1131 1132 /* 1133 * If any bdev fails to unclaim underlying bdev properly, we may face the 1134 * case of bdev list consisting of claimed bdevs only (if claims are managed 1135 * correctly, this would mean there's a loop in the claims graph which is 1136 * clearly impossible). Warn and unregister last bdev on the list then. 1137 */ 1138 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1139 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1140 SPDK_ERRLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 1141 spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev); 1142 return; 1143 } 1144 } 1145 1146 void 1147 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1148 { 1149 struct spdk_bdev_module *m; 1150 1151 assert(cb_fn != NULL); 1152 1153 g_fini_thread = spdk_get_thread(); 1154 1155 g_fini_cb_fn = cb_fn; 1156 g_fini_cb_arg = cb_arg; 1157 1158 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1159 if (m->fini_start) { 1160 m->fini_start(); 1161 } 1162 } 1163 1164 _spdk_bdev_finish_unregister_bdevs_iter(NULL, 0); 1165 } 1166 1167 static struct spdk_bdev_io * 1168 spdk_bdev_get_io(struct spdk_bdev_channel *channel) 1169 { 1170 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 1171 struct spdk_bdev_io *bdev_io; 1172 1173 if (ch->per_thread_cache_count > 0) { 1174 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1175 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1176 ch->per_thread_cache_count--; 1177 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 1178 /* 1179 * Don't try to look for bdev_ios in the global pool if there are 1180 * waiters on bdev_ios - we don't want this caller to jump the line. 1181 */ 1182 bdev_io = NULL; 1183 } else { 1184 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1185 } 1186 1187 return bdev_io; 1188 } 1189 1190 void 1191 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 1192 { 1193 struct spdk_bdev_mgmt_channel *ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1194 1195 assert(bdev_io != NULL); 1196 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 1197 1198 if (bdev_io->internal.buf != NULL) { 1199 spdk_bdev_io_put_buf(bdev_io); 1200 } 1201 1202 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 1203 ch->per_thread_cache_count++; 1204 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1205 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 1206 struct spdk_bdev_io_wait_entry *entry; 1207 1208 entry = TAILQ_FIRST(&ch->io_wait_queue); 1209 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 1210 entry->cb_fn(entry->cb_arg); 1211 } 1212 } else { 1213 /* We should never have a full cache with entries on the io wait queue. */ 1214 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 1215 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1216 } 1217 } 1218 1219 static bool 1220 _spdk_bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 1221 { 1222 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 1223 1224 switch (limit) { 1225 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1226 return true; 1227 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1228 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1229 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1230 return false; 1231 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 1232 default: 1233 return false; 1234 } 1235 } 1236 1237 static bool 1238 _spdk_bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 1239 { 1240 switch (bdev_io->type) { 1241 case SPDK_BDEV_IO_TYPE_NVME_IO: 1242 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1243 case SPDK_BDEV_IO_TYPE_READ: 1244 case SPDK_BDEV_IO_TYPE_WRITE: 1245 return true; 1246 default: 1247 return false; 1248 } 1249 } 1250 1251 static bool 1252 _spdk_bdev_is_read_io(struct spdk_bdev_io *bdev_io) 1253 { 1254 switch (bdev_io->type) { 1255 case SPDK_BDEV_IO_TYPE_NVME_IO: 1256 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1257 /* Bit 1 (0x2) set for read operation */ 1258 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 1259 return true; 1260 } else { 1261 return false; 1262 } 1263 case SPDK_BDEV_IO_TYPE_READ: 1264 return true; 1265 default: 1266 return false; 1267 } 1268 } 1269 1270 static uint64_t 1271 _spdk_bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 1272 { 1273 struct spdk_bdev *bdev = bdev_io->bdev; 1274 1275 switch (bdev_io->type) { 1276 case SPDK_BDEV_IO_TYPE_NVME_IO: 1277 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1278 return bdev_io->u.nvme_passthru.nbytes; 1279 case SPDK_BDEV_IO_TYPE_READ: 1280 case SPDK_BDEV_IO_TYPE_WRITE: 1281 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 1282 default: 1283 return 0; 1284 } 1285 } 1286 1287 static bool 1288 _spdk_bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1289 { 1290 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 1291 return true; 1292 } else { 1293 return false; 1294 } 1295 } 1296 1297 static bool 1298 _spdk_bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1299 { 1300 if (_spdk_bdev_is_read_io(io) == false) { 1301 return false; 1302 } 1303 1304 return _spdk_bdev_qos_rw_queue_io(limit, io); 1305 } 1306 1307 static bool 1308 _spdk_bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1309 { 1310 if (_spdk_bdev_is_read_io(io) == true) { 1311 return false; 1312 } 1313 1314 return _spdk_bdev_qos_rw_queue_io(limit, io); 1315 } 1316 1317 static void 1318 _spdk_bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1319 { 1320 limit->remaining_this_timeslice--; 1321 } 1322 1323 static void 1324 _spdk_bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1325 { 1326 limit->remaining_this_timeslice -= _spdk_bdev_get_io_size_in_byte(io); 1327 } 1328 1329 static void 1330 _spdk_bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1331 { 1332 if (_spdk_bdev_is_read_io(io) == false) { 1333 return; 1334 } 1335 1336 return _spdk_bdev_qos_rw_bps_update_quota(limit, io); 1337 } 1338 1339 static void 1340 _spdk_bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 1341 { 1342 if (_spdk_bdev_is_read_io(io) == true) { 1343 return; 1344 } 1345 1346 return _spdk_bdev_qos_rw_bps_update_quota(limit, io); 1347 } 1348 1349 static void 1350 _spdk_bdev_qos_set_ops(struct spdk_bdev_qos *qos) 1351 { 1352 int i; 1353 1354 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1355 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 1356 qos->rate_limits[i].queue_io = NULL; 1357 qos->rate_limits[i].update_quota = NULL; 1358 continue; 1359 } 1360 1361 switch (i) { 1362 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1363 qos->rate_limits[i].queue_io = _spdk_bdev_qos_rw_queue_io; 1364 qos->rate_limits[i].update_quota = _spdk_bdev_qos_rw_iops_update_quota; 1365 break; 1366 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1367 qos->rate_limits[i].queue_io = _spdk_bdev_qos_rw_queue_io; 1368 qos->rate_limits[i].update_quota = _spdk_bdev_qos_rw_bps_update_quota; 1369 break; 1370 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1371 qos->rate_limits[i].queue_io = _spdk_bdev_qos_r_queue_io; 1372 qos->rate_limits[i].update_quota = _spdk_bdev_qos_r_bps_update_quota; 1373 break; 1374 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1375 qos->rate_limits[i].queue_io = _spdk_bdev_qos_w_queue_io; 1376 qos->rate_limits[i].update_quota = _spdk_bdev_qos_w_bps_update_quota; 1377 break; 1378 default: 1379 break; 1380 } 1381 } 1382 } 1383 1384 static int 1385 _spdk_bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 1386 { 1387 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 1388 struct spdk_bdev *bdev = ch->bdev; 1389 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 1390 int i, submitted_ios = 0; 1391 1392 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 1393 if (_spdk_bdev_qos_io_to_limit(bdev_io) == true) { 1394 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1395 if (!qos->rate_limits[i].queue_io) { 1396 continue; 1397 } 1398 1399 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 1400 bdev_io) == true) { 1401 return submitted_ios; 1402 } 1403 } 1404 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1405 if (!qos->rate_limits[i].update_quota) { 1406 continue; 1407 } 1408 1409 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 1410 } 1411 } 1412 1413 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 1414 ch->io_outstanding++; 1415 shared_resource->io_outstanding++; 1416 bdev_io->internal.in_submit_request = true; 1417 bdev->fn_table->submit_request(ch->channel, bdev_io); 1418 bdev_io->internal.in_submit_request = false; 1419 submitted_ios++; 1420 } 1421 1422 return submitted_ios; 1423 } 1424 1425 static void 1426 _spdk_bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 1427 { 1428 int rc; 1429 1430 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 1431 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 1432 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 1433 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 1434 &bdev_io->internal.waitq_entry); 1435 if (rc != 0) { 1436 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 1437 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1438 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 1439 } 1440 } 1441 1442 static bool 1443 _spdk_bdev_io_type_can_split(uint8_t type) 1444 { 1445 assert(type != SPDK_BDEV_IO_TYPE_INVALID); 1446 assert(type < SPDK_BDEV_NUM_IO_TYPES); 1447 1448 /* Only split READ and WRITE I/O. Theoretically other types of I/O like 1449 * UNMAP could be split, but these types of I/O are typically much larger 1450 * in size (sometimes the size of the entire block device), and the bdev 1451 * module can more efficiently split these types of I/O. Plus those types 1452 * of I/O do not have a payload, which makes the splitting process simpler. 1453 */ 1454 if (type == SPDK_BDEV_IO_TYPE_READ || type == SPDK_BDEV_IO_TYPE_WRITE) { 1455 return true; 1456 } else { 1457 return false; 1458 } 1459 } 1460 1461 static bool 1462 _spdk_bdev_io_should_split(struct spdk_bdev_io *bdev_io) 1463 { 1464 uint64_t start_stripe, end_stripe; 1465 uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary; 1466 1467 if (io_boundary == 0) { 1468 return false; 1469 } 1470 1471 if (!_spdk_bdev_io_type_can_split(bdev_io->type)) { 1472 return false; 1473 } 1474 1475 start_stripe = bdev_io->u.bdev.offset_blocks; 1476 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 1477 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 1478 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 1479 start_stripe >>= spdk_u32log2(io_boundary); 1480 end_stripe >>= spdk_u32log2(io_boundary); 1481 } else { 1482 start_stripe /= io_boundary; 1483 end_stripe /= io_boundary; 1484 } 1485 return (start_stripe != end_stripe); 1486 } 1487 1488 static uint32_t 1489 _to_next_boundary(uint64_t offset, uint32_t boundary) 1490 { 1491 return (boundary - (offset % boundary)); 1492 } 1493 1494 static void 1495 _spdk_bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 1496 1497 static void 1498 _spdk_bdev_io_split_with_payload(void *_bdev_io) 1499 { 1500 struct spdk_bdev_io *bdev_io = _bdev_io; 1501 uint64_t current_offset, remaining; 1502 uint32_t blocklen, to_next_boundary, to_next_boundary_bytes; 1503 struct iovec *parent_iov, *iov; 1504 uint64_t parent_iov_offset, iov_len; 1505 uint32_t parent_iovpos, parent_iovcnt, child_iovcnt, iovcnt; 1506 int rc; 1507 1508 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 1509 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 1510 blocklen = bdev_io->bdev->blocklen; 1511 parent_iov_offset = (current_offset - bdev_io->u.bdev.offset_blocks) * blocklen; 1512 parent_iovcnt = bdev_io->u.bdev.iovcnt; 1513 1514 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 1515 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 1516 if (parent_iov_offset < parent_iov->iov_len) { 1517 break; 1518 } 1519 parent_iov_offset -= parent_iov->iov_len; 1520 } 1521 1522 child_iovcnt = 0; 1523 while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { 1524 to_next_boundary = _to_next_boundary(current_offset, bdev_io->bdev->optimal_io_boundary); 1525 to_next_boundary = spdk_min(remaining, to_next_boundary); 1526 to_next_boundary_bytes = to_next_boundary * blocklen; 1527 iov = &bdev_io->child_iov[child_iovcnt]; 1528 iovcnt = 0; 1529 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 1530 child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { 1531 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 1532 iov_len = spdk_min(to_next_boundary_bytes, parent_iov->iov_len - parent_iov_offset); 1533 to_next_boundary_bytes -= iov_len; 1534 1535 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 1536 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 1537 1538 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 1539 parent_iov_offset += iov_len; 1540 } else { 1541 parent_iovpos++; 1542 parent_iov_offset = 0; 1543 } 1544 child_iovcnt++; 1545 iovcnt++; 1546 } 1547 1548 if (to_next_boundary_bytes > 0) { 1549 /* We had to stop this child I/O early because we ran out of 1550 * child_iov space. Make sure the iovs collected are valid and 1551 * then adjust to_next_boundary before starting the child I/O. 1552 */ 1553 if ((to_next_boundary_bytes % blocklen) != 0) { 1554 SPDK_ERRLOG("Remaining %" PRIu32 " is not multiple of block size %" PRIu32 "\n", 1555 to_next_boundary_bytes, blocklen); 1556 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1557 if (bdev_io->u.bdev.split_outstanding == 0) { 1558 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 1559 } 1560 return; 1561 } 1562 to_next_boundary -= to_next_boundary_bytes / blocklen; 1563 } 1564 1565 bdev_io->u.bdev.split_outstanding++; 1566 1567 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1568 rc = spdk_bdev_readv_blocks(bdev_io->internal.desc, 1569 spdk_io_channel_from_ctx(bdev_io->internal.ch), 1570 iov, iovcnt, current_offset, to_next_boundary, 1571 _spdk_bdev_io_split_done, bdev_io); 1572 } else { 1573 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 1574 spdk_io_channel_from_ctx(bdev_io->internal.ch), 1575 iov, iovcnt, current_offset, to_next_boundary, 1576 _spdk_bdev_io_split_done, bdev_io); 1577 } 1578 1579 if (rc == 0) { 1580 current_offset += to_next_boundary; 1581 remaining -= to_next_boundary; 1582 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 1583 bdev_io->u.bdev.split_remaining_num_blocks = remaining; 1584 } else { 1585 bdev_io->u.bdev.split_outstanding--; 1586 if (rc == -ENOMEM) { 1587 if (bdev_io->u.bdev.split_outstanding == 0) { 1588 /* No I/O is outstanding. Hence we should wait here. */ 1589 _spdk_bdev_queue_io_wait_with_cb(bdev_io, 1590 _spdk_bdev_io_split_with_payload); 1591 } 1592 } else { 1593 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1594 if (bdev_io->u.bdev.split_outstanding == 0) { 1595 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 1596 } 1597 } 1598 1599 return; 1600 } 1601 } 1602 } 1603 1604 static void 1605 _spdk_bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 1606 { 1607 struct spdk_bdev_io *parent_io = cb_arg; 1608 1609 spdk_bdev_free_io(bdev_io); 1610 1611 if (!success) { 1612 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1613 } 1614 parent_io->u.bdev.split_outstanding--; 1615 if (parent_io->u.bdev.split_outstanding != 0) { 1616 return; 1617 } 1618 1619 /* 1620 * Parent I/O finishes when all blocks are consumed or there is any failure of 1621 * child I/O and no outstanding child I/O. 1622 */ 1623 if (parent_io->u.bdev.split_remaining_num_blocks == 0 || 1624 parent_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS) { 1625 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 1626 parent_io->internal.caller_ctx); 1627 return; 1628 } 1629 1630 /* 1631 * Continue with the splitting process. This function will complete the parent I/O if the 1632 * splitting is done. 1633 */ 1634 _spdk_bdev_io_split_with_payload(parent_io); 1635 } 1636 1637 static void 1638 _spdk_bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 1639 { 1640 assert(_spdk_bdev_io_type_can_split(bdev_io->type)); 1641 1642 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 1643 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 1644 bdev_io->u.bdev.split_outstanding = 0; 1645 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 1646 1647 _spdk_bdev_io_split_with_payload(bdev_io); 1648 } 1649 1650 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 1651 * be inlined, at least on some compilers. 1652 */ 1653 static inline void 1654 _spdk_bdev_io_submit(void *ctx) 1655 { 1656 struct spdk_bdev_io *bdev_io = ctx; 1657 struct spdk_bdev *bdev = bdev_io->bdev; 1658 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1659 struct spdk_io_channel *ch = bdev_ch->channel; 1660 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1661 uint64_t tsc; 1662 1663 tsc = spdk_get_ticks(); 1664 bdev_io->internal.submit_tsc = tsc; 1665 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, bdev_io->type); 1666 bdev_ch->io_outstanding++; 1667 shared_resource->io_outstanding++; 1668 bdev_io->internal.in_submit_request = true; 1669 if (spdk_likely(bdev_ch->flags == 0)) { 1670 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 1671 bdev->fn_table->submit_request(ch, bdev_io); 1672 } else { 1673 bdev_ch->io_outstanding--; 1674 shared_resource->io_outstanding--; 1675 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 1676 } 1677 } else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 1678 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 1679 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 1680 bdev_ch->io_outstanding--; 1681 shared_resource->io_outstanding--; 1682 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 1683 _spdk_bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 1684 } else { 1685 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 1686 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 1687 } 1688 bdev_io->internal.in_submit_request = false; 1689 } 1690 1691 static void 1692 spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io) 1693 { 1694 struct spdk_bdev *bdev = bdev_io->bdev; 1695 struct spdk_thread *thread = spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 1696 1697 assert(thread != NULL); 1698 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 1699 1700 if (bdev->split_on_optimal_io_boundary && _spdk_bdev_io_should_split(bdev_io)) { 1701 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1702 spdk_bdev_io_get_buf(bdev_io, _spdk_bdev_io_split, 1703 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 1704 } else { 1705 _spdk_bdev_io_split(NULL, bdev_io); 1706 } 1707 return; 1708 } 1709 1710 if (bdev_io->internal.ch->flags & BDEV_CH_QOS_ENABLED) { 1711 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 1712 _spdk_bdev_io_submit(bdev_io); 1713 } else { 1714 bdev_io->internal.io_submit_ch = bdev_io->internal.ch; 1715 bdev_io->internal.ch = bdev->internal.qos->ch; 1716 spdk_thread_send_msg(bdev->internal.qos->thread, _spdk_bdev_io_submit, bdev_io); 1717 } 1718 } else { 1719 _spdk_bdev_io_submit(bdev_io); 1720 } 1721 } 1722 1723 static void 1724 spdk_bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 1725 { 1726 struct spdk_bdev *bdev = bdev_io->bdev; 1727 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1728 struct spdk_io_channel *ch = bdev_ch->channel; 1729 1730 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 1731 1732 bdev_io->internal.in_submit_request = true; 1733 bdev->fn_table->submit_request(ch, bdev_io); 1734 bdev_io->internal.in_submit_request = false; 1735 } 1736 1737 static void 1738 spdk_bdev_io_init(struct spdk_bdev_io *bdev_io, 1739 struct spdk_bdev *bdev, void *cb_arg, 1740 spdk_bdev_io_completion_cb cb) 1741 { 1742 bdev_io->bdev = bdev; 1743 bdev_io->internal.caller_ctx = cb_arg; 1744 bdev_io->internal.cb = cb; 1745 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1746 bdev_io->internal.in_submit_request = false; 1747 bdev_io->internal.buf = NULL; 1748 bdev_io->internal.io_submit_ch = NULL; 1749 bdev_io->internal.orig_iovs = NULL; 1750 bdev_io->internal.orig_iovcnt = 0; 1751 } 1752 1753 static bool 1754 _spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 1755 { 1756 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 1757 } 1758 1759 bool 1760 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 1761 { 1762 bool supported; 1763 1764 supported = _spdk_bdev_io_type_supported(bdev, io_type); 1765 1766 if (!supported) { 1767 switch (io_type) { 1768 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 1769 /* The bdev layer will emulate write zeroes as long as write is supported. */ 1770 supported = _spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 1771 break; 1772 default: 1773 break; 1774 } 1775 } 1776 1777 return supported; 1778 } 1779 1780 int 1781 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1782 { 1783 if (bdev->fn_table->dump_info_json) { 1784 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 1785 } 1786 1787 return 0; 1788 } 1789 1790 static void 1791 spdk_bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 1792 { 1793 uint32_t max_per_timeslice = 0; 1794 int i; 1795 1796 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1797 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 1798 qos->rate_limits[i].max_per_timeslice = 0; 1799 continue; 1800 } 1801 1802 max_per_timeslice = qos->rate_limits[i].limit * 1803 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 1804 1805 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 1806 qos->rate_limits[i].min_per_timeslice); 1807 1808 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 1809 } 1810 1811 _spdk_bdev_qos_set_ops(qos); 1812 } 1813 1814 static int 1815 spdk_bdev_channel_poll_qos(void *arg) 1816 { 1817 struct spdk_bdev_qos *qos = arg; 1818 uint64_t now = spdk_get_ticks(); 1819 int i; 1820 1821 if (now < (qos->last_timeslice + qos->timeslice_size)) { 1822 /* We received our callback earlier than expected - return 1823 * immediately and wait to do accounting until at least one 1824 * timeslice has actually expired. This should never happen 1825 * with a well-behaved timer implementation. 1826 */ 1827 return 0; 1828 } 1829 1830 /* Reset for next round of rate limiting */ 1831 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1832 /* We may have allowed the IOs or bytes to slightly overrun in the last 1833 * timeslice. remaining_this_timeslice is signed, so if it's negative 1834 * here, we'll account for the overrun so that the next timeslice will 1835 * be appropriately reduced. 1836 */ 1837 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 1838 qos->rate_limits[i].remaining_this_timeslice = 0; 1839 } 1840 } 1841 1842 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 1843 qos->last_timeslice += qos->timeslice_size; 1844 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1845 qos->rate_limits[i].remaining_this_timeslice += 1846 qos->rate_limits[i].max_per_timeslice; 1847 } 1848 } 1849 1850 return _spdk_bdev_qos_io_submit(qos->ch, qos); 1851 } 1852 1853 static void 1854 _spdk_bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 1855 { 1856 struct spdk_bdev_shared_resource *shared_resource; 1857 1858 spdk_put_io_channel(ch->channel); 1859 1860 shared_resource = ch->shared_resource; 1861 1862 assert(ch->io_outstanding == 0); 1863 assert(shared_resource->ref > 0); 1864 shared_resource->ref--; 1865 if (shared_resource->ref == 0) { 1866 assert(shared_resource->io_outstanding == 0); 1867 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 1868 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 1869 free(shared_resource); 1870 } 1871 } 1872 1873 /* Caller must hold bdev->internal.mutex. */ 1874 static void 1875 _spdk_bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 1876 { 1877 struct spdk_bdev_qos *qos = bdev->internal.qos; 1878 int i; 1879 1880 /* Rate limiting on this bdev enabled */ 1881 if (qos) { 1882 if (qos->ch == NULL) { 1883 struct spdk_io_channel *io_ch; 1884 1885 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 1886 bdev->name, spdk_get_thread()); 1887 1888 /* No qos channel has been selected, so set one up */ 1889 1890 /* Take another reference to ch */ 1891 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 1892 assert(io_ch != NULL); 1893 qos->ch = ch; 1894 1895 qos->thread = spdk_io_channel_get_thread(io_ch); 1896 1897 TAILQ_INIT(&qos->queued); 1898 1899 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1900 if (_spdk_bdev_qos_is_iops_rate_limit(i) == true) { 1901 qos->rate_limits[i].min_per_timeslice = 1902 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 1903 } else { 1904 qos->rate_limits[i].min_per_timeslice = 1905 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 1906 } 1907 1908 if (qos->rate_limits[i].limit == 0) { 1909 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 1910 } 1911 } 1912 spdk_bdev_qos_update_max_quota_per_timeslice(qos); 1913 qos->timeslice_size = 1914 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 1915 qos->last_timeslice = spdk_get_ticks(); 1916 qos->poller = spdk_poller_register(spdk_bdev_channel_poll_qos, 1917 qos, 1918 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 1919 } 1920 1921 ch->flags |= BDEV_CH_QOS_ENABLED; 1922 } 1923 } 1924 1925 static int 1926 spdk_bdev_channel_create(void *io_device, void *ctx_buf) 1927 { 1928 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 1929 struct spdk_bdev_channel *ch = ctx_buf; 1930 struct spdk_io_channel *mgmt_io_ch; 1931 struct spdk_bdev_mgmt_channel *mgmt_ch; 1932 struct spdk_bdev_shared_resource *shared_resource; 1933 1934 ch->bdev = bdev; 1935 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 1936 if (!ch->channel) { 1937 return -1; 1938 } 1939 1940 assert(ch->histogram == NULL); 1941 if (bdev->internal.histogram_enabled) { 1942 ch->histogram = spdk_histogram_data_alloc(); 1943 if (ch->histogram == NULL) { 1944 SPDK_ERRLOG("Could not allocate histogram\n"); 1945 } 1946 } 1947 1948 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 1949 if (!mgmt_io_ch) { 1950 spdk_put_io_channel(ch->channel); 1951 return -1; 1952 } 1953 1954 mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); 1955 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 1956 if (shared_resource->shared_ch == ch->channel) { 1957 spdk_put_io_channel(mgmt_io_ch); 1958 shared_resource->ref++; 1959 break; 1960 } 1961 } 1962 1963 if (shared_resource == NULL) { 1964 shared_resource = calloc(1, sizeof(*shared_resource)); 1965 if (shared_resource == NULL) { 1966 spdk_put_io_channel(ch->channel); 1967 spdk_put_io_channel(mgmt_io_ch); 1968 return -1; 1969 } 1970 1971 shared_resource->mgmt_ch = mgmt_ch; 1972 shared_resource->io_outstanding = 0; 1973 TAILQ_INIT(&shared_resource->nomem_io); 1974 shared_resource->nomem_threshold = 0; 1975 shared_resource->shared_ch = ch->channel; 1976 shared_resource->ref = 1; 1977 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 1978 } 1979 1980 memset(&ch->stat, 0, sizeof(ch->stat)); 1981 ch->stat.ticks_rate = spdk_get_ticks_hz(); 1982 ch->io_outstanding = 0; 1983 TAILQ_INIT(&ch->queued_resets); 1984 ch->flags = 0; 1985 ch->shared_resource = shared_resource; 1986 1987 #ifdef SPDK_CONFIG_VTUNE 1988 { 1989 char *name; 1990 __itt_init_ittlib(NULL, 0); 1991 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 1992 if (!name) { 1993 _spdk_bdev_channel_destroy_resource(ch); 1994 return -1; 1995 } 1996 ch->handle = __itt_string_handle_create(name); 1997 free(name); 1998 ch->start_tsc = spdk_get_ticks(); 1999 ch->interval_tsc = spdk_get_ticks_hz() / 100; 2000 memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); 2001 } 2002 #endif 2003 2004 pthread_mutex_lock(&bdev->internal.mutex); 2005 _spdk_bdev_enable_qos(bdev, ch); 2006 pthread_mutex_unlock(&bdev->internal.mutex); 2007 2008 return 0; 2009 } 2010 2011 /* 2012 * Abort I/O that are waiting on a data buffer. These types of I/O are 2013 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. 2014 */ 2015 static void 2016 _spdk_bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 2017 { 2018 bdev_io_stailq_t tmp; 2019 struct spdk_bdev_io *bdev_io; 2020 2021 STAILQ_INIT(&tmp); 2022 2023 while (!STAILQ_EMPTY(queue)) { 2024 bdev_io = STAILQ_FIRST(queue); 2025 STAILQ_REMOVE_HEAD(queue, internal.buf_link); 2026 if (bdev_io->internal.ch == ch) { 2027 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2028 } else { 2029 STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); 2030 } 2031 } 2032 2033 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 2034 } 2035 2036 /* 2037 * Abort I/O that are queued waiting for submission. These types of I/O are 2038 * linked using the spdk_bdev_io link TAILQ_ENTRY. 2039 */ 2040 static void 2041 _spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 2042 { 2043 struct spdk_bdev_io *bdev_io, *tmp; 2044 2045 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 2046 if (bdev_io->internal.ch == ch) { 2047 TAILQ_REMOVE(queue, bdev_io, internal.link); 2048 /* 2049 * spdk_bdev_io_complete() assumes that the completed I/O had 2050 * been submitted to the bdev module. Since in this case it 2051 * hadn't, bump io_outstanding to account for the decrement 2052 * that spdk_bdev_io_complete() will do. 2053 */ 2054 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 2055 ch->io_outstanding++; 2056 ch->shared_resource->io_outstanding++; 2057 } 2058 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2059 } 2060 } 2061 } 2062 2063 static void 2064 spdk_bdev_qos_channel_destroy(void *cb_arg) 2065 { 2066 struct spdk_bdev_qos *qos = cb_arg; 2067 2068 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 2069 spdk_poller_unregister(&qos->poller); 2070 2071 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Free QoS %p.\n", qos); 2072 2073 free(qos); 2074 } 2075 2076 static int 2077 spdk_bdev_qos_destroy(struct spdk_bdev *bdev) 2078 { 2079 int i; 2080 2081 /* 2082 * Cleanly shutting down the QoS poller is tricky, because 2083 * during the asynchronous operation the user could open 2084 * a new descriptor and create a new channel, spawning 2085 * a new QoS poller. 2086 * 2087 * The strategy is to create a new QoS structure here and swap it 2088 * in. The shutdown path then continues to refer to the old one 2089 * until it completes and then releases it. 2090 */ 2091 struct spdk_bdev_qos *new_qos, *old_qos; 2092 2093 old_qos = bdev->internal.qos; 2094 2095 new_qos = calloc(1, sizeof(*new_qos)); 2096 if (!new_qos) { 2097 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 2098 return -ENOMEM; 2099 } 2100 2101 /* Copy the old QoS data into the newly allocated structure */ 2102 memcpy(new_qos, old_qos, sizeof(*new_qos)); 2103 2104 /* Zero out the key parts of the QoS structure */ 2105 new_qos->ch = NULL; 2106 new_qos->thread = NULL; 2107 new_qos->poller = NULL; 2108 TAILQ_INIT(&new_qos->queued); 2109 /* 2110 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 2111 * It will be used later for the new QoS structure. 2112 */ 2113 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2114 new_qos->rate_limits[i].remaining_this_timeslice = 0; 2115 new_qos->rate_limits[i].min_per_timeslice = 0; 2116 new_qos->rate_limits[i].max_per_timeslice = 0; 2117 } 2118 2119 bdev->internal.qos = new_qos; 2120 2121 if (old_qos->thread == NULL) { 2122 free(old_qos); 2123 } else { 2124 spdk_thread_send_msg(old_qos->thread, spdk_bdev_qos_channel_destroy, 2125 old_qos); 2126 } 2127 2128 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 2129 * been destroyed yet. The destruction path will end up waiting for the final 2130 * channel to be put before it releases resources. */ 2131 2132 return 0; 2133 } 2134 2135 static void 2136 _spdk_bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 2137 { 2138 total->bytes_read += add->bytes_read; 2139 total->num_read_ops += add->num_read_ops; 2140 total->bytes_written += add->bytes_written; 2141 total->num_write_ops += add->num_write_ops; 2142 total->bytes_unmapped += add->bytes_unmapped; 2143 total->num_unmap_ops += add->num_unmap_ops; 2144 total->read_latency_ticks += add->read_latency_ticks; 2145 total->write_latency_ticks += add->write_latency_ticks; 2146 total->unmap_latency_ticks += add->unmap_latency_ticks; 2147 } 2148 2149 static void 2150 spdk_bdev_channel_destroy(void *io_device, void *ctx_buf) 2151 { 2152 struct spdk_bdev_channel *ch = ctx_buf; 2153 struct spdk_bdev_mgmt_channel *mgmt_ch; 2154 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 2155 2156 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 2157 spdk_get_thread()); 2158 2159 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 2160 pthread_mutex_lock(&ch->bdev->internal.mutex); 2161 _spdk_bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); 2162 pthread_mutex_unlock(&ch->bdev->internal.mutex); 2163 2164 mgmt_ch = shared_resource->mgmt_ch; 2165 2166 _spdk_bdev_abort_queued_io(&ch->queued_resets, ch); 2167 _spdk_bdev_abort_queued_io(&shared_resource->nomem_io, ch); 2168 _spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_small, ch); 2169 _spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_large, ch); 2170 2171 if (ch->histogram) { 2172 spdk_histogram_data_free(ch->histogram); 2173 } 2174 2175 _spdk_bdev_channel_destroy_resource(ch); 2176 } 2177 2178 int 2179 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 2180 { 2181 struct spdk_bdev_alias *tmp; 2182 2183 if (alias == NULL) { 2184 SPDK_ERRLOG("Empty alias passed\n"); 2185 return -EINVAL; 2186 } 2187 2188 if (spdk_bdev_get_by_name(alias)) { 2189 SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias); 2190 return -EEXIST; 2191 } 2192 2193 tmp = calloc(1, sizeof(*tmp)); 2194 if (tmp == NULL) { 2195 SPDK_ERRLOG("Unable to allocate alias\n"); 2196 return -ENOMEM; 2197 } 2198 2199 tmp->alias = strdup(alias); 2200 if (tmp->alias == NULL) { 2201 free(tmp); 2202 SPDK_ERRLOG("Unable to allocate alias\n"); 2203 return -ENOMEM; 2204 } 2205 2206 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 2207 2208 return 0; 2209 } 2210 2211 int 2212 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 2213 { 2214 struct spdk_bdev_alias *tmp; 2215 2216 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 2217 if (strcmp(alias, tmp->alias) == 0) { 2218 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 2219 free(tmp->alias); 2220 free(tmp); 2221 return 0; 2222 } 2223 } 2224 2225 SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exists\n", alias); 2226 2227 return -ENOENT; 2228 } 2229 2230 void 2231 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 2232 { 2233 struct spdk_bdev_alias *p, *tmp; 2234 2235 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 2236 TAILQ_REMOVE(&bdev->aliases, p, tailq); 2237 free(p->alias); 2238 free(p); 2239 } 2240 } 2241 2242 struct spdk_io_channel * 2243 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 2244 { 2245 return spdk_get_io_channel(__bdev_to_io_dev(desc->bdev)); 2246 } 2247 2248 const char * 2249 spdk_bdev_get_name(const struct spdk_bdev *bdev) 2250 { 2251 return bdev->name; 2252 } 2253 2254 const char * 2255 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 2256 { 2257 return bdev->product_name; 2258 } 2259 2260 const struct spdk_bdev_aliases_list * 2261 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 2262 { 2263 return &bdev->aliases; 2264 } 2265 2266 uint32_t 2267 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 2268 { 2269 return bdev->blocklen; 2270 } 2271 2272 uint64_t 2273 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 2274 { 2275 return bdev->blockcnt; 2276 } 2277 2278 const char * 2279 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 2280 { 2281 return qos_rpc_type[type]; 2282 } 2283 2284 void 2285 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 2286 { 2287 int i; 2288 2289 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2290 2291 pthread_mutex_lock(&bdev->internal.mutex); 2292 if (bdev->internal.qos) { 2293 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2294 if (bdev->internal.qos->rate_limits[i].limit != 2295 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2296 limits[i] = bdev->internal.qos->rate_limits[i].limit; 2297 if (_spdk_bdev_qos_is_iops_rate_limit(i) == false) { 2298 /* Change from Byte to Megabyte which is user visible. */ 2299 limits[i] = limits[i] / 1024 / 1024; 2300 } 2301 } 2302 } 2303 } 2304 pthread_mutex_unlock(&bdev->internal.mutex); 2305 } 2306 2307 size_t 2308 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 2309 { 2310 return 1 << bdev->required_alignment; 2311 } 2312 2313 uint32_t 2314 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 2315 { 2316 return bdev->optimal_io_boundary; 2317 } 2318 2319 bool 2320 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 2321 { 2322 return bdev->write_cache; 2323 } 2324 2325 const struct spdk_uuid * 2326 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 2327 { 2328 return &bdev->uuid; 2329 } 2330 2331 uint32_t 2332 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 2333 { 2334 return bdev->md_len; 2335 } 2336 2337 bool 2338 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 2339 { 2340 return (bdev->md_len != 0) && bdev->md_interleave; 2341 } 2342 2343 enum spdk_dif_type spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 2344 { 2345 if (bdev->md_len != 0) { 2346 return bdev->dif_type; 2347 } else { 2348 return SPDK_DIF_DISABLE; 2349 } 2350 } 2351 2352 bool 2353 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 2354 { 2355 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 2356 return bdev->dif_is_head_of_md; 2357 } else { 2358 return false; 2359 } 2360 } 2361 2362 bool 2363 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 2364 enum spdk_dif_check_type check_type) 2365 { 2366 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 2367 return false; 2368 } 2369 2370 switch (check_type) { 2371 case SPDK_DIF_CHECK_TYPE_REFTAG: 2372 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 2373 case SPDK_DIF_CHECK_TYPE_APPTAG: 2374 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 2375 case SPDK_DIF_CHECK_TYPE_GUARD: 2376 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 2377 default: 2378 return false; 2379 } 2380 } 2381 2382 uint64_t 2383 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 2384 { 2385 return bdev->internal.measured_queue_depth; 2386 } 2387 2388 uint64_t 2389 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 2390 { 2391 return bdev->internal.period; 2392 } 2393 2394 uint64_t 2395 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 2396 { 2397 return bdev->internal.weighted_io_time; 2398 } 2399 2400 uint64_t 2401 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 2402 { 2403 return bdev->internal.io_time; 2404 } 2405 2406 static void 2407 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) 2408 { 2409 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 2410 2411 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 2412 2413 if (bdev->internal.measured_queue_depth) { 2414 bdev->internal.io_time += bdev->internal.period; 2415 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 2416 } 2417 } 2418 2419 static void 2420 _calculate_measured_qd(struct spdk_io_channel_iter *i) 2421 { 2422 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 2423 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 2424 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); 2425 2426 bdev->internal.temporary_queue_depth += ch->io_outstanding; 2427 spdk_for_each_channel_continue(i, 0); 2428 } 2429 2430 static int 2431 spdk_bdev_calculate_measured_queue_depth(void *ctx) 2432 { 2433 struct spdk_bdev *bdev = ctx; 2434 bdev->internal.temporary_queue_depth = 0; 2435 spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, 2436 _calculate_measured_qd_cpl); 2437 return 0; 2438 } 2439 2440 void 2441 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 2442 { 2443 bdev->internal.period = period; 2444 2445 if (bdev->internal.qd_poller != NULL) { 2446 spdk_poller_unregister(&bdev->internal.qd_poller); 2447 bdev->internal.measured_queue_depth = UINT64_MAX; 2448 } 2449 2450 if (period != 0) { 2451 bdev->internal.qd_poller = spdk_poller_register(spdk_bdev_calculate_measured_queue_depth, bdev, 2452 period); 2453 } 2454 } 2455 2456 int 2457 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 2458 { 2459 int ret; 2460 2461 pthread_mutex_lock(&bdev->internal.mutex); 2462 2463 /* bdev has open descriptors */ 2464 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 2465 bdev->blockcnt > size) { 2466 ret = -EBUSY; 2467 } else { 2468 bdev->blockcnt = size; 2469 ret = 0; 2470 } 2471 2472 pthread_mutex_unlock(&bdev->internal.mutex); 2473 2474 return ret; 2475 } 2476 2477 /* 2478 * Convert I/O offset and length from bytes to blocks. 2479 * 2480 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 2481 */ 2482 static uint64_t 2483 spdk_bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 2484 uint64_t num_bytes, uint64_t *num_blocks) 2485 { 2486 uint32_t block_size = bdev->blocklen; 2487 uint8_t shift_cnt; 2488 2489 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2490 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 2491 shift_cnt = spdk_u32log2(block_size); 2492 *offset_blocks = offset_bytes >> shift_cnt; 2493 *num_blocks = num_bytes >> shift_cnt; 2494 return (offset_bytes - (*offset_blocks << shift_cnt)) | 2495 (num_bytes - (*num_blocks << shift_cnt)); 2496 } else { 2497 *offset_blocks = offset_bytes / block_size; 2498 *num_blocks = num_bytes / block_size; 2499 return (offset_bytes % block_size) | (num_bytes % block_size); 2500 } 2501 } 2502 2503 static bool 2504 spdk_bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 2505 { 2506 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 2507 * has been an overflow and hence the offset has been wrapped around */ 2508 if (offset_blocks + num_blocks < offset_blocks) { 2509 return false; 2510 } 2511 2512 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 2513 if (offset_blocks + num_blocks > bdev->blockcnt) { 2514 return false; 2515 } 2516 2517 return true; 2518 } 2519 2520 int 2521 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2522 void *buf, uint64_t offset, uint64_t nbytes, 2523 spdk_bdev_io_completion_cb cb, void *cb_arg) 2524 { 2525 uint64_t offset_blocks, num_blocks; 2526 2527 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 2528 return -EINVAL; 2529 } 2530 2531 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 2532 } 2533 2534 int 2535 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2536 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 2537 spdk_bdev_io_completion_cb cb, void *cb_arg) 2538 { 2539 struct spdk_bdev *bdev = desc->bdev; 2540 struct spdk_bdev_io *bdev_io; 2541 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2542 2543 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2544 return -EINVAL; 2545 } 2546 2547 bdev_io = spdk_bdev_get_io(channel); 2548 if (!bdev_io) { 2549 return -ENOMEM; 2550 } 2551 2552 bdev_io->internal.ch = channel; 2553 bdev_io->internal.desc = desc; 2554 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 2555 bdev_io->u.bdev.iovs = &bdev_io->iov; 2556 bdev_io->u.bdev.iovs[0].iov_base = buf; 2557 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 2558 bdev_io->u.bdev.iovcnt = 1; 2559 bdev_io->u.bdev.num_blocks = num_blocks; 2560 bdev_io->u.bdev.offset_blocks = offset_blocks; 2561 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2562 2563 spdk_bdev_io_submit(bdev_io); 2564 return 0; 2565 } 2566 2567 int 2568 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2569 struct iovec *iov, int iovcnt, 2570 uint64_t offset, uint64_t nbytes, 2571 spdk_bdev_io_completion_cb cb, void *cb_arg) 2572 { 2573 uint64_t offset_blocks, num_blocks; 2574 2575 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 2576 return -EINVAL; 2577 } 2578 2579 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 2580 } 2581 2582 int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2583 struct iovec *iov, int iovcnt, 2584 uint64_t offset_blocks, uint64_t num_blocks, 2585 spdk_bdev_io_completion_cb cb, void *cb_arg) 2586 { 2587 struct spdk_bdev *bdev = desc->bdev; 2588 struct spdk_bdev_io *bdev_io; 2589 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2590 2591 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2592 return -EINVAL; 2593 } 2594 2595 bdev_io = spdk_bdev_get_io(channel); 2596 if (!bdev_io) { 2597 return -ENOMEM; 2598 } 2599 2600 bdev_io->internal.ch = channel; 2601 bdev_io->internal.desc = desc; 2602 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 2603 bdev_io->u.bdev.iovs = iov; 2604 bdev_io->u.bdev.iovcnt = iovcnt; 2605 bdev_io->u.bdev.num_blocks = num_blocks; 2606 bdev_io->u.bdev.offset_blocks = offset_blocks; 2607 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2608 2609 spdk_bdev_io_submit(bdev_io); 2610 return 0; 2611 } 2612 2613 int 2614 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2615 void *buf, uint64_t offset, uint64_t nbytes, 2616 spdk_bdev_io_completion_cb cb, void *cb_arg) 2617 { 2618 uint64_t offset_blocks, num_blocks; 2619 2620 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 2621 return -EINVAL; 2622 } 2623 2624 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 2625 } 2626 2627 int 2628 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2629 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 2630 spdk_bdev_io_completion_cb cb, void *cb_arg) 2631 { 2632 struct spdk_bdev *bdev = desc->bdev; 2633 struct spdk_bdev_io *bdev_io; 2634 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2635 2636 if (!desc->write) { 2637 return -EBADF; 2638 } 2639 2640 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2641 return -EINVAL; 2642 } 2643 2644 bdev_io = spdk_bdev_get_io(channel); 2645 if (!bdev_io) { 2646 return -ENOMEM; 2647 } 2648 2649 bdev_io->internal.ch = channel; 2650 bdev_io->internal.desc = desc; 2651 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 2652 bdev_io->u.bdev.iovs = &bdev_io->iov; 2653 bdev_io->u.bdev.iovs[0].iov_base = buf; 2654 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 2655 bdev_io->u.bdev.iovcnt = 1; 2656 bdev_io->u.bdev.num_blocks = num_blocks; 2657 bdev_io->u.bdev.offset_blocks = offset_blocks; 2658 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2659 2660 spdk_bdev_io_submit(bdev_io); 2661 return 0; 2662 } 2663 2664 int 2665 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2666 struct iovec *iov, int iovcnt, 2667 uint64_t offset, uint64_t len, 2668 spdk_bdev_io_completion_cb cb, void *cb_arg) 2669 { 2670 uint64_t offset_blocks, num_blocks; 2671 2672 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) { 2673 return -EINVAL; 2674 } 2675 2676 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 2677 } 2678 2679 int 2680 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2681 struct iovec *iov, int iovcnt, 2682 uint64_t offset_blocks, uint64_t num_blocks, 2683 spdk_bdev_io_completion_cb cb, void *cb_arg) 2684 { 2685 struct spdk_bdev *bdev = desc->bdev; 2686 struct spdk_bdev_io *bdev_io; 2687 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2688 2689 if (!desc->write) { 2690 return -EBADF; 2691 } 2692 2693 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2694 return -EINVAL; 2695 } 2696 2697 bdev_io = spdk_bdev_get_io(channel); 2698 if (!bdev_io) { 2699 return -ENOMEM; 2700 } 2701 2702 bdev_io->internal.ch = channel; 2703 bdev_io->internal.desc = desc; 2704 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 2705 bdev_io->u.bdev.iovs = iov; 2706 bdev_io->u.bdev.iovcnt = iovcnt; 2707 bdev_io->u.bdev.num_blocks = num_blocks; 2708 bdev_io->u.bdev.offset_blocks = offset_blocks; 2709 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2710 2711 spdk_bdev_io_submit(bdev_io); 2712 return 0; 2713 } 2714 2715 int 2716 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2717 uint64_t offset, uint64_t len, 2718 spdk_bdev_io_completion_cb cb, void *cb_arg) 2719 { 2720 uint64_t offset_blocks, num_blocks; 2721 2722 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) { 2723 return -EINVAL; 2724 } 2725 2726 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 2727 } 2728 2729 int 2730 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2731 uint64_t offset_blocks, uint64_t num_blocks, 2732 spdk_bdev_io_completion_cb cb, void *cb_arg) 2733 { 2734 struct spdk_bdev *bdev = desc->bdev; 2735 struct spdk_bdev_io *bdev_io; 2736 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2737 2738 if (!desc->write) { 2739 return -EBADF; 2740 } 2741 2742 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2743 return -EINVAL; 2744 } 2745 2746 bdev_io = spdk_bdev_get_io(channel); 2747 2748 if (!bdev_io) { 2749 return -ENOMEM; 2750 } 2751 2752 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 2753 bdev_io->internal.ch = channel; 2754 bdev_io->internal.desc = desc; 2755 bdev_io->u.bdev.offset_blocks = offset_blocks; 2756 bdev_io->u.bdev.num_blocks = num_blocks; 2757 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2758 2759 if (_spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 2760 spdk_bdev_io_submit(bdev_io); 2761 return 0; 2762 } else if (_spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 2763 assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE); 2764 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 2765 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 2766 _spdk_bdev_write_zero_buffer_next(bdev_io); 2767 return 0; 2768 } else { 2769 spdk_bdev_free_io(bdev_io); 2770 return -ENOTSUP; 2771 } 2772 } 2773 2774 int 2775 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2776 uint64_t offset, uint64_t nbytes, 2777 spdk_bdev_io_completion_cb cb, void *cb_arg) 2778 { 2779 uint64_t offset_blocks, num_blocks; 2780 2781 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 2782 return -EINVAL; 2783 } 2784 2785 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 2786 } 2787 2788 int 2789 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2790 uint64_t offset_blocks, uint64_t num_blocks, 2791 spdk_bdev_io_completion_cb cb, void *cb_arg) 2792 { 2793 struct spdk_bdev *bdev = desc->bdev; 2794 struct spdk_bdev_io *bdev_io; 2795 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2796 2797 if (!desc->write) { 2798 return -EBADF; 2799 } 2800 2801 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2802 return -EINVAL; 2803 } 2804 2805 if (num_blocks == 0) { 2806 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 2807 return -EINVAL; 2808 } 2809 2810 bdev_io = spdk_bdev_get_io(channel); 2811 if (!bdev_io) { 2812 return -ENOMEM; 2813 } 2814 2815 bdev_io->internal.ch = channel; 2816 bdev_io->internal.desc = desc; 2817 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 2818 2819 bdev_io->u.bdev.iovs = &bdev_io->iov; 2820 bdev_io->u.bdev.iovs[0].iov_base = NULL; 2821 bdev_io->u.bdev.iovs[0].iov_len = 0; 2822 bdev_io->u.bdev.iovcnt = 1; 2823 2824 bdev_io->u.bdev.offset_blocks = offset_blocks; 2825 bdev_io->u.bdev.num_blocks = num_blocks; 2826 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2827 2828 spdk_bdev_io_submit(bdev_io); 2829 return 0; 2830 } 2831 2832 int 2833 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2834 uint64_t offset, uint64_t length, 2835 spdk_bdev_io_completion_cb cb, void *cb_arg) 2836 { 2837 uint64_t offset_blocks, num_blocks; 2838 2839 if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, length, &num_blocks) != 0) { 2840 return -EINVAL; 2841 } 2842 2843 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 2844 } 2845 2846 int 2847 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2848 uint64_t offset_blocks, uint64_t num_blocks, 2849 spdk_bdev_io_completion_cb cb, void *cb_arg) 2850 { 2851 struct spdk_bdev *bdev = desc->bdev; 2852 struct spdk_bdev_io *bdev_io; 2853 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2854 2855 if (!desc->write) { 2856 return -EBADF; 2857 } 2858 2859 if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 2860 return -EINVAL; 2861 } 2862 2863 bdev_io = spdk_bdev_get_io(channel); 2864 if (!bdev_io) { 2865 return -ENOMEM; 2866 } 2867 2868 bdev_io->internal.ch = channel; 2869 bdev_io->internal.desc = desc; 2870 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 2871 bdev_io->u.bdev.iovs = NULL; 2872 bdev_io->u.bdev.iovcnt = 0; 2873 bdev_io->u.bdev.offset_blocks = offset_blocks; 2874 bdev_io->u.bdev.num_blocks = num_blocks; 2875 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2876 2877 spdk_bdev_io_submit(bdev_io); 2878 return 0; 2879 } 2880 2881 static void 2882 _spdk_bdev_reset_dev(struct spdk_io_channel_iter *i, int status) 2883 { 2884 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 2885 struct spdk_bdev_io *bdev_io; 2886 2887 bdev_io = TAILQ_FIRST(&ch->queued_resets); 2888 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 2889 spdk_bdev_io_submit_reset(bdev_io); 2890 } 2891 2892 static void 2893 _spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 2894 { 2895 struct spdk_io_channel *ch; 2896 struct spdk_bdev_channel *channel; 2897 struct spdk_bdev_mgmt_channel *mgmt_channel; 2898 struct spdk_bdev_shared_resource *shared_resource; 2899 bdev_io_tailq_t tmp_queued; 2900 2901 TAILQ_INIT(&tmp_queued); 2902 2903 ch = spdk_io_channel_iter_get_channel(i); 2904 channel = spdk_io_channel_get_ctx(ch); 2905 shared_resource = channel->shared_resource; 2906 mgmt_channel = shared_resource->mgmt_ch; 2907 2908 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 2909 2910 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 2911 /* The QoS object is always valid and readable while 2912 * the channel flag is set, so the lock here should not 2913 * be necessary. We're not in the fast path though, so 2914 * just take it anyway. */ 2915 pthread_mutex_lock(&channel->bdev->internal.mutex); 2916 if (channel->bdev->internal.qos->ch == channel) { 2917 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 2918 } 2919 pthread_mutex_unlock(&channel->bdev->internal.mutex); 2920 } 2921 2922 _spdk_bdev_abort_queued_io(&shared_resource->nomem_io, channel); 2923 _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel); 2924 _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel); 2925 _spdk_bdev_abort_queued_io(&tmp_queued, channel); 2926 2927 spdk_for_each_channel_continue(i, 0); 2928 } 2929 2930 static void 2931 _spdk_bdev_start_reset(void *ctx) 2932 { 2933 struct spdk_bdev_channel *ch = ctx; 2934 2935 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), _spdk_bdev_reset_freeze_channel, 2936 ch, _spdk_bdev_reset_dev); 2937 } 2938 2939 static void 2940 _spdk_bdev_channel_start_reset(struct spdk_bdev_channel *ch) 2941 { 2942 struct spdk_bdev *bdev = ch->bdev; 2943 2944 assert(!TAILQ_EMPTY(&ch->queued_resets)); 2945 2946 pthread_mutex_lock(&bdev->internal.mutex); 2947 if (bdev->internal.reset_in_progress == NULL) { 2948 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 2949 /* 2950 * Take a channel reference for the target bdev for the life of this 2951 * reset. This guards against the channel getting destroyed while 2952 * spdk_for_each_channel() calls related to this reset IO are in 2953 * progress. We will release the reference when this reset is 2954 * completed. 2955 */ 2956 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 2957 _spdk_bdev_start_reset(ch); 2958 } 2959 pthread_mutex_unlock(&bdev->internal.mutex); 2960 } 2961 2962 int 2963 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 2964 spdk_bdev_io_completion_cb cb, void *cb_arg) 2965 { 2966 struct spdk_bdev *bdev = desc->bdev; 2967 struct spdk_bdev_io *bdev_io; 2968 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2969 2970 bdev_io = spdk_bdev_get_io(channel); 2971 if (!bdev_io) { 2972 return -ENOMEM; 2973 } 2974 2975 bdev_io->internal.ch = channel; 2976 bdev_io->internal.desc = desc; 2977 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 2978 bdev_io->u.reset.ch_ref = NULL; 2979 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 2980 2981 pthread_mutex_lock(&bdev->internal.mutex); 2982 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 2983 pthread_mutex_unlock(&bdev->internal.mutex); 2984 2985 _spdk_bdev_channel_start_reset(channel); 2986 2987 return 0; 2988 } 2989 2990 void 2991 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 2992 struct spdk_bdev_io_stat *stat) 2993 { 2994 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 2995 2996 *stat = channel->stat; 2997 } 2998 2999 static void 3000 _spdk_bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 3001 { 3002 void *io_device = spdk_io_channel_iter_get_io_device(i); 3003 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 3004 3005 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 3006 bdev_iostat_ctx->cb_arg, 0); 3007 free(bdev_iostat_ctx); 3008 } 3009 3010 static void 3011 _spdk_bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 3012 { 3013 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 3014 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 3015 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3016 3017 _spdk_bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); 3018 spdk_for_each_channel_continue(i, 0); 3019 } 3020 3021 void 3022 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 3023 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 3024 { 3025 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 3026 3027 assert(bdev != NULL); 3028 assert(stat != NULL); 3029 assert(cb != NULL); 3030 3031 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 3032 if (bdev_iostat_ctx == NULL) { 3033 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 3034 cb(bdev, stat, cb_arg, -ENOMEM); 3035 return; 3036 } 3037 3038 bdev_iostat_ctx->stat = stat; 3039 bdev_iostat_ctx->cb = cb; 3040 bdev_iostat_ctx->cb_arg = cb_arg; 3041 3042 /* Start with the statistics from previously deleted channels. */ 3043 pthread_mutex_lock(&bdev->internal.mutex); 3044 _spdk_bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); 3045 pthread_mutex_unlock(&bdev->internal.mutex); 3046 3047 /* Then iterate and add the statistics from each existing channel. */ 3048 spdk_for_each_channel(__bdev_to_io_dev(bdev), 3049 _spdk_bdev_get_each_channel_stat, 3050 bdev_iostat_ctx, 3051 _spdk_bdev_get_device_stat_done); 3052 } 3053 3054 int 3055 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3056 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 3057 spdk_bdev_io_completion_cb cb, void *cb_arg) 3058 { 3059 struct spdk_bdev *bdev = desc->bdev; 3060 struct spdk_bdev_io *bdev_io; 3061 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3062 3063 if (!desc->write) { 3064 return -EBADF; 3065 } 3066 3067 bdev_io = spdk_bdev_get_io(channel); 3068 if (!bdev_io) { 3069 return -ENOMEM; 3070 } 3071 3072 bdev_io->internal.ch = channel; 3073 bdev_io->internal.desc = desc; 3074 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 3075 bdev_io->u.nvme_passthru.cmd = *cmd; 3076 bdev_io->u.nvme_passthru.buf = buf; 3077 bdev_io->u.nvme_passthru.nbytes = nbytes; 3078 bdev_io->u.nvme_passthru.md_buf = NULL; 3079 bdev_io->u.nvme_passthru.md_len = 0; 3080 3081 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 3082 3083 spdk_bdev_io_submit(bdev_io); 3084 return 0; 3085 } 3086 3087 int 3088 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3089 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 3090 spdk_bdev_io_completion_cb cb, void *cb_arg) 3091 { 3092 struct spdk_bdev *bdev = desc->bdev; 3093 struct spdk_bdev_io *bdev_io; 3094 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3095 3096 if (!desc->write) { 3097 /* 3098 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 3099 * to easily determine if the command is a read or write, but for now just 3100 * do not allow io_passthru with a read-only descriptor. 3101 */ 3102 return -EBADF; 3103 } 3104 3105 bdev_io = spdk_bdev_get_io(channel); 3106 if (!bdev_io) { 3107 return -ENOMEM; 3108 } 3109 3110 bdev_io->internal.ch = channel; 3111 bdev_io->internal.desc = desc; 3112 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 3113 bdev_io->u.nvme_passthru.cmd = *cmd; 3114 bdev_io->u.nvme_passthru.buf = buf; 3115 bdev_io->u.nvme_passthru.nbytes = nbytes; 3116 bdev_io->u.nvme_passthru.md_buf = NULL; 3117 bdev_io->u.nvme_passthru.md_len = 0; 3118 3119 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 3120 3121 spdk_bdev_io_submit(bdev_io); 3122 return 0; 3123 } 3124 3125 int 3126 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3127 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 3128 spdk_bdev_io_completion_cb cb, void *cb_arg) 3129 { 3130 struct spdk_bdev *bdev = desc->bdev; 3131 struct spdk_bdev_io *bdev_io; 3132 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3133 3134 if (!desc->write) { 3135 /* 3136 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 3137 * to easily determine if the command is a read or write, but for now just 3138 * do not allow io_passthru with a read-only descriptor. 3139 */ 3140 return -EBADF; 3141 } 3142 3143 bdev_io = spdk_bdev_get_io(channel); 3144 if (!bdev_io) { 3145 return -ENOMEM; 3146 } 3147 3148 bdev_io->internal.ch = channel; 3149 bdev_io->internal.desc = desc; 3150 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 3151 bdev_io->u.nvme_passthru.cmd = *cmd; 3152 bdev_io->u.nvme_passthru.buf = buf; 3153 bdev_io->u.nvme_passthru.nbytes = nbytes; 3154 bdev_io->u.nvme_passthru.md_buf = md_buf; 3155 bdev_io->u.nvme_passthru.md_len = md_len; 3156 3157 spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); 3158 3159 spdk_bdev_io_submit(bdev_io); 3160 return 0; 3161 } 3162 3163 int 3164 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 3165 struct spdk_bdev_io_wait_entry *entry) 3166 { 3167 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3168 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 3169 3170 if (bdev != entry->bdev) { 3171 SPDK_ERRLOG("bdevs do not match\n"); 3172 return -EINVAL; 3173 } 3174 3175 if (mgmt_ch->per_thread_cache_count > 0) { 3176 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 3177 return -EINVAL; 3178 } 3179 3180 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 3181 return 0; 3182 } 3183 3184 static void 3185 _spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 3186 { 3187 struct spdk_bdev *bdev = bdev_ch->bdev; 3188 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 3189 struct spdk_bdev_io *bdev_io; 3190 3191 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 3192 /* 3193 * Allow some more I/O to complete before retrying the nomem_io queue. 3194 * Some drivers (such as nvme) cannot immediately take a new I/O in 3195 * the context of a completion, because the resources for the I/O are 3196 * not released until control returns to the bdev poller. Also, we 3197 * may require several small I/O to complete before a larger I/O 3198 * (that requires splitting) can be submitted. 3199 */ 3200 return; 3201 } 3202 3203 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 3204 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 3205 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 3206 bdev_io->internal.ch->io_outstanding++; 3207 shared_resource->io_outstanding++; 3208 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3209 bdev->fn_table->submit_request(bdev_io->internal.ch->channel, bdev_io); 3210 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 3211 break; 3212 } 3213 } 3214 } 3215 3216 static inline void 3217 _spdk_bdev_io_complete(void *ctx) 3218 { 3219 struct spdk_bdev_io *bdev_io = ctx; 3220 uint64_t tsc, tsc_diff; 3221 3222 if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { 3223 /* 3224 * Send the completion to the thread that originally submitted the I/O, 3225 * which may not be the current thread in the case of QoS. 3226 */ 3227 if (bdev_io->internal.io_submit_ch) { 3228 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 3229 bdev_io->internal.io_submit_ch = NULL; 3230 } 3231 3232 /* 3233 * Defer completion to avoid potential infinite recursion if the 3234 * user's completion callback issues a new I/O. 3235 */ 3236 spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->internal.ch->channel), 3237 _spdk_bdev_io_complete, bdev_io); 3238 return; 3239 } 3240 3241 tsc = spdk_get_ticks(); 3242 tsc_diff = tsc - bdev_io->internal.submit_tsc; 3243 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 0); 3244 3245 if (bdev_io->internal.ch->histogram) { 3246 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 3247 } 3248 3249 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 3250 switch (bdev_io->type) { 3251 case SPDK_BDEV_IO_TYPE_READ: 3252 bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 3253 bdev_io->internal.ch->stat.num_read_ops++; 3254 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 3255 break; 3256 case SPDK_BDEV_IO_TYPE_WRITE: 3257 bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 3258 bdev_io->internal.ch->stat.num_write_ops++; 3259 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 3260 break; 3261 case SPDK_BDEV_IO_TYPE_UNMAP: 3262 bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 3263 bdev_io->internal.ch->stat.num_unmap_ops++; 3264 bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff; 3265 default: 3266 break; 3267 } 3268 } 3269 3270 #ifdef SPDK_CONFIG_VTUNE 3271 uint64_t now_tsc = spdk_get_ticks(); 3272 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 3273 uint64_t data[5]; 3274 3275 data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; 3276 data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; 3277 data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; 3278 data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; 3279 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 3280 bdev_io->bdev->fn_table->get_spin_time(bdev_io->internal.ch->channel) : 0; 3281 3282 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 3283 __itt_metadata_u64, 5, data); 3284 3285 bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; 3286 bdev_io->internal.ch->start_tsc = now_tsc; 3287 } 3288 #endif 3289 3290 assert(bdev_io->internal.cb != NULL); 3291 assert(spdk_get_thread() == spdk_io_channel_get_thread(bdev_io->internal.ch->channel)); 3292 3293 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3294 bdev_io->internal.caller_ctx); 3295 } 3296 3297 static void 3298 _spdk_bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 3299 { 3300 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 3301 3302 if (bdev_io->u.reset.ch_ref != NULL) { 3303 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 3304 bdev_io->u.reset.ch_ref = NULL; 3305 } 3306 3307 _spdk_bdev_io_complete(bdev_io); 3308 } 3309 3310 static void 3311 _spdk_bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 3312 { 3313 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 3314 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 3315 3316 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 3317 if (!TAILQ_EMPTY(&ch->queued_resets)) { 3318 _spdk_bdev_channel_start_reset(ch); 3319 } 3320 3321 spdk_for_each_channel_continue(i, 0); 3322 } 3323 3324 void 3325 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 3326 { 3327 struct spdk_bdev *bdev = bdev_io->bdev; 3328 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3329 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 3330 3331 bdev_io->internal.status = status; 3332 3333 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 3334 bool unlock_channels = false; 3335 3336 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 3337 SPDK_ERRLOG("NOMEM returned for reset\n"); 3338 } 3339 pthread_mutex_lock(&bdev->internal.mutex); 3340 if (bdev_io == bdev->internal.reset_in_progress) { 3341 bdev->internal.reset_in_progress = NULL; 3342 unlock_channels = true; 3343 } 3344 pthread_mutex_unlock(&bdev->internal.mutex); 3345 3346 if (unlock_channels) { 3347 spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_unfreeze_channel, 3348 bdev_io, _spdk_bdev_reset_complete); 3349 return; 3350 } 3351 } else { 3352 if (spdk_unlikely(bdev_io->internal.orig_iovcnt > 0)) { 3353 _bdev_io_unset_bounce_buf(bdev_io); 3354 } 3355 3356 assert(bdev_ch->io_outstanding > 0); 3357 assert(shared_resource->io_outstanding > 0); 3358 bdev_ch->io_outstanding--; 3359 shared_resource->io_outstanding--; 3360 3361 if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) { 3362 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 3363 /* 3364 * Wait for some of the outstanding I/O to complete before we 3365 * retry any of the nomem_io. Normally we will wait for 3366 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 3367 * depth channels we will instead wait for half to complete. 3368 */ 3369 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 3370 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 3371 return; 3372 } 3373 3374 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 3375 _spdk_bdev_ch_retry_io(bdev_ch); 3376 } 3377 } 3378 3379 _spdk_bdev_io_complete(bdev_io); 3380 } 3381 3382 void 3383 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 3384 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 3385 { 3386 if (sc == SPDK_SCSI_STATUS_GOOD) { 3387 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3388 } else { 3389 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 3390 bdev_io->internal.error.scsi.sc = sc; 3391 bdev_io->internal.error.scsi.sk = sk; 3392 bdev_io->internal.error.scsi.asc = asc; 3393 bdev_io->internal.error.scsi.ascq = ascq; 3394 } 3395 3396 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 3397 } 3398 3399 void 3400 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 3401 int *sc, int *sk, int *asc, int *ascq) 3402 { 3403 assert(sc != NULL); 3404 assert(sk != NULL); 3405 assert(asc != NULL); 3406 assert(ascq != NULL); 3407 3408 switch (bdev_io->internal.status) { 3409 case SPDK_BDEV_IO_STATUS_SUCCESS: 3410 *sc = SPDK_SCSI_STATUS_GOOD; 3411 *sk = SPDK_SCSI_SENSE_NO_SENSE; 3412 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 3413 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 3414 break; 3415 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 3416 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 3417 break; 3418 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 3419 *sc = bdev_io->internal.error.scsi.sc; 3420 *sk = bdev_io->internal.error.scsi.sk; 3421 *asc = bdev_io->internal.error.scsi.asc; 3422 *ascq = bdev_io->internal.error.scsi.ascq; 3423 break; 3424 default: 3425 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 3426 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 3427 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 3428 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 3429 break; 3430 } 3431 } 3432 3433 void 3434 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc) 3435 { 3436 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 3437 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3438 } else { 3439 bdev_io->internal.error.nvme.sct = sct; 3440 bdev_io->internal.error.nvme.sc = sc; 3441 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 3442 } 3443 3444 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 3445 } 3446 3447 void 3448 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc) 3449 { 3450 assert(sct != NULL); 3451 assert(sc != NULL); 3452 3453 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 3454 *sct = bdev_io->internal.error.nvme.sct; 3455 *sc = bdev_io->internal.error.nvme.sc; 3456 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 3457 *sct = SPDK_NVME_SCT_GENERIC; 3458 *sc = SPDK_NVME_SC_SUCCESS; 3459 } else { 3460 *sct = SPDK_NVME_SCT_GENERIC; 3461 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 3462 } 3463 } 3464 3465 struct spdk_thread * 3466 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 3467 { 3468 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 3469 } 3470 3471 struct spdk_io_channel * 3472 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 3473 { 3474 return bdev_io->internal.ch->channel; 3475 } 3476 3477 static void 3478 _spdk_bdev_qos_config_limit(struct spdk_bdev *bdev, uint64_t *limits) 3479 { 3480 uint64_t min_qos_set; 3481 int i; 3482 3483 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3484 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3485 break; 3486 } 3487 } 3488 3489 if (i == SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) { 3490 SPDK_ERRLOG("Invalid rate limits set.\n"); 3491 return; 3492 } 3493 3494 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3495 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3496 continue; 3497 } 3498 3499 if (_spdk_bdev_qos_is_iops_rate_limit(i) == true) { 3500 min_qos_set = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 3501 } else { 3502 min_qos_set = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 3503 } 3504 3505 if (limits[i] == 0 || limits[i] % min_qos_set) { 3506 SPDK_ERRLOG("Assigned limit %" PRIu64 " on bdev %s is not multiple of %" PRIu64 "\n", 3507 limits[i], bdev->name, min_qos_set); 3508 SPDK_ERRLOG("Failed to enable QoS on this bdev %s\n", bdev->name); 3509 return; 3510 } 3511 } 3512 3513 if (!bdev->internal.qos) { 3514 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 3515 if (!bdev->internal.qos) { 3516 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 3517 return; 3518 } 3519 } 3520 3521 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3522 bdev->internal.qos->rate_limits[i].limit = limits[i]; 3523 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS type:%d set:%lu\n", 3524 bdev->name, i, limits[i]); 3525 } 3526 3527 return; 3528 } 3529 3530 static void 3531 _spdk_bdev_qos_config(struct spdk_bdev *bdev) 3532 { 3533 struct spdk_conf_section *sp = NULL; 3534 const char *val = NULL; 3535 int i = 0, j = 0; 3536 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {}; 3537 bool config_qos = false; 3538 3539 sp = spdk_conf_find_section(NULL, "QoS"); 3540 if (!sp) { 3541 return; 3542 } 3543 3544 while (j < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) { 3545 limits[j] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3546 3547 i = 0; 3548 while (true) { 3549 val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 0); 3550 if (!val) { 3551 break; 3552 } 3553 3554 if (strcmp(bdev->name, val) != 0) { 3555 i++; 3556 continue; 3557 } 3558 3559 val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 1); 3560 if (val) { 3561 if (_spdk_bdev_qos_is_iops_rate_limit(j) == true) { 3562 limits[j] = strtoull(val, NULL, 10); 3563 } else { 3564 limits[j] = strtoull(val, NULL, 10) * 1024 * 1024; 3565 } 3566 config_qos = true; 3567 } 3568 3569 break; 3570 } 3571 3572 j++; 3573 } 3574 3575 if (config_qos == true) { 3576 _spdk_bdev_qos_config_limit(bdev, limits); 3577 } 3578 3579 return; 3580 } 3581 3582 static int 3583 spdk_bdev_init(struct spdk_bdev *bdev) 3584 { 3585 char *bdev_name; 3586 3587 assert(bdev->module != NULL); 3588 3589 if (!bdev->name) { 3590 SPDK_ERRLOG("Bdev name is NULL\n"); 3591 return -EINVAL; 3592 } 3593 3594 if (spdk_bdev_get_by_name(bdev->name)) { 3595 SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name); 3596 return -EEXIST; 3597 } 3598 3599 /* Users often register their own I/O devices using the bdev name. In 3600 * order to avoid conflicts, prepend bdev_. */ 3601 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 3602 if (!bdev_name) { 3603 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 3604 return -ENOMEM; 3605 } 3606 3607 bdev->internal.status = SPDK_BDEV_STATUS_READY; 3608 bdev->internal.measured_queue_depth = UINT64_MAX; 3609 bdev->internal.claim_module = NULL; 3610 bdev->internal.qd_poller = NULL; 3611 bdev->internal.qos = NULL; 3612 3613 if (spdk_bdev_get_buf_align(bdev) > 1) { 3614 if (bdev->split_on_optimal_io_boundary) { 3615 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 3616 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 3617 } else { 3618 bdev->split_on_optimal_io_boundary = true; 3619 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 3620 } 3621 } 3622 3623 TAILQ_INIT(&bdev->internal.open_descs); 3624 3625 TAILQ_INIT(&bdev->aliases); 3626 3627 bdev->internal.reset_in_progress = NULL; 3628 3629 _spdk_bdev_qos_config(bdev); 3630 3631 spdk_io_device_register(__bdev_to_io_dev(bdev), 3632 spdk_bdev_channel_create, spdk_bdev_channel_destroy, 3633 sizeof(struct spdk_bdev_channel), 3634 bdev_name); 3635 3636 free(bdev_name); 3637 3638 pthread_mutex_init(&bdev->internal.mutex, NULL); 3639 return 0; 3640 } 3641 3642 static void 3643 spdk_bdev_destroy_cb(void *io_device) 3644 { 3645 int rc; 3646 struct spdk_bdev *bdev; 3647 spdk_bdev_unregister_cb cb_fn; 3648 void *cb_arg; 3649 3650 bdev = __bdev_from_io_dev(io_device); 3651 cb_fn = bdev->internal.unregister_cb; 3652 cb_arg = bdev->internal.unregister_ctx; 3653 3654 rc = bdev->fn_table->destruct(bdev->ctxt); 3655 if (rc < 0) { 3656 SPDK_ERRLOG("destruct failed\n"); 3657 } 3658 if (rc <= 0 && cb_fn != NULL) { 3659 cb_fn(cb_arg, rc); 3660 } 3661 } 3662 3663 3664 static void 3665 spdk_bdev_fini(struct spdk_bdev *bdev) 3666 { 3667 pthread_mutex_destroy(&bdev->internal.mutex); 3668 3669 free(bdev->internal.qos); 3670 3671 spdk_io_device_unregister(__bdev_to_io_dev(bdev), spdk_bdev_destroy_cb); 3672 } 3673 3674 static void 3675 spdk_bdev_start(struct spdk_bdev *bdev) 3676 { 3677 struct spdk_bdev_module *module; 3678 uint32_t action; 3679 3680 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name); 3681 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 3682 3683 /* Examine configuration before initializing I/O */ 3684 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 3685 if (module->examine_config) { 3686 action = module->internal.action_in_progress; 3687 module->internal.action_in_progress++; 3688 module->examine_config(bdev); 3689 if (action != module->internal.action_in_progress) { 3690 SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", 3691 module->name); 3692 } 3693 } 3694 } 3695 3696 if (bdev->internal.claim_module) { 3697 return; 3698 } 3699 3700 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 3701 if (module->examine_disk) { 3702 module->internal.action_in_progress++; 3703 module->examine_disk(bdev); 3704 } 3705 } 3706 } 3707 3708 int 3709 spdk_bdev_register(struct spdk_bdev *bdev) 3710 { 3711 int rc = spdk_bdev_init(bdev); 3712 3713 if (rc == 0) { 3714 spdk_bdev_start(bdev); 3715 } 3716 3717 return rc; 3718 } 3719 3720 int 3721 spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count) 3722 { 3723 int rc; 3724 3725 rc = spdk_bdev_init(vbdev); 3726 if (rc) { 3727 return rc; 3728 } 3729 3730 spdk_bdev_start(vbdev); 3731 return 0; 3732 } 3733 3734 void 3735 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 3736 { 3737 if (bdev->internal.unregister_cb != NULL) { 3738 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 3739 } 3740 } 3741 3742 static void 3743 _remove_notify(void *arg) 3744 { 3745 struct spdk_bdev_desc *desc = arg; 3746 3747 desc->remove_scheduled = false; 3748 3749 if (desc->closed) { 3750 free(desc); 3751 } else { 3752 desc->remove_cb(desc->remove_ctx); 3753 } 3754 } 3755 3756 void 3757 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 3758 { 3759 struct spdk_bdev_desc *desc, *tmp; 3760 bool do_destruct = true; 3761 struct spdk_thread *thread; 3762 3763 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name); 3764 3765 thread = spdk_get_thread(); 3766 if (!thread) { 3767 /* The user called this from a non-SPDK thread. */ 3768 if (cb_fn != NULL) { 3769 cb_fn(cb_arg, -ENOTSUP); 3770 } 3771 return; 3772 } 3773 3774 pthread_mutex_lock(&bdev->internal.mutex); 3775 3776 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 3777 bdev->internal.unregister_cb = cb_fn; 3778 bdev->internal.unregister_ctx = cb_arg; 3779 3780 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 3781 if (desc->remove_cb) { 3782 do_destruct = false; 3783 /* 3784 * Defer invocation of the remove_cb to a separate message that will 3785 * run later on its thread. This ensures this context unwinds and 3786 * we don't recursively unregister this bdev again if the remove_cb 3787 * immediately closes its descriptor. 3788 */ 3789 if (!desc->remove_scheduled) { 3790 /* Avoid scheduling removal of the same descriptor multiple times. */ 3791 desc->remove_scheduled = true; 3792 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 3793 } 3794 } 3795 } 3796 3797 if (!do_destruct) { 3798 pthread_mutex_unlock(&bdev->internal.mutex); 3799 return; 3800 } 3801 3802 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 3803 pthread_mutex_unlock(&bdev->internal.mutex); 3804 3805 spdk_bdev_fini(bdev); 3806 } 3807 3808 int 3809 spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb, 3810 void *remove_ctx, struct spdk_bdev_desc **_desc) 3811 { 3812 struct spdk_bdev_desc *desc; 3813 struct spdk_thread *thread; 3814 struct set_qos_limit_ctx *ctx; 3815 3816 thread = spdk_get_thread(); 3817 if (!thread) { 3818 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 3819 return -ENOTSUP; 3820 } 3821 3822 desc = calloc(1, sizeof(*desc)); 3823 if (desc == NULL) { 3824 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 3825 return -ENOMEM; 3826 } 3827 3828 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 3829 spdk_get_thread()); 3830 3831 desc->bdev = bdev; 3832 desc->thread = thread; 3833 desc->remove_cb = remove_cb; 3834 desc->remove_ctx = remove_ctx; 3835 desc->write = write; 3836 *_desc = desc; 3837 3838 pthread_mutex_lock(&bdev->internal.mutex); 3839 3840 if (write && bdev->internal.claim_module) { 3841 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 3842 bdev->name, bdev->internal.claim_module->name); 3843 pthread_mutex_unlock(&bdev->internal.mutex); 3844 free(desc); 3845 *_desc = NULL; 3846 return -EPERM; 3847 } 3848 3849 /* Enable QoS */ 3850 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 3851 ctx = calloc(1, sizeof(*ctx)); 3852 if (ctx == NULL) { 3853 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 3854 pthread_mutex_unlock(&bdev->internal.mutex); 3855 free(desc); 3856 *_desc = NULL; 3857 return -ENOMEM; 3858 } 3859 ctx->bdev = bdev; 3860 spdk_for_each_channel(__bdev_to_io_dev(bdev), 3861 _spdk_bdev_enable_qos_msg, ctx, 3862 _spdk_bdev_enable_qos_done); 3863 } 3864 3865 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 3866 3867 pthread_mutex_unlock(&bdev->internal.mutex); 3868 3869 return 0; 3870 } 3871 3872 void 3873 spdk_bdev_close(struct spdk_bdev_desc *desc) 3874 { 3875 struct spdk_bdev *bdev = desc->bdev; 3876 bool do_unregister = false; 3877 3878 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 3879 spdk_get_thread()); 3880 3881 assert(desc->thread == spdk_get_thread()); 3882 3883 pthread_mutex_lock(&bdev->internal.mutex); 3884 3885 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 3886 3887 desc->closed = true; 3888 3889 if (!desc->remove_scheduled) { 3890 free(desc); 3891 } 3892 3893 /* If no more descriptors, kill QoS channel */ 3894 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 3895 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 3896 bdev->name, spdk_get_thread()); 3897 3898 if (spdk_bdev_qos_destroy(bdev)) { 3899 /* There isn't anything we can do to recover here. Just let the 3900 * old QoS poller keep running. The QoS handling won't change 3901 * cores when the user allocates a new channel, but it won't break. */ 3902 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 3903 } 3904 } 3905 3906 spdk_bdev_set_qd_sampling_period(bdev, 0); 3907 3908 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 3909 do_unregister = true; 3910 } 3911 pthread_mutex_unlock(&bdev->internal.mutex); 3912 3913 if (do_unregister == true) { 3914 spdk_bdev_unregister(bdev, bdev->internal.unregister_cb, bdev->internal.unregister_ctx); 3915 } 3916 } 3917 3918 int 3919 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 3920 struct spdk_bdev_module *module) 3921 { 3922 if (bdev->internal.claim_module != NULL) { 3923 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 3924 bdev->internal.claim_module->name); 3925 return -EPERM; 3926 } 3927 3928 if (desc && !desc->write) { 3929 desc->write = true; 3930 } 3931 3932 bdev->internal.claim_module = module; 3933 return 0; 3934 } 3935 3936 void 3937 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 3938 { 3939 assert(bdev->internal.claim_module != NULL); 3940 bdev->internal.claim_module = NULL; 3941 } 3942 3943 struct spdk_bdev * 3944 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 3945 { 3946 return desc->bdev; 3947 } 3948 3949 void 3950 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 3951 { 3952 struct iovec *iovs; 3953 int iovcnt; 3954 3955 if (bdev_io == NULL) { 3956 return; 3957 } 3958 3959 switch (bdev_io->type) { 3960 case SPDK_BDEV_IO_TYPE_READ: 3961 iovs = bdev_io->u.bdev.iovs; 3962 iovcnt = bdev_io->u.bdev.iovcnt; 3963 break; 3964 case SPDK_BDEV_IO_TYPE_WRITE: 3965 iovs = bdev_io->u.bdev.iovs; 3966 iovcnt = bdev_io->u.bdev.iovcnt; 3967 break; 3968 default: 3969 iovs = NULL; 3970 iovcnt = 0; 3971 break; 3972 } 3973 3974 if (iovp) { 3975 *iovp = iovs; 3976 } 3977 if (iovcntp) { 3978 *iovcntp = iovcnt; 3979 } 3980 } 3981 3982 void 3983 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 3984 { 3985 3986 if (spdk_bdev_module_list_find(bdev_module->name)) { 3987 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 3988 assert(false); 3989 } 3990 3991 if (bdev_module->async_init) { 3992 bdev_module->internal.action_in_progress = 1; 3993 } 3994 3995 /* 3996 * Modules with examine callbacks must be initialized first, so they are 3997 * ready to handle examine callbacks from later modules that will 3998 * register physical bdevs. 3999 */ 4000 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 4001 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 4002 } else { 4003 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 4004 } 4005 } 4006 4007 struct spdk_bdev_module * 4008 spdk_bdev_module_list_find(const char *name) 4009 { 4010 struct spdk_bdev_module *bdev_module; 4011 4012 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 4013 if (strcmp(name, bdev_module->name) == 0) { 4014 break; 4015 } 4016 } 4017 4018 return bdev_module; 4019 } 4020 4021 static void 4022 _spdk_bdev_write_zero_buffer_next(void *_bdev_io) 4023 { 4024 struct spdk_bdev_io *bdev_io = _bdev_io; 4025 uint64_t num_bytes, num_blocks; 4026 int rc; 4027 4028 num_bytes = spdk_min(spdk_bdev_get_block_size(bdev_io->bdev) * 4029 bdev_io->u.bdev.split_remaining_num_blocks, 4030 ZERO_BUFFER_SIZE); 4031 num_blocks = num_bytes / spdk_bdev_get_block_size(bdev_io->bdev); 4032 4033 rc = spdk_bdev_write_blocks(bdev_io->internal.desc, 4034 spdk_io_channel_from_ctx(bdev_io->internal.ch), 4035 g_bdev_mgr.zero_buffer, 4036 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 4037 _spdk_bdev_write_zero_buffer_done, bdev_io); 4038 if (rc == 0) { 4039 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 4040 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 4041 } else if (rc == -ENOMEM) { 4042 _spdk_bdev_queue_io_wait_with_cb(bdev_io, _spdk_bdev_write_zero_buffer_next); 4043 } else { 4044 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4045 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4046 } 4047 } 4048 4049 static void 4050 _spdk_bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4051 { 4052 struct spdk_bdev_io *parent_io = cb_arg; 4053 4054 spdk_bdev_free_io(bdev_io); 4055 4056 if (!success) { 4057 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4058 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4059 return; 4060 } 4061 4062 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 4063 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4064 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 4065 return; 4066 } 4067 4068 _spdk_bdev_write_zero_buffer_next(parent_io); 4069 } 4070 4071 static void 4072 _spdk_bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 4073 { 4074 pthread_mutex_lock(&ctx->bdev->internal.mutex); 4075 ctx->bdev->internal.qos_mod_in_progress = false; 4076 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 4077 4078 if (ctx->cb_fn) { 4079 ctx->cb_fn(ctx->cb_arg, status); 4080 } 4081 free(ctx); 4082 } 4083 4084 static void 4085 _spdk_bdev_disable_qos_done(void *cb_arg) 4086 { 4087 struct set_qos_limit_ctx *ctx = cb_arg; 4088 struct spdk_bdev *bdev = ctx->bdev; 4089 struct spdk_bdev_io *bdev_io; 4090 struct spdk_bdev_qos *qos; 4091 4092 pthread_mutex_lock(&bdev->internal.mutex); 4093 qos = bdev->internal.qos; 4094 bdev->internal.qos = NULL; 4095 pthread_mutex_unlock(&bdev->internal.mutex); 4096 4097 while (!TAILQ_EMPTY(&qos->queued)) { 4098 /* Send queued I/O back to their original thread for resubmission. */ 4099 bdev_io = TAILQ_FIRST(&qos->queued); 4100 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 4101 4102 if (bdev_io->internal.io_submit_ch) { 4103 /* 4104 * Channel was changed when sending it to the QoS thread - change it back 4105 * before sending it back to the original thread. 4106 */ 4107 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 4108 bdev_io->internal.io_submit_ch = NULL; 4109 } 4110 4111 spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->internal.ch->channel), 4112 _spdk_bdev_io_submit, bdev_io); 4113 } 4114 4115 if (qos->thread != NULL) { 4116 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4117 spdk_poller_unregister(&qos->poller); 4118 } 4119 4120 free(qos); 4121 4122 _spdk_bdev_set_qos_limit_done(ctx, 0); 4123 } 4124 4125 static void 4126 _spdk_bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) 4127 { 4128 void *io_device = spdk_io_channel_iter_get_io_device(i); 4129 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4130 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4131 struct spdk_thread *thread; 4132 4133 pthread_mutex_lock(&bdev->internal.mutex); 4134 thread = bdev->internal.qos->thread; 4135 pthread_mutex_unlock(&bdev->internal.mutex); 4136 4137 if (thread != NULL) { 4138 spdk_thread_send_msg(thread, _spdk_bdev_disable_qos_done, ctx); 4139 } else { 4140 _spdk_bdev_disable_qos_done(ctx); 4141 } 4142 } 4143 4144 static void 4145 _spdk_bdev_disable_qos_msg(struct spdk_io_channel_iter *i) 4146 { 4147 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 4148 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 4149 4150 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 4151 4152 spdk_for_each_channel_continue(i, 0); 4153 } 4154 4155 static void 4156 _spdk_bdev_update_qos_rate_limit_msg(void *cb_arg) 4157 { 4158 struct set_qos_limit_ctx *ctx = cb_arg; 4159 struct spdk_bdev *bdev = ctx->bdev; 4160 4161 pthread_mutex_lock(&bdev->internal.mutex); 4162 spdk_bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 4163 pthread_mutex_unlock(&bdev->internal.mutex); 4164 4165 _spdk_bdev_set_qos_limit_done(ctx, 0); 4166 } 4167 4168 static void 4169 _spdk_bdev_enable_qos_msg(struct spdk_io_channel_iter *i) 4170 { 4171 void *io_device = spdk_io_channel_iter_get_io_device(i); 4172 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4173 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 4174 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 4175 4176 pthread_mutex_lock(&bdev->internal.mutex); 4177 _spdk_bdev_enable_qos(bdev, bdev_ch); 4178 pthread_mutex_unlock(&bdev->internal.mutex); 4179 spdk_for_each_channel_continue(i, 0); 4180 } 4181 4182 static void 4183 _spdk_bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) 4184 { 4185 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4186 4187 _spdk_bdev_set_qos_limit_done(ctx, status); 4188 } 4189 4190 static void 4191 _spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4192 { 4193 int i; 4194 4195 assert(bdev->internal.qos != NULL); 4196 4197 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4198 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4199 bdev->internal.qos->rate_limits[i].limit = limits[i]; 4200 4201 if (limits[i] == 0) { 4202 bdev->internal.qos->rate_limits[i].limit = 4203 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 4204 } 4205 } 4206 } 4207 } 4208 4209 void 4210 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 4211 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 4212 { 4213 struct set_qos_limit_ctx *ctx; 4214 uint32_t limit_set_complement; 4215 uint64_t min_limit_per_sec; 4216 int i; 4217 bool disable_rate_limit = true; 4218 4219 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4220 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4221 continue; 4222 } 4223 4224 if (limits[i] > 0) { 4225 disable_rate_limit = false; 4226 } 4227 4228 if (_spdk_bdev_qos_is_iops_rate_limit(i) == true) { 4229 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 4230 } else { 4231 /* Change from megabyte to byte rate limit */ 4232 limits[i] = limits[i] * 1024 * 1024; 4233 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 4234 } 4235 4236 limit_set_complement = limits[i] % min_limit_per_sec; 4237 if (limit_set_complement) { 4238 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 4239 limits[i], min_limit_per_sec); 4240 limits[i] += min_limit_per_sec - limit_set_complement; 4241 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 4242 } 4243 } 4244 4245 ctx = calloc(1, sizeof(*ctx)); 4246 if (ctx == NULL) { 4247 cb_fn(cb_arg, -ENOMEM); 4248 return; 4249 } 4250 4251 ctx->cb_fn = cb_fn; 4252 ctx->cb_arg = cb_arg; 4253 ctx->bdev = bdev; 4254 4255 pthread_mutex_lock(&bdev->internal.mutex); 4256 if (bdev->internal.qos_mod_in_progress) { 4257 pthread_mutex_unlock(&bdev->internal.mutex); 4258 free(ctx); 4259 cb_fn(cb_arg, -EAGAIN); 4260 return; 4261 } 4262 bdev->internal.qos_mod_in_progress = true; 4263 4264 if (disable_rate_limit == true && bdev->internal.qos) { 4265 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4266 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 4267 (bdev->internal.qos->rate_limits[i].limit > 0 && 4268 bdev->internal.qos->rate_limits[i].limit != 4269 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 4270 disable_rate_limit = false; 4271 break; 4272 } 4273 } 4274 } 4275 4276 if (disable_rate_limit == false) { 4277 if (bdev->internal.qos == NULL) { 4278 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 4279 if (!bdev->internal.qos) { 4280 pthread_mutex_unlock(&bdev->internal.mutex); 4281 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 4282 free(ctx); 4283 cb_fn(cb_arg, -ENOMEM); 4284 return; 4285 } 4286 } 4287 4288 if (bdev->internal.qos->thread == NULL) { 4289 /* Enabling */ 4290 _spdk_bdev_set_qos_rate_limits(bdev, limits); 4291 4292 spdk_for_each_channel(__bdev_to_io_dev(bdev), 4293 _spdk_bdev_enable_qos_msg, ctx, 4294 _spdk_bdev_enable_qos_done); 4295 } else { 4296 /* Updating */ 4297 _spdk_bdev_set_qos_rate_limits(bdev, limits); 4298 4299 spdk_thread_send_msg(bdev->internal.qos->thread, 4300 _spdk_bdev_update_qos_rate_limit_msg, ctx); 4301 } 4302 } else { 4303 if (bdev->internal.qos != NULL) { 4304 _spdk_bdev_set_qos_rate_limits(bdev, limits); 4305 4306 /* Disabling */ 4307 spdk_for_each_channel(__bdev_to_io_dev(bdev), 4308 _spdk_bdev_disable_qos_msg, ctx, 4309 _spdk_bdev_disable_qos_msg_done); 4310 } else { 4311 pthread_mutex_unlock(&bdev->internal.mutex); 4312 _spdk_bdev_set_qos_limit_done(ctx, 0); 4313 return; 4314 } 4315 } 4316 4317 pthread_mutex_unlock(&bdev->internal.mutex); 4318 } 4319 4320 struct spdk_bdev_histogram_ctx { 4321 spdk_bdev_histogram_status_cb cb_fn; 4322 void *cb_arg; 4323 struct spdk_bdev *bdev; 4324 int status; 4325 }; 4326 4327 static void 4328 _spdk_bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status) 4329 { 4330 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4331 4332 pthread_mutex_lock(&ctx->bdev->internal.mutex); 4333 ctx->bdev->internal.histogram_in_progress = false; 4334 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 4335 ctx->cb_fn(ctx->cb_arg, ctx->status); 4336 free(ctx); 4337 } 4338 4339 static void 4340 _spdk_bdev_histogram_disable_channel(struct spdk_io_channel_iter *i) 4341 { 4342 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4343 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 4344 4345 if (ch->histogram != NULL) { 4346 spdk_histogram_data_free(ch->histogram); 4347 ch->histogram = NULL; 4348 } 4349 spdk_for_each_channel_continue(i, 0); 4350 } 4351 4352 static void 4353 _spdk_bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status) 4354 { 4355 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4356 4357 if (status != 0) { 4358 ctx->status = status; 4359 ctx->bdev->internal.histogram_enabled = false; 4360 spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), _spdk_bdev_histogram_disable_channel, ctx, 4361 _spdk_bdev_histogram_disable_channel_cb); 4362 } else { 4363 pthread_mutex_lock(&ctx->bdev->internal.mutex); 4364 ctx->bdev->internal.histogram_in_progress = false; 4365 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 4366 ctx->cb_fn(ctx->cb_arg, ctx->status); 4367 free(ctx); 4368 } 4369 } 4370 4371 static void 4372 _spdk_bdev_histogram_enable_channel(struct spdk_io_channel_iter *i) 4373 { 4374 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4375 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 4376 int status = 0; 4377 4378 if (ch->histogram == NULL) { 4379 ch->histogram = spdk_histogram_data_alloc(); 4380 if (ch->histogram == NULL) { 4381 status = -ENOMEM; 4382 } 4383 } 4384 4385 spdk_for_each_channel_continue(i, status); 4386 } 4387 4388 void 4389 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 4390 void *cb_arg, bool enable) 4391 { 4392 struct spdk_bdev_histogram_ctx *ctx; 4393 4394 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 4395 if (ctx == NULL) { 4396 cb_fn(cb_arg, -ENOMEM); 4397 return; 4398 } 4399 4400 ctx->bdev = bdev; 4401 ctx->status = 0; 4402 ctx->cb_fn = cb_fn; 4403 ctx->cb_arg = cb_arg; 4404 4405 pthread_mutex_lock(&bdev->internal.mutex); 4406 if (bdev->internal.histogram_in_progress) { 4407 pthread_mutex_unlock(&bdev->internal.mutex); 4408 free(ctx); 4409 cb_fn(cb_arg, -EAGAIN); 4410 return; 4411 } 4412 4413 bdev->internal.histogram_in_progress = true; 4414 pthread_mutex_unlock(&bdev->internal.mutex); 4415 4416 bdev->internal.histogram_enabled = enable; 4417 4418 if (enable) { 4419 /* Allocate histogram for each channel */ 4420 spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_histogram_enable_channel, ctx, 4421 _spdk_bdev_histogram_enable_channel_cb); 4422 } else { 4423 spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_histogram_disable_channel, ctx, 4424 _spdk_bdev_histogram_disable_channel_cb); 4425 } 4426 } 4427 4428 struct spdk_bdev_histogram_data_ctx { 4429 spdk_bdev_histogram_data_cb cb_fn; 4430 void *cb_arg; 4431 struct spdk_bdev *bdev; 4432 /** merged histogram data from all channels */ 4433 struct spdk_histogram_data *histogram; 4434 }; 4435 4436 static void 4437 _spdk_bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status) 4438 { 4439 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4440 4441 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 4442 free(ctx); 4443 } 4444 4445 static void 4446 _spdk_bdev_histogram_get_channel(struct spdk_io_channel_iter *i) 4447 { 4448 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4449 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 4450 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4451 int status = 0; 4452 4453 if (ch->histogram == NULL) { 4454 status = -EFAULT; 4455 } else { 4456 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 4457 } 4458 4459 spdk_for_each_channel_continue(i, status); 4460 } 4461 4462 void 4463 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 4464 spdk_bdev_histogram_data_cb cb_fn, 4465 void *cb_arg) 4466 { 4467 struct spdk_bdev_histogram_data_ctx *ctx; 4468 4469 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 4470 if (ctx == NULL) { 4471 cb_fn(cb_arg, -ENOMEM, NULL); 4472 return; 4473 } 4474 4475 ctx->bdev = bdev; 4476 ctx->cb_fn = cb_fn; 4477 ctx->cb_arg = cb_arg; 4478 4479 ctx->histogram = histogram; 4480 4481 spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_histogram_get_channel, ctx, 4482 _spdk_bdev_histogram_get_channel_cb); 4483 } 4484 4485 SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV) 4486 4487 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 4488 { 4489 spdk_trace_register_owner(OWNER_BDEV, 'b'); 4490 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 4491 spdk_trace_register_description("BDEV_IO_START", "", TRACE_BDEV_IO_START, OWNER_BDEV, 4492 OBJECT_BDEV_IO, 1, 0, "type: "); 4493 spdk_trace_register_description("BDEV_IO_DONE", "", TRACE_BDEV_IO_DONE, OWNER_BDEV, 4494 OBJECT_BDEV_IO, 0, 0, ""); 4495 } 4496