1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "bdev_uring.h" 35 36 #include "spdk/stdinc.h" 37 38 #include "spdk/barrier.h" 39 #include "spdk/bdev.h" 40 #include "spdk/conf.h" 41 #include "spdk/env.h" 42 #include "spdk/fd.h" 43 #include "spdk/likely.h" 44 #include "spdk/thread.h" 45 #include "spdk/json.h" 46 #include "spdk/util.h" 47 #include "spdk/string.h" 48 49 #include "spdk_internal/log.h" 50 #include "spdk_internal/uring.h" 51 52 struct bdev_uring_io_channel { 53 struct bdev_uring_group_channel *group_ch; 54 }; 55 56 struct bdev_uring_group_channel { 57 uint64_t io_inflight; 58 uint64_t io_pending; 59 struct spdk_poller *poller; 60 struct io_uring uring; 61 }; 62 63 struct bdev_uring_task { 64 uint64_t len; 65 struct bdev_uring_io_channel *ch; 66 TAILQ_ENTRY(bdev_uring_task) link; 67 }; 68 69 struct bdev_uring { 70 struct spdk_bdev bdev; 71 char *filename; 72 int fd; 73 TAILQ_ENTRY(bdev_uring) link; 74 }; 75 76 static int bdev_uring_init(void); 77 static void bdev_uring_fini(void); 78 static void uring_free_bdev(struct bdev_uring *uring); 79 static void bdev_uring_get_spdk_running_config(FILE *fp); 80 static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head; 81 82 #define SPDK_URING_QUEUE_DEPTH 512 83 #define MAX_EVENTS_PER_POLL 32 84 85 static int 86 bdev_uring_get_ctx_size(void) 87 { 88 return sizeof(struct bdev_uring_task); 89 } 90 91 static struct spdk_bdev_module uring_if = { 92 .name = "uring", 93 .module_init = bdev_uring_init, 94 .module_fini = bdev_uring_fini, 95 .config_text = bdev_uring_get_spdk_running_config, 96 .get_ctx_size = bdev_uring_get_ctx_size, 97 }; 98 99 SPDK_BDEV_MODULE_REGISTER(uring, &uring_if) 100 101 static int 102 bdev_uring_open(struct bdev_uring *bdev) 103 { 104 int fd; 105 106 fd = open(bdev->filename, O_RDWR | O_DIRECT | O_NOATIME); 107 if (fd < 0) { 108 /* Try without O_DIRECT for non-disk files */ 109 fd = open(bdev->filename, O_RDWR | O_NOATIME); 110 if (fd < 0) { 111 SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n", 112 bdev->filename, errno, spdk_strerror(errno)); 113 bdev->fd = -1; 114 return -1; 115 } 116 } 117 118 bdev->fd = fd; 119 120 return 0; 121 } 122 123 static int 124 bdev_uring_close(struct bdev_uring *bdev) 125 { 126 int rc; 127 128 if (bdev->fd == -1) { 129 return 0; 130 } 131 132 rc = close(bdev->fd); 133 if (rc < 0) { 134 SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n", 135 bdev->fd, errno, spdk_strerror(errno)); 136 return -1; 137 } 138 139 bdev->fd = -1; 140 141 return 0; 142 } 143 144 static int64_t 145 bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch, 146 struct bdev_uring_task *uring_task, 147 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) 148 { 149 struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); 150 struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; 151 struct io_uring_sqe *sqe; 152 153 sqe = io_uring_get_sqe(&group_ch->uring); 154 io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset); 155 io_uring_sqe_set_data(sqe, uring_task); 156 uring_task->len = nbytes; 157 uring_task->ch = uring_ch; 158 159 SPDK_DEBUGLOG(SPDK_LOG_URING, "read %d iovs size %lu to off: %#lx\n", 160 iovcnt, nbytes, offset); 161 162 group_ch->io_pending++; 163 return nbytes; 164 } 165 166 static int64_t 167 bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch, 168 struct bdev_uring_task *uring_task, 169 struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset) 170 { 171 struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); 172 struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; 173 struct io_uring_sqe *sqe; 174 175 sqe = io_uring_get_sqe(&group_ch->uring); 176 io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset); 177 io_uring_sqe_set_data(sqe, uring_task); 178 uring_task->len = nbytes; 179 uring_task->ch = uring_ch; 180 181 SPDK_DEBUGLOG(SPDK_LOG_URING, "write %d iovs size %lu from off: %#lx\n", 182 iovcnt, nbytes, offset); 183 184 group_ch->io_pending++; 185 return nbytes; 186 } 187 188 static int 189 bdev_uring_destruct(void *ctx) 190 { 191 struct bdev_uring *uring = ctx; 192 int rc = 0; 193 194 TAILQ_REMOVE(&g_uring_bdev_head, uring, link); 195 rc = bdev_uring_close(uring); 196 if (rc < 0) { 197 SPDK_ERRLOG("bdev_uring_close() failed\n"); 198 } 199 spdk_io_device_unregister(uring, NULL); 200 uring_free_bdev(uring); 201 return rc; 202 } 203 204 static int 205 bdev_uring_reap(struct io_uring *ring, int max) 206 { 207 int i, count, ret; 208 struct io_uring_cqe *cqe; 209 struct bdev_uring_task *uring_task; 210 enum spdk_bdev_io_status status; 211 212 count = 0; 213 for (i = 0; i < max; i++) { 214 ret = io_uring_peek_cqe(ring, &cqe); 215 if (ret != 0) { 216 return ret; 217 } 218 219 if (cqe == NULL) { 220 return count; 221 } 222 223 uring_task = (struct bdev_uring_task *)cqe->user_data; 224 if (cqe->res != (signed)uring_task->len) { 225 status = SPDK_BDEV_IO_STATUS_FAILED; 226 } else { 227 status = SPDK_BDEV_IO_STATUS_SUCCESS; 228 } 229 230 uring_task->ch->group_ch->io_inflight--; 231 io_uring_cqe_seen(ring, cqe); 232 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status); 233 count++; 234 } 235 236 return count; 237 } 238 239 static int 240 bdev_uring_group_poll(void *arg) 241 { 242 struct bdev_uring_group_channel *group_ch = arg; 243 int to_complete, to_submit; 244 int count, ret; 245 246 to_submit = group_ch->io_pending; 247 to_complete = group_ch->io_inflight; 248 249 ret = 0; 250 if (to_submit > 0) { 251 /* If there are I/O to submit, use io_uring_submit here. 252 * It will automatically call spdk_io_uring_enter appropriately. */ 253 ret = io_uring_submit(&group_ch->uring); 254 group_ch->io_pending = 0; 255 group_ch->io_inflight += to_submit; 256 } else if (to_complete > 0) { 257 /* If there are I/O in flight but none to submit, we need to 258 * call io_uring_enter ourselves. */ 259 ret = spdk_io_uring_enter(group_ch->uring.ring_fd, 0, 0, 260 IORING_ENTER_GETEVENTS); 261 } 262 263 if (ret < 0) { 264 return SPDK_POLLER_BUSY; 265 } 266 267 count = 0; 268 if (to_complete > 0) { 269 count = bdev_uring_reap(&group_ch->uring, to_complete); 270 } 271 272 if (count + to_submit > 0) { 273 return SPDK_POLLER_BUSY; 274 } else { 275 return SPDK_POLLER_IDLE; 276 } 277 } 278 279 static void bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 280 bool success) 281 { 282 if (!success) { 283 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 284 return; 285 } 286 287 switch (bdev_io->type) { 288 case SPDK_BDEV_IO_TYPE_READ: 289 bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt, 290 ch, 291 (struct bdev_uring_task *)bdev_io->driver_ctx, 292 bdev_io->u.bdev.iovs, 293 bdev_io->u.bdev.iovcnt, 294 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 295 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 296 break; 297 case SPDK_BDEV_IO_TYPE_WRITE: 298 bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt, 299 ch, 300 (struct bdev_uring_task *)bdev_io->driver_ctx, 301 bdev_io->u.bdev.iovs, 302 bdev_io->u.bdev.iovcnt, 303 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 304 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 305 break; 306 default: 307 SPDK_ERRLOG("Wrong io type\n"); 308 break; 309 } 310 } 311 312 static int _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 313 { 314 switch (bdev_io->type) { 315 /* Read and write operations must be performed on buffers aligned to 316 * bdev->required_alignment. If user specified unaligned buffers, 317 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */ 318 case SPDK_BDEV_IO_TYPE_READ: 319 case SPDK_BDEV_IO_TYPE_WRITE: 320 spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb, 321 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 322 return 0; 323 default: 324 return -1; 325 } 326 } 327 328 static void bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 329 { 330 if (_bdev_uring_submit_request(ch, bdev_io) < 0) { 331 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 332 } 333 } 334 335 static bool 336 bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 337 { 338 switch (io_type) { 339 case SPDK_BDEV_IO_TYPE_READ: 340 case SPDK_BDEV_IO_TYPE_WRITE: 341 return true; 342 default: 343 return false; 344 } 345 } 346 347 static int 348 bdev_uring_create_cb(void *io_device, void *ctx_buf) 349 { 350 struct bdev_uring_io_channel *ch = ctx_buf; 351 352 ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if)); 353 354 return 0; 355 } 356 357 static void 358 bdev_uring_destroy_cb(void *io_device, void *ctx_buf) 359 { 360 struct bdev_uring_io_channel *ch = ctx_buf; 361 362 spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch)); 363 } 364 365 static struct spdk_io_channel * 366 bdev_uring_get_io_channel(void *ctx) 367 { 368 struct bdev_uring *uring = ctx; 369 370 return spdk_get_io_channel(uring); 371 } 372 373 static int 374 bdev_uring_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 375 { 376 struct bdev_uring *uring = ctx; 377 378 spdk_json_write_named_object_begin(w, "uring"); 379 380 spdk_json_write_named_string(w, "filename", uring->filename); 381 382 spdk_json_write_object_end(w); 383 384 return 0; 385 } 386 387 static void 388 bdev_uring_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 389 { 390 struct bdev_uring *uring = bdev->ctxt; 391 392 spdk_json_write_object_begin(w); 393 394 spdk_json_write_named_string(w, "method", "bdev_uring_create"); 395 396 spdk_json_write_named_object_begin(w, "params"); 397 spdk_json_write_named_string(w, "name", bdev->name); 398 spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); 399 spdk_json_write_named_string(w, "filename", uring->filename); 400 spdk_json_write_object_end(w); 401 402 spdk_json_write_object_end(w); 403 } 404 405 static const struct spdk_bdev_fn_table uring_fn_table = { 406 .destruct = bdev_uring_destruct, 407 .submit_request = bdev_uring_submit_request, 408 .io_type_supported = bdev_uring_io_type_supported, 409 .get_io_channel = bdev_uring_get_io_channel, 410 .dump_info_json = bdev_uring_dump_info_json, 411 .write_config_json = bdev_uring_write_json_config, 412 }; 413 414 static void uring_free_bdev(struct bdev_uring *uring) 415 { 416 if (uring == NULL) { 417 return; 418 } 419 free(uring->filename); 420 free(uring->bdev.name); 421 free(uring); 422 } 423 424 static int 425 bdev_uring_group_create_cb(void *io_device, void *ctx_buf) 426 { 427 struct bdev_uring_group_channel *ch = ctx_buf; 428 429 if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, IORING_SETUP_IOPOLL) < 0) { 430 SPDK_ERRLOG("uring I/O context setup failure\n"); 431 return -1; 432 } 433 434 ch->poller = SPDK_POLLER_REGISTER(bdev_uring_group_poll, ch, 0); 435 return 0; 436 } 437 438 static void 439 bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf) 440 { 441 struct bdev_uring_group_channel *ch = ctx_buf; 442 443 io_uring_queue_exit(&ch->uring); 444 445 spdk_poller_unregister(&ch->poller); 446 } 447 448 struct spdk_bdev * 449 create_uring_bdev(const char *name, const char *filename, uint32_t block_size) 450 { 451 struct bdev_uring *uring; 452 uint32_t detected_block_size; 453 uint64_t bdev_size; 454 int rc; 455 456 uring = calloc(1, sizeof(*uring)); 457 if (!uring) { 458 SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n"); 459 return NULL; 460 } 461 462 uring->filename = strdup(filename); 463 if (!uring->filename) { 464 goto error_return; 465 } 466 467 if (bdev_uring_open(uring)) { 468 SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno); 469 goto error_return; 470 } 471 472 bdev_size = spdk_fd_get_size(uring->fd); 473 474 uring->bdev.name = strdup(name); 475 if (!uring->bdev.name) { 476 goto error_return; 477 } 478 uring->bdev.product_name = "URING bdev"; 479 uring->bdev.module = &uring_if; 480 481 uring->bdev.write_cache = 1; 482 483 detected_block_size = spdk_fd_get_blocklen(uring->fd); 484 if (block_size == 0) { 485 /* User did not specify block size - use autodetected block size. */ 486 if (detected_block_size == 0) { 487 SPDK_ERRLOG("Block size could not be auto-detected\n"); 488 goto error_return; 489 } 490 block_size = detected_block_size; 491 } else { 492 if (block_size < detected_block_size) { 493 SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than " 494 "auto-detected block size %" PRIu32 "\n", 495 block_size, detected_block_size); 496 goto error_return; 497 } else if (detected_block_size != 0 && block_size != detected_block_size) { 498 SPDK_WARNLOG("Specified block size %" PRIu32 " does not match " 499 "auto-detected block size %" PRIu32 "\n", 500 block_size, detected_block_size); 501 } 502 } 503 504 if (block_size < 512) { 505 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size); 506 goto error_return; 507 } 508 509 if (!spdk_u32_is_pow2(block_size)) { 510 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size); 511 goto error_return; 512 } 513 514 uring->bdev.blocklen = block_size; 515 uring->bdev.required_alignment = spdk_u32log2(block_size); 516 517 if (bdev_size % uring->bdev.blocklen != 0) { 518 SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", 519 bdev_size, uring->bdev.blocklen); 520 goto error_return; 521 } 522 523 uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen; 524 uring->bdev.ctxt = uring; 525 526 uring->bdev.fn_table = &uring_fn_table; 527 528 spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb, 529 sizeof(struct bdev_uring_io_channel), 530 uring->bdev.name); 531 rc = spdk_bdev_register(&uring->bdev); 532 if (rc) { 533 spdk_io_device_unregister(uring, NULL); 534 goto error_return; 535 } 536 537 TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link); 538 return &uring->bdev; 539 540 error_return: 541 bdev_uring_close(uring); 542 uring_free_bdev(uring); 543 return NULL; 544 } 545 546 struct delete_uring_bdev_ctx { 547 spdk_delete_uring_complete cb_fn; 548 void *cb_arg; 549 }; 550 551 static void 552 uring_bdev_unregister_cb(void *arg, int bdeverrno) 553 { 554 struct delete_uring_bdev_ctx *ctx = arg; 555 556 ctx->cb_fn(ctx->cb_arg, bdeverrno); 557 free(ctx); 558 } 559 560 void 561 delete_uring_bdev(struct spdk_bdev *bdev, spdk_delete_uring_complete cb_fn, void *cb_arg) 562 { 563 struct delete_uring_bdev_ctx *ctx; 564 565 if (!bdev || bdev->module != &uring_if) { 566 cb_fn(cb_arg, -ENODEV); 567 return; 568 } 569 570 ctx = calloc(1, sizeof(*ctx)); 571 if (ctx == NULL) { 572 cb_fn(cb_arg, -ENOMEM); 573 return; 574 } 575 576 ctx->cb_fn = cb_fn; 577 ctx->cb_arg = cb_arg; 578 spdk_bdev_unregister(bdev, uring_bdev_unregister_cb, ctx); 579 } 580 581 static int 582 bdev_uring_init(void) 583 { 584 size_t i; 585 struct spdk_conf_section *sp; 586 struct spdk_bdev *bdev; 587 588 TAILQ_INIT(&g_uring_bdev_head); 589 spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb, 590 sizeof(struct bdev_uring_group_channel), 591 "uring_module"); 592 593 sp = spdk_conf_find_section(NULL, "URING"); 594 if (!sp) { 595 return 0; 596 } 597 598 i = 0; 599 while (true) { 600 const char *file; 601 const char *name; 602 const char *block_size_str; 603 uint32_t block_size = 0; 604 long int tmp; 605 606 file = spdk_conf_section_get_nmval(sp, "URING", i, 0); 607 if (!file) { 608 break; 609 } 610 611 name = spdk_conf_section_get_nmval(sp, "URING", i, 1); 612 if (!name) { 613 SPDK_ERRLOG("No name provided for URING bdev with file %s\n", file); 614 i++; 615 continue; 616 } 617 618 block_size_str = spdk_conf_section_get_nmval(sp, "URING", i, 2); 619 if (block_size_str) { 620 tmp = spdk_strtol(block_size_str, 10); 621 if (tmp < 0) { 622 SPDK_ERRLOG("Invalid block size for URING bdev with file %s\n", file); 623 i++; 624 continue; 625 } 626 block_size = (uint32_t)tmp; 627 } 628 629 bdev = create_uring_bdev(name, file, block_size); 630 if (!bdev) { 631 SPDK_ERRLOG("Unable to create URING bdev from file %s\n", file); 632 i++; 633 continue; 634 } 635 636 i++; 637 } 638 639 return 0; 640 } 641 642 static void 643 bdev_uring_fini(void) 644 { 645 spdk_io_device_unregister(&uring_if, NULL); 646 } 647 648 static void 649 bdev_uring_get_spdk_running_config(FILE *fp) 650 { 651 char *file; 652 char *name; 653 uint32_t block_size; 654 struct bdev_uring *uring; 655 656 fprintf(fp, 657 "\n" 658 "# Users must change this section to match the /dev/sdX devices to be\n" 659 "# exported as iSCSI LUNs. The devices are accessed using io_uring.\n" 660 "# The format is:\n" 661 "# URING <file name> <bdev name> [<block size>]\n" 662 "# The file name is the backing device\n" 663 "# The bdev name can be referenced from elsewhere in the configuration file.\n" 664 "# Block size may be omitted to automatically detect the block size of a bdev.\n" 665 "[URING]\n"); 666 667 TAILQ_FOREACH(uring, &g_uring_bdev_head, link) { 668 file = uring->filename; 669 name = uring->bdev.name; 670 block_size = uring->bdev.blocklen; 671 fprintf(fp, " URING %s %s %d\n", file, name, block_size); 672 } 673 fprintf(fp, "\n"); 674 } 675 676 SPDK_LOG_REGISTER_COMPONENT("uring", SPDK_LOG_URING) 677