1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "bdev_uring.h" 35 36 #include "spdk/stdinc.h" 37 38 #include "spdk/barrier.h" 39 #include "spdk/bdev.h" 40 #include "spdk/conf.h" 41 #include "spdk/env.h" 42 #include "spdk/fd.h" 43 #include "spdk/likely.h" 44 #include "spdk/thread.h" 45 #include "spdk/json.h" 46 #include "spdk/util.h" 47 #include "spdk/string.h" 48 49 #include "spdk_internal/log.h" 50 51 #include <liburing.h> 52 53 struct bdev_uring_io_channel { 54 struct bdev_uring_group_channel *group_ch; 55 }; 56 57 struct bdev_uring_group_channel { 58 uint64_t io_inflight; 59 uint64_t io_pending; 60 struct spdk_poller *poller; 61 struct io_uring uring; 62 }; 63 64 struct bdev_uring_task { 65 uint64_t len; 66 struct bdev_uring_io_channel *ch; 67 TAILQ_ENTRY(bdev_uring_task) link; 68 }; 69 70 struct bdev_uring { 71 struct spdk_bdev bdev; 72 char *filename; 73 int fd; 74 TAILQ_ENTRY(bdev_uring) link; 75 }; 76 77 static int bdev_uring_init(void); 78 static void bdev_uring_fini(void); 79 static void uring_free_bdev(struct bdev_uring *uring); 80 static void bdev_uring_get_spdk_running_config(FILE *fp); 81 static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head; 82 83 #define SPDK_URING_QUEUE_DEPTH 512 84 #define MAX_EVENTS_PER_POLL 32 85 86 static int 87 bdev_uring_get_ctx_size(void) 88 { 89 return sizeof(struct bdev_uring_task); 90 } 91 92 static struct spdk_bdev_module uring_if = { 93 .name = "uring", 94 .module_init = bdev_uring_init, 95 .module_fini = bdev_uring_fini, 96 .config_text = bdev_uring_get_spdk_running_config, 97 .get_ctx_size = bdev_uring_get_ctx_size, 98 }; 99 100 SPDK_BDEV_MODULE_REGISTER(uring, &uring_if) 101 102 static int 103 bdev_uring_open(struct bdev_uring *bdev) 104 { 105 int fd; 106 107 fd = open(bdev->filename, O_RDWR | O_DIRECT | O_NOATIME); 108 if (fd < 0) { 109 /* Try without O_DIRECT for non-disk files */ 110 fd = open(bdev->filename, O_RDWR | O_NOATIME); 111 if (fd < 0) { 112 SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n", 113 bdev->filename, errno, spdk_strerror(errno)); 114 bdev->fd = -1; 115 return -1; 116 } 117 } 118 119 bdev->fd = fd; 120 121 return 0; 122 } 123 124 static int 125 bdev_uring_close(struct bdev_uring *bdev) 126 { 127 int rc; 128 129 if (bdev->fd == -1) { 130 return 0; 131 } 132 133 rc = close(bdev->fd); 134 if (rc < 0) { 135 SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n", 136 bdev->fd, errno, spdk_strerror(errno)); 137 return -1; 138 } 139 140 bdev->fd = -1; 141 142 return 0; 143 } 144 145 static int64_t 146 bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch, 147 struct bdev_uring_task *uring_task, 148 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) 149 { 150 struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); 151 struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; 152 struct io_uring_sqe *sqe; 153 154 sqe = io_uring_get_sqe(&group_ch->uring); 155 io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset); 156 io_uring_sqe_set_data(sqe, uring_task); 157 uring_task->len = nbytes; 158 uring_task->ch = uring_ch; 159 160 SPDK_DEBUGLOG(SPDK_LOG_URING, "read %d iovs size %lu to off: %#lx\n", 161 iovcnt, nbytes, offset); 162 163 group_ch->io_pending++; 164 return nbytes; 165 } 166 167 static int64_t 168 bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch, 169 struct bdev_uring_task *uring_task, 170 struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset) 171 { 172 struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); 173 struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; 174 struct io_uring_sqe *sqe; 175 176 sqe = io_uring_get_sqe(&group_ch->uring); 177 io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset); 178 io_uring_sqe_set_data(sqe, uring_task); 179 uring_task->len = nbytes; 180 uring_task->ch = uring_ch; 181 182 SPDK_DEBUGLOG(SPDK_LOG_URING, "write %d iovs size %lu from off: %#lx\n", 183 iovcnt, nbytes, offset); 184 185 group_ch->io_pending++; 186 return nbytes; 187 } 188 189 static int 190 bdev_uring_destruct(void *ctx) 191 { 192 struct bdev_uring *uring = ctx; 193 int rc = 0; 194 195 TAILQ_REMOVE(&g_uring_bdev_head, uring, link); 196 rc = bdev_uring_close(uring); 197 if (rc < 0) { 198 SPDK_ERRLOG("bdev_uring_close() failed\n"); 199 } 200 spdk_io_device_unregister(uring, NULL); 201 uring_free_bdev(uring); 202 return rc; 203 } 204 205 static int 206 bdev_uring_reap(struct io_uring *ring, int max) 207 { 208 int i, count, ret; 209 struct io_uring_cqe *cqe; 210 struct bdev_uring_task *uring_task; 211 enum spdk_bdev_io_status status; 212 213 count = 0; 214 for (i = 0; i < max; i++) { 215 ret = io_uring_peek_cqe(ring, &cqe); 216 if (ret != 0) { 217 return ret; 218 } 219 220 if (cqe == NULL) { 221 return count; 222 } 223 224 uring_task = (struct bdev_uring_task *)cqe->user_data; 225 if (cqe->res != (signed)uring_task->len) { 226 status = SPDK_BDEV_IO_STATUS_FAILED; 227 } else { 228 status = SPDK_BDEV_IO_STATUS_SUCCESS; 229 } 230 231 uring_task->ch->group_ch->io_inflight--; 232 io_uring_cqe_seen(ring, cqe); 233 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status); 234 count++; 235 } 236 237 return count; 238 } 239 240 static int 241 bdev_uring_group_poll(void *arg) 242 { 243 struct bdev_uring_group_channel *group_ch = arg; 244 int to_complete, to_submit; 245 int count, ret; 246 247 to_submit = group_ch->io_pending; 248 to_complete = group_ch->io_inflight; 249 250 ret = 0; 251 if (to_submit > 0) { 252 /* If there are I/O to submit, use io_uring_submit here. 253 * It will automatically call io_uring_enter appropriately. */ 254 ret = io_uring_submit(&group_ch->uring); 255 group_ch->io_pending = 0; 256 group_ch->io_inflight += to_submit; 257 } else if (to_complete > 0) { 258 /* If there are I/O in flight but none to submit, we need to 259 * call io_uring_enter ourselves. */ 260 ret = io_uring_enter(group_ch->uring.ring_fd, 0, 0, 261 IORING_ENTER_GETEVENTS, NULL); 262 } 263 264 if (ret < 0) { 265 return 1; 266 } 267 268 count = 0; 269 if (to_complete > 0) { 270 count = bdev_uring_reap(&group_ch->uring, to_complete); 271 } 272 273 return (count + to_submit); 274 } 275 276 static void bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 277 bool success) 278 { 279 if (!success) { 280 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 281 return; 282 } 283 284 switch (bdev_io->type) { 285 case SPDK_BDEV_IO_TYPE_READ: 286 bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt, 287 ch, 288 (struct bdev_uring_task *)bdev_io->driver_ctx, 289 bdev_io->u.bdev.iovs, 290 bdev_io->u.bdev.iovcnt, 291 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 292 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 293 break; 294 case SPDK_BDEV_IO_TYPE_WRITE: 295 bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt, 296 ch, 297 (struct bdev_uring_task *)bdev_io->driver_ctx, 298 bdev_io->u.bdev.iovs, 299 bdev_io->u.bdev.iovcnt, 300 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 301 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 302 break; 303 default: 304 SPDK_ERRLOG("Wrong io type\n"); 305 break; 306 } 307 } 308 309 static int _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 310 { 311 switch (bdev_io->type) { 312 /* Read and write operations must be performed on buffers aligned to 313 * bdev->required_alignment. If user specified unaligned buffers, 314 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */ 315 case SPDK_BDEV_IO_TYPE_READ: 316 case SPDK_BDEV_IO_TYPE_WRITE: 317 spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb, 318 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 319 return 0; 320 default: 321 return -1; 322 } 323 } 324 325 static void bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 326 { 327 if (_bdev_uring_submit_request(ch, bdev_io) < 0) { 328 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 329 } 330 } 331 332 static bool 333 bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 334 { 335 switch (io_type) { 336 case SPDK_BDEV_IO_TYPE_READ: 337 case SPDK_BDEV_IO_TYPE_WRITE: 338 return true; 339 default: 340 return false; 341 } 342 } 343 344 static int 345 bdev_uring_create_cb(void *io_device, void *ctx_buf) 346 { 347 struct bdev_uring_io_channel *ch = ctx_buf; 348 349 ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if)); 350 351 return 0; 352 } 353 354 static void 355 bdev_uring_destroy_cb(void *io_device, void *ctx_buf) 356 { 357 struct bdev_uring_io_channel *ch = ctx_buf; 358 359 spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch)); 360 } 361 362 static struct spdk_io_channel * 363 bdev_uring_get_io_channel(void *ctx) 364 { 365 struct bdev_uring *uring = ctx; 366 367 return spdk_get_io_channel(uring); 368 } 369 370 static int 371 bdev_uring_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 372 { 373 struct bdev_uring *uring = ctx; 374 375 spdk_json_write_named_object_begin(w, "uring"); 376 377 spdk_json_write_named_string(w, "filename", uring->filename); 378 379 spdk_json_write_object_end(w); 380 381 return 0; 382 } 383 384 static void 385 bdev_uring_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 386 { 387 struct bdev_uring *uring = bdev->ctxt; 388 389 spdk_json_write_object_begin(w); 390 391 spdk_json_write_named_string(w, "method", "bdev_uring_create"); 392 393 spdk_json_write_named_object_begin(w, "params"); 394 spdk_json_write_named_string(w, "name", bdev->name); 395 spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); 396 spdk_json_write_named_string(w, "filename", uring->filename); 397 spdk_json_write_object_end(w); 398 399 spdk_json_write_object_end(w); 400 } 401 402 static const struct spdk_bdev_fn_table uring_fn_table = { 403 .destruct = bdev_uring_destruct, 404 .submit_request = bdev_uring_submit_request, 405 .io_type_supported = bdev_uring_io_type_supported, 406 .get_io_channel = bdev_uring_get_io_channel, 407 .dump_info_json = bdev_uring_dump_info_json, 408 .write_config_json = bdev_uring_write_json_config, 409 }; 410 411 static void uring_free_bdev(struct bdev_uring *uring) 412 { 413 if (uring == NULL) { 414 return; 415 } 416 free(uring->filename); 417 free(uring->bdev.name); 418 free(uring); 419 } 420 421 static int 422 bdev_uring_group_create_cb(void *io_device, void *ctx_buf) 423 { 424 struct bdev_uring_group_channel *ch = ctx_buf; 425 426 if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, IORING_SETUP_IOPOLL) < 0) { 427 SPDK_ERRLOG("uring I/O context setup failure\n"); 428 return -1; 429 } 430 431 ch->poller = spdk_poller_register(bdev_uring_group_poll, ch, 0); 432 return 0; 433 } 434 435 static void 436 bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf) 437 { 438 struct bdev_uring_group_channel *ch = ctx_buf; 439 440 close(ch->uring.ring_fd); 441 io_uring_queue_exit(&ch->uring); 442 443 spdk_poller_unregister(&ch->poller); 444 } 445 446 struct spdk_bdev * 447 create_uring_bdev(const char *name, const char *filename, uint32_t block_size) 448 { 449 struct bdev_uring *uring; 450 uint32_t detected_block_size; 451 uint64_t bdev_size; 452 int rc; 453 454 uring = calloc(1, sizeof(*uring)); 455 if (!uring) { 456 SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n"); 457 return NULL; 458 } 459 460 uring->filename = strdup(filename); 461 if (!uring->filename) { 462 goto error_return; 463 } 464 465 if (bdev_uring_open(uring)) { 466 SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno); 467 goto error_return; 468 } 469 470 bdev_size = spdk_fd_get_size(uring->fd); 471 472 uring->bdev.name = strdup(name); 473 if (!uring->bdev.name) { 474 goto error_return; 475 } 476 uring->bdev.product_name = "URING bdev"; 477 uring->bdev.module = &uring_if; 478 479 uring->bdev.write_cache = 1; 480 481 detected_block_size = spdk_fd_get_blocklen(uring->fd); 482 if (block_size == 0) { 483 /* User did not specify block size - use autodetected block size. */ 484 if (detected_block_size == 0) { 485 SPDK_ERRLOG("Block size could not be auto-detected\n"); 486 goto error_return; 487 } 488 block_size = detected_block_size; 489 } else { 490 if (block_size < detected_block_size) { 491 SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than " 492 "auto-detected block size %" PRIu32 "\n", 493 block_size, detected_block_size); 494 goto error_return; 495 } else if (detected_block_size != 0 && block_size != detected_block_size) { 496 SPDK_WARNLOG("Specified block size %" PRIu32 " does not match " 497 "auto-detected block size %" PRIu32 "\n", 498 block_size, detected_block_size); 499 } 500 } 501 502 if (block_size < 512) { 503 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size); 504 goto error_return; 505 } 506 507 if (!spdk_u32_is_pow2(block_size)) { 508 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size); 509 goto error_return; 510 } 511 512 uring->bdev.blocklen = block_size; 513 uring->bdev.required_alignment = spdk_u32log2(block_size); 514 515 if (bdev_size % uring->bdev.blocklen != 0) { 516 SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", 517 bdev_size, uring->bdev.blocklen); 518 goto error_return; 519 } 520 521 uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen; 522 uring->bdev.ctxt = uring; 523 524 uring->bdev.fn_table = &uring_fn_table; 525 526 spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb, 527 sizeof(struct bdev_uring_io_channel), 528 uring->bdev.name); 529 rc = spdk_bdev_register(&uring->bdev); 530 if (rc) { 531 spdk_io_device_unregister(uring, NULL); 532 goto error_return; 533 } 534 535 TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link); 536 return &uring->bdev; 537 538 error_return: 539 bdev_uring_close(uring); 540 uring_free_bdev(uring); 541 return NULL; 542 } 543 544 struct delete_uring_bdev_ctx { 545 spdk_delete_uring_complete cb_fn; 546 void *cb_arg; 547 }; 548 549 static void 550 uring_bdev_unregister_cb(void *arg, int bdeverrno) 551 { 552 struct delete_uring_bdev_ctx *ctx = arg; 553 554 ctx->cb_fn(ctx->cb_arg, bdeverrno); 555 free(ctx); 556 } 557 558 void 559 delete_uring_bdev(struct spdk_bdev *bdev, spdk_delete_uring_complete cb_fn, void *cb_arg) 560 { 561 struct delete_uring_bdev_ctx *ctx; 562 563 if (!bdev || bdev->module != &uring_if) { 564 cb_fn(cb_arg, -ENODEV); 565 return; 566 } 567 568 ctx = calloc(1, sizeof(*ctx)); 569 if (ctx == NULL) { 570 cb_fn(cb_arg, -ENOMEM); 571 return; 572 } 573 574 ctx->cb_fn = cb_fn; 575 ctx->cb_arg = cb_arg; 576 spdk_bdev_unregister(bdev, uring_bdev_unregister_cb, ctx); 577 } 578 579 static int 580 bdev_uring_init(void) 581 { 582 size_t i; 583 struct spdk_conf_section *sp; 584 struct spdk_bdev *bdev; 585 586 TAILQ_INIT(&g_uring_bdev_head); 587 spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb, 588 sizeof(struct bdev_uring_group_channel), 589 "uring_module"); 590 591 sp = spdk_conf_find_section(NULL, "URING"); 592 if (!sp) { 593 return 0; 594 } 595 596 i = 0; 597 while (true) { 598 const char *file; 599 const char *name; 600 const char *block_size_str; 601 uint32_t block_size = 0; 602 long int tmp; 603 604 file = spdk_conf_section_get_nmval(sp, "URING", i, 0); 605 if (!file) { 606 break; 607 } 608 609 name = spdk_conf_section_get_nmval(sp, "URING", i, 1); 610 if (!name) { 611 SPDK_ERRLOG("No name provided for URING bdev with file %s\n", file); 612 i++; 613 continue; 614 } 615 616 block_size_str = spdk_conf_section_get_nmval(sp, "URING", i, 2); 617 if (block_size_str) { 618 tmp = spdk_strtol(block_size_str, 10); 619 if (tmp < 0) { 620 SPDK_ERRLOG("Invalid block size for URING bdev with file %s\n", file); 621 i++; 622 continue; 623 } 624 block_size = (uint32_t)tmp; 625 } 626 627 bdev = create_uring_bdev(name, file, block_size); 628 if (!bdev) { 629 SPDK_ERRLOG("Unable to create URING bdev from file %s\n", file); 630 i++; 631 continue; 632 } 633 634 i++; 635 } 636 637 return 0; 638 } 639 640 static void 641 bdev_uring_fini(void) 642 { 643 spdk_io_device_unregister(&uring_if, NULL); 644 } 645 646 static void 647 bdev_uring_get_spdk_running_config(FILE *fp) 648 { 649 char *file; 650 char *name; 651 uint32_t block_size; 652 struct bdev_uring *uring; 653 654 fprintf(fp, 655 "\n" 656 "# Users must change this section to match the /dev/sdX devices to be\n" 657 "# exported as iSCSI LUNs. The devices are accessed using io_uring.\n" 658 "# The format is:\n" 659 "# URING <file name> <bdev name> [<block size>]\n" 660 "# The file name is the backing device\n" 661 "# The bdev name can be referenced from elsewhere in the configuration file.\n" 662 "# Block size may be omitted to automatically detect the block size of a bdev.\n" 663 "[URING]\n"); 664 665 TAILQ_FOREACH(uring, &g_uring_bdev_head, link) { 666 file = uring->filename; 667 name = uring->bdev.name; 668 block_size = uring->bdev.blocklen; 669 fprintf(fp, " URING %s %s %d\n", file, name, block_size); 670 } 671 fprintf(fp, "\n"); 672 } 673 674 SPDK_LOG_REGISTER_COMPONENT("uring", SPDK_LOG_URING) 675