1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "bdev_uring.h" 35 36 #include "spdk/stdinc.h" 37 38 #include "spdk/barrier.h" 39 #include "spdk/bdev.h" 40 #include "spdk/conf.h" 41 #include "spdk/env.h" 42 #include "spdk/fd.h" 43 #include "spdk/likely.h" 44 #include "spdk/thread.h" 45 #include "spdk/json.h" 46 #include "spdk/util.h" 47 #include "spdk/string.h" 48 49 #include "spdk_internal/log.h" 50 #include "spdk_internal/uring.h" 51 52 struct bdev_uring_io_channel { 53 struct bdev_uring_group_channel *group_ch; 54 }; 55 56 struct bdev_uring_group_channel { 57 uint64_t io_inflight; 58 uint64_t io_pending; 59 struct spdk_poller *poller; 60 struct io_uring uring; 61 }; 62 63 struct bdev_uring_task { 64 uint64_t len; 65 struct bdev_uring_io_channel *ch; 66 TAILQ_ENTRY(bdev_uring_task) link; 67 }; 68 69 struct bdev_uring { 70 struct spdk_bdev bdev; 71 char *filename; 72 int fd; 73 TAILQ_ENTRY(bdev_uring) link; 74 }; 75 76 static int bdev_uring_init(void); 77 static void bdev_uring_fini(void); 78 static void uring_free_bdev(struct bdev_uring *uring); 79 static void bdev_uring_get_spdk_running_config(FILE *fp); 80 static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head; 81 82 #define SPDK_URING_QUEUE_DEPTH 512 83 #define MAX_EVENTS_PER_POLL 32 84 85 static int 86 bdev_uring_get_ctx_size(void) 87 { 88 return sizeof(struct bdev_uring_task); 89 } 90 91 static struct spdk_bdev_module uring_if = { 92 .name = "uring", 93 .module_init = bdev_uring_init, 94 .module_fini = bdev_uring_fini, 95 .config_text = bdev_uring_get_spdk_running_config, 96 .get_ctx_size = bdev_uring_get_ctx_size, 97 }; 98 99 SPDK_BDEV_MODULE_REGISTER(uring, &uring_if) 100 101 static int 102 bdev_uring_open(struct bdev_uring *bdev) 103 { 104 int fd; 105 106 fd = open(bdev->filename, O_RDWR | O_DIRECT | O_NOATIME); 107 if (fd < 0) { 108 /* Try without O_DIRECT for non-disk files */ 109 fd = open(bdev->filename, O_RDWR | O_NOATIME); 110 if (fd < 0) { 111 SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n", 112 bdev->filename, errno, spdk_strerror(errno)); 113 bdev->fd = -1; 114 return -1; 115 } 116 } 117 118 bdev->fd = fd; 119 120 return 0; 121 } 122 123 static int 124 bdev_uring_close(struct bdev_uring *bdev) 125 { 126 int rc; 127 128 if (bdev->fd == -1) { 129 return 0; 130 } 131 132 rc = close(bdev->fd); 133 if (rc < 0) { 134 SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n", 135 bdev->fd, errno, spdk_strerror(errno)); 136 return -1; 137 } 138 139 bdev->fd = -1; 140 141 return 0; 142 } 143 144 static int64_t 145 bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch, 146 struct bdev_uring_task *uring_task, 147 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) 148 { 149 struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); 150 struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; 151 struct io_uring_sqe *sqe; 152 153 sqe = io_uring_get_sqe(&group_ch->uring); 154 io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset); 155 io_uring_sqe_set_data(sqe, uring_task); 156 uring_task->len = nbytes; 157 uring_task->ch = uring_ch; 158 159 SPDK_DEBUGLOG(SPDK_LOG_URING, "read %d iovs size %lu to off: %#lx\n", 160 iovcnt, nbytes, offset); 161 162 group_ch->io_pending++; 163 return nbytes; 164 } 165 166 static int64_t 167 bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch, 168 struct bdev_uring_task *uring_task, 169 struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset) 170 { 171 struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); 172 struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; 173 struct io_uring_sqe *sqe; 174 175 sqe = io_uring_get_sqe(&group_ch->uring); 176 io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset); 177 io_uring_sqe_set_data(sqe, uring_task); 178 uring_task->len = nbytes; 179 uring_task->ch = uring_ch; 180 181 SPDK_DEBUGLOG(SPDK_LOG_URING, "write %d iovs size %lu from off: %#lx\n", 182 iovcnt, nbytes, offset); 183 184 group_ch->io_pending++; 185 return nbytes; 186 } 187 188 static int 189 bdev_uring_destruct(void *ctx) 190 { 191 struct bdev_uring *uring = ctx; 192 int rc = 0; 193 194 TAILQ_REMOVE(&g_uring_bdev_head, uring, link); 195 rc = bdev_uring_close(uring); 196 if (rc < 0) { 197 SPDK_ERRLOG("bdev_uring_close() failed\n"); 198 } 199 spdk_io_device_unregister(uring, NULL); 200 uring_free_bdev(uring); 201 return rc; 202 } 203 204 static int 205 bdev_uring_reap(struct io_uring *ring, int max) 206 { 207 int i, count, ret; 208 struct io_uring_cqe *cqe; 209 struct bdev_uring_task *uring_task; 210 enum spdk_bdev_io_status status; 211 212 count = 0; 213 for (i = 0; i < max; i++) { 214 ret = io_uring_peek_cqe(ring, &cqe); 215 if (ret != 0) { 216 return ret; 217 } 218 219 if (cqe == NULL) { 220 return count; 221 } 222 223 uring_task = (struct bdev_uring_task *)cqe->user_data; 224 if (cqe->res != (signed)uring_task->len) { 225 status = SPDK_BDEV_IO_STATUS_FAILED; 226 } else { 227 status = SPDK_BDEV_IO_STATUS_SUCCESS; 228 } 229 230 uring_task->ch->group_ch->io_inflight--; 231 io_uring_cqe_seen(ring, cqe); 232 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status); 233 count++; 234 } 235 236 return count; 237 } 238 239 static int 240 bdev_uring_group_poll(void *arg) 241 { 242 struct bdev_uring_group_channel *group_ch = arg; 243 int to_complete, to_submit; 244 int count, ret; 245 246 to_submit = group_ch->io_pending; 247 to_complete = group_ch->io_inflight; 248 249 ret = 0; 250 if (to_submit > 0) { 251 /* If there are I/O to submit, use io_uring_submit here. 252 * It will automatically call spdk_io_uring_enter appropriately. */ 253 ret = io_uring_submit(&group_ch->uring); 254 group_ch->io_pending = 0; 255 group_ch->io_inflight += to_submit; 256 } else if (to_complete > 0) { 257 /* If there are I/O in flight but none to submit, we need to 258 * call io_uring_enter ourselves. */ 259 ret = spdk_io_uring_enter(group_ch->uring.ring_fd, 0, 0, 260 IORING_ENTER_GETEVENTS); 261 } 262 263 if (ret < 0) { 264 return 1; 265 } 266 267 count = 0; 268 if (to_complete > 0) { 269 count = bdev_uring_reap(&group_ch->uring, to_complete); 270 } 271 272 return (count + to_submit); 273 } 274 275 static void bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 276 bool success) 277 { 278 if (!success) { 279 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 280 return; 281 } 282 283 switch (bdev_io->type) { 284 case SPDK_BDEV_IO_TYPE_READ: 285 bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt, 286 ch, 287 (struct bdev_uring_task *)bdev_io->driver_ctx, 288 bdev_io->u.bdev.iovs, 289 bdev_io->u.bdev.iovcnt, 290 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 291 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 292 break; 293 case SPDK_BDEV_IO_TYPE_WRITE: 294 bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt, 295 ch, 296 (struct bdev_uring_task *)bdev_io->driver_ctx, 297 bdev_io->u.bdev.iovs, 298 bdev_io->u.bdev.iovcnt, 299 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 300 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 301 break; 302 default: 303 SPDK_ERRLOG("Wrong io type\n"); 304 break; 305 } 306 } 307 308 static int _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 309 { 310 switch (bdev_io->type) { 311 /* Read and write operations must be performed on buffers aligned to 312 * bdev->required_alignment. If user specified unaligned buffers, 313 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */ 314 case SPDK_BDEV_IO_TYPE_READ: 315 case SPDK_BDEV_IO_TYPE_WRITE: 316 spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb, 317 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 318 return 0; 319 default: 320 return -1; 321 } 322 } 323 324 static void bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 325 { 326 if (_bdev_uring_submit_request(ch, bdev_io) < 0) { 327 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 328 } 329 } 330 331 static bool 332 bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 333 { 334 switch (io_type) { 335 case SPDK_BDEV_IO_TYPE_READ: 336 case SPDK_BDEV_IO_TYPE_WRITE: 337 return true; 338 default: 339 return false; 340 } 341 } 342 343 static int 344 bdev_uring_create_cb(void *io_device, void *ctx_buf) 345 { 346 struct bdev_uring_io_channel *ch = ctx_buf; 347 348 ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if)); 349 350 return 0; 351 } 352 353 static void 354 bdev_uring_destroy_cb(void *io_device, void *ctx_buf) 355 { 356 struct bdev_uring_io_channel *ch = ctx_buf; 357 358 spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch)); 359 } 360 361 static struct spdk_io_channel * 362 bdev_uring_get_io_channel(void *ctx) 363 { 364 struct bdev_uring *uring = ctx; 365 366 return spdk_get_io_channel(uring); 367 } 368 369 static int 370 bdev_uring_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 371 { 372 struct bdev_uring *uring = ctx; 373 374 spdk_json_write_named_object_begin(w, "uring"); 375 376 spdk_json_write_named_string(w, "filename", uring->filename); 377 378 spdk_json_write_object_end(w); 379 380 return 0; 381 } 382 383 static void 384 bdev_uring_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 385 { 386 struct bdev_uring *uring = bdev->ctxt; 387 388 spdk_json_write_object_begin(w); 389 390 spdk_json_write_named_string(w, "method", "bdev_uring_create"); 391 392 spdk_json_write_named_object_begin(w, "params"); 393 spdk_json_write_named_string(w, "name", bdev->name); 394 spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); 395 spdk_json_write_named_string(w, "filename", uring->filename); 396 spdk_json_write_object_end(w); 397 398 spdk_json_write_object_end(w); 399 } 400 401 static const struct spdk_bdev_fn_table uring_fn_table = { 402 .destruct = bdev_uring_destruct, 403 .submit_request = bdev_uring_submit_request, 404 .io_type_supported = bdev_uring_io_type_supported, 405 .get_io_channel = bdev_uring_get_io_channel, 406 .dump_info_json = bdev_uring_dump_info_json, 407 .write_config_json = bdev_uring_write_json_config, 408 }; 409 410 static void uring_free_bdev(struct bdev_uring *uring) 411 { 412 if (uring == NULL) { 413 return; 414 } 415 free(uring->filename); 416 free(uring->bdev.name); 417 free(uring); 418 } 419 420 static int 421 bdev_uring_group_create_cb(void *io_device, void *ctx_buf) 422 { 423 struct bdev_uring_group_channel *ch = ctx_buf; 424 425 if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, IORING_SETUP_IOPOLL) < 0) { 426 SPDK_ERRLOG("uring I/O context setup failure\n"); 427 return -1; 428 } 429 430 ch->poller = spdk_poller_register(bdev_uring_group_poll, ch, 0); 431 return 0; 432 } 433 434 static void 435 bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf) 436 { 437 struct bdev_uring_group_channel *ch = ctx_buf; 438 439 close(ch->uring.ring_fd); 440 io_uring_queue_exit(&ch->uring); 441 442 spdk_poller_unregister(&ch->poller); 443 } 444 445 struct spdk_bdev * 446 create_uring_bdev(const char *name, const char *filename, uint32_t block_size) 447 { 448 struct bdev_uring *uring; 449 uint32_t detected_block_size; 450 uint64_t bdev_size; 451 int rc; 452 453 uring = calloc(1, sizeof(*uring)); 454 if (!uring) { 455 SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n"); 456 return NULL; 457 } 458 459 uring->filename = strdup(filename); 460 if (!uring->filename) { 461 goto error_return; 462 } 463 464 if (bdev_uring_open(uring)) { 465 SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno); 466 goto error_return; 467 } 468 469 bdev_size = spdk_fd_get_size(uring->fd); 470 471 uring->bdev.name = strdup(name); 472 if (!uring->bdev.name) { 473 goto error_return; 474 } 475 uring->bdev.product_name = "URING bdev"; 476 uring->bdev.module = &uring_if; 477 478 uring->bdev.write_cache = 1; 479 480 detected_block_size = spdk_fd_get_blocklen(uring->fd); 481 if (block_size == 0) { 482 /* User did not specify block size - use autodetected block size. */ 483 if (detected_block_size == 0) { 484 SPDK_ERRLOG("Block size could not be auto-detected\n"); 485 goto error_return; 486 } 487 block_size = detected_block_size; 488 } else { 489 if (block_size < detected_block_size) { 490 SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than " 491 "auto-detected block size %" PRIu32 "\n", 492 block_size, detected_block_size); 493 goto error_return; 494 } else if (detected_block_size != 0 && block_size != detected_block_size) { 495 SPDK_WARNLOG("Specified block size %" PRIu32 " does not match " 496 "auto-detected block size %" PRIu32 "\n", 497 block_size, detected_block_size); 498 } 499 } 500 501 if (block_size < 512) { 502 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size); 503 goto error_return; 504 } 505 506 if (!spdk_u32_is_pow2(block_size)) { 507 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size); 508 goto error_return; 509 } 510 511 uring->bdev.blocklen = block_size; 512 uring->bdev.required_alignment = spdk_u32log2(block_size); 513 514 if (bdev_size % uring->bdev.blocklen != 0) { 515 SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", 516 bdev_size, uring->bdev.blocklen); 517 goto error_return; 518 } 519 520 uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen; 521 uring->bdev.ctxt = uring; 522 523 uring->bdev.fn_table = &uring_fn_table; 524 525 spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb, 526 sizeof(struct bdev_uring_io_channel), 527 uring->bdev.name); 528 rc = spdk_bdev_register(&uring->bdev); 529 if (rc) { 530 spdk_io_device_unregister(uring, NULL); 531 goto error_return; 532 } 533 534 TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link); 535 return &uring->bdev; 536 537 error_return: 538 bdev_uring_close(uring); 539 uring_free_bdev(uring); 540 return NULL; 541 } 542 543 struct delete_uring_bdev_ctx { 544 spdk_delete_uring_complete cb_fn; 545 void *cb_arg; 546 }; 547 548 static void 549 uring_bdev_unregister_cb(void *arg, int bdeverrno) 550 { 551 struct delete_uring_bdev_ctx *ctx = arg; 552 553 ctx->cb_fn(ctx->cb_arg, bdeverrno); 554 free(ctx); 555 } 556 557 void 558 delete_uring_bdev(struct spdk_bdev *bdev, spdk_delete_uring_complete cb_fn, void *cb_arg) 559 { 560 struct delete_uring_bdev_ctx *ctx; 561 562 if (!bdev || bdev->module != &uring_if) { 563 cb_fn(cb_arg, -ENODEV); 564 return; 565 } 566 567 ctx = calloc(1, sizeof(*ctx)); 568 if (ctx == NULL) { 569 cb_fn(cb_arg, -ENOMEM); 570 return; 571 } 572 573 ctx->cb_fn = cb_fn; 574 ctx->cb_arg = cb_arg; 575 spdk_bdev_unregister(bdev, uring_bdev_unregister_cb, ctx); 576 } 577 578 static int 579 bdev_uring_init(void) 580 { 581 size_t i; 582 struct spdk_conf_section *sp; 583 struct spdk_bdev *bdev; 584 585 TAILQ_INIT(&g_uring_bdev_head); 586 spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb, 587 sizeof(struct bdev_uring_group_channel), 588 "uring_module"); 589 590 sp = spdk_conf_find_section(NULL, "URING"); 591 if (!sp) { 592 return 0; 593 } 594 595 i = 0; 596 while (true) { 597 const char *file; 598 const char *name; 599 const char *block_size_str; 600 uint32_t block_size = 0; 601 long int tmp; 602 603 file = spdk_conf_section_get_nmval(sp, "URING", i, 0); 604 if (!file) { 605 break; 606 } 607 608 name = spdk_conf_section_get_nmval(sp, "URING", i, 1); 609 if (!name) { 610 SPDK_ERRLOG("No name provided for URING bdev with file %s\n", file); 611 i++; 612 continue; 613 } 614 615 block_size_str = spdk_conf_section_get_nmval(sp, "URING", i, 2); 616 if (block_size_str) { 617 tmp = spdk_strtol(block_size_str, 10); 618 if (tmp < 0) { 619 SPDK_ERRLOG("Invalid block size for URING bdev with file %s\n", file); 620 i++; 621 continue; 622 } 623 block_size = (uint32_t)tmp; 624 } 625 626 bdev = create_uring_bdev(name, file, block_size); 627 if (!bdev) { 628 SPDK_ERRLOG("Unable to create URING bdev from file %s\n", file); 629 i++; 630 continue; 631 } 632 633 i++; 634 } 635 636 return 0; 637 } 638 639 static void 640 bdev_uring_fini(void) 641 { 642 spdk_io_device_unregister(&uring_if, NULL); 643 } 644 645 static void 646 bdev_uring_get_spdk_running_config(FILE *fp) 647 { 648 char *file; 649 char *name; 650 uint32_t block_size; 651 struct bdev_uring *uring; 652 653 fprintf(fp, 654 "\n" 655 "# Users must change this section to match the /dev/sdX devices to be\n" 656 "# exported as iSCSI LUNs. The devices are accessed using io_uring.\n" 657 "# The format is:\n" 658 "# URING <file name> <bdev name> [<block size>]\n" 659 "# The file name is the backing device\n" 660 "# The bdev name can be referenced from elsewhere in the configuration file.\n" 661 "# Block size may be omitted to automatically detect the block size of a bdev.\n" 662 "[URING]\n"); 663 664 TAILQ_FOREACH(uring, &g_uring_bdev_head, link) { 665 file = uring->filename; 666 name = uring->bdev.name; 667 block_size = uring->bdev.blocklen; 668 fprintf(fp, " URING %s %s %d\n", file, name, block_size); 669 } 670 fprintf(fp, "\n"); 671 } 672 673 SPDK_LOG_REGISTER_COMPONENT("uring", SPDK_LOG_URING) 674