1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "bdev_uring.h" 35 36 #include "spdk/stdinc.h" 37 38 #include "spdk/barrier.h" 39 #include "spdk/bdev.h" 40 #include "spdk/conf.h" 41 #include "spdk/env.h" 42 #include "spdk/fd.h" 43 #include "spdk/likely.h" 44 #include "spdk/thread.h" 45 #include "spdk/json.h" 46 #include "spdk/util.h" 47 #include "spdk/string.h" 48 49 #include "spdk_internal/log.h" 50 #include "spdk_internal/uring.h" 51 52 struct bdev_uring_io_channel { 53 struct bdev_uring_group_channel *group_ch; 54 }; 55 56 struct bdev_uring_group_channel { 57 uint64_t io_inflight; 58 uint64_t io_pending; 59 struct spdk_poller *poller; 60 struct io_uring uring; 61 }; 62 63 struct bdev_uring_task { 64 uint64_t len; 65 struct bdev_uring_io_channel *ch; 66 TAILQ_ENTRY(bdev_uring_task) link; 67 }; 68 69 struct bdev_uring { 70 struct spdk_bdev bdev; 71 char *filename; 72 int fd; 73 TAILQ_ENTRY(bdev_uring) link; 74 }; 75 76 static int bdev_uring_init(void); 77 static void bdev_uring_fini(void); 78 static void uring_free_bdev(struct bdev_uring *uring); 79 static void bdev_uring_get_spdk_running_config(FILE *fp); 80 static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head; 81 82 #define SPDK_URING_QUEUE_DEPTH 512 83 #define MAX_EVENTS_PER_POLL 32 84 85 static int 86 bdev_uring_get_ctx_size(void) 87 { 88 return sizeof(struct bdev_uring_task); 89 } 90 91 static struct spdk_bdev_module uring_if = { 92 .name = "uring", 93 .module_init = bdev_uring_init, 94 .module_fini = bdev_uring_fini, 95 .config_text = bdev_uring_get_spdk_running_config, 96 .get_ctx_size = bdev_uring_get_ctx_size, 97 }; 98 99 SPDK_BDEV_MODULE_REGISTER(uring, &uring_if) 100 101 static int 102 bdev_uring_open(struct bdev_uring *bdev) 103 { 104 int fd; 105 106 fd = open(bdev->filename, O_RDWR | O_DIRECT | O_NOATIME); 107 if (fd < 0) { 108 /* Try without O_DIRECT for non-disk files */ 109 fd = open(bdev->filename, O_RDWR | O_NOATIME); 110 if (fd < 0) { 111 SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n", 112 bdev->filename, errno, spdk_strerror(errno)); 113 bdev->fd = -1; 114 return -1; 115 } 116 } 117 118 bdev->fd = fd; 119 120 return 0; 121 } 122 123 static int 124 bdev_uring_close(struct bdev_uring *bdev) 125 { 126 int rc; 127 128 if (bdev->fd == -1) { 129 return 0; 130 } 131 132 rc = close(bdev->fd); 133 if (rc < 0) { 134 SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n", 135 bdev->fd, errno, spdk_strerror(errno)); 136 return -1; 137 } 138 139 bdev->fd = -1; 140 141 return 0; 142 } 143 144 static int64_t 145 bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch, 146 struct bdev_uring_task *uring_task, 147 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) 148 { 149 struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); 150 struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; 151 struct io_uring_sqe *sqe; 152 153 sqe = io_uring_get_sqe(&group_ch->uring); 154 io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset); 155 io_uring_sqe_set_data(sqe, uring_task); 156 uring_task->len = nbytes; 157 uring_task->ch = uring_ch; 158 159 SPDK_DEBUGLOG(uring, "read %d iovs size %lu to off: %#lx\n", 160 iovcnt, nbytes, offset); 161 162 group_ch->io_pending++; 163 return nbytes; 164 } 165 166 static int64_t 167 bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch, 168 struct bdev_uring_task *uring_task, 169 struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset) 170 { 171 struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); 172 struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; 173 struct io_uring_sqe *sqe; 174 175 sqe = io_uring_get_sqe(&group_ch->uring); 176 io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset); 177 io_uring_sqe_set_data(sqe, uring_task); 178 uring_task->len = nbytes; 179 uring_task->ch = uring_ch; 180 181 SPDK_DEBUGLOG(uring, "write %d iovs size %lu from off: %#lx\n", 182 iovcnt, nbytes, offset); 183 184 group_ch->io_pending++; 185 return nbytes; 186 } 187 188 static int 189 bdev_uring_destruct(void *ctx) 190 { 191 struct bdev_uring *uring = ctx; 192 int rc = 0; 193 194 TAILQ_REMOVE(&g_uring_bdev_head, uring, link); 195 rc = bdev_uring_close(uring); 196 if (rc < 0) { 197 SPDK_ERRLOG("bdev_uring_close() failed\n"); 198 } 199 spdk_io_device_unregister(uring, NULL); 200 uring_free_bdev(uring); 201 return rc; 202 } 203 204 static int 205 bdev_uring_reap(struct io_uring *ring, int max) 206 { 207 int i, count, ret; 208 struct io_uring_cqe *cqe; 209 struct bdev_uring_task *uring_task; 210 enum spdk_bdev_io_status status; 211 212 count = 0; 213 for (i = 0; i < max; i++) { 214 ret = io_uring_peek_cqe(ring, &cqe); 215 if (ret != 0) { 216 return ret; 217 } 218 219 if (cqe == NULL) { 220 return count; 221 } 222 223 uring_task = (struct bdev_uring_task *)cqe->user_data; 224 if (cqe->res != (signed)uring_task->len) { 225 status = SPDK_BDEV_IO_STATUS_FAILED; 226 } else { 227 status = SPDK_BDEV_IO_STATUS_SUCCESS; 228 } 229 230 uring_task->ch->group_ch->io_inflight--; 231 io_uring_cqe_seen(ring, cqe); 232 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status); 233 count++; 234 } 235 236 return count; 237 } 238 239 static int 240 bdev_uring_group_poll(void *arg) 241 { 242 struct bdev_uring_group_channel *group_ch = arg; 243 int to_complete, to_submit; 244 int count, ret; 245 246 to_submit = group_ch->io_pending; 247 248 if (to_submit > 0) { 249 /* If there are I/O to submit, use io_uring_submit here. 250 * It will automatically call spdk_io_uring_enter appropriately. */ 251 ret = io_uring_submit(&group_ch->uring); 252 if (ret < 0) { 253 return SPDK_POLLER_BUSY; 254 } 255 256 group_ch->io_pending = 0; 257 group_ch->io_inflight += to_submit; 258 } 259 260 to_complete = group_ch->io_inflight; 261 count = 0; 262 if (to_complete > 0) { 263 count = bdev_uring_reap(&group_ch->uring, to_complete); 264 } 265 266 if (count + to_submit > 0) { 267 return SPDK_POLLER_BUSY; 268 } else { 269 return SPDK_POLLER_IDLE; 270 } 271 } 272 273 static void bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 274 bool success) 275 { 276 if (!success) { 277 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 278 return; 279 } 280 281 switch (bdev_io->type) { 282 case SPDK_BDEV_IO_TYPE_READ: 283 bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt, 284 ch, 285 (struct bdev_uring_task *)bdev_io->driver_ctx, 286 bdev_io->u.bdev.iovs, 287 bdev_io->u.bdev.iovcnt, 288 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 289 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 290 break; 291 case SPDK_BDEV_IO_TYPE_WRITE: 292 bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt, 293 ch, 294 (struct bdev_uring_task *)bdev_io->driver_ctx, 295 bdev_io->u.bdev.iovs, 296 bdev_io->u.bdev.iovcnt, 297 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 298 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 299 break; 300 default: 301 SPDK_ERRLOG("Wrong io type\n"); 302 break; 303 } 304 } 305 306 static int _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 307 { 308 switch (bdev_io->type) { 309 /* Read and write operations must be performed on buffers aligned to 310 * bdev->required_alignment. If user specified unaligned buffers, 311 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */ 312 case SPDK_BDEV_IO_TYPE_READ: 313 case SPDK_BDEV_IO_TYPE_WRITE: 314 spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb, 315 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 316 return 0; 317 default: 318 return -1; 319 } 320 } 321 322 static void bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 323 { 324 if (_bdev_uring_submit_request(ch, bdev_io) < 0) { 325 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 326 } 327 } 328 329 static bool 330 bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 331 { 332 switch (io_type) { 333 case SPDK_BDEV_IO_TYPE_READ: 334 case SPDK_BDEV_IO_TYPE_WRITE: 335 return true; 336 default: 337 return false; 338 } 339 } 340 341 static int 342 bdev_uring_create_cb(void *io_device, void *ctx_buf) 343 { 344 struct bdev_uring_io_channel *ch = ctx_buf; 345 346 ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if)); 347 348 return 0; 349 } 350 351 static void 352 bdev_uring_destroy_cb(void *io_device, void *ctx_buf) 353 { 354 struct bdev_uring_io_channel *ch = ctx_buf; 355 356 spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch)); 357 } 358 359 static struct spdk_io_channel * 360 bdev_uring_get_io_channel(void *ctx) 361 { 362 struct bdev_uring *uring = ctx; 363 364 return spdk_get_io_channel(uring); 365 } 366 367 static int 368 bdev_uring_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 369 { 370 struct bdev_uring *uring = ctx; 371 372 spdk_json_write_named_object_begin(w, "uring"); 373 374 spdk_json_write_named_string(w, "filename", uring->filename); 375 376 spdk_json_write_object_end(w); 377 378 return 0; 379 } 380 381 static void 382 bdev_uring_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 383 { 384 struct bdev_uring *uring = bdev->ctxt; 385 386 spdk_json_write_object_begin(w); 387 388 spdk_json_write_named_string(w, "method", "bdev_uring_create"); 389 390 spdk_json_write_named_object_begin(w, "params"); 391 spdk_json_write_named_string(w, "name", bdev->name); 392 spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); 393 spdk_json_write_named_string(w, "filename", uring->filename); 394 spdk_json_write_object_end(w); 395 396 spdk_json_write_object_end(w); 397 } 398 399 static const struct spdk_bdev_fn_table uring_fn_table = { 400 .destruct = bdev_uring_destruct, 401 .submit_request = bdev_uring_submit_request, 402 .io_type_supported = bdev_uring_io_type_supported, 403 .get_io_channel = bdev_uring_get_io_channel, 404 .dump_info_json = bdev_uring_dump_info_json, 405 .write_config_json = bdev_uring_write_json_config, 406 }; 407 408 static void uring_free_bdev(struct bdev_uring *uring) 409 { 410 if (uring == NULL) { 411 return; 412 } 413 free(uring->filename); 414 free(uring->bdev.name); 415 free(uring); 416 } 417 418 static int 419 bdev_uring_group_create_cb(void *io_device, void *ctx_buf) 420 { 421 struct bdev_uring_group_channel *ch = ctx_buf; 422 423 /* Do not use IORING_SETUP_IOPOLL until the Linux kernel can support not only 424 * local devices but also devices attached from remote target */ 425 if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, 0) < 0) { 426 SPDK_ERRLOG("uring I/O context setup failure\n"); 427 return -1; 428 } 429 430 ch->poller = SPDK_POLLER_REGISTER(bdev_uring_group_poll, ch, 0); 431 return 0; 432 } 433 434 static void 435 bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf) 436 { 437 struct bdev_uring_group_channel *ch = ctx_buf; 438 439 io_uring_queue_exit(&ch->uring); 440 441 spdk_poller_unregister(&ch->poller); 442 } 443 444 struct spdk_bdev * 445 create_uring_bdev(const char *name, const char *filename, uint32_t block_size) 446 { 447 struct bdev_uring *uring; 448 uint32_t detected_block_size; 449 uint64_t bdev_size; 450 int rc; 451 452 uring = calloc(1, sizeof(*uring)); 453 if (!uring) { 454 SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n"); 455 return NULL; 456 } 457 458 uring->filename = strdup(filename); 459 if (!uring->filename) { 460 goto error_return; 461 } 462 463 if (bdev_uring_open(uring)) { 464 SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno); 465 goto error_return; 466 } 467 468 bdev_size = spdk_fd_get_size(uring->fd); 469 470 uring->bdev.name = strdup(name); 471 if (!uring->bdev.name) { 472 goto error_return; 473 } 474 uring->bdev.product_name = "URING bdev"; 475 uring->bdev.module = &uring_if; 476 477 uring->bdev.write_cache = 1; 478 479 detected_block_size = spdk_fd_get_blocklen(uring->fd); 480 if (block_size == 0) { 481 /* User did not specify block size - use autodetected block size. */ 482 if (detected_block_size == 0) { 483 SPDK_ERRLOG("Block size could not be auto-detected\n"); 484 goto error_return; 485 } 486 block_size = detected_block_size; 487 } else { 488 if (block_size < detected_block_size) { 489 SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than " 490 "auto-detected block size %" PRIu32 "\n", 491 block_size, detected_block_size); 492 goto error_return; 493 } else if (detected_block_size != 0 && block_size != detected_block_size) { 494 SPDK_WARNLOG("Specified block size %" PRIu32 " does not match " 495 "auto-detected block size %" PRIu32 "\n", 496 block_size, detected_block_size); 497 } 498 } 499 500 if (block_size < 512) { 501 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size); 502 goto error_return; 503 } 504 505 if (!spdk_u32_is_pow2(block_size)) { 506 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size); 507 goto error_return; 508 } 509 510 uring->bdev.blocklen = block_size; 511 uring->bdev.required_alignment = spdk_u32log2(block_size); 512 513 if (bdev_size % uring->bdev.blocklen != 0) { 514 SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", 515 bdev_size, uring->bdev.blocklen); 516 goto error_return; 517 } 518 519 uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen; 520 uring->bdev.ctxt = uring; 521 522 uring->bdev.fn_table = &uring_fn_table; 523 524 spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb, 525 sizeof(struct bdev_uring_io_channel), 526 uring->bdev.name); 527 rc = spdk_bdev_register(&uring->bdev); 528 if (rc) { 529 spdk_io_device_unregister(uring, NULL); 530 goto error_return; 531 } 532 533 TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link); 534 return &uring->bdev; 535 536 error_return: 537 bdev_uring_close(uring); 538 uring_free_bdev(uring); 539 return NULL; 540 } 541 542 struct delete_uring_bdev_ctx { 543 spdk_delete_uring_complete cb_fn; 544 void *cb_arg; 545 }; 546 547 static void 548 uring_bdev_unregister_cb(void *arg, int bdeverrno) 549 { 550 struct delete_uring_bdev_ctx *ctx = arg; 551 552 ctx->cb_fn(ctx->cb_arg, bdeverrno); 553 free(ctx); 554 } 555 556 void 557 delete_uring_bdev(struct spdk_bdev *bdev, spdk_delete_uring_complete cb_fn, void *cb_arg) 558 { 559 struct delete_uring_bdev_ctx *ctx; 560 561 if (!bdev || bdev->module != &uring_if) { 562 cb_fn(cb_arg, -ENODEV); 563 return; 564 } 565 566 ctx = calloc(1, sizeof(*ctx)); 567 if (ctx == NULL) { 568 cb_fn(cb_arg, -ENOMEM); 569 return; 570 } 571 572 ctx->cb_fn = cb_fn; 573 ctx->cb_arg = cb_arg; 574 spdk_bdev_unregister(bdev, uring_bdev_unregister_cb, ctx); 575 } 576 577 static int 578 bdev_uring_init(void) 579 { 580 size_t i; 581 struct spdk_conf_section *sp; 582 struct spdk_bdev *bdev; 583 584 TAILQ_INIT(&g_uring_bdev_head); 585 spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb, 586 sizeof(struct bdev_uring_group_channel), 587 "uring_module"); 588 589 sp = spdk_conf_find_section(NULL, "URING"); 590 if (!sp) { 591 return 0; 592 } 593 594 i = 0; 595 while (true) { 596 const char *file; 597 const char *name; 598 const char *block_size_str; 599 uint32_t block_size = 0; 600 long int tmp; 601 602 file = spdk_conf_section_get_nmval(sp, "URING", i, 0); 603 if (!file) { 604 break; 605 } 606 607 name = spdk_conf_section_get_nmval(sp, "URING", i, 1); 608 if (!name) { 609 SPDK_ERRLOG("No name provided for URING bdev with file %s\n", file); 610 i++; 611 continue; 612 } 613 614 block_size_str = spdk_conf_section_get_nmval(sp, "URING", i, 2); 615 if (block_size_str) { 616 tmp = spdk_strtol(block_size_str, 10); 617 if (tmp < 0) { 618 SPDK_ERRLOG("Invalid block size for URING bdev with file %s\n", file); 619 i++; 620 continue; 621 } 622 block_size = (uint32_t)tmp; 623 } 624 625 bdev = create_uring_bdev(name, file, block_size); 626 if (!bdev) { 627 SPDK_ERRLOG("Unable to create URING bdev from file %s\n", file); 628 i++; 629 continue; 630 } 631 632 i++; 633 } 634 635 return 0; 636 } 637 638 static void 639 bdev_uring_fini(void) 640 { 641 spdk_io_device_unregister(&uring_if, NULL); 642 } 643 644 static void 645 bdev_uring_get_spdk_running_config(FILE *fp) 646 { 647 char *file; 648 char *name; 649 uint32_t block_size; 650 struct bdev_uring *uring; 651 652 fprintf(fp, 653 "\n" 654 "# Users must change this section to match the /dev/sdX devices to be\n" 655 "# exported as iSCSI LUNs. The devices are accessed using io_uring.\n" 656 "# The format is:\n" 657 "# URING <file name> <bdev name> [<block size>]\n" 658 "# The file name is the backing device\n" 659 "# The bdev name can be referenced from elsewhere in the configuration file.\n" 660 "# Block size may be omitted to automatically detect the block size of a bdev.\n" 661 "[URING]\n"); 662 663 TAILQ_FOREACH(uring, &g_uring_bdev_head, link) { 664 file = uring->filename; 665 name = uring->bdev.name; 666 block_size = uring->bdev.blocklen; 667 fprintf(fp, " URING %s %s %d\n", file, name, block_size); 668 } 669 fprintf(fp, "\n"); 670 } 671 672 SPDK_LOG_REGISTER_COMPONENT(uring) 673