1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "bdev_uring.h" 35 36 #include "spdk/stdinc.h" 37 38 #include "spdk/barrier.h" 39 #include "spdk/bdev.h" 40 #include "spdk/conf.h" 41 #include "spdk/env.h" 42 #include "spdk/fd.h" 43 #include "spdk/likely.h" 44 #include "spdk/thread.h" 45 #include "spdk/json.h" 46 #include "spdk/util.h" 47 #include "spdk/string.h" 48 49 #include "spdk_internal/log.h" 50 51 #include <liburing.h> 52 53 struct bdev_uring_io_channel { 54 struct bdev_uring_group_channel *group_ch; 55 }; 56 57 struct bdev_uring_group_channel { 58 uint64_t io_inflight; 59 uint64_t io_pending; 60 struct spdk_poller *poller; 61 struct io_uring uring; 62 }; 63 64 struct bdev_uring_task { 65 uint64_t len; 66 struct bdev_uring_io_channel *ch; 67 TAILQ_ENTRY(bdev_uring_task) link; 68 }; 69 70 struct bdev_uring { 71 struct spdk_bdev bdev; 72 char *filename; 73 int fd; 74 TAILQ_ENTRY(bdev_uring) link; 75 }; 76 77 static int bdev_uring_init(void); 78 static void bdev_uring_fini(void); 79 static void uring_free_bdev(struct bdev_uring *uring); 80 static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head; 81 82 #define SPDK_URING_QUEUE_DEPTH 512 83 #define MAX_EVENTS_PER_POLL 32 84 85 static int 86 bdev_uring_get_ctx_size(void) 87 { 88 return sizeof(struct bdev_uring_task); 89 } 90 91 static struct spdk_bdev_module uring_if = { 92 .name = "uring", 93 .module_init = bdev_uring_init, 94 .module_fini = bdev_uring_fini, 95 .config_text = NULL, 96 .get_ctx_size = bdev_uring_get_ctx_size, 97 }; 98 99 SPDK_BDEV_MODULE_REGISTER(uring, &uring_if) 100 101 static int 102 bdev_uring_open(struct bdev_uring *bdev) 103 { 104 int fd; 105 106 fd = open(bdev->filename, O_NOATIME | O_DIRECT); 107 if (fd < 0) { 108 SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n", 109 bdev->filename, errno, spdk_strerror(errno)); 110 bdev->fd = -1; 111 return -1; 112 } 113 114 bdev->fd = fd; 115 116 return 0; 117 } 118 119 static int 120 bdev_uring_close(struct bdev_uring *bdev) 121 { 122 int rc; 123 124 if (bdev->fd == -1) { 125 return 0; 126 } 127 128 rc = close(bdev->fd); 129 if (rc < 0) { 130 SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n", 131 bdev->fd, errno, spdk_strerror(errno)); 132 return -1; 133 } 134 135 bdev->fd = -1; 136 137 return 0; 138 } 139 140 static int64_t 141 bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch, 142 struct bdev_uring_task *uring_task, 143 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) 144 { 145 struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); 146 struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; 147 struct io_uring_sqe *sqe; 148 149 sqe = io_uring_get_sqe(&group_ch->uring); 150 io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset); 151 io_uring_sqe_set_data(sqe, uring_task); 152 uring_task->len = nbytes; 153 uring_task->ch = uring_ch; 154 155 SPDK_DEBUGLOG(SPDK_LOG_URING, "read %d iovs size %lu to off: %#lx\n", 156 iovcnt, nbytes, offset); 157 158 group_ch->io_pending++; 159 return nbytes; 160 } 161 162 static int64_t 163 bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch, 164 struct bdev_uring_task *uring_task, 165 struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset) 166 { 167 struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); 168 struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; 169 struct io_uring_sqe *sqe; 170 171 sqe = io_uring_get_sqe(&group_ch->uring); 172 io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset); 173 io_uring_sqe_set_data(sqe, uring_task); 174 uring_task->ch = uring_ch; 175 176 SPDK_DEBUGLOG(SPDK_LOG_URING, "write %d iovs size %lu from off: %#lx\n", 177 iovcnt, nbytes, offset); 178 179 group_ch->io_pending++; 180 return nbytes; 181 } 182 183 static int 184 bdev_uring_destruct(void *ctx) 185 { 186 struct bdev_uring *uring = ctx; 187 int rc = 0; 188 189 TAILQ_REMOVE(&g_uring_bdev_head, uring, link); 190 rc = bdev_uring_close(uring); 191 if (rc < 0) { 192 SPDK_ERRLOG("bdev_uring_close() failed\n"); 193 } 194 spdk_io_device_unregister(uring, NULL); 195 uring_free_bdev(uring); 196 return rc; 197 } 198 199 static int 200 bdev_uring_reap(struct io_uring *ring, int max) 201 { 202 int i, count, ret; 203 struct io_uring_cqe *cqe; 204 struct bdev_uring_task *uring_task; 205 enum spdk_bdev_io_status status; 206 207 count = 0; 208 for (i = 0; i < max; i++) { 209 ret = io_uring_peek_cqe(ring, &cqe); 210 if (ret != 0) { 211 return ret; 212 } 213 214 if (cqe == NULL) { 215 return count; 216 } 217 218 uring_task = (struct bdev_uring_task *)cqe->user_data; 219 if (cqe->res != (signed)uring_task->len) { 220 status = SPDK_BDEV_IO_STATUS_FAILED; 221 } else { 222 status = SPDK_BDEV_IO_STATUS_SUCCESS; 223 } 224 225 uring_task->ch->group_ch->io_inflight--; 226 io_uring_cqe_seen(ring, cqe); 227 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status); 228 count++; 229 } 230 231 return count; 232 } 233 234 static int 235 bdev_uring_group_poll(void *arg) 236 { 237 struct bdev_uring_group_channel *group_ch = arg; 238 int to_complete, to_submit; 239 int count, ret; 240 241 to_submit = group_ch->io_pending; 242 to_complete = group_ch->io_inflight; 243 244 ret = 0; 245 if (to_submit > 0) { 246 /* If there are I/O to submit, use io_uring_submit here. 247 * It will automatically call io_uring_enter appropriately. */ 248 ret = io_uring_submit(&group_ch->uring); 249 group_ch->io_pending = 0; 250 group_ch->io_inflight += to_submit; 251 } else if (to_complete > 0) { 252 /* If there are I/O in flight but none to submit, we need to 253 * call io_uring_enter ourselves. */ 254 ret = io_uring_enter(group_ch->uring.ring_fd, 0, 0, 255 IORING_ENTER_GETEVENTS, NULL); 256 } 257 258 if (ret < 0) { 259 return 1; 260 } 261 262 count = 0; 263 if (to_complete > 0) { 264 count = bdev_uring_reap(&group_ch->uring, to_complete); 265 } 266 267 return (count + to_submit); 268 } 269 270 static void bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 271 bool success) 272 { 273 if (!success) { 274 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 275 return; 276 } 277 278 switch (bdev_io->type) { 279 case SPDK_BDEV_IO_TYPE_READ: 280 bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt, 281 ch, 282 (struct bdev_uring_task *)bdev_io->driver_ctx, 283 bdev_io->u.bdev.iovs, 284 bdev_io->u.bdev.iovcnt, 285 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 286 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 287 break; 288 case SPDK_BDEV_IO_TYPE_WRITE: 289 bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt, 290 ch, 291 (struct bdev_uring_task *)bdev_io->driver_ctx, 292 bdev_io->u.bdev.iovs, 293 bdev_io->u.bdev.iovcnt, 294 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 295 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 296 break; 297 default: 298 SPDK_ERRLOG("Wrong io type\n"); 299 break; 300 } 301 } 302 303 static int _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 304 { 305 switch (bdev_io->type) { 306 /* Read and write operations must be performed on buffers aligned to 307 * bdev->required_alignment. If user specified unaligned buffers, 308 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */ 309 case SPDK_BDEV_IO_TYPE_READ: 310 case SPDK_BDEV_IO_TYPE_WRITE: 311 spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb, 312 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 313 return 0; 314 default: 315 return -1; 316 } 317 } 318 319 static void bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 320 { 321 if (_bdev_uring_submit_request(ch, bdev_io) < 0) { 322 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 323 } 324 } 325 326 static bool 327 bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 328 { 329 switch (io_type) { 330 case SPDK_BDEV_IO_TYPE_READ: 331 case SPDK_BDEV_IO_TYPE_WRITE: 332 return true; 333 default: 334 return false; 335 } 336 } 337 338 static int 339 bdev_uring_create_cb(void *io_device, void *ctx_buf) 340 { 341 struct bdev_uring_io_channel *ch = ctx_buf; 342 343 ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if)); 344 345 return 0; 346 } 347 348 static void 349 bdev_uring_destroy_cb(void *io_device, void *ctx_buf) 350 { 351 struct bdev_uring_io_channel *ch = ctx_buf; 352 353 spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch)); 354 } 355 356 static struct spdk_io_channel * 357 bdev_uring_get_io_channel(void *ctx) 358 { 359 struct bdev_uring *uring = ctx; 360 361 return spdk_get_io_channel(uring); 362 } 363 364 365 static const struct spdk_bdev_fn_table uring_fn_table = { 366 .destruct = bdev_uring_destruct, 367 .submit_request = bdev_uring_submit_request, 368 .io_type_supported = bdev_uring_io_type_supported, 369 .get_io_channel = bdev_uring_get_io_channel, 370 }; 371 372 static void uring_free_bdev(struct bdev_uring *uring) 373 { 374 if (uring == NULL) { 375 return; 376 } 377 free(uring->filename); 378 free(uring->bdev.name); 379 free(uring); 380 } 381 382 static int 383 bdev_uring_group_create_cb(void *io_device, void *ctx_buf) 384 { 385 struct bdev_uring_group_channel *ch = ctx_buf; 386 387 if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, IORING_SETUP_IOPOLL) < 0) { 388 SPDK_ERRLOG("uring I/O context setup failure\n"); 389 return -1; 390 } 391 392 ch->poller = spdk_poller_register(bdev_uring_group_poll, ch, 0); 393 return 0; 394 } 395 396 static void 397 bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf) 398 { 399 struct bdev_uring_group_channel *ch = ctx_buf; 400 401 close(ch->uring.ring_fd); 402 io_uring_queue_exit(&ch->uring); 403 404 spdk_poller_unregister(&ch->poller); 405 } 406 407 struct spdk_bdev * 408 create_uring_bdev(const char *name, const char *filename) 409 { 410 struct bdev_uring *uring; 411 uint32_t block_size; 412 uint64_t bdev_size; 413 int rc; 414 415 uring = calloc(1, sizeof(*uring)); 416 if (!uring) { 417 SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n"); 418 return NULL; 419 } 420 421 uring->filename = strdup(filename); 422 if (!uring->filename) { 423 goto error_return; 424 } 425 426 if (bdev_uring_open(uring)) { 427 SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno); 428 goto error_return; 429 } 430 431 bdev_size = spdk_fd_get_size(uring->fd); 432 433 uring->bdev.name = strdup(name); 434 if (!uring->bdev.name) { 435 goto error_return; 436 } 437 uring->bdev.product_name = "URING bdev"; 438 uring->bdev.module = &uring_if; 439 440 uring->bdev.write_cache = 1; 441 442 block_size = spdk_fd_get_blocklen(uring->fd); 443 if (block_size == 0) { 444 SPDK_ERRLOG("Block size could not be auto-detected\n"); 445 goto error_return; 446 } 447 448 if (block_size < 512) { 449 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size); 450 goto error_return; 451 } 452 453 if (!spdk_u32_is_pow2(block_size)) { 454 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size); 455 goto error_return; 456 } 457 458 uring->bdev.blocklen = block_size; 459 uring->bdev.required_alignment = spdk_u32log2(block_size); 460 461 if (bdev_size % uring->bdev.blocklen != 0) { 462 SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", 463 bdev_size, uring->bdev.blocklen); 464 goto error_return; 465 } 466 467 uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen; 468 uring->bdev.ctxt = uring; 469 470 uring->bdev.fn_table = &uring_fn_table; 471 472 spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb, 473 sizeof(struct bdev_uring_io_channel), 474 uring->bdev.name); 475 rc = spdk_bdev_register(&uring->bdev); 476 if (rc) { 477 spdk_io_device_unregister(uring, NULL); 478 goto error_return; 479 } 480 481 TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link); 482 return &uring->bdev; 483 484 error_return: 485 bdev_uring_close(uring); 486 uring_free_bdev(uring); 487 return NULL; 488 } 489 490 struct delete_uring_bdev_ctx { 491 spdk_delete_uring_complete cb_fn; 492 void *cb_arg; 493 }; 494 495 static void 496 uring_bdev_unregister_cb(void *arg, int bdeverrno) 497 { 498 struct delete_uring_bdev_ctx *ctx = arg; 499 500 ctx->cb_fn(ctx->cb_arg, bdeverrno); 501 free(ctx); 502 } 503 504 void 505 delete_uring_bdev(struct spdk_bdev *bdev, spdk_delete_uring_complete cb_fn, void *cb_arg) 506 { 507 struct delete_uring_bdev_ctx *ctx; 508 509 if (!bdev || bdev->module != &uring_if) { 510 cb_fn(cb_arg, -ENODEV); 511 return; 512 } 513 514 ctx = calloc(1, sizeof(*ctx)); 515 if (ctx == NULL) { 516 cb_fn(cb_arg, -ENOMEM); 517 return; 518 } 519 520 ctx->cb_fn = cb_fn; 521 ctx->cb_arg = cb_arg; 522 spdk_bdev_unregister(bdev, uring_bdev_unregister_cb, ctx); 523 } 524 525 static int 526 bdev_uring_init(void) 527 { 528 size_t i; 529 struct spdk_conf_section *sp; 530 struct spdk_bdev *bdev; 531 532 TAILQ_INIT(&g_uring_bdev_head); 533 spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb, 534 sizeof(struct bdev_uring_group_channel), 535 "uring_module"); 536 537 sp = spdk_conf_find_section(NULL, "URING"); 538 if (!sp) { 539 return 0; 540 } 541 542 i = 0; 543 while (true) { 544 const char *file; 545 const char *name; 546 547 file = spdk_conf_section_get_nmval(sp, "URING", i, 0); 548 if (!file) { 549 break; 550 } 551 552 name = spdk_conf_section_get_nmval(sp, "URING", i, 1); 553 if (!name) { 554 SPDK_ERRLOG("No name provided for URING bdev with file %s\n", file); 555 i++; 556 continue; 557 } 558 559 bdev = create_uring_bdev(name, file); 560 if (!bdev) { 561 SPDK_ERRLOG("Unable to create URING bdev from file %s\n", file); 562 i++; 563 continue; 564 } 565 566 i++; 567 } 568 569 return 0; 570 } 571 572 static void 573 bdev_uring_fini(void) 574 { 575 spdk_io_device_unregister(&uring_if, NULL); 576 } 577 578 SPDK_LOG_REGISTER_COMPONENT("uring", SPDK_LOG_URING) 579