1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "bdev_uring.h" 35 36 #include "spdk/stdinc.h" 37 38 #include "spdk/barrier.h" 39 #include "spdk/bdev.h" 40 #include "spdk/conf.h" 41 #include "spdk/env.h" 42 #include "spdk/fd.h" 43 #include "spdk/likely.h" 44 #include "spdk/thread.h" 45 #include "spdk/json.h" 46 #include "spdk/util.h" 47 #include "spdk/string.h" 48 49 #include "spdk_internal/log.h" 50 51 #include <liburing.h> 52 53 struct bdev_uring_io_channel { 54 struct bdev_uring_group_channel *group_ch; 55 }; 56 57 struct bdev_uring_group_channel { 58 uint64_t io_inflight; 59 uint64_t io_pending; 60 struct spdk_poller *poller; 61 struct io_uring uring; 62 }; 63 64 struct bdev_uring_task { 65 uint64_t len; 66 struct bdev_uring_io_channel *ch; 67 TAILQ_ENTRY(bdev_uring_task) link; 68 }; 69 70 struct bdev_uring { 71 struct spdk_bdev bdev; 72 char *filename; 73 int fd; 74 TAILQ_ENTRY(bdev_uring) link; 75 }; 76 77 static int bdev_uring_init(void); 78 static void bdev_uring_fini(void); 79 static void uring_free_bdev(struct bdev_uring *uring); 80 static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head; 81 82 #define SPDK_URING_QUEUE_DEPTH 512 83 #define MAX_EVENTS_PER_POLL 32 84 85 static int 86 bdev_uring_get_ctx_size(void) 87 { 88 return sizeof(struct bdev_uring_task); 89 } 90 91 static struct spdk_bdev_module uring_if = { 92 .name = "uring", 93 .module_init = bdev_uring_init, 94 .module_fini = bdev_uring_fini, 95 .config_text = NULL, 96 .get_ctx_size = bdev_uring_get_ctx_size, 97 }; 98 99 SPDK_BDEV_MODULE_REGISTER(uring, &uring_if) 100 101 static int 102 bdev_uring_open(struct bdev_uring *bdev) 103 { 104 int fd; 105 106 fd = open(bdev->filename, O_NOATIME | O_DIRECT); 107 if (fd < 0) { 108 SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n", 109 bdev->filename, errno, spdk_strerror(errno)); 110 bdev->fd = -1; 111 return -1; 112 } 113 114 bdev->fd = fd; 115 116 return 0; 117 } 118 119 static int 120 bdev_uring_close(struct bdev_uring *bdev) 121 { 122 int rc; 123 124 if (bdev->fd == -1) { 125 return 0; 126 } 127 128 rc = close(bdev->fd); 129 if (rc < 0) { 130 SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n", 131 bdev->fd, errno, spdk_strerror(errno)); 132 return -1; 133 } 134 135 bdev->fd = -1; 136 137 return 0; 138 } 139 140 static int64_t 141 bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch, 142 struct bdev_uring_task *uring_task, 143 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) 144 { 145 struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); 146 struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; 147 struct io_uring_sqe *sqe; 148 149 sqe = io_uring_get_sqe(&group_ch->uring); 150 io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset); 151 io_uring_sqe_set_data(sqe, uring_task); 152 uring_task->len = nbytes; 153 uring_task->ch = uring_ch; 154 155 SPDK_DEBUGLOG(SPDK_LOG_URING, "read %d iovs size %lu to off: %#lx\n", 156 iovcnt, nbytes, offset); 157 158 group_ch->io_pending++; 159 return nbytes; 160 } 161 162 static int64_t 163 bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch, 164 struct bdev_uring_task *uring_task, 165 struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset) 166 { 167 struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); 168 struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; 169 struct io_uring_sqe *sqe; 170 171 sqe = io_uring_get_sqe(&group_ch->uring); 172 io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset); 173 io_uring_sqe_set_data(sqe, uring_task); 174 uring_task->len = nbytes; 175 uring_task->ch = uring_ch; 176 177 SPDK_DEBUGLOG(SPDK_LOG_URING, "write %d iovs size %lu from off: %#lx\n", 178 iovcnt, nbytes, offset); 179 180 group_ch->io_pending++; 181 return nbytes; 182 } 183 184 static int 185 bdev_uring_destruct(void *ctx) 186 { 187 struct bdev_uring *uring = ctx; 188 int rc = 0; 189 190 TAILQ_REMOVE(&g_uring_bdev_head, uring, link); 191 rc = bdev_uring_close(uring); 192 if (rc < 0) { 193 SPDK_ERRLOG("bdev_uring_close() failed\n"); 194 } 195 spdk_io_device_unregister(uring, NULL); 196 uring_free_bdev(uring); 197 return rc; 198 } 199 200 static int 201 bdev_uring_reap(struct io_uring *ring, int max) 202 { 203 int i, count, ret; 204 struct io_uring_cqe *cqe; 205 struct bdev_uring_task *uring_task; 206 enum spdk_bdev_io_status status; 207 208 count = 0; 209 for (i = 0; i < max; i++) { 210 ret = io_uring_peek_cqe(ring, &cqe); 211 if (ret != 0) { 212 return ret; 213 } 214 215 if (cqe == NULL) { 216 return count; 217 } 218 219 uring_task = (struct bdev_uring_task *)cqe->user_data; 220 if (cqe->res != (signed)uring_task->len) { 221 status = SPDK_BDEV_IO_STATUS_FAILED; 222 } else { 223 status = SPDK_BDEV_IO_STATUS_SUCCESS; 224 } 225 226 uring_task->ch->group_ch->io_inflight--; 227 io_uring_cqe_seen(ring, cqe); 228 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status); 229 count++; 230 } 231 232 return count; 233 } 234 235 static int 236 bdev_uring_group_poll(void *arg) 237 { 238 struct bdev_uring_group_channel *group_ch = arg; 239 int to_complete, to_submit; 240 int count, ret; 241 242 to_submit = group_ch->io_pending; 243 to_complete = group_ch->io_inflight; 244 245 ret = 0; 246 if (to_submit > 0) { 247 /* If there are I/O to submit, use io_uring_submit here. 248 * It will automatically call io_uring_enter appropriately. */ 249 ret = io_uring_submit(&group_ch->uring); 250 group_ch->io_pending = 0; 251 group_ch->io_inflight += to_submit; 252 } else if (to_complete > 0) { 253 /* If there are I/O in flight but none to submit, we need to 254 * call io_uring_enter ourselves. */ 255 ret = io_uring_enter(group_ch->uring.ring_fd, 0, 0, 256 IORING_ENTER_GETEVENTS, NULL); 257 } 258 259 if (ret < 0) { 260 return 1; 261 } 262 263 count = 0; 264 if (to_complete > 0) { 265 count = bdev_uring_reap(&group_ch->uring, to_complete); 266 } 267 268 return (count + to_submit); 269 } 270 271 static void bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 272 bool success) 273 { 274 if (!success) { 275 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 276 return; 277 } 278 279 switch (bdev_io->type) { 280 case SPDK_BDEV_IO_TYPE_READ: 281 bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt, 282 ch, 283 (struct bdev_uring_task *)bdev_io->driver_ctx, 284 bdev_io->u.bdev.iovs, 285 bdev_io->u.bdev.iovcnt, 286 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 287 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 288 break; 289 case SPDK_BDEV_IO_TYPE_WRITE: 290 bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt, 291 ch, 292 (struct bdev_uring_task *)bdev_io->driver_ctx, 293 bdev_io->u.bdev.iovs, 294 bdev_io->u.bdev.iovcnt, 295 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 296 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 297 break; 298 default: 299 SPDK_ERRLOG("Wrong io type\n"); 300 break; 301 } 302 } 303 304 static int _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 305 { 306 switch (bdev_io->type) { 307 /* Read and write operations must be performed on buffers aligned to 308 * bdev->required_alignment. If user specified unaligned buffers, 309 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */ 310 case SPDK_BDEV_IO_TYPE_READ: 311 case SPDK_BDEV_IO_TYPE_WRITE: 312 spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb, 313 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 314 return 0; 315 default: 316 return -1; 317 } 318 } 319 320 static void bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 321 { 322 if (_bdev_uring_submit_request(ch, bdev_io) < 0) { 323 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 324 } 325 } 326 327 static bool 328 bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 329 { 330 switch (io_type) { 331 case SPDK_BDEV_IO_TYPE_READ: 332 case SPDK_BDEV_IO_TYPE_WRITE: 333 return true; 334 default: 335 return false; 336 } 337 } 338 339 static int 340 bdev_uring_create_cb(void *io_device, void *ctx_buf) 341 { 342 struct bdev_uring_io_channel *ch = ctx_buf; 343 344 ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if)); 345 346 return 0; 347 } 348 349 static void 350 bdev_uring_destroy_cb(void *io_device, void *ctx_buf) 351 { 352 struct bdev_uring_io_channel *ch = ctx_buf; 353 354 spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch)); 355 } 356 357 static struct spdk_io_channel * 358 bdev_uring_get_io_channel(void *ctx) 359 { 360 struct bdev_uring *uring = ctx; 361 362 return spdk_get_io_channel(uring); 363 } 364 365 366 static const struct spdk_bdev_fn_table uring_fn_table = { 367 .destruct = bdev_uring_destruct, 368 .submit_request = bdev_uring_submit_request, 369 .io_type_supported = bdev_uring_io_type_supported, 370 .get_io_channel = bdev_uring_get_io_channel, 371 }; 372 373 static void uring_free_bdev(struct bdev_uring *uring) 374 { 375 if (uring == NULL) { 376 return; 377 } 378 free(uring->filename); 379 free(uring->bdev.name); 380 free(uring); 381 } 382 383 static int 384 bdev_uring_group_create_cb(void *io_device, void *ctx_buf) 385 { 386 struct bdev_uring_group_channel *ch = ctx_buf; 387 388 if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, IORING_SETUP_IOPOLL) < 0) { 389 SPDK_ERRLOG("uring I/O context setup failure\n"); 390 return -1; 391 } 392 393 ch->poller = spdk_poller_register(bdev_uring_group_poll, ch, 0); 394 return 0; 395 } 396 397 static void 398 bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf) 399 { 400 struct bdev_uring_group_channel *ch = ctx_buf; 401 402 close(ch->uring.ring_fd); 403 io_uring_queue_exit(&ch->uring); 404 405 spdk_poller_unregister(&ch->poller); 406 } 407 408 struct spdk_bdev * 409 create_uring_bdev(const char *name, const char *filename) 410 { 411 struct bdev_uring *uring; 412 uint32_t block_size; 413 uint64_t bdev_size; 414 int rc; 415 416 uring = calloc(1, sizeof(*uring)); 417 if (!uring) { 418 SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n"); 419 return NULL; 420 } 421 422 uring->filename = strdup(filename); 423 if (!uring->filename) { 424 goto error_return; 425 } 426 427 if (bdev_uring_open(uring)) { 428 SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno); 429 goto error_return; 430 } 431 432 bdev_size = spdk_fd_get_size(uring->fd); 433 434 uring->bdev.name = strdup(name); 435 if (!uring->bdev.name) { 436 goto error_return; 437 } 438 uring->bdev.product_name = "URING bdev"; 439 uring->bdev.module = &uring_if; 440 441 uring->bdev.write_cache = 1; 442 443 block_size = spdk_fd_get_blocklen(uring->fd); 444 if (block_size == 0) { 445 SPDK_ERRLOG("Block size could not be auto-detected\n"); 446 goto error_return; 447 } 448 449 if (block_size < 512) { 450 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size); 451 goto error_return; 452 } 453 454 if (!spdk_u32_is_pow2(block_size)) { 455 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size); 456 goto error_return; 457 } 458 459 uring->bdev.blocklen = block_size; 460 uring->bdev.required_alignment = spdk_u32log2(block_size); 461 462 if (bdev_size % uring->bdev.blocklen != 0) { 463 SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", 464 bdev_size, uring->bdev.blocklen); 465 goto error_return; 466 } 467 468 uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen; 469 uring->bdev.ctxt = uring; 470 471 uring->bdev.fn_table = &uring_fn_table; 472 473 spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb, 474 sizeof(struct bdev_uring_io_channel), 475 uring->bdev.name); 476 rc = spdk_bdev_register(&uring->bdev); 477 if (rc) { 478 spdk_io_device_unregister(uring, NULL); 479 goto error_return; 480 } 481 482 TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link); 483 return &uring->bdev; 484 485 error_return: 486 bdev_uring_close(uring); 487 uring_free_bdev(uring); 488 return NULL; 489 } 490 491 struct delete_uring_bdev_ctx { 492 spdk_delete_uring_complete cb_fn; 493 void *cb_arg; 494 }; 495 496 static void 497 uring_bdev_unregister_cb(void *arg, int bdeverrno) 498 { 499 struct delete_uring_bdev_ctx *ctx = arg; 500 501 ctx->cb_fn(ctx->cb_arg, bdeverrno); 502 free(ctx); 503 } 504 505 void 506 delete_uring_bdev(struct spdk_bdev *bdev, spdk_delete_uring_complete cb_fn, void *cb_arg) 507 { 508 struct delete_uring_bdev_ctx *ctx; 509 510 if (!bdev || bdev->module != &uring_if) { 511 cb_fn(cb_arg, -ENODEV); 512 return; 513 } 514 515 ctx = calloc(1, sizeof(*ctx)); 516 if (ctx == NULL) { 517 cb_fn(cb_arg, -ENOMEM); 518 return; 519 } 520 521 ctx->cb_fn = cb_fn; 522 ctx->cb_arg = cb_arg; 523 spdk_bdev_unregister(bdev, uring_bdev_unregister_cb, ctx); 524 } 525 526 static int 527 bdev_uring_init(void) 528 { 529 size_t i; 530 struct spdk_conf_section *sp; 531 struct spdk_bdev *bdev; 532 533 TAILQ_INIT(&g_uring_bdev_head); 534 spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb, 535 sizeof(struct bdev_uring_group_channel), 536 "uring_module"); 537 538 sp = spdk_conf_find_section(NULL, "URING"); 539 if (!sp) { 540 return 0; 541 } 542 543 i = 0; 544 while (true) { 545 const char *file; 546 const char *name; 547 548 file = spdk_conf_section_get_nmval(sp, "URING", i, 0); 549 if (!file) { 550 break; 551 } 552 553 name = spdk_conf_section_get_nmval(sp, "URING", i, 1); 554 if (!name) { 555 SPDK_ERRLOG("No name provided for URING bdev with file %s\n", file); 556 i++; 557 continue; 558 } 559 560 bdev = create_uring_bdev(name, file); 561 if (!bdev) { 562 SPDK_ERRLOG("Unable to create URING bdev from file %s\n", file); 563 i++; 564 continue; 565 } 566 567 i++; 568 } 569 570 return 0; 571 } 572 573 static void 574 bdev_uring_fini(void) 575 { 576 spdk_io_device_unregister(&uring_if, NULL); 577 } 578 579 SPDK_LOG_REGISTER_COMPONENT("uring", SPDK_LOG_URING) 580