1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "bdev_uring.h" 35 36 #include "spdk/stdinc.h" 37 38 #include "spdk/barrier.h" 39 #include "spdk/bdev.h" 40 #include "spdk/env.h" 41 #include "spdk/fd.h" 42 #include "spdk/likely.h" 43 #include "spdk/thread.h" 44 #include "spdk/json.h" 45 #include "spdk/util.h" 46 #include "spdk/string.h" 47 48 #include "spdk/log.h" 49 #include "spdk_internal/uring.h" 50 51 struct bdev_uring_io_channel { 52 struct bdev_uring_group_channel *group_ch; 53 }; 54 55 struct bdev_uring_group_channel { 56 uint64_t io_inflight; 57 uint64_t io_pending; 58 struct spdk_poller *poller; 59 struct io_uring uring; 60 }; 61 62 struct bdev_uring_task { 63 uint64_t len; 64 struct bdev_uring_io_channel *ch; 65 TAILQ_ENTRY(bdev_uring_task) link; 66 }; 67 68 struct bdev_uring { 69 struct spdk_bdev bdev; 70 char *filename; 71 int fd; 72 TAILQ_ENTRY(bdev_uring) link; 73 }; 74 75 static int bdev_uring_init(void); 76 static void bdev_uring_fini(void); 77 static void uring_free_bdev(struct bdev_uring *uring); 78 static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head = TAILQ_HEAD_INITIALIZER(g_uring_bdev_head); 79 80 #define SPDK_URING_QUEUE_DEPTH 512 81 #define MAX_EVENTS_PER_POLL 32 82 83 static int 84 bdev_uring_get_ctx_size(void) 85 { 86 return sizeof(struct bdev_uring_task); 87 } 88 89 static struct spdk_bdev_module uring_if = { 90 .name = "uring", 91 .module_init = bdev_uring_init, 92 .module_fini = bdev_uring_fini, 93 .get_ctx_size = bdev_uring_get_ctx_size, 94 }; 95 96 SPDK_BDEV_MODULE_REGISTER(uring, &uring_if) 97 98 static int 99 bdev_uring_open(struct bdev_uring *bdev) 100 { 101 int fd; 102 103 fd = open(bdev->filename, O_RDWR | O_DIRECT | O_NOATIME); 104 if (fd < 0) { 105 /* Try without O_DIRECT for non-disk files */ 106 fd = open(bdev->filename, O_RDWR | O_NOATIME); 107 if (fd < 0) { 108 SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n", 109 bdev->filename, errno, spdk_strerror(errno)); 110 bdev->fd = -1; 111 return -1; 112 } 113 } 114 115 bdev->fd = fd; 116 117 return 0; 118 } 119 120 static int 121 bdev_uring_close(struct bdev_uring *bdev) 122 { 123 int rc; 124 125 if (bdev->fd == -1) { 126 return 0; 127 } 128 129 rc = close(bdev->fd); 130 if (rc < 0) { 131 SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n", 132 bdev->fd, errno, spdk_strerror(errno)); 133 return -1; 134 } 135 136 bdev->fd = -1; 137 138 return 0; 139 } 140 141 static int64_t 142 bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch, 143 struct bdev_uring_task *uring_task, 144 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) 145 { 146 struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); 147 struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; 148 struct io_uring_sqe *sqe; 149 150 sqe = io_uring_get_sqe(&group_ch->uring); 151 io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset); 152 io_uring_sqe_set_data(sqe, uring_task); 153 uring_task->len = nbytes; 154 uring_task->ch = uring_ch; 155 156 SPDK_DEBUGLOG(uring, "read %d iovs size %lu to off: %#lx\n", 157 iovcnt, nbytes, offset); 158 159 group_ch->io_pending++; 160 return nbytes; 161 } 162 163 static int64_t 164 bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch, 165 struct bdev_uring_task *uring_task, 166 struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset) 167 { 168 struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); 169 struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; 170 struct io_uring_sqe *sqe; 171 172 sqe = io_uring_get_sqe(&group_ch->uring); 173 io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset); 174 io_uring_sqe_set_data(sqe, uring_task); 175 uring_task->len = nbytes; 176 uring_task->ch = uring_ch; 177 178 SPDK_DEBUGLOG(uring, "write %d iovs size %lu from off: %#lx\n", 179 iovcnt, nbytes, offset); 180 181 group_ch->io_pending++; 182 return nbytes; 183 } 184 185 static int 186 bdev_uring_destruct(void *ctx) 187 { 188 struct bdev_uring *uring = ctx; 189 int rc = 0; 190 191 TAILQ_REMOVE(&g_uring_bdev_head, uring, link); 192 rc = bdev_uring_close(uring); 193 if (rc < 0) { 194 SPDK_ERRLOG("bdev_uring_close() failed\n"); 195 } 196 spdk_io_device_unregister(uring, NULL); 197 uring_free_bdev(uring); 198 return rc; 199 } 200 201 static int 202 bdev_uring_reap(struct io_uring *ring, int max) 203 { 204 int i, count, ret; 205 struct io_uring_cqe *cqe; 206 struct bdev_uring_task *uring_task; 207 enum spdk_bdev_io_status status; 208 209 count = 0; 210 for (i = 0; i < max; i++) { 211 ret = io_uring_peek_cqe(ring, &cqe); 212 if (ret != 0) { 213 return ret; 214 } 215 216 if (cqe == NULL) { 217 return count; 218 } 219 220 uring_task = (struct bdev_uring_task *)cqe->user_data; 221 if (cqe->res != (signed)uring_task->len) { 222 status = SPDK_BDEV_IO_STATUS_FAILED; 223 } else { 224 status = SPDK_BDEV_IO_STATUS_SUCCESS; 225 } 226 227 uring_task->ch->group_ch->io_inflight--; 228 io_uring_cqe_seen(ring, cqe); 229 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status); 230 count++; 231 } 232 233 return count; 234 } 235 236 static int 237 bdev_uring_group_poll(void *arg) 238 { 239 struct bdev_uring_group_channel *group_ch = arg; 240 int to_complete, to_submit; 241 int count, ret; 242 243 to_submit = group_ch->io_pending; 244 245 if (to_submit > 0) { 246 /* If there are I/O to submit, use io_uring_submit here. 247 * It will automatically call spdk_io_uring_enter appropriately. */ 248 ret = io_uring_submit(&group_ch->uring); 249 if (ret < 0) { 250 return SPDK_POLLER_BUSY; 251 } 252 253 group_ch->io_pending = 0; 254 group_ch->io_inflight += to_submit; 255 } 256 257 to_complete = group_ch->io_inflight; 258 count = 0; 259 if (to_complete > 0) { 260 count = bdev_uring_reap(&group_ch->uring, to_complete); 261 } 262 263 if (count + to_submit > 0) { 264 return SPDK_POLLER_BUSY; 265 } else { 266 return SPDK_POLLER_IDLE; 267 } 268 } 269 270 static void bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 271 bool success) 272 { 273 if (!success) { 274 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 275 return; 276 } 277 278 switch (bdev_io->type) { 279 case SPDK_BDEV_IO_TYPE_READ: 280 bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt, 281 ch, 282 (struct bdev_uring_task *)bdev_io->driver_ctx, 283 bdev_io->u.bdev.iovs, 284 bdev_io->u.bdev.iovcnt, 285 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 286 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 287 break; 288 case SPDK_BDEV_IO_TYPE_WRITE: 289 bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt, 290 ch, 291 (struct bdev_uring_task *)bdev_io->driver_ctx, 292 bdev_io->u.bdev.iovs, 293 bdev_io->u.bdev.iovcnt, 294 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 295 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 296 break; 297 default: 298 SPDK_ERRLOG("Wrong io type\n"); 299 break; 300 } 301 } 302 303 static int _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 304 { 305 switch (bdev_io->type) { 306 /* Read and write operations must be performed on buffers aligned to 307 * bdev->required_alignment. If user specified unaligned buffers, 308 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */ 309 case SPDK_BDEV_IO_TYPE_READ: 310 case SPDK_BDEV_IO_TYPE_WRITE: 311 spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb, 312 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 313 return 0; 314 default: 315 return -1; 316 } 317 } 318 319 static void bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 320 { 321 if (_bdev_uring_submit_request(ch, bdev_io) < 0) { 322 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 323 } 324 } 325 326 static bool 327 bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 328 { 329 switch (io_type) { 330 case SPDK_BDEV_IO_TYPE_READ: 331 case SPDK_BDEV_IO_TYPE_WRITE: 332 return true; 333 default: 334 return false; 335 } 336 } 337 338 static int 339 bdev_uring_create_cb(void *io_device, void *ctx_buf) 340 { 341 struct bdev_uring_io_channel *ch = ctx_buf; 342 343 ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if)); 344 345 return 0; 346 } 347 348 static void 349 bdev_uring_destroy_cb(void *io_device, void *ctx_buf) 350 { 351 struct bdev_uring_io_channel *ch = ctx_buf; 352 353 spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch)); 354 } 355 356 static struct spdk_io_channel * 357 bdev_uring_get_io_channel(void *ctx) 358 { 359 struct bdev_uring *uring = ctx; 360 361 return spdk_get_io_channel(uring); 362 } 363 364 static int 365 bdev_uring_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 366 { 367 struct bdev_uring *uring = ctx; 368 369 spdk_json_write_named_object_begin(w, "uring"); 370 371 spdk_json_write_named_string(w, "filename", uring->filename); 372 373 spdk_json_write_object_end(w); 374 375 return 0; 376 } 377 378 static void 379 bdev_uring_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 380 { 381 struct bdev_uring *uring = bdev->ctxt; 382 383 spdk_json_write_object_begin(w); 384 385 spdk_json_write_named_string(w, "method", "bdev_uring_create"); 386 387 spdk_json_write_named_object_begin(w, "params"); 388 spdk_json_write_named_string(w, "name", bdev->name); 389 spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); 390 spdk_json_write_named_string(w, "filename", uring->filename); 391 spdk_json_write_object_end(w); 392 393 spdk_json_write_object_end(w); 394 } 395 396 static const struct spdk_bdev_fn_table uring_fn_table = { 397 .destruct = bdev_uring_destruct, 398 .submit_request = bdev_uring_submit_request, 399 .io_type_supported = bdev_uring_io_type_supported, 400 .get_io_channel = bdev_uring_get_io_channel, 401 .dump_info_json = bdev_uring_dump_info_json, 402 .write_config_json = bdev_uring_write_json_config, 403 }; 404 405 static void uring_free_bdev(struct bdev_uring *uring) 406 { 407 if (uring == NULL) { 408 return; 409 } 410 free(uring->filename); 411 free(uring->bdev.name); 412 free(uring); 413 } 414 415 static int 416 bdev_uring_group_create_cb(void *io_device, void *ctx_buf) 417 { 418 struct bdev_uring_group_channel *ch = ctx_buf; 419 420 /* Do not use IORING_SETUP_IOPOLL until the Linux kernel can support not only 421 * local devices but also devices attached from remote target */ 422 if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, 0) < 0) { 423 SPDK_ERRLOG("uring I/O context setup failure\n"); 424 return -1; 425 } 426 427 ch->poller = SPDK_POLLER_REGISTER(bdev_uring_group_poll, ch, 0); 428 return 0; 429 } 430 431 static void 432 bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf) 433 { 434 struct bdev_uring_group_channel *ch = ctx_buf; 435 436 io_uring_queue_exit(&ch->uring); 437 438 spdk_poller_unregister(&ch->poller); 439 } 440 441 struct spdk_bdev * 442 create_uring_bdev(const char *name, const char *filename, uint32_t block_size) 443 { 444 struct bdev_uring *uring; 445 uint32_t detected_block_size; 446 uint64_t bdev_size; 447 int rc; 448 449 uring = calloc(1, sizeof(*uring)); 450 if (!uring) { 451 SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n"); 452 return NULL; 453 } 454 455 uring->filename = strdup(filename); 456 if (!uring->filename) { 457 goto error_return; 458 } 459 460 if (bdev_uring_open(uring)) { 461 SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno); 462 goto error_return; 463 } 464 465 bdev_size = spdk_fd_get_size(uring->fd); 466 467 uring->bdev.name = strdup(name); 468 if (!uring->bdev.name) { 469 goto error_return; 470 } 471 uring->bdev.product_name = "URING bdev"; 472 uring->bdev.module = &uring_if; 473 474 uring->bdev.write_cache = 1; 475 476 detected_block_size = spdk_fd_get_blocklen(uring->fd); 477 if (block_size == 0) { 478 /* User did not specify block size - use autodetected block size. */ 479 if (detected_block_size == 0) { 480 SPDK_ERRLOG("Block size could not be auto-detected\n"); 481 goto error_return; 482 } 483 block_size = detected_block_size; 484 } else { 485 if (block_size < detected_block_size) { 486 SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than " 487 "auto-detected block size %" PRIu32 "\n", 488 block_size, detected_block_size); 489 goto error_return; 490 } else if (detected_block_size != 0 && block_size != detected_block_size) { 491 SPDK_WARNLOG("Specified block size %" PRIu32 " does not match " 492 "auto-detected block size %" PRIu32 "\n", 493 block_size, detected_block_size); 494 } 495 } 496 497 if (block_size < 512) { 498 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size); 499 goto error_return; 500 } 501 502 if (!spdk_u32_is_pow2(block_size)) { 503 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size); 504 goto error_return; 505 } 506 507 uring->bdev.blocklen = block_size; 508 uring->bdev.required_alignment = spdk_u32log2(block_size); 509 510 if (bdev_size % uring->bdev.blocklen != 0) { 511 SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", 512 bdev_size, uring->bdev.blocklen); 513 goto error_return; 514 } 515 516 uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen; 517 uring->bdev.ctxt = uring; 518 519 uring->bdev.fn_table = &uring_fn_table; 520 521 spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb, 522 sizeof(struct bdev_uring_io_channel), 523 uring->bdev.name); 524 rc = spdk_bdev_register(&uring->bdev); 525 if (rc) { 526 spdk_io_device_unregister(uring, NULL); 527 goto error_return; 528 } 529 530 TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link); 531 return &uring->bdev; 532 533 error_return: 534 bdev_uring_close(uring); 535 uring_free_bdev(uring); 536 return NULL; 537 } 538 539 struct delete_uring_bdev_ctx { 540 spdk_delete_uring_complete cb_fn; 541 void *cb_arg; 542 }; 543 544 static void 545 uring_bdev_unregister_cb(void *arg, int bdeverrno) 546 { 547 struct delete_uring_bdev_ctx *ctx = arg; 548 549 ctx->cb_fn(ctx->cb_arg, bdeverrno); 550 free(ctx); 551 } 552 553 void 554 delete_uring_bdev(const char *name, spdk_delete_uring_complete cb_fn, void *cb_arg) 555 { 556 struct delete_uring_bdev_ctx *ctx; 557 int rc; 558 559 ctx = calloc(1, sizeof(*ctx)); 560 if (ctx == NULL) { 561 cb_fn(cb_arg, -ENOMEM); 562 return; 563 } 564 565 ctx->cb_fn = cb_fn; 566 ctx->cb_arg = cb_arg; 567 rc = spdk_bdev_unregister_by_name(name, &uring_if, uring_bdev_unregister_cb, ctx); 568 if (rc != 0) { 569 uring_bdev_unregister_cb(ctx, rc); 570 } 571 } 572 573 static int 574 bdev_uring_init(void) 575 { 576 spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb, 577 sizeof(struct bdev_uring_group_channel), "uring_module"); 578 579 return 0; 580 } 581 582 static void 583 bdev_uring_fini(void) 584 { 585 spdk_io_device_unregister(&uring_if, NULL); 586 } 587 588 SPDK_LOG_REGISTER_COMPONENT(uring) 589