1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (c) Intel Corporation. 3 * All rights reserved. 4 */ 5 6 #include "bdev_uring.h" 7 8 #include "spdk/stdinc.h" 9 10 #include "spdk/barrier.h" 11 #include "spdk/bdev.h" 12 #include "spdk/env.h" 13 #include "spdk/fd.h" 14 #include "spdk/likely.h" 15 #include "spdk/thread.h" 16 #include "spdk/json.h" 17 #include "spdk/util.h" 18 #include "spdk/string.h" 19 20 #include "spdk/log.h" 21 #include "spdk_internal/uring.h" 22 23 struct bdev_uring_io_channel { 24 struct bdev_uring_group_channel *group_ch; 25 }; 26 27 struct bdev_uring_group_channel { 28 uint64_t io_inflight; 29 uint64_t io_pending; 30 struct spdk_poller *poller; 31 struct io_uring uring; 32 }; 33 34 struct bdev_uring_task { 35 uint64_t len; 36 struct bdev_uring_io_channel *ch; 37 TAILQ_ENTRY(bdev_uring_task) link; 38 }; 39 40 struct bdev_uring { 41 struct spdk_bdev bdev; 42 char *filename; 43 int fd; 44 TAILQ_ENTRY(bdev_uring) link; 45 }; 46 47 static int bdev_uring_init(void); 48 static void bdev_uring_fini(void); 49 static void uring_free_bdev(struct bdev_uring *uring); 50 static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head = TAILQ_HEAD_INITIALIZER(g_uring_bdev_head); 51 52 #define SPDK_URING_QUEUE_DEPTH 512 53 #define MAX_EVENTS_PER_POLL 32 54 55 static int 56 bdev_uring_get_ctx_size(void) 57 { 58 return sizeof(struct bdev_uring_task); 59 } 60 61 static struct spdk_bdev_module uring_if = { 62 .name = "uring", 63 .module_init = bdev_uring_init, 64 .module_fini = bdev_uring_fini, 65 .get_ctx_size = bdev_uring_get_ctx_size, 66 }; 67 68 SPDK_BDEV_MODULE_REGISTER(uring, &uring_if) 69 70 static int 71 bdev_uring_open(struct bdev_uring *bdev) 72 { 73 int fd; 74 75 fd = open(bdev->filename, O_RDWR | O_DIRECT | O_NOATIME); 76 if (fd < 0) { 77 /* Try without O_DIRECT for non-disk files */ 78 fd = open(bdev->filename, O_RDWR | O_NOATIME); 79 if (fd < 0) { 80 SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n", 81 bdev->filename, errno, spdk_strerror(errno)); 82 bdev->fd = -1; 83 return -1; 84 } 85 } 86 87 bdev->fd = fd; 88 89 return 0; 90 } 91 92 static int 93 bdev_uring_close(struct bdev_uring *bdev) 94 { 95 int rc; 96 97 if (bdev->fd == -1) { 98 return 0; 99 } 100 101 rc = close(bdev->fd); 102 if (rc < 0) { 103 SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n", 104 bdev->fd, errno, spdk_strerror(errno)); 105 return -1; 106 } 107 108 bdev->fd = -1; 109 110 return 0; 111 } 112 113 static int64_t 114 bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch, 115 struct bdev_uring_task *uring_task, 116 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) 117 { 118 struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); 119 struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; 120 struct io_uring_sqe *sqe; 121 122 sqe = io_uring_get_sqe(&group_ch->uring); 123 io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset); 124 io_uring_sqe_set_data(sqe, uring_task); 125 uring_task->len = nbytes; 126 uring_task->ch = uring_ch; 127 128 SPDK_DEBUGLOG(uring, "read %d iovs size %lu to off: %#lx\n", 129 iovcnt, nbytes, offset); 130 131 group_ch->io_pending++; 132 return nbytes; 133 } 134 135 static int64_t 136 bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch, 137 struct bdev_uring_task *uring_task, 138 struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset) 139 { 140 struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); 141 struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; 142 struct io_uring_sqe *sqe; 143 144 sqe = io_uring_get_sqe(&group_ch->uring); 145 io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset); 146 io_uring_sqe_set_data(sqe, uring_task); 147 uring_task->len = nbytes; 148 uring_task->ch = uring_ch; 149 150 SPDK_DEBUGLOG(uring, "write %d iovs size %lu from off: %#lx\n", 151 iovcnt, nbytes, offset); 152 153 group_ch->io_pending++; 154 return nbytes; 155 } 156 157 static int 158 bdev_uring_destruct(void *ctx) 159 { 160 struct bdev_uring *uring = ctx; 161 int rc = 0; 162 163 TAILQ_REMOVE(&g_uring_bdev_head, uring, link); 164 rc = bdev_uring_close(uring); 165 if (rc < 0) { 166 SPDK_ERRLOG("bdev_uring_close() failed\n"); 167 } 168 spdk_io_device_unregister(uring, NULL); 169 uring_free_bdev(uring); 170 return rc; 171 } 172 173 static int 174 bdev_uring_reap(struct io_uring *ring, int max) 175 { 176 int i, count, ret; 177 struct io_uring_cqe *cqe; 178 struct bdev_uring_task *uring_task; 179 enum spdk_bdev_io_status status; 180 181 count = 0; 182 for (i = 0; i < max; i++) { 183 ret = io_uring_peek_cqe(ring, &cqe); 184 if (ret != 0) { 185 return ret; 186 } 187 188 if (cqe == NULL) { 189 return count; 190 } 191 192 uring_task = (struct bdev_uring_task *)cqe->user_data; 193 if (cqe->res != (signed)uring_task->len) { 194 status = SPDK_BDEV_IO_STATUS_FAILED; 195 } else { 196 status = SPDK_BDEV_IO_STATUS_SUCCESS; 197 } 198 199 uring_task->ch->group_ch->io_inflight--; 200 io_uring_cqe_seen(ring, cqe); 201 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status); 202 count++; 203 } 204 205 return count; 206 } 207 208 static int 209 bdev_uring_group_poll(void *arg) 210 { 211 struct bdev_uring_group_channel *group_ch = arg; 212 int to_complete, to_submit; 213 int count, ret; 214 215 to_submit = group_ch->io_pending; 216 217 if (to_submit > 0) { 218 /* If there are I/O to submit, use io_uring_submit here. 219 * It will automatically call spdk_io_uring_enter appropriately. */ 220 ret = io_uring_submit(&group_ch->uring); 221 if (ret < 0) { 222 return SPDK_POLLER_BUSY; 223 } 224 225 group_ch->io_pending = 0; 226 group_ch->io_inflight += to_submit; 227 } 228 229 to_complete = group_ch->io_inflight; 230 count = 0; 231 if (to_complete > 0) { 232 count = bdev_uring_reap(&group_ch->uring, to_complete); 233 } 234 235 if (count + to_submit > 0) { 236 return SPDK_POLLER_BUSY; 237 } else { 238 return SPDK_POLLER_IDLE; 239 } 240 } 241 242 static void 243 bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 244 bool success) 245 { 246 if (!success) { 247 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 248 return; 249 } 250 251 switch (bdev_io->type) { 252 case SPDK_BDEV_IO_TYPE_READ: 253 bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt, 254 ch, 255 (struct bdev_uring_task *)bdev_io->driver_ctx, 256 bdev_io->u.bdev.iovs, 257 bdev_io->u.bdev.iovcnt, 258 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 259 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 260 break; 261 case SPDK_BDEV_IO_TYPE_WRITE: 262 bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt, 263 ch, 264 (struct bdev_uring_task *)bdev_io->driver_ctx, 265 bdev_io->u.bdev.iovs, 266 bdev_io->u.bdev.iovcnt, 267 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 268 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 269 break; 270 default: 271 SPDK_ERRLOG("Wrong io type\n"); 272 break; 273 } 274 } 275 276 static int 277 _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 278 { 279 switch (bdev_io->type) { 280 /* Read and write operations must be performed on buffers aligned to 281 * bdev->required_alignment. If user specified unaligned buffers, 282 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */ 283 case SPDK_BDEV_IO_TYPE_READ: 284 case SPDK_BDEV_IO_TYPE_WRITE: 285 spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb, 286 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 287 return 0; 288 default: 289 return -1; 290 } 291 } 292 293 static void 294 bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 295 { 296 if (_bdev_uring_submit_request(ch, bdev_io) < 0) { 297 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 298 } 299 } 300 301 static bool 302 bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 303 { 304 switch (io_type) { 305 case SPDK_BDEV_IO_TYPE_READ: 306 case SPDK_BDEV_IO_TYPE_WRITE: 307 return true; 308 default: 309 return false; 310 } 311 } 312 313 static int 314 bdev_uring_create_cb(void *io_device, void *ctx_buf) 315 { 316 struct bdev_uring_io_channel *ch = ctx_buf; 317 318 ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if)); 319 320 return 0; 321 } 322 323 static void 324 bdev_uring_destroy_cb(void *io_device, void *ctx_buf) 325 { 326 struct bdev_uring_io_channel *ch = ctx_buf; 327 328 spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch)); 329 } 330 331 static struct spdk_io_channel * 332 bdev_uring_get_io_channel(void *ctx) 333 { 334 struct bdev_uring *uring = ctx; 335 336 return spdk_get_io_channel(uring); 337 } 338 339 static int 340 bdev_uring_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 341 { 342 struct bdev_uring *uring = ctx; 343 344 spdk_json_write_named_object_begin(w, "uring"); 345 346 spdk_json_write_named_string(w, "filename", uring->filename); 347 348 spdk_json_write_object_end(w); 349 350 return 0; 351 } 352 353 static void 354 bdev_uring_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 355 { 356 struct bdev_uring *uring = bdev->ctxt; 357 358 spdk_json_write_object_begin(w); 359 360 spdk_json_write_named_string(w, "method", "bdev_uring_create"); 361 362 spdk_json_write_named_object_begin(w, "params"); 363 spdk_json_write_named_string(w, "name", bdev->name); 364 spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); 365 spdk_json_write_named_string(w, "filename", uring->filename); 366 spdk_json_write_object_end(w); 367 368 spdk_json_write_object_end(w); 369 } 370 371 static const struct spdk_bdev_fn_table uring_fn_table = { 372 .destruct = bdev_uring_destruct, 373 .submit_request = bdev_uring_submit_request, 374 .io_type_supported = bdev_uring_io_type_supported, 375 .get_io_channel = bdev_uring_get_io_channel, 376 .dump_info_json = bdev_uring_dump_info_json, 377 .write_config_json = bdev_uring_write_json_config, 378 }; 379 380 static void 381 uring_free_bdev(struct bdev_uring *uring) 382 { 383 if (uring == NULL) { 384 return; 385 } 386 free(uring->filename); 387 free(uring->bdev.name); 388 free(uring); 389 } 390 391 static int 392 bdev_uring_group_create_cb(void *io_device, void *ctx_buf) 393 { 394 struct bdev_uring_group_channel *ch = ctx_buf; 395 396 /* Do not use IORING_SETUP_IOPOLL until the Linux kernel can support not only 397 * local devices but also devices attached from remote target */ 398 if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, 0) < 0) { 399 SPDK_ERRLOG("uring I/O context setup failure\n"); 400 return -1; 401 } 402 403 ch->poller = SPDK_POLLER_REGISTER(bdev_uring_group_poll, ch, 0); 404 return 0; 405 } 406 407 static void 408 bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf) 409 { 410 struct bdev_uring_group_channel *ch = ctx_buf; 411 412 io_uring_queue_exit(&ch->uring); 413 414 spdk_poller_unregister(&ch->poller); 415 } 416 417 struct spdk_bdev * 418 create_uring_bdev(const char *name, const char *filename, uint32_t block_size) 419 { 420 struct bdev_uring *uring; 421 uint32_t detected_block_size; 422 uint64_t bdev_size; 423 int rc; 424 425 uring = calloc(1, sizeof(*uring)); 426 if (!uring) { 427 SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n"); 428 return NULL; 429 } 430 431 uring->filename = strdup(filename); 432 if (!uring->filename) { 433 goto error_return; 434 } 435 436 if (bdev_uring_open(uring)) { 437 SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno); 438 goto error_return; 439 } 440 441 bdev_size = spdk_fd_get_size(uring->fd); 442 443 uring->bdev.name = strdup(name); 444 if (!uring->bdev.name) { 445 goto error_return; 446 } 447 uring->bdev.product_name = "URING bdev"; 448 uring->bdev.module = &uring_if; 449 450 uring->bdev.write_cache = 1; 451 452 detected_block_size = spdk_fd_get_blocklen(uring->fd); 453 if (block_size == 0) { 454 /* User did not specify block size - use autodetected block size. */ 455 if (detected_block_size == 0) { 456 SPDK_ERRLOG("Block size could not be auto-detected\n"); 457 goto error_return; 458 } 459 block_size = detected_block_size; 460 } else { 461 if (block_size < detected_block_size) { 462 SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than " 463 "auto-detected block size %" PRIu32 "\n", 464 block_size, detected_block_size); 465 goto error_return; 466 } else if (detected_block_size != 0 && block_size != detected_block_size) { 467 SPDK_WARNLOG("Specified block size %" PRIu32 " does not match " 468 "auto-detected block size %" PRIu32 "\n", 469 block_size, detected_block_size); 470 } 471 } 472 473 if (block_size < 512) { 474 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size); 475 goto error_return; 476 } 477 478 if (!spdk_u32_is_pow2(block_size)) { 479 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size); 480 goto error_return; 481 } 482 483 uring->bdev.blocklen = block_size; 484 uring->bdev.required_alignment = spdk_u32log2(block_size); 485 486 if (bdev_size % uring->bdev.blocklen != 0) { 487 SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", 488 bdev_size, uring->bdev.blocklen); 489 goto error_return; 490 } 491 492 uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen; 493 uring->bdev.ctxt = uring; 494 495 uring->bdev.fn_table = &uring_fn_table; 496 497 spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb, 498 sizeof(struct bdev_uring_io_channel), 499 uring->bdev.name); 500 rc = spdk_bdev_register(&uring->bdev); 501 if (rc) { 502 spdk_io_device_unregister(uring, NULL); 503 goto error_return; 504 } 505 506 TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link); 507 return &uring->bdev; 508 509 error_return: 510 bdev_uring_close(uring); 511 uring_free_bdev(uring); 512 return NULL; 513 } 514 515 struct delete_uring_bdev_ctx { 516 spdk_delete_uring_complete cb_fn; 517 void *cb_arg; 518 }; 519 520 static void 521 uring_bdev_unregister_cb(void *arg, int bdeverrno) 522 { 523 struct delete_uring_bdev_ctx *ctx = arg; 524 525 ctx->cb_fn(ctx->cb_arg, bdeverrno); 526 free(ctx); 527 } 528 529 void 530 delete_uring_bdev(const char *name, spdk_delete_uring_complete cb_fn, void *cb_arg) 531 { 532 struct delete_uring_bdev_ctx *ctx; 533 int rc; 534 535 ctx = calloc(1, sizeof(*ctx)); 536 if (ctx == NULL) { 537 cb_fn(cb_arg, -ENOMEM); 538 return; 539 } 540 541 ctx->cb_fn = cb_fn; 542 ctx->cb_arg = cb_arg; 543 rc = spdk_bdev_unregister_by_name(name, &uring_if, uring_bdev_unregister_cb, ctx); 544 if (rc != 0) { 545 uring_bdev_unregister_cb(ctx, rc); 546 } 547 } 548 549 static int 550 bdev_uring_init(void) 551 { 552 spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb, 553 sizeof(struct bdev_uring_group_channel), "uring_module"); 554 555 return 0; 556 } 557 558 static void 559 bdev_uring_fini(void) 560 { 561 spdk_io_device_unregister(&uring_if, NULL); 562 } 563 564 SPDK_LOG_REGISTER_COMPONENT(uring) 565