1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2017 Intel Corporation. 3 * All rights reserved. 4 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "bdev_aio.h" 8 9 #include "spdk/stdinc.h" 10 11 #include "spdk/barrier.h" 12 #include "spdk/bdev.h" 13 #include "spdk/bdev_module.h" 14 #include "spdk/env.h" 15 #include "spdk/fd.h" 16 #include "spdk/likely.h" 17 #include "spdk/thread.h" 18 #include "spdk/json.h" 19 #include "spdk/util.h" 20 #include "spdk/string.h" 21 22 #include "spdk/log.h" 23 24 #include <sys/eventfd.h> 25 26 #ifndef __FreeBSD__ 27 #include <libaio.h> 28 #endif 29 30 struct bdev_aio_io_channel { 31 uint64_t io_inflight; 32 #ifdef __FreeBSD__ 33 int kqfd; 34 #else 35 io_context_t io_ctx; 36 #endif 37 struct bdev_aio_group_channel *group_ch; 38 TAILQ_ENTRY(bdev_aio_io_channel) link; 39 }; 40 41 struct bdev_aio_group_channel { 42 /* eventfd for io completion notification in interrupt mode. 43 * Negative value like '-1' indicates it is invalid or unused. 44 */ 45 int efd; 46 struct spdk_interrupt *intr; 47 struct spdk_poller *poller; 48 TAILQ_HEAD(, bdev_aio_io_channel) io_ch_head; 49 }; 50 51 struct bdev_aio_task { 52 #ifdef __FreeBSD__ 53 struct aiocb aiocb; 54 #else 55 struct iocb iocb; 56 #endif 57 uint64_t len; 58 struct bdev_aio_io_channel *ch; 59 }; 60 61 struct file_disk { 62 struct bdev_aio_task *reset_task; 63 struct spdk_poller *reset_retry_timer; 64 struct spdk_bdev disk; 65 char *filename; 66 int fd; 67 bool use_nowait; 68 TAILQ_ENTRY(file_disk) link; 69 bool block_size_override; 70 bool readonly; 71 bool fallocate; 72 }; 73 74 /* For user space reaping of completions */ 75 struct spdk_aio_ring { 76 uint32_t id; 77 uint32_t size; 78 uint32_t head; 79 uint32_t tail; 80 81 uint32_t version; 82 uint32_t compat_features; 83 uint32_t incompat_features; 84 uint32_t header_length; 85 }; 86 87 #define SPDK_AIO_RING_VERSION 0xa10a10a1 88 89 static int bdev_aio_initialize(void); 90 static void bdev_aio_fini(void); 91 static void aio_free_disk(struct file_disk *fdisk); 92 static TAILQ_HEAD(, file_disk) g_aio_disk_head = TAILQ_HEAD_INITIALIZER(g_aio_disk_head); 93 94 #define SPDK_AIO_QUEUE_DEPTH 128 95 #define MAX_EVENTS_PER_POLL 32 96 97 static int 98 bdev_aio_get_ctx_size(void) 99 { 100 return sizeof(struct bdev_aio_task); 101 } 102 103 static struct spdk_bdev_module aio_if = { 104 .name = "aio", 105 .module_init = bdev_aio_initialize, 106 .module_fini = bdev_aio_fini, 107 .get_ctx_size = bdev_aio_get_ctx_size, 108 }; 109 110 SPDK_BDEV_MODULE_REGISTER(aio, &aio_if) 111 112 static int 113 bdev_aio_open(struct file_disk *disk) 114 { 115 int fd; 116 int io_flag = disk->readonly ? O_RDONLY : O_RDWR; 117 struct stat st; 118 119 fd = open(disk->filename, io_flag | O_DIRECT); 120 if (fd < 0) { 121 /* Try without O_DIRECT for non-disk files */ 122 fd = open(disk->filename, io_flag); 123 if (fd < 0) { 124 SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n", 125 disk->filename, errno, spdk_strerror(errno)); 126 disk->fd = -1; 127 return -1; 128 } 129 } 130 131 disk->fd = fd; 132 /* Some aio operations can block, for example if number outstanding 133 * I/O exceeds number of block layer tags. But not all files can 134 * support RWF_NOWAIT flag. So use RWF_NOWAIT on block devices only. 135 */ 136 disk->use_nowait = fstat(fd, &st) == 0 && S_ISBLK(st.st_mode); 137 138 return 0; 139 } 140 141 static int 142 bdev_aio_close(struct file_disk *disk) 143 { 144 int rc; 145 146 if (disk->fd == -1) { 147 return 0; 148 } 149 150 rc = close(disk->fd); 151 if (rc < 0) { 152 SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n", 153 disk->fd, errno, spdk_strerror(errno)); 154 return -1; 155 } 156 157 disk->fd = -1; 158 159 return 0; 160 } 161 162 #ifdef __FreeBSD__ 163 static int 164 bdev_aio_submit_io(enum spdk_bdev_io_type type, struct file_disk *fdisk, 165 struct spdk_io_channel *ch, struct bdev_aio_task *aio_task, 166 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) 167 { 168 struct aiocb *aiocb = &aio_task->aiocb; 169 struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch); 170 171 memset(aiocb, 0, sizeof(struct aiocb)); 172 aiocb->aio_fildes = fdisk->fd; 173 aiocb->aio_iov = iov; 174 aiocb->aio_iovcnt = iovcnt; 175 aiocb->aio_offset = offset; 176 aiocb->aio_sigevent.sigev_notify_kqueue = aio_ch->kqfd; 177 aiocb->aio_sigevent.sigev_value.sival_ptr = aio_task; 178 aiocb->aio_sigevent.sigev_notify = SIGEV_KEVENT; 179 180 aio_task->len = nbytes; 181 aio_task->ch = aio_ch; 182 183 if (type == SPDK_BDEV_IO_TYPE_READ) { 184 return aio_readv(aiocb); 185 } 186 187 return aio_writev(aiocb); 188 } 189 #else 190 static int 191 bdev_aio_submit_io(enum spdk_bdev_io_type type, struct file_disk *fdisk, 192 struct spdk_io_channel *ch, struct bdev_aio_task *aio_task, 193 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) 194 { 195 struct iocb *iocb = &aio_task->iocb; 196 struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch); 197 198 if (type == SPDK_BDEV_IO_TYPE_READ) { 199 io_prep_preadv(iocb, fdisk->fd, iov, iovcnt, offset); 200 } else { 201 io_prep_pwritev(iocb, fdisk->fd, iov, iovcnt, offset); 202 } 203 204 if (aio_ch->group_ch->efd >= 0) { 205 io_set_eventfd(iocb, aio_ch->group_ch->efd); 206 } 207 iocb->data = aio_task; 208 if (fdisk->use_nowait) { 209 iocb->aio_rw_flags = RWF_NOWAIT; 210 } 211 aio_task->len = nbytes; 212 aio_task->ch = aio_ch; 213 214 return io_submit(aio_ch->io_ctx, 1, &iocb); 215 } 216 #endif 217 218 static void 219 bdev_aio_rw(enum spdk_bdev_io_type type, struct file_disk *fdisk, 220 struct spdk_io_channel *ch, struct bdev_aio_task *aio_task, 221 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) 222 { 223 struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch); 224 int rc; 225 226 if (type == SPDK_BDEV_IO_TYPE_READ) { 227 SPDK_DEBUGLOG(aio, "read %d iovs size %lu to off: %#lx\n", 228 iovcnt, nbytes, offset); 229 } else { 230 SPDK_DEBUGLOG(aio, "write %d iovs size %lu from off: %#lx\n", 231 iovcnt, nbytes, offset); 232 } 233 234 rc = bdev_aio_submit_io(type, fdisk, ch, aio_task, iov, iovcnt, nbytes, offset); 235 if (spdk_unlikely(rc < 0)) { 236 if (rc == -EAGAIN) { 237 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM); 238 } else { 239 spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), rc); 240 SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc); 241 } 242 } else { 243 aio_ch->io_inflight++; 244 } 245 } 246 247 static void 248 bdev_aio_flush(struct file_disk *fdisk, struct bdev_aio_task *aio_task) 249 { 250 int rc = fsync(fdisk->fd); 251 252 if (rc == 0) { 253 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS); 254 } else { 255 spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -errno); 256 } 257 } 258 259 #ifndef __FreeBSD__ 260 static void 261 bdev_aio_fallocate(struct spdk_bdev_io *bdev_io, int mode) 262 { 263 struct file_disk *fdisk = (struct file_disk *)bdev_io->bdev->ctxt; 264 struct bdev_aio_task *aio_task = (struct bdev_aio_task *)bdev_io->driver_ctx; 265 uint64_t offset_bytes = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen; 266 uint64_t length_bytes = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 267 int rc; 268 269 if (!fdisk->fallocate) { 270 spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -ENOTSUP); 271 return; 272 } 273 274 rc = fallocate(fdisk->fd, mode, offset_bytes, length_bytes); 275 if (rc == 0) { 276 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS); 277 } else { 278 spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -errno); 279 } 280 } 281 282 static void 283 bdev_aio_unmap(struct spdk_bdev_io *bdev_io) 284 { 285 int mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE; 286 287 bdev_aio_fallocate(bdev_io, mode); 288 } 289 290 291 static void 292 bdev_aio_write_zeros(struct spdk_bdev_io *bdev_io) 293 { 294 int mode = FALLOC_FL_ZERO_RANGE; 295 296 bdev_aio_fallocate(bdev_io, mode); 297 } 298 #endif 299 300 static void 301 bdev_aio_destruct_cb(void *io_device) 302 { 303 struct file_disk *fdisk = io_device; 304 int rc = 0; 305 306 TAILQ_REMOVE(&g_aio_disk_head, fdisk, link); 307 rc = bdev_aio_close(fdisk); 308 if (rc < 0) { 309 SPDK_ERRLOG("bdev_aio_close() failed\n"); 310 } 311 aio_free_disk(fdisk); 312 } 313 314 static int 315 bdev_aio_destruct(void *ctx) 316 { 317 struct file_disk *fdisk = ctx; 318 319 spdk_io_device_unregister(fdisk, bdev_aio_destruct_cb); 320 321 return 0; 322 } 323 324 #ifdef __FreeBSD__ 325 static int 326 bdev_user_io_getevents(int kq, unsigned int max, struct kevent *events) 327 { 328 struct timespec ts; 329 int count; 330 331 memset(events, 0, max * sizeof(struct kevent)); 332 memset(&ts, 0, sizeof(ts)); 333 334 count = kevent(kq, NULL, 0, events, max, &ts); 335 if (count < 0) { 336 SPDK_ERRLOG("failed to get kevents: %s.\n", spdk_strerror(errno)); 337 return -errno; 338 } 339 340 return count; 341 } 342 343 static int 344 bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch) 345 { 346 int nr, i, res = 0; 347 struct bdev_aio_task *aio_task; 348 struct kevent events[SPDK_AIO_QUEUE_DEPTH]; 349 350 nr = bdev_user_io_getevents(io_ch->kqfd, SPDK_AIO_QUEUE_DEPTH, events); 351 if (nr < 0) { 352 return 0; 353 } 354 355 for (i = 0; i < nr; i++) { 356 aio_task = events[i].udata; 357 aio_task->ch->io_inflight--; 358 if (aio_task == NULL) { 359 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED); 360 break; 361 } else if ((uint64_t)aio_return(&aio_task->aiocb) == aio_task->len) { 362 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS); 363 } else { 364 SPDK_ERRLOG("failed to complete aio: rc %d\n", aio_error(&aio_task->aiocb)); 365 res = aio_error(&aio_task->aiocb); 366 if (res != 0) { 367 spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), res); 368 } else { 369 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED); 370 } 371 } 372 } 373 374 return nr; 375 } 376 #else 377 static int 378 bdev_user_io_getevents(io_context_t io_ctx, unsigned int max, struct io_event *uevents) 379 { 380 uint32_t head, tail, count; 381 struct spdk_aio_ring *ring; 382 struct timespec timeout; 383 struct io_event *kevents; 384 385 ring = (struct spdk_aio_ring *)io_ctx; 386 387 if (spdk_unlikely(ring->version != SPDK_AIO_RING_VERSION || ring->incompat_features != 0)) { 388 timeout.tv_sec = 0; 389 timeout.tv_nsec = 0; 390 391 return io_getevents(io_ctx, 0, max, uevents, &timeout); 392 } 393 394 /* Read the current state out of the ring */ 395 head = ring->head; 396 tail = ring->tail; 397 398 /* This memory barrier is required to prevent the loads above 399 * from being re-ordered with stores to the events array 400 * potentially occurring on other threads. */ 401 spdk_smp_rmb(); 402 403 /* Calculate how many items are in the circular ring */ 404 count = tail - head; 405 if (tail < head) { 406 count += ring->size; 407 } 408 409 /* Reduce the count to the limit provided by the user */ 410 count = spdk_min(max, count); 411 412 /* Grab the memory location of the event array */ 413 kevents = (struct io_event *)((uintptr_t)ring + ring->header_length); 414 415 /* Copy the events out of the ring. */ 416 if ((head + count) <= ring->size) { 417 /* Only one copy is required */ 418 memcpy(uevents, &kevents[head], count * sizeof(struct io_event)); 419 } else { 420 uint32_t first_part = ring->size - head; 421 /* Two copies are required */ 422 memcpy(uevents, &kevents[head], first_part * sizeof(struct io_event)); 423 memcpy(&uevents[first_part], &kevents[0], (count - first_part) * sizeof(struct io_event)); 424 } 425 426 /* Update the head pointer. On x86, stores will not be reordered with older loads, 427 * so the copies out of the event array will always be complete prior to this 428 * update becoming visible. On other architectures this is not guaranteed, so 429 * add a barrier. */ 430 #if defined(__i386__) || defined(__x86_64__) 431 spdk_compiler_barrier(); 432 #else 433 spdk_smp_mb(); 434 #endif 435 ring->head = (head + count) % ring->size; 436 437 return count; 438 } 439 440 static int 441 bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch) 442 { 443 int nr, i, res = 0; 444 struct bdev_aio_task *aio_task; 445 struct io_event events[SPDK_AIO_QUEUE_DEPTH]; 446 447 nr = bdev_user_io_getevents(io_ch->io_ctx, SPDK_AIO_QUEUE_DEPTH, events); 448 if (nr < 0) { 449 return 0; 450 } 451 452 for (i = 0; i < nr; i++) { 453 aio_task = events[i].data; 454 aio_task->ch->io_inflight--; 455 if (events[i].res == aio_task->len) { 456 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS); 457 } else { 458 /* From aio_abi.h, io_event.res is defined __s64, negative errno 459 * will be assigned to io_event.res for error situation. 460 * But from libaio.h, io_event.res is defined unsigned long, so 461 * convert it to signed value for error detection. 462 */ 463 res = (int)events[i].res; 464 if (res < 0) { 465 if (res == -EAGAIN) { 466 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM); 467 } else { 468 SPDK_ERRLOG("failed to complete aio: rc %"PRId64"\n", events[i].res); 469 spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), res); 470 } 471 } else { 472 SPDK_ERRLOG("failed to complete aio: rc %"PRId64"\n", events[i].res); 473 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED); 474 } 475 } 476 } 477 478 return nr; 479 } 480 #endif 481 482 static int 483 bdev_aio_group_poll(void *arg) 484 { 485 struct bdev_aio_group_channel *group_ch = arg; 486 struct bdev_aio_io_channel *io_ch; 487 int nr = 0; 488 489 TAILQ_FOREACH(io_ch, &group_ch->io_ch_head, link) { 490 nr += bdev_aio_io_channel_poll(io_ch); 491 } 492 493 return nr > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 494 } 495 496 static int 497 bdev_aio_group_interrupt(void *arg) 498 { 499 struct bdev_aio_group_channel *group_ch = arg; 500 int rc; 501 uint64_t num_events; 502 503 assert(group_ch->efd >= 0); 504 505 /* if completed IO number is larger than SPDK_AIO_QUEUE_DEPTH, 506 * io_getevent should be called again to ensure all completed IO are processed. 507 */ 508 rc = read(group_ch->efd, &num_events, sizeof(num_events)); 509 if (rc < 0) { 510 SPDK_ERRLOG("failed to acknowledge aio group: %s.\n", spdk_strerror(errno)); 511 return -errno; 512 } 513 514 if (num_events > SPDK_AIO_QUEUE_DEPTH) { 515 num_events -= SPDK_AIO_QUEUE_DEPTH; 516 rc = write(group_ch->efd, &num_events, sizeof(num_events)); 517 if (rc < 0) { 518 SPDK_ERRLOG("failed to notify aio group: %s.\n", spdk_strerror(errno)); 519 } 520 } 521 522 return bdev_aio_group_poll(group_ch); 523 } 524 525 static void 526 _bdev_aio_get_io_inflight(struct spdk_io_channel_iter *i) 527 { 528 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 529 struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch); 530 531 if (aio_ch->io_inflight) { 532 spdk_for_each_channel_continue(i, -1); 533 return; 534 } 535 536 spdk_for_each_channel_continue(i, 0); 537 } 538 539 static int bdev_aio_reset_retry_timer(void *arg); 540 541 static void 542 _bdev_aio_get_io_inflight_done(struct spdk_io_channel_iter *i, int status) 543 { 544 struct file_disk *fdisk = spdk_io_channel_iter_get_ctx(i); 545 546 if (status == -1) { 547 fdisk->reset_retry_timer = SPDK_POLLER_REGISTER(bdev_aio_reset_retry_timer, fdisk, 500); 548 return; 549 } 550 551 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(fdisk->reset_task), SPDK_BDEV_IO_STATUS_SUCCESS); 552 } 553 554 static int 555 bdev_aio_reset_retry_timer(void *arg) 556 { 557 struct file_disk *fdisk = arg; 558 559 if (fdisk->reset_retry_timer) { 560 spdk_poller_unregister(&fdisk->reset_retry_timer); 561 } 562 563 spdk_for_each_channel(fdisk, 564 _bdev_aio_get_io_inflight, 565 fdisk, 566 _bdev_aio_get_io_inflight_done); 567 568 return SPDK_POLLER_BUSY; 569 } 570 571 static void 572 bdev_aio_reset(struct file_disk *fdisk, struct bdev_aio_task *aio_task) 573 { 574 fdisk->reset_task = aio_task; 575 576 bdev_aio_reset_retry_timer(fdisk); 577 } 578 579 static void 580 bdev_aio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 581 bool success) 582 { 583 if (!success) { 584 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 585 return; 586 } 587 588 switch (bdev_io->type) { 589 case SPDK_BDEV_IO_TYPE_READ: 590 case SPDK_BDEV_IO_TYPE_WRITE: 591 bdev_aio_rw(bdev_io->type, 592 (struct file_disk *)bdev_io->bdev->ctxt, 593 ch, 594 (struct bdev_aio_task *)bdev_io->driver_ctx, 595 bdev_io->u.bdev.iovs, 596 bdev_io->u.bdev.iovcnt, 597 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 598 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 599 break; 600 default: 601 SPDK_ERRLOG("Wrong io type\n"); 602 break; 603 } 604 } 605 606 static int 607 _bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 608 { 609 struct file_disk *fdisk = (struct file_disk *)bdev_io->bdev->ctxt; 610 611 switch (bdev_io->type) { 612 /* Read and write operations must be performed on buffers aligned to 613 * bdev->required_alignment. If user specified unaligned buffers, 614 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */ 615 case SPDK_BDEV_IO_TYPE_READ: 616 spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb, 617 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 618 return 0; 619 case SPDK_BDEV_IO_TYPE_WRITE: 620 if (fdisk->readonly) { 621 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 622 } else { 623 spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb, 624 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 625 } 626 return 0; 627 628 case SPDK_BDEV_IO_TYPE_FLUSH: 629 bdev_aio_flush((struct file_disk *)bdev_io->bdev->ctxt, 630 (struct bdev_aio_task *)bdev_io->driver_ctx); 631 return 0; 632 633 case SPDK_BDEV_IO_TYPE_RESET: 634 bdev_aio_reset((struct file_disk *)bdev_io->bdev->ctxt, 635 (struct bdev_aio_task *)bdev_io->driver_ctx); 636 return 0; 637 638 #ifndef __FreeBSD__ 639 case SPDK_BDEV_IO_TYPE_UNMAP: 640 bdev_aio_unmap(bdev_io); 641 return 0; 642 643 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 644 bdev_aio_write_zeros(bdev_io); 645 return 0; 646 #endif 647 648 default: 649 return -1; 650 } 651 } 652 653 static void 654 bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 655 { 656 if (_bdev_aio_submit_request(ch, bdev_io) < 0) { 657 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 658 } 659 } 660 661 static bool 662 bdev_aio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 663 { 664 struct file_disk *fdisk = ctx; 665 666 switch (io_type) { 667 case SPDK_BDEV_IO_TYPE_READ: 668 case SPDK_BDEV_IO_TYPE_WRITE: 669 case SPDK_BDEV_IO_TYPE_FLUSH: 670 case SPDK_BDEV_IO_TYPE_RESET: 671 return true; 672 673 case SPDK_BDEV_IO_TYPE_UNMAP: 674 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 675 return fdisk->fallocate; 676 677 default: 678 return false; 679 } 680 } 681 682 #ifdef __FreeBSD__ 683 static int 684 bdev_aio_create_io(struct bdev_aio_io_channel *ch) 685 { 686 ch->kqfd = kqueue(); 687 if (ch->kqfd < 0) { 688 SPDK_ERRLOG("async I/O context setup failure: %s.\n", spdk_strerror(errno)); 689 return -1; 690 } 691 692 return 0; 693 } 694 695 static void 696 bdev_aio_destroy_io(struct bdev_aio_io_channel *ch) 697 { 698 close(ch->kqfd); 699 } 700 #else 701 static int 702 bdev_aio_create_io(struct bdev_aio_io_channel *ch) 703 { 704 if (io_setup(SPDK_AIO_QUEUE_DEPTH, &ch->io_ctx) < 0) { 705 SPDK_ERRLOG("Async I/O context setup failure, likely due to exceeding kernel limit.\n"); 706 SPDK_ERRLOG("This limit may be increased using 'sysctl -w fs.aio-max-nr'.\n"); 707 return -1; 708 } 709 710 return 0; 711 } 712 713 static void 714 bdev_aio_destroy_io(struct bdev_aio_io_channel *ch) 715 { 716 io_destroy(ch->io_ctx); 717 } 718 #endif 719 720 static int 721 bdev_aio_create_cb(void *io_device, void *ctx_buf) 722 { 723 struct bdev_aio_io_channel *ch = ctx_buf; 724 int rc; 725 726 rc = bdev_aio_create_io(ch); 727 if (rc < 0) { 728 return rc; 729 } 730 731 ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&aio_if)); 732 TAILQ_INSERT_TAIL(&ch->group_ch->io_ch_head, ch, link); 733 734 return 0; 735 } 736 737 static void 738 bdev_aio_destroy_cb(void *io_device, void *ctx_buf) 739 { 740 struct bdev_aio_io_channel *ch = ctx_buf; 741 742 bdev_aio_destroy_io(ch); 743 744 assert(ch->group_ch); 745 TAILQ_REMOVE(&ch->group_ch->io_ch_head, ch, link); 746 747 spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch)); 748 } 749 750 static struct spdk_io_channel * 751 bdev_aio_get_io_channel(void *ctx) 752 { 753 struct file_disk *fdisk = ctx; 754 755 return spdk_get_io_channel(fdisk); 756 } 757 758 759 static int 760 bdev_aio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 761 { 762 struct file_disk *fdisk = ctx; 763 764 spdk_json_write_named_object_begin(w, "aio"); 765 766 spdk_json_write_named_string(w, "filename", fdisk->filename); 767 768 spdk_json_write_named_bool(w, "block_size_override", fdisk->block_size_override); 769 770 spdk_json_write_named_bool(w, "readonly", fdisk->readonly); 771 772 spdk_json_write_named_bool(w, "fallocate", fdisk->fallocate); 773 774 spdk_json_write_object_end(w); 775 776 return 0; 777 } 778 779 static void 780 bdev_aio_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 781 { 782 struct file_disk *fdisk = bdev->ctxt; 783 const struct spdk_uuid *uuid = spdk_bdev_get_uuid(bdev); 784 785 spdk_json_write_object_begin(w); 786 787 spdk_json_write_named_string(w, "method", "bdev_aio_create"); 788 789 spdk_json_write_named_object_begin(w, "params"); 790 spdk_json_write_named_string(w, "name", bdev->name); 791 if (fdisk->block_size_override) { 792 spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); 793 } 794 spdk_json_write_named_string(w, "filename", fdisk->filename); 795 spdk_json_write_named_bool(w, "readonly", fdisk->readonly); 796 spdk_json_write_named_bool(w, "fallocate", fdisk->fallocate); 797 if (!spdk_uuid_is_null(uuid)) { 798 spdk_json_write_named_uuid(w, "uuid", uuid); 799 } 800 spdk_json_write_object_end(w); 801 802 spdk_json_write_object_end(w); 803 } 804 805 static const struct spdk_bdev_fn_table aio_fn_table = { 806 .destruct = bdev_aio_destruct, 807 .submit_request = bdev_aio_submit_request, 808 .io_type_supported = bdev_aio_io_type_supported, 809 .get_io_channel = bdev_aio_get_io_channel, 810 .dump_info_json = bdev_aio_dump_info_json, 811 .write_config_json = bdev_aio_write_json_config, 812 }; 813 814 static void 815 aio_free_disk(struct file_disk *fdisk) 816 { 817 if (fdisk == NULL) { 818 return; 819 } 820 free(fdisk->filename); 821 free(fdisk->disk.name); 822 free(fdisk); 823 } 824 825 static int 826 bdev_aio_register_interrupt(struct bdev_aio_group_channel *ch) 827 { 828 int efd; 829 830 efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); 831 if (efd < 0) { 832 return -1; 833 } 834 835 ch->intr = SPDK_INTERRUPT_REGISTER(efd, bdev_aio_group_interrupt, ch); 836 if (ch->intr == NULL) { 837 close(efd); 838 return -1; 839 } 840 ch->efd = efd; 841 842 return 0; 843 } 844 845 static void 846 bdev_aio_unregister_interrupt(struct bdev_aio_group_channel *ch) 847 { 848 spdk_interrupt_unregister(&ch->intr); 849 close(ch->efd); 850 ch->efd = -1; 851 } 852 853 static int 854 bdev_aio_group_create_cb(void *io_device, void *ctx_buf) 855 { 856 struct bdev_aio_group_channel *ch = ctx_buf; 857 int rc; 858 859 TAILQ_INIT(&ch->io_ch_head); 860 /* Initialize ch->efd to be invalid and unused. */ 861 ch->efd = -1; 862 if (spdk_interrupt_mode_is_enabled()) { 863 rc = bdev_aio_register_interrupt(ch); 864 if (rc < 0) { 865 SPDK_ERRLOG("Failed to prepare intr resource to bdev_aio\n"); 866 return rc; 867 } 868 } 869 870 ch->poller = SPDK_POLLER_REGISTER(bdev_aio_group_poll, ch, 0); 871 spdk_poller_register_interrupt(ch->poller, NULL, NULL); 872 873 return 0; 874 } 875 876 static void 877 bdev_aio_group_destroy_cb(void *io_device, void *ctx_buf) 878 { 879 struct bdev_aio_group_channel *ch = ctx_buf; 880 881 if (!TAILQ_EMPTY(&ch->io_ch_head)) { 882 SPDK_ERRLOG("Group channel of bdev aio has uncleared io channel\n"); 883 } 884 885 spdk_poller_unregister(&ch->poller); 886 if (spdk_interrupt_mode_is_enabled()) { 887 bdev_aio_unregister_interrupt(ch); 888 } 889 } 890 891 int 892 create_aio_bdev(const char *name, const char *filename, uint32_t block_size, bool readonly, 893 bool fallocate, const struct spdk_uuid *uuid) 894 { 895 struct file_disk *fdisk; 896 uint32_t detected_block_size; 897 uint64_t disk_size; 898 int rc; 899 900 #ifdef __FreeBSD__ 901 if (fallocate) { 902 SPDK_ERRLOG("Unable to support fallocate on this platform\n"); 903 return -ENOTSUP; 904 } 905 #endif 906 907 fdisk = calloc(1, sizeof(*fdisk)); 908 if (!fdisk) { 909 SPDK_ERRLOG("Unable to allocate enough memory for aio backend\n"); 910 return -ENOMEM; 911 } 912 fdisk->readonly = readonly; 913 fdisk->fallocate = fallocate; 914 915 fdisk->filename = strdup(filename); 916 if (!fdisk->filename) { 917 rc = -ENOMEM; 918 goto error_return; 919 } 920 921 if (bdev_aio_open(fdisk)) { 922 SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, fdisk->fd, errno); 923 rc = -errno; 924 goto error_return; 925 } 926 927 disk_size = spdk_fd_get_size(fdisk->fd); 928 929 fdisk->disk.name = strdup(name); 930 if (!fdisk->disk.name) { 931 rc = -ENOMEM; 932 goto error_return; 933 } 934 fdisk->disk.product_name = "AIO disk"; 935 fdisk->disk.module = &aio_if; 936 937 fdisk->disk.write_cache = 1; 938 939 detected_block_size = spdk_fd_get_blocklen(fdisk->fd); 940 if (block_size == 0) { 941 /* User did not specify block size - use autodetected block size. */ 942 if (detected_block_size == 0) { 943 SPDK_ERRLOG("Block size could not be auto-detected\n"); 944 rc = -EINVAL; 945 goto error_return; 946 } 947 fdisk->block_size_override = false; 948 block_size = detected_block_size; 949 } else { 950 if (block_size < detected_block_size) { 951 SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than " 952 "auto-detected block size %" PRIu32 "\n", 953 block_size, detected_block_size); 954 rc = -EINVAL; 955 goto error_return; 956 } else if (detected_block_size != 0 && block_size != detected_block_size) { 957 SPDK_WARNLOG("Specified block size %" PRIu32 " does not match " 958 "auto-detected block size %" PRIu32 "\n", 959 block_size, detected_block_size); 960 } 961 fdisk->block_size_override = true; 962 } 963 964 if (block_size < 512) { 965 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size); 966 rc = -EINVAL; 967 goto error_return; 968 } 969 970 if (!spdk_u32_is_pow2(block_size)) { 971 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size); 972 rc = -EINVAL; 973 goto error_return; 974 } 975 976 fdisk->disk.blocklen = block_size; 977 if (fdisk->block_size_override && detected_block_size) { 978 fdisk->disk.required_alignment = spdk_u32log2(detected_block_size); 979 } else { 980 fdisk->disk.required_alignment = spdk_u32log2(block_size); 981 } 982 983 if (disk_size % fdisk->disk.blocklen != 0) { 984 SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", 985 disk_size, fdisk->disk.blocklen); 986 rc = -EINVAL; 987 goto error_return; 988 } 989 990 fdisk->disk.blockcnt = disk_size / fdisk->disk.blocklen; 991 fdisk->disk.ctxt = fdisk; 992 spdk_uuid_copy(&fdisk->disk.uuid, uuid); 993 994 fdisk->disk.fn_table = &aio_fn_table; 995 996 spdk_io_device_register(fdisk, bdev_aio_create_cb, bdev_aio_destroy_cb, 997 sizeof(struct bdev_aio_io_channel), 998 fdisk->disk.name); 999 rc = spdk_bdev_register(&fdisk->disk); 1000 if (rc) { 1001 spdk_io_device_unregister(fdisk, NULL); 1002 goto error_return; 1003 } 1004 1005 TAILQ_INSERT_TAIL(&g_aio_disk_head, fdisk, link); 1006 return 0; 1007 1008 error_return: 1009 bdev_aio_close(fdisk); 1010 aio_free_disk(fdisk); 1011 return rc; 1012 } 1013 1014 static void 1015 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 1016 { 1017 } 1018 1019 int 1020 bdev_aio_rescan(const char *name) 1021 { 1022 struct spdk_bdev_desc *desc; 1023 struct spdk_bdev *bdev; 1024 struct file_disk *fdisk; 1025 uint64_t disk_size, blockcnt; 1026 int rc; 1027 1028 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &desc); 1029 if (rc != 0) { 1030 return rc; 1031 } 1032 1033 bdev = spdk_bdev_desc_get_bdev(desc); 1034 if (bdev->module != &aio_if) { 1035 rc = -ENODEV; 1036 goto exit; 1037 } 1038 1039 fdisk = SPDK_CONTAINEROF(bdev, struct file_disk, disk); 1040 disk_size = spdk_fd_get_size(fdisk->fd); 1041 blockcnt = disk_size / bdev->blocklen; 1042 1043 if (bdev->blockcnt != blockcnt) { 1044 SPDK_NOTICELOG("AIO device is resized: bdev name %s, old block count %" PRIu64 ", new block count %" 1045 PRIu64 "\n", 1046 fdisk->filename, 1047 bdev->blockcnt, 1048 blockcnt); 1049 rc = spdk_bdev_notify_blockcnt_change(bdev, blockcnt); 1050 if (rc != 0) { 1051 SPDK_ERRLOG("Could not change num blocks for aio bdev: name %s, errno: %d.\n", 1052 fdisk->filename, rc); 1053 goto exit; 1054 } 1055 } 1056 1057 exit: 1058 spdk_bdev_close(desc); 1059 return rc; 1060 } 1061 1062 struct delete_aio_bdev_ctx { 1063 delete_aio_bdev_complete cb_fn; 1064 void *cb_arg; 1065 }; 1066 1067 static void 1068 aio_bdev_unregister_cb(void *arg, int bdeverrno) 1069 { 1070 struct delete_aio_bdev_ctx *ctx = arg; 1071 1072 ctx->cb_fn(ctx->cb_arg, bdeverrno); 1073 free(ctx); 1074 } 1075 1076 void 1077 bdev_aio_delete(const char *name, delete_aio_bdev_complete cb_fn, void *cb_arg) 1078 { 1079 struct delete_aio_bdev_ctx *ctx; 1080 int rc; 1081 1082 ctx = calloc(1, sizeof(*ctx)); 1083 if (ctx == NULL) { 1084 cb_fn(cb_arg, -ENOMEM); 1085 return; 1086 } 1087 1088 ctx->cb_fn = cb_fn; 1089 ctx->cb_arg = cb_arg; 1090 rc = spdk_bdev_unregister_by_name(name, &aio_if, aio_bdev_unregister_cb, ctx); 1091 if (rc != 0) { 1092 aio_bdev_unregister_cb(ctx, rc); 1093 } 1094 } 1095 1096 static int 1097 bdev_aio_initialize(void) 1098 { 1099 spdk_io_device_register(&aio_if, bdev_aio_group_create_cb, bdev_aio_group_destroy_cb, 1100 sizeof(struct bdev_aio_group_channel), "aio_module"); 1101 1102 return 0; 1103 } 1104 1105 static void 1106 bdev_aio_fini(void) 1107 { 1108 spdk_io_device_unregister(&aio_if, NULL); 1109 } 1110 1111 SPDK_LOG_REGISTER_COMPONENT(aio) 1112