1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2017 Intel Corporation. 3 * All rights reserved. 4 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "bdev_aio.h" 8 9 #include "spdk/stdinc.h" 10 11 #include "spdk/barrier.h" 12 #include "spdk/bdev.h" 13 #include "spdk/bdev_module.h" 14 #include "spdk/env.h" 15 #include "spdk/fd.h" 16 #include "spdk/likely.h" 17 #include "spdk/thread.h" 18 #include "spdk/json.h" 19 #include "spdk/util.h" 20 #include "spdk/string.h" 21 22 #include "spdk/log.h" 23 24 #include <sys/eventfd.h> 25 26 #ifndef __FreeBSD__ 27 #include <libaio.h> 28 #endif 29 30 struct bdev_aio_io_channel { 31 uint64_t io_inflight; 32 #ifdef __FreeBSD__ 33 int kqfd; 34 #else 35 io_context_t io_ctx; 36 #endif 37 struct bdev_aio_group_channel *group_ch; 38 TAILQ_ENTRY(bdev_aio_io_channel) link; 39 }; 40 41 struct bdev_aio_group_channel { 42 /* eventfd for io completion notification in interrupt mode. 43 * Negative value like '-1' indicates it is invalid or unused. 44 */ 45 int efd; 46 struct spdk_interrupt *intr; 47 struct spdk_poller *poller; 48 TAILQ_HEAD(, bdev_aio_io_channel) io_ch_head; 49 }; 50 51 struct bdev_aio_task { 52 #ifdef __FreeBSD__ 53 struct aiocb aiocb; 54 #else 55 struct iocb iocb; 56 #endif 57 uint64_t len; 58 struct bdev_aio_io_channel *ch; 59 }; 60 61 struct file_disk { 62 struct bdev_aio_task *reset_task; 63 struct spdk_poller *reset_retry_timer; 64 struct spdk_bdev disk; 65 char *filename; 66 int fd; 67 TAILQ_ENTRY(file_disk) link; 68 bool block_size_override; 69 bool readonly; 70 bool fallocate; 71 }; 72 73 /* For user space reaping of completions */ 74 struct spdk_aio_ring { 75 uint32_t id; 76 uint32_t size; 77 uint32_t head; 78 uint32_t tail; 79 80 uint32_t version; 81 uint32_t compat_features; 82 uint32_t incompat_features; 83 uint32_t header_length; 84 }; 85 86 #define SPDK_AIO_RING_VERSION 0xa10a10a1 87 88 static int bdev_aio_initialize(void); 89 static void bdev_aio_fini(void); 90 static void aio_free_disk(struct file_disk *fdisk); 91 static TAILQ_HEAD(, file_disk) g_aio_disk_head = TAILQ_HEAD_INITIALIZER(g_aio_disk_head); 92 93 #define SPDK_AIO_QUEUE_DEPTH 128 94 #define MAX_EVENTS_PER_POLL 32 95 96 static int 97 bdev_aio_get_ctx_size(void) 98 { 99 return sizeof(struct bdev_aio_task); 100 } 101 102 static struct spdk_bdev_module aio_if = { 103 .name = "aio", 104 .module_init = bdev_aio_initialize, 105 .module_fini = bdev_aio_fini, 106 .get_ctx_size = bdev_aio_get_ctx_size, 107 }; 108 109 SPDK_BDEV_MODULE_REGISTER(aio, &aio_if) 110 111 static int 112 bdev_aio_open(struct file_disk *disk) 113 { 114 int fd; 115 int io_flag = disk->readonly ? O_RDONLY : O_RDWR; 116 117 fd = open(disk->filename, io_flag | O_DIRECT); 118 if (fd < 0) { 119 /* Try without O_DIRECT for non-disk files */ 120 fd = open(disk->filename, io_flag); 121 if (fd < 0) { 122 SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n", 123 disk->filename, errno, spdk_strerror(errno)); 124 disk->fd = -1; 125 return -1; 126 } 127 } 128 129 disk->fd = fd; 130 131 return 0; 132 } 133 134 static int 135 bdev_aio_close(struct file_disk *disk) 136 { 137 int rc; 138 139 if (disk->fd == -1) { 140 return 0; 141 } 142 143 rc = close(disk->fd); 144 if (rc < 0) { 145 SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n", 146 disk->fd, errno, spdk_strerror(errno)); 147 return -1; 148 } 149 150 disk->fd = -1; 151 152 return 0; 153 } 154 155 #ifdef __FreeBSD__ 156 static int 157 bdev_aio_submit_io(enum spdk_bdev_io_type type, struct file_disk *fdisk, 158 struct spdk_io_channel *ch, struct bdev_aio_task *aio_task, 159 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) 160 { 161 struct aiocb *aiocb = &aio_task->aiocb; 162 struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch); 163 164 memset(aiocb, 0, sizeof(struct aiocb)); 165 aiocb->aio_fildes = fdisk->fd; 166 aiocb->aio_iov = iov; 167 aiocb->aio_iovcnt = iovcnt; 168 aiocb->aio_offset = offset; 169 aiocb->aio_sigevent.sigev_notify_kqueue = aio_ch->kqfd; 170 aiocb->aio_sigevent.sigev_value.sival_ptr = aio_task; 171 aiocb->aio_sigevent.sigev_notify = SIGEV_KEVENT; 172 173 aio_task->len = nbytes; 174 aio_task->ch = aio_ch; 175 176 if (type == SPDK_BDEV_IO_TYPE_READ) { 177 return aio_readv(aiocb); 178 } 179 180 return aio_writev(aiocb); 181 } 182 #else 183 static int 184 bdev_aio_submit_io(enum spdk_bdev_io_type type, struct file_disk *fdisk, 185 struct spdk_io_channel *ch, struct bdev_aio_task *aio_task, 186 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) 187 { 188 struct iocb *iocb = &aio_task->iocb; 189 struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch); 190 191 if (type == SPDK_BDEV_IO_TYPE_READ) { 192 io_prep_preadv(iocb, fdisk->fd, iov, iovcnt, offset); 193 } else { 194 io_prep_pwritev(iocb, fdisk->fd, iov, iovcnt, offset); 195 } 196 197 if (aio_ch->group_ch->efd >= 0) { 198 io_set_eventfd(iocb, aio_ch->group_ch->efd); 199 } 200 iocb->data = aio_task; 201 aio_task->len = nbytes; 202 aio_task->ch = aio_ch; 203 204 return io_submit(aio_ch->io_ctx, 1, &iocb); 205 } 206 #endif 207 208 static void 209 bdev_aio_rw(enum spdk_bdev_io_type type, struct file_disk *fdisk, 210 struct spdk_io_channel *ch, struct bdev_aio_task *aio_task, 211 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) 212 { 213 struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch); 214 int rc; 215 216 if (type == SPDK_BDEV_IO_TYPE_READ) { 217 SPDK_DEBUGLOG(aio, "read %d iovs size %lu to off: %#lx\n", 218 iovcnt, nbytes, offset); 219 } else { 220 SPDK_DEBUGLOG(aio, "write %d iovs size %lu from off: %#lx\n", 221 iovcnt, nbytes, offset); 222 } 223 224 rc = bdev_aio_submit_io(type, fdisk, ch, aio_task, iov, iovcnt, nbytes, offset); 225 if (spdk_unlikely(rc < 0)) { 226 if (rc == -EAGAIN) { 227 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM); 228 } else { 229 spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), rc); 230 SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc); 231 } 232 } else { 233 aio_ch->io_inflight++; 234 } 235 } 236 237 static void 238 bdev_aio_flush(struct file_disk *fdisk, struct bdev_aio_task *aio_task) 239 { 240 int rc = fsync(fdisk->fd); 241 242 if (rc == 0) { 243 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS); 244 } else { 245 spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -errno); 246 } 247 } 248 249 #ifndef __FreeBSD__ 250 static void 251 bdev_aio_fallocate(struct spdk_bdev_io *bdev_io, int mode) 252 { 253 struct file_disk *fdisk = (struct file_disk *)bdev_io->bdev->ctxt; 254 struct bdev_aio_task *aio_task = (struct bdev_aio_task *)bdev_io->driver_ctx; 255 uint64_t offset_bytes = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen; 256 uint64_t length_bytes = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 257 int rc; 258 259 if (!fdisk->fallocate) { 260 spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -ENOTSUP); 261 return; 262 } 263 264 rc = fallocate(fdisk->fd, mode, offset_bytes, length_bytes); 265 if (rc == 0) { 266 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS); 267 } else { 268 spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -errno); 269 } 270 } 271 272 static void 273 bdev_aio_unmap(struct spdk_bdev_io *bdev_io) 274 { 275 int mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE; 276 277 bdev_aio_fallocate(bdev_io, mode); 278 } 279 280 281 static void 282 bdev_aio_write_zeros(struct spdk_bdev_io *bdev_io) 283 { 284 int mode = FALLOC_FL_ZERO_RANGE; 285 286 bdev_aio_fallocate(bdev_io, mode); 287 } 288 #endif 289 290 static void 291 bdev_aio_destruct_cb(void *io_device) 292 { 293 struct file_disk *fdisk = io_device; 294 int rc = 0; 295 296 TAILQ_REMOVE(&g_aio_disk_head, fdisk, link); 297 rc = bdev_aio_close(fdisk); 298 if (rc < 0) { 299 SPDK_ERRLOG("bdev_aio_close() failed\n"); 300 } 301 aio_free_disk(fdisk); 302 } 303 304 static int 305 bdev_aio_destruct(void *ctx) 306 { 307 struct file_disk *fdisk = ctx; 308 309 spdk_io_device_unregister(fdisk, bdev_aio_destruct_cb); 310 311 return 0; 312 } 313 314 #ifdef __FreeBSD__ 315 static int 316 bdev_user_io_getevents(int kq, unsigned int max, struct kevent *events) 317 { 318 struct timespec ts; 319 int count; 320 321 memset(events, 0, max * sizeof(struct kevent)); 322 memset(&ts, 0, sizeof(ts)); 323 324 count = kevent(kq, NULL, 0, events, max, &ts); 325 if (count < 0) { 326 SPDK_ERRLOG("failed to get kevents: %s.\n", spdk_strerror(errno)); 327 return -errno; 328 } 329 330 return count; 331 } 332 333 static int 334 bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch) 335 { 336 int nr, i, res = 0; 337 struct bdev_aio_task *aio_task; 338 struct kevent events[SPDK_AIO_QUEUE_DEPTH]; 339 340 nr = bdev_user_io_getevents(io_ch->kqfd, SPDK_AIO_QUEUE_DEPTH, events); 341 if (nr < 0) { 342 return 0; 343 } 344 345 for (i = 0; i < nr; i++) { 346 aio_task = events[i].udata; 347 aio_task->ch->io_inflight--; 348 if (aio_task == NULL) { 349 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED); 350 break; 351 } else if ((uint64_t)aio_return(&aio_task->aiocb) == aio_task->len) { 352 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS); 353 } else { 354 SPDK_ERRLOG("failed to complete aio: rc %d\n", aio_error(&aio_task->aiocb)); 355 res = aio_error(&aio_task->aiocb); 356 if (res != 0) { 357 spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), res); 358 } else { 359 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED); 360 } 361 } 362 } 363 364 return nr; 365 } 366 #else 367 static int 368 bdev_user_io_getevents(io_context_t io_ctx, unsigned int max, struct io_event *uevents) 369 { 370 uint32_t head, tail, count; 371 struct spdk_aio_ring *ring; 372 struct timespec timeout; 373 struct io_event *kevents; 374 375 ring = (struct spdk_aio_ring *)io_ctx; 376 377 if (spdk_unlikely(ring->version != SPDK_AIO_RING_VERSION || ring->incompat_features != 0)) { 378 timeout.tv_sec = 0; 379 timeout.tv_nsec = 0; 380 381 return io_getevents(io_ctx, 0, max, uevents, &timeout); 382 } 383 384 /* Read the current state out of the ring */ 385 head = ring->head; 386 tail = ring->tail; 387 388 /* This memory barrier is required to prevent the loads above 389 * from being re-ordered with stores to the events array 390 * potentially occurring on other threads. */ 391 spdk_smp_rmb(); 392 393 /* Calculate how many items are in the circular ring */ 394 count = tail - head; 395 if (tail < head) { 396 count += ring->size; 397 } 398 399 /* Reduce the count to the limit provided by the user */ 400 count = spdk_min(max, count); 401 402 /* Grab the memory location of the event array */ 403 kevents = (struct io_event *)((uintptr_t)ring + ring->header_length); 404 405 /* Copy the events out of the ring. */ 406 if ((head + count) <= ring->size) { 407 /* Only one copy is required */ 408 memcpy(uevents, &kevents[head], count * sizeof(struct io_event)); 409 } else { 410 uint32_t first_part = ring->size - head; 411 /* Two copies are required */ 412 memcpy(uevents, &kevents[head], first_part * sizeof(struct io_event)); 413 memcpy(&uevents[first_part], &kevents[0], (count - first_part) * sizeof(struct io_event)); 414 } 415 416 /* Update the head pointer. On x86, stores will not be reordered with older loads, 417 * so the copies out of the event array will always be complete prior to this 418 * update becoming visible. On other architectures this is not guaranteed, so 419 * add a barrier. */ 420 #if defined(__i386__) || defined(__x86_64__) 421 spdk_compiler_barrier(); 422 #else 423 spdk_smp_mb(); 424 #endif 425 ring->head = (head + count) % ring->size; 426 427 return count; 428 } 429 430 static int 431 bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch) 432 { 433 int nr, i, res = 0; 434 struct bdev_aio_task *aio_task; 435 struct io_event events[SPDK_AIO_QUEUE_DEPTH]; 436 437 nr = bdev_user_io_getevents(io_ch->io_ctx, SPDK_AIO_QUEUE_DEPTH, events); 438 if (nr < 0) { 439 return 0; 440 } 441 442 for (i = 0; i < nr; i++) { 443 aio_task = events[i].data; 444 aio_task->ch->io_inflight--; 445 if (events[i].res == aio_task->len) { 446 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS); 447 } else { 448 /* From aio_abi.h, io_event.res is defined __s64, negative errno 449 * will be assigned to io_event.res for error situation. 450 * But from libaio.h, io_event.res is defined unsigned long, so 451 * convert it to signed value for error detection. 452 */ 453 SPDK_ERRLOG("failed to complete aio: rc %"PRId64"\n", events[i].res); 454 res = (int)events[i].res; 455 if (res < 0) { 456 spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), res); 457 } else { 458 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED); 459 } 460 } 461 } 462 463 return nr; 464 } 465 #endif 466 467 static int 468 bdev_aio_group_poll(void *arg) 469 { 470 struct bdev_aio_group_channel *group_ch = arg; 471 struct bdev_aio_io_channel *io_ch; 472 int nr = 0; 473 474 TAILQ_FOREACH(io_ch, &group_ch->io_ch_head, link) { 475 nr += bdev_aio_io_channel_poll(io_ch); 476 } 477 478 return nr > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 479 } 480 481 static int 482 bdev_aio_group_interrupt(void *arg) 483 { 484 struct bdev_aio_group_channel *group_ch = arg; 485 int rc; 486 uint64_t num_events; 487 488 assert(group_ch->efd >= 0); 489 490 /* if completed IO number is larger than SPDK_AIO_QUEUE_DEPTH, 491 * io_getevent should be called again to ensure all completed IO are processed. 492 */ 493 rc = read(group_ch->efd, &num_events, sizeof(num_events)); 494 if (rc < 0) { 495 SPDK_ERRLOG("failed to acknowledge aio group: %s.\n", spdk_strerror(errno)); 496 return -errno; 497 } 498 499 if (num_events > SPDK_AIO_QUEUE_DEPTH) { 500 num_events -= SPDK_AIO_QUEUE_DEPTH; 501 rc = write(group_ch->efd, &num_events, sizeof(num_events)); 502 if (rc < 0) { 503 SPDK_ERRLOG("failed to notify aio group: %s.\n", spdk_strerror(errno)); 504 } 505 } 506 507 return bdev_aio_group_poll(group_ch); 508 } 509 510 static void 511 _bdev_aio_get_io_inflight(struct spdk_io_channel_iter *i) 512 { 513 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 514 struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch); 515 516 if (aio_ch->io_inflight) { 517 spdk_for_each_channel_continue(i, -1); 518 return; 519 } 520 521 spdk_for_each_channel_continue(i, 0); 522 } 523 524 static int bdev_aio_reset_retry_timer(void *arg); 525 526 static void 527 _bdev_aio_get_io_inflight_done(struct spdk_io_channel_iter *i, int status) 528 { 529 struct file_disk *fdisk = spdk_io_channel_iter_get_ctx(i); 530 531 if (status == -1) { 532 fdisk->reset_retry_timer = SPDK_POLLER_REGISTER(bdev_aio_reset_retry_timer, fdisk, 500); 533 return; 534 } 535 536 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(fdisk->reset_task), SPDK_BDEV_IO_STATUS_SUCCESS); 537 } 538 539 static int 540 bdev_aio_reset_retry_timer(void *arg) 541 { 542 struct file_disk *fdisk = arg; 543 544 if (fdisk->reset_retry_timer) { 545 spdk_poller_unregister(&fdisk->reset_retry_timer); 546 } 547 548 spdk_for_each_channel(fdisk, 549 _bdev_aio_get_io_inflight, 550 fdisk, 551 _bdev_aio_get_io_inflight_done); 552 553 return SPDK_POLLER_BUSY; 554 } 555 556 static void 557 bdev_aio_reset(struct file_disk *fdisk, struct bdev_aio_task *aio_task) 558 { 559 fdisk->reset_task = aio_task; 560 561 bdev_aio_reset_retry_timer(fdisk); 562 } 563 564 static void 565 bdev_aio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 566 bool success) 567 { 568 if (!success) { 569 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 570 return; 571 } 572 573 switch (bdev_io->type) { 574 case SPDK_BDEV_IO_TYPE_READ: 575 case SPDK_BDEV_IO_TYPE_WRITE: 576 bdev_aio_rw(bdev_io->type, 577 (struct file_disk *)bdev_io->bdev->ctxt, 578 ch, 579 (struct bdev_aio_task *)bdev_io->driver_ctx, 580 bdev_io->u.bdev.iovs, 581 bdev_io->u.bdev.iovcnt, 582 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 583 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 584 break; 585 default: 586 SPDK_ERRLOG("Wrong io type\n"); 587 break; 588 } 589 } 590 591 static int 592 _bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 593 { 594 struct file_disk *fdisk = (struct file_disk *)bdev_io->bdev->ctxt; 595 596 switch (bdev_io->type) { 597 /* Read and write operations must be performed on buffers aligned to 598 * bdev->required_alignment. If user specified unaligned buffers, 599 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */ 600 case SPDK_BDEV_IO_TYPE_READ: 601 spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb, 602 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 603 return 0; 604 case SPDK_BDEV_IO_TYPE_WRITE: 605 if (fdisk->readonly) { 606 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 607 } else { 608 spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb, 609 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 610 } 611 return 0; 612 613 case SPDK_BDEV_IO_TYPE_FLUSH: 614 bdev_aio_flush((struct file_disk *)bdev_io->bdev->ctxt, 615 (struct bdev_aio_task *)bdev_io->driver_ctx); 616 return 0; 617 618 case SPDK_BDEV_IO_TYPE_RESET: 619 bdev_aio_reset((struct file_disk *)bdev_io->bdev->ctxt, 620 (struct bdev_aio_task *)bdev_io->driver_ctx); 621 return 0; 622 623 #ifndef __FreeBSD__ 624 case SPDK_BDEV_IO_TYPE_UNMAP: 625 bdev_aio_unmap(bdev_io); 626 return 0; 627 628 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 629 bdev_aio_write_zeros(bdev_io); 630 return 0; 631 #endif 632 633 default: 634 return -1; 635 } 636 } 637 638 static void 639 bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 640 { 641 if (_bdev_aio_submit_request(ch, bdev_io) < 0) { 642 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 643 } 644 } 645 646 static bool 647 bdev_aio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 648 { 649 struct file_disk *fdisk = ctx; 650 651 switch (io_type) { 652 case SPDK_BDEV_IO_TYPE_READ: 653 case SPDK_BDEV_IO_TYPE_WRITE: 654 case SPDK_BDEV_IO_TYPE_FLUSH: 655 case SPDK_BDEV_IO_TYPE_RESET: 656 return true; 657 658 case SPDK_BDEV_IO_TYPE_UNMAP: 659 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 660 return fdisk->fallocate; 661 662 default: 663 return false; 664 } 665 } 666 667 #ifdef __FreeBSD__ 668 static int 669 bdev_aio_create_io(struct bdev_aio_io_channel *ch) 670 { 671 ch->kqfd = kqueue(); 672 if (ch->kqfd < 0) { 673 SPDK_ERRLOG("async I/O context setup failure: %s.\n", spdk_strerror(errno)); 674 return -1; 675 } 676 677 return 0; 678 } 679 680 static void 681 bdev_aio_destroy_io(struct bdev_aio_io_channel *ch) 682 { 683 close(ch->kqfd); 684 } 685 #else 686 static int 687 bdev_aio_create_io(struct bdev_aio_io_channel *ch) 688 { 689 if (io_setup(SPDK_AIO_QUEUE_DEPTH, &ch->io_ctx) < 0) { 690 SPDK_ERRLOG("Async I/O context setup failure, likely due to exceeding kernel limit.\n"); 691 SPDK_ERRLOG("This limit may be increased using 'sysctl -w fs.aio-max-nr'.\n"); 692 return -1; 693 } 694 695 return 0; 696 } 697 698 static void 699 bdev_aio_destroy_io(struct bdev_aio_io_channel *ch) 700 { 701 io_destroy(ch->io_ctx); 702 } 703 #endif 704 705 static int 706 bdev_aio_create_cb(void *io_device, void *ctx_buf) 707 { 708 struct bdev_aio_io_channel *ch = ctx_buf; 709 int rc; 710 711 rc = bdev_aio_create_io(ch); 712 if (rc < 0) { 713 return rc; 714 } 715 716 ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&aio_if)); 717 TAILQ_INSERT_TAIL(&ch->group_ch->io_ch_head, ch, link); 718 719 return 0; 720 } 721 722 static void 723 bdev_aio_destroy_cb(void *io_device, void *ctx_buf) 724 { 725 struct bdev_aio_io_channel *ch = ctx_buf; 726 727 bdev_aio_destroy_io(ch); 728 729 assert(ch->group_ch); 730 TAILQ_REMOVE(&ch->group_ch->io_ch_head, ch, link); 731 732 spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch)); 733 } 734 735 static struct spdk_io_channel * 736 bdev_aio_get_io_channel(void *ctx) 737 { 738 struct file_disk *fdisk = ctx; 739 740 return spdk_get_io_channel(fdisk); 741 } 742 743 744 static int 745 bdev_aio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 746 { 747 struct file_disk *fdisk = ctx; 748 749 spdk_json_write_named_object_begin(w, "aio"); 750 751 spdk_json_write_named_string(w, "filename", fdisk->filename); 752 753 spdk_json_write_named_bool(w, "block_size_override", fdisk->block_size_override); 754 755 spdk_json_write_named_bool(w, "readonly", fdisk->readonly); 756 757 spdk_json_write_named_bool(w, "fallocate", fdisk->fallocate); 758 759 spdk_json_write_object_end(w); 760 761 return 0; 762 } 763 764 static void 765 bdev_aio_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 766 { 767 struct file_disk *fdisk = bdev->ctxt; 768 769 spdk_json_write_object_begin(w); 770 771 spdk_json_write_named_string(w, "method", "bdev_aio_create"); 772 773 spdk_json_write_named_object_begin(w, "params"); 774 spdk_json_write_named_string(w, "name", bdev->name); 775 if (fdisk->block_size_override) { 776 spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); 777 } 778 spdk_json_write_named_string(w, "filename", fdisk->filename); 779 spdk_json_write_named_bool(w, "readonly", fdisk->readonly); 780 spdk_json_write_named_bool(w, "fallocate", fdisk->fallocate); 781 spdk_json_write_object_end(w); 782 783 spdk_json_write_object_end(w); 784 } 785 786 static const struct spdk_bdev_fn_table aio_fn_table = { 787 .destruct = bdev_aio_destruct, 788 .submit_request = bdev_aio_submit_request, 789 .io_type_supported = bdev_aio_io_type_supported, 790 .get_io_channel = bdev_aio_get_io_channel, 791 .dump_info_json = bdev_aio_dump_info_json, 792 .write_config_json = bdev_aio_write_json_config, 793 }; 794 795 static void 796 aio_free_disk(struct file_disk *fdisk) 797 { 798 if (fdisk == NULL) { 799 return; 800 } 801 free(fdisk->filename); 802 free(fdisk->disk.name); 803 free(fdisk); 804 } 805 806 static int 807 bdev_aio_register_interrupt(struct bdev_aio_group_channel *ch) 808 { 809 int efd; 810 811 efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); 812 if (efd < 0) { 813 return -1; 814 } 815 816 ch->intr = SPDK_INTERRUPT_REGISTER(efd, bdev_aio_group_interrupt, ch); 817 if (ch->intr == NULL) { 818 close(efd); 819 return -1; 820 } 821 ch->efd = efd; 822 823 return 0; 824 } 825 826 static void 827 bdev_aio_unregister_interrupt(struct bdev_aio_group_channel *ch) 828 { 829 spdk_interrupt_unregister(&ch->intr); 830 close(ch->efd); 831 ch->efd = -1; 832 } 833 834 static void 835 bdev_aio_poller_set_interrupt_mode(struct spdk_poller *poller, void *cb_arg, bool interrupt_mode) 836 { 837 return; 838 } 839 840 static int 841 bdev_aio_group_create_cb(void *io_device, void *ctx_buf) 842 { 843 struct bdev_aio_group_channel *ch = ctx_buf; 844 int rc; 845 846 TAILQ_INIT(&ch->io_ch_head); 847 /* Initialize ch->efd to be invalid and unused. */ 848 ch->efd = -1; 849 if (spdk_interrupt_mode_is_enabled()) { 850 rc = bdev_aio_register_interrupt(ch); 851 if (rc < 0) { 852 SPDK_ERRLOG("Failed to prepare intr resource to bdev_aio\n"); 853 return rc; 854 } 855 } 856 857 ch->poller = SPDK_POLLER_REGISTER(bdev_aio_group_poll, ch, 0); 858 spdk_poller_register_interrupt(ch->poller, bdev_aio_poller_set_interrupt_mode, NULL); 859 860 return 0; 861 } 862 863 static void 864 bdev_aio_group_destroy_cb(void *io_device, void *ctx_buf) 865 { 866 struct bdev_aio_group_channel *ch = ctx_buf; 867 868 if (!TAILQ_EMPTY(&ch->io_ch_head)) { 869 SPDK_ERRLOG("Group channel of bdev aio has uncleared io channel\n"); 870 } 871 872 spdk_poller_unregister(&ch->poller); 873 if (spdk_interrupt_mode_is_enabled()) { 874 bdev_aio_unregister_interrupt(ch); 875 } 876 } 877 878 int 879 create_aio_bdev(const char *name, const char *filename, uint32_t block_size, bool readonly, 880 bool fallocate) 881 { 882 struct file_disk *fdisk; 883 uint32_t detected_block_size; 884 uint64_t disk_size; 885 int rc; 886 887 #ifdef __FreeBSD__ 888 if (fallocate) { 889 SPDK_ERRLOG("Unable to support fallocate on this platform\n"); 890 return -ENOTSUP; 891 } 892 #endif 893 894 fdisk = calloc(1, sizeof(*fdisk)); 895 if (!fdisk) { 896 SPDK_ERRLOG("Unable to allocate enough memory for aio backend\n"); 897 return -ENOMEM; 898 } 899 fdisk->readonly = readonly; 900 fdisk->fallocate = fallocate; 901 902 fdisk->filename = strdup(filename); 903 if (!fdisk->filename) { 904 rc = -ENOMEM; 905 goto error_return; 906 } 907 908 if (bdev_aio_open(fdisk)) { 909 SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, fdisk->fd, errno); 910 rc = -errno; 911 goto error_return; 912 } 913 914 disk_size = spdk_fd_get_size(fdisk->fd); 915 916 fdisk->disk.name = strdup(name); 917 if (!fdisk->disk.name) { 918 rc = -ENOMEM; 919 goto error_return; 920 } 921 fdisk->disk.product_name = "AIO disk"; 922 fdisk->disk.module = &aio_if; 923 924 fdisk->disk.write_cache = 1; 925 926 detected_block_size = spdk_fd_get_blocklen(fdisk->fd); 927 if (block_size == 0) { 928 /* User did not specify block size - use autodetected block size. */ 929 if (detected_block_size == 0) { 930 SPDK_ERRLOG("Block size could not be auto-detected\n"); 931 rc = -EINVAL; 932 goto error_return; 933 } 934 fdisk->block_size_override = false; 935 block_size = detected_block_size; 936 } else { 937 if (block_size < detected_block_size) { 938 SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than " 939 "auto-detected block size %" PRIu32 "\n", 940 block_size, detected_block_size); 941 rc = -EINVAL; 942 goto error_return; 943 } else if (detected_block_size != 0 && block_size != detected_block_size) { 944 SPDK_WARNLOG("Specified block size %" PRIu32 " does not match " 945 "auto-detected block size %" PRIu32 "\n", 946 block_size, detected_block_size); 947 } 948 fdisk->block_size_override = true; 949 } 950 951 if (block_size < 512) { 952 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size); 953 rc = -EINVAL; 954 goto error_return; 955 } 956 957 if (!spdk_u32_is_pow2(block_size)) { 958 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size); 959 rc = -EINVAL; 960 goto error_return; 961 } 962 963 fdisk->disk.blocklen = block_size; 964 if (fdisk->block_size_override && detected_block_size) { 965 fdisk->disk.required_alignment = spdk_u32log2(detected_block_size); 966 } else { 967 fdisk->disk.required_alignment = spdk_u32log2(block_size); 968 } 969 970 if (disk_size % fdisk->disk.blocklen != 0) { 971 SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", 972 disk_size, fdisk->disk.blocklen); 973 rc = -EINVAL; 974 goto error_return; 975 } 976 977 fdisk->disk.blockcnt = disk_size / fdisk->disk.blocklen; 978 fdisk->disk.ctxt = fdisk; 979 980 fdisk->disk.fn_table = &aio_fn_table; 981 982 spdk_io_device_register(fdisk, bdev_aio_create_cb, bdev_aio_destroy_cb, 983 sizeof(struct bdev_aio_io_channel), 984 fdisk->disk.name); 985 rc = spdk_bdev_register(&fdisk->disk); 986 if (rc) { 987 spdk_io_device_unregister(fdisk, NULL); 988 goto error_return; 989 } 990 991 TAILQ_INSERT_TAIL(&g_aio_disk_head, fdisk, link); 992 return 0; 993 994 error_return: 995 bdev_aio_close(fdisk); 996 aio_free_disk(fdisk); 997 return rc; 998 } 999 1000 static void 1001 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 1002 { 1003 } 1004 1005 int 1006 bdev_aio_rescan(const char *name) 1007 { 1008 struct spdk_bdev_desc *desc; 1009 struct spdk_bdev *bdev; 1010 struct file_disk *fdisk; 1011 uint64_t disk_size, blockcnt; 1012 int rc; 1013 1014 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &desc); 1015 if (rc != 0) { 1016 return rc; 1017 } 1018 1019 bdev = spdk_bdev_desc_get_bdev(desc); 1020 if (bdev->module != &aio_if) { 1021 rc = -ENODEV; 1022 goto exit; 1023 } 1024 1025 fdisk = SPDK_CONTAINEROF(bdev, struct file_disk, disk); 1026 disk_size = spdk_fd_get_size(fdisk->fd); 1027 blockcnt = disk_size / bdev->blocklen; 1028 1029 if (bdev->blockcnt != blockcnt) { 1030 SPDK_NOTICELOG("AIO device is resized: bdev name %s, old block count %" PRIu64 ", new block count %" 1031 PRIu64 "\n", 1032 fdisk->filename, 1033 bdev->blockcnt, 1034 blockcnt); 1035 rc = spdk_bdev_notify_blockcnt_change(bdev, blockcnt); 1036 if (rc != 0) { 1037 SPDK_ERRLOG("Could not change num blocks for aio bdev: name %s, errno: %d.\n", 1038 fdisk->filename, rc); 1039 goto exit; 1040 } 1041 } 1042 1043 exit: 1044 spdk_bdev_close(desc); 1045 return rc; 1046 } 1047 1048 struct delete_aio_bdev_ctx { 1049 delete_aio_bdev_complete cb_fn; 1050 void *cb_arg; 1051 }; 1052 1053 static void 1054 aio_bdev_unregister_cb(void *arg, int bdeverrno) 1055 { 1056 struct delete_aio_bdev_ctx *ctx = arg; 1057 1058 ctx->cb_fn(ctx->cb_arg, bdeverrno); 1059 free(ctx); 1060 } 1061 1062 void 1063 bdev_aio_delete(const char *name, delete_aio_bdev_complete cb_fn, void *cb_arg) 1064 { 1065 struct delete_aio_bdev_ctx *ctx; 1066 int rc; 1067 1068 ctx = calloc(1, sizeof(*ctx)); 1069 if (ctx == NULL) { 1070 cb_fn(cb_arg, -ENOMEM); 1071 return; 1072 } 1073 1074 ctx->cb_fn = cb_fn; 1075 ctx->cb_arg = cb_arg; 1076 rc = spdk_bdev_unregister_by_name(name, &aio_if, aio_bdev_unregister_cb, ctx); 1077 if (rc != 0) { 1078 aio_bdev_unregister_cb(ctx, rc); 1079 } 1080 } 1081 1082 static int 1083 bdev_aio_initialize(void) 1084 { 1085 spdk_io_device_register(&aio_if, bdev_aio_group_create_cb, bdev_aio_group_destroy_cb, 1086 sizeof(struct bdev_aio_group_channel), "aio_module"); 1087 1088 return 0; 1089 } 1090 1091 static void 1092 bdev_aio_fini(void) 1093 { 1094 spdk_io_device_unregister(&aio_if, NULL); 1095 } 1096 1097 SPDK_LOG_REGISTER_COMPONENT(aio) 1098