1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2017 Intel Corporation. 3 * All rights reserved. 4 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "bdev_aio.h" 8 9 #include "spdk/stdinc.h" 10 11 #include "spdk/barrier.h" 12 #include "spdk/bdev.h" 13 #include "spdk/bdev_module.h" 14 #include "spdk/env.h" 15 #include "spdk/fd.h" 16 #include "spdk/likely.h" 17 #include "spdk/thread.h" 18 #include "spdk/json.h" 19 #include "spdk/util.h" 20 #include "spdk/string.h" 21 22 #include "spdk/log.h" 23 24 #include <sys/eventfd.h> 25 26 #ifndef __FreeBSD__ 27 #include <libaio.h> 28 #endif 29 30 struct bdev_aio_io_channel { 31 uint64_t io_inflight; 32 #ifdef __FreeBSD__ 33 int kqfd; 34 #else 35 io_context_t io_ctx; 36 #endif 37 struct bdev_aio_group_channel *group_ch; 38 TAILQ_ENTRY(bdev_aio_io_channel) link; 39 }; 40 41 struct bdev_aio_group_channel { 42 /* eventfd for io completion notification in interrupt mode. 43 * Negative value like '-1' indicates it is invalid or unused. 44 */ 45 int efd; 46 struct spdk_interrupt *intr; 47 struct spdk_poller *poller; 48 TAILQ_HEAD(, bdev_aio_io_channel) io_ch_head; 49 }; 50 51 struct bdev_aio_task { 52 #ifdef __FreeBSD__ 53 struct aiocb aiocb; 54 #else 55 struct iocb iocb; 56 #endif 57 uint64_t len; 58 struct bdev_aio_io_channel *ch; 59 }; 60 61 struct file_disk { 62 struct bdev_aio_task *reset_task; 63 struct spdk_poller *reset_retry_timer; 64 struct spdk_bdev disk; 65 char *filename; 66 int fd; 67 TAILQ_ENTRY(file_disk) link; 68 bool block_size_override; 69 bool readonly; 70 bool fallocate; 71 }; 72 73 /* For user space reaping of completions */ 74 struct spdk_aio_ring { 75 uint32_t id; 76 uint32_t size; 77 uint32_t head; 78 uint32_t tail; 79 80 uint32_t version; 81 uint32_t compat_features; 82 uint32_t incompat_features; 83 uint32_t header_length; 84 }; 85 86 #define SPDK_AIO_RING_VERSION 0xa10a10a1 87 88 static int bdev_aio_initialize(void); 89 static void bdev_aio_fini(void); 90 static void aio_free_disk(struct file_disk *fdisk); 91 static TAILQ_HEAD(, file_disk) g_aio_disk_head = TAILQ_HEAD_INITIALIZER(g_aio_disk_head); 92 93 #define SPDK_AIO_QUEUE_DEPTH 128 94 #define MAX_EVENTS_PER_POLL 32 95 96 static int 97 bdev_aio_get_ctx_size(void) 98 { 99 return sizeof(struct bdev_aio_task); 100 } 101 102 static struct spdk_bdev_module aio_if = { 103 .name = "aio", 104 .module_init = bdev_aio_initialize, 105 .module_fini = bdev_aio_fini, 106 .get_ctx_size = bdev_aio_get_ctx_size, 107 }; 108 109 SPDK_BDEV_MODULE_REGISTER(aio, &aio_if) 110 111 static int 112 bdev_aio_open(struct file_disk *disk) 113 { 114 int fd; 115 int io_flag = disk->readonly ? O_RDONLY : O_RDWR; 116 117 fd = open(disk->filename, io_flag | O_DIRECT); 118 if (fd < 0) { 119 /* Try without O_DIRECT for non-disk files */ 120 fd = open(disk->filename, io_flag); 121 if (fd < 0) { 122 SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n", 123 disk->filename, errno, spdk_strerror(errno)); 124 disk->fd = -1; 125 return -1; 126 } 127 } 128 129 disk->fd = fd; 130 131 return 0; 132 } 133 134 static int 135 bdev_aio_close(struct file_disk *disk) 136 { 137 int rc; 138 139 if (disk->fd == -1) { 140 return 0; 141 } 142 143 rc = close(disk->fd); 144 if (rc < 0) { 145 SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n", 146 disk->fd, errno, spdk_strerror(errno)); 147 return -1; 148 } 149 150 disk->fd = -1; 151 152 return 0; 153 } 154 155 #ifdef __FreeBSD__ 156 static int 157 bdev_aio_submit_io(enum spdk_bdev_io_type type, struct file_disk *fdisk, 158 struct spdk_io_channel *ch, struct bdev_aio_task *aio_task, 159 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) 160 { 161 struct aiocb *aiocb = &aio_task->aiocb; 162 struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch); 163 164 memset(aiocb, 0, sizeof(struct aiocb)); 165 aiocb->aio_fildes = fdisk->fd; 166 aiocb->aio_iov = iov; 167 aiocb->aio_iovcnt = iovcnt; 168 aiocb->aio_offset = offset; 169 aiocb->aio_sigevent.sigev_notify_kqueue = aio_ch->kqfd; 170 aiocb->aio_sigevent.sigev_value.sival_ptr = aio_task; 171 aiocb->aio_sigevent.sigev_notify = SIGEV_KEVENT; 172 173 aio_task->len = nbytes; 174 aio_task->ch = aio_ch; 175 176 if (type == SPDK_BDEV_IO_TYPE_READ) { 177 return aio_readv(aiocb); 178 } 179 180 return aio_writev(aiocb); 181 } 182 #else 183 static int 184 bdev_aio_submit_io(enum spdk_bdev_io_type type, struct file_disk *fdisk, 185 struct spdk_io_channel *ch, struct bdev_aio_task *aio_task, 186 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) 187 { 188 struct iocb *iocb = &aio_task->iocb; 189 struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch); 190 191 if (type == SPDK_BDEV_IO_TYPE_READ) { 192 io_prep_preadv(iocb, fdisk->fd, iov, iovcnt, offset); 193 } else { 194 io_prep_pwritev(iocb, fdisk->fd, iov, iovcnt, offset); 195 } 196 197 if (aio_ch->group_ch->efd >= 0) { 198 io_set_eventfd(iocb, aio_ch->group_ch->efd); 199 } 200 iocb->data = aio_task; 201 aio_task->len = nbytes; 202 aio_task->ch = aio_ch; 203 204 return io_submit(aio_ch->io_ctx, 1, &iocb); 205 } 206 #endif 207 208 static void 209 bdev_aio_rw(enum spdk_bdev_io_type type, struct file_disk *fdisk, 210 struct spdk_io_channel *ch, struct bdev_aio_task *aio_task, 211 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) 212 { 213 struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch); 214 int rc; 215 216 if (type == SPDK_BDEV_IO_TYPE_READ) { 217 SPDK_DEBUGLOG(aio, "read %d iovs size %lu to off: %#lx\n", 218 iovcnt, nbytes, offset); 219 } else { 220 SPDK_DEBUGLOG(aio, "write %d iovs size %lu from off: %#lx\n", 221 iovcnt, nbytes, offset); 222 } 223 224 rc = bdev_aio_submit_io(type, fdisk, ch, aio_task, iov, iovcnt, nbytes, offset); 225 if (spdk_unlikely(rc < 0)) { 226 if (rc == -EAGAIN) { 227 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM); 228 } else { 229 spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), rc); 230 SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc); 231 } 232 } else { 233 aio_ch->io_inflight++; 234 } 235 } 236 237 static void 238 bdev_aio_flush(struct file_disk *fdisk, struct bdev_aio_task *aio_task) 239 { 240 int rc = fsync(fdisk->fd); 241 242 if (rc == 0) { 243 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS); 244 } else { 245 spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -errno); 246 } 247 } 248 249 #ifndef __FreeBSD__ 250 static void 251 bdev_aio_fallocate(struct spdk_bdev_io *bdev_io, int mode) 252 { 253 struct file_disk *fdisk = (struct file_disk *)bdev_io->bdev->ctxt; 254 struct bdev_aio_task *aio_task = (struct bdev_aio_task *)bdev_io->driver_ctx; 255 uint64_t offset_bytes = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen; 256 uint64_t length_bytes = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 257 int rc; 258 259 if (!fdisk->fallocate) { 260 spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -ENOTSUP); 261 return; 262 } 263 264 rc = fallocate(fdisk->fd, mode, offset_bytes, length_bytes); 265 if (rc == 0) { 266 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS); 267 } else { 268 spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -errno); 269 } 270 } 271 272 static void 273 bdev_aio_unmap(struct spdk_bdev_io *bdev_io) 274 { 275 int mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE; 276 277 bdev_aio_fallocate(bdev_io, mode); 278 } 279 280 281 static void 282 bdev_aio_write_zeros(struct spdk_bdev_io *bdev_io) 283 { 284 int mode = FALLOC_FL_ZERO_RANGE; 285 286 bdev_aio_fallocate(bdev_io, mode); 287 } 288 #endif 289 290 static void 291 bdev_aio_destruct_cb(void *io_device) 292 { 293 struct file_disk *fdisk = io_device; 294 int rc = 0; 295 296 TAILQ_REMOVE(&g_aio_disk_head, fdisk, link); 297 rc = bdev_aio_close(fdisk); 298 if (rc < 0) { 299 SPDK_ERRLOG("bdev_aio_close() failed\n"); 300 } 301 aio_free_disk(fdisk); 302 } 303 304 static int 305 bdev_aio_destruct(void *ctx) 306 { 307 struct file_disk *fdisk = ctx; 308 309 spdk_io_device_unregister(fdisk, bdev_aio_destruct_cb); 310 311 return 0; 312 } 313 314 #ifdef __FreeBSD__ 315 static int 316 bdev_user_io_getevents(int kq, unsigned int max, struct kevent *events) 317 { 318 struct timespec ts; 319 int count; 320 321 memset(events, 0, max * sizeof(struct kevent)); 322 memset(&ts, 0, sizeof(ts)); 323 324 count = kevent(kq, NULL, 0, events, max, &ts); 325 if (count < 0) { 326 SPDK_ERRLOG("failed to get kevents: %s.\n", spdk_strerror(errno)); 327 return -errno; 328 } 329 330 return count; 331 } 332 333 static int 334 bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch) 335 { 336 int nr, i, res = 0; 337 struct bdev_aio_task *aio_task; 338 struct kevent events[SPDK_AIO_QUEUE_DEPTH]; 339 340 nr = bdev_user_io_getevents(io_ch->kqfd, SPDK_AIO_QUEUE_DEPTH, events); 341 if (nr < 0) { 342 return 0; 343 } 344 345 for (i = 0; i < nr; i++) { 346 aio_task = events[i].udata; 347 aio_task->ch->io_inflight--; 348 if (aio_task == NULL) { 349 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED); 350 break; 351 } else if ((uint64_t)aio_return(&aio_task->aiocb) == aio_task->len) { 352 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS); 353 } else { 354 SPDK_ERRLOG("failed to complete aio: rc %d\n", aio_error(&aio_task->aiocb)); 355 res = aio_error(&aio_task->aiocb); 356 if (res != 0) { 357 spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), res); 358 } else { 359 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED); 360 } 361 } 362 } 363 364 return nr; 365 } 366 #else 367 static int 368 bdev_user_io_getevents(io_context_t io_ctx, unsigned int max, struct io_event *uevents) 369 { 370 uint32_t head, tail, count; 371 struct spdk_aio_ring *ring; 372 struct timespec timeout; 373 struct io_event *kevents; 374 375 ring = (struct spdk_aio_ring *)io_ctx; 376 377 if (spdk_unlikely(ring->version != SPDK_AIO_RING_VERSION || ring->incompat_features != 0)) { 378 timeout.tv_sec = 0; 379 timeout.tv_nsec = 0; 380 381 return io_getevents(io_ctx, 0, max, uevents, &timeout); 382 } 383 384 /* Read the current state out of the ring */ 385 head = ring->head; 386 tail = ring->tail; 387 388 /* This memory barrier is required to prevent the loads above 389 * from being re-ordered with stores to the events array 390 * potentially occurring on other threads. */ 391 spdk_smp_rmb(); 392 393 /* Calculate how many items are in the circular ring */ 394 count = tail - head; 395 if (tail < head) { 396 count += ring->size; 397 } 398 399 /* Reduce the count to the limit provided by the user */ 400 count = spdk_min(max, count); 401 402 /* Grab the memory location of the event array */ 403 kevents = (struct io_event *)((uintptr_t)ring + ring->header_length); 404 405 /* Copy the events out of the ring. */ 406 if ((head + count) <= ring->size) { 407 /* Only one copy is required */ 408 memcpy(uevents, &kevents[head], count * sizeof(struct io_event)); 409 } else { 410 uint32_t first_part = ring->size - head; 411 /* Two copies are required */ 412 memcpy(uevents, &kevents[head], first_part * sizeof(struct io_event)); 413 memcpy(&uevents[first_part], &kevents[0], (count - first_part) * sizeof(struct io_event)); 414 } 415 416 /* Update the head pointer. On x86, stores will not be reordered with older loads, 417 * so the copies out of the event array will always be complete prior to this 418 * update becoming visible. On other architectures this is not guaranteed, so 419 * add a barrier. */ 420 #if defined(__i386__) || defined(__x86_64__) 421 spdk_compiler_barrier(); 422 #else 423 spdk_smp_mb(); 424 #endif 425 ring->head = (head + count) % ring->size; 426 427 return count; 428 } 429 430 static int 431 bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch) 432 { 433 int nr, i, res = 0; 434 struct bdev_aio_task *aio_task; 435 struct io_event events[SPDK_AIO_QUEUE_DEPTH]; 436 437 nr = bdev_user_io_getevents(io_ch->io_ctx, SPDK_AIO_QUEUE_DEPTH, events); 438 if (nr < 0) { 439 return 0; 440 } 441 442 for (i = 0; i < nr; i++) { 443 aio_task = events[i].data; 444 aio_task->ch->io_inflight--; 445 if (events[i].res == aio_task->len) { 446 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS); 447 } else { 448 /* From aio_abi.h, io_event.res is defined __s64, negative errno 449 * will be assigned to io_event.res for error situation. 450 * But from libaio.h, io_event.res is defined unsigned long, so 451 * convert it to signed value for error detection. 452 */ 453 SPDK_ERRLOG("failed to complete aio: rc %"PRId64"\n", events[i].res); 454 res = (int)events[i].res; 455 if (res < 0) { 456 spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), res); 457 } else { 458 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED); 459 } 460 } 461 } 462 463 return nr; 464 } 465 #endif 466 467 static int 468 bdev_aio_group_poll(void *arg) 469 { 470 struct bdev_aio_group_channel *group_ch = arg; 471 struct bdev_aio_io_channel *io_ch; 472 int nr = 0; 473 474 TAILQ_FOREACH(io_ch, &group_ch->io_ch_head, link) { 475 nr += bdev_aio_io_channel_poll(io_ch); 476 } 477 478 return nr > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 479 } 480 481 static int 482 bdev_aio_group_interrupt(void *arg) 483 { 484 struct bdev_aio_group_channel *group_ch = arg; 485 int rc; 486 uint64_t num_events; 487 488 assert(group_ch->efd >= 0); 489 490 /* if completed IO number is larger than SPDK_AIO_QUEUE_DEPTH, 491 * io_getevent should be called again to ensure all completed IO are processed. 492 */ 493 rc = read(group_ch->efd, &num_events, sizeof(num_events)); 494 if (rc < 0) { 495 SPDK_ERRLOG("failed to acknowledge aio group: %s.\n", spdk_strerror(errno)); 496 return -errno; 497 } 498 499 if (num_events > SPDK_AIO_QUEUE_DEPTH) { 500 num_events -= SPDK_AIO_QUEUE_DEPTH; 501 rc = write(group_ch->efd, &num_events, sizeof(num_events)); 502 if (rc < 0) { 503 SPDK_ERRLOG("failed to notify aio group: %s.\n", spdk_strerror(errno)); 504 } 505 } 506 507 return bdev_aio_group_poll(group_ch); 508 } 509 510 static void 511 _bdev_aio_get_io_inflight(struct spdk_io_channel_iter *i) 512 { 513 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 514 struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch); 515 516 if (aio_ch->io_inflight) { 517 spdk_for_each_channel_continue(i, -1); 518 return; 519 } 520 521 spdk_for_each_channel_continue(i, 0); 522 } 523 524 static int bdev_aio_reset_retry_timer(void *arg); 525 526 static void 527 _bdev_aio_get_io_inflight_done(struct spdk_io_channel_iter *i, int status) 528 { 529 struct file_disk *fdisk = spdk_io_channel_iter_get_ctx(i); 530 531 if (status == -1) { 532 fdisk->reset_retry_timer = SPDK_POLLER_REGISTER(bdev_aio_reset_retry_timer, fdisk, 500); 533 return; 534 } 535 536 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(fdisk->reset_task), SPDK_BDEV_IO_STATUS_SUCCESS); 537 } 538 539 static int 540 bdev_aio_reset_retry_timer(void *arg) 541 { 542 struct file_disk *fdisk = arg; 543 544 if (fdisk->reset_retry_timer) { 545 spdk_poller_unregister(&fdisk->reset_retry_timer); 546 } 547 548 spdk_for_each_channel(fdisk, 549 _bdev_aio_get_io_inflight, 550 fdisk, 551 _bdev_aio_get_io_inflight_done); 552 553 return SPDK_POLLER_BUSY; 554 } 555 556 static void 557 bdev_aio_reset(struct file_disk *fdisk, struct bdev_aio_task *aio_task) 558 { 559 fdisk->reset_task = aio_task; 560 561 bdev_aio_reset_retry_timer(fdisk); 562 } 563 564 static void 565 bdev_aio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 566 bool success) 567 { 568 if (!success) { 569 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 570 return; 571 } 572 573 switch (bdev_io->type) { 574 case SPDK_BDEV_IO_TYPE_READ: 575 case SPDK_BDEV_IO_TYPE_WRITE: 576 bdev_aio_rw(bdev_io->type, 577 (struct file_disk *)bdev_io->bdev->ctxt, 578 ch, 579 (struct bdev_aio_task *)bdev_io->driver_ctx, 580 bdev_io->u.bdev.iovs, 581 bdev_io->u.bdev.iovcnt, 582 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, 583 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); 584 break; 585 default: 586 SPDK_ERRLOG("Wrong io type\n"); 587 break; 588 } 589 } 590 591 static int 592 _bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 593 { 594 struct file_disk *fdisk = (struct file_disk *)bdev_io->bdev->ctxt; 595 596 switch (bdev_io->type) { 597 /* Read and write operations must be performed on buffers aligned to 598 * bdev->required_alignment. If user specified unaligned buffers, 599 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */ 600 case SPDK_BDEV_IO_TYPE_READ: 601 spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb, 602 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 603 return 0; 604 case SPDK_BDEV_IO_TYPE_WRITE: 605 if (fdisk->readonly) { 606 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 607 } else { 608 spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb, 609 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 610 } 611 return 0; 612 613 case SPDK_BDEV_IO_TYPE_FLUSH: 614 bdev_aio_flush((struct file_disk *)bdev_io->bdev->ctxt, 615 (struct bdev_aio_task *)bdev_io->driver_ctx); 616 return 0; 617 618 case SPDK_BDEV_IO_TYPE_RESET: 619 bdev_aio_reset((struct file_disk *)bdev_io->bdev->ctxt, 620 (struct bdev_aio_task *)bdev_io->driver_ctx); 621 return 0; 622 623 #ifndef __FreeBSD__ 624 case SPDK_BDEV_IO_TYPE_UNMAP: 625 bdev_aio_unmap(bdev_io); 626 return 0; 627 628 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 629 bdev_aio_write_zeros(bdev_io); 630 return 0; 631 #endif 632 633 default: 634 return -1; 635 } 636 } 637 638 static void 639 bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 640 { 641 if (_bdev_aio_submit_request(ch, bdev_io) < 0) { 642 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 643 } 644 } 645 646 static bool 647 bdev_aio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 648 { 649 struct file_disk *fdisk = ctx; 650 651 switch (io_type) { 652 case SPDK_BDEV_IO_TYPE_READ: 653 case SPDK_BDEV_IO_TYPE_WRITE: 654 case SPDK_BDEV_IO_TYPE_FLUSH: 655 case SPDK_BDEV_IO_TYPE_RESET: 656 return true; 657 658 case SPDK_BDEV_IO_TYPE_UNMAP: 659 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 660 return fdisk->fallocate; 661 662 default: 663 return false; 664 } 665 } 666 667 #ifdef __FreeBSD__ 668 static int 669 bdev_aio_create_io(struct bdev_aio_io_channel *ch) 670 { 671 ch->kqfd = kqueue(); 672 if (ch->kqfd < 0) { 673 SPDK_ERRLOG("async I/O context setup failure: %s.\n", spdk_strerror(errno)); 674 return -1; 675 } 676 677 return 0; 678 } 679 680 static void 681 bdev_aio_destroy_io(struct bdev_aio_io_channel *ch) 682 { 683 close(ch->kqfd); 684 } 685 #else 686 static int 687 bdev_aio_create_io(struct bdev_aio_io_channel *ch) 688 { 689 if (io_setup(SPDK_AIO_QUEUE_DEPTH, &ch->io_ctx) < 0) { 690 SPDK_ERRLOG("Async I/O context setup failure, likely due to exceeding kernel limit.\n"); 691 SPDK_ERRLOG("This limit may be increased using 'sysctl -w fs.aio-max-nr'.\n"); 692 return -1; 693 } 694 695 return 0; 696 } 697 698 static void 699 bdev_aio_destroy_io(struct bdev_aio_io_channel *ch) 700 { 701 io_destroy(ch->io_ctx); 702 } 703 #endif 704 705 static int 706 bdev_aio_create_cb(void *io_device, void *ctx_buf) 707 { 708 struct bdev_aio_io_channel *ch = ctx_buf; 709 int rc; 710 711 rc = bdev_aio_create_io(ch); 712 if (rc < 0) { 713 return rc; 714 } 715 716 ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&aio_if)); 717 TAILQ_INSERT_TAIL(&ch->group_ch->io_ch_head, ch, link); 718 719 return 0; 720 } 721 722 static void 723 bdev_aio_destroy_cb(void *io_device, void *ctx_buf) 724 { 725 struct bdev_aio_io_channel *ch = ctx_buf; 726 727 bdev_aio_destroy_io(ch); 728 729 assert(ch->group_ch); 730 TAILQ_REMOVE(&ch->group_ch->io_ch_head, ch, link); 731 732 spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch)); 733 } 734 735 static struct spdk_io_channel * 736 bdev_aio_get_io_channel(void *ctx) 737 { 738 struct file_disk *fdisk = ctx; 739 740 return spdk_get_io_channel(fdisk); 741 } 742 743 744 static int 745 bdev_aio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 746 { 747 struct file_disk *fdisk = ctx; 748 749 spdk_json_write_named_object_begin(w, "aio"); 750 751 spdk_json_write_named_string(w, "filename", fdisk->filename); 752 753 spdk_json_write_named_bool(w, "block_size_override", fdisk->block_size_override); 754 755 spdk_json_write_named_bool(w, "readonly", fdisk->readonly); 756 757 spdk_json_write_named_bool(w, "fallocate", fdisk->fallocate); 758 759 spdk_json_write_object_end(w); 760 761 return 0; 762 } 763 764 static void 765 bdev_aio_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 766 { 767 struct file_disk *fdisk = bdev->ctxt; 768 const struct spdk_uuid *uuid = spdk_bdev_get_uuid(bdev); 769 770 spdk_json_write_object_begin(w); 771 772 spdk_json_write_named_string(w, "method", "bdev_aio_create"); 773 774 spdk_json_write_named_object_begin(w, "params"); 775 spdk_json_write_named_string(w, "name", bdev->name); 776 if (fdisk->block_size_override) { 777 spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); 778 } 779 spdk_json_write_named_string(w, "filename", fdisk->filename); 780 spdk_json_write_named_bool(w, "readonly", fdisk->readonly); 781 spdk_json_write_named_bool(w, "fallocate", fdisk->fallocate); 782 if (!spdk_uuid_is_null(uuid)) { 783 spdk_json_write_named_uuid(w, "uuid", uuid); 784 } 785 spdk_json_write_object_end(w); 786 787 spdk_json_write_object_end(w); 788 } 789 790 static const struct spdk_bdev_fn_table aio_fn_table = { 791 .destruct = bdev_aio_destruct, 792 .submit_request = bdev_aio_submit_request, 793 .io_type_supported = bdev_aio_io_type_supported, 794 .get_io_channel = bdev_aio_get_io_channel, 795 .dump_info_json = bdev_aio_dump_info_json, 796 .write_config_json = bdev_aio_write_json_config, 797 }; 798 799 static void 800 aio_free_disk(struct file_disk *fdisk) 801 { 802 if (fdisk == NULL) { 803 return; 804 } 805 free(fdisk->filename); 806 free(fdisk->disk.name); 807 free(fdisk); 808 } 809 810 static int 811 bdev_aio_register_interrupt(struct bdev_aio_group_channel *ch) 812 { 813 int efd; 814 815 efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); 816 if (efd < 0) { 817 return -1; 818 } 819 820 ch->intr = SPDK_INTERRUPT_REGISTER(efd, bdev_aio_group_interrupt, ch); 821 if (ch->intr == NULL) { 822 close(efd); 823 return -1; 824 } 825 ch->efd = efd; 826 827 return 0; 828 } 829 830 static void 831 bdev_aio_unregister_interrupt(struct bdev_aio_group_channel *ch) 832 { 833 spdk_interrupt_unregister(&ch->intr); 834 close(ch->efd); 835 ch->efd = -1; 836 } 837 838 static int 839 bdev_aio_group_create_cb(void *io_device, void *ctx_buf) 840 { 841 struct bdev_aio_group_channel *ch = ctx_buf; 842 int rc; 843 844 TAILQ_INIT(&ch->io_ch_head); 845 /* Initialize ch->efd to be invalid and unused. */ 846 ch->efd = -1; 847 if (spdk_interrupt_mode_is_enabled()) { 848 rc = bdev_aio_register_interrupt(ch); 849 if (rc < 0) { 850 SPDK_ERRLOG("Failed to prepare intr resource to bdev_aio\n"); 851 return rc; 852 } 853 } 854 855 ch->poller = SPDK_POLLER_REGISTER(bdev_aio_group_poll, ch, 0); 856 spdk_poller_register_interrupt(ch->poller, NULL, NULL); 857 858 return 0; 859 } 860 861 static void 862 bdev_aio_group_destroy_cb(void *io_device, void *ctx_buf) 863 { 864 struct bdev_aio_group_channel *ch = ctx_buf; 865 866 if (!TAILQ_EMPTY(&ch->io_ch_head)) { 867 SPDK_ERRLOG("Group channel of bdev aio has uncleared io channel\n"); 868 } 869 870 spdk_poller_unregister(&ch->poller); 871 if (spdk_interrupt_mode_is_enabled()) { 872 bdev_aio_unregister_interrupt(ch); 873 } 874 } 875 876 int 877 create_aio_bdev(const char *name, const char *filename, uint32_t block_size, bool readonly, 878 bool fallocate, const struct spdk_uuid *uuid) 879 { 880 struct file_disk *fdisk; 881 uint32_t detected_block_size; 882 uint64_t disk_size; 883 int rc; 884 885 #ifdef __FreeBSD__ 886 if (fallocate) { 887 SPDK_ERRLOG("Unable to support fallocate on this platform\n"); 888 return -ENOTSUP; 889 } 890 #endif 891 892 fdisk = calloc(1, sizeof(*fdisk)); 893 if (!fdisk) { 894 SPDK_ERRLOG("Unable to allocate enough memory for aio backend\n"); 895 return -ENOMEM; 896 } 897 fdisk->readonly = readonly; 898 fdisk->fallocate = fallocate; 899 900 fdisk->filename = strdup(filename); 901 if (!fdisk->filename) { 902 rc = -ENOMEM; 903 goto error_return; 904 } 905 906 if (bdev_aio_open(fdisk)) { 907 SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, fdisk->fd, errno); 908 rc = -errno; 909 goto error_return; 910 } 911 912 disk_size = spdk_fd_get_size(fdisk->fd); 913 914 fdisk->disk.name = strdup(name); 915 if (!fdisk->disk.name) { 916 rc = -ENOMEM; 917 goto error_return; 918 } 919 fdisk->disk.product_name = "AIO disk"; 920 fdisk->disk.module = &aio_if; 921 922 fdisk->disk.write_cache = 1; 923 924 detected_block_size = spdk_fd_get_blocklen(fdisk->fd); 925 if (block_size == 0) { 926 /* User did not specify block size - use autodetected block size. */ 927 if (detected_block_size == 0) { 928 SPDK_ERRLOG("Block size could not be auto-detected\n"); 929 rc = -EINVAL; 930 goto error_return; 931 } 932 fdisk->block_size_override = false; 933 block_size = detected_block_size; 934 } else { 935 if (block_size < detected_block_size) { 936 SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than " 937 "auto-detected block size %" PRIu32 "\n", 938 block_size, detected_block_size); 939 rc = -EINVAL; 940 goto error_return; 941 } else if (detected_block_size != 0 && block_size != detected_block_size) { 942 SPDK_WARNLOG("Specified block size %" PRIu32 " does not match " 943 "auto-detected block size %" PRIu32 "\n", 944 block_size, detected_block_size); 945 } 946 fdisk->block_size_override = true; 947 } 948 949 if (block_size < 512) { 950 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size); 951 rc = -EINVAL; 952 goto error_return; 953 } 954 955 if (!spdk_u32_is_pow2(block_size)) { 956 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size); 957 rc = -EINVAL; 958 goto error_return; 959 } 960 961 fdisk->disk.blocklen = block_size; 962 if (fdisk->block_size_override && detected_block_size) { 963 fdisk->disk.required_alignment = spdk_u32log2(detected_block_size); 964 } else { 965 fdisk->disk.required_alignment = spdk_u32log2(block_size); 966 } 967 968 if (disk_size % fdisk->disk.blocklen != 0) { 969 SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", 970 disk_size, fdisk->disk.blocklen); 971 rc = -EINVAL; 972 goto error_return; 973 } 974 975 fdisk->disk.blockcnt = disk_size / fdisk->disk.blocklen; 976 fdisk->disk.ctxt = fdisk; 977 spdk_uuid_copy(&fdisk->disk.uuid, uuid); 978 979 fdisk->disk.fn_table = &aio_fn_table; 980 981 spdk_io_device_register(fdisk, bdev_aio_create_cb, bdev_aio_destroy_cb, 982 sizeof(struct bdev_aio_io_channel), 983 fdisk->disk.name); 984 rc = spdk_bdev_register(&fdisk->disk); 985 if (rc) { 986 spdk_io_device_unregister(fdisk, NULL); 987 goto error_return; 988 } 989 990 TAILQ_INSERT_TAIL(&g_aio_disk_head, fdisk, link); 991 return 0; 992 993 error_return: 994 bdev_aio_close(fdisk); 995 aio_free_disk(fdisk); 996 return rc; 997 } 998 999 static void 1000 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 1001 { 1002 } 1003 1004 int 1005 bdev_aio_rescan(const char *name) 1006 { 1007 struct spdk_bdev_desc *desc; 1008 struct spdk_bdev *bdev; 1009 struct file_disk *fdisk; 1010 uint64_t disk_size, blockcnt; 1011 int rc; 1012 1013 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &desc); 1014 if (rc != 0) { 1015 return rc; 1016 } 1017 1018 bdev = spdk_bdev_desc_get_bdev(desc); 1019 if (bdev->module != &aio_if) { 1020 rc = -ENODEV; 1021 goto exit; 1022 } 1023 1024 fdisk = SPDK_CONTAINEROF(bdev, struct file_disk, disk); 1025 disk_size = spdk_fd_get_size(fdisk->fd); 1026 blockcnt = disk_size / bdev->blocklen; 1027 1028 if (bdev->blockcnt != blockcnt) { 1029 SPDK_NOTICELOG("AIO device is resized: bdev name %s, old block count %" PRIu64 ", new block count %" 1030 PRIu64 "\n", 1031 fdisk->filename, 1032 bdev->blockcnt, 1033 blockcnt); 1034 rc = spdk_bdev_notify_blockcnt_change(bdev, blockcnt); 1035 if (rc != 0) { 1036 SPDK_ERRLOG("Could not change num blocks for aio bdev: name %s, errno: %d.\n", 1037 fdisk->filename, rc); 1038 goto exit; 1039 } 1040 } 1041 1042 exit: 1043 spdk_bdev_close(desc); 1044 return rc; 1045 } 1046 1047 struct delete_aio_bdev_ctx { 1048 delete_aio_bdev_complete cb_fn; 1049 void *cb_arg; 1050 }; 1051 1052 static void 1053 aio_bdev_unregister_cb(void *arg, int bdeverrno) 1054 { 1055 struct delete_aio_bdev_ctx *ctx = arg; 1056 1057 ctx->cb_fn(ctx->cb_arg, bdeverrno); 1058 free(ctx); 1059 } 1060 1061 void 1062 bdev_aio_delete(const char *name, delete_aio_bdev_complete cb_fn, void *cb_arg) 1063 { 1064 struct delete_aio_bdev_ctx *ctx; 1065 int rc; 1066 1067 ctx = calloc(1, sizeof(*ctx)); 1068 if (ctx == NULL) { 1069 cb_fn(cb_arg, -ENOMEM); 1070 return; 1071 } 1072 1073 ctx->cb_fn = cb_fn; 1074 ctx->cb_arg = cb_arg; 1075 rc = spdk_bdev_unregister_by_name(name, &aio_if, aio_bdev_unregister_cb, ctx); 1076 if (rc != 0) { 1077 aio_bdev_unregister_cb(ctx, rc); 1078 } 1079 } 1080 1081 static int 1082 bdev_aio_initialize(void) 1083 { 1084 spdk_io_device_register(&aio_if, bdev_aio_group_create_cb, bdev_aio_group_destroy_cb, 1085 sizeof(struct bdev_aio_group_channel), "aio_module"); 1086 1087 return 0; 1088 } 1089 1090 static void 1091 bdev_aio_fini(void) 1092 { 1093 spdk_io_device_unregister(&aio_if, NULL); 1094 } 1095 1096 SPDK_LOG_REGISTER_COMPONENT(aio) 1097