xref: /spdk/module/bdev/aio/bdev_aio.c (revision 2f5c602574a98ede645991abe279a96e19c50196)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "bdev_aio.h"
35 
36 #include "spdk/stdinc.h"
37 
38 #include "spdk/barrier.h"
39 #include "spdk/bdev.h"
40 #include "spdk/bdev_module.h"
41 #include "spdk/env.h"
42 #include "spdk/fd.h"
43 #include "spdk/likely.h"
44 #include "spdk/thread.h"
45 #include "spdk/json.h"
46 #include "spdk/util.h"
47 #include "spdk/string.h"
48 
49 #include "spdk/log.h"
50 
51 #include <sys/eventfd.h>
52 #include <libaio.h>
53 
/* Per-thread I/O channel for one aio bdev: owns a kernel AIO context
 * and tracks how many I/Os have been submitted but not yet reaped.
 */
struct bdev_aio_io_channel {
	uint64_t				io_inflight;	/* submitted to io_ctx, completion not yet seen */
	io_context_t				io_ctx;		/* libaio context for io_submit()/getevents */
	struct bdev_aio_group_channel		*group_ch;	/* group channel whose poller reaps this channel */
	TAILQ_ENTRY(bdev_aio_io_channel)	link;		/* entry in group_ch->io_ch_head */
};
60 
/* Per-thread group channel shared by all aio io channels on that thread.
 * Owns the completion poller and, in interrupt mode, the eventfd used
 * by the kernel to signal AIO completions.
 */
struct bdev_aio_group_channel {
	/* eventfd for io completion notification in interrupt mode.
	 * Negative value like '-1' indicates it is invalid or unused.
	 */
	int					efd;
	struct spdk_interrupt			*intr;		/* interrupt registered on efd (interrupt mode only) */
	struct spdk_poller			*poller;	/* polls every channel in io_ch_head */
	TAILQ_HEAD(, bdev_aio_io_channel)	io_ch_head;	/* io channels serviced by this group */
};
70 
/* Per-I/O driver context, embedded in each spdk_bdev_io (see
 * bdev_aio_get_ctx_size()). The iocb must be the first member so it can
 * be recovered from io_event.data/obj during completion reaping.
 */
struct bdev_aio_task {
	struct iocb			iocb;	/* libaio control block submitted to the kernel */
	uint64_t			len;	/* expected transfer size, compared with io_event.res */
	struct bdev_aio_io_channel	*ch;	/* channel the iocb was submitted on */
};
76 
/* One aio-backed bdev: a file (or block device) exposed as an SPDK bdev. */
struct file_disk {
	struct bdev_aio_task	*reset_task;		/* in-flight reset waiting for channels to drain */
	struct spdk_poller	*reset_retry_timer;	/* retry timer while inflight I/O remains */
	struct spdk_bdev	disk;			/* the registered bdev; disk.ctxt points back here */
	char			*filename;		/* heap copy of backing file path (owned) */
	int			fd;			/* open fd on filename, or -1 when closed */
	TAILQ_ENTRY(file_disk)  link;			/* entry in g_aio_disk_head */
	bool			block_size_override;	/* true if user supplied an explicit block size */
};
86 
/* For user space reaping of completions.
 * Mirrors the kernel's internal completion ring header that io_setup()
 * maps into user space (the io_context_t is really a pointer to it),
 * letting us copy completed events out without a syscall.
 */
struct spdk_aio_ring {
	uint32_t id;
	uint32_t size;			/* number of io_event slots in the ring */
	uint32_t head;			/* consumer index (advanced by us) */
	uint32_t tail;			/* producer index (advanced by the kernel) */

	uint32_t version;		/* must match SPDK_AIO_RING_VERSION for user reaping */
	uint32_t compat_features;
	uint32_t incompat_features;	/* nonzero means fall back to io_getevents() */
	uint32_t header_length;		/* offset from ring base to the event array */
};
99 
/* Magic value the kernel stores in spdk_aio_ring.version (AIO_RING_MAGIC). */
#define SPDK_AIO_RING_VERSION	0xa10a10a1

static int bdev_aio_initialize(void);
static void bdev_aio_fini(void);
static void aio_free_disk(struct file_disk *fdisk);
/* All registered aio bdevs, linked via file_disk.link. */
static TAILQ_HEAD(, file_disk) g_aio_disk_head = TAILQ_HEAD_INITIALIZER(g_aio_disk_head);

/* Events requested per io_setup() context and reaped per poll. */
#define SPDK_AIO_QUEUE_DEPTH 128
#define MAX_EVENTS_PER_POLL 32
109 
110 static int
111 bdev_aio_get_ctx_size(void)
112 {
113 	return sizeof(struct bdev_aio_task);
114 }
115 
/* Module descriptor: hooks this backend into the SPDK bdev layer. */
static struct spdk_bdev_module aio_if = {
	.name		= "aio",
	.module_init	= bdev_aio_initialize,
	.module_fini	= bdev_aio_fini,
	.get_ctx_size	= bdev_aio_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(aio, &aio_if)
124 
125 static int
126 bdev_aio_open(struct file_disk *disk)
127 {
128 	int fd;
129 
130 	fd = open(disk->filename, O_RDWR | O_DIRECT);
131 	if (fd < 0) {
132 		/* Try without O_DIRECT for non-disk files */
133 		fd = open(disk->filename, O_RDWR);
134 		if (fd < 0) {
135 			SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
136 				    disk->filename, errno, spdk_strerror(errno));
137 			disk->fd = -1;
138 			return -1;
139 		}
140 	}
141 
142 	disk->fd = fd;
143 
144 	return 0;
145 }
146 
147 static int
148 bdev_aio_close(struct file_disk *disk)
149 {
150 	int rc;
151 
152 	if (disk->fd == -1) {
153 		return 0;
154 	}
155 
156 	rc = close(disk->fd);
157 	if (rc < 0) {
158 		SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
159 			    disk->fd, errno, spdk_strerror(errno));
160 		return -1;
161 	}
162 
163 	disk->fd = -1;
164 
165 	return 0;
166 }
167 
/* Submit a vectored read as one libaio iocb on the channel's io_ctx.
 * Returns nbytes on successful submission, or -1 if io_submit() failed,
 * in which case the bdev_io has already been completed (NOMEM for
 * -EAGAIN, otherwise the negative errno).
 */
static int64_t
bdev_aio_readv(struct file_disk *fdisk, struct spdk_io_channel *ch,
	       struct bdev_aio_task *aio_task,
	       struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
{
	struct iocb *iocb = &aio_task->iocb;
	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
	int rc;

	io_prep_preadv(iocb, fdisk->fd, iov, iovcnt, offset);
	if (aio_ch->group_ch->efd >= 0) {
		/* Interrupt mode: kernel signals completion on the group eventfd. */
		io_set_eventfd(iocb, aio_ch->group_ch->efd);
	}
	iocb->data = aio_task;
	/* Expected transfer size; the poller compares io_event.res against it. */
	aio_task->len = nbytes;
	aio_task->ch = aio_ch;

	SPDK_DEBUGLOG(aio, "read %d iovs size %lu to off: %#lx\n",
		      iovcnt, nbytes, offset);

	rc = io_submit(aio_ch->io_ctx, 1, &iocb);
	if (rc < 0) {
		if (rc == -EAGAIN) {
			/* Queue full: NOMEM makes the bdev layer retry later. */
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), rc);
			SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
		}
		return -1;
	}
	aio_ch->io_inflight++;
	return nbytes;
}
201 
/* Submit a vectored write as one libaio iocb on the channel's io_ctx.
 * Returns len on successful submission, or -1 if io_submit() failed,
 * in which case the bdev_io has already been completed (NOMEM for
 * -EAGAIN, otherwise the negative errno).
 */
static int64_t
bdev_aio_writev(struct file_disk *fdisk, struct spdk_io_channel *ch,
		struct bdev_aio_task *aio_task,
		struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
{
	struct iocb *iocb = &aio_task->iocb;
	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
	int rc;

	io_prep_pwritev(iocb, fdisk->fd, iov, iovcnt, offset);
	if (aio_ch->group_ch->efd >= 0) {
		/* Interrupt mode: kernel signals completion on the group eventfd. */
		io_set_eventfd(iocb, aio_ch->group_ch->efd);
	}
	iocb->data = aio_task;
	/* Expected transfer size; the poller compares io_event.res against it. */
	aio_task->len = len;
	aio_task->ch = aio_ch;

	SPDK_DEBUGLOG(aio, "write %d iovs size %lu from off: %#lx\n",
		      iovcnt, len, offset);

	rc = io_submit(aio_ch->io_ctx, 1, &iocb);
	if (rc < 0) {
		if (rc == -EAGAIN) {
			/* Queue full: NOMEM makes the bdev layer retry later. */
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), rc);
			SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
		}
		return -1;
	}
	aio_ch->io_inflight++;
	return len;
}
235 
236 static void
237 bdev_aio_flush(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
238 {
239 	int rc = fsync(fdisk->fd);
240 
241 	if (rc == 0) {
242 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
243 	} else {
244 		spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -errno);
245 	}
246 }
247 
248 static void
249 bdev_aio_destruct_cb(void *io_device)
250 {
251 	struct file_disk *fdisk = io_device;
252 	int rc = 0;
253 
254 	TAILQ_REMOVE(&g_aio_disk_head, fdisk, link);
255 	rc = bdev_aio_close(fdisk);
256 	if (rc < 0) {
257 		SPDK_ERRLOG("bdev_aio_close() failed\n");
258 	}
259 
260 	aio_free_disk(fdisk);
261 }
262 
263 static int
264 bdev_aio_destruct(void *ctx)
265 {
266 	struct file_disk *fdisk = ctx;
267 
268 	spdk_io_device_unregister(fdisk, bdev_aio_destruct_cb);
269 
270 	return 0;
271 }
272 
/* Reap up to 'max' completed events directly from the kernel's AIO
 * completion ring in user space, avoiding an io_getevents() syscall.
 * Falls back to a non-blocking io_getevents() when the mapped ring is
 * not the expected layout. Returns the number of events copied out.
 */
static int
bdev_user_io_getevents(io_context_t io_ctx, unsigned int max, struct io_event *uevents)
{
	uint32_t head, tail, count;
	struct spdk_aio_ring *ring;
	struct timespec timeout;
	struct io_event *kevents;

	/* The io_context_t handle is actually a pointer to the kernel's
	 * mapped ring header. */
	ring = (struct spdk_aio_ring *)io_ctx;

	if (spdk_unlikely(ring->version != SPDK_AIO_RING_VERSION || ring->incompat_features != 0)) {
		/* Unknown ring layout: use the syscall with a zero timeout
		 * so we never block. */
		timeout.tv_sec = 0;
		timeout.tv_nsec = 0;

		return io_getevents(io_ctx, 0, max, uevents, &timeout);
	}

	/* Read the current state out of the ring */
	head = ring->head;
	tail = ring->tail;

	/* This memory barrier is required to prevent the loads above
	 * from being re-ordered with stores to the events array
	 * potentially occurring on other threads. */
	spdk_smp_rmb();

	/* Calculate how many items are in the circular ring */
	count = tail - head;
	if (tail < head) {
		count += ring->size;
	}

	/* Reduce the count to the limit provided by the user */
	count = spdk_min(max, count);

	/* Grab the memory location of the event array */
	kevents = (struct io_event *)((uintptr_t)ring + ring->header_length);

	/* Copy the events out of the ring. */
	if ((head + count) <= ring->size) {
		/* Only one copy is required */
		memcpy(uevents, &kevents[head], count * sizeof(struct io_event));
	} else {
		uint32_t first_part = ring->size - head;
		/* Two copies are required */
		memcpy(uevents, &kevents[head], first_part * sizeof(struct io_event));
		memcpy(&uevents[first_part], &kevents[0], (count - first_part) * sizeof(struct io_event));
	}

	/* Update the head pointer. On x86, stores will not be reordered with older loads,
	 * so the copies out of the event array will always be complete prior to this
	 * update becoming visible. On other architectures this is not guaranteed, so
	 * add a barrier. */
#if defined(__i386__) || defined(__x86_64__)
	spdk_compiler_barrier();
#else
	spdk_smp_mb();
#endif
	ring->head = (head + count) % ring->size;

	return count;
}
335 
/* Reap completions for one io channel and complete the corresponding
 * bdev_ios. Returns the number of events processed (0 on reap error).
 */
static int
bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch)
{
	int nr, i = 0;
	struct bdev_aio_task *aio_task;
	struct io_event events[SPDK_AIO_QUEUE_DEPTH];
	uint64_t io_result;

	nr = bdev_user_io_getevents(io_ch->io_ctx, SPDK_AIO_QUEUE_DEPTH, events);

	if (nr < 0) {
		return 0;
	}

/* io_event.res is either the byte count transferred or a (positive)
 * errno value; values below this bound are treated as errno codes,
 * since no valid transfer can be that small without matching len. */
#define MAX_AIO_ERRNO 256
	for (i = 0; i < nr; i++) {
		aio_task = events[i].data;
		aio_task->ch->io_inflight--;
		io_result = events[i].res;
		if (io_result == aio_task->len) {
			/* Full transfer: success. */
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
		} else if (io_result < MAX_AIO_ERRNO) {
			/* Linux AIO will return its errno to io_event.res */
			int aio_errno = io_result;

			spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -aio_errno);
		} else {
			/* Short transfer (neither full length nor an errno): fail. */
			SPDK_ERRLOG("failed to complete aio: requested len is %lu, but completed len is %lu.\n",
				    aio_task->len, io_result);
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
		}
	}

	return nr;
}
371 
372 static int
373 bdev_aio_group_poll(void *arg)
374 {
375 	struct bdev_aio_group_channel *group_ch = arg;
376 	struct bdev_aio_io_channel *io_ch;
377 	int nr = 0;
378 
379 	TAILQ_FOREACH(io_ch, &group_ch->io_ch_head, link) {
380 		nr += bdev_aio_io_channel_poll(io_ch);
381 	}
382 
383 	return nr > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
384 }
385 
/* Interrupt-mode handler: acknowledge the eventfd, then reap. Runs when
 * the kernel signals AIO completions on group_ch->efd.
 */
static int
bdev_aio_group_interrupt(void *arg)
{
	struct bdev_aio_group_channel *group_ch = arg;
	int rc;
	uint64_t num_events;

	assert(group_ch->efd >= 0);

	/* if completed IO number is larger than SPDK_AIO_QUEUE_DEPTH,
	 * io_getevent should be called again to ensure all completed IO are processed.
	 */
	rc = read(group_ch->efd, &num_events, sizeof(num_events));
	if (rc < 0) {
		SPDK_ERRLOG("failed to acknowledge aio group: %s.\n", spdk_strerror(errno));
		return -errno;
	}

	if (num_events > SPDK_AIO_QUEUE_DEPTH) {
		/* More completions than one poll can reap: write the remainder
		 * back so the eventfd stays signaled and we get called again. */
		num_events -= SPDK_AIO_QUEUE_DEPTH;
		rc = write(group_ch->efd, &num_events, sizeof(num_events));
		if (rc < 0) {
			SPDK_ERRLOG("failed to notify aio group: %s.\n", spdk_strerror(errno));
		}
	}

	return bdev_aio_group_poll(group_ch);
}
414 
415 static void
416 _bdev_aio_get_io_inflight(struct spdk_io_channel_iter *i)
417 {
418 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
419 	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
420 
421 	if (aio_ch->io_inflight) {
422 		spdk_for_each_channel_continue(i, -1);
423 		return;
424 	}
425 
426 	spdk_for_each_channel_continue(i, 0);
427 }
428 
429 static int bdev_aio_reset_retry_timer(void *arg);
430 
431 static void
432 _bdev_aio_get_io_inflight_done(struct spdk_io_channel_iter *i, int status)
433 {
434 	struct file_disk *fdisk = spdk_io_channel_iter_get_ctx(i);
435 
436 	if (status == -1) {
437 		fdisk->reset_retry_timer = SPDK_POLLER_REGISTER(bdev_aio_reset_retry_timer, fdisk, 500);
438 		return;
439 	}
440 
441 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(fdisk->reset_task), SPDK_BDEV_IO_STATUS_SUCCESS);
442 }
443 
444 static int
445 bdev_aio_reset_retry_timer(void *arg)
446 {
447 	struct file_disk *fdisk = arg;
448 
449 	if (fdisk->reset_retry_timer) {
450 		spdk_poller_unregister(&fdisk->reset_retry_timer);
451 	}
452 
453 	spdk_for_each_channel(fdisk,
454 			      _bdev_aio_get_io_inflight,
455 			      fdisk,
456 			      _bdev_aio_get_io_inflight_done);
457 
458 	return SPDK_POLLER_BUSY;
459 }
460 
461 static void
462 bdev_aio_reset(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
463 {
464 	fdisk->reset_task = aio_task;
465 
466 	bdev_aio_reset_retry_timer(fdisk);
467 }
468 
469 static void
470 bdev_aio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
471 		    bool success)
472 {
473 	if (!success) {
474 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
475 		return;
476 	}
477 
478 	switch (bdev_io->type) {
479 	case SPDK_BDEV_IO_TYPE_READ:
480 		bdev_aio_readv((struct file_disk *)bdev_io->bdev->ctxt,
481 			       ch,
482 			       (struct bdev_aio_task *)bdev_io->driver_ctx,
483 			       bdev_io->u.bdev.iovs,
484 			       bdev_io->u.bdev.iovcnt,
485 			       bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
486 			       bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
487 		break;
488 	case SPDK_BDEV_IO_TYPE_WRITE:
489 		bdev_aio_writev((struct file_disk *)bdev_io->bdev->ctxt,
490 				ch,
491 				(struct bdev_aio_task *)bdev_io->driver_ctx,
492 				bdev_io->u.bdev.iovs,
493 				bdev_io->u.bdev.iovcnt,
494 				bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
495 				bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
496 		break;
497 	default:
498 		SPDK_ERRLOG("Wrong io type\n");
499 		break;
500 	}
501 }
502 
503 static int _bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
504 {
505 	switch (bdev_io->type) {
506 	/* Read and write operations must be performed on buffers aligned to
507 	 * bdev->required_alignment. If user specified unaligned buffers,
508 	 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
509 	case SPDK_BDEV_IO_TYPE_READ:
510 	case SPDK_BDEV_IO_TYPE_WRITE:
511 		spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb,
512 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
513 		return 0;
514 	case SPDK_BDEV_IO_TYPE_FLUSH:
515 		bdev_aio_flush((struct file_disk *)bdev_io->bdev->ctxt,
516 			       (struct bdev_aio_task *)bdev_io->driver_ctx);
517 		return 0;
518 
519 	case SPDK_BDEV_IO_TYPE_RESET:
520 		bdev_aio_reset((struct file_disk *)bdev_io->bdev->ctxt,
521 			       (struct bdev_aio_task *)bdev_io->driver_ctx);
522 		return 0;
523 	default:
524 		return -1;
525 	}
526 }
527 
528 static void bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
529 {
530 	if (_bdev_aio_submit_request(ch, bdev_io) < 0) {
531 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
532 	}
533 }
534 
535 static bool
536 bdev_aio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
537 {
538 	switch (io_type) {
539 	case SPDK_BDEV_IO_TYPE_READ:
540 	case SPDK_BDEV_IO_TYPE_WRITE:
541 	case SPDK_BDEV_IO_TYPE_FLUSH:
542 	case SPDK_BDEV_IO_TYPE_RESET:
543 		return true;
544 
545 	default:
546 		return false;
547 	}
548 }
549 
550 static int
551 bdev_aio_create_cb(void *io_device, void *ctx_buf)
552 {
553 	struct bdev_aio_io_channel *ch = ctx_buf;
554 
555 	if (io_setup(SPDK_AIO_QUEUE_DEPTH, &ch->io_ctx) < 0) {
556 		SPDK_ERRLOG("async I/O context setup failure\n");
557 		return -1;
558 	}
559 
560 	ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&aio_if));
561 	TAILQ_INSERT_TAIL(&ch->group_ch->io_ch_head, ch, link);
562 
563 	return 0;
564 }
565 
566 static void
567 bdev_aio_destroy_cb(void *io_device, void *ctx_buf)
568 {
569 	struct bdev_aio_io_channel *ch = ctx_buf;
570 
571 	io_destroy(ch->io_ctx);
572 
573 	assert(ch->group_ch);
574 	TAILQ_REMOVE(&ch->group_ch->io_ch_head, ch, link);
575 
576 	spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
577 }
578 
/* bdev hook: hand out a per-thread io channel for this disk
 * (the file_disk itself is the io_device key).
 */
static struct spdk_io_channel *
bdev_aio_get_io_channel(void *ctx)
{
	return spdk_get_io_channel((struct file_disk *)ctx);
}
586 
587 
588 static int
589 bdev_aio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
590 {
591 	struct file_disk *fdisk = ctx;
592 
593 	spdk_json_write_named_object_begin(w, "aio");
594 
595 	spdk_json_write_named_string(w, "filename", fdisk->filename);
596 
597 	spdk_json_write_object_end(w);
598 
599 	return 0;
600 }
601 
602 static void
603 bdev_aio_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
604 {
605 	struct file_disk *fdisk = bdev->ctxt;
606 
607 	spdk_json_write_object_begin(w);
608 
609 	spdk_json_write_named_string(w, "method", "bdev_aio_create");
610 
611 	spdk_json_write_named_object_begin(w, "params");
612 	spdk_json_write_named_string(w, "name", bdev->name);
613 	if (fdisk->block_size_override) {
614 		spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
615 	}
616 	spdk_json_write_named_string(w, "filename", fdisk->filename);
617 	spdk_json_write_object_end(w);
618 
619 	spdk_json_write_object_end(w);
620 }
621 
/* bdev function table wiring this backend's hooks into the bdev layer. */
static const struct spdk_bdev_fn_table aio_fn_table = {
	.destruct		= bdev_aio_destruct,
	.submit_request		= bdev_aio_submit_request,
	.io_type_supported	= bdev_aio_io_type_supported,
	.get_io_channel		= bdev_aio_get_io_channel,
	.dump_info_json		= bdev_aio_dump_info_json,
	.write_config_json	= bdev_aio_write_json_config,
};
630 
631 static void aio_free_disk(struct file_disk *fdisk)
632 {
633 	if (fdisk == NULL) {
634 		return;
635 	}
636 	free(fdisk->filename);
637 	free(fdisk->disk.name);
638 	free(fdisk);
639 }
640 
641 static int
642 bdev_aio_register_interrupt(struct bdev_aio_group_channel *ch)
643 {
644 	int efd;
645 
646 	efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
647 	if (efd < 0) {
648 		return -1;
649 	}
650 
651 	ch->intr = SPDK_INTERRUPT_REGISTER(efd, bdev_aio_group_interrupt, ch);
652 	if (ch->intr == NULL) {
653 		close(efd);
654 		return -1;
655 	}
656 	ch->efd = efd;
657 
658 	return 0;
659 }
660 
661 static void
662 bdev_aio_unregister_interrupt(struct bdev_aio_group_channel *ch)
663 {
664 	spdk_interrupt_unregister(&ch->intr);
665 	close(ch->efd);
666 	ch->efd = -1;
667 }
668 
669 static void
670 bdev_aio_poller_set_interrupt_mode(struct spdk_poller *poller, void *cb_arg, bool interrupt_mode)
671 {
672 	return;
673 }
674 
675 static int
676 bdev_aio_group_create_cb(void *io_device, void *ctx_buf)
677 {
678 	struct bdev_aio_group_channel *ch = ctx_buf;
679 	int rc;
680 
681 	TAILQ_INIT(&ch->io_ch_head);
682 	/* Initialize ch->efd to be invalid and unused. */
683 	ch->efd = -1;
684 	if (spdk_interrupt_mode_is_enabled()) {
685 		rc = bdev_aio_register_interrupt(ch);
686 		if (rc < 0) {
687 			SPDK_ERRLOG("Failed to prepare intr resource to bdev_aio\n");
688 			return rc;
689 		}
690 	}
691 
692 	ch->poller = SPDK_POLLER_REGISTER(bdev_aio_group_poll, ch, 0);
693 	spdk_poller_register_interrupt(ch->poller, bdev_aio_poller_set_interrupt_mode, NULL);
694 
695 	return 0;
696 }
697 
698 static void
699 bdev_aio_group_destroy_cb(void *io_device, void *ctx_buf)
700 {
701 	struct bdev_aio_group_channel *ch = ctx_buf;
702 
703 	if (!TAILQ_EMPTY(&ch->io_ch_head)) {
704 		SPDK_ERRLOG("Group channel of bdev aio has uncleared io channel\n");
705 	}
706 
707 	spdk_poller_unregister(&ch->poller);
708 	if (spdk_interrupt_mode_is_enabled()) {
709 		bdev_aio_unregister_interrupt(ch);
710 	}
711 }
712 
/* Create and register an aio bdev backed by 'filename'.
 *
 * name        - bdev name (copied).
 * filename    - backing file or block device path (copied).
 * block_size  - logical block size; 0 means autodetect from the fd.
 *
 * Returns 0 on success or a negative errno; on failure all partially
 * acquired resources (fd, strings, struct) are released.
 */
int
create_aio_bdev(const char *name, const char *filename, uint32_t block_size)
{
	struct file_disk *fdisk;
	uint32_t detected_block_size;
	uint64_t disk_size;
	int rc;

	fdisk = calloc(1, sizeof(*fdisk));
	if (!fdisk) {
		SPDK_ERRLOG("Unable to allocate enough memory for aio backend\n");
		return -ENOMEM;
	}

	fdisk->filename = strdup(filename);
	if (!fdisk->filename) {
		rc = -ENOMEM;
		goto error_return;
	}

	if (bdev_aio_open(fdisk)) {
		/* NOTE(review): errno may have been clobbered by the error log
		 * inside bdev_aio_open(); rc can be inaccurate here. */
		SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, fdisk->fd, errno);
		rc = -errno;
		goto error_return;
	}

	disk_size = spdk_fd_get_size(fdisk->fd);

	fdisk->disk.name = strdup(name);
	if (!fdisk->disk.name) {
		rc = -ENOMEM;
		goto error_return;
	}
	fdisk->disk.product_name = "AIO disk";
	fdisk->disk.module = &aio_if;

	fdisk->disk.write_cache = 1;

	detected_block_size = spdk_fd_get_blocklen(fdisk->fd);
	if (block_size == 0) {
		/* User did not specify block size - use autodetected block size. */
		if (detected_block_size == 0) {
			SPDK_ERRLOG("Block size could not be auto-detected\n");
			rc = -EINVAL;
			goto error_return;
		}
		fdisk->block_size_override = false;
		block_size = detected_block_size;
	} else {
		/* A block size smaller than the device's native size cannot work;
		 * a larger (mismatched) one is allowed but warned about. */
		if (block_size < detected_block_size) {
			SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
				    "auto-detected block size %" PRIu32 "\n",
				    block_size, detected_block_size);
			rc = -EINVAL;
			goto error_return;
		} else if (detected_block_size != 0 && block_size != detected_block_size) {
			SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
				     "auto-detected block size %" PRIu32 "\n",
				     block_size, detected_block_size);
		}
		fdisk->block_size_override = true;
	}

	if (block_size < 512) {
		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
		rc = -EINVAL;
		goto error_return;
	}

	if (!spdk_u32_is_pow2(block_size)) {
		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
		rc = -EINVAL;
		goto error_return;
	}

	fdisk->disk.blocklen = block_size;
	/* Buffer alignment must satisfy the device's native block size even
	 * when the logical block size was overridden to a larger value. */
	if (fdisk->block_size_override && detected_block_size) {
		fdisk->disk.required_alignment = spdk_u32log2(detected_block_size);
	} else {
		fdisk->disk.required_alignment = spdk_u32log2(block_size);
	}

	if (disk_size % fdisk->disk.blocklen != 0) {
		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
			    disk_size, fdisk->disk.blocklen);
		rc = -EINVAL;
		goto error_return;
	}

	fdisk->disk.blockcnt = disk_size / fdisk->disk.blocklen;
	fdisk->disk.ctxt = fdisk;

	fdisk->disk.fn_table = &aio_fn_table;

	/* Register the io_device before the bdev so channels can be created
	 * as soon as the bdev becomes visible. */
	spdk_io_device_register(fdisk, bdev_aio_create_cb, bdev_aio_destroy_cb,
				sizeof(struct bdev_aio_io_channel),
				fdisk->disk.name);
	rc = spdk_bdev_register(&fdisk->disk);
	if (rc) {
		spdk_io_device_unregister(fdisk, NULL);
		goto error_return;
	}

	TAILQ_INSERT_TAIL(&g_aio_disk_head, fdisk, link);
	return 0;

error_return:
	/* bdev_aio_close() is a no-op when the fd was never opened. */
	bdev_aio_close(fdisk);
	aio_free_disk(fdisk);
	return rc;
}
824 
/* Carries the caller's completion callback through the asynchronous
 * spdk_bdev_unregister() in bdev_aio_delete(). Freed in the callback.
 */
struct delete_aio_bdev_ctx {
	delete_aio_bdev_complete cb_fn;
	void *cb_arg;
};
829 
830 static void
831 aio_bdev_unregister_cb(void *arg, int bdeverrno)
832 {
833 	struct delete_aio_bdev_ctx *ctx = arg;
834 
835 	ctx->cb_fn(ctx->cb_arg, bdeverrno);
836 	free(ctx);
837 }
838 
839 void
840 bdev_aio_delete(struct spdk_bdev *bdev, delete_aio_bdev_complete cb_fn, void *cb_arg)
841 {
842 	struct delete_aio_bdev_ctx *ctx;
843 
844 	if (!bdev || bdev->module != &aio_if) {
845 		cb_fn(cb_arg, -ENODEV);
846 		return;
847 	}
848 
849 	ctx = calloc(1, sizeof(*ctx));
850 	if (ctx == NULL) {
851 		cb_fn(cb_arg, -ENOMEM);
852 		return;
853 	}
854 
855 	ctx->cb_fn = cb_fn;
856 	ctx->cb_arg = cb_arg;
857 	spdk_bdev_unregister(bdev, aio_bdev_unregister_cb, ctx);
858 }
859 
860 static int
861 bdev_aio_initialize(void)
862 {
863 	spdk_io_device_register(&aio_if, bdev_aio_group_create_cb, bdev_aio_group_destroy_cb,
864 				sizeof(struct bdev_aio_group_channel), "aio_module");
865 
866 	return 0;
867 }
868 
869 static void
870 bdev_aio_fini(void)
871 {
872 	spdk_io_device_unregister(&aio_if, NULL);
873 }
874 
/* Registers the "aio" log flag used by SPDK_DEBUGLOG() in this file. */
SPDK_LOG_REGISTER_COMPONENT(aio)
876