xref: /spdk/module/bdev/aio/bdev_aio.c (revision 26ae3d66d89392c108a30e405ca8424617a03417)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "bdev_aio.h"
35 
36 #include "spdk/stdinc.h"
37 
38 #include "spdk/barrier.h"
39 #include "spdk/bdev.h"
40 #include "spdk/bdev_module.h"
41 #include "spdk/env.h"
42 #include "spdk/fd.h"
43 #include "spdk/likely.h"
44 #include "spdk/thread.h"
45 #include "spdk/json.h"
46 #include "spdk/util.h"
47 #include "spdk/string.h"
48 
49 #include "spdk/log.h"
50 
51 #include <sys/eventfd.h>
52 #include <libaio.h>
53 
/* Per-thread, per-bdev I/O channel: owns one kernel AIO context. */
struct bdev_aio_io_channel {
	uint64_t				io_inflight;	/* AIOs submitted but not yet reaped */
	io_context_t				io_ctx;		/* kernel AIO context (io_setup) */
	struct bdev_aio_group_channel		*group_ch;	/* group channel that reaps completions */
	TAILQ_ENTRY(bdev_aio_io_channel)	link;		/* entry in group_ch->io_ch_head */
};
60 
/* Per-thread channel shared by all aio bdevs on that thread; owns the
 * poller (poll mode) or eventfd + interrupt (interrupt mode) that drives
 * completion reaping for every io channel in io_ch_head. */
struct bdev_aio_group_channel {
	int					efd;	/* eventfd in interrupt mode; 0 otherwise */
	struct spdk_interrupt			*intr;	/* interrupt handler (interrupt mode only) */
	struct spdk_poller			*poller;	/* completion poller (poll mode only) */
	TAILQ_HEAD(, bdev_aio_io_channel)	io_ch_head;	/* io channels serviced by this group */
};
67 
/* Per-I/O driver context carved out of each spdk_bdev_io
 * (see bdev_aio_get_ctx_size()). */
struct bdev_aio_task {
	struct iocb			iocb;	/* kernel AIO control block for this I/O */
	uint64_t			len;	/* expected transfer length in bytes */
	struct bdev_aio_io_channel	*ch;	/* channel the I/O was submitted on */
	TAILQ_ENTRY(bdev_aio_task)	link;	/* not referenced in this file */
};
74 
/* One aio bdev: an SPDK bdev backed by a regular file or block device. */
struct file_disk {
	struct bdev_aio_task	*reset_task;	/* outstanding reset, completed once I/O drains */
	struct spdk_poller	*reset_retry_timer;	/* re-checks in-flight I/O during reset */
	struct spdk_bdev	disk;	/* embedded SPDK bdev */
	char			*filename;	/* heap-owned path to backing file */
	int			fd;	/* backing file descriptor; -1 when closed */
	TAILQ_ENTRY(file_disk)  link;	/* entry in g_aio_disk_head */
	bool			block_size_override;	/* true if user forced a block size */
};
84 
/* For user space reaping of completions */
/* Mirrors the kernel's AIO completion ring header that io_setup() maps into
 * user space; an io_context_t actually points at this structure. */
struct spdk_aio_ring {
	uint32_t id;
	uint32_t size;	/* number of slots in the event array */
	uint32_t head;	/* consumer index (advanced by us) */
	uint32_t tail;	/* producer index (advanced by the kernel) */

	uint32_t version;	/* must be SPDK_AIO_RING_VERSION to reap in user space */
	uint32_t compat_features;
	uint32_t incompat_features;	/* non-zero => unknown layout, fall back to syscall */
	uint32_t header_length;	/* byte offset from ring base to the event array */
};
97 
98 #define SPDK_AIO_RING_VERSION	0xa10a10a1
99 
100 static int bdev_aio_initialize(void);
101 static void bdev_aio_fini(void);
102 static void aio_free_disk(struct file_disk *fdisk);
103 static TAILQ_HEAD(, file_disk) g_aio_disk_head = TAILQ_HEAD_INITIALIZER(g_aio_disk_head);
104 
105 #define SPDK_AIO_QUEUE_DEPTH 128
106 #define MAX_EVENTS_PER_POLL 32
107 
108 static int
109 bdev_aio_get_ctx_size(void)
110 {
111 	return sizeof(struct bdev_aio_task);
112 }
113 
/* Module descriptor: hooks init/fini and per-I/O context sizing into the
 * bdev layer, then registers the module at load time. */
static struct spdk_bdev_module aio_if = {
	.name		= "aio",
	.module_init	= bdev_aio_initialize,
	.module_fini	= bdev_aio_fini,
	.get_ctx_size	= bdev_aio_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(aio, &aio_if)
122 
123 static int
124 bdev_aio_open(struct file_disk *disk)
125 {
126 	int fd;
127 
128 	fd = open(disk->filename, O_RDWR | O_DIRECT);
129 	if (fd < 0) {
130 		/* Try without O_DIRECT for non-disk files */
131 		fd = open(disk->filename, O_RDWR);
132 		if (fd < 0) {
133 			SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
134 				    disk->filename, errno, spdk_strerror(errno));
135 			disk->fd = -1;
136 			return -1;
137 		}
138 	}
139 
140 	disk->fd = fd;
141 
142 	return 0;
143 }
144 
145 static int
146 bdev_aio_close(struct file_disk *disk)
147 {
148 	int rc;
149 
150 	if (disk->fd == -1) {
151 		return 0;
152 	}
153 
154 	rc = close(disk->fd);
155 	if (rc < 0) {
156 		SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
157 			    disk->fd, errno, spdk_strerror(errno));
158 		return -1;
159 	}
160 
161 	disk->fd = -1;
162 
163 	return 0;
164 }
165 
166 static int64_t
167 bdev_aio_readv(struct file_disk *fdisk, struct spdk_io_channel *ch,
168 	       struct bdev_aio_task *aio_task,
169 	       struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
170 {
171 	struct iocb *iocb = &aio_task->iocb;
172 	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
173 	int rc;
174 
175 	io_prep_preadv(iocb, fdisk->fd, iov, iovcnt, offset);
176 	if (aio_ch->group_ch->efd) {
177 		io_set_eventfd(iocb, aio_ch->group_ch->efd);
178 	}
179 	iocb->data = aio_task;
180 	aio_task->len = nbytes;
181 	aio_task->ch = aio_ch;
182 
183 	SPDK_DEBUGLOG(aio, "read %d iovs size %lu to off: %#lx\n",
184 		      iovcnt, nbytes, offset);
185 
186 	rc = io_submit(aio_ch->io_ctx, 1, &iocb);
187 	if (rc < 0) {
188 		if (rc == -EAGAIN) {
189 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
190 		} else {
191 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
192 			SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
193 		}
194 		return -1;
195 	}
196 	aio_ch->io_inflight++;
197 	return nbytes;
198 }
199 
200 static int64_t
201 bdev_aio_writev(struct file_disk *fdisk, struct spdk_io_channel *ch,
202 		struct bdev_aio_task *aio_task,
203 		struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
204 {
205 	struct iocb *iocb = &aio_task->iocb;
206 	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
207 	int rc;
208 
209 	io_prep_pwritev(iocb, fdisk->fd, iov, iovcnt, offset);
210 	if (aio_ch->group_ch->efd) {
211 		io_set_eventfd(iocb, aio_ch->group_ch->efd);
212 	}
213 	iocb->data = aio_task;
214 	aio_task->len = len;
215 	aio_task->ch = aio_ch;
216 
217 	SPDK_DEBUGLOG(aio, "write %d iovs size %lu from off: %#lx\n",
218 		      iovcnt, len, offset);
219 
220 	rc = io_submit(aio_ch->io_ctx, 1, &iocb);
221 	if (rc < 0) {
222 		if (rc == -EAGAIN) {
223 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
224 		} else {
225 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
226 			SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
227 		}
228 		return -1;
229 	}
230 	aio_ch->io_inflight++;
231 	return len;
232 }
233 
234 static void
235 bdev_aio_flush(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
236 {
237 	int rc = fsync(fdisk->fd);
238 
239 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task),
240 			      rc == 0 ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED);
241 }
242 
/* fn_table destruct callback: tear down a file_disk.
 *
 * Removes the disk from the global list, closes the backing fd, unregisters
 * the per-disk io_device and frees all memory. Teardown proceeds even if
 * close fails; the close result (0 or -1) is returned. */
static int
bdev_aio_destruct(void *ctx)
{
	struct file_disk *fdisk = ctx;
	int rc = 0;

	TAILQ_REMOVE(&g_aio_disk_head, fdisk, link);
	rc = bdev_aio_close(fdisk);
	if (rc < 0) {
		SPDK_ERRLOG("bdev_aio_close() failed\n");
	}
	spdk_io_device_unregister(fdisk, NULL);
	aio_free_disk(fdisk);
	return rc;
}
258 
/* Reap AIO completions directly from the kernel's user-mapped completion
 * ring, avoiding an io_getevents() syscall on the fast path.
 *
 * io_ctx actually points at the ring header (see struct spdk_aio_ring).
 * If the ring layout is not one we understand, fall back to a non-blocking
 * io_getevents() call. Returns the number of events copied into uevents
 * (<= max), or a negative value from io_getevents() on the fallback path. */
static int
bdev_user_io_getevents(io_context_t io_ctx, unsigned int max, struct io_event *uevents)
{
	uint32_t head, tail, count;
	struct spdk_aio_ring *ring;
	struct timespec timeout;
	struct io_event *kevents;

	ring = (struct spdk_aio_ring *)io_ctx;

	if (spdk_unlikely(ring->version != SPDK_AIO_RING_VERSION || ring->incompat_features != 0)) {
		/* Unknown ring format - let the kernel reap, without blocking. */
		timeout.tv_sec = 0;
		timeout.tv_nsec = 0;

		return io_getevents(io_ctx, 0, max, uevents, &timeout);
	}

	/* Read the current state out of the ring */
	head = ring->head;
	tail = ring->tail;

	/* This memory barrier is required to prevent the loads above
	 * from being re-ordered with stores to the events array
	 * potentially occurring on other threads. */
	spdk_smp_rmb();

	/* Calculate how many items are in the circular ring */
	count = tail - head;
	if (tail < head) {
		/* The producer index wrapped past the end of the ring. */
		count += ring->size;
	}

	/* Reduce the count to the limit provided by the user */
	count = spdk_min(max, count);

	/* Grab the memory location of the event array */
	kevents = (struct io_event *)((uintptr_t)ring + ring->header_length);

	/* Copy the events out of the ring. */
	if ((head + count) <= ring->size) {
		/* Only one copy is required */
		memcpy(uevents, &kevents[head], count * sizeof(struct io_event));
	} else {
		uint32_t first_part = ring->size - head;
		/* Two copies are required */
		memcpy(uevents, &kevents[head], first_part * sizeof(struct io_event));
		memcpy(&uevents[first_part], &kevents[0], (count - first_part) * sizeof(struct io_event));
	}

	/* Update the head pointer. On x86, stores will not be reordered with older loads,
	 * so the copies out of the event array will always be complete prior to this
	 * update becoming visible. On other architectures this is not guaranteed, so
	 * add a barrier. */
#if defined(__i386__) || defined(__x86_64__)
	spdk_compiler_barrier();
#else
	spdk_smp_mb();
#endif
	ring->head = (head + count) % ring->size;

	return count;
}
321 
322 static int
323 bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch)
324 {
325 	int nr, i = 0;
326 	enum spdk_bdev_io_status status;
327 	struct bdev_aio_task *aio_task;
328 	struct io_event events[SPDK_AIO_QUEUE_DEPTH];
329 
330 	nr = bdev_user_io_getevents(io_ch->io_ctx, SPDK_AIO_QUEUE_DEPTH, events);
331 
332 	if (nr < 0) {
333 		return 0;
334 	}
335 
336 	for (i = 0; i < nr; i++) {
337 		aio_task = events[i].data;
338 		if (events[i].res != aio_task->len) {
339 			status = SPDK_BDEV_IO_STATUS_FAILED;
340 		} else {
341 			status = SPDK_BDEV_IO_STATUS_SUCCESS;
342 		}
343 
344 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), status);
345 		aio_task->ch->io_inflight--;
346 	}
347 
348 	return nr;
349 }
350 
351 static int
352 bdev_aio_group_poll(void *arg)
353 {
354 	struct bdev_aio_group_channel *group_ch = arg;
355 	struct bdev_aio_io_channel *io_ch;
356 	int nr = 0;
357 
358 	TAILQ_FOREACH(io_ch, &group_ch->io_ch_head, link) {
359 		nr += bdev_aio_io_channel_poll(io_ch);
360 	}
361 
362 	return nr > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
363 }
364 
/* Interrupt-mode handler for the group eventfd.
 *
 * Reading the eventfd clears its counter, which holds the number of
 * completions signaled since the last read. One poll pass reaps at most
 * SPDK_AIO_QUEUE_DEPTH events per channel, so any excess is written back
 * to the eventfd to re-trigger this handler for the remainder. */
static int
bdev_aio_group_interrupt(void *arg)
{
	struct bdev_aio_group_channel *group_ch = arg;
	int rc;
	uint64_t num_events;

	/* efd is 0 only when interrupt mode is off - we should not be called then. */
	assert(group_ch->efd);

	/* if completed IO number is larger than SPDK_AIO_QUEUE_DEPTH,
	 * io_getevent should be called again to ensure all completed IO are processed.
	 */
	rc = read(group_ch->efd, &num_events, sizeof(num_events));
	if (rc < 0) {
		SPDK_ERRLOG("failed to acknowledge aio group: %s.\n", spdk_strerror(errno));
		return -errno;
	}

	if (num_events > SPDK_AIO_QUEUE_DEPTH) {
		num_events -= SPDK_AIO_QUEUE_DEPTH;
		/* Re-arm the eventfd so the remaining completions get another pass. */
		rc = write(group_ch->efd, &num_events, sizeof(num_events));
		if (rc < 0) {
			SPDK_ERRLOG("failed to notify aio group: %s.\n", spdk_strerror(errno));
		}
	}

	return bdev_aio_group_poll(group_ch);
}
393 
394 static void
395 _bdev_aio_get_io_inflight(struct spdk_io_channel_iter *i)
396 {
397 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
398 	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
399 
400 	if (aio_ch->io_inflight) {
401 		spdk_for_each_channel_continue(i, -1);
402 		return;
403 	}
404 
405 	spdk_for_each_channel_continue(i, 0);
406 }
407 
408 static int bdev_aio_reset_retry_timer(void *arg);
409 
410 static void
411 _bdev_aio_get_io_inflight_done(struct spdk_io_channel_iter *i, int status)
412 {
413 	struct file_disk *fdisk = spdk_io_channel_iter_get_ctx(i);
414 
415 	if (status == -1) {
416 		fdisk->reset_retry_timer = SPDK_POLLER_REGISTER(bdev_aio_reset_retry_timer, fdisk, 500);
417 		return;
418 	}
419 
420 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(fdisk->reset_task), SPDK_BDEV_IO_STATUS_SUCCESS);
421 }
422 
423 static int
424 bdev_aio_reset_retry_timer(void *arg)
425 {
426 	struct file_disk *fdisk = arg;
427 
428 	if (fdisk->reset_retry_timer) {
429 		spdk_poller_unregister(&fdisk->reset_retry_timer);
430 	}
431 
432 	spdk_for_each_channel(fdisk,
433 			      _bdev_aio_get_io_inflight,
434 			      fdisk,
435 			      _bdev_aio_get_io_inflight_done);
436 
437 	return SPDK_POLLER_BUSY;
438 }
439 
/* Begin a bdev reset: remember the reset task and start the drain loop that
 * completes it once all channels have no in-flight I/O. */
static void
bdev_aio_reset(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
{
	fdisk->reset_task = aio_task;

	bdev_aio_reset_retry_timer(fdisk);
}
447 
448 static void
449 bdev_aio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
450 		    bool success)
451 {
452 	if (!success) {
453 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
454 		return;
455 	}
456 
457 	switch (bdev_io->type) {
458 	case SPDK_BDEV_IO_TYPE_READ:
459 		bdev_aio_readv((struct file_disk *)bdev_io->bdev->ctxt,
460 			       ch,
461 			       (struct bdev_aio_task *)bdev_io->driver_ctx,
462 			       bdev_io->u.bdev.iovs,
463 			       bdev_io->u.bdev.iovcnt,
464 			       bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
465 			       bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
466 		break;
467 	case SPDK_BDEV_IO_TYPE_WRITE:
468 		bdev_aio_writev((struct file_disk *)bdev_io->bdev->ctxt,
469 				ch,
470 				(struct bdev_aio_task *)bdev_io->driver_ctx,
471 				bdev_io->u.bdev.iovs,
472 				bdev_io->u.bdev.iovcnt,
473 				bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
474 				bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
475 		break;
476 	default:
477 		SPDK_ERRLOG("Wrong io type\n");
478 		break;
479 	}
480 }
481 
482 static int _bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
483 {
484 	switch (bdev_io->type) {
485 	/* Read and write operations must be performed on buffers aligned to
486 	 * bdev->required_alignment. If user specified unaligned buffers,
487 	 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
488 	case SPDK_BDEV_IO_TYPE_READ:
489 	case SPDK_BDEV_IO_TYPE_WRITE:
490 		spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb,
491 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
492 		return 0;
493 	case SPDK_BDEV_IO_TYPE_FLUSH:
494 		bdev_aio_flush((struct file_disk *)bdev_io->bdev->ctxt,
495 			       (struct bdev_aio_task *)bdev_io->driver_ctx);
496 		return 0;
497 
498 	case SPDK_BDEV_IO_TYPE_RESET:
499 		bdev_aio_reset((struct file_disk *)bdev_io->bdev->ctxt,
500 			       (struct bdev_aio_task *)bdev_io->driver_ctx);
501 		return 0;
502 	default:
503 		return -1;
504 	}
505 }
506 
507 static void bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
508 {
509 	if (_bdev_aio_submit_request(ch, bdev_io) < 0) {
510 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
511 	}
512 }
513 
514 static bool
515 bdev_aio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
516 {
517 	switch (io_type) {
518 	case SPDK_BDEV_IO_TYPE_READ:
519 	case SPDK_BDEV_IO_TYPE_WRITE:
520 	case SPDK_BDEV_IO_TYPE_FLUSH:
521 	case SPDK_BDEV_IO_TYPE_RESET:
522 		return true;
523 
524 	default:
525 		return false;
526 	}
527 }
528 
529 static int
530 bdev_aio_create_cb(void *io_device, void *ctx_buf)
531 {
532 	struct bdev_aio_io_channel *ch = ctx_buf;
533 
534 	if (io_setup(SPDK_AIO_QUEUE_DEPTH, &ch->io_ctx) < 0) {
535 		SPDK_ERRLOG("async I/O context setup failure\n");
536 		return -1;
537 	}
538 
539 	ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&aio_if));
540 	TAILQ_INSERT_TAIL(&ch->group_ch->io_ch_head, ch, link);
541 
542 	return 0;
543 }
544 
545 static void
546 bdev_aio_destroy_cb(void *io_device, void *ctx_buf)
547 {
548 	struct bdev_aio_io_channel *ch = ctx_buf;
549 
550 	io_destroy(ch->io_ctx);
551 
552 	assert(ch->group_ch);
553 	TAILQ_REMOVE(&ch->group_ch->io_ch_head, ch, link);
554 
555 	spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
556 }
557 
/* fn_table get_io_channel callback: the file_disk itself is the io_device,
 * registered in create_aio_bdev(). */
static struct spdk_io_channel *
bdev_aio_get_io_channel(void *ctx)
{
	struct file_disk *fdisk = ctx;

	return spdk_get_io_channel(fdisk);
}
565 
566 
/* fn_table dump_info_json callback: emit the aio-specific portion of
 * bdev info output (just the backing filename). Always returns 0. */
static int
bdev_aio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct file_disk *fdisk = ctx;

	spdk_json_write_named_object_begin(w, "aio");

	spdk_json_write_named_string(w, "filename", fdisk->filename);

	spdk_json_write_object_end(w);

	return 0;
}
580 
/* fn_table write_config_json callback: emit the bdev_aio_create RPC call
 * that would recreate this bdev. block_size is written only when the user
 * explicitly overrode it at creation time. */
static void
bdev_aio_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	struct file_disk *fdisk = bdev->ctxt;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_aio_create");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	if (fdisk->block_size_override) {
		spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
	}
	spdk_json_write_named_string(w, "filename", fdisk->filename);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
600 
/* Dispatch table handed to the bdev layer for every aio bdev. */
static const struct spdk_bdev_fn_table aio_fn_table = {
	.destruct		= bdev_aio_destruct,
	.submit_request		= bdev_aio_submit_request,
	.io_type_supported	= bdev_aio_io_type_supported,
	.get_io_channel		= bdev_aio_get_io_channel,
	.dump_info_json		= bdev_aio_dump_info_json,
	.write_config_json	= bdev_aio_write_json_config,
};
609 
610 static void aio_free_disk(struct file_disk *fdisk)
611 {
612 	if (fdisk == NULL) {
613 		return;
614 	}
615 	free(fdisk->filename);
616 	free(fdisk->disk.name);
617 	free(fdisk);
618 }
619 
620 static int
621 bdev_aio_register_interrupt(struct bdev_aio_group_channel *ch)
622 {
623 	int efd;
624 
625 	efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
626 	if (efd < 0) {
627 		return -1;
628 	}
629 
630 	ch->intr = SPDK_INTERRUPT_REGISTER(efd, bdev_aio_group_interrupt, ch);
631 	if (ch->intr == NULL) {
632 		close(efd);
633 		return -1;
634 	}
635 	ch->efd = efd;
636 
637 	return 0;
638 }
639 
/* Tear down interrupt-mode resources: unregister the handler, close the
 * eventfd and reset efd to 0 (the "no eventfd" sentinel checked by the
 * submit paths). */
static void
bdev_aio_unregister_interrupt(struct bdev_aio_group_channel *ch)
{
	spdk_interrupt_unregister(&ch->intr);
	close(ch->efd);
	ch->efd = 0;
}
647 
648 static int
649 bdev_aio_group_create_cb(void *io_device, void *ctx_buf)
650 {
651 	struct bdev_aio_group_channel *ch = ctx_buf;
652 
653 	TAILQ_INIT(&ch->io_ch_head);
654 
655 	if (spdk_interrupt_mode_is_enabled()) {
656 		return bdev_aio_register_interrupt(ch);
657 	}
658 
659 	ch->poller = SPDK_POLLER_REGISTER(bdev_aio_group_poll, ch, 0);
660 
661 	return 0;
662 }
663 
664 static void
665 bdev_aio_group_destroy_cb(void *io_device, void *ctx_buf)
666 {
667 	struct bdev_aio_group_channel *ch = ctx_buf;
668 
669 	if (!TAILQ_EMPTY(&ch->io_ch_head)) {
670 		SPDK_ERRLOG("Group channel of bdev aio has uncleared io channel\n");
671 	}
672 
673 	if (ch->intr) {
674 		bdev_aio_unregister_interrupt(ch);
675 		return;
676 	}
677 
678 	spdk_poller_unregister(&ch->poller);
679 }
680 
681 int
682 create_aio_bdev(const char *name, const char *filename, uint32_t block_size)
683 {
684 	struct file_disk *fdisk;
685 	uint32_t detected_block_size;
686 	uint64_t disk_size;
687 	int rc;
688 
689 	fdisk = calloc(1, sizeof(*fdisk));
690 	if (!fdisk) {
691 		SPDK_ERRLOG("Unable to allocate enough memory for aio backend\n");
692 		return -ENOMEM;
693 	}
694 
695 	fdisk->filename = strdup(filename);
696 	if (!fdisk->filename) {
697 		rc = -ENOMEM;
698 		goto error_return;
699 	}
700 
701 	if (bdev_aio_open(fdisk)) {
702 		SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, fdisk->fd, errno);
703 		rc = -errno;
704 		goto error_return;
705 	}
706 
707 	disk_size = spdk_fd_get_size(fdisk->fd);
708 
709 	fdisk->disk.name = strdup(name);
710 	if (!fdisk->disk.name) {
711 		rc = -ENOMEM;
712 		goto error_return;
713 	}
714 	fdisk->disk.product_name = "AIO disk";
715 	fdisk->disk.module = &aio_if;
716 
717 	fdisk->disk.write_cache = 1;
718 
719 	detected_block_size = spdk_fd_get_blocklen(fdisk->fd);
720 	if (block_size == 0) {
721 		/* User did not specify block size - use autodetected block size. */
722 		if (detected_block_size == 0) {
723 			SPDK_ERRLOG("Block size could not be auto-detected\n");
724 			rc = -EINVAL;
725 			goto error_return;
726 		}
727 		fdisk->block_size_override = false;
728 		block_size = detected_block_size;
729 	} else {
730 		if (block_size < detected_block_size) {
731 			SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
732 				    "auto-detected block size %" PRIu32 "\n",
733 				    block_size, detected_block_size);
734 			rc = -EINVAL;
735 			goto error_return;
736 		} else if (detected_block_size != 0 && block_size != detected_block_size) {
737 			SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
738 				     "auto-detected block size %" PRIu32 "\n",
739 				     block_size, detected_block_size);
740 		}
741 		fdisk->block_size_override = true;
742 	}
743 
744 	if (block_size < 512) {
745 		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
746 		rc = -EINVAL;
747 		goto error_return;
748 	}
749 
750 	if (!spdk_u32_is_pow2(block_size)) {
751 		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
752 		rc = -EINVAL;
753 		goto error_return;
754 	}
755 
756 	fdisk->disk.blocklen = block_size;
757 	if (fdisk->block_size_override && detected_block_size) {
758 		fdisk->disk.required_alignment = spdk_u32log2(detected_block_size);
759 	} else {
760 		fdisk->disk.required_alignment = spdk_u32log2(block_size);
761 	}
762 
763 	if (disk_size % fdisk->disk.blocklen != 0) {
764 		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
765 			    disk_size, fdisk->disk.blocklen);
766 		rc = -EINVAL;
767 		goto error_return;
768 	}
769 
770 	fdisk->disk.blockcnt = disk_size / fdisk->disk.blocklen;
771 	fdisk->disk.ctxt = fdisk;
772 
773 	fdisk->disk.fn_table = &aio_fn_table;
774 
775 	spdk_io_device_register(fdisk, bdev_aio_create_cb, bdev_aio_destroy_cb,
776 				sizeof(struct bdev_aio_io_channel),
777 				fdisk->disk.name);
778 	rc = spdk_bdev_register(&fdisk->disk);
779 	if (rc) {
780 		spdk_io_device_unregister(fdisk, NULL);
781 		goto error_return;
782 	}
783 
784 	TAILQ_INSERT_TAIL(&g_aio_disk_head, fdisk, link);
785 	return 0;
786 
787 error_return:
788 	bdev_aio_close(fdisk);
789 	aio_free_disk(fdisk);
790 	return rc;
791 }
792 
/* Carries the caller's completion callback through the async unregister. */
struct delete_aio_bdev_ctx {
	delete_aio_bdev_complete cb_fn;
	void *cb_arg;
};
797 
/* spdk_bdev_unregister completion: relay the result to the original caller
 * and free the context allocated in bdev_aio_delete(). */
static void
aio_bdev_unregister_cb(void *arg, int bdeverrno)
{
	struct delete_aio_bdev_ctx *ctx = arg;

	ctx->cb_fn(ctx->cb_arg, bdeverrno);
	free(ctx);
}
806 
807 void
808 bdev_aio_delete(struct spdk_bdev *bdev, delete_aio_bdev_complete cb_fn, void *cb_arg)
809 {
810 	struct delete_aio_bdev_ctx *ctx;
811 
812 	if (!bdev || bdev->module != &aio_if) {
813 		cb_fn(cb_arg, -ENODEV);
814 		return;
815 	}
816 
817 	ctx = calloc(1, sizeof(*ctx));
818 	if (ctx == NULL) {
819 		cb_fn(cb_arg, -ENOMEM);
820 		return;
821 	}
822 
823 	ctx->cb_fn = cb_fn;
824 	ctx->cb_arg = cb_arg;
825 	spdk_bdev_unregister(bdev, aio_bdev_unregister_cb, ctx);
826 }
827 
/* Module init: register the module-level io_device whose per-thread channels
 * are the group channels (shared pollers/interrupts for all aio bdevs). */
static int
bdev_aio_initialize(void)
{
	spdk_io_device_register(&aio_if, bdev_aio_group_create_cb, bdev_aio_group_destroy_cb,
				sizeof(struct bdev_aio_group_channel), "aio_module");

	return 0;
}
836 
/* Module teardown: drop the module-level io_device registered in
 * bdev_aio_initialize(). */
static void
bdev_aio_fini(void)
{
	spdk_io_device_unregister(&aio_if, NULL);
}
842 
843 SPDK_LOG_REGISTER_COMPONENT(aio)
844