xref: /spdk/module/bdev/aio/bdev_aio.c (revision 3630473789c359155f05075bea018c32d24032b3)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "bdev_aio.h"
35 
36 #include "spdk/stdinc.h"
37 
38 #include "spdk/barrier.h"
39 #include "spdk/bdev.h"
40 #include "spdk/bdev_module.h"
41 #include "spdk/env.h"
42 #include "spdk/fd.h"
43 #include "spdk/likely.h"
44 #include "spdk/thread.h"
45 #include "spdk/json.h"
46 #include "spdk/util.h"
47 #include "spdk/string.h"
48 
49 #include "spdk/log.h"
50 
51 #include <sys/eventfd.h>
52 #include <libaio.h>
53 
/* Per-thread I/O channel for one AIO bdev. */
struct bdev_aio_io_channel {
	uint64_t				io_inflight;	/* iocbs submitted but not yet reaped */
	io_context_t				io_ctx;		/* kernel AIO context for this channel */
	struct bdev_aio_group_channel		*group_ch;	/* group channel that polls this channel */
	TAILQ_ENTRY(bdev_aio_io_channel)	link;		/* entry on group_ch->io_ch_head */
};
60 
/* Per-thread group channel shared by all AIO io channels on one SPDK thread. */
struct bdev_aio_group_channel {
	/* eventfd for io completion notification in interrupt mode.
	 * Negative value like '-1' indicates it is invalid or unused.
	 */
	int					efd;
	struct spdk_interrupt			*intr;		/* interrupt handler registered on efd */
	struct spdk_poller			*poller;	/* completion poller */
	TAILQ_HEAD(, bdev_aio_io_channel)	io_ch_head;	/* io channels serviced by this group */
};
70 
/* Per-I/O driver context; lives in spdk_bdev_io->driver_ctx. */
struct bdev_aio_task {
	struct iocb			iocb;	/* kernel AIO control block; iocb.data points back here */
	uint64_t			len;	/* expected byte count, compared with io_event.res at completion */
	struct bdev_aio_io_channel	*ch;	/* submitting channel, for io_inflight accounting */
	TAILQ_ENTRY(bdev_aio_task)	link;	/* NOTE(review): not used within this file — confirm before removal */
};
77 
/* One AIO bdev backed by a regular file or block device. */
struct file_disk {
	struct bdev_aio_task	*reset_task;		/* in-progress reset waiting for channels to drain */
	struct spdk_poller	*reset_retry_timer;	/* retry timer armed while I/O is still inflight */
	struct spdk_bdev	disk;			/* embedded generic bdev */
	char			*filename;		/* backing file path (owned, freed in aio_free_disk) */
	int			fd;			/* open file descriptor, -1 when closed */
	TAILQ_ENTRY(file_disk)  link;			/* entry on g_aio_disk_head */
	bool			block_size_override;	/* true when the user supplied block_size at create */
};
87 
/* For user space reaping of completions */
/* Mirrors the header of the kernel's AIO completion ring that io_context_t
 * points at; the io_event array follows at byte offset header_length. */
struct spdk_aio_ring {
	uint32_t id;
	uint32_t size;			/* number of io_event slots in the ring */
	uint32_t head;			/* consumer index, advanced by us */
	uint32_t tail;			/* producer index, advanced by the kernel */

	uint32_t version;		/* must equal SPDK_AIO_RING_VERSION to reap in user space */
	uint32_t compat_features;
	uint32_t incompat_features;	/* nonzero -> layout unknown, fall back to io_getevents() */
	uint32_t header_length;		/* byte offset of the event array from the ring base */
};
100 
/* Magic value identifying the kernel AIO ring layout we know how to reap. */
#define SPDK_AIO_RING_VERSION	0xa10a10a1

static int bdev_aio_initialize(void);
static void bdev_aio_fini(void);
static void aio_free_disk(struct file_disk *fdisk);
/* All registered aio disks, used for teardown on destruct. */
static TAILQ_HEAD(, file_disk) g_aio_disk_head = TAILQ_HEAD_INITIALIZER(g_aio_disk_head);

/* Queue depth passed to io_setup() and the per-poll reap batch size. */
#define SPDK_AIO_QUEUE_DEPTH 128
#define MAX_EVENTS_PER_POLL 32
110 
111 static int
112 bdev_aio_get_ctx_size(void)
113 {
114 	return sizeof(struct bdev_aio_task);
115 }
116 
/* Module descriptor registered with the generic bdev layer. */
static struct spdk_bdev_module aio_if = {
	.name		= "aio",
	.module_init	= bdev_aio_initialize,
	.module_fini	= bdev_aio_fini,
	.get_ctx_size	= bdev_aio_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(aio, &aio_if)
125 
126 static int
127 bdev_aio_open(struct file_disk *disk)
128 {
129 	int fd;
130 
131 	fd = open(disk->filename, O_RDWR | O_DIRECT);
132 	if (fd < 0) {
133 		/* Try without O_DIRECT for non-disk files */
134 		fd = open(disk->filename, O_RDWR);
135 		if (fd < 0) {
136 			SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
137 				    disk->filename, errno, spdk_strerror(errno));
138 			disk->fd = -1;
139 			return -1;
140 		}
141 	}
142 
143 	disk->fd = fd;
144 
145 	return 0;
146 }
147 
148 static int
149 bdev_aio_close(struct file_disk *disk)
150 {
151 	int rc;
152 
153 	if (disk->fd == -1) {
154 		return 0;
155 	}
156 
157 	rc = close(disk->fd);
158 	if (rc < 0) {
159 		SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
160 			    disk->fd, errno, spdk_strerror(errno));
161 		return -1;
162 	}
163 
164 	disk->fd = -1;
165 
166 	return 0;
167 }
168 
/* Submit a vectored read of nbytes at byte offset via Linux AIO.
 * Returns nbytes on successful submission, -1 on submit failure (in which
 * case the bdev_io has already been completed with an error status). */
static int64_t
bdev_aio_readv(struct file_disk *fdisk, struct spdk_io_channel *ch,
	       struct bdev_aio_task *aio_task,
	       struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
{
	struct iocb *iocb = &aio_task->iocb;
	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
	int rc;

	io_prep_preadv(iocb, fdisk->fd, iov, iovcnt, offset);
	if (aio_ch->group_ch->efd >= 0) {
		/* Interrupt mode: have the kernel signal completion on the group eventfd. */
		io_set_eventfd(iocb, aio_ch->group_ch->efd);
	}
	/* iocb->data comes back in io_event.data at completion time. */
	iocb->data = aio_task;
	aio_task->len = nbytes;
	aio_task->ch = aio_ch;

	SPDK_DEBUGLOG(aio, "read %d iovs size %lu to off: %#lx\n",
		      iovcnt, nbytes, offset);

	rc = io_submit(aio_ch->io_ctx, 1, &iocb);
	if (rc < 0) {
		if (rc == -EAGAIN) {
			/* Kernel queue full: NOMEM tells the bdev layer to retry later. */
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), rc);
			SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
		}
		return -1;
	}
	aio_ch->io_inflight++;
	return nbytes;
}
202 
203 static int64_t
204 bdev_aio_writev(struct file_disk *fdisk, struct spdk_io_channel *ch,
205 		struct bdev_aio_task *aio_task,
206 		struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
207 {
208 	struct iocb *iocb = &aio_task->iocb;
209 	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
210 	int rc;
211 
212 	io_prep_pwritev(iocb, fdisk->fd, iov, iovcnt, offset);
213 	if (aio_ch->group_ch->efd >= 0) {
214 		io_set_eventfd(iocb, aio_ch->group_ch->efd);
215 	}
216 	iocb->data = aio_task;
217 	aio_task->len = len;
218 	aio_task->ch = aio_ch;
219 
220 	SPDK_DEBUGLOG(aio, "write %d iovs size %lu from off: %#lx\n",
221 		      iovcnt, len, offset);
222 
223 	rc = io_submit(aio_ch->io_ctx, 1, &iocb);
224 	if (rc < 0) {
225 		if (rc == -EAGAIN) {
226 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
227 		} else {
228 			spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), rc);
229 			SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
230 		}
231 		return -1;
232 	}
233 	aio_ch->io_inflight++;
234 	return len;
235 }
236 
237 static void
238 bdev_aio_flush(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
239 {
240 	int rc = fsync(fdisk->fd);
241 
242 	if (rc == 0) {
243 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
244 	} else {
245 		spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -errno);
246 	}
247 }
248 
249 static void
250 bdev_aio_destruct_cb(void *io_device)
251 {
252 	struct file_disk *fdisk = io_device;
253 	int rc = 0;
254 
255 	TAILQ_REMOVE(&g_aio_disk_head, fdisk, link);
256 	rc = bdev_aio_close(fdisk);
257 	if (rc < 0) {
258 		SPDK_ERRLOG("bdev_aio_close() failed\n");
259 	}
260 
261 	aio_free_disk(fdisk);
262 }
263 
264 static int
265 bdev_aio_destruct(void *ctx)
266 {
267 	struct file_disk *fdisk = ctx;
268 
269 	spdk_io_device_unregister(fdisk, bdev_aio_destruct_cb);
270 
271 	return 0;
272 }
273 
/* Reap AIO completions by reading the kernel's completion ring directly from
 * user space, avoiding an io_getevents() syscall in the common case.  Falls
 * back to a non-blocking io_getevents() when the ring layout is not one we
 * understand.  Returns the number of events copied into uevents (<= max). */
static int
bdev_user_io_getevents(io_context_t io_ctx, unsigned int max, struct io_event *uevents)
{
	uint32_t head, tail, count;
	struct spdk_aio_ring *ring;
	struct timespec timeout;
	struct io_event *kevents;

	/* The io_context_t handle is a pointer to the mapped completion ring. */
	ring = (struct spdk_aio_ring *)io_ctx;

	if (spdk_unlikely(ring->version != SPDK_AIO_RING_VERSION || ring->incompat_features != 0)) {
		/* Unknown ring layout: use the syscall with a zero timeout. */
		timeout.tv_sec = 0;
		timeout.tv_nsec = 0;

		return io_getevents(io_ctx, 0, max, uevents, &timeout);
	}

	/* Read the current state out of the ring */
	head = ring->head;
	tail = ring->tail;

	/* This memory barrier is required to prevent the loads above
	 * from being re-ordered with stores to the events array
	 * potentially occurring on other threads. */
	spdk_smp_rmb();

	/* Calculate how many items are in the circular ring */
	count = tail - head;
	if (tail < head) {
		count += ring->size;
	}

	/* Reduce the count to the limit provided by the user */
	count = spdk_min(max, count);

	/* Grab the memory location of the event array */
	kevents = (struct io_event *)((uintptr_t)ring + ring->header_length);

	/* Copy the events out of the ring. */
	if ((head + count) <= ring->size) {
		/* Only one copy is required */
		memcpy(uevents, &kevents[head], count * sizeof(struct io_event));
	} else {
		uint32_t first_part = ring->size - head;
		/* Two copies are required */
		memcpy(uevents, &kevents[head], first_part * sizeof(struct io_event));
		memcpy(&uevents[first_part], &kevents[0], (count - first_part) * sizeof(struct io_event));
	}

	/* Update the head pointer. On x86, stores will not be reordered with older loads,
	 * so the copies out of the event array will always be complete prior to this
	 * update becoming visible. On other architectures this is not guaranteed, so
	 * add a barrier. */
#if defined(__i386__) || defined(__x86_64__)
	spdk_compiler_barrier();
#else
	spdk_smp_mb();
#endif
	ring->head = (head + count) % ring->size;

	return count;
}
336 
/* Reap up to SPDK_AIO_QUEUE_DEPTH completions from one io channel and
 * complete the corresponding bdev_ios.  Returns the number of events
 * processed (0 if the reap failed). */
static int
bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch)
{
	int nr, i = 0;
	struct bdev_aio_task *aio_task;
	struct io_event events[SPDK_AIO_QUEUE_DEPTH];
	uint64_t io_result;

	nr = bdev_user_io_getevents(io_ch->io_ctx, SPDK_AIO_QUEUE_DEPTH, events);

	if (nr < 0) {
		return 0;
	}

	/* res values below this boundary are treated as errno codes rather
	 * than completed byte counts. */
#define MAX_AIO_ERRNO 256
	for (i = 0; i < nr; i++) {
		aio_task = events[i].data;
		aio_task->ch->io_inflight--;
		io_result = events[i].res;
		if (io_result == aio_task->len) {
			/* Full transfer completed. */
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
		} else if (io_result < MAX_AIO_ERRNO) {
			/* Linux AIO will return its errno to io_event.res */
			/* NOTE(review): this assumes res carries a small positive errno;
			 * a negative errno in res would appear as a huge unsigned value
			 * and take the FAILED branch below instead — confirm against the
			 * kernel AIO contract. */
			int aio_errno = io_result;

			spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -aio_errno);
		} else {
			/* Short transfer that is not an errno: treat as failure. */
			SPDK_ERRLOG("failed to complete aio: requested len is %lu, but completed len is %lu.\n",
				    aio_task->len, io_result);
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
		}
	}

	return nr;
}
372 
373 static int
374 bdev_aio_group_poll(void *arg)
375 {
376 	struct bdev_aio_group_channel *group_ch = arg;
377 	struct bdev_aio_io_channel *io_ch;
378 	int nr = 0;
379 
380 	TAILQ_FOREACH(io_ch, &group_ch->io_ch_head, link) {
381 		nr += bdev_aio_io_channel_poll(io_ch);
382 	}
383 
384 	return nr > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
385 }
386 
/* Interrupt-mode handler: acknowledge the eventfd, then reap completions.
 * Returns the poll result, or -errno if reading the eventfd failed. */
static int
bdev_aio_group_interrupt(void *arg)
{
	struct bdev_aio_group_channel *group_ch = arg;
	int rc;
	uint64_t num_events;

	assert(group_ch->efd >= 0);

	/* if completed IO number is larger than SPDK_AIO_QUEUE_DEPTH,
	 * io_getevent should be called again to ensure all completed IO are processed.
	 */
	rc = read(group_ch->efd, &num_events, sizeof(num_events));
	if (rc < 0) {
		SPDK_ERRLOG("failed to acknowledge aio group: %s.\n", spdk_strerror(errno));
		return -errno;
	}

	if (num_events > SPDK_AIO_QUEUE_DEPTH) {
		/* Write the surplus back so the eventfd stays signaled and this
		 * handler is invoked again for the remaining completions. */
		num_events -= SPDK_AIO_QUEUE_DEPTH;
		rc = write(group_ch->efd, &num_events, sizeof(num_events));
		if (rc < 0) {
			SPDK_ERRLOG("failed to notify aio group: %s.\n", spdk_strerror(errno));
		}
	}

	return bdev_aio_group_poll(group_ch);
}
415 
416 static void
417 _bdev_aio_get_io_inflight(struct spdk_io_channel_iter *i)
418 {
419 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
420 	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
421 
422 	if (aio_ch->io_inflight) {
423 		spdk_for_each_channel_continue(i, -1);
424 		return;
425 	}
426 
427 	spdk_for_each_channel_continue(i, 0);
428 }
429 
430 static int bdev_aio_reset_retry_timer(void *arg);
431 
432 static void
433 _bdev_aio_get_io_inflight_done(struct spdk_io_channel_iter *i, int status)
434 {
435 	struct file_disk *fdisk = spdk_io_channel_iter_get_ctx(i);
436 
437 	if (status == -1) {
438 		fdisk->reset_retry_timer = SPDK_POLLER_REGISTER(bdev_aio_reset_retry_timer, fdisk, 500);
439 		return;
440 	}
441 
442 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(fdisk->reset_task), SPDK_BDEV_IO_STATUS_SUCCESS);
443 }
444 
445 static int
446 bdev_aio_reset_retry_timer(void *arg)
447 {
448 	struct file_disk *fdisk = arg;
449 
450 	if (fdisk->reset_retry_timer) {
451 		spdk_poller_unregister(&fdisk->reset_retry_timer);
452 	}
453 
454 	spdk_for_each_channel(fdisk,
455 			      _bdev_aio_get_io_inflight,
456 			      fdisk,
457 			      _bdev_aio_get_io_inflight_done);
458 
459 	return SPDK_POLLER_BUSY;
460 }
461 
462 static void
463 bdev_aio_reset(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
464 {
465 	fdisk->reset_task = aio_task;
466 
467 	bdev_aio_reset_retry_timer(fdisk);
468 }
469 
470 static void
471 bdev_aio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
472 		    bool success)
473 {
474 	if (!success) {
475 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
476 		return;
477 	}
478 
479 	switch (bdev_io->type) {
480 	case SPDK_BDEV_IO_TYPE_READ:
481 		bdev_aio_readv((struct file_disk *)bdev_io->bdev->ctxt,
482 			       ch,
483 			       (struct bdev_aio_task *)bdev_io->driver_ctx,
484 			       bdev_io->u.bdev.iovs,
485 			       bdev_io->u.bdev.iovcnt,
486 			       bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
487 			       bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
488 		break;
489 	case SPDK_BDEV_IO_TYPE_WRITE:
490 		bdev_aio_writev((struct file_disk *)bdev_io->bdev->ctxt,
491 				ch,
492 				(struct bdev_aio_task *)bdev_io->driver_ctx,
493 				bdev_io->u.bdev.iovs,
494 				bdev_io->u.bdev.iovcnt,
495 				bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
496 				bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
497 		break;
498 	default:
499 		SPDK_ERRLOG("Wrong io type\n");
500 		break;
501 	}
502 }
503 
504 static int _bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
505 {
506 	switch (bdev_io->type) {
507 	/* Read and write operations must be performed on buffers aligned to
508 	 * bdev->required_alignment. If user specified unaligned buffers,
509 	 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
510 	case SPDK_BDEV_IO_TYPE_READ:
511 	case SPDK_BDEV_IO_TYPE_WRITE:
512 		spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb,
513 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
514 		return 0;
515 	case SPDK_BDEV_IO_TYPE_FLUSH:
516 		bdev_aio_flush((struct file_disk *)bdev_io->bdev->ctxt,
517 			       (struct bdev_aio_task *)bdev_io->driver_ctx);
518 		return 0;
519 
520 	case SPDK_BDEV_IO_TYPE_RESET:
521 		bdev_aio_reset((struct file_disk *)bdev_io->bdev->ctxt,
522 			       (struct bdev_aio_task *)bdev_io->driver_ctx);
523 		return 0;
524 	default:
525 		return -1;
526 	}
527 }
528 
529 static void bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
530 {
531 	if (_bdev_aio_submit_request(ch, bdev_io) < 0) {
532 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
533 	}
534 }
535 
536 static bool
537 bdev_aio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
538 {
539 	switch (io_type) {
540 	case SPDK_BDEV_IO_TYPE_READ:
541 	case SPDK_BDEV_IO_TYPE_WRITE:
542 	case SPDK_BDEV_IO_TYPE_FLUSH:
543 	case SPDK_BDEV_IO_TYPE_RESET:
544 		return true;
545 
546 	default:
547 		return false;
548 	}
549 }
550 
551 static int
552 bdev_aio_create_cb(void *io_device, void *ctx_buf)
553 {
554 	struct bdev_aio_io_channel *ch = ctx_buf;
555 
556 	if (io_setup(SPDK_AIO_QUEUE_DEPTH, &ch->io_ctx) < 0) {
557 		SPDK_ERRLOG("async I/O context setup failure\n");
558 		return -1;
559 	}
560 
561 	ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&aio_if));
562 	TAILQ_INSERT_TAIL(&ch->group_ch->io_ch_head, ch, link);
563 
564 	return 0;
565 }
566 
567 static void
568 bdev_aio_destroy_cb(void *io_device, void *ctx_buf)
569 {
570 	struct bdev_aio_io_channel *ch = ctx_buf;
571 
572 	io_destroy(ch->io_ctx);
573 
574 	assert(ch->group_ch);
575 	TAILQ_REMOVE(&ch->group_ch->io_ch_head, ch, link);
576 
577 	spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
578 }
579 
/* Per-thread channels are keyed off the file_disk io_device registered in
 * create_aio_bdev(). */
static struct spdk_io_channel *
bdev_aio_get_io_channel(void *ctx)
{
	struct file_disk *fdisk = ctx;

	return spdk_get_io_channel(fdisk);
}
587 
588 
589 static int
590 bdev_aio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
591 {
592 	struct file_disk *fdisk = ctx;
593 
594 	spdk_json_write_named_object_begin(w, "aio");
595 
596 	spdk_json_write_named_string(w, "filename", fdisk->filename);
597 
598 	spdk_json_write_object_end(w);
599 
600 	return 0;
601 }
602 
603 static void
604 bdev_aio_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
605 {
606 	struct file_disk *fdisk = bdev->ctxt;
607 
608 	spdk_json_write_object_begin(w);
609 
610 	spdk_json_write_named_string(w, "method", "bdev_aio_create");
611 
612 	spdk_json_write_named_object_begin(w, "params");
613 	spdk_json_write_named_string(w, "name", bdev->name);
614 	if (fdisk->block_size_override) {
615 		spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
616 	}
617 	spdk_json_write_named_string(w, "filename", fdisk->filename);
618 	spdk_json_write_object_end(w);
619 
620 	spdk_json_write_object_end(w);
621 }
622 
/* Function table hooked into the generic bdev layer for aio bdevs. */
static const struct spdk_bdev_fn_table aio_fn_table = {
	.destruct		= bdev_aio_destruct,
	.submit_request		= bdev_aio_submit_request,
	.io_type_supported	= bdev_aio_io_type_supported,
	.get_io_channel		= bdev_aio_get_io_channel,
	.dump_info_json		= bdev_aio_dump_info_json,
	.write_config_json	= bdev_aio_write_json_config,
};
631 
632 static void aio_free_disk(struct file_disk *fdisk)
633 {
634 	if (fdisk == NULL) {
635 		return;
636 	}
637 	free(fdisk->filename);
638 	free(fdisk->disk.name);
639 	free(fdisk);
640 }
641 
642 static int
643 bdev_aio_register_interrupt(struct bdev_aio_group_channel *ch)
644 {
645 	int efd;
646 
647 	efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
648 	if (efd < 0) {
649 		return -1;
650 	}
651 
652 	ch->intr = SPDK_INTERRUPT_REGISTER(efd, bdev_aio_group_interrupt, ch);
653 	if (ch->intr == NULL) {
654 		close(efd);
655 		return -1;
656 	}
657 	ch->efd = efd;
658 
659 	return 0;
660 }
661 
662 static void
663 bdev_aio_unregister_interrupt(struct bdev_aio_group_channel *ch)
664 {
665 	spdk_interrupt_unregister(&ch->intr);
666 	close(ch->efd);
667 	ch->efd = -1;
668 }
669 
/* Mode-switch hook for the group poller: the poller body is the same in
 * polling and interrupt mode, so there is nothing to reconfigure here. */
static void
bdev_aio_poller_set_interrupt_mode(struct spdk_poller *poller, void *cb_arg, bool interrupt_mode)
{
}
675 
676 static int
677 bdev_aio_group_create_cb(void *io_device, void *ctx_buf)
678 {
679 	struct bdev_aio_group_channel *ch = ctx_buf;
680 	int rc;
681 
682 	TAILQ_INIT(&ch->io_ch_head);
683 	/* Initialize ch->efd to be invalid and unused. */
684 	ch->efd = -1;
685 	if (spdk_interrupt_mode_is_enabled()) {
686 		rc = bdev_aio_register_interrupt(ch);
687 		if (rc < 0) {
688 			SPDK_ERRLOG("Failed to prepare intr resource to bdev_aio\n");
689 			return rc;
690 		}
691 	}
692 
693 	ch->poller = SPDK_POLLER_REGISTER(bdev_aio_group_poll, ch, 0);
694 	spdk_poller_register_interrupt(ch->poller, bdev_aio_poller_set_interrupt_mode, NULL);
695 
696 	return 0;
697 }
698 
699 static void
700 bdev_aio_group_destroy_cb(void *io_device, void *ctx_buf)
701 {
702 	struct bdev_aio_group_channel *ch = ctx_buf;
703 
704 	if (!TAILQ_EMPTY(&ch->io_ch_head)) {
705 		SPDK_ERRLOG("Group channel of bdev aio has uncleared io channel\n");
706 	}
707 
708 	spdk_poller_unregister(&ch->poller);
709 	if (spdk_interrupt_mode_is_enabled()) {
710 		bdev_aio_unregister_interrupt(ch);
711 	}
712 }
713 
/*
 * Create and register an aio bdev backed by 'filename'.
 *
 * name       - bdev name (copied).
 * filename   - path to the backing file or block device (copied).
 * block_size - logical block size in bytes; 0 means auto-detect from the fd.
 *
 * Returns 0 on success or a negative errno on failure; on failure all
 * partially-created state is released.
 */
int
create_aio_bdev(const char *name, const char *filename, uint32_t block_size)
{
	struct file_disk *fdisk;
	uint32_t detected_block_size;
	uint64_t disk_size;
	int rc;

	fdisk = calloc(1, sizeof(*fdisk));
	if (!fdisk) {
		SPDK_ERRLOG("Unable to allocate enough memory for aio backend\n");
		return -ENOMEM;
	}

	fdisk->filename = strdup(filename);
	if (!fdisk->filename) {
		rc = -ENOMEM;
		goto error_return;
	}

	if (bdev_aio_open(fdisk)) {
		SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, fdisk->fd, errno);
		rc = -errno;
		goto error_return;
	}

	disk_size = spdk_fd_get_size(fdisk->fd);

	fdisk->disk.name = strdup(name);
	if (!fdisk->disk.name) {
		rc = -ENOMEM;
		goto error_return;
	}
	fdisk->disk.product_name = "AIO disk";
	fdisk->disk.module = &aio_if;

	fdisk->disk.write_cache = 1;

	detected_block_size = spdk_fd_get_blocklen(fdisk->fd);
	if (block_size == 0) {
		/* User did not specify block size - use autodetected block size. */
		if (detected_block_size == 0) {
			SPDK_ERRLOG("Block size could not be auto-detected\n");
			rc = -EINVAL;
			goto error_return;
		}
		fdisk->block_size_override = false;
		block_size = detected_block_size;
	} else {
		if (block_size < detected_block_size) {
			/* A logical block smaller than the device's native block cannot work. */
			SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
				    "auto-detected block size %" PRIu32 "\n",
				    block_size, detected_block_size);
			rc = -EINVAL;
			goto error_return;
		} else if (detected_block_size != 0 && block_size != detected_block_size) {
			SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
				     "auto-detected block size %" PRIu32 "\n",
				     block_size, detected_block_size);
		}
		fdisk->block_size_override = true;
	}

	/* Validate the final block size: >= 512 and a power of two. */
	if (block_size < 512) {
		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
		rc = -EINVAL;
		goto error_return;
	}

	if (!spdk_u32_is_pow2(block_size)) {
		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
		rc = -EINVAL;
		goto error_return;
	}

	fdisk->disk.blocklen = block_size;
	/* Buffer alignment follows the device's native block size even when the
	 * user overrides the logical block size. */
	if (fdisk->block_size_override && detected_block_size) {
		fdisk->disk.required_alignment = spdk_u32log2(detected_block_size);
	} else {
		fdisk->disk.required_alignment = spdk_u32log2(block_size);
	}

	if (disk_size % fdisk->disk.blocklen != 0) {
		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
			    disk_size, fdisk->disk.blocklen);
		rc = -EINVAL;
		goto error_return;
	}

	fdisk->disk.blockcnt = disk_size / fdisk->disk.blocklen;
	fdisk->disk.ctxt = fdisk;

	fdisk->disk.fn_table = &aio_fn_table;

	/* Register as an io_device so each thread gets its own AIO channel. */
	spdk_io_device_register(fdisk, bdev_aio_create_cb, bdev_aio_destroy_cb,
				sizeof(struct bdev_aio_io_channel),
				fdisk->disk.name);
	rc = spdk_bdev_register(&fdisk->disk);
	if (rc) {
		spdk_io_device_unregister(fdisk, NULL);
		goto error_return;
	}

	TAILQ_INSERT_TAIL(&g_aio_disk_head, fdisk, link);
	return 0;

error_return:
	bdev_aio_close(fdisk);
	aio_free_disk(fdisk);
	return rc;
}
825 
/* Context carried through the asynchronous bdev unregister in bdev_aio_delete. */
struct delete_aio_bdev_ctx {
	delete_aio_bdev_complete cb_fn;	/* user completion callback */
	void *cb_arg;			/* user callback argument */
};
830 
831 static void
832 aio_bdev_unregister_cb(void *arg, int bdeverrno)
833 {
834 	struct delete_aio_bdev_ctx *ctx = arg;
835 
836 	ctx->cb_fn(ctx->cb_arg, bdeverrno);
837 	free(ctx);
838 }
839 
840 void
841 bdev_aio_delete(struct spdk_bdev *bdev, delete_aio_bdev_complete cb_fn, void *cb_arg)
842 {
843 	struct delete_aio_bdev_ctx *ctx;
844 
845 	if (!bdev || bdev->module != &aio_if) {
846 		cb_fn(cb_arg, -ENODEV);
847 		return;
848 	}
849 
850 	ctx = calloc(1, sizeof(*ctx));
851 	if (ctx == NULL) {
852 		cb_fn(cb_arg, -ENOMEM);
853 		return;
854 	}
855 
856 	ctx->cb_fn = cb_fn;
857 	ctx->cb_arg = cb_arg;
858 	spdk_bdev_unregister(bdev, aio_bdev_unregister_cb, ctx);
859 }
860 
861 static int
862 bdev_aio_initialize(void)
863 {
864 	spdk_io_device_register(&aio_if, bdev_aio_group_create_cb, bdev_aio_group_destroy_cb,
865 				sizeof(struct bdev_aio_group_channel), "aio_module");
866 
867 	return 0;
868 }
869 
870 static void
871 bdev_aio_fini(void)
872 {
873 	spdk_io_device_unregister(&aio_if, NULL);
874 }
875 
876 SPDK_LOG_REGISTER_COMPONENT(aio)
877