xref: /spdk/module/bdev/aio/bdev_aio.c (revision 075d422f3480d3db11013734f833304606867da4)
/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2017 Intel Corporation.
 *   All rights reserved.
 *   Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "bdev_aio.h"

#include "spdk/stdinc.h"

#include "spdk/barrier.h"
#include "spdk/bdev.h"
#include "spdk/bdev_module.h"
#include "spdk/env.h"
#include "spdk/fd.h"
#include "spdk/likely.h"
#include "spdk/thread.h"
#include "spdk/json.h"
#include "spdk/util.h"
#include "spdk/string.h"

#include "spdk/log.h"

#include <sys/eventfd.h>
#include <libaio.h>

struct bdev_aio_io_channel {
	uint64_t				io_inflight;
	io_context_t				io_ctx;
	struct bdev_aio_group_channel		*group_ch;
	TAILQ_ENTRY(bdev_aio_io_channel)	link;
};

struct bdev_aio_group_channel {
	/* eventfd for I/O completion notification in interrupt mode.
	 * A negative value (e.g. -1) indicates it is invalid or unused.
	 */
	int					efd;
	struct spdk_interrupt			*intr;
	struct spdk_poller			*poller;
	TAILQ_HEAD(, bdev_aio_io_channel)	io_ch_head;
};

struct bdev_aio_task {
	struct iocb			iocb;
	uint64_t			len;
	struct bdev_aio_io_channel	*ch;
};

struct file_disk {
	struct bdev_aio_task	*reset_task;
	struct spdk_poller	*reset_retry_timer;
	struct spdk_bdev	disk;
	char			*filename;
	int			fd;
	TAILQ_ENTRY(file_disk)  link;
	bool			block_size_override;
	bool			readonly;
};

/* For user space reaping of completions.
 *
 * The io_context_t handle is in practice a pointer to this ring structure,
 * which the kernel maps into user space (see bdev_user_io_getevents()).
 */
struct spdk_aio_ring {
	uint32_t id;
	uint32_t size;
	uint32_t head;
	uint32_t tail;

	uint32_t version;
	uint32_t compat_features;
	uint32_t incompat_features;
	uint32_t header_length;
};

#define SPDK_AIO_RING_VERSION	0xa10a10a1

static int bdev_aio_initialize(void);
static void bdev_aio_fini(void);
static void aio_free_disk(struct file_disk *fdisk);
static TAILQ_HEAD(, file_disk) g_aio_disk_head = TAILQ_HEAD_INITIALIZER(g_aio_disk_head);

#define SPDK_AIO_QUEUE_DEPTH 128
#define MAX_EVENTS_PER_POLL 32

static int
bdev_aio_get_ctx_size(void)
{
	return sizeof(struct bdev_aio_task);
}

static struct spdk_bdev_module aio_if = {
	.name		= "aio",
	.module_init	= bdev_aio_initialize,
	.module_fini	= bdev_aio_fini,
	.get_ctx_size	= bdev_aio_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(aio, &aio_if)

static int
bdev_aio_open(struct file_disk *disk)
{
	int fd;
	int io_flag = disk->readonly ? O_RDONLY : O_RDWR;

	fd = open(disk->filename, io_flag | O_DIRECT);
	if (fd < 0) {
		/* Try without O_DIRECT for non-disk files */
		fd = open(disk->filename, io_flag);
		if (fd < 0) {
			SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
				    disk->filename, errno, spdk_strerror(errno));
			disk->fd = -1;
			return -1;
		}
	}

	disk->fd = fd;

	return 0;
}

static int
bdev_aio_close(struct file_disk *disk)
{
	int rc;

	if (disk->fd == -1) {
		return 0;
	}

	rc = close(disk->fd);
	if (rc < 0) {
		SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
			    disk->fd, errno, spdk_strerror(errno));
		return -1;
	}

	disk->fd = -1;

	return 0;
}

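/* Submission path for reads and writes: build an iocb, tie it to the group's
 * eventfd when running in interrupt mode, and submit it to the channel's AIO
 * context. A return of -EAGAIN means the kernel queue is full; completing the
 * bdev_io with SPDK_BDEV_IO_STATUS_NOMEM lets the bdev layer queue the I/O and
 * retry it later.
 */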
static void
bdev_aio_readv(struct file_disk *fdisk, struct spdk_io_channel *ch,
	       struct bdev_aio_task *aio_task,
	       struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
{
	struct iocb *iocb = &aio_task->iocb;
	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
	int rc;

	io_prep_preadv(iocb, fdisk->fd, iov, iovcnt, offset);
	if (aio_ch->group_ch->efd >= 0) {
		io_set_eventfd(iocb, aio_ch->group_ch->efd);
	}
	iocb->data = aio_task;
	aio_task->len = nbytes;
	aio_task->ch = aio_ch;

	SPDK_DEBUGLOG(aio, "read %d iovs size %lu to off: %#lx\n",
		      iovcnt, nbytes, offset);

	rc = io_submit(aio_ch->io_ctx, 1, &iocb);
	if (spdk_unlikely(rc < 0)) {
		if (rc == -EAGAIN) {
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), rc);
			SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
		}
	} else {
		aio_ch->io_inflight++;
	}
}

static void
bdev_aio_writev(struct file_disk *fdisk, struct spdk_io_channel *ch,
		struct bdev_aio_task *aio_task,
		struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
{
	struct iocb *iocb = &aio_task->iocb;
	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
	int rc;

	io_prep_pwritev(iocb, fdisk->fd, iov, iovcnt, offset);
	if (aio_ch->group_ch->efd >= 0) {
		io_set_eventfd(iocb, aio_ch->group_ch->efd);
	}
	iocb->data = aio_task;
	aio_task->len = len;
	aio_task->ch = aio_ch;

	SPDK_DEBUGLOG(aio, "write %d iovs size %lu from off: %#lx\n",
		      iovcnt, len, offset);

	rc = io_submit(aio_ch->io_ctx, 1, &iocb);
	if (spdk_unlikely(rc < 0)) {
		if (rc == -EAGAIN) {
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), rc);
			SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
		}
	} else {
		aio_ch->io_inflight++;
	}
}

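/* Flushes are serviced with a plain fsync(). Note that this call is
 * synchronous and blocks the submitting thread until the kernel has written
 * back its caches.
 */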
static void
bdev_aio_flush(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
{
	int rc = fsync(fdisk->fd);

	if (rc == 0) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -errno);
	}
}

static void
bdev_aio_destruct_cb(void *io_device)
{
	struct file_disk *fdisk = io_device;
	int rc = 0;

	TAILQ_REMOVE(&g_aio_disk_head, fdisk, link);
	rc = bdev_aio_close(fdisk);
	if (rc < 0) {
		SPDK_ERRLOG("bdev_aio_close() failed\n");
	}
	aio_free_disk(fdisk);
}

static int
bdev_aio_destruct(void *ctx)
{
	struct file_disk *fdisk = ctx;

	spdk_io_device_unregister(fdisk, bdev_aio_destruct_cb);

	return 0;
}

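/* Reap completions directly from the kernel's AIO completion ring, which is
 * mapped into user space and addressed by the io_context_t handle. If the ring
 * layout is not one we recognize (version mismatch or incompatible features),
 * fall back to a non-blocking io_getevents() syscall.
 */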
static int
bdev_user_io_getevents(io_context_t io_ctx, unsigned int max, struct io_event *uevents)
{
	uint32_t head, tail, count;
	struct spdk_aio_ring *ring;
	struct timespec timeout;
	struct io_event *kevents;

	ring = (struct spdk_aio_ring *)io_ctx;

	if (spdk_unlikely(ring->version != SPDK_AIO_RING_VERSION || ring->incompat_features != 0)) {
		timeout.tv_sec = 0;
		timeout.tv_nsec = 0;

		return io_getevents(io_ctx, 0, max, uevents, &timeout);
	}

	/* Read the current state out of the ring */
	head = ring->head;
	tail = ring->tail;

	/* This memory barrier is required to prevent the loads above
	 * from being re-ordered with stores to the events array
	 * potentially occurring on other threads. */
	spdk_smp_rmb();

	/* Calculate how many items are in the circular ring */
	count = tail - head;
	if (tail < head) {
		count += ring->size;
	}

	/* Reduce the count to the limit provided by the user */
	count = spdk_min(max, count);

	/* Grab the memory location of the event array */
	kevents = (struct io_event *)((uintptr_t)ring + ring->header_length);

	/* Copy the events out of the ring. */
	if ((head + count) <= ring->size) {
		/* Only one copy is required */
		memcpy(uevents, &kevents[head], count * sizeof(struct io_event));
	} else {
		uint32_t first_part = ring->size - head;
		/* Two copies are required */
		memcpy(uevents, &kevents[head], first_part * sizeof(struct io_event));
		memcpy(&uevents[first_part], &kevents[0], (count - first_part) * sizeof(struct io_event));
	}

	/* Update the head pointer. On x86, stores will not be reordered with older loads,
	 * so the copies out of the event array will always be complete prior to this
	 * update becoming visible. On other architectures this is not guaranteed, so
	 * add a barrier. */
#if defined(__i386__) || defined(__x86_64__)
	spdk_compiler_barrier();
#else
	spdk_smp_mb();
#endif
	ring->head = (head + count) % ring->size;

	return count;
}
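
/* Poll one io_channel: reap up to SPDK_AIO_QUEUE_DEPTH completion events and
 * complete the corresponding bdev_ios. An event's res field holds the number
 * of bytes transferred on success or a negative errno on error; anything other
 * than the full requested length is treated as a failure.
 */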
static int
bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch)
{
	int nr, i, res = 0;
	struct bdev_aio_task *aio_task;
	struct io_event events[SPDK_AIO_QUEUE_DEPTH];

	nr = bdev_user_io_getevents(io_ch->io_ctx, SPDK_AIO_QUEUE_DEPTH, events);
	if (nr < 0) {
		return 0;
	}

	for (i = 0; i < nr; i++) {
		aio_task = events[i].data;
		aio_task->ch->io_inflight--;
		if (events[i].res == aio_task->len) {
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
		} else {
			/* In aio_abi.h, io_event.res is declared as __s64 and holds a
			 * negative errno on error. In libaio.h, however, it is declared
			 * as unsigned long, so convert it to a signed value for error
			 * detection.
			 */
			SPDK_ERRLOG("failed to complete aio: rc %"PRId64"\n", events[i].res);
			res = (int)events[i].res;
			if (res < 0) {
				spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), res);
			} else {
				spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
			}
		}
	}

	return nr;
}

static int
bdev_aio_group_poll(void *arg)
{
	struct bdev_aio_group_channel *group_ch = arg;
	struct bdev_aio_io_channel *io_ch;
	int nr = 0;

	TAILQ_FOREACH(io_ch, &group_ch->io_ch_head, link) {
		nr += bdev_aio_io_channel_poll(io_ch);
	}

	return nr > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

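/* Interrupt-mode handler. Reading the eventfd atomically fetches and resets
 * the completion count signaled so far. If more completions were signaled than
 * one reaping pass consumes (SPDK_AIO_QUEUE_DEPTH), the remainder is written
 * back to the eventfd so the interrupt fires again and the leftover
 * completions are handled on a later pass.
 */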
static int
bdev_aio_group_interrupt(void *arg)
{
	struct bdev_aio_group_channel *group_ch = arg;
	int rc;
	uint64_t num_events;

	assert(group_ch->efd >= 0);

	/* If the number of completed I/Os is larger than SPDK_AIO_QUEUE_DEPTH,
	 * io_getevents() must be called again to ensure all completed I/Os are
	 * processed.
	 */
	rc = read(group_ch->efd, &num_events, sizeof(num_events));
	if (rc < 0) {
		SPDK_ERRLOG("failed to acknowledge aio group: %s.\n", spdk_strerror(errno));
		return -errno;
	}

	if (num_events > SPDK_AIO_QUEUE_DEPTH) {
		num_events -= SPDK_AIO_QUEUE_DEPTH;
		rc = write(group_ch->efd, &num_events, sizeof(num_events));
		if (rc < 0) {
			SPDK_ERRLOG("failed to notify aio group: %s.\n", spdk_strerror(errno));
		}
	}

	return bdev_aio_group_poll(group_ch);
}

static void
_bdev_aio_get_io_inflight(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);

	if (aio_ch->io_inflight) {
		spdk_for_each_channel_continue(i, -1);
		return;
	}

	spdk_for_each_channel_continue(i, 0);
}

static int bdev_aio_reset_retry_timer(void *arg);

static void
_bdev_aio_get_io_inflight_done(struct spdk_io_channel_iter *i, int status)
{
	struct file_disk *fdisk = spdk_io_channel_iter_get_ctx(i);

	if (status == -1) {
		fdisk->reset_retry_timer = SPDK_POLLER_REGISTER(bdev_aio_reset_retry_timer, fdisk, 500);
		return;
	}

	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(fdisk->reset_task), SPDK_BDEV_IO_STATUS_SUCCESS);
}

static int
bdev_aio_reset_retry_timer(void *arg)
{
	struct file_disk *fdisk = arg;

	if (fdisk->reset_retry_timer) {
		spdk_poller_unregister(&fdisk->reset_retry_timer);
	}

	spdk_for_each_channel(fdisk,
			      _bdev_aio_get_io_inflight,
			      fdisk,
			      _bdev_aio_get_io_inflight_done);

	return SPDK_POLLER_BUSY;
}

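/* A reset is handled by draining: the retry timer above walks all io_channels
 * every 500 microseconds and completes the reset once no channel reports
 * inflight I/O.
 */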
static void
bdev_aio_reset(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
{
	fdisk->reset_task = aio_task;

	bdev_aio_reset_retry_timer(fdisk);
}

static void
bdev_aio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
		    bool success)
{
	if (!success) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		bdev_aio_readv((struct file_disk *)bdev_io->bdev->ctxt,
			       ch,
			       (struct bdev_aio_task *)bdev_io->driver_ctx,
			       bdev_io->u.bdev.iovs,
			       bdev_io->u.bdev.iovcnt,
			       bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
			       bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		bdev_aio_writev((struct file_disk *)bdev_io->bdev->ctxt,
				ch,
				(struct bdev_aio_task *)bdev_io->driver_ctx,
				bdev_io->u.bdev.iovs,
				bdev_io->u.bdev.iovcnt,
				bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
				bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
		break;
	default:
		SPDK_ERRLOG("Wrong io type\n");
		break;
	}
}

static int
_bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct file_disk *fdisk = (struct file_disk *)bdev_io->bdev->ctxt;

	switch (bdev_io->type) {
	/* Read and write operations must be performed on buffers aligned to
	 * bdev->required_alignment. If the user specified unaligned buffers,
	 * get an aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
	case SPDK_BDEV_IO_TYPE_READ:
		spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb,
				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
		return 0;
	case SPDK_BDEV_IO_TYPE_WRITE:
		if (fdisk->readonly) {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		} else {
			spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb,
					     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
		}
		return 0;

	case SPDK_BDEV_IO_TYPE_FLUSH:
		bdev_aio_flush((struct file_disk *)bdev_io->bdev->ctxt,
			       (struct bdev_aio_task *)bdev_io->driver_ctx);
		return 0;

	case SPDK_BDEV_IO_TYPE_RESET:
		bdev_aio_reset((struct file_disk *)bdev_io->bdev->ctxt,
			       (struct bdev_aio_task *)bdev_io->driver_ctx);
		return 0;
	default:
		return -1;
	}
}

static void
bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	if (_bdev_aio_submit_request(ch, bdev_io) < 0) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static bool
bdev_aio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_FLUSH:
	case SPDK_BDEV_IO_TYPE_RESET:
		return true;

	default:
		return false;
	}
}

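/* Each io_channel owns a kernel AIO context sized for SPDK_AIO_QUEUE_DEPTH
 * outstanding I/Os and registers itself with the per-thread group channel,
 * whose poller reaps its completions.
 */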
static int
bdev_aio_create_cb(void *io_device, void *ctx_buf)
{
	struct bdev_aio_io_channel *ch = ctx_buf;

	if (io_setup(SPDK_AIO_QUEUE_DEPTH, &ch->io_ctx) < 0) {
		SPDK_ERRLOG("Async I/O context setup failure, likely due to exceeding kernel limit.\n");
		SPDK_ERRLOG("This limit may be increased using 'sysctl -w fs.aio-max-nr'.\n");
		return -1;
	}

	ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&aio_if));
	TAILQ_INSERT_TAIL(&ch->group_ch->io_ch_head, ch, link);

	return 0;
}

static void
bdev_aio_destroy_cb(void *io_device, void *ctx_buf)
{
	struct bdev_aio_io_channel *ch = ctx_buf;

	io_destroy(ch->io_ctx);

	assert(ch->group_ch);
	TAILQ_REMOVE(&ch->group_ch->io_ch_head, ch, link);

	spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
}

static struct spdk_io_channel *
bdev_aio_get_io_channel(void *ctx)
{
	struct file_disk *fdisk = ctx;

	return spdk_get_io_channel(fdisk);
}

static int
bdev_aio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct file_disk *fdisk = ctx;

	spdk_json_write_named_object_begin(w, "aio");

	spdk_json_write_named_string(w, "filename", fdisk->filename);

	spdk_json_write_named_bool(w, "block_size_override", fdisk->block_size_override);

	spdk_json_write_named_bool(w, "readonly", fdisk->readonly);

	spdk_json_write_object_end(w);

	return 0;
}

static void
bdev_aio_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	struct file_disk *fdisk = bdev->ctxt;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_aio_create");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	if (fdisk->block_size_override) {
		spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
	}
	spdk_json_write_named_string(w, "filename", fdisk->filename);
	spdk_json_write_named_bool(w, "readonly", fdisk->readonly);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
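
/* For illustration only, the configuration emitted above for a bdev created
 * with an explicit block size looks roughly like this (the name and filename
 * values here are hypothetical):
 *
 *   {
 *     "method": "bdev_aio_create",
 *     "params": {
 *       "name": "aio0",
 *       "block_size": 4096,
 *       "filename": "/dev/ram0",
 *       "readonly": false
 *     }
 *   }
 */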

static const struct spdk_bdev_fn_table aio_fn_table = {
	.destruct		= bdev_aio_destruct,
	.submit_request		= bdev_aio_submit_request,
	.io_type_supported	= bdev_aio_io_type_supported,
	.get_io_channel		= bdev_aio_get_io_channel,
	.dump_info_json		= bdev_aio_dump_info_json,
	.write_config_json	= bdev_aio_write_json_config,
};

static void
aio_free_disk(struct file_disk *fdisk)
{
	if (fdisk == NULL) {
		return;
	}
	free(fdisk->filename);
	free(fdisk->disk.name);
	free(fdisk);
}
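
/* In interrupt mode, a nonblocking eventfd is registered with the SPDK
 * interrupt framework; each submitted iocb is tied to it via io_set_eventfd(),
 * so the kernel signals the fd on completion and wakes the reactor.
 */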
static int
bdev_aio_register_interrupt(struct bdev_aio_group_channel *ch)
{
	int efd;

	efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
	if (efd < 0) {
		return -1;
	}

	ch->intr = SPDK_INTERRUPT_REGISTER(efd, bdev_aio_group_interrupt, ch);
	if (ch->intr == NULL) {
		close(efd);
		return -1;
	}
	ch->efd = efd;

	return 0;
}

static void
bdev_aio_unregister_interrupt(struct bdev_aio_group_channel *ch)
{
	spdk_interrupt_unregister(&ch->intr);
	close(ch->efd);
	ch->efd = -1;
}
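
/* Intentionally a no-op: this module needs no special handling when a poller
 * switches between poll and interrupt mode, since the same group poll routine
 * services both paths.
 */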
static void
bdev_aio_poller_set_interrupt_mode(struct spdk_poller *poller, void *cb_arg, bool interrupt_mode)
{
}

static int
bdev_aio_group_create_cb(void *io_device, void *ctx_buf)
{
	struct bdev_aio_group_channel *ch = ctx_buf;
	int rc;

	TAILQ_INIT(&ch->io_ch_head);
	/* Initialize ch->efd to be invalid and unused. */
	ch->efd = -1;
	if (spdk_interrupt_mode_is_enabled()) {
		rc = bdev_aio_register_interrupt(ch);
		if (rc < 0) {
			SPDK_ERRLOG("Failed to prepare interrupt resources for bdev_aio\n");
			return rc;
		}
	}

	ch->poller = SPDK_POLLER_REGISTER(bdev_aio_group_poll, ch, 0);
	spdk_poller_register_interrupt(ch->poller, bdev_aio_poller_set_interrupt_mode, NULL);

	return 0;
}

static void
bdev_aio_group_destroy_cb(void *io_device, void *ctx_buf)
{
	struct bdev_aio_group_channel *ch = ctx_buf;

	if (!TAILQ_EMPTY(&ch->io_ch_head)) {
		SPDK_ERRLOG("Group channel of bdev aio still has unreleased io channels\n");
	}

	spdk_poller_unregister(&ch->poller);
	if (spdk_interrupt_mode_is_enabled()) {
		bdev_aio_unregister_interrupt(ch);
	}
}

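/* Create and register an AIO bdev backed by the given file or block device.
 * The block size is either auto-detected or supplied by the caller, in which
 * case it must be a power of two, at least 512 bytes, and no smaller than the
 * auto-detected size. The file size must be a multiple of the resulting block
 * size.
 */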
int
create_aio_bdev(const char *name, const char *filename, uint32_t block_size, bool readonly)
{
	struct file_disk *fdisk;
	uint32_t detected_block_size;
	uint64_t disk_size;
	int rc;

	fdisk = calloc(1, sizeof(*fdisk));
	if (!fdisk) {
		SPDK_ERRLOG("Unable to allocate enough memory for aio backend\n");
		return -ENOMEM;
	}
	fdisk->readonly = readonly;
	/* Mark the fd invalid up front so that an early failure below does not
	 * make bdev_aio_close() close an unrelated descriptor (fd 0). */
	fdisk->fd = -1;

	fdisk->filename = strdup(filename);
	if (!fdisk->filename) {
		rc = -ENOMEM;
		goto error_return;
	}

	if (bdev_aio_open(fdisk)) {
		SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, fdisk->fd, errno);
		rc = -errno;
		goto error_return;
	}

	disk_size = spdk_fd_get_size(fdisk->fd);

	fdisk->disk.name = strdup(name);
	if (!fdisk->disk.name) {
		rc = -ENOMEM;
		goto error_return;
	}
	fdisk->disk.product_name = "AIO disk";
	fdisk->disk.module = &aio_if;

	fdisk->disk.write_cache = 1;

	detected_block_size = spdk_fd_get_blocklen(fdisk->fd);
	if (block_size == 0) {
		/* User did not specify block size - use autodetected block size. */
		if (detected_block_size == 0) {
			SPDK_ERRLOG("Block size could not be auto-detected\n");
			rc = -EINVAL;
			goto error_return;
		}
		fdisk->block_size_override = false;
		block_size = detected_block_size;
	} else {
		if (block_size < detected_block_size) {
			SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
				    "auto-detected block size %" PRIu32 "\n",
				    block_size, detected_block_size);
			rc = -EINVAL;
			goto error_return;
		} else if (detected_block_size != 0 && block_size != detected_block_size) {
			SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
				     "auto-detected block size %" PRIu32 "\n",
				     block_size, detected_block_size);
		}
		fdisk->block_size_override = true;
	}

	if (block_size < 512) {
		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
		rc = -EINVAL;
		goto error_return;
	}

	if (!spdk_u32_is_pow2(block_size)) {
		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
		rc = -EINVAL;
		goto error_return;
	}

	fdisk->disk.blocklen = block_size;
	if (fdisk->block_size_override && detected_block_size) {
		fdisk->disk.required_alignment = spdk_u32log2(detected_block_size);
	} else {
		fdisk->disk.required_alignment = spdk_u32log2(block_size);
	}

	if (disk_size % fdisk->disk.blocklen != 0) {
		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
			    disk_size, fdisk->disk.blocklen);
		rc = -EINVAL;
		goto error_return;
	}

	fdisk->disk.blockcnt = disk_size / fdisk->disk.blocklen;
	fdisk->disk.ctxt = fdisk;

	fdisk->disk.fn_table = &aio_fn_table;

	spdk_io_device_register(fdisk, bdev_aio_create_cb, bdev_aio_destroy_cb,
				sizeof(struct bdev_aio_io_channel),
				fdisk->disk.name);
	rc = spdk_bdev_register(&fdisk->disk);
	if (rc) {
		spdk_io_device_unregister(fdisk, NULL);
		goto error_return;
	}

	TAILQ_INSERT_TAIL(&g_aio_disk_head, fdisk, link);
	return 0;

error_return:
	bdev_aio_close(fdisk);
	aio_free_disk(fdisk);
	return rc;
}

static void
dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
{
}

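/* Re-read the size of the backing file and, if the block count has changed,
 * notify the bdev layer so upper layers see the resized device.
 */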
int
bdev_aio_rescan(const char *name)
{
	struct spdk_bdev_desc *desc;
	struct spdk_bdev *bdev;
	struct file_disk *fdisk;
	uint64_t disk_size, blockcnt;
	int rc;

	rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &desc);
	if (rc != 0) {
		return rc;
	}

	bdev = spdk_bdev_desc_get_bdev(desc);
	if (bdev->module != &aio_if) {
		rc = -ENODEV;
		goto exit;
	}

	fdisk = SPDK_CONTAINEROF(bdev, struct file_disk, disk);
	disk_size = spdk_fd_get_size(fdisk->fd);
	blockcnt = disk_size / bdev->blocklen;

	if (bdev->blockcnt != blockcnt) {
		SPDK_NOTICELOG("AIO device is resized: bdev name %s, old block count %" PRIu64 ", new block count %"
			       PRIu64 "\n",
			       fdisk->filename,
			       bdev->blockcnt,
			       blockcnt);
		rc = spdk_bdev_notify_blockcnt_change(bdev, blockcnt);
		if (rc != 0) {
			SPDK_ERRLOG("Could not change num blocks for aio bdev: name %s, errno: %d.\n",
				    fdisk->filename, rc);
			goto exit;
		}
	}

exit:
	spdk_bdev_close(desc);
	return rc;
}

struct delete_aio_bdev_ctx {
	delete_aio_bdev_complete cb_fn;
	void *cb_arg;
};

static void
aio_bdev_unregister_cb(void *arg, int bdeverrno)
{
	struct delete_aio_bdev_ctx *ctx = arg;

	ctx->cb_fn(ctx->cb_arg, bdeverrno);
	free(ctx);
}

void
bdev_aio_delete(const char *name, delete_aio_bdev_complete cb_fn, void *cb_arg)
{
	struct delete_aio_bdev_ctx *ctx;
	int rc;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	rc = spdk_bdev_unregister_by_name(name, &aio_if, aio_bdev_unregister_cb, ctx);
	if (rc != 0) {
		aio_bdev_unregister_cb(ctx, rc);
	}
}

static int
bdev_aio_initialize(void)
{
	spdk_io_device_register(&aio_if, bdev_aio_group_create_cb, bdev_aio_group_destroy_cb,
				sizeof(struct bdev_aio_group_channel), "aio_module");

	return 0;
}

static void
bdev_aio_fini(void)
{
	spdk_io_device_unregister(&aio_if, NULL);
}

SPDK_LOG_REGISTER_COMPONENT(aio)