/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *   Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "bdev_aio.h"

#include "spdk/stdinc.h"

#include "spdk/barrier.h"
#include "spdk/bdev.h"
#include "spdk/bdev_module.h"
#include "spdk/env.h"
#include "spdk/fd.h"
#include "spdk/likely.h"
#include "spdk/thread.h"
#include "spdk/json.h"
#include "spdk/util.h"
#include "spdk/string.h"

#include "spdk/log.h"

#include <sys/eventfd.h>
#include <libaio.h>

struct bdev_aio_io_channel {
	uint64_t				io_inflight;
	io_context_t				io_ctx;
	struct bdev_aio_group_channel		*group_ch;
	TAILQ_ENTRY(bdev_aio_io_channel)	link;
};

struct bdev_aio_group_channel {
	/* eventfd for I/O completion notification in interrupt mode.
	 * A negative value (e.g. -1) indicates that it is invalid or unused.
	 */
	int					efd;
	struct spdk_interrupt			*intr;
	struct spdk_poller			*poller;
	TAILQ_HEAD(, bdev_aio_io_channel)	io_ch_head;
};

struct bdev_aio_task {
	struct iocb			iocb;
	uint64_t			len;
	struct bdev_aio_io_channel	*ch;
};

struct file_disk {
	struct bdev_aio_task	*reset_task;
	struct spdk_poller	*reset_retry_timer;
	struct spdk_bdev	disk;
	char			*filename;
	int			fd;
	TAILQ_ENTRY(file_disk)  link;
	bool			block_size_override;
};

/* For user space reaping of completions */
struct spdk_aio_ring {
	uint32_t id;
	uint32_t size;
	uint32_t head;
	uint32_t tail;

	uint32_t version;
	uint32_t compat_features;
	uint32_t incompat_features;
	uint32_t header_length;
};

#define SPDK_AIO_RING_VERSION	0xa10a10a1

static int bdev_aio_initialize(void);
static void bdev_aio_fini(void);
static void aio_free_disk(struct file_disk *fdisk);
static TAILQ_HEAD(, file_disk) g_aio_disk_head = TAILQ_HEAD_INITIALIZER(g_aio_disk_head);

#define SPDK_AIO_QUEUE_DEPTH 128
#define MAX_EVENTS_PER_POLL 32

static int
bdev_aio_get_ctx_size(void)
{
	return sizeof(struct bdev_aio_task);
}

static struct spdk_bdev_module aio_if = {
	.name		= "aio",
	.module_init	= bdev_aio_initialize,
	.module_fini	= bdev_aio_fini,
	.get_ctx_size	= bdev_aio_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(aio, &aio_if)

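/* Open the backing file or block device. O_DIRECT is preferred so that I/O
 * bypasses the page cache; since O_DIRECT transfers must use suitably
 * aligned buffers, the bdev later advertises a required_alignment (see
 * create_aio_bdev()). Files that do not accept O_DIRECT fall back to
 * buffered I/O.
 */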
static int
bdev_aio_open(struct file_disk *disk)
{
	int fd;

	fd = open(disk->filename, O_RDWR | O_DIRECT);
	if (fd < 0) {
		/* Try without O_DIRECT for non-disk files */
		fd = open(disk->filename, O_RDWR);
		if (fd < 0) {
			SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
				    disk->filename, errno, spdk_strerror(errno));
			disk->fd = -1;
			return -1;
		}
	}

	disk->fd = fd;

	return 0;
}

static int
bdev_aio_close(struct file_disk *disk)
{
	int rc;

	if (disk->fd == -1) {
		return 0;
	}

	rc = close(disk->fd);
	if (rc < 0) {
		SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
			    disk->fd, errno, spdk_strerror(errno));
		return -1;
	}

	disk->fd = -1;

	return 0;
}

static void
bdev_aio_readv(struct file_disk *fdisk, struct spdk_io_channel *ch,
	       struct bdev_aio_task *aio_task,
	       struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
{
	struct iocb *iocb = &aio_task->iocb;
	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
	int rc;

	io_prep_preadv(iocb, fdisk->fd, iov, iovcnt, offset);
	if (aio_ch->group_ch->efd >= 0) {
		io_set_eventfd(iocb, aio_ch->group_ch->efd);
	}
	iocb->data = aio_task;
	aio_task->len = nbytes;
	aio_task->ch = aio_ch;

	SPDK_DEBUGLOG(aio, "read %d iovs size %lu from off: %#lx\n",
		      iovcnt, nbytes, offset);

	rc = io_submit(aio_ch->io_ctx, 1, &iocb);
	if (spdk_unlikely(rc < 0)) {
		if (rc == -EAGAIN) {
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), rc);
			SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
		}
	} else {
		aio_ch->io_inflight++;
	}
}

static void
bdev_aio_writev(struct file_disk *fdisk, struct spdk_io_channel *ch,
		struct bdev_aio_task *aio_task,
		struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
{
	struct iocb *iocb = &aio_task->iocb;
	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
	int rc;

	io_prep_pwritev(iocb, fdisk->fd, iov, iovcnt, offset);
	if (aio_ch->group_ch->efd >= 0) {
		io_set_eventfd(iocb, aio_ch->group_ch->efd);
	}
	iocb->data = aio_task;
	aio_task->len = len;
	aio_task->ch = aio_ch;

	SPDK_DEBUGLOG(aio, "write %d iovs size %lu to off: %#lx\n",
		      iovcnt, len, offset);

	rc = io_submit(aio_ch->io_ctx, 1, &iocb);
	if (spdk_unlikely(rc < 0)) {
		if (rc == -EAGAIN) {
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), rc);
			SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
		}
	} else {
		aio_ch->io_inflight++;
	}
}

static void
bdev_aio_flush(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
{
	int rc = fsync(fdisk->fd);

	if (rc == 0) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -errno);
	}
}

static void
bdev_aio_destruct_cb(void *io_device)
{
	struct file_disk *fdisk = io_device;
	int rc = 0;

	TAILQ_REMOVE(&g_aio_disk_head, fdisk, link);
	rc = bdev_aio_close(fdisk);
	if (rc < 0) {
		SPDK_ERRLOG("bdev_aio_close() failed\n");
	}

	aio_free_disk(fdisk);
}

static int
bdev_aio_destruct(void *ctx)
{
	struct file_disk *fdisk = ctx;

	spdk_io_device_unregister(fdisk, bdev_aio_destruct_cb);

	return 0;
}

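/* Reap completions directly from the kernel's completion ring rather than
 * calling io_getevents(). On Linux, an io_context_t is actually a pointer
 * to a ring (struct aio_ring) that the kernel maps into user space, so
 * completed events can be copied out without a system call. If the ring
 * does not have the expected version or has incompatible features, fall
 * back to a non-blocking io_getevents() call.
 */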
static int
bdev_user_io_getevents(io_context_t io_ctx, unsigned int max, struct io_event *uevents)
{
	uint32_t head, tail, count;
	struct spdk_aio_ring *ring;
	struct timespec timeout;
	struct io_event *kevents;

	ring = (struct spdk_aio_ring *)io_ctx;

	if (spdk_unlikely(ring->version != SPDK_AIO_RING_VERSION || ring->incompat_features != 0)) {
		timeout.tv_sec = 0;
		timeout.tv_nsec = 0;

		return io_getevents(io_ctx, 0, max, uevents, &timeout);
	}

	/* Read the current state out of the ring */
	head = ring->head;
	tail = ring->tail;

	/* This memory barrier is required to prevent the loads above
	 * from being re-ordered with stores to the events array
	 * potentially occurring on other threads. */
	spdk_smp_rmb();

	/* Calculate how many items are in the circular ring */
	count = tail - head;
	if (tail < head) {
		count += ring->size;
	}

	/* Reduce the count to the limit provided by the user */
	count = spdk_min(max, count);

	/* Grab the memory location of the event array */
	kevents = (struct io_event *)((uintptr_t)ring + ring->header_length);

	/* Copy the events out of the ring. */
	if ((head + count) <= ring->size) {
		/* Only one copy is required */
		memcpy(uevents, &kevents[head], count * sizeof(struct io_event));
	} else {
		uint32_t first_part = ring->size - head;
		/* Two copies are required */
		memcpy(uevents, &kevents[head], first_part * sizeof(struct io_event));
		memcpy(&uevents[first_part], &kevents[0], (count - first_part) * sizeof(struct io_event));
	}

	/* Update the head pointer. On x86, stores will not be reordered with older loads,
	 * so the copies out of the event array will always be complete prior to this
	 * update becoming visible. On other architectures this is not guaranteed, so
	 * add a barrier. */
#if defined(__i386__) || defined(__x86_64__)
	spdk_compiler_barrier();
#else
	spdk_smp_mb();
#endif
	ring->head = (head + count) % ring->size;

	return count;
}

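/* Reap up to SPDK_AIO_QUEUE_DEPTH completions from one I/O channel. A
 * completion is considered successful only if io_event.res equals the
 * byte count recorded at submission in aio_task->len.
 */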
static int
bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch)
{
	int nr, i, res = 0;
	struct bdev_aio_task *aio_task;
	struct io_event events[SPDK_AIO_QUEUE_DEPTH];

	nr = bdev_user_io_getevents(io_ch->io_ctx, SPDK_AIO_QUEUE_DEPTH, events);
	if (nr < 0) {
		return 0;
	}

	for (i = 0; i < nr; i++) {
		aio_task = events[i].data;
		aio_task->ch->io_inflight--;
		if (events[i].res == aio_task->len) {
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
		} else {
			/* In the kernel's aio_abi.h, io_event.res is defined as __s64 and
			 * holds a negative errno on failure. However, libaio.h defines
			 * io_event.res as an unsigned long, so convert it to a signed
			 * value for error detection.
			 */
			SPDK_ERRLOG("failed to complete aio: rc %"PRId64"\n", events[i].res);
			res = (int)events[i].res;
			if (res < 0) {
				spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), res);
			} else {
				spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
			}
		}
	}

	return nr;
}

static int
bdev_aio_group_poll(void *arg)
{
	struct bdev_aio_group_channel *group_ch = arg;
	struct bdev_aio_io_channel *io_ch;
	int nr = 0;

	TAILQ_FOREACH(io_ch, &group_ch->io_ch_head, link) {
		nr += bdev_aio_io_channel_poll(io_ch);
	}

	return nr > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

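/* Interrupt-mode completion handler. Reading an eventfd returns the value
 * accumulated since the last read and resets the counter to zero, so the
 * read() below both acknowledges the interrupt and reports how many I/Os
 * have completed.
 */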
static int
bdev_aio_group_interrupt(void *arg)
{
	struct bdev_aio_group_channel *group_ch = arg;
	int rc;
	uint64_t num_events;

	assert(group_ch->efd >= 0);

	/* If the number of completed I/Os is larger than SPDK_AIO_QUEUE_DEPTH,
	 * io_getevents must be called again to make sure all completed I/Os are
	 * processed, so any excess over SPDK_AIO_QUEUE_DEPTH is written back to
	 * the eventfd below to keep the interrupt armed.
	 */
	rc = read(group_ch->efd, &num_events, sizeof(num_events));
	if (rc < 0) {
		SPDK_ERRLOG("failed to acknowledge aio group: %s.\n", spdk_strerror(errno));
		return -errno;
	}

	if (num_events > SPDK_AIO_QUEUE_DEPTH) {
		num_events -= SPDK_AIO_QUEUE_DEPTH;
		rc = write(group_ch->efd, &num_events, sizeof(num_events));
		if (rc < 0) {
			SPDK_ERRLOG("failed to notify aio group: %s.\n", spdk_strerror(errno));
		}
	}

	return bdev_aio_group_poll(group_ch);
}

static void
_bdev_aio_get_io_inflight(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);

	if (aio_ch->io_inflight) {
		spdk_for_each_channel_continue(i, -1);
		return;
	}

	spdk_for_each_channel_continue(i, 0);
}

static int bdev_aio_reset_retry_timer(void *arg);

static void
_bdev_aio_get_io_inflight_done(struct spdk_io_channel_iter *i, int status)
{
	struct file_disk *fdisk = spdk_io_channel_iter_get_ctx(i);

	if (status == -1) {
		fdisk->reset_retry_timer = SPDK_POLLER_REGISTER(bdev_aio_reset_retry_timer, fdisk, 500);
		return;
	}

	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(fdisk->reset_task), SPDK_BDEV_IO_STATUS_SUCCESS);
}

static int
bdev_aio_reset_retry_timer(void *arg)
{
	struct file_disk *fdisk = arg;

	if (fdisk->reset_retry_timer) {
		spdk_poller_unregister(&fdisk->reset_retry_timer);
	}

	spdk_for_each_channel(fdisk,
			      _bdev_aio_get_io_inflight,
			      fdisk,
			      _bdev_aio_get_io_inflight_done);

	return SPDK_POLLER_BUSY;
}

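/* A reset does not abort outstanding I/O. It waits until every channel's
 * io_inflight count reaches zero, re-checking every 500 microseconds, and
 * then completes the reset I/O with success.
 */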
static void
bdev_aio_reset(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
{
	fdisk->reset_task = aio_task;

	bdev_aio_reset_retry_timer(fdisk);
}

static void
bdev_aio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
		    bool success)
{
	if (!success) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		bdev_aio_readv((struct file_disk *)bdev_io->bdev->ctxt,
			       ch,
			       (struct bdev_aio_task *)bdev_io->driver_ctx,
			       bdev_io->u.bdev.iovs,
			       bdev_io->u.bdev.iovcnt,
			       bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
			       bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		bdev_aio_writev((struct file_disk *)bdev_io->bdev->ctxt,
				ch,
				(struct bdev_aio_task *)bdev_io->driver_ctx,
				bdev_io->u.bdev.iovs,
				bdev_io->u.bdev.iovcnt,
				bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
				bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
		break;
	default:
		SPDK_ERRLOG("Unsupported I/O type\n");
		break;
	}
}

static int
_bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	/* Read and write operations must be performed on buffers aligned to
	 * bdev->required_alignment. If user specified unaligned buffers,
	 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb,
				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
		return 0;
	case SPDK_BDEV_IO_TYPE_FLUSH:
		bdev_aio_flush((struct file_disk *)bdev_io->bdev->ctxt,
			       (struct bdev_aio_task *)bdev_io->driver_ctx);
		return 0;

	case SPDK_BDEV_IO_TYPE_RESET:
		bdev_aio_reset((struct file_disk *)bdev_io->bdev->ctxt,
			       (struct bdev_aio_task *)bdev_io->driver_ctx);
		return 0;
	default:
		return -1;
	}
}

static void
bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	if (_bdev_aio_submit_request(ch, bdev_io) < 0) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static bool
bdev_aio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_FLUSH:
	case SPDK_BDEV_IO_TYPE_RESET:
		return true;

	default:
		return false;
	}
}

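/* Each I/O channel owns a libaio context sized to SPDK_AIO_QUEUE_DEPTH and
 * links itself into the per-thread group channel, whose poller (or
 * interrupt handler) reaps completions for every channel on that thread.
 */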
static int
bdev_aio_create_cb(void *io_device, void *ctx_buf)
{
	struct bdev_aio_io_channel *ch = ctx_buf;

	if (io_setup(SPDK_AIO_QUEUE_DEPTH, &ch->io_ctx) < 0) {
		SPDK_ERRLOG("async I/O context setup failure\n");
		return -1;
	}

	ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&aio_if));
	TAILQ_INSERT_TAIL(&ch->group_ch->io_ch_head, ch, link);

	return 0;
}

static void
bdev_aio_destroy_cb(void *io_device, void *ctx_buf)
{
	struct bdev_aio_io_channel *ch = ctx_buf;

	io_destroy(ch->io_ctx);

	assert(ch->group_ch);
	TAILQ_REMOVE(&ch->group_ch->io_ch_head, ch, link);

	spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
}

static struct spdk_io_channel *
bdev_aio_get_io_channel(void *ctx)
{
	struct file_disk *fdisk = ctx;

	return spdk_get_io_channel(fdisk);
}

static int
bdev_aio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct file_disk *fdisk = ctx;

	spdk_json_write_named_object_begin(w, "aio");

	spdk_json_write_named_string(w, "filename", fdisk->filename);

	spdk_json_write_object_end(w);

	return 0;
}

static void
bdev_aio_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	struct file_disk *fdisk = bdev->ctxt;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_aio_create");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	if (fdisk->block_size_override) {
		spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
	}
	spdk_json_write_named_string(w, "filename", fdisk->filename);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static const struct spdk_bdev_fn_table aio_fn_table = {
	.destruct		= bdev_aio_destruct,
	.submit_request		= bdev_aio_submit_request,
	.io_type_supported	= bdev_aio_io_type_supported,
	.get_io_channel		= bdev_aio_get_io_channel,
	.dump_info_json		= bdev_aio_dump_info_json,
	.write_config_json	= bdev_aio_write_json_config,
};

static void
aio_free_disk(struct file_disk *fdisk)
{
	if (fdisk == NULL) {
		return;
	}
	free(fdisk->filename);
	free(fdisk->disk.name);
	free(fdisk);
}

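/* In interrupt mode, a non-blocking eventfd is registered with the SPDK
 * interrupt framework. Each submitted iocb is tagged with this eventfd via
 * io_set_eventfd(), so the kernel signals it whenever an I/O completes.
 */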
static int
bdev_aio_register_interrupt(struct bdev_aio_group_channel *ch)
{
	int efd;

	efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
	if (efd < 0) {
		return -1;
	}

	ch->intr = SPDK_INTERRUPT_REGISTER(efd, bdev_aio_group_interrupt, ch);
	if (ch->intr == NULL) {
		close(efd);
		return -1;
	}
	ch->efd = efd;

	return 0;
}

static void
bdev_aio_unregister_interrupt(struct bdev_aio_group_channel *ch)
{
	spdk_interrupt_unregister(&ch->intr);
	close(ch->efd);
	ch->efd = -1;
}

static void
bdev_aio_poller_set_interrupt_mode(struct spdk_poller *poller, void *cb_arg, bool interrupt_mode)
{
	/* Nothing to do: completions are reaped the same way in both polling
	 * and interrupt mode. The callback only needs to exist so that the
	 * group poller can be registered for interrupt mode.
	 */
}

static int
bdev_aio_group_create_cb(void *io_device, void *ctx_buf)
{
	struct bdev_aio_group_channel *ch = ctx_buf;
	int rc;

	TAILQ_INIT(&ch->io_ch_head);
	/* Initialize ch->efd to be invalid and unused. */
	ch->efd = -1;
	if (spdk_interrupt_mode_is_enabled()) {
		rc = bdev_aio_register_interrupt(ch);
		if (rc < 0) {
			SPDK_ERRLOG("Failed to prepare interrupt resources for bdev_aio\n");
			return rc;
		}
	}

	ch->poller = SPDK_POLLER_REGISTER(bdev_aio_group_poll, ch, 0);
	spdk_poller_register_interrupt(ch->poller, bdev_aio_poller_set_interrupt_mode, NULL);

	return 0;
}

static void
bdev_aio_group_destroy_cb(void *io_device, void *ctx_buf)
{
	struct bdev_aio_group_channel *ch = ctx_buf;

	if (!TAILQ_EMPTY(&ch->io_ch_head)) {
		SPDK_ERRLOG("Group channel of bdev aio still has I/O channels that were not destroyed\n");
	}

	spdk_poller_unregister(&ch->poller);
	if (spdk_interrupt_mode_is_enabled()) {
		bdev_aio_unregister_interrupt(ch);
	}
}

int
create_aio_bdev(const char *name, const char *filename, uint32_t block_size)
{
	struct file_disk *fdisk;
	uint32_t detected_block_size;
	uint64_t disk_size;
	int rc;

	fdisk = calloc(1, sizeof(*fdisk));
	if (!fdisk) {
		SPDK_ERRLOG("Unable to allocate enough memory for aio backend\n");
		return -ENOMEM;
	}

	fdisk->filename = strdup(filename);
	if (!fdisk->filename) {
		rc = -ENOMEM;
		goto error_return;
	}

	if (bdev_aio_open(fdisk)) {
		SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, fdisk->fd, errno);
		rc = -errno;
		goto error_return;
	}

	disk_size = spdk_fd_get_size(fdisk->fd);

	fdisk->disk.name = strdup(name);
	if (!fdisk->disk.name) {
		rc = -ENOMEM;
		goto error_return;
	}
	fdisk->disk.product_name = "AIO disk";
	fdisk->disk.module = &aio_if;

	fdisk->disk.write_cache = 1;

	detected_block_size = spdk_fd_get_blocklen(fdisk->fd);
	if (block_size == 0) {
		/* User did not specify block size - use autodetected block size. */
		if (detected_block_size == 0) {
			SPDK_ERRLOG("Block size could not be auto-detected\n");
			rc = -EINVAL;
			goto error_return;
		}
		fdisk->block_size_override = false;
		block_size = detected_block_size;
	} else {
		if (block_size < detected_block_size) {
			SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
				    "auto-detected block size %" PRIu32 "\n",
				    block_size, detected_block_size);
			rc = -EINVAL;
			goto error_return;
		} else if (detected_block_size != 0 && block_size != detected_block_size) {
			SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
				     "auto-detected block size %" PRIu32 "\n",
				     block_size, detected_block_size);
		}
		fdisk->block_size_override = true;
	}

	if (block_size < 512) {
		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
		rc = -EINVAL;
		goto error_return;
	}

	if (!spdk_u32_is_pow2(block_size)) {
		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
		rc = -EINVAL;
		goto error_return;
	}

	fdisk->disk.blocklen = block_size;
	if (fdisk->block_size_override && detected_block_size) {
		fdisk->disk.required_alignment = spdk_u32log2(detected_block_size);
	} else {
		fdisk->disk.required_alignment = spdk_u32log2(block_size);
	}

	if (disk_size % fdisk->disk.blocklen != 0) {
		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
			    disk_size, fdisk->disk.blocklen);
		rc = -EINVAL;
		goto error_return;
	}

	fdisk->disk.blockcnt = disk_size / fdisk->disk.blocklen;
	fdisk->disk.ctxt = fdisk;

	fdisk->disk.fn_table = &aio_fn_table;

	spdk_io_device_register(fdisk, bdev_aio_create_cb, bdev_aio_destroy_cb,
				sizeof(struct bdev_aio_io_channel),
				fdisk->disk.name);
	rc = spdk_bdev_register(&fdisk->disk);
	if (rc) {
		spdk_io_device_unregister(fdisk, NULL);
		goto error_return;
	}

	TAILQ_INSERT_TAIL(&g_aio_disk_head, fdisk, link);
	return 0;

error_return:
	bdev_aio_close(fdisk);
	aio_free_disk(fdisk);
	return rc;
}
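
/* For illustration only: bdev_aio_write_json_config() above emits a config
 * entry of the following shape, which recreates the bdev through the
 * "bdev_aio_create" RPC on startup. The name, block size and filename here
 * are made-up example values:
 *
 * {
 *   "method": "bdev_aio_create",
 *   "params": {
 *     "name": "aio0",
 *     "block_size": 512,
 *     "filename": "/dev/ram0"
 *   }
 * }
 */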

static void
dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
{
}

int
bdev_aio_rescan(const char *name)
{
	struct spdk_bdev_desc *desc;
	struct spdk_bdev *bdev;
	struct file_disk *fdisk;
	uint64_t disk_size, blockcnt;
	int rc;

	rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &desc);
	if (rc != 0) {
		return rc;
	}

	bdev = spdk_bdev_desc_get_bdev(desc);
	if (bdev->module != &aio_if) {
		rc = -ENODEV;
		goto exit;
	}

	fdisk = SPDK_CONTAINEROF(bdev, struct file_disk, disk);
	disk_size = spdk_fd_get_size(fdisk->fd);
	blockcnt = disk_size / bdev->blocklen;

	if (bdev->blockcnt != blockcnt) {
		SPDK_NOTICELOG("AIO device is resized: bdev name %s, old block count %" PRIu64 ", new block count %"
			       PRIu64 "\n",
			       fdisk->filename,
			       bdev->blockcnt,
			       blockcnt);
		rc = spdk_bdev_notify_blockcnt_change(bdev, blockcnt);
		if (rc != 0) {
			SPDK_ERRLOG("Could not change num blocks for aio bdev: name %s, errno: %d.\n",
				    fdisk->filename, rc);
			goto exit;
		}
	}

exit:
	spdk_bdev_close(desc);
	return rc;
}

struct delete_aio_bdev_ctx {
	delete_aio_bdev_complete cb_fn;
	void *cb_arg;
};

static void
aio_bdev_unregister_cb(void *arg, int bdeverrno)
{
	struct delete_aio_bdev_ctx *ctx = arg;

	ctx->cb_fn(ctx->cb_arg, bdeverrno);
	free(ctx);
}

void
bdev_aio_delete(const char *name, delete_aio_bdev_complete cb_fn, void *cb_arg)
{
	struct delete_aio_bdev_ctx *ctx;
	int rc;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	rc = spdk_bdev_unregister_by_name(name, &aio_if, aio_bdev_unregister_cb, ctx);
	if (rc != 0) {
		aio_bdev_unregister_cb(ctx, rc);
	}
}

static int
bdev_aio_initialize(void)
{
	spdk_io_device_register(&aio_if, bdev_aio_group_create_cb, bdev_aio_group_destroy_cb,
				sizeof(struct bdev_aio_group_channel), "aio_module");

	return 0;
}

static void
bdev_aio_fini(void)
{
	spdk_io_device_unregister(&aio_if, NULL);
}

SPDK_LOG_REGISTER_COMPONENT(aio)