xref: /spdk/module/bdev/aio/bdev_aio.c (revision 95d6c9fac17572b107042103439aafd696d60b0e)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2017 Intel Corporation.
3  *   All rights reserved.
4  *   Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  */
6 
7 #include "bdev_aio.h"
8 
9 #include "spdk/stdinc.h"
10 
11 #include "spdk/barrier.h"
12 #include "spdk/bdev.h"
13 #include "spdk/bdev_module.h"
14 #include "spdk/env.h"
15 #include "spdk/fd.h"
16 #include "spdk/likely.h"
17 #include "spdk/thread.h"
18 #include "spdk/json.h"
19 #include "spdk/util.h"
20 #include "spdk/string.h"
21 
22 #include "spdk/log.h"
23 
24 #include <sys/eventfd.h>
25 
26 #ifndef __FreeBSD__
27 #include <libaio.h>
28 #endif
29 
/* Per-channel context of an AIO bdev (the ctx_buf of the file_disk io_device). */
struct bdev_aio_io_channel {
	/* Number of I/Os submitted on this channel whose completion has not been reaped yet. */
	uint64_t				io_inflight;
#ifdef __FreeBSD__
	/* kqueue descriptor on which AIO completion events are delivered. */
	int					kqfd;
#else
	/* libaio context created by io_setup() and released by io_destroy(). */
	io_context_t				io_ctx;
#endif
	/* Group channel that polls this channel for completions. */
	struct bdev_aio_group_channel		*group_ch;
	/* Linkage on bdev_aio_group_channel::io_ch_head. */
	TAILQ_ENTRY(bdev_aio_io_channel)	link;
};
40 
/* Channel context of the aio module io_device; it polls every AIO I/O channel linked to it. */
struct bdev_aio_group_channel {
	/* eventfd for io completion notification in interrupt mode.
	 * Negative value like '-1' indicates it is invalid or unused.
	 */
	int					efd;
	/* Interrupt source registered on efd (interrupt mode only). */
	struct spdk_interrupt			*intr;
	/* Poller running bdev_aio_group_poll() for this group. */
	struct spdk_poller			*poller;
	/* All bdev_aio_io_channel instances attached to this group. */
	TAILQ_HEAD(, bdev_aio_io_channel)	io_ch_head;
};
50 
/* Per-I/O driver context; its size is reported to the bdev layer by bdev_aio_get_ctx_size(). */
struct bdev_aio_task {
#ifdef __FreeBSD__
	/* POSIX AIO control block used for aio_readv()/aio_writev(). */
	struct aiocb			aiocb;
#else
	/* libaio control block handed to io_submit(). */
	struct iocb			iocb;
#endif
	/* Requested transfer size in bytes; a completion is successful only if it reports this value. */
	uint64_t			len;
	/* Channel the I/O was submitted on; used to decrement io_inflight on completion. */
	struct bdev_aio_io_channel	*ch;
};
60 
/* State of one AIO bdev backed by a file or block device. */
struct file_disk {
	/* Task of the reset request being processed; completed once no channel has I/O in flight. */
	struct bdev_aio_task	*reset_task;
	/* Poller that re-checks the in-flight counters while a reset is waiting. */
	struct spdk_poller	*reset_retry_timer;
	/* The bdev registered with the bdev layer. */
	struct spdk_bdev	disk;
	/* Heap-allocated copy of the backing file path. */
	char			*filename;
	/* Descriptor of the backing file, or -1 when it is not open. */
	int			fd;
	/* True when the backing file is a block device; I/O is then submitted with RWF_NOWAIT. */
	bool			use_nowait;
	/* Linkage on g_aio_disk_head. */
	TAILQ_ENTRY(file_disk)  link;
	/* True when the block size was specified by the caller rather than auto-detected. */
	bool			block_size_override;
	/* True when the file is opened O_RDONLY; writes are failed. */
	bool			readonly;
	/* True when UNMAP and WRITE_ZEROES are served through fallocate(). */
	bool			fallocate;
};
73 
/* For user space reaping of completions.
 *
 * bdev_user_io_getevents() casts the io_context_t to this header and reads
 * completed events straight out of the ring that follows it.
 * NOTE(review): presumably mirrors the kernel's AIO ring header layout - confirm
 * before changing any field.
 */
struct spdk_aio_ring {
	uint32_t id;
	/* Number of event slots in the ring. */
	uint32_t size;
	/* Index of the next event to consume; advanced by bdev_user_io_getevents(). */
	uint32_t head;
	/* Index one past the last event available for consumption. */
	uint32_t tail;

	/* Must equal SPDK_AIO_RING_VERSION for the user space fast path to be used. */
	uint32_t version;
	uint32_t compat_features;
	/* Must be 0 for the user space fast path to be used. */
	uint32_t incompat_features;
	/* Byte offset from the start of the ring to the io_event array. */
	uint32_t header_length;
};
86 
87 #define SPDK_AIO_RING_VERSION	0xa10a10a1
88 
89 static int bdev_aio_initialize(void);
90 static void bdev_aio_fini(void);
91 static void aio_free_disk(struct file_disk *fdisk);
92 static TAILQ_HEAD(, file_disk) g_aio_disk_head = TAILQ_HEAD_INITIALIZER(g_aio_disk_head);
93 
/* Depth requested from io_setup() and the maximum number of events reaped per channel poll. */
#define SPDK_AIO_QUEUE_DEPTH 128
/* NOTE(review): not referenced in the visible part of this file - confirm whether it is still needed. */
#define MAX_EVENTS_PER_POLL 32
96 
97 static int
98 bdev_aio_get_ctx_size(void)
99 {
100 	return sizeof(struct bdev_aio_task);
101 }
102 
/* Module descriptor for the AIO bdev module; its address also serves as the
 * io_device key for the group channels.
 */
static struct spdk_bdev_module aio_if = {
	.name		= "aio",
	.module_init	= bdev_aio_initialize,
	.module_fini	= bdev_aio_fini,
	.get_ctx_size	= bdev_aio_get_ctx_size,
};
109 
110 SPDK_BDEV_MODULE_REGISTER(aio, &aio_if)
111 
112 static int
113 bdev_aio_open(struct file_disk *disk)
114 {
115 	int fd;
116 	int io_flag = disk->readonly ? O_RDONLY : O_RDWR;
117 	struct stat st;
118 
119 	fd = open(disk->filename, io_flag | O_DIRECT);
120 	if (fd < 0) {
121 		/* Try without O_DIRECT for non-disk files */
122 		fd = open(disk->filename, io_flag);
123 		if (fd < 0) {
124 			SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
125 				    disk->filename, errno, spdk_strerror(errno));
126 			disk->fd = -1;
127 			return -1;
128 		}
129 	}
130 
131 	disk->fd = fd;
132 	/* Some aio operations can block, for example if number outstanding
133 	 * I/O exceeds number of block layer tags. But not all files can
134 	 * support RWF_NOWAIT flag. So use RWF_NOWAIT on block devices only.
135 	 */
136 	disk->use_nowait = fstat(fd, &st) == 0 && S_ISBLK(st.st_mode);
137 
138 	return 0;
139 }
140 
141 static int
142 bdev_aio_close(struct file_disk *disk)
143 {
144 	int rc;
145 
146 	if (disk->fd == -1) {
147 		return 0;
148 	}
149 
150 	rc = close(disk->fd);
151 	if (rc < 0) {
152 		SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
153 			    disk->fd, errno, spdk_strerror(errno));
154 		return -1;
155 	}
156 
157 	disk->fd = -1;
158 
159 	return 0;
160 }
161 
162 #ifdef __FreeBSD__
163 static int
164 bdev_aio_submit_io(enum spdk_bdev_io_type type, struct file_disk *fdisk,
165 		   struct spdk_io_channel *ch, struct bdev_aio_task *aio_task,
166 		   struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
167 {
168 	struct aiocb *aiocb = &aio_task->aiocb;
169 	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
170 
171 	memset(aiocb, 0, sizeof(struct aiocb));
172 	aiocb->aio_fildes = fdisk->fd;
173 	aiocb->aio_iov = iov;
174 	aiocb->aio_iovcnt = iovcnt;
175 	aiocb->aio_offset = offset;
176 	aiocb->aio_sigevent.sigev_notify_kqueue = aio_ch->kqfd;
177 	aiocb->aio_sigevent.sigev_value.sival_ptr = aio_task;
178 	aiocb->aio_sigevent.sigev_notify = SIGEV_KEVENT;
179 
180 	aio_task->len = nbytes;
181 	aio_task->ch = aio_ch;
182 
183 	if (type == SPDK_BDEV_IO_TYPE_READ) {
184 		return aio_readv(aiocb);
185 	}
186 
187 	return aio_writev(aiocb);
188 }
189 #else
190 static int
191 bdev_aio_submit_io(enum spdk_bdev_io_type type, struct file_disk *fdisk,
192 		   struct spdk_io_channel *ch, struct bdev_aio_task *aio_task,
193 		   struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
194 {
195 	struct iocb *iocb = &aio_task->iocb;
196 	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
197 
198 	if (type == SPDK_BDEV_IO_TYPE_READ) {
199 		io_prep_preadv(iocb, fdisk->fd, iov, iovcnt, offset);
200 	} else {
201 		io_prep_pwritev(iocb, fdisk->fd, iov, iovcnt, offset);
202 	}
203 
204 	if (aio_ch->group_ch->efd >= 0) {
205 		io_set_eventfd(iocb, aio_ch->group_ch->efd);
206 	}
207 	iocb->data = aio_task;
208 	if (fdisk->use_nowait) {
209 		iocb->aio_rw_flags = RWF_NOWAIT;
210 	}
211 	aio_task->len = nbytes;
212 	aio_task->ch = aio_ch;
213 
214 	return io_submit(aio_ch->io_ctx, 1, &iocb);
215 }
216 #endif
217 
218 static void
219 bdev_aio_rw(enum spdk_bdev_io_type type, struct file_disk *fdisk,
220 	    struct spdk_io_channel *ch, struct bdev_aio_task *aio_task,
221 	    struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
222 {
223 	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
224 	int rc;
225 
226 	if (type == SPDK_BDEV_IO_TYPE_READ) {
227 		SPDK_DEBUGLOG(aio, "read %d iovs size %lu to off: %#lx\n",
228 			      iovcnt, nbytes, offset);
229 	} else {
230 		SPDK_DEBUGLOG(aio, "write %d iovs size %lu from off: %#lx\n",
231 			      iovcnt, nbytes, offset);
232 	}
233 
234 	rc = bdev_aio_submit_io(type, fdisk, ch, aio_task, iov, iovcnt, nbytes, offset);
235 	if (spdk_unlikely(rc < 0)) {
236 		if (rc == -EAGAIN) {
237 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
238 		} else {
239 			spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), rc);
240 			SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
241 		}
242 	} else {
243 		aio_ch->io_inflight++;
244 	}
245 }
246 
247 static void
248 bdev_aio_flush(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
249 {
250 	int rc = fsync(fdisk->fd);
251 
252 	if (rc == 0) {
253 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
254 	} else {
255 		spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -errno);
256 	}
257 }
258 
259 #ifndef __FreeBSD__
260 static void
261 bdev_aio_fallocate(struct spdk_bdev_io *bdev_io, int mode)
262 {
263 	struct file_disk *fdisk = (struct file_disk *)bdev_io->bdev->ctxt;
264 	struct bdev_aio_task *aio_task = (struct bdev_aio_task *)bdev_io->driver_ctx;
265 	uint64_t offset_bytes = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen;
266 	uint64_t length_bytes = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
267 	int rc;
268 
269 	if (!fdisk->fallocate) {
270 		spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -ENOTSUP);
271 		return;
272 	}
273 
274 	rc = fallocate(fdisk->fd, mode, offset_bytes, length_bytes);
275 	if (rc == 0) {
276 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
277 	} else {
278 		spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -errno);
279 	}
280 }
281 
282 static void
283 bdev_aio_unmap(struct spdk_bdev_io *bdev_io)
284 {
285 	int mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE;
286 
287 	bdev_aio_fallocate(bdev_io, mode);
288 }
289 
290 
291 static void
292 bdev_aio_write_zeros(struct spdk_bdev_io *bdev_io)
293 {
294 	int mode = FALLOC_FL_ZERO_RANGE;
295 
296 	bdev_aio_fallocate(bdev_io, mode);
297 }
298 #endif
299 
300 static void
301 bdev_aio_destruct_cb(void *io_device)
302 {
303 	struct file_disk *fdisk = io_device;
304 	int rc = 0;
305 
306 	TAILQ_REMOVE(&g_aio_disk_head, fdisk, link);
307 	rc = bdev_aio_close(fdisk);
308 	if (rc < 0) {
309 		SPDK_ERRLOG("bdev_aio_close() failed\n");
310 	}
311 	aio_free_disk(fdisk);
312 }
313 
314 static int
315 bdev_aio_destruct(void *ctx)
316 {
317 	struct file_disk *fdisk = ctx;
318 
319 	spdk_io_device_unregister(fdisk, bdev_aio_destruct_cb);
320 
321 	return 0;
322 }
323 
324 #ifdef __FreeBSD__
325 static int
326 bdev_user_io_getevents(int kq, unsigned int max, struct kevent *events)
327 {
328 	struct timespec ts;
329 	int count;
330 
331 	memset(events, 0, max * sizeof(struct kevent));
332 	memset(&ts, 0, sizeof(ts));
333 
334 	count = kevent(kq, NULL, 0, events, max, &ts);
335 	if (count < 0) {
336 		SPDK_ERRLOG("failed to get kevents: %s.\n", spdk_strerror(errno));
337 		return -errno;
338 	}
339 
340 	return count;
341 }
342 
343 static int
344 bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch)
345 {
346 	int nr, i, res = 0;
347 	struct bdev_aio_task *aio_task;
348 	struct kevent events[SPDK_AIO_QUEUE_DEPTH];
349 
350 	nr = bdev_user_io_getevents(io_ch->kqfd, SPDK_AIO_QUEUE_DEPTH, events);
351 	if (nr < 0) {
352 		return 0;
353 	}
354 
355 	for (i = 0; i < nr; i++) {
356 		aio_task = events[i].udata;
357 		aio_task->ch->io_inflight--;
358 		if (aio_task == NULL) {
359 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
360 			break;
361 		} else if ((uint64_t)aio_return(&aio_task->aiocb) == aio_task->len) {
362 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
363 		} else {
364 			SPDK_ERRLOG("failed to complete aio: rc %d\n", aio_error(&aio_task->aiocb));
365 			res = aio_error(&aio_task->aiocb);
366 			if (res != 0) {
367 				spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), res);
368 			} else {
369 				spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
370 			}
371 		}
372 	}
373 
374 	return nr;
375 }
376 #else
377 static int
378 bdev_user_io_getevents(io_context_t io_ctx, unsigned int max, struct io_event *uevents)
379 {
380 	uint32_t head, tail, count;
381 	struct spdk_aio_ring *ring;
382 	struct timespec timeout;
383 	struct io_event *kevents;
384 
385 	ring = (struct spdk_aio_ring *)io_ctx;
386 
387 	if (spdk_unlikely(ring->version != SPDK_AIO_RING_VERSION || ring->incompat_features != 0)) {
388 		timeout.tv_sec = 0;
389 		timeout.tv_nsec = 0;
390 
391 		return io_getevents(io_ctx, 0, max, uevents, &timeout);
392 	}
393 
394 	/* Read the current state out of the ring */
395 	head = ring->head;
396 	tail = ring->tail;
397 
398 	/* This memory barrier is required to prevent the loads above
399 	 * from being re-ordered with stores to the events array
400 	 * potentially occurring on other threads. */
401 	spdk_smp_rmb();
402 
403 	/* Calculate how many items are in the circular ring */
404 	count = tail - head;
405 	if (tail < head) {
406 		count += ring->size;
407 	}
408 
409 	/* Reduce the count to the limit provided by the user */
410 	count = spdk_min(max, count);
411 
412 	/* Grab the memory location of the event array */
413 	kevents = (struct io_event *)((uintptr_t)ring + ring->header_length);
414 
415 	/* Copy the events out of the ring. */
416 	if ((head + count) <= ring->size) {
417 		/* Only one copy is required */
418 		memcpy(uevents, &kevents[head], count * sizeof(struct io_event));
419 	} else {
420 		uint32_t first_part = ring->size - head;
421 		/* Two copies are required */
422 		memcpy(uevents, &kevents[head], first_part * sizeof(struct io_event));
423 		memcpy(&uevents[first_part], &kevents[0], (count - first_part) * sizeof(struct io_event));
424 	}
425 
426 	/* Update the head pointer. On x86, stores will not be reordered with older loads,
427 	 * so the copies out of the event array will always be complete prior to this
428 	 * update becoming visible. On other architectures this is not guaranteed, so
429 	 * add a barrier. */
430 #if defined(__i386__) || defined(__x86_64__)
431 	spdk_compiler_barrier();
432 #else
433 	spdk_smp_mb();
434 #endif
435 	ring->head = (head + count) % ring->size;
436 
437 	return count;
438 }
439 
440 static int
441 bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch)
442 {
443 	int nr, i, res = 0;
444 	struct bdev_aio_task *aio_task;
445 	struct io_event events[SPDK_AIO_QUEUE_DEPTH];
446 
447 	nr = bdev_user_io_getevents(io_ch->io_ctx, SPDK_AIO_QUEUE_DEPTH, events);
448 	if (nr < 0) {
449 		return 0;
450 	}
451 
452 	for (i = 0; i < nr; i++) {
453 		aio_task = events[i].data;
454 		aio_task->ch->io_inflight--;
455 		if (events[i].res == aio_task->len) {
456 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
457 		} else {
458 			/* From aio_abi.h, io_event.res is defined __s64, negative errno
459 			 * will be assigned to io_event.res for error situation.
460 			 * But from libaio.h, io_event.res is defined unsigned long, so
461 			 * convert it to signed value for error detection.
462 			 */
463 			res = (int)events[i].res;
464 			if (res < 0) {
465 				if (res == -EAGAIN) {
466 					spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
467 				} else {
468 					SPDK_ERRLOG("failed to complete aio: rc %"PRId64"\n", events[i].res);
469 					spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), res);
470 				}
471 			} else {
472 				SPDK_ERRLOG("failed to complete aio: rc %"PRId64"\n", events[i].res);
473 				spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
474 			}
475 		}
476 	}
477 
478 	return nr;
479 }
480 #endif
481 
482 static int
483 bdev_aio_group_poll(void *arg)
484 {
485 	struct bdev_aio_group_channel *group_ch = arg;
486 	struct bdev_aio_io_channel *io_ch;
487 	int nr = 0;
488 
489 	TAILQ_FOREACH(io_ch, &group_ch->io_ch_head, link) {
490 		nr += bdev_aio_io_channel_poll(io_ch);
491 	}
492 
493 	return nr > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
494 }
495 
496 static int
497 bdev_aio_group_interrupt(void *arg)
498 {
499 	struct bdev_aio_group_channel *group_ch = arg;
500 	int rc;
501 	uint64_t num_events;
502 
503 	assert(group_ch->efd >= 0);
504 
505 	/* if completed IO number is larger than SPDK_AIO_QUEUE_DEPTH,
506 	 * io_getevent should be called again to ensure all completed IO are processed.
507 	 */
508 	rc = read(group_ch->efd, &num_events, sizeof(num_events));
509 	if (rc < 0) {
510 		SPDK_ERRLOG("failed to acknowledge aio group: %s.\n", spdk_strerror(errno));
511 		return -errno;
512 	}
513 
514 	if (num_events > SPDK_AIO_QUEUE_DEPTH) {
515 		num_events -= SPDK_AIO_QUEUE_DEPTH;
516 		rc = write(group_ch->efd, &num_events, sizeof(num_events));
517 		if (rc < 0) {
518 			SPDK_ERRLOG("failed to notify aio group: %s.\n", spdk_strerror(errno));
519 		}
520 	}
521 
522 	return bdev_aio_group_poll(group_ch);
523 }
524 
525 static void
526 _bdev_aio_get_io_inflight(struct spdk_io_channel_iter *i)
527 {
528 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
529 	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
530 
531 	if (aio_ch->io_inflight) {
532 		spdk_for_each_channel_continue(i, -1);
533 		return;
534 	}
535 
536 	spdk_for_each_channel_continue(i, 0);
537 }
538 
539 static int bdev_aio_reset_retry_timer(void *arg);
540 
541 static void
542 _bdev_aio_get_io_inflight_done(struct spdk_io_channel_iter *i, int status)
543 {
544 	struct file_disk *fdisk = spdk_io_channel_iter_get_ctx(i);
545 
546 	if (status == -1) {
547 		fdisk->reset_retry_timer = SPDK_POLLER_REGISTER(bdev_aio_reset_retry_timer, fdisk, 500);
548 		return;
549 	}
550 
551 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(fdisk->reset_task), SPDK_BDEV_IO_STATUS_SUCCESS);
552 }
553 
554 static int
555 bdev_aio_reset_retry_timer(void *arg)
556 {
557 	struct file_disk *fdisk = arg;
558 
559 	if (fdisk->reset_retry_timer) {
560 		spdk_poller_unregister(&fdisk->reset_retry_timer);
561 	}
562 
563 	spdk_for_each_channel(fdisk,
564 			      _bdev_aio_get_io_inflight,
565 			      fdisk,
566 			      _bdev_aio_get_io_inflight_done);
567 
568 	return SPDK_POLLER_BUSY;
569 }
570 
571 static void
572 bdev_aio_reset(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
573 {
574 	fdisk->reset_task = aio_task;
575 
576 	bdev_aio_reset_retry_timer(fdisk);
577 }
578 
579 static void
580 bdev_aio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
581 		    bool success)
582 {
583 	if (!success) {
584 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
585 		return;
586 	}
587 
588 	switch (bdev_io->type) {
589 	case SPDK_BDEV_IO_TYPE_READ:
590 	case SPDK_BDEV_IO_TYPE_WRITE:
591 		bdev_aio_rw(bdev_io->type,
592 			    (struct file_disk *)bdev_io->bdev->ctxt,
593 			    ch,
594 			    (struct bdev_aio_task *)bdev_io->driver_ctx,
595 			    bdev_io->u.bdev.iovs,
596 			    bdev_io->u.bdev.iovcnt,
597 			    bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
598 			    bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
599 		break;
600 	default:
601 		SPDK_ERRLOG("Wrong io type\n");
602 		break;
603 	}
604 }
605 
606 static int
607 _bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
608 {
609 	struct file_disk *fdisk = (struct file_disk *)bdev_io->bdev->ctxt;
610 
611 	switch (bdev_io->type) {
612 	/* Read and write operations must be performed on buffers aligned to
613 	 * bdev->required_alignment. If user specified unaligned buffers,
614 	 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
615 	case SPDK_BDEV_IO_TYPE_READ:
616 		spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb,
617 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
618 		return 0;
619 	case SPDK_BDEV_IO_TYPE_WRITE:
620 		if (fdisk->readonly) {
621 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
622 		} else {
623 			spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb,
624 					     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
625 		}
626 		return 0;
627 
628 	case SPDK_BDEV_IO_TYPE_FLUSH:
629 		bdev_aio_flush((struct file_disk *)bdev_io->bdev->ctxt,
630 			       (struct bdev_aio_task *)bdev_io->driver_ctx);
631 		return 0;
632 
633 	case SPDK_BDEV_IO_TYPE_RESET:
634 		bdev_aio_reset((struct file_disk *)bdev_io->bdev->ctxt,
635 			       (struct bdev_aio_task *)bdev_io->driver_ctx);
636 		return 0;
637 
638 #ifndef __FreeBSD__
639 	case SPDK_BDEV_IO_TYPE_UNMAP:
640 		bdev_aio_unmap(bdev_io);
641 		return 0;
642 
643 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
644 		bdev_aio_write_zeros(bdev_io);
645 		return 0;
646 #endif
647 
648 	default:
649 		return -1;
650 	}
651 }
652 
653 static void
654 bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
655 {
656 	if (_bdev_aio_submit_request(ch, bdev_io) < 0) {
657 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
658 	}
659 }
660 
661 static bool
662 bdev_aio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
663 {
664 	struct file_disk *fdisk = ctx;
665 
666 	switch (io_type) {
667 	case SPDK_BDEV_IO_TYPE_READ:
668 	case SPDK_BDEV_IO_TYPE_WRITE:
669 	case SPDK_BDEV_IO_TYPE_FLUSH:
670 	case SPDK_BDEV_IO_TYPE_RESET:
671 		return true;
672 
673 	case SPDK_BDEV_IO_TYPE_UNMAP:
674 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
675 		return fdisk->fallocate;
676 
677 	default:
678 		return false;
679 	}
680 }
681 
682 #ifdef __FreeBSD__
683 static int
684 bdev_aio_create_io(struct bdev_aio_io_channel *ch)
685 {
686 	ch->kqfd = kqueue();
687 	if (ch->kqfd < 0) {
688 		SPDK_ERRLOG("async I/O context setup failure: %s.\n", spdk_strerror(errno));
689 		return -1;
690 	}
691 
692 	return 0;
693 }
694 
695 static void
696 bdev_aio_destroy_io(struct bdev_aio_io_channel *ch)
697 {
698 	close(ch->kqfd);
699 }
700 #else
701 static int
702 bdev_aio_create_io(struct bdev_aio_io_channel *ch)
703 {
704 	if (io_setup(SPDK_AIO_QUEUE_DEPTH, &ch->io_ctx) < 0) {
705 		SPDK_ERRLOG("Async I/O context setup failure, likely due to exceeding kernel limit.\n");
706 		SPDK_ERRLOG("This limit may be increased using 'sysctl -w fs.aio-max-nr'.\n");
707 		return -1;
708 	}
709 
710 	return 0;
711 }
712 
713 static void
714 bdev_aio_destroy_io(struct bdev_aio_io_channel *ch)
715 {
716 	io_destroy(ch->io_ctx);
717 }
718 #endif
719 
720 static int
721 bdev_aio_create_cb(void *io_device, void *ctx_buf)
722 {
723 	struct bdev_aio_io_channel *ch = ctx_buf;
724 	int rc;
725 
726 	rc = bdev_aio_create_io(ch);
727 	if (rc < 0) {
728 		return rc;
729 	}
730 
731 	ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&aio_if));
732 	TAILQ_INSERT_TAIL(&ch->group_ch->io_ch_head, ch, link);
733 
734 	return 0;
735 }
736 
737 static void
738 bdev_aio_destroy_cb(void *io_device, void *ctx_buf)
739 {
740 	struct bdev_aio_io_channel *ch = ctx_buf;
741 
742 	bdev_aio_destroy_io(ch);
743 
744 	assert(ch->group_ch);
745 	TAILQ_REMOVE(&ch->group_ch->io_ch_head, ch, link);
746 
747 	spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
748 }
749 
/* bdev get_io_channel hook: the file_disk itself is the io_device key. */
static struct spdk_io_channel *
bdev_aio_get_io_channel(void *ctx)
{
	struct file_disk *disk = ctx;
	struct spdk_io_channel *io_ch = spdk_get_io_channel(disk);

	return io_ch;
}
757 
758 
759 static int
760 bdev_aio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
761 {
762 	struct file_disk *fdisk = ctx;
763 
764 	spdk_json_write_named_object_begin(w, "aio");
765 
766 	spdk_json_write_named_string(w, "filename", fdisk->filename);
767 
768 	spdk_json_write_named_bool(w, "block_size_override", fdisk->block_size_override);
769 
770 	spdk_json_write_named_bool(w, "readonly", fdisk->readonly);
771 
772 	spdk_json_write_named_bool(w, "fallocate", fdisk->fallocate);
773 
774 	spdk_json_write_object_end(w);
775 
776 	return 0;
777 }
778 
779 static void
780 bdev_aio_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
781 {
782 	struct file_disk *fdisk = bdev->ctxt;
783 	const struct spdk_uuid *uuid = spdk_bdev_get_uuid(bdev);
784 
785 	spdk_json_write_object_begin(w);
786 
787 	spdk_json_write_named_string(w, "method", "bdev_aio_create");
788 
789 	spdk_json_write_named_object_begin(w, "params");
790 	spdk_json_write_named_string(w, "name", bdev->name);
791 	if (fdisk->block_size_override) {
792 		spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
793 	}
794 	spdk_json_write_named_string(w, "filename", fdisk->filename);
795 	spdk_json_write_named_bool(w, "readonly", fdisk->readonly);
796 	spdk_json_write_named_bool(w, "fallocate", fdisk->fallocate);
797 	if (!spdk_uuid_is_null(uuid)) {
798 		spdk_json_write_named_uuid(w, "uuid", uuid);
799 	}
800 	spdk_json_write_object_end(w);
801 
802 	spdk_json_write_object_end(w);
803 }
804 
/* Function table installed on every AIO bdev by create_aio_bdev(). */
static const struct spdk_bdev_fn_table aio_fn_table = {
	.destruct		= bdev_aio_destruct,
	.submit_request		= bdev_aio_submit_request,
	.io_type_supported	= bdev_aio_io_type_supported,
	.get_io_channel		= bdev_aio_get_io_channel,
	.dump_info_json		= bdev_aio_dump_info_json,
	.write_config_json	= bdev_aio_write_json_config,
};
813 
814 static void
815 aio_free_disk(struct file_disk *fdisk)
816 {
817 	if (fdisk == NULL) {
818 		return;
819 	}
820 	free(fdisk->filename);
821 	free(fdisk->disk.name);
822 	free(fdisk);
823 }
824 
825 static int
826 bdev_aio_register_interrupt(struct bdev_aio_group_channel *ch)
827 {
828 	int efd;
829 
830 	efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
831 	if (efd < 0) {
832 		return -1;
833 	}
834 
835 	ch->intr = SPDK_INTERRUPT_REGISTER(efd, bdev_aio_group_interrupt, ch);
836 	if (ch->intr == NULL) {
837 		close(efd);
838 		return -1;
839 	}
840 	ch->efd = efd;
841 
842 	return 0;
843 }
844 
845 static void
846 bdev_aio_unregister_interrupt(struct bdev_aio_group_channel *ch)
847 {
848 	spdk_interrupt_unregister(&ch->intr);
849 	close(ch->efd);
850 	ch->efd = -1;
851 }
852 
853 static int
854 bdev_aio_group_create_cb(void *io_device, void *ctx_buf)
855 {
856 	struct bdev_aio_group_channel *ch = ctx_buf;
857 	int rc;
858 
859 	TAILQ_INIT(&ch->io_ch_head);
860 	/* Initialize ch->efd to be invalid and unused. */
861 	ch->efd = -1;
862 	if (spdk_interrupt_mode_is_enabled()) {
863 		rc = bdev_aio_register_interrupt(ch);
864 		if (rc < 0) {
865 			SPDK_ERRLOG("Failed to prepare intr resource to bdev_aio\n");
866 			return rc;
867 		}
868 	}
869 
870 	ch->poller = SPDK_POLLER_REGISTER(bdev_aio_group_poll, ch, 0);
871 	spdk_poller_register_interrupt(ch->poller, NULL, NULL);
872 
873 	return 0;
874 }
875 
876 static void
877 bdev_aio_group_destroy_cb(void *io_device, void *ctx_buf)
878 {
879 	struct bdev_aio_group_channel *ch = ctx_buf;
880 
881 	if (!TAILQ_EMPTY(&ch->io_ch_head)) {
882 		SPDK_ERRLOG("Group channel of bdev aio has uncleared io channel\n");
883 	}
884 
885 	spdk_poller_unregister(&ch->poller);
886 	if (spdk_interrupt_mode_is_enabled()) {
887 		bdev_aio_unregister_interrupt(ch);
888 	}
889 }
890 
891 int
892 create_aio_bdev(const char *name, const char *filename, uint32_t block_size, bool readonly,
893 		bool fallocate, const struct spdk_uuid *uuid)
894 {
895 	struct file_disk *fdisk;
896 	uint32_t detected_block_size;
897 	uint64_t disk_size;
898 	int rc;
899 
900 #ifdef __FreeBSD__
901 	if (fallocate) {
902 		SPDK_ERRLOG("Unable to support fallocate on this platform\n");
903 		return -ENOTSUP;
904 	}
905 #endif
906 
907 	fdisk = calloc(1, sizeof(*fdisk));
908 	if (!fdisk) {
909 		SPDK_ERRLOG("Unable to allocate enough memory for aio backend\n");
910 		return -ENOMEM;
911 	}
912 	fdisk->readonly = readonly;
913 	fdisk->fallocate = fallocate;
914 
915 	fdisk->filename = strdup(filename);
916 	if (!fdisk->filename) {
917 		rc = -ENOMEM;
918 		goto error_return;
919 	}
920 
921 	if (bdev_aio_open(fdisk)) {
922 		SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, fdisk->fd, errno);
923 		rc = -errno;
924 		goto error_return;
925 	}
926 
927 	disk_size = spdk_fd_get_size(fdisk->fd);
928 
929 	fdisk->disk.name = strdup(name);
930 	if (!fdisk->disk.name) {
931 		rc = -ENOMEM;
932 		goto error_return;
933 	}
934 	fdisk->disk.product_name = "AIO disk";
935 	fdisk->disk.module = &aio_if;
936 
937 	fdisk->disk.write_cache = 1;
938 
939 	detected_block_size = spdk_fd_get_blocklen(fdisk->fd);
940 	if (block_size == 0) {
941 		/* User did not specify block size - use autodetected block size. */
942 		if (detected_block_size == 0) {
943 			SPDK_ERRLOG("Block size could not be auto-detected\n");
944 			rc = -EINVAL;
945 			goto error_return;
946 		}
947 		fdisk->block_size_override = false;
948 		block_size = detected_block_size;
949 	} else {
950 		if (block_size < detected_block_size) {
951 			SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
952 				    "auto-detected block size %" PRIu32 "\n",
953 				    block_size, detected_block_size);
954 			rc = -EINVAL;
955 			goto error_return;
956 		} else if (detected_block_size != 0 && block_size != detected_block_size) {
957 			SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
958 				     "auto-detected block size %" PRIu32 "\n",
959 				     block_size, detected_block_size);
960 		}
961 		fdisk->block_size_override = true;
962 	}
963 
964 	if (block_size < 512) {
965 		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
966 		rc = -EINVAL;
967 		goto error_return;
968 	}
969 
970 	if (!spdk_u32_is_pow2(block_size)) {
971 		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
972 		rc = -EINVAL;
973 		goto error_return;
974 	}
975 
976 	fdisk->disk.blocklen = block_size;
977 	if (fdisk->block_size_override && detected_block_size) {
978 		fdisk->disk.required_alignment = spdk_u32log2(detected_block_size);
979 	} else {
980 		fdisk->disk.required_alignment = spdk_u32log2(block_size);
981 	}
982 
983 	if (disk_size % fdisk->disk.blocklen != 0) {
984 		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
985 			    disk_size, fdisk->disk.blocklen);
986 		rc = -EINVAL;
987 		goto error_return;
988 	}
989 
990 	fdisk->disk.blockcnt = disk_size / fdisk->disk.blocklen;
991 	fdisk->disk.ctxt = fdisk;
992 	spdk_uuid_copy(&fdisk->disk.uuid, uuid);
993 
994 	fdisk->disk.fn_table = &aio_fn_table;
995 
996 	spdk_io_device_register(fdisk, bdev_aio_create_cb, bdev_aio_destroy_cb,
997 				sizeof(struct bdev_aio_io_channel),
998 				fdisk->disk.name);
999 	rc = spdk_bdev_register(&fdisk->disk);
1000 	if (rc) {
1001 		spdk_io_device_unregister(fdisk, NULL);
1002 		goto error_return;
1003 	}
1004 
1005 	TAILQ_INSERT_TAIL(&g_aio_disk_head, fdisk, link);
1006 	return 0;
1007 
1008 error_return:
1009 	bdev_aio_close(fdisk);
1010 	aio_free_disk(fdisk);
1011 	return rc;
1012 }
1013 
/* Event callback passed to spdk_bdev_open_ext() by bdev_aio_rescan(); intentionally ignores all events. */
static void
dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
{
}
1018 
1019 int
1020 bdev_aio_rescan(const char *name)
1021 {
1022 	struct spdk_bdev_desc *desc;
1023 	struct spdk_bdev *bdev;
1024 	struct file_disk *fdisk;
1025 	uint64_t disk_size, blockcnt;
1026 	int rc;
1027 
1028 	rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &desc);
1029 	if (rc != 0) {
1030 		return rc;
1031 	}
1032 
1033 	bdev = spdk_bdev_desc_get_bdev(desc);
1034 	if (bdev->module != &aio_if) {
1035 		rc = -ENODEV;
1036 		goto exit;
1037 	}
1038 
1039 	fdisk = SPDK_CONTAINEROF(bdev, struct file_disk, disk);
1040 	disk_size = spdk_fd_get_size(fdisk->fd);
1041 	blockcnt = disk_size / bdev->blocklen;
1042 
1043 	if (bdev->blockcnt != blockcnt) {
1044 		SPDK_NOTICELOG("AIO device is resized: bdev name %s, old block count %" PRIu64 ", new block count %"
1045 			       PRIu64 "\n",
1046 			       fdisk->filename,
1047 			       bdev->blockcnt,
1048 			       blockcnt);
1049 		rc = spdk_bdev_notify_blockcnt_change(bdev, blockcnt);
1050 		if (rc != 0) {
1051 			SPDK_ERRLOG("Could not change num blocks for aio bdev: name %s, errno: %d.\n",
1052 				    fdisk->filename, rc);
1053 			goto exit;
1054 		}
1055 	}
1056 
1057 exit:
1058 	spdk_bdev_close(desc);
1059 	return rc;
1060 }
1061 
/* Heap-allocated context that carries the caller's completion callback through bdev_aio_delete(). */
struct delete_aio_bdev_ctx {
	/* Callback invoked with the unregister result. */
	delete_aio_bdev_complete cb_fn;
	/* Opaque argument passed back to cb_fn. */
	void *cb_arg;
};
1066 
1067 static void
1068 aio_bdev_unregister_cb(void *arg, int bdeverrno)
1069 {
1070 	struct delete_aio_bdev_ctx *ctx = arg;
1071 
1072 	ctx->cb_fn(ctx->cb_arg, bdeverrno);
1073 	free(ctx);
1074 }
1075 
1076 void
1077 bdev_aio_delete(const char *name, delete_aio_bdev_complete cb_fn, void *cb_arg)
1078 {
1079 	struct delete_aio_bdev_ctx *ctx;
1080 	int rc;
1081 
1082 	ctx = calloc(1, sizeof(*ctx));
1083 	if (ctx == NULL) {
1084 		cb_fn(cb_arg, -ENOMEM);
1085 		return;
1086 	}
1087 
1088 	ctx->cb_fn = cb_fn;
1089 	ctx->cb_arg = cb_arg;
1090 	rc = spdk_bdev_unregister_by_name(name, &aio_if, aio_bdev_unregister_cb, ctx);
1091 	if (rc != 0) {
1092 		aio_bdev_unregister_cb(ctx, rc);
1093 	}
1094 }
1095 
1096 static int
1097 bdev_aio_initialize(void)
1098 {
1099 	spdk_io_device_register(&aio_if, bdev_aio_group_create_cb, bdev_aio_group_destroy_cb,
1100 				sizeof(struct bdev_aio_group_channel), "aio_module");
1101 
1102 	return 0;
1103 }
1104 
1105 static void
1106 bdev_aio_fini(void)
1107 {
1108 	spdk_io_device_unregister(&aio_if, NULL);
1109 }
1110 
1111 SPDK_LOG_REGISTER_COMPONENT(aio)
1112