xref: /spdk/module/bdev/aio/bdev_aio.c (revision d987d777d6b8ce05f11cb1d90f1241bfecfc9af4)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2017 Intel Corporation.
3  *   All rights reserved.
4  *   Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  */
6 
7 #include "bdev_aio.h"
8 
9 #include "spdk/stdinc.h"
10 
11 #include "spdk/barrier.h"
12 #include "spdk/bdev.h"
13 #include "spdk/bdev_module.h"
14 #include "spdk/env.h"
15 #include "spdk/fd.h"
16 #include "spdk/likely.h"
17 #include "spdk/thread.h"
18 #include "spdk/json.h"
19 #include "spdk/util.h"
20 #include "spdk/string.h"
21 
22 #include "spdk/log.h"
23 
24 #include <sys/eventfd.h>
25 
26 #ifndef __FreeBSD__
27 #include <libaio.h>
28 #endif
29 
/* Per-thread, per-bdev I/O channel: tracks in-flight I/O and holds the
 * platform-specific async-I/O completion context. */
struct bdev_aio_io_channel {
	uint64_t				io_inflight;	/* I/Os submitted but not yet completed */
#ifdef __FreeBSD__
	int					kqfd;		/* kqueue fd receiving aio completion kevents */
#else
	io_context_t				io_ctx;		/* libaio context for this channel */
#endif
	struct bdev_aio_group_channel		*group_ch;	/* owning group channel that polls us */
	TAILQ_ENTRY(bdev_aio_io_channel)	link;		/* entry on group_ch->io_ch_head */
};
40 
/* Per-thread group channel: owns the poller/interrupt that drives completion
 * processing for every bdev_aio_io_channel on this thread. */
struct bdev_aio_group_channel {
	/* eventfd for io completion notification in interrupt mode.
	 * Negative value like '-1' indicates it is invalid or unused.
	 */
	int					efd;
	struct spdk_interrupt			*intr;		/* interrupt handler registered on efd */
	struct spdk_poller			*poller;	/* runs bdev_aio_group_poll() */
	TAILQ_HEAD(, bdev_aio_io_channel)	io_ch_head;	/* io channels attached to this group */
};
50 
/* Per-I/O driver context, allocated by the bdev layer inside each
 * spdk_bdev_io (see bdev_aio_get_ctx_size()). */
struct bdev_aio_task {
#ifdef __FreeBSD__
	struct aiocb			aiocb;	/* POSIX aio control block (FreeBSD path) */
#else
	struct iocb			iocb;	/* libaio control block (Linux path) */
#endif
	uint64_t			len;	/* expected byte count, compared against completion result */
	struct bdev_aio_io_channel	*ch;	/* channel the I/O was submitted on */
};
60 
/* State for one aio bdev backed by a file or block device. */
struct file_disk {
	struct bdev_aio_task	*reset_task;		/* outstanding reset, completed once I/O drains */
	struct spdk_poller	*reset_retry_timer;	/* re-checks in-flight I/O during reset */
	struct spdk_bdev	disk;			/* embedded bdev exposed to the bdev layer */
	char			*filename;		/* heap-owned copy of the backing path */
	int			fd;			/* backing file descriptor, -1 when closed */
	TAILQ_ENTRY(file_disk)  link;			/* entry on g_aio_disk_head */
	bool			block_size_override;	/* true when user supplied block_size */
	bool			readonly;		/* open O_RDONLY and reject writes */
	bool			fallocate;		/* enable UNMAP/WRITE_ZEROES via fallocate() */
};
72 
/* For user space reaping of completions.
 * Mirrors the kernel's aio ring header that io_setup() maps into user space;
 * lets us pop completion events without an io_getevents() syscall. */
struct spdk_aio_ring {
	uint32_t id;
	uint32_t size;		/* number of io_event slots in the ring */
	uint32_t head;		/* consumer index (updated by us) */
	uint32_t tail;		/* producer index (updated by the kernel) */

	uint32_t version;	/* must equal SPDK_AIO_RING_VERSION to use this path */
	uint32_t compat_features;
	uint32_t incompat_features;	/* nonzero means fall back to the syscall */
	uint32_t header_length;		/* byte offset from ring start to the event array */
};
85 
/* Magic the kernel stamps into the aio ring header; a mismatch means the
 * user-space reaping fast path cannot be used. */
#define SPDK_AIO_RING_VERSION	0xa10a10a1

static int bdev_aio_initialize(void);
static void bdev_aio_fini(void);
static void aio_free_disk(struct file_disk *fdisk);
/* All aio bdevs created by this module, linked via file_disk.link. */
static TAILQ_HEAD(, file_disk) g_aio_disk_head = TAILQ_HEAD_INITIALIZER(g_aio_disk_head);

/* Max events requested per completion-reaping pass / io_setup() depth. */
#define SPDK_AIO_QUEUE_DEPTH 128
#define MAX_EVENTS_PER_POLL 32
95 
96 static int
97 bdev_aio_get_ctx_size(void)
98 {
99 	return sizeof(struct bdev_aio_task);
100 }
101 
/* Module descriptor registered with the bdev layer at startup. */
static struct spdk_bdev_module aio_if = {
	.name		= "aio",
	.module_init	= bdev_aio_initialize,
	.module_fini	= bdev_aio_fini,
	.get_ctx_size	= bdev_aio_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(aio, &aio_if)
110 
111 static int
112 bdev_aio_open(struct file_disk *disk)
113 {
114 	int fd;
115 	int io_flag = disk->readonly ? O_RDONLY : O_RDWR;
116 
117 	fd = open(disk->filename, io_flag | O_DIRECT);
118 	if (fd < 0) {
119 		/* Try without O_DIRECT for non-disk files */
120 		fd = open(disk->filename, io_flag);
121 		if (fd < 0) {
122 			SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
123 				    disk->filename, errno, spdk_strerror(errno));
124 			disk->fd = -1;
125 			return -1;
126 		}
127 	}
128 
129 	disk->fd = fd;
130 
131 	return 0;
132 }
133 
134 static int
135 bdev_aio_close(struct file_disk *disk)
136 {
137 	int rc;
138 
139 	if (disk->fd == -1) {
140 		return 0;
141 	}
142 
143 	rc = close(disk->fd);
144 	if (rc < 0) {
145 		SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
146 			    disk->fd, errno, spdk_strerror(errno));
147 		return -1;
148 	}
149 
150 	disk->fd = -1;
151 
152 	return 0;
153 }
154 
#ifdef __FreeBSD__
/* Submit one read or write via POSIX aio (FreeBSD). Completion is delivered
 * as a kevent on the channel's kqueue with the task pointer in sival_ptr.
 * Returns the aio_readv()/aio_writev() result (0 on successful submit,
 * -1 with errno set on failure). */
static int
bdev_aio_submit_io(enum spdk_bdev_io_type type, struct file_disk *fdisk,
		   struct spdk_io_channel *ch, struct bdev_aio_task *aio_task,
		   struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
{
	struct aiocb *aiocb = &aio_task->aiocb;
	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);

	memset(aiocb, 0, sizeof(struct aiocb));
	aiocb->aio_fildes = fdisk->fd;
	aiocb->aio_iov = iov;
	aiocb->aio_iovcnt = iovcnt;
	aiocb->aio_offset = offset;
	/* Route the completion event to this channel's kqueue. */
	aiocb->aio_sigevent.sigev_notify_kqueue = aio_ch->kqfd;
	aiocb->aio_sigevent.sigev_value.sival_ptr = aio_task;
	aiocb->aio_sigevent.sigev_notify = SIGEV_KEVENT;

	/* Expected length is checked against aio_return() on completion. */
	aio_task->len = nbytes;
	aio_task->ch = aio_ch;

	if (type == SPDK_BDEV_IO_TYPE_READ) {
		return aio_readv(aiocb);
	}

	return aio_writev(aiocb);
}
182 #else
183 static int
184 bdev_aio_submit_io(enum spdk_bdev_io_type type, struct file_disk *fdisk,
185 		   struct spdk_io_channel *ch, struct bdev_aio_task *aio_task,
186 		   struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
187 {
188 	struct iocb *iocb = &aio_task->iocb;
189 	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
190 
191 	if (type == SPDK_BDEV_IO_TYPE_READ) {
192 		io_prep_preadv(iocb, fdisk->fd, iov, iovcnt, offset);
193 	} else {
194 		io_prep_pwritev(iocb, fdisk->fd, iov, iovcnt, offset);
195 	}
196 
197 	if (aio_ch->group_ch->efd >= 0) {
198 		io_set_eventfd(iocb, aio_ch->group_ch->efd);
199 	}
200 	iocb->data = aio_task;
201 	aio_task->len = nbytes;
202 	aio_task->ch = aio_ch;
203 
204 	return io_submit(aio_ch->io_ctx, 1, &iocb);
205 }
206 #endif
207 
208 static void
209 bdev_aio_rw(enum spdk_bdev_io_type type, struct file_disk *fdisk,
210 	    struct spdk_io_channel *ch, struct bdev_aio_task *aio_task,
211 	    struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
212 {
213 	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
214 	int rc;
215 
216 	if (type == SPDK_BDEV_IO_TYPE_READ) {
217 		SPDK_DEBUGLOG(aio, "read %d iovs size %lu to off: %#lx\n",
218 			      iovcnt, nbytes, offset);
219 	} else {
220 		SPDK_DEBUGLOG(aio, "write %d iovs size %lu from off: %#lx\n",
221 			      iovcnt, nbytes, offset);
222 	}
223 
224 	rc = bdev_aio_submit_io(type, fdisk, ch, aio_task, iov, iovcnt, nbytes, offset);
225 	if (spdk_unlikely(rc < 0)) {
226 		if (rc == -EAGAIN) {
227 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
228 		} else {
229 			spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), rc);
230 			SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
231 		}
232 	} else {
233 		aio_ch->io_inflight++;
234 	}
235 }
236 
237 static void
238 bdev_aio_flush(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
239 {
240 	int rc = fsync(fdisk->fd);
241 
242 	if (rc == 0) {
243 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
244 	} else {
245 		spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -errno);
246 	}
247 }
248 
249 #ifndef __FreeBSD__
250 static void
251 bdev_aio_fallocate(struct spdk_bdev_io *bdev_io, int mode)
252 {
253 	struct file_disk *fdisk = (struct file_disk *)bdev_io->bdev->ctxt;
254 	struct bdev_aio_task *aio_task = (struct bdev_aio_task *)bdev_io->driver_ctx;
255 	uint64_t offset_bytes = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen;
256 	uint64_t length_bytes = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
257 	int rc;
258 
259 	if (!fdisk->fallocate) {
260 		spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -ENOTSUP);
261 		return;
262 	}
263 
264 	rc = fallocate(fdisk->fd, mode, offset_bytes, length_bytes);
265 	if (rc == 0) {
266 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
267 	} else {
268 		spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), -errno);
269 	}
270 }
271 
272 static void
273 bdev_aio_unmap(struct spdk_bdev_io *bdev_io)
274 {
275 	int mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE;
276 
277 	bdev_aio_fallocate(bdev_io, mode);
278 }
279 
280 
281 static void
282 bdev_aio_write_zeros(struct spdk_bdev_io *bdev_io)
283 {
284 	int mode = FALLOC_FL_ZERO_RANGE;
285 
286 	bdev_aio_fallocate(bdev_io, mode);
287 }
288 #endif
289 
290 static void
291 bdev_aio_destruct_cb(void *io_device)
292 {
293 	struct file_disk *fdisk = io_device;
294 	int rc = 0;
295 
296 	TAILQ_REMOVE(&g_aio_disk_head, fdisk, link);
297 	rc = bdev_aio_close(fdisk);
298 	if (rc < 0) {
299 		SPDK_ERRLOG("bdev_aio_close() failed\n");
300 	}
301 	aio_free_disk(fdisk);
302 }
303 
304 static int
305 bdev_aio_destruct(void *ctx)
306 {
307 	struct file_disk *fdisk = ctx;
308 
309 	spdk_io_device_unregister(fdisk, bdev_aio_destruct_cb);
310 
311 	return 0;
312 }
313 
314 #ifdef __FreeBSD__
/* Non-blocking reap of up to 'max' aio completion kevents from 'kq'.
 * Returns the number of events fetched, or -errno on kevent() failure. */
static int
bdev_user_io_getevents(int kq, unsigned int max, struct kevent *events)
{
	struct timespec ts;
	int count;

	memset(events, 0, max * sizeof(struct kevent));
	/* Zero timeout: poll without blocking the reactor. */
	memset(&ts, 0, sizeof(ts));

	count = kevent(kq, NULL, 0, events, max, &ts);
	if (count < 0) {
		SPDK_ERRLOG("failed to get kevents: %s.\n", spdk_strerror(errno));
		return -errno;
	}

	return count;
}
332 
333 static int
334 bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch)
335 {
336 	int nr, i, res = 0;
337 	struct bdev_aio_task *aio_task;
338 	struct kevent events[SPDK_AIO_QUEUE_DEPTH];
339 
340 	nr = bdev_user_io_getevents(io_ch->kqfd, SPDK_AIO_QUEUE_DEPTH, events);
341 	if (nr < 0) {
342 		return 0;
343 	}
344 
345 	for (i = 0; i < nr; i++) {
346 		aio_task = events[i].udata;
347 		aio_task->ch->io_inflight--;
348 		if (aio_task == NULL) {
349 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
350 			break;
351 		} else if ((uint64_t)aio_return(&aio_task->aiocb) == aio_task->len) {
352 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
353 		} else {
354 			SPDK_ERRLOG("failed to complete aio: rc %d\n", aio_error(&aio_task->aiocb));
355 			res = aio_error(&aio_task->aiocb);
356 			if (res != 0) {
357 				spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), res);
358 			} else {
359 				spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
360 			}
361 		}
362 	}
363 
364 	return nr;
365 }
366 #else
/* Reap up to 'max' completions directly from the kernel's user-mapped aio
 * ring, avoiding an io_getevents() syscall on the fast path. Falls back to
 * the syscall (with zero timeout) when the ring layout is unrecognized.
 * Returns the number of events copied into 'uevents'. */
static int
bdev_user_io_getevents(io_context_t io_ctx, unsigned int max, struct io_event *uevents)
{
	uint32_t head, tail, count;
	struct spdk_aio_ring *ring;
	struct timespec timeout;
	struct io_event *kevents;

	/* The io_context_t handle is actually the address of the mapped ring. */
	ring = (struct spdk_aio_ring *)io_ctx;

	if (spdk_unlikely(ring->version != SPDK_AIO_RING_VERSION || ring->incompat_features != 0)) {
		/* Unknown ring format: use the syscall, non-blocking. */
		timeout.tv_sec = 0;
		timeout.tv_nsec = 0;

		return io_getevents(io_ctx, 0, max, uevents, &timeout);
	}

	/* Read the current state out of the ring */
	head = ring->head;
	tail = ring->tail;

	/* This memory barrier is required to prevent the loads above
	 * from being re-ordered with stores to the events array
	 * potentially occurring on other threads. */
	spdk_smp_rmb();

	/* Calculate how many items are in the circular ring */
	count = tail - head;
	if (tail < head) {
		count += ring->size;
	}

	/* Reduce the count to the limit provided by the user */
	count = spdk_min(max, count);

	/* Grab the memory location of the event array */
	kevents = (struct io_event *)((uintptr_t)ring + ring->header_length);

	/* Copy the events out of the ring. */
	if ((head + count) <= ring->size) {
		/* Only one copy is required */
		memcpy(uevents, &kevents[head], count * sizeof(struct io_event));
	} else {
		uint32_t first_part = ring->size - head;
		/* Two copies are required */
		memcpy(uevents, &kevents[head], first_part * sizeof(struct io_event));
		memcpy(&uevents[first_part], &kevents[0], (count - first_part) * sizeof(struct io_event));
	}

	/* Update the head pointer. On x86, stores will not be reordered with older loads,
	 * so the copies out of the event array will always be complete prior to this
	 * update becoming visible. On other architectures this is not guaranteed, so
	 * add a barrier. */
#if defined(__i386__) || defined(__x86_64__)
	spdk_compiler_barrier();
#else
	spdk_smp_mb();
#endif
	ring->head = (head + count) % ring->size;

	return count;
}
429 
430 static int
431 bdev_aio_io_channel_poll(struct bdev_aio_io_channel *io_ch)
432 {
433 	int nr, i, res = 0;
434 	struct bdev_aio_task *aio_task;
435 	struct io_event events[SPDK_AIO_QUEUE_DEPTH];
436 
437 	nr = bdev_user_io_getevents(io_ch->io_ctx, SPDK_AIO_QUEUE_DEPTH, events);
438 	if (nr < 0) {
439 		return 0;
440 	}
441 
442 	for (i = 0; i < nr; i++) {
443 		aio_task = events[i].data;
444 		aio_task->ch->io_inflight--;
445 		if (events[i].res == aio_task->len) {
446 			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_SUCCESS);
447 		} else {
448 			/* From aio_abi.h, io_event.res is defined __s64, negative errno
449 			 * will be assigned to io_event.res for error situation.
450 			 * But from libaio.h, io_event.res is defined unsigned long, so
451 			 * convert it to signed value for error detection.
452 			 */
453 			SPDK_ERRLOG("failed to complete aio: rc %"PRId64"\n", events[i].res);
454 			res = (int)events[i].res;
455 			if (res < 0) {
456 				spdk_bdev_io_complete_aio_status(spdk_bdev_io_from_ctx(aio_task), res);
457 			} else {
458 				spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
459 			}
460 		}
461 	}
462 
463 	return nr;
464 }
465 #endif
466 
467 static int
468 bdev_aio_group_poll(void *arg)
469 {
470 	struct bdev_aio_group_channel *group_ch = arg;
471 	struct bdev_aio_io_channel *io_ch;
472 	int nr = 0;
473 
474 	TAILQ_FOREACH(io_ch, &group_ch->io_ch_head, link) {
475 		nr += bdev_aio_io_channel_poll(io_ch);
476 	}
477 
478 	return nr > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
479 }
480 
481 static int
482 bdev_aio_group_interrupt(void *arg)
483 {
484 	struct bdev_aio_group_channel *group_ch = arg;
485 	int rc;
486 	uint64_t num_events;
487 
488 	assert(group_ch->efd >= 0);
489 
490 	/* if completed IO number is larger than SPDK_AIO_QUEUE_DEPTH,
491 	 * io_getevent should be called again to ensure all completed IO are processed.
492 	 */
493 	rc = read(group_ch->efd, &num_events, sizeof(num_events));
494 	if (rc < 0) {
495 		SPDK_ERRLOG("failed to acknowledge aio group: %s.\n", spdk_strerror(errno));
496 		return -errno;
497 	}
498 
499 	if (num_events > SPDK_AIO_QUEUE_DEPTH) {
500 		num_events -= SPDK_AIO_QUEUE_DEPTH;
501 		rc = write(group_ch->efd, &num_events, sizeof(num_events));
502 		if (rc < 0) {
503 			SPDK_ERRLOG("failed to notify aio group: %s.\n", spdk_strerror(errno));
504 		}
505 	}
506 
507 	return bdev_aio_group_poll(group_ch);
508 }
509 
510 static void
511 _bdev_aio_get_io_inflight(struct spdk_io_channel_iter *i)
512 {
513 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
514 	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
515 
516 	if (aio_ch->io_inflight) {
517 		spdk_for_each_channel_continue(i, -1);
518 		return;
519 	}
520 
521 	spdk_for_each_channel_continue(i, 0);
522 }
523 
524 static int bdev_aio_reset_retry_timer(void *arg);
525 
526 static void
527 _bdev_aio_get_io_inflight_done(struct spdk_io_channel_iter *i, int status)
528 {
529 	struct file_disk *fdisk = spdk_io_channel_iter_get_ctx(i);
530 
531 	if (status == -1) {
532 		fdisk->reset_retry_timer = SPDK_POLLER_REGISTER(bdev_aio_reset_retry_timer, fdisk, 500);
533 		return;
534 	}
535 
536 	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(fdisk->reset_task), SPDK_BDEV_IO_STATUS_SUCCESS);
537 }
538 
539 static int
540 bdev_aio_reset_retry_timer(void *arg)
541 {
542 	struct file_disk *fdisk = arg;
543 
544 	if (fdisk->reset_retry_timer) {
545 		spdk_poller_unregister(&fdisk->reset_retry_timer);
546 	}
547 
548 	spdk_for_each_channel(fdisk,
549 			      _bdev_aio_get_io_inflight,
550 			      fdisk,
551 			      _bdev_aio_get_io_inflight_done);
552 
553 	return SPDK_POLLER_BUSY;
554 }
555 
556 static void
557 bdev_aio_reset(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
558 {
559 	fdisk->reset_task = aio_task;
560 
561 	bdev_aio_reset_retry_timer(fdisk);
562 }
563 
564 static void
565 bdev_aio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
566 		    bool success)
567 {
568 	if (!success) {
569 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
570 		return;
571 	}
572 
573 	switch (bdev_io->type) {
574 	case SPDK_BDEV_IO_TYPE_READ:
575 	case SPDK_BDEV_IO_TYPE_WRITE:
576 		bdev_aio_rw(bdev_io->type,
577 			    (struct file_disk *)bdev_io->bdev->ctxt,
578 			    ch,
579 			    (struct bdev_aio_task *)bdev_io->driver_ctx,
580 			    bdev_io->u.bdev.iovs,
581 			    bdev_io->u.bdev.iovcnt,
582 			    bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
583 			    bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
584 		break;
585 	default:
586 		SPDK_ERRLOG("Wrong io type\n");
587 		break;
588 	}
589 }
590 
/* Dispatch one bdev_io by type. Returns 0 when the request was accepted
 * (it will be completed asynchronously or inline), -1 for unsupported
 * types so the caller can fail the I/O. */
static int
_bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct file_disk *fdisk = (struct file_disk *)bdev_io->bdev->ctxt;

	switch (bdev_io->type) {
	/* Read and write operations must be performed on buffers aligned to
	 * bdev->required_alignment. If user specified unaligned buffers,
	 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
	case SPDK_BDEV_IO_TYPE_READ:
		spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb,
				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
		return 0;
	case SPDK_BDEV_IO_TYPE_WRITE:
		/* Writes to a readonly bdev fail immediately. */
		if (fdisk->readonly) {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		} else {
			spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb,
					     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
		}
		return 0;

	case SPDK_BDEV_IO_TYPE_FLUSH:
		bdev_aio_flush((struct file_disk *)bdev_io->bdev->ctxt,
			       (struct bdev_aio_task *)bdev_io->driver_ctx);
		return 0;

	case SPDK_BDEV_IO_TYPE_RESET:
		bdev_aio_reset((struct file_disk *)bdev_io->bdev->ctxt,
			       (struct bdev_aio_task *)bdev_io->driver_ctx);
		return 0;

#ifndef __FreeBSD__
	/* UNMAP/WRITE_ZEROES are implemented via fallocate(), Linux only. */
	case SPDK_BDEV_IO_TYPE_UNMAP:
		bdev_aio_unmap(bdev_io);
		return 0;

	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		bdev_aio_write_zeros(bdev_io);
		return 0;
#endif

	default:
		return -1;
	}
}
637 
638 static void
639 bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
640 {
641 	if (_bdev_aio_submit_request(ch, bdev_io) < 0) {
642 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
643 	}
644 }
645 
646 static bool
647 bdev_aio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
648 {
649 	struct file_disk *fdisk = ctx;
650 
651 	switch (io_type) {
652 	case SPDK_BDEV_IO_TYPE_READ:
653 	case SPDK_BDEV_IO_TYPE_WRITE:
654 	case SPDK_BDEV_IO_TYPE_FLUSH:
655 	case SPDK_BDEV_IO_TYPE_RESET:
656 		return true;
657 
658 	case SPDK_BDEV_IO_TYPE_UNMAP:
659 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
660 		return fdisk->fallocate;
661 
662 	default:
663 		return false;
664 	}
665 }
666 
667 #ifdef __FreeBSD__
/* Create the per-channel completion context: a kqueue that receives
 * SIGEV_KEVENT notifications for this channel's aiocbs. */
static int
bdev_aio_create_io(struct bdev_aio_io_channel *ch)
{
	ch->kqfd = kqueue();
	if (ch->kqfd < 0) {
		SPDK_ERRLOG("async I/O context setup failure: %s.\n", spdk_strerror(errno));
		return -1;
	}

	return 0;
}
679 
/* Tear down the per-channel kqueue created by bdev_aio_create_io(). */
static void
bdev_aio_destroy_io(struct bdev_aio_io_channel *ch)
{
	close(ch->kqfd);
}
685 #else
686 static int
687 bdev_aio_create_io(struct bdev_aio_io_channel *ch)
688 {
689 	if (io_setup(SPDK_AIO_QUEUE_DEPTH, &ch->io_ctx) < 0) {
690 		SPDK_ERRLOG("Async I/O context setup failure, likely due to exceeding kernel limit.\n");
691 		SPDK_ERRLOG("This limit may be increased using 'sysctl -w fs.aio-max-nr'.\n");
692 		return -1;
693 	}
694 
695 	return 0;
696 }
697 
/* Tear down the per-channel libaio context created by bdev_aio_create_io(). */
static void
bdev_aio_destroy_io(struct bdev_aio_io_channel *ch)
{
	io_destroy(ch->io_ctx);
}
703 #endif
704 
705 static int
706 bdev_aio_create_cb(void *io_device, void *ctx_buf)
707 {
708 	struct bdev_aio_io_channel *ch = ctx_buf;
709 	int rc;
710 
711 	rc = bdev_aio_create_io(ch);
712 	if (rc < 0) {
713 		return rc;
714 	}
715 
716 	ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&aio_if));
717 	TAILQ_INSERT_TAIL(&ch->group_ch->io_ch_head, ch, link);
718 
719 	return 0;
720 }
721 
722 static void
723 bdev_aio_destroy_cb(void *io_device, void *ctx_buf)
724 {
725 	struct bdev_aio_io_channel *ch = ctx_buf;
726 
727 	bdev_aio_destroy_io(ch);
728 
729 	assert(ch->group_ch);
730 	TAILQ_REMOVE(&ch->group_ch->io_ch_head, ch, link);
731 
732 	spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
733 }
734 
/* bdev fn_table hook: hand out this disk's per-thread io channel. */
static struct spdk_io_channel *
bdev_aio_get_io_channel(void *ctx)
{
	struct file_disk *fdisk = ctx;

	return spdk_get_io_channel(fdisk);
}
742 
743 
744 static int
745 bdev_aio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
746 {
747 	struct file_disk *fdisk = ctx;
748 
749 	spdk_json_write_named_object_begin(w, "aio");
750 
751 	spdk_json_write_named_string(w, "filename", fdisk->filename);
752 
753 	spdk_json_write_named_bool(w, "block_size_override", fdisk->block_size_override);
754 
755 	spdk_json_write_named_bool(w, "readonly", fdisk->readonly);
756 
757 	spdk_json_write_named_bool(w, "fallocate", fdisk->fallocate);
758 
759 	spdk_json_write_object_end(w);
760 
761 	return 0;
762 }
763 
764 static void
765 bdev_aio_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
766 {
767 	struct file_disk *fdisk = bdev->ctxt;
768 	const struct spdk_uuid *uuid = spdk_bdev_get_uuid(bdev);
769 
770 	spdk_json_write_object_begin(w);
771 
772 	spdk_json_write_named_string(w, "method", "bdev_aio_create");
773 
774 	spdk_json_write_named_object_begin(w, "params");
775 	spdk_json_write_named_string(w, "name", bdev->name);
776 	if (fdisk->block_size_override) {
777 		spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
778 	}
779 	spdk_json_write_named_string(w, "filename", fdisk->filename);
780 	spdk_json_write_named_bool(w, "readonly", fdisk->readonly);
781 	spdk_json_write_named_bool(w, "fallocate", fdisk->fallocate);
782 	if (!spdk_uuid_is_null(uuid)) {
783 		spdk_json_write_named_uuid(w, "uuid", uuid);
784 	}
785 	spdk_json_write_object_end(w);
786 
787 	spdk_json_write_object_end(w);
788 }
789 
/* bdev function table wired into every file_disk at creation time. */
static const struct spdk_bdev_fn_table aio_fn_table = {
	.destruct		= bdev_aio_destruct,
	.submit_request		= bdev_aio_submit_request,
	.io_type_supported	= bdev_aio_io_type_supported,
	.get_io_channel		= bdev_aio_get_io_channel,
	.dump_info_json		= bdev_aio_dump_info_json,
	.write_config_json	= bdev_aio_write_json_config,
};
798 
799 static void
800 aio_free_disk(struct file_disk *fdisk)
801 {
802 	if (fdisk == NULL) {
803 		return;
804 	}
805 	free(fdisk->filename);
806 	free(fdisk->disk.name);
807 	free(fdisk);
808 }
809 
810 static int
811 bdev_aio_register_interrupt(struct bdev_aio_group_channel *ch)
812 {
813 	int efd;
814 
815 	efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
816 	if (efd < 0) {
817 		return -1;
818 	}
819 
820 	ch->intr = SPDK_INTERRUPT_REGISTER(efd, bdev_aio_group_interrupt, ch);
821 	if (ch->intr == NULL) {
822 		close(efd);
823 		return -1;
824 	}
825 	ch->efd = efd;
826 
827 	return 0;
828 }
829 
/* Undo bdev_aio_register_interrupt(): drop the handler, close the eventfd,
 * and mark efd invalid again. */
static void
bdev_aio_unregister_interrupt(struct bdev_aio_group_channel *ch)
{
	spdk_interrupt_unregister(&ch->intr);
	close(ch->efd);
	ch->efd = -1;
}
837 
838 static int
839 bdev_aio_group_create_cb(void *io_device, void *ctx_buf)
840 {
841 	struct bdev_aio_group_channel *ch = ctx_buf;
842 	int rc;
843 
844 	TAILQ_INIT(&ch->io_ch_head);
845 	/* Initialize ch->efd to be invalid and unused. */
846 	ch->efd = -1;
847 	if (spdk_interrupt_mode_is_enabled()) {
848 		rc = bdev_aio_register_interrupt(ch);
849 		if (rc < 0) {
850 			SPDK_ERRLOG("Failed to prepare intr resource to bdev_aio\n");
851 			return rc;
852 		}
853 	}
854 
855 	ch->poller = SPDK_POLLER_REGISTER(bdev_aio_group_poll, ch, 0);
856 	spdk_poller_register_interrupt(ch->poller, NULL, NULL);
857 
858 	return 0;
859 }
860 
861 static void
862 bdev_aio_group_destroy_cb(void *io_device, void *ctx_buf)
863 {
864 	struct bdev_aio_group_channel *ch = ctx_buf;
865 
866 	if (!TAILQ_EMPTY(&ch->io_ch_head)) {
867 		SPDK_ERRLOG("Group channel of bdev aio has uncleared io channel\n");
868 	}
869 
870 	spdk_poller_unregister(&ch->poller);
871 	if (spdk_interrupt_mode_is_enabled()) {
872 		bdev_aio_unregister_interrupt(ch);
873 	}
874 }
875 
/* Create and register an aio bdev backed by 'filename'.
 *
 * name        - bdev name (duplicated).
 * filename    - backing file or block device path (duplicated).
 * block_size  - 0 to auto-detect; otherwise must be >= 512, a power of two,
 *               and >= the auto-detected size.
 * readonly    - open the file read-only and reject writes.
 * fallocate   - enable UNMAP/WRITE_ZEROES via fallocate() (Linux only).
 * uuid        - bdev UUID (copied).
 *
 * Returns 0 on success or a negative errno; on failure all partially
 * allocated state is released via the error_return path. */
int
create_aio_bdev(const char *name, const char *filename, uint32_t block_size, bool readonly,
		bool fallocate, const struct spdk_uuid *uuid)
{
	struct file_disk *fdisk;
	uint32_t detected_block_size;
	uint64_t disk_size;
	int rc;

#ifdef __FreeBSD__
	/* fallocate(2) is not available on FreeBSD. */
	if (fallocate) {
		SPDK_ERRLOG("Unable to support fallocate on this platform\n");
		return -ENOTSUP;
	}
#endif

	fdisk = calloc(1, sizeof(*fdisk));
	if (!fdisk) {
		SPDK_ERRLOG("Unable to allocate enough memory for aio backend\n");
		return -ENOMEM;
	}
	fdisk->readonly = readonly;
	fdisk->fallocate = fallocate;

	fdisk->filename = strdup(filename);
	if (!fdisk->filename) {
		rc = -ENOMEM;
		goto error_return;
	}

	if (bdev_aio_open(fdisk)) {
		SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, fdisk->fd, errno);
		rc = -errno;
		goto error_return;
	}

	disk_size = spdk_fd_get_size(fdisk->fd);

	fdisk->disk.name = strdup(name);
	if (!fdisk->disk.name) {
		rc = -ENOMEM;
		goto error_return;
	}
	fdisk->disk.product_name = "AIO disk";
	fdisk->disk.module = &aio_if;

	fdisk->disk.write_cache = 1;

	detected_block_size = spdk_fd_get_blocklen(fdisk->fd);
	if (block_size == 0) {
		/* User did not specify block size - use autodetected block size. */
		if (detected_block_size == 0) {
			SPDK_ERRLOG("Block size could not be auto-detected\n");
			rc = -EINVAL;
			goto error_return;
		}
		fdisk->block_size_override = false;
		block_size = detected_block_size;
	} else {
		/* User-specified size must not be smaller than the device's
		 * native block size; a mere mismatch is only warned about. */
		if (block_size < detected_block_size) {
			SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
				    "auto-detected block size %" PRIu32 "\n",
				    block_size, detected_block_size);
			rc = -EINVAL;
			goto error_return;
		} else if (detected_block_size != 0 && block_size != detected_block_size) {
			SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
				     "auto-detected block size %" PRIu32 "\n",
				     block_size, detected_block_size);
		}
		fdisk->block_size_override = true;
	}

	if (block_size < 512) {
		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
		rc = -EINVAL;
		goto error_return;
	}

	if (!spdk_u32_is_pow2(block_size)) {
		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
		rc = -EINVAL;
		goto error_return;
	}

	fdisk->disk.blocklen = block_size;
	/* Buffers still need alignment to the device's native block size even
	 * when a larger logical block size was forced by the user. */
	if (fdisk->block_size_override && detected_block_size) {
		fdisk->disk.required_alignment = spdk_u32log2(detected_block_size);
	} else {
		fdisk->disk.required_alignment = spdk_u32log2(block_size);
	}

	if (disk_size % fdisk->disk.blocklen != 0) {
		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
			    disk_size, fdisk->disk.blocklen);
		rc = -EINVAL;
		goto error_return;
	}

	fdisk->disk.blockcnt = disk_size / fdisk->disk.blocklen;
	fdisk->disk.ctxt = fdisk;
	spdk_uuid_copy(&fdisk->disk.uuid, uuid);

	fdisk->disk.fn_table = &aio_fn_table;

	spdk_io_device_register(fdisk, bdev_aio_create_cb, bdev_aio_destroy_cb,
				sizeof(struct bdev_aio_io_channel),
				fdisk->disk.name);
	rc = spdk_bdev_register(&fdisk->disk);
	if (rc) {
		spdk_io_device_unregister(fdisk, NULL);
		goto error_return;
	}

	TAILQ_INSERT_TAIL(&g_aio_disk_head, fdisk, link);
	return 0;

error_return:
	/* bdev_aio_close() is a no-op when the fd was never opened. */
	bdev_aio_close(fdisk);
	aio_free_disk(fdisk);
	return rc;
}
998 
/* No-op event callback used when opening a bdev descriptor only to inspect
 * it (see bdev_aio_rescan()). */
static void
dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
{
}
1003 
/* Re-read the backing file's size and, if it changed, update the bdev's
 * block count. Returns 0 on success (including "no change"), -ENODEV if
 * 'name' is not an aio bdev, or a negative errno from open/notify. */
int
bdev_aio_rescan(const char *name)
{
	struct spdk_bdev_desc *desc;
	struct spdk_bdev *bdev;
	struct file_disk *fdisk;
	uint64_t disk_size, blockcnt;
	int rc;

	/* Open read-only to pin the bdev while we inspect it. */
	rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &desc);
	if (rc != 0) {
		return rc;
	}

	bdev = spdk_bdev_desc_get_bdev(desc);
	if (bdev->module != &aio_if) {
		rc = -ENODEV;
		goto exit;
	}

	fdisk = SPDK_CONTAINEROF(bdev, struct file_disk, disk);
	disk_size = spdk_fd_get_size(fdisk->fd);
	blockcnt = disk_size / bdev->blocklen;

	if (bdev->blockcnt != blockcnt) {
		SPDK_NOTICELOG("AIO device is resized: bdev name %s, old block count %" PRIu64 ", new block count %"
			       PRIu64 "\n",
			       fdisk->filename,
			       bdev->blockcnt,
			       blockcnt);
		rc = spdk_bdev_notify_blockcnt_change(bdev, blockcnt);
		if (rc != 0) {
			SPDK_ERRLOG("Could not change num blocks for aio bdev: name %s, errno: %d.\n",
				    fdisk->filename, rc);
			goto exit;
		}
	}

exit:
	spdk_bdev_close(desc);
	return rc;
}
1046 
/* Carries the user's completion callback through the async unregister. */
struct delete_aio_bdev_ctx {
	delete_aio_bdev_complete cb_fn;		/* invoked with the unregister status */
	void *cb_arg;				/* opaque argument for cb_fn */
};
1051 
1052 static void
1053 aio_bdev_unregister_cb(void *arg, int bdeverrno)
1054 {
1055 	struct delete_aio_bdev_ctx *ctx = arg;
1056 
1057 	ctx->cb_fn(ctx->cb_arg, bdeverrno);
1058 	free(ctx);
1059 }
1060 
1061 void
1062 bdev_aio_delete(const char *name, delete_aio_bdev_complete cb_fn, void *cb_arg)
1063 {
1064 	struct delete_aio_bdev_ctx *ctx;
1065 	int rc;
1066 
1067 	ctx = calloc(1, sizeof(*ctx));
1068 	if (ctx == NULL) {
1069 		cb_fn(cb_arg, -ENOMEM);
1070 		return;
1071 	}
1072 
1073 	ctx->cb_fn = cb_fn;
1074 	ctx->cb_arg = cb_arg;
1075 	rc = spdk_bdev_unregister_by_name(name, &aio_if, aio_bdev_unregister_cb, ctx);
1076 	if (rc != 0) {
1077 		aio_bdev_unregister_cb(ctx, rc);
1078 	}
1079 }
1080 
/* Module init: register the group-channel io_device shared by all aio bdevs. */
static int
bdev_aio_initialize(void)
{
	spdk_io_device_register(&aio_if, bdev_aio_group_create_cb, bdev_aio_group_destroy_cb,
				sizeof(struct bdev_aio_group_channel), "aio_module");

	return 0;
}
1089 
/* Module teardown: unregister the group-channel io_device. */
static void
bdev_aio_fini(void)
{
	spdk_io_device_unregister(&aio_if, NULL);
}
1095 
1096 SPDK_LOG_REGISTER_COMPONENT(aio)
1097