xref: /spdk/module/fsdev/aio/fsdev_aio.c (revision 42d1bd28396630ca9cfb81bf7934fb8872df47f0)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  */
4 #include "spdk/stdinc.h"
5 #include "spdk/event.h"
6 #include "spdk/log.h"
7 #include "spdk/string.h"
8 #include "spdk/config.h"
9 #include "spdk/util.h"
10 #include "spdk/thread.h"
11 #include "aio_mgr.h"
12 #include "fsdev_aio.h"
13 
/* Sentinel status value. NOTE(review): presumably returned by handlers that complete
 * asynchronously - no user is visible in this part of the file, confirm against callers.
 */
#define IO_STATUS_ASYNC INT_MIN

/* Evaluate the argument as a void expression to silence "unused" warnings */
#ifndef UNUSED
#define UNUSED(x) (void)(x)
#endif

/* See https://libfuse.github.io/doxygen/structfuse__conn__info.html */
#define MAX_BACKGROUND (100)
#define TIME_GRAN (1)
#define MAX_AIOS 256
#define DEFAULT_WRITEBACK_CACHE true
#define DEFAULT_MAX_WRITE 0x00020000
#define DEFAULT_XATTR_ENABLED false
#define DEFAULT_SKIP_RW false
/* Reported as attr->valid_ms by file_object_fill_attr() */
#define DEFAULT_TIMEOUT_MS 0 /* to prevent the attribute caching */
29 
/*
 * Accessors for the nanosecond part of the struct stat timestamps.
 * The member name depends on the platform; when neither variant is available the
 * getters expand to 0 and the setters expand to a no-op.
 */
#ifdef SPDK_CONFIG_HAVE_STRUCT_STAT_ST_ATIM
/* Linux */
#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atim.tv_nsec)
#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctim.tv_nsec)
#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtim.tv_nsec)
#define ST_ATIM_NSEC_SET(stbuf, val) (stbuf)->st_atim.tv_nsec = (val)
#define ST_CTIM_NSEC_SET(stbuf, val) (stbuf)->st_ctim.tv_nsec = (val)
#define ST_MTIM_NSEC_SET(stbuf, val) (stbuf)->st_mtim.tv_nsec = (val)
#elif defined(SPDK_CONFIG_HAVE_STRUCT_STAT_ST_ATIMESPEC)
/* FreeBSD */
#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atimespec.tv_nsec)
#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctimespec.tv_nsec)
#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtimespec.tv_nsec)
#define ST_ATIM_NSEC_SET(stbuf, val) (stbuf)->st_atimespec.tv_nsec = (val)
#define ST_CTIM_NSEC_SET(stbuf, val) (stbuf)->st_ctimespec.tv_nsec = (val)
#define ST_MTIM_NSEC_SET(stbuf, val) (stbuf)->st_mtimespec.tv_nsec = (val)
#else
/* No nanosecond timestamp members known for this platform */
#define ST_ATIM_NSEC(stbuf) 0
#define ST_CTIM_NSEC(stbuf) 0
#define ST_MTIM_NSEC(stbuf) 0
#define ST_ATIM_NSEC_SET(stbuf, val) do { } while (0)
#define ST_CTIM_NSEC_SET(stbuf, val) do { } while (0)
#define ST_MTIM_NSEC_SET(stbuf, val) do { } while (0)
#endif
54 
/* Effective credentials pair saved/applied by lo_change_cred() and lo_restore_cred() */
struct lo_cred {
	uid_t euid;	/* effective user id */
	gid_t egid;	/* effective group id */
};
59 
/** Inode number type (64-bit unsigned) */
typedef uint64_t spdk_ino_t;
62 
/* Identity of a file object: lo_find_leaf_unsafe() matches leafs on both members */
struct lo_key {
	ino_t ino;	/* inode number (st_ino) */
	dev_t dev;	/* device id (st_dev) */
};
67 
/* An open handle (file or directory) attached to a file object */
struct spdk_fsdev_file_handle {
	/* Descriptor backing this handle */
	int fd;
	/* Directory iteration state; dp stays NULL unless the handle was made by lo_opendir() */
	struct {
		/* Directory stream created by fdopendir() on fd */
		DIR *dp;
		/* Entry read by readdir() but not consumed yet, or NULL */
		struct dirent *entry;
		/* Offset the stream is currently positioned at */
		off_t offset;
	} dir;
	/* File object this handle belongs to (the handle holds one reference on it) */
	struct spdk_fsdev_file_object *fobject;
	/* Link in fobject->handles */
	TAILQ_ENTRY(spdk_fsdev_file_handle) link;
};
78 
/* printf-style format and matching arguments used to log a file object's identity */
#define FOBJECT_FMT "ino=%" PRIu64 " dev=%" PRIu64
#define FOBJECT_ARGS(fo) ((uint64_t)(fo)->key.ino), ((uint64_t)(fo)->key.dev)
/* A node of the in-memory tree of looked-up files */
struct spdk_fsdev_file_object {
	/* Set from the file mode when the object is created */
	uint32_t is_symlink : 1;
	uint32_t is_dir : 1;
	uint32_t reserved : 30;
	/* Descriptor owned by the object; closed when the object is destroyed */
	int fd;
	/* Decimal string form of fd, used as a path relative to aio_fsdev::proc_self_fd */
	char *fd_str;
	/* (ino, dev) identity used to find the object among its parent's leafs */
	struct lo_key key;
	/* Reference count; the object is destroyed when it drops to zero */
	uint64_t refcount;
	/* Parent object (NULL when created without a parent); each leaf holds a reference on it */
	struct spdk_fsdev_file_object *parent_fobject;
	/* Link in parent_fobject->leafs */
	TAILQ_ENTRY(spdk_fsdev_file_object) link;
	/* Child objects */
	TAILQ_HEAD(, spdk_fsdev_file_object) leafs;
	/* Open handles attached to this object */
	TAILQ_HEAD(, spdk_fsdev_file_handle) handles;
	/* Taken around refcount and list updates */
	struct spdk_spinlock lock;
	/* NOTE(review): not populated by the creation code visible here - confirm usage */
	char name[];
};
96 
/* Per-filesystem state; embeds the generic fsdev (see fsdev_to_aio_fsdev()) */
struct aio_fsdev {
	/* Generic fsdev part; must stay embedded for SPDK_CONTAINEROF() */
	struct spdk_fsdev fsdev;
	/* Mount options the fsdev was created with; adjusted by lo_set_mount_opts() */
	struct spdk_fsdev_mount_opts mount_opts;
	/* NOTE(review): presumably the host directory backing the filesystem - confirm */
	char *root_path;
	/* Directory fd that file_object fd_str paths are resolved against
	 * (presumably /proc/self/fd - confirm where it is opened)
	 */
	int proc_self_fd;
	pthread_mutex_t mutex;
	/* Root file object, handed out by mount and root lookups */
	struct spdk_fsdev_file_object *root;
	/* NOTE(review): presumably the link in g_aio_fsdev_head - confirm */
	TAILQ_ENTRY(aio_fsdev) tailq;
	bool xattr_enabled;
	bool skip_rw;
};
108 
/* Driver context stored in spdk_fsdev_io::driver_ctx (see fsdev_to_aio_io()) */
struct aio_fsdev_io {
	struct spdk_aio_mgr_io *aio;
	struct aio_io_channel *ch;
	/* NOTE(review): presumably links the IO into one of the aio_io_channel queues - confirm */
	TAILQ_ENTRY(aio_fsdev_io) link;
};
114 
/* Per-channel state. NOTE(review): users are outside the visible part of the file. */
struct aio_io_channel {
	struct spdk_poller *poller;
	struct spdk_aio_mgr *mgr;
	TAILQ_HEAD(, aio_fsdev_io) ios_in_progress;
	TAILQ_HEAD(, aio_fsdev_io) ios_to_complete;
};
121 
/* File-scope list of aio_fsdev instances, statically initialized as empty */
static TAILQ_HEAD(, aio_fsdev) g_aio_fsdev_head = TAILQ_HEAD_INITIALIZER(
			g_aio_fsdev_head);
124 
125 static inline struct aio_fsdev *
126 fsdev_to_aio_fsdev(struct spdk_fsdev *fsdev)
127 {
128 	return SPDK_CONTAINEROF(fsdev, struct aio_fsdev, fsdev);
129 }
130 
131 static inline struct spdk_fsdev_io *
132 aio_to_fsdev_io(const struct aio_fsdev_io *aio_io)
133 {
134 	return SPDK_CONTAINEROF(aio_io, struct spdk_fsdev_io, driver_ctx);
135 }
136 
137 static inline struct aio_fsdev_io *
138 fsdev_to_aio_io(const struct spdk_fsdev_io *fsdev_io)
139 {
140 	return (struct aio_fsdev_io *)fsdev_io->driver_ctx;
141 }
142 
/* A file object is considered valid when it is non-NULL; vfsdev is not consulted. */
static inline bool
fsdev_aio_is_valid_fobject(struct aio_fsdev *vfsdev, struct spdk_fsdev_file_object *fobject)
{
	if (fobject == NULL) {
		return false;
	}

	return true;
}
148 
/* A file handle is considered valid when it is non-NULL; vfsdev is not consulted. */
static inline bool
fsdev_aio_is_valid_fhandle(struct aio_fsdev *vfsdev, struct spdk_fsdev_file_handle *fhandle)
{
	if (fhandle == NULL) {
		return false;
	}

	return true;
}
154 
/* Return non-zero when `name` is exactly "." or "..", zero otherwise. */
static int
is_dot_or_dotdot(const char *name)
{
	if (name[0] != '.') {
		return 0;
	}

	if (name[1] == '\0') {
		/* "." */
		return 1;
	}

	/* ".." */
	return name[1] == '.' && name[2] == '\0';
}
161 
/*
 * Return non-zero when `path` is a single path component (contains no '/')
 * that is neither "." nor "..".
 */
static int
is_safe_path_component(const char *path)
{
	const char *slash = strchr(path, '/');

	return slash ? 0 : !is_dot_or_dotdot(path);
}
172 
173 static struct spdk_fsdev_file_object *
174 lo_find_leaf_unsafe(struct spdk_fsdev_file_object *fobject, ino_t ino, dev_t dev)
175 {
176 	struct spdk_fsdev_file_object *leaf_fobject;
177 
178 	TAILQ_FOREACH(leaf_fobject, &fobject->leafs, link) {
179 		if (leaf_fobject->key.ino == ino && leaf_fobject->key.dev == dev) {
180 			return leaf_fobject;
181 		}
182 	}
183 
184 	return NULL;
185 }
186 
187 /* This function returns:
188  * 1 if the refcount is still non zero
189  * a negative  error number if the refcount became zero, the file object was deleted but the defered underlying file deletion failed
190  * 0 if the refcount became zero, the file object was deleted and eithr the underlying file deletion wasn't defered or succeeded
191  */
192 static int
193 file_object_unref(struct spdk_fsdev_file_object *fobject, uint32_t count)
194 {
195 	int res = 0;
196 
197 	spdk_spin_lock(&fobject->lock);
198 	assert(fobject->refcount >= count);
199 	fobject->refcount -= count;
200 	spdk_spin_unlock(&fobject->lock);
201 
202 	if (!fobject->refcount) {
203 		struct spdk_fsdev_file_object *parent_fobject = fobject->parent_fobject;
204 
205 		if (parent_fobject) {
206 			spdk_spin_lock(&parent_fobject->lock);
207 			TAILQ_REMOVE(&parent_fobject->leafs, fobject, link);
208 			spdk_spin_unlock(&parent_fobject->lock);
209 			file_object_unref(parent_fobject, 1); /* unref by the leaf */
210 		}
211 
212 		spdk_spin_destroy(&fobject->lock);
213 		close(fobject->fd);
214 		free(fobject->fd_str);
215 		free(fobject);
216 	}
217 
218 	return res;
219 }
220 
221 static void
222 file_object_ref(struct spdk_fsdev_file_object *fobject)
223 {
224 	spdk_spin_lock(&fobject->lock);
225 	fobject->refcount++;
226 	spdk_spin_unlock(&fobject->lock);
227 }
228 
229 static struct spdk_fsdev_file_object *
230 file_object_create_unsafe(struct spdk_fsdev_file_object *parent_fobject, int fd, ino_t ino,
231 			  dev_t dev, mode_t mode)
232 {
233 	struct spdk_fsdev_file_object *fobject;
234 
235 	fobject = calloc(1, sizeof(*fobject));
236 	if (!fobject) {
237 		SPDK_ERRLOG("Cannot alloc fobject\n");
238 		return NULL;
239 	}
240 
241 	fobject->fd_str = spdk_sprintf_alloc("%d", fd);
242 	if (!fobject->fd_str) {
243 		SPDK_ERRLOG("Cannot alloc fd_str\n");
244 		free(fobject);
245 		return NULL;
246 	}
247 
248 	fobject->fd = fd;
249 	fobject->key.ino = ino;
250 	fobject->key.dev = dev;
251 	fobject->refcount = 1;
252 	fobject->is_symlink = S_ISLNK(mode) ? 1 : 0;
253 	fobject->is_dir = S_ISDIR(mode) ? 1 : 0;
254 
255 	TAILQ_INIT(&fobject->handles);
256 	TAILQ_INIT(&fobject->leafs);
257 	spdk_spin_init(&fobject->lock);
258 
259 	if (parent_fobject) {
260 		fobject->parent_fobject = parent_fobject;
261 		TAILQ_INSERT_TAIL(&parent_fobject->leafs, fobject, link);
262 		parent_fobject->refcount++;
263 	}
264 
265 	return fobject;
266 }
267 
268 static struct spdk_fsdev_file_handle *
269 file_handle_create(struct spdk_fsdev_file_object *fobject, int fd)
270 {
271 	struct spdk_fsdev_file_handle *fhandle;
272 
273 	fhandle = calloc(1, sizeof(*fhandle));
274 	if (!fhandle) {
275 		SPDK_ERRLOG("Cannot alloc fhandle\n");
276 		return NULL;
277 	}
278 
279 	fhandle->fobject = fobject;
280 	fhandle->fd = fd;
281 
282 	spdk_spin_lock(&fobject->lock);
283 	fobject->refcount++;
284 	TAILQ_INSERT_TAIL(&fobject->handles, fhandle, link);
285 	spdk_spin_unlock(&fobject->lock);
286 
287 	return fhandle;
288 }
289 
290 static void
291 file_handle_delete(struct spdk_fsdev_file_handle *fhandle)
292 {
293 	struct spdk_fsdev_file_object *fobject = fhandle->fobject;
294 
295 	spdk_spin_lock(&fobject->lock);
296 	fobject->refcount--;
297 	TAILQ_REMOVE(&fobject->handles, fhandle, link);
298 	spdk_spin_unlock(&fobject->lock);
299 
300 	if (fhandle->dir.dp) {
301 		closedir(fhandle->dir.dp);
302 	}
303 
304 	close(fhandle->fd);
305 	free(fhandle);
306 }
307 
308 static int
309 file_object_fill_attr(struct spdk_fsdev_file_object *fobject, struct spdk_fsdev_file_attr *attr)
310 {
311 	struct stat stbuf;
312 	int res;
313 
314 	res = fstatat(fobject->fd, "", &stbuf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
315 	if (res == -1) {
316 		res = -errno;
317 		SPDK_ERRLOG("fstatat() failed with %d\n", res);
318 		return res;
319 	}
320 
321 	memset(attr, 0, sizeof(*attr));
322 
323 	attr->ino = stbuf.st_ino;
324 	attr->size = stbuf.st_size;
325 	attr->blocks = stbuf.st_blocks;
326 	attr->atime = stbuf.st_atime;
327 	attr->mtime = stbuf.st_mtime;
328 	attr->ctime = stbuf.st_ctime;
329 	attr->atimensec = ST_ATIM_NSEC(&stbuf);
330 	attr->mtimensec = ST_MTIM_NSEC(&stbuf);
331 	attr->ctimensec = ST_CTIM_NSEC(&stbuf);
332 	attr->mode = stbuf.st_mode;
333 	attr->nlink = stbuf.st_nlink;
334 	attr->uid = stbuf.st_uid;
335 	attr->gid = stbuf.st_gid;
336 	attr->rdev = stbuf.st_rdev;
337 	attr->blksize = stbuf.st_blksize;
338 	attr->valid_ms = DEFAULT_TIMEOUT_MS;
339 
340 	return 0;
341 }
342 
343 static int
344 utimensat_empty(struct aio_fsdev *vfsdev, struct spdk_fsdev_file_object *fobject,
345 		const struct timespec *tv)
346 {
347 	int res;
348 
349 	if (fobject->is_symlink) {
350 		res = utimensat(fobject->fd, "", tv, AT_EMPTY_PATH);
351 		if (res == -1 && errno == EINVAL) {
352 			/* Sorry, no race free way to set times on symlink. */
353 			errno = EPERM;
354 		}
355 	} else {
356 		res = utimensat(vfsdev->proc_self_fd, fobject->fd_str, tv, 0);
357 	}
358 
359 	return res;
360 }
361 
362 static void
363 fsdev_free_leafs(struct spdk_fsdev_file_object *fobject, bool unref_fobject)
364 {
365 	while (!TAILQ_EMPTY(&fobject->handles)) {
366 		struct spdk_fsdev_file_handle *fhandle = TAILQ_FIRST(&fobject->handles);
367 		file_handle_delete(fhandle);
368 #ifdef __clang_analyzer__
369 		/*
370 		 * scan-build fails to comprehend that file_handle_delete() removes the fhandle
371 		 * from the queue, so it thinks it's remained accessible and throws the "Use of
372 		 * memory after it is freed" error here.
373 		 * The loop below "teaches" the scan-build that the freed fhandle is not on the
374 		 * list anymore and supresses the error in this way.
375 		 */
376 		struct spdk_fsdev_file_handle *tmp;
377 		TAILQ_FOREACH(tmp, &fobject->handles, link) {
378 			assert(tmp != fhandle);
379 		}
380 #endif
381 	}
382 
383 	while (!TAILQ_EMPTY(&fobject->leafs)) {
384 		struct spdk_fsdev_file_object *leaf_fobject = TAILQ_FIRST(&fobject->leafs);
385 		fsdev_free_leafs(leaf_fobject, true);
386 	}
387 
388 	if (fobject->refcount && unref_fobject) {
389 		/* if still referenced - zero refcount */
390 		int res = file_object_unref(fobject, fobject->refcount);
391 		assert(res == 0);
392 		UNUSED(res);
393 	}
394 }
395 
396 static int
397 lo_getattr(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
398 {
399 	int res;
400 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
401 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.getattr.fobject;
402 
403 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
404 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
405 		return -EINVAL;
406 	}
407 
408 	res = file_object_fill_attr(fobject, &fsdev_io->u_out.getattr.attr);
409 	if (res) {
410 		SPDK_ERRLOG("Cannot fill attr for " FOBJECT_FMT " (err=%d)\n", FOBJECT_ARGS(fobject), res);
411 		return res;
412 	}
413 
414 	SPDK_DEBUGLOG(fsdev_aio, "GETATTR succeeded for " FOBJECT_FMT "\n", FOBJECT_ARGS(fobject));
415 	return 0;
416 }
417 
418 static int
419 lo_opendir(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
420 {
421 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
422 	int error;
423 	int fd;
424 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.opendir.fobject;
425 	uint32_t flags = fsdev_io->u_in.opendir.flags;
426 	struct spdk_fsdev_file_handle *fhandle = NULL;
427 
428 	UNUSED(flags);
429 
430 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
431 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
432 		return -EINVAL;
433 	}
434 
435 	fd = openat(fobject->fd, ".", O_RDONLY);
436 	if (fd == -1) {
437 		error = -errno;
438 		SPDK_ERRLOG("openat failed for " FOBJECT_FMT " (err=%d)\n", FOBJECT_ARGS(fobject), error);
439 		goto out_err;
440 	}
441 
442 	fhandle = file_handle_create(fobject, fd);
443 	if (fhandle == NULL) {
444 		error = -ENOMEM;
445 		SPDK_ERRLOG("file_handle_create failed for " FOBJECT_FMT " (err=%d)\n", FOBJECT_ARGS(fobject),
446 			    error);
447 		goto out_err;
448 	}
449 
450 	fhandle->dir.dp = fdopendir(fd);
451 	if (fhandle->dir.dp == NULL) {
452 		error = -errno;
453 		SPDK_ERRLOG("fdopendir failed for " FOBJECT_FMT " (err=%d)\n", FOBJECT_ARGS(fobject), error);
454 		goto out_err;
455 	}
456 
457 	fhandle->dir.offset = 0;
458 	fhandle->dir.entry = NULL;
459 
460 	SPDK_DEBUGLOG(fsdev_aio, "OPENDIR succeeded for " FOBJECT_FMT " (fh=%p)\n",
461 		      FOBJECT_ARGS(fobject), fhandle);
462 
463 	fsdev_io->u_out.opendir.fhandle = fhandle;
464 
465 	return 0;
466 
467 out_err:
468 	if (fhandle) {
469 		file_handle_delete(fhandle);
470 	} else if (fd != -1) {
471 		close(fd);
472 	}
473 
474 	return error;
475 }
476 
477 static int
478 lo_releasedir(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
479 {
480 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
481 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.releasedir.fobject;
482 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.releasedir.fhandle;
483 
484 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
485 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
486 		return -EINVAL;
487 	}
488 
489 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
490 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
491 		return -EINVAL;
492 	}
493 
494 	SPDK_DEBUGLOG(fsdev_aio, "RELEASEDIR succeeded for " FOBJECT_FMT " (fh=%p)\n",
495 		      FOBJECT_ARGS(fobject), fhandle);
496 
497 	file_handle_delete(fhandle);
498 
499 	return 0;
500 }
501 
502 static int
503 lo_set_mount_opts(struct aio_fsdev *vfsdev, struct spdk_fsdev_mount_opts *opts)
504 {
505 	assert(opts != NULL);
506 	assert(opts->opts_size != 0);
507 
508 	UNUSED(vfsdev);
509 
510 	if (opts->opts_size > offsetof(struct spdk_fsdev_mount_opts, max_write)) {
511 		/* Set the value the aio fsdev was created with */
512 		opts->max_write = vfsdev->mount_opts.max_write;
513 	}
514 
515 	if (opts->opts_size > offsetof(struct spdk_fsdev_mount_opts, writeback_cache_enabled)) {
516 		if (vfsdev->mount_opts.writeback_cache_enabled) {
517 			/* The writeback_cache_enabled was enabled upon creation => we follow the opts */
518 			vfsdev->mount_opts.writeback_cache_enabled = opts->writeback_cache_enabled;
519 		} else {
520 			/* The writeback_cache_enabled was disabled upon creation => we reflect it in the opts */
521 			opts->writeback_cache_enabled = false;
522 		}
523 	}
524 
525 	/* The AIO doesn't apply any additional restrictions, so we just accept the requested opts */
526 	SPDK_DEBUGLOG(fsdev_aio,
527 		      "aio filesystem %s: opts updated: max_write=%" PRIu32 ", writeback_cache=%" PRIu8 "\n",
528 		      vfsdev->fsdev.name, vfsdev->mount_opts.max_write, vfsdev->mount_opts.writeback_cache_enabled);
529 
530 	return 0;
531 }
532 
533 static int
534 lo_mount(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
535 {
536 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
537 	struct spdk_fsdev_mount_opts *in_opts = &fsdev_io->u_in.mount.opts;
538 
539 	fsdev_io->u_out.mount.opts = *in_opts;
540 	lo_set_mount_opts(vfsdev, &fsdev_io->u_out.mount.opts);
541 	file_object_ref(vfsdev->root);
542 	fsdev_io->u_out.mount.root_fobject = vfsdev->root;
543 
544 	return 0;
545 }
546 
547 static int
548 lo_umount(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
549 {
550 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
551 
552 	fsdev_free_leafs(vfsdev->root, false);
553 	file_object_unref(vfsdev->root, 1); /* reference by mount */
554 
555 	return 0;
556 }
557 
558 static int
559 lo_do_lookup(struct aio_fsdev *vfsdev, struct spdk_fsdev_file_object *parent_fobject,
560 	     const char *name, struct spdk_fsdev_file_object **pfobject,
561 	     struct spdk_fsdev_file_attr *attr)
562 {
563 	int newfd;
564 	int res;
565 	struct stat stat;
566 	struct spdk_fsdev_file_object *fobject;
567 
568 	/* Do not allow escaping root directory */
569 	if (parent_fobject == vfsdev->root && strcmp(name, "..") == 0) {
570 		name = ".";
571 	}
572 
573 	newfd = openat(parent_fobject->fd, name, O_PATH | O_NOFOLLOW);
574 	if (newfd == -1) {
575 		res = -errno;
576 		SPDK_DEBUGLOG(fsdev_aio, "openat( " FOBJECT_FMT " %s) failed with %d\n",
577 			      FOBJECT_ARGS(parent_fobject), name, res);
578 		return res;
579 	}
580 
581 	res = fstatat(newfd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
582 	if (res == -1) {
583 		res = -errno;
584 		SPDK_ERRLOG("fstatat(%s) failed with %d\n", name, res);
585 		close(newfd);
586 		return res;
587 	}
588 
589 	spdk_spin_lock(&parent_fobject->lock);
590 	fobject = lo_find_leaf_unsafe(parent_fobject, stat.st_ino, stat.st_dev);
591 	if (fobject) {
592 		close(newfd);
593 		newfd = -1;
594 		file_object_ref(fobject); /* reference by a lo_do_lookup caller */
595 	} else {
596 		fobject = file_object_create_unsafe(parent_fobject, newfd, stat.st_ino, stat.st_dev, stat.st_mode);
597 	}
598 	spdk_spin_unlock(&parent_fobject->lock);
599 
600 	if (!fobject) {
601 		SPDK_ERRLOG("Cannot create file object\n");
602 		close(newfd);
603 		return -ENOMEM;
604 	}
605 
606 	if (attr) {
607 		res = file_object_fill_attr(fobject, attr);
608 		if (res) {
609 			SPDK_ERRLOG("fill_attr(%s) failed with %d\n", name, res);
610 			file_object_unref(fobject, 1);
611 			if (newfd != -1) {
612 				close(newfd);
613 			}
614 			return res;
615 		}
616 	}
617 
618 	*pfobject = fobject;
619 
620 	SPDK_DEBUGLOG(fsdev_aio, "lookup(%s) in dir " FOBJECT_FMT ": "  FOBJECT_FMT " fd=%d\n",
621 		      name, FOBJECT_ARGS(parent_fobject), FOBJECT_ARGS(fobject), fobject->fd);
622 	return 0;
623 }
624 
625 static int
626 lo_lookup(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
627 {
628 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
629 	int err;
630 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.lookup.parent_fobject;
631 	char *name = fsdev_io->u_in.lookup.name;
632 
633 	if (!parent_fobject) {
634 		err = file_object_fill_attr(vfsdev->root, &fsdev_io->u_out.lookup.attr);
635 		if (err) {
636 			SPDK_DEBUGLOG(fsdev_aio, "file_object_fill_attr(root) failed with err=%d\n", err);
637 			return err;
638 		}
639 
640 		file_object_ref(vfsdev->root);
641 		fsdev_io->u_out.lookup.fobject = vfsdev->root;
642 		return 0;
643 	}
644 
645 	SPDK_DEBUGLOG(fsdev_aio, "  name %s\n", name);
646 
647 	/* Don't use is_safe_path_component(), allow "." and ".." for NFS export
648 	 * support.
649 	 */
650 	if (strchr(name, '/')) {
651 		return -EINVAL;
652 	}
653 
654 	err = lo_do_lookup(vfsdev, parent_fobject, name, &fsdev_io->u_out.lookup.fobject,
655 			   &fsdev_io->u_out.lookup.attr);
656 	if (err) {
657 		SPDK_DEBUGLOG(fsdev_aio, "lo_do_lookup(%s) failed with err=%d\n", name, err);
658 		return err;
659 	}
660 
661 	return 0;
662 }
663 
664 /*
665  * Change to uid/gid of caller so that file is created with ownership of caller.
666  */
667 static int
668 lo_change_cred(const struct lo_cred *new, struct lo_cred *old)
669 {
670 	int res;
671 
672 	old->euid = geteuid();
673 	old->egid = getegid();
674 
675 	res = syscall(SYS_setresgid, -1, new->egid, -1);
676 	if (res == -1) {
677 		return -errno;
678 	}
679 
680 	res = syscall(SYS_setresuid, -1, new->euid, -1);
681 	if (res == -1) {
682 		int errno_save = -errno;
683 
684 		syscall(SYS_setresgid, -1, old->egid, -1);
685 		return errno_save;
686 	}
687 
688 	return 0;
689 }
690 
691 /* Regain Privileges */
692 static void
693 lo_restore_cred(struct lo_cred *old)
694 {
695 	int res;
696 
697 	res = syscall(SYS_setresuid, -1, old->euid, -1);
698 	if (res == -1) {
699 		SPDK_ERRLOG("seteuid(%u)", old->euid);
700 	}
701 
702 	res = syscall(SYS_setresgid, -1, old->egid, -1);
703 	if (res == -1) {
704 		SPDK_ERRLOG("setegid(%u)", old->egid);
705 	}
706 }
707 
708 static int
709 lo_readdir(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
710 {
711 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
712 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.readdir.fobject;
713 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.readdir.fhandle;
714 	uint64_t offset = fsdev_io->u_in.readdir.offset;
715 
716 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
717 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
718 		return -EINVAL;
719 	}
720 
721 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
722 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
723 		return -EINVAL;
724 	}
725 
726 	if (((off_t)offset) != fhandle->dir.offset) {
727 		seekdir(fhandle->dir.dp, offset);
728 		fhandle->dir.entry = NULL;
729 		fhandle->dir.offset = offset;
730 	}
731 
732 	while (1) {
733 		off_t nextoff;
734 		const char *name;
735 		int res;
736 
737 		if (!fhandle->dir.entry) {
738 			errno = 0;
739 			fhandle->dir.entry = readdir(fhandle->dir.dp);
740 			if (!fhandle->dir.entry) {
741 				if (errno) {  /* Error */
742 					res = -errno;
743 					SPDK_ERRLOG("readdir failed with err=%d", res);
744 					return res;
745 				} else {  /* End of stream */
746 					break;
747 				}
748 			}
749 		}
750 
751 		nextoff = fhandle->dir.entry->d_off;
752 		name = fhandle->dir.entry->d_name;
753 
754 		/* Hide root's parent directory */
755 		if (fobject == vfsdev->root && strcmp(name, "..") == 0) {
756 			goto skip_entry;
757 		}
758 
759 		if (is_dot_or_dotdot(name)) {
760 			fsdev_io->u_out.readdir.fobject = NULL;
761 			memset(&fsdev_io->u_out.readdir.attr, 0, sizeof(fsdev_io->u_out.readdir.attr));
762 			fsdev_io->u_out.readdir.attr.ino = fhandle->dir.entry->d_ino;
763 			fsdev_io->u_out.readdir.attr.mode = DT_DIR << 12;
764 			goto skip_lookup;
765 		}
766 
767 		res = lo_do_lookup(vfsdev, fobject, name, &fsdev_io->u_out.readdir.fobject,
768 				   &fsdev_io->u_out.readdir.attr);
769 		if (res) {
770 			SPDK_DEBUGLOG(fsdev_aio, "lo_do_lookup(%s) failed with err=%d\n", name, res);
771 			return res;
772 		}
773 
774 skip_lookup:
775 		fsdev_io->u_out.readdir.name = name;
776 		fsdev_io->u_out.readdir.offset = nextoff;
777 
778 		res = fsdev_io->u_in.readdir.entry_cb_fn(fsdev_io, fsdev_io->internal.cb_arg);
779 		if (res) {
780 			if (fsdev_io->u_out.readdir.fobject) {
781 				file_object_unref(fsdev_io->u_out.readdir.fobject, 1);
782 			}
783 			break;
784 		}
785 
786 skip_entry:
787 		fhandle->dir.entry = NULL;
788 		fhandle->dir.offset = nextoff;
789 	}
790 
791 	SPDK_DEBUGLOG(fsdev_aio, "READDIR succeeded for " FOBJECT_FMT " (fh=%p, offset=%" PRIu64 ")\n",
792 		      FOBJECT_ARGS(fobject), fhandle, offset);
793 	return 0;
794 }
795 
796 static int
797 lo_forget(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
798 {
799 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
800 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.readdir.fobject;
801 	uint64_t nlookup = fsdev_io->u_in.forget.nlookup;
802 
803 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
804 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
805 		return -EINVAL;
806 	}
807 
808 	file_object_unref(fobject, nlookup);
809 
810 	return 0;
811 }
812 
813 static uint32_t
814 update_open_flags(struct aio_fsdev *vfsdev, uint32_t flags)
815 {
816 	/*
817 	 * With writeback cache, kernel may send read requests even
818 	 * when userspace opened write-only
819 	 */
820 	if (vfsdev->mount_opts.writeback_cache_enabled && (flags & O_ACCMODE) == O_WRONLY) {
821 		flags &= ~O_ACCMODE;
822 		flags |= O_RDWR;
823 	}
824 
825 	/*
826 	 * With writeback cache, O_APPEND is handled by the kernel.
827 	 * This breaks atomicity (since the file may change in the
828 	 * underlying filesystem, so that the kernel's idea of the
829 	 * end of the file isn't accurate anymore). In this example,
830 	 * we just accept that. A more rigorous filesystem may want
831 	 * to return an error here
832 	 */
833 	if (vfsdev->mount_opts.writeback_cache_enabled && (flags & O_APPEND)) {
834 		flags &= ~O_APPEND;
835 	}
836 
837 	/*
838 	 * O_DIRECT in guest should not necessarily mean bypassing page
839 	 * cache on host as well. If somebody needs that behavior, it
840 	 * probably should be a configuration knob in daemon.
841 	 */
842 	flags &= ~O_DIRECT;
843 
844 	return flags;
845 }
846 
847 static int
848 lo_open(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
849 {
850 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
851 	int fd, saverr;
852 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.open.fobject;
853 	uint32_t flags = fsdev_io->u_in.open.flags;
854 	struct spdk_fsdev_file_handle *fhandle;
855 
856 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
857 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
858 		return -EINVAL;
859 	}
860 
861 	flags = update_open_flags(vfsdev, flags);
862 
863 	fd = openat(vfsdev->proc_self_fd, fobject->fd_str, flags & ~O_NOFOLLOW);
864 	if (fd == -1) {
865 		saverr = -errno;
866 		SPDK_ERRLOG("openat(%d, %s, 0x%08" PRIx32 ") failed with err=%d\n",
867 			    vfsdev->proc_self_fd, fobject->fd_str, flags, saverr);
868 		return saverr;
869 	}
870 
871 	fhandle = file_handle_create(fobject, fd);
872 	if (!fhandle) {
873 		SPDK_ERRLOG("cannot create a file handle (fd=%d)\n", fd);
874 		close(fd);
875 		return -ENOMEM;
876 	}
877 
878 	fsdev_io->u_out.open.fhandle = fhandle;
879 
880 	SPDK_DEBUGLOG(fsdev_aio, "OPEN succeeded for " FOBJECT_FMT " (fh=%p, fd=%d)\n",
881 		      FOBJECT_ARGS(fobject), fhandle, fd);
882 
883 	return 0;
884 }
885 
886 static int
887 lo_flush(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
888 {
889 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
890 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.flush.fobject;
891 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.flush.fhandle;
892 	int res, saverr;
893 
894 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
895 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
896 		return -EINVAL;
897 	}
898 
899 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
900 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
901 		return -EINVAL;
902 	}
903 
904 	res = close(dup(fhandle->fd));
905 	if (res) {
906 		saverr = -errno;
907 		SPDK_ERRLOG("close(dup(%d)) failed for " FOBJECT_FMT " (fh=%p, err=%d)\n",
908 			    fhandle->fd, FOBJECT_ARGS(fobject), fhandle, saverr);
909 		return saverr;
910 	}
911 
912 	SPDK_DEBUGLOG(fsdev_aio, "FLUSH succeeded for " FOBJECT_FMT " (fh=%p)\n", FOBJECT_ARGS(fobject),
913 		      fhandle);
914 
915 	return 0;
916 }
917 
918 static int
919 lo_setattr(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
920 {
921 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
922 	int saverr;
923 	int res;
924 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.setattr.fobject;
925 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.setattr.fhandle;
926 	uint32_t to_set = fsdev_io->u_in.setattr.to_set;
927 	struct spdk_fsdev_file_attr *attr = &fsdev_io->u_in.setattr.attr;
928 
929 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
930 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
931 		return -EINVAL;
932 	}
933 
934 	if (to_set & FSDEV_SET_ATTR_MODE) {
935 		if (fhandle) {
936 			res = fchmod(fhandle->fd, attr->mode);
937 		} else {
938 			res = fchmodat(vfsdev->proc_self_fd, fobject->fd_str, attr->mode, 0);
939 		}
940 		if (res == -1) {
941 			saverr = -errno;
942 			SPDK_ERRLOG("fchmod failed for " FOBJECT_FMT "\n", FOBJECT_ARGS(fobject));
943 			return saverr;
944 		}
945 	}
946 
947 	if (to_set & (FSDEV_SET_ATTR_UID | FSDEV_SET_ATTR_GID)) {
948 		uid_t uid = (to_set & FSDEV_SET_ATTR_UID) ? attr->uid : (uid_t) -1;
949 		gid_t gid = (to_set & FSDEV_SET_ATTR_GID) ? attr->gid : (gid_t) -1;
950 
951 		res = fchownat(fobject->fd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
952 		if (res == -1) {
953 			saverr = -errno;
954 			SPDK_ERRLOG("fchownat failed for " FOBJECT_FMT "\n", FOBJECT_ARGS(fobject));
955 			return saverr;
956 		}
957 	}
958 
959 	if (to_set & FSDEV_SET_ATTR_SIZE) {
960 		int truncfd;
961 
962 		if (fhandle) {
963 			truncfd = fhandle->fd;
964 		} else {
965 			truncfd = openat(vfsdev->proc_self_fd, fobject->fd_str, O_RDWR);
966 			if (truncfd < 0) {
967 				saverr = -errno;
968 				SPDK_ERRLOG("openat failed for " FOBJECT_FMT "\n", FOBJECT_ARGS(fobject));
969 				return saverr;
970 			}
971 		}
972 
973 		res = ftruncate(truncfd, attr->size);
974 		if (!fhandle) {
975 			saverr = -errno;
976 			close(truncfd);
977 			errno = saverr;
978 		}
979 		if (res == -1) {
980 			saverr = -errno;
981 			SPDK_ERRLOG("ftruncate failed for " FOBJECT_FMT " (size=%" PRIu64 ")\n", FOBJECT_ARGS(fobject),
982 				    attr->size);
983 			return saverr;
984 		}
985 	}
986 
987 	if (to_set & (FSDEV_SET_ATTR_ATIME | FSDEV_SET_ATTR_MTIME)) {
988 		struct timespec tv[2];
989 
990 		tv[0].tv_sec = 0;
991 		tv[1].tv_sec = 0;
992 		tv[0].tv_nsec = UTIME_OMIT;
993 		tv[1].tv_nsec = UTIME_OMIT;
994 
995 		if (to_set & FSDEV_SET_ATTR_ATIME_NOW) {
996 			tv[0].tv_nsec = UTIME_NOW;
997 		} else if (to_set & FSDEV_SET_ATTR_ATIME) {
998 			tv[0].tv_sec = attr->atime;
999 			tv[0].tv_nsec = attr->atimensec;
1000 		}
1001 
1002 		if (to_set & FSDEV_SET_ATTR_MTIME_NOW) {
1003 			tv[1].tv_nsec = UTIME_NOW;
1004 		} else if (to_set & FSDEV_SET_ATTR_MTIME) {
1005 			tv[1].tv_sec = attr->mtime;
1006 			tv[1].tv_nsec = attr->mtimensec;
1007 		}
1008 
1009 		if (fhandle) {
1010 			res = futimens(fhandle->fd, tv);
1011 		} else {
1012 			res = utimensat_empty(vfsdev, fobject, tv);
1013 		}
1014 		if (res == -1) {
1015 			saverr = -errno;
1016 			SPDK_ERRLOG("futimens/utimensat_empty failed for " FOBJECT_FMT "\n",
1017 				    FOBJECT_ARGS(fobject));
1018 			return saverr;
1019 		}
1020 	}
1021 
1022 	res = file_object_fill_attr(fobject, &fsdev_io->u_out.setattr.attr);
1023 	if (res) {
1024 		SPDK_ERRLOG("file_object_fill_attr failed for " FOBJECT_FMT "\n",
1025 			    FOBJECT_ARGS(fobject));
1026 		return res;
1027 	}
1028 
1029 	SPDK_DEBUGLOG(fsdev_aio, "SETATTR succeeded for " FOBJECT_FMT "\n",
1030 		      FOBJECT_ARGS(fobject));
1031 
1032 	return 0;
1033 }
1034 
1035 static int
1036 lo_create(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1037 {
1038 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1039 	int fd;
1040 	int err;
1041 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.create.parent_fobject;
1042 	const char *name = fsdev_io->u_in.create.name;
1043 	uint32_t mode = fsdev_io->u_in.create.mode;
1044 	uint32_t flags = fsdev_io->u_in.create.flags;
1045 	uint32_t umask = fsdev_io->u_in.create.umask;
1046 	struct lo_cred old_cred, new_cred = {
1047 		.euid = fsdev_io->u_in.create.euid,
1048 		.egid = fsdev_io->u_in.create.egid,
1049 	};
1050 	struct spdk_fsdev_file_object *fobject;
1051 	struct spdk_fsdev_file_handle *fhandle;
1052 	struct spdk_fsdev_file_attr *attr = &fsdev_io->u_out.create.attr;
1053 
1054 	if (!fsdev_aio_is_valid_fobject(vfsdev, parent_fobject)) {
1055 		SPDK_ERRLOG("Invalid parent_fobject: %p\n", parent_fobject);
1056 		return -EINVAL;
1057 	}
1058 
1059 	UNUSED(umask);
1060 
1061 	if (!is_safe_path_component(name)) {
1062 		SPDK_ERRLOG("CREATE: %s not a safe component\n", name);
1063 		return -EINVAL;
1064 	}
1065 
1066 	err = lo_change_cred(&new_cred, &old_cred);
1067 	if (err) {
1068 		SPDK_ERRLOG("CREATE: cannot change credentials\n");
1069 		return err;
1070 	}
1071 
1072 	flags = update_open_flags(vfsdev, flags);
1073 
1074 	fd = openat(parent_fobject->fd, name, (flags | O_CREAT) & ~O_NOFOLLOW, mode);
1075 	err = fd == -1 ? -errno : 0;
1076 	lo_restore_cred(&old_cred);
1077 
1078 	if (err) {
1079 		SPDK_ERRLOG("CREATE: openat failed with %d\n", err);
1080 		return err;
1081 	}
1082 
1083 	err = lo_do_lookup(vfsdev, parent_fobject, name, &fobject, attr);
1084 	if (err) {
1085 		SPDK_ERRLOG("CREATE: lookup failed with %d\n", err);
1086 		return err;
1087 	}
1088 
1089 	fhandle = file_handle_create(fobject, fd);
1090 	if (!fhandle) {
1091 		SPDK_ERRLOG("cannot create a file handle (fd=%d)\n", fd);
1092 		close(fd);
1093 		file_object_unref(fobject, 1);
1094 		return -ENOMEM;
1095 	}
1096 
1097 	SPDK_DEBUGLOG(fsdev_aio, "CREATE: succeeded (name=%s " FOBJECT_FMT " fh=%p)\n",
1098 		      name, FOBJECT_ARGS(fobject), fhandle);
1099 
1100 	fsdev_io->u_out.create.fobject = fobject;
1101 	fsdev_io->u_out.create.fhandle = fhandle;
1102 
1103 	return 0;
1104 }
1105 
1106 static int
1107 lo_release(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1108 {
1109 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1110 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.release.fobject;
1111 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.release.fhandle;
1112 
1113 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1114 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1115 		return -EINVAL;
1116 	}
1117 
1118 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
1119 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
1120 		return -EINVAL;
1121 	}
1122 
1123 	SPDK_DEBUGLOG(fsdev_aio, "RELEASE succeeded for " FOBJECT_FMT " fh=%p)\n",
1124 		      FOBJECT_ARGS(fobject), fhandle);
1125 
1126 	file_handle_delete(fhandle);
1127 
1128 	return 0;
1129 }
1130 
1131 static void
1132 lo_read_cb(void *ctx, uint32_t data_size, int error)
1133 {
1134 	struct spdk_fsdev_io *fsdev_io = ctx;
1135 	struct aio_fsdev_io *vfsdev_io = fsdev_to_aio_io(fsdev_io);
1136 
1137 	if (vfsdev_io->aio) {
1138 		TAILQ_REMOVE(&vfsdev_io->ch->ios_in_progress, vfsdev_io, link);
1139 	}
1140 
1141 	fsdev_io->u_out.read.data_size = data_size;
1142 
1143 	spdk_fsdev_io_complete(fsdev_io, error);
1144 }
1145 
1146 static int
1147 lo_read(struct spdk_io_channel *_ch, struct spdk_fsdev_io *fsdev_io)
1148 {
1149 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1150 	struct aio_io_channel *ch = spdk_io_channel_get_ctx(_ch);
1151 	struct aio_fsdev_io *vfsdev_io = fsdev_to_aio_io(fsdev_io);
1152 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.read.fobject;
1153 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.read.fhandle;
1154 	size_t size = fsdev_io->u_in.read.size;
1155 	uint64_t offs = fsdev_io->u_in.read.offs;
1156 	uint32_t flags = fsdev_io->u_in.read.flags;
1157 	struct iovec *outvec = fsdev_io->u_in.read.iov;
1158 	uint32_t outcnt = fsdev_io->u_in.read.iovcnt;
1159 
1160 	/* we don't suport the memory domains at the moment */
1161 	assert(!fsdev_io->u_in.read.opts || !fsdev_io->u_in.read.opts->memory_domain);
1162 
1163 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1164 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1165 		return -EINVAL;
1166 	}
1167 
1168 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
1169 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
1170 		return -EINVAL;
1171 	}
1172 
1173 	UNUSED(flags);
1174 
1175 	if (!outcnt || !outvec) {
1176 		SPDK_ERRLOG("bad outvec: iov=%p outcnt=%" PRIu32 "\n", outvec, outcnt);
1177 		return -EINVAL;
1178 	}
1179 
1180 	if (vfsdev->skip_rw) {
1181 		uint32_t i;
1182 
1183 		fsdev_io->u_out.read.data_size = 0;
1184 
1185 		for (i = 0; i < outcnt; i++, outvec++) {
1186 			fsdev_io->u_out.read.data_size += outvec->iov_len;
1187 		}
1188 
1189 		TAILQ_INSERT_TAIL(&ch->ios_to_complete, vfsdev_io, link);
1190 
1191 		return IO_STATUS_ASYNC;
1192 	}
1193 
1194 	vfsdev_io->aio = spdk_aio_mgr_read(ch->mgr, lo_read_cb, fsdev_io, fhandle->fd, offs, size, outvec,
1195 					   outcnt);
1196 	if (vfsdev_io->aio) {
1197 		vfsdev_io->ch = ch;
1198 		TAILQ_INSERT_TAIL(&ch->ios_in_progress, vfsdev_io, link);
1199 	}
1200 
1201 	return IO_STATUS_ASYNC;
1202 }
1203 
1204 static void
1205 lo_write_cb(void *ctx, uint32_t data_size, int error)
1206 {
1207 	struct spdk_fsdev_io *fsdev_io = ctx;
1208 	struct aio_fsdev_io *vfsdev_io = fsdev_to_aio_io(fsdev_io);
1209 
1210 	if (vfsdev_io->aio) {
1211 		TAILQ_REMOVE(&vfsdev_io->ch->ios_in_progress, vfsdev_io, link);
1212 	}
1213 
1214 	fsdev_io->u_out.write.data_size = data_size;
1215 
1216 	spdk_fsdev_io_complete(fsdev_io, error);
1217 }
1218 
1219 static int
1220 lo_write(struct spdk_io_channel *_ch, struct spdk_fsdev_io *fsdev_io)
1221 {
1222 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1223 	struct aio_io_channel *ch = spdk_io_channel_get_ctx(_ch);
1224 	struct aio_fsdev_io *vfsdev_io = fsdev_to_aio_io(fsdev_io);
1225 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.write.fobject;
1226 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.write.fhandle;
1227 	size_t size = fsdev_io->u_in.write.size;
1228 	uint64_t offs = fsdev_io->u_in.write.offs;
1229 	uint32_t flags = fsdev_io->u_in.write.flags;
1230 	const struct iovec *invec = fsdev_io->u_in.write.iov;
1231 	uint32_t incnt =  fsdev_io->u_in.write.iovcnt;
1232 
1233 	/* we don't suport the memory domains at the moment */
1234 	assert(!fsdev_io->u_in.write.opts || !fsdev_io->u_in.write.opts->memory_domain);
1235 
1236 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1237 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1238 		return -EINVAL;
1239 	}
1240 
1241 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
1242 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
1243 		return -EINVAL;
1244 	}
1245 
1246 	UNUSED(flags);
1247 
1248 	if (!incnt || !invec) { /* there should be at least one iovec with data */
1249 		SPDK_ERRLOG("bad invec: iov=%p cnt=%" PRIu32 "\n", invec, incnt);
1250 		return -EINVAL;
1251 	}
1252 
1253 	if (vfsdev->skip_rw) {
1254 		uint32_t i;
1255 
1256 		fsdev_io->u_out.write.data_size = 0;
1257 		for (i = 0; i < incnt; i++, invec++) {
1258 			fsdev_io->u_out.write.data_size += invec->iov_len;
1259 		}
1260 
1261 		TAILQ_INSERT_TAIL(&ch->ios_to_complete, vfsdev_io, link);
1262 
1263 		return IO_STATUS_ASYNC;
1264 	}
1265 
1266 	vfsdev_io->aio = spdk_aio_mgr_write(ch->mgr, lo_write_cb, fsdev_io,
1267 					    fhandle->fd, offs, size, invec, incnt);
1268 	if (vfsdev_io->aio) {
1269 		vfsdev_io->ch = ch;
1270 		TAILQ_INSERT_TAIL(&ch->ios_in_progress, vfsdev_io, link);
1271 	}
1272 
1273 	return IO_STATUS_ASYNC;
1274 }
1275 
1276 static int
1277 lo_readlink(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1278 {
1279 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1280 	int res;
1281 	char *buf;
1282 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.readlink.fobject;
1283 
1284 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1285 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1286 		return -EINVAL;
1287 	}
1288 
1289 	buf = malloc(PATH_MAX + 1);
1290 	if (!buf) {
1291 		SPDK_ERRLOG("malloc(%zu) failed\n", (size_t)(PATH_MAX + 1));
1292 		return -ENOMEM;
1293 	}
1294 
1295 	res = readlinkat(fobject->fd, "", buf, PATH_MAX + 1);
1296 	if (res == -1) {
1297 		int saverr = -errno;
1298 		SPDK_ERRLOG("readlinkat failed for " FOBJECT_FMT " with %d\n",
1299 			    FOBJECT_ARGS(fobject), saverr);
1300 		free(buf);
1301 		return saverr;
1302 	}
1303 
1304 	if (((uint32_t)res) == PATH_MAX + 1) {
1305 		SPDK_ERRLOG("buffer is too short\n");
1306 		free(buf);
1307 		return -ENAMETOOLONG;
1308 	}
1309 
1310 	buf[res] = 0;
1311 	fsdev_io->u_out.readlink.linkname = buf;
1312 
1313 	return 0;
1314 }
1315 
1316 static int
1317 lo_statfs(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1318 {
1319 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1320 	int res;
1321 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.statfs.fobject;
1322 	struct statvfs stbuf;
1323 
1324 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1325 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1326 		return -EINVAL;
1327 	}
1328 
1329 	res = fstatvfs(fobject->fd, &stbuf);
1330 	if (res == -1) {
1331 		int saverr = -errno;
1332 		SPDK_ERRLOG("fstatvfs failed with %d\n", saverr);
1333 		return saverr;
1334 	}
1335 
1336 	fsdev_io->u_out.statfs.statfs.blocks = stbuf.f_blocks;
1337 	fsdev_io->u_out.statfs.statfs.bfree = stbuf.f_bfree;
1338 	fsdev_io->u_out.statfs.statfs.bavail = stbuf.f_bavail;
1339 	fsdev_io->u_out.statfs.statfs.files = stbuf.f_files;
1340 	fsdev_io->u_out.statfs.statfs.ffree = stbuf.f_ffree;
1341 	fsdev_io->u_out.statfs.statfs.bsize = stbuf.f_bsize;
1342 	fsdev_io->u_out.statfs.statfs.namelen = stbuf.f_namemax;
1343 	fsdev_io->u_out.statfs.statfs.frsize = stbuf.f_frsize;
1344 
1345 	return 0;
1346 }
1347 
1348 static int
1349 lo_mknod_symlink(struct spdk_fsdev_io *fsdev_io, struct spdk_fsdev_file_object *parent_fobject,
1350 		 const char *name, mode_t mode, dev_t rdev, const char *link, uid_t euid, gid_t egid,
1351 		 struct spdk_fsdev_file_object **pfobject, struct spdk_fsdev_file_attr *attr)
1352 {
1353 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1354 	int res;
1355 	int saverr;
1356 	struct lo_cred old_cred, new_cred = {
1357 		.euid = euid,
1358 		.egid = egid,
1359 	};
1360 
1361 	if (!fsdev_aio_is_valid_fobject(vfsdev, parent_fobject)) {
1362 		SPDK_ERRLOG("Invalid parent_fobject: %p\n", parent_fobject);
1363 		return -EINVAL;
1364 	}
1365 
1366 	if (!is_safe_path_component(name)) {
1367 		SPDK_ERRLOG("%s isn'h safe\n", name);
1368 		return -EINVAL;
1369 	}
1370 
1371 	res = lo_change_cred(&new_cred, &old_cred);
1372 	if (res) {
1373 		SPDK_ERRLOG("cannot change cred (err=%d)\n", res);
1374 		return res;
1375 	}
1376 
1377 	if (S_ISDIR(mode)) {
1378 		res = mkdirat(parent_fobject->fd, name, mode);
1379 	} else if (S_ISLNK(mode)) {
1380 		if (link) {
1381 			res = symlinkat(link, parent_fobject->fd, name);
1382 		} else {
1383 			SPDK_ERRLOG("NULL link pointer\n");
1384 			errno = EINVAL;
1385 		}
1386 	} else {
1387 		res = mknodat(parent_fobject->fd, name, mode, rdev);
1388 	}
1389 	saverr = -errno;
1390 
1391 	lo_restore_cred(&old_cred);
1392 
1393 	if (res == -1) {
1394 		SPDK_ERRLOG("cannot mkdirat/symlinkat/mknodat (err=%d)\n", saverr);
1395 		return saverr;
1396 	}
1397 
1398 	res = lo_do_lookup(vfsdev, parent_fobject, name, pfobject, attr);
1399 	if (res) {
1400 		SPDK_ERRLOG("lookup failed (err=%d)\n", res);
1401 		return res;
1402 	}
1403 
1404 	SPDK_DEBUGLOG(fsdev_aio, "lo_mknod_symlink(" FOBJECT_FMT "/%s -> " FOBJECT_FMT "\n",
1405 		      FOBJECT_ARGS(parent_fobject), name, FOBJECT_ARGS(*pfobject));
1406 
1407 	return 0;
1408 }
1409 
1410 static int
1411 lo_mknod(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1412 {
1413 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.mknod.parent_fobject;
1414 	char *name = fsdev_io->u_in.mknod.name;
1415 	mode_t mode = fsdev_io->u_in.mknod.mode;
1416 	dev_t rdev = fsdev_io->u_in.mknod.rdev;
1417 	uid_t euid = fsdev_io->u_in.mknod.euid;
1418 	gid_t egid = fsdev_io->u_in.mknod.egid;
1419 
1420 	return lo_mknod_symlink(fsdev_io, parent_fobject, name, mode, rdev, NULL, euid, egid,
1421 				&fsdev_io->u_out.mknod.fobject, &fsdev_io->u_out.mknod.attr);
1422 }
1423 
1424 static int
1425 lo_mkdir(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1426 {
1427 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.mkdir.parent_fobject;
1428 	char *name = fsdev_io->u_in.mkdir.name;
1429 	mode_t mode = fsdev_io->u_in.mkdir.mode;
1430 	uid_t euid = fsdev_io->u_in.mkdir.euid;
1431 	gid_t egid = fsdev_io->u_in.mkdir.egid;
1432 
1433 	return lo_mknod_symlink(fsdev_io, parent_fobject, name, S_IFDIR | mode, 0, NULL, euid, egid,
1434 				&fsdev_io->u_out.mkdir.fobject, &fsdev_io->u_out.mkdir.attr);
1435 }
1436 
1437 static int
1438 lo_symlink(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1439 {
1440 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.symlink.parent_fobject;
1441 	char *target = fsdev_io->u_in.symlink.target;
1442 	char *linkpath = fsdev_io->u_in.symlink.linkpath;
1443 	uid_t euid = fsdev_io->u_in.symlink.euid;
1444 	gid_t egid = fsdev_io->u_in.symlink.egid;
1445 
1446 	return lo_mknod_symlink(fsdev_io, parent_fobject, target, S_IFLNK, 0, linkpath, euid, egid,
1447 				&fsdev_io->u_out.symlink.fobject, &fsdev_io->u_out.symlink.attr);
1448 }
1449 
1450 static int
1451 lo_do_unlink(struct aio_fsdev *vfsdev, struct spdk_fsdev_file_object *parent_fobject,
1452 	     const char *name, bool is_dir)
1453 {
1454 	/* fobject must be initialized to avoid a scan-build false positive */
1455 	struct spdk_fsdev_file_object *fobject = NULL;
1456 	int res;
1457 
1458 	if (!fsdev_aio_is_valid_fobject(vfsdev, parent_fobject)) {
1459 		SPDK_ERRLOG("Invalid parent_fobject: %p\n", parent_fobject);
1460 		return -EINVAL;
1461 	}
1462 
1463 	if (!is_safe_path_component(name)) {
1464 		SPDK_ERRLOG("%s isn't safe\n", name);
1465 		return -EINVAL;
1466 	}
1467 
1468 	res = lo_do_lookup(vfsdev, parent_fobject, name, &fobject, NULL);
1469 	if (res) {
1470 		SPDK_ERRLOG("can't find '%s' under " FOBJECT_FMT "\n", name, FOBJECT_ARGS(parent_fobject));
1471 		return -EIO;
1472 	}
1473 
1474 	res = unlinkat(parent_fobject->fd, name, is_dir ? AT_REMOVEDIR : 0);
1475 	if (res) {
1476 		res = -errno;
1477 		SPDK_WARNLOG("unlinkat(" FOBJECT_FMT " %s) failed (err=%d)\n",
1478 			     FOBJECT_ARGS(parent_fobject), name, res);
1479 	}
1480 
1481 	file_object_unref(fobject, 1);
1482 	return res;
1483 }
1484 
1485 static int
1486 lo_unlink(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1487 {
1488 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1489 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.unlink.parent_fobject;
1490 	char *name = fsdev_io->u_in.unlink.name;
1491 
1492 	return lo_do_unlink(vfsdev, parent_fobject, name, false);
1493 }
1494 
1495 static int
1496 lo_rmdir(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1497 {
1498 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1499 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.rmdir.parent_fobject;
1500 	char *name = fsdev_io->u_in.rmdir.name;
1501 
1502 	return lo_do_unlink(vfsdev, parent_fobject, name, true);
1503 }
1504 
1505 static int
1506 lo_rename(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1507 {
1508 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1509 	int res, saverr;
1510 	/* old_fobject must be initialized to avoid a scan-build false positive */
1511 	struct spdk_fsdev_file_object *old_fobject = NULL;
1512 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.rename.parent_fobject;
1513 	char *name = fsdev_io->u_in.rename.name;
1514 	struct spdk_fsdev_file_object *new_parent_fobject = fsdev_io->u_in.rename.new_parent_fobject;
1515 	char *new_name = fsdev_io->u_in.rename.new_name;
1516 	uint32_t flags = fsdev_io->u_in.rename.flags;
1517 
1518 	if (!fsdev_aio_is_valid_fobject(vfsdev, parent_fobject)) {
1519 		SPDK_ERRLOG("Invalid parent_fobject: %p\n", parent_fobject);
1520 		return -EINVAL;
1521 	}
1522 
1523 	if (!fsdev_aio_is_valid_fobject(vfsdev, new_parent_fobject)) {
1524 		SPDK_ERRLOG("Invalid new_parent_fobject: %p\n", new_parent_fobject);
1525 		return -EINVAL;
1526 	}
1527 
1528 	if (!is_safe_path_component(name)) {
1529 		SPDK_ERRLOG("name '%s' isn't safe\n", name);
1530 		return -EINVAL;
1531 	}
1532 
1533 	if (!is_safe_path_component(new_name)) {
1534 		SPDK_ERRLOG("newname '%s' isn't safe\n", new_name);
1535 		return -EINVAL;
1536 	}
1537 
1538 	res = lo_do_lookup(vfsdev, parent_fobject, name, &old_fobject, NULL);
1539 	if (res) {
1540 		SPDK_ERRLOG("can't find '%s' under " FOBJECT_FMT "\n", name, FOBJECT_ARGS(parent_fobject));
1541 		return -EIO;
1542 	}
1543 
1544 	saverr = 0;
1545 	if (flags) {
1546 #ifndef SYS_renameat2
1547 		SPDK_ERRLOG("flags are not supported\n");
1548 		return -ENOTSUP;
1549 #else
1550 		res = syscall(SYS_renameat2, parent_fobject->fd, name, new_parent_fobject->fd,
1551 			      new_name, flags);
1552 		if (res == -1 && errno == ENOSYS) {
1553 			SPDK_ERRLOG("SYS_renameat2 returned ENOSYS\n");
1554 			saverr = -EINVAL;
1555 		} else if (res == -1) {
1556 			saverr = -errno;
1557 			SPDK_ERRLOG("SYS_renameat2 failed (err=%d))\n", saverr);
1558 		}
1559 #endif
1560 	} else {
1561 		res = renameat(parent_fobject->fd, name, new_parent_fobject->fd, new_name);
1562 		if (res == -1) {
1563 			saverr = -errno;
1564 			SPDK_ERRLOG("renameat failed (err=%d)\n", saverr);
1565 		}
1566 	}
1567 
1568 	file_object_unref(old_fobject, 1);
1569 
1570 	return saverr;
1571 }
1572 
1573 static int
1574 linkat_empty_nofollow(struct aio_fsdev *vfsdev, struct spdk_fsdev_file_object *fobject, int dfd,
1575 		      const char *name)
1576 {
1577 	int res;
1578 
1579 	if (fobject->is_symlink) {
1580 		res = linkat(fobject->fd, "", dfd, name, AT_EMPTY_PATH);
1581 		if (res == -1 && (errno == ENOENT || errno == EINVAL)) {
1582 			/* Sorry, no race free way to hard-link a symlink. */
1583 			errno = EPERM;
1584 		}
1585 	} else {
1586 		res = linkat(vfsdev->proc_self_fd, fobject->fd_str, dfd, name, AT_SYMLINK_FOLLOW);
1587 	}
1588 
1589 	return res;
1590 }
1591 
1592 static int
1593 lo_link(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1594 {
1595 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1596 	int res;
1597 	int saverr;
1598 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.link.fobject;
1599 	struct spdk_fsdev_file_object *new_parent_fobject = fsdev_io->u_in.link.new_parent_fobject;
1600 	char *name = fsdev_io->u_in.link.name;
1601 
1602 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1603 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1604 		return -EINVAL;
1605 	}
1606 
1607 	if (!is_safe_path_component(name)) {
1608 		SPDK_ERRLOG("%s is not a safe component\n", name);
1609 		return -EINVAL;
1610 	}
1611 
1612 	res = linkat_empty_nofollow(vfsdev, fobject, new_parent_fobject->fd, name);
1613 	if (res == -1) {
1614 		saverr = -errno;
1615 		SPDK_ERRLOG("linkat_empty_nofollow failed " FOBJECT_FMT " -> " FOBJECT_FMT " name=%s (err=%d)\n",
1616 			    FOBJECT_ARGS(fobject), FOBJECT_ARGS(new_parent_fobject), name, saverr);
1617 		return saverr;
1618 	}
1619 
1620 	res = lo_do_lookup(vfsdev, new_parent_fobject, name, &fsdev_io->u_out.link.fobject,
1621 			   &fsdev_io->u_out.link.attr);
1622 	if (res) {
1623 		SPDK_ERRLOG("lookup failed (err=%d)\n", res);
1624 		return res;
1625 	}
1626 
1627 	SPDK_DEBUGLOG(fsdev_aio, "LINK succeeded for " FOBJECT_FMT " -> " FOBJECT_FMT " name=%s\n",
1628 		      FOBJECT_ARGS(fobject), FOBJECT_ARGS(fsdev_io->u_out.link.fobject), name);
1629 
1630 	return 0;
1631 }
1632 
1633 static int
1634 lo_fsync(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1635 {
1636 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1637 	int res, saverr, fd;
1638 	char *buf;
1639 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.fsync.fobject;
1640 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.fsync.fhandle;
1641 	bool datasync = fsdev_io->u_in.fsync.datasync;
1642 
1643 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1644 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1645 		return -EINVAL;
1646 	}
1647 
1648 	if (!fhandle) {
1649 		res = asprintf(&buf, "%i", fobject->fd);
1650 		if (res == -1) {
1651 			saverr = -errno;
1652 			SPDK_ERRLOG("asprintf failed (errno=%d)\n", saverr);
1653 			return saverr;
1654 		}
1655 
1656 		fd = openat(vfsdev->proc_self_fd, buf, O_RDWR);
1657 		saverr = -errno;
1658 		free(buf);
1659 		if (fd == -1) {
1660 			SPDK_ERRLOG("openat failed (errno=%d)\n", saverr);
1661 			return saverr;
1662 		}
1663 	} else {
1664 		fd = fhandle->fd;
1665 	}
1666 
1667 	if (datasync) {
1668 		res = fdatasync(fd);
1669 	} else {
1670 		res = fsync(fd);
1671 	}
1672 
1673 	saverr = -errno;
1674 	if (!fhandle) {
1675 		close(fd);
1676 	}
1677 
1678 	if (res == -1) {
1679 		SPDK_ERRLOG("fdatasync/fsync failed for " FOBJECT_FMT " fh=%p (err=%d)\n",
1680 			    FOBJECT_ARGS(fobject), fhandle, saverr);
1681 		return saverr;
1682 	}
1683 
1684 	SPDK_DEBUGLOG(fsdev_aio, "FSYNC succeeded for " FOBJECT_FMT " fh=%p\n",
1685 		      FOBJECT_ARGS(fobject), fhandle);
1686 
1687 	return 0;
1688 }
1689 
1690 static int
1691 lo_setxattr(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1692 {
1693 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1694 	ssize_t ret;
1695 	int saverr;
1696 	int fd = -1;
1697 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.setxattr.fobject;
1698 	char *name = fsdev_io->u_in.setxattr.name;
1699 	char *value = fsdev_io->u_in.setxattr.value;
1700 	uint32_t size = fsdev_io->u_in.setxattr.size;
1701 	uint32_t flags = fsdev_io->u_in.setxattr.flags;
1702 
1703 	if (!vfsdev->xattr_enabled) {
1704 		SPDK_INFOLOG(fsdev_aio, "xattr is disabled by config\n");
1705 		return -ENOSYS;
1706 	}
1707 
1708 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1709 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1710 		return -EINVAL;
1711 	}
1712 
1713 	if (fobject->is_symlink) {
1714 		/* Sorry, no race free way to removexattr on symlink. */
1715 		SPDK_ERRLOG("cannot set xattr for symlink\n");
1716 		return -EPERM;
1717 	}
1718 
1719 	fd = openat(vfsdev->proc_self_fd, fobject->fd_str, O_RDWR);
1720 	if (fd < 0) {
1721 		saverr = -errno;
1722 		SPDK_ERRLOG("openat failed with errno=%d\n", saverr);
1723 		return saverr;
1724 	}
1725 
1726 	ret = fsetxattr(fd, name, value, size, flags);
1727 	saverr = -errno;
1728 	close(fd);
1729 	if (ret == -1) {
1730 		if (saverr == -ENOTSUP) {
1731 			SPDK_INFOLOG(fsdev_aio, "flistxattr: extended attributes are not supported or disabled\n");
1732 		} else {
1733 			SPDK_ERRLOG("flistxattr failed with errno=%d\n", saverr);
1734 		}
1735 		return saverr;
1736 	}
1737 
1738 	SPDK_DEBUGLOG(fsdev_aio,
1739 		      "SETXATTR succeeded for " FOBJECT_FMT " name=%s value=%s size=%" PRIu32 "flags=0x%x" PRIx32 "\n",
1740 		      FOBJECT_ARGS(fobject), name, value, size, flags);
1741 
1742 	return 0;
1743 }
1744 
1745 static int
1746 lo_getxattr(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1747 {
1748 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1749 	ssize_t ret;
1750 	int saverr;
1751 	int fd = -1;
1752 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.getxattr.fobject;
1753 	char *name = fsdev_io->u_in.getxattr.name;
1754 	void *buffer = fsdev_io->u_in.getxattr.buffer;
1755 	size_t size = fsdev_io->u_in.getxattr.size;
1756 
1757 	if (!vfsdev->xattr_enabled) {
1758 		SPDK_INFOLOG(fsdev_aio, "xattr is disabled by config\n");
1759 		return -ENOSYS;
1760 	}
1761 
1762 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1763 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1764 		return -EINVAL;
1765 	}
1766 
1767 	if (fobject->is_symlink) {
1768 		/* Sorry, no race free way to getxattr on symlink. */
1769 		SPDK_ERRLOG("cannot get xattr for symlink\n");
1770 		return -EPERM;
1771 	}
1772 
1773 	fd = openat(vfsdev->proc_self_fd, fobject->fd_str, O_RDWR);
1774 	if (fd < 0) {
1775 		saverr = -errno;
1776 		SPDK_ERRLOG("openat failed with errno=%d\n", saverr);
1777 		return saverr;
1778 	}
1779 
1780 	ret = fgetxattr(fd, name, buffer, size);
1781 	saverr = -errno;
1782 	close(fd);
1783 	if (ret == -1) {
1784 		if (saverr == -ENODATA) {
1785 			SPDK_INFOLOG(fsdev_aio, "fgetxattr: no extended attribute '%s' found\n", name);
1786 		} else if (saverr == -ENOTSUP) {
1787 			SPDK_INFOLOG(fsdev_aio, "fgetxattr: extended attributes are not supported or disabled\n");
1788 		} else {
1789 			SPDK_ERRLOG("fgetxattr failed with errno=%d\n", saverr);
1790 		}
1791 		return saverr;
1792 	}
1793 
1794 	fsdev_io->u_out.getxattr.value_size = ret;
1795 
1796 	SPDK_DEBUGLOG(fsdev_aio,
1797 		      "GETXATTR succeeded for " FOBJECT_FMT " name=%s value=%s value_size=%zd\n",
1798 		      FOBJECT_ARGS(fobject), name, (char *)buffer, ret);
1799 
1800 	return 0;
1801 }
1802 
1803 static int
1804 lo_listxattr(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1805 {
1806 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1807 	ssize_t ret;
1808 	int saverr;
1809 	int fd = -1;
1810 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.listxattr.fobject;
1811 	char *buffer = fsdev_io->u_in.listxattr.buffer;
1812 	size_t size = fsdev_io->u_in.listxattr.size;
1813 
1814 	if (!vfsdev->xattr_enabled) {
1815 		SPDK_INFOLOG(fsdev_aio, "xattr is disabled by config\n");
1816 		return -ENOSYS;
1817 	}
1818 
1819 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1820 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1821 		return -EINVAL;
1822 	}
1823 
1824 	if (fobject->is_symlink) {
1825 		/* Sorry, no race free way to listxattr on symlink. */
1826 		SPDK_ERRLOG("cannot list xattr for symlink\n");
1827 		return -EPERM;
1828 	}
1829 
1830 	fd = openat(vfsdev->proc_self_fd, fobject->fd_str, O_RDONLY);
1831 	if (fd < 0) {
1832 		saverr = -errno;
1833 		SPDK_ERRLOG("openat failed with errno=%d\n", saverr);
1834 		return saverr;
1835 	}
1836 
1837 	ret = flistxattr(fd, buffer, size);
1838 	saverr = -errno;
1839 	close(fd);
1840 	if (ret == -1) {
1841 		if (saverr == -ENOTSUP) {
1842 			SPDK_INFOLOG(fsdev_aio, "flistxattr: extended attributes are not supported or disabled\n");
1843 		} else {
1844 			SPDK_ERRLOG("flistxattr failed with errno=%d\n", saverr);
1845 		}
1846 		return saverr;
1847 	}
1848 
1849 	fsdev_io->u_out.listxattr.data_size = ret;
1850 	fsdev_io->u_out.listxattr.size_only = (size == 0);
1851 
1852 	SPDK_DEBUGLOG(fsdev_aio, "LISTXATTR succeeded for " FOBJECT_FMT " data_size=%zu\n",
1853 		      FOBJECT_ARGS(fobject), ret);
1854 
1855 	return 0;
1856 }
1857 
1858 static int
1859 lo_removexattr(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1860 {
1861 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1862 	ssize_t ret;
1863 	int saverr;
1864 	int fd = -1;
1865 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.removexattr.fobject;
1866 	char *name = fsdev_io->u_in.removexattr.name;
1867 
1868 	if (!vfsdev->xattr_enabled) {
1869 		SPDK_INFOLOG(fsdev_aio, "xattr is disabled by config\n");
1870 		return -ENOSYS;
1871 	}
1872 
1873 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1874 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1875 		return -EINVAL;
1876 	}
1877 
1878 	if (fobject->is_symlink) {
1879 		/* Sorry, no race free way to setxattr on symlink. */
1880 		SPDK_ERRLOG("cannot list xattr for symlink\n");
1881 		return -EPERM;
1882 	}
1883 
1884 	fd = openat(vfsdev->proc_self_fd, fobject->fd_str, O_RDONLY);
1885 	if (fd < 0) {
1886 		saverr = -errno;
1887 		SPDK_ERRLOG("openat failed with errno=%d\n", saverr);
1888 		return saverr;
1889 	}
1890 
1891 	ret = fremovexattr(fd, name);
1892 	saverr = -errno;
1893 	close(fd);
1894 	if (ret == -1) {
1895 		if (saverr == -ENODATA) {
1896 			SPDK_INFOLOG(fsdev_aio, "fremovexattr: no extended attribute '%s' found\n", name);
1897 		} else if (saverr == -ENOTSUP) {
1898 			SPDK_INFOLOG(fsdev_aio, "fremovexattr: extended attributes are not supported or disabled\n");
1899 		} else {
1900 			SPDK_ERRLOG("fremovexattr failed with errno=%d\n", saverr);
1901 		}
1902 		return saverr;
1903 	}
1904 
1905 	SPDK_DEBUGLOG(fsdev_aio, "REMOVEXATTR succeeded for " FOBJECT_FMT " name=%s\n",
1906 		      FOBJECT_ARGS(fobject), name);
1907 
1908 	return 0;
1909 }
1910 
1911 static int
1912 lo_fsyncdir(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1913 {
1914 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1915 	int res;
1916 	int saverr = 0;
1917 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.fsyncdir.fobject;
1918 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.fsyncdir.fhandle;
1919 	bool datasync = fsdev_io->u_in.fsyncdir.datasync;
1920 
1921 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1922 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1923 		return -EINVAL;
1924 	}
1925 
1926 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
1927 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
1928 		return -EINVAL;
1929 	}
1930 
1931 	if (datasync) {
1932 		res = fdatasync(fhandle->fd);
1933 	} else {
1934 		res = fsync(fhandle->fd);
1935 	}
1936 
1937 	if (res == -1) {
1938 		saverr = -errno;
1939 		SPDK_ERRLOG("%s failed for fh=%p with err=%d\n",
1940 			    datasync ? "fdatasync" : "fsync", fhandle, saverr);
1941 		return saverr;
1942 	}
1943 
1944 	SPDK_DEBUGLOG(fsdev_aio, "FSYNCDIR succeeded for " FOBJECT_FMT " fh=%p datasync=%d\n",
1945 		      FOBJECT_ARGS(fobject), fhandle, datasync);
1946 
1947 	return 0;
1948 }
1949 
1950 static int
1951 lo_flock(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1952 {
1953 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1954 	int res;
1955 	int saverr = 0;
1956 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.flock.fobject;
1957 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.flock.fhandle;
1958 	int operation = fsdev_io->u_in.flock.operation;
1959 
1960 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1961 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1962 		return -EINVAL;
1963 	}
1964 
1965 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
1966 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
1967 		return -EINVAL;
1968 	}
1969 
1970 	res = flock(fhandle->fd, operation | LOCK_NB);
1971 	if (res == -1) {
1972 		saverr = -errno;
1973 		SPDK_ERRLOG("flock failed for fh=%p with err=%d\n", fhandle, saverr);
1974 		return saverr;
1975 	}
1976 
1977 	SPDK_DEBUGLOG(fsdev_aio, "FLOCK succeeded for " FOBJECT_FMT " fh=%p operation=%d\n",
1978 		      FOBJECT_ARGS(fobject), fhandle, operation);
1979 
1980 	return 0;
1981 }
1982 
1983 static int
1984 lo_fallocate(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1985 {
1986 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1987 	int err;
1988 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.fallocate.fobject;
1989 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.fallocate.fhandle;
1990 	uint32_t mode = fsdev_io->u_in.fallocate.mode;
1991 	uint64_t offset  = fsdev_io->u_in.fallocate.offset;
1992 	uint64_t length = fsdev_io->u_in.fallocate.length;
1993 
1994 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1995 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1996 		return -EINVAL;
1997 	}
1998 
1999 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
2000 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
2001 		return -EINVAL;
2002 	}
2003 
2004 	if (mode) {
2005 		SPDK_ERRLOG("non-zero mode is not suppored\n");
2006 		return -EOPNOTSUPP;
2007 	}
2008 
2009 	err = posix_fallocate(fhandle->fd, offset, length);
2010 	if (err) {
2011 		SPDK_ERRLOG("posix_fallocate failed for fh=%p with err=%d\n",
2012 			    fhandle, err);
2013 	}
2014 
2015 	SPDK_DEBUGLOG(fsdev_aio,
2016 		      "FALLOCATE returns %d for " FOBJECT_FMT " fh=%p offset=%" PRIu64 " length=%" PRIu64 "\n",
2017 		      err, FOBJECT_ARGS(fobject), fhandle, offset, length);
2018 	return err;
2019 }
2020 
/*
 * COPY_FILE_RANGE handler: copies up to `len` bytes between two open files
 * using copy_file_range().
 *
 * Both file objects and both file handles are validated (input pair first,
 * then output pair) before the copy is attempted.
 *
 * Returns 0 on success, -EINVAL when any object/handle is invalid, the negated
 * errno when copy_file_range() fails, or -ENOSYS when the build does not define
 * SPDK_CONFIG_COPY_FILE_RANGE.
 */
static int
lo_copy_file_range(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
{
#ifndef SPDK_CONFIG_COPY_FILE_RANGE
	return -ENOSYS;
#else
	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
	struct spdk_fsdev_file_object *src_obj = fsdev_io->u_in.copy_file_range.fobject_in;
	struct spdk_fsdev_file_handle *src_fh = fsdev_io->u_in.copy_file_range.fhandle_in;
	struct spdk_fsdev_file_object *dst_obj = fsdev_io->u_in.copy_file_range.fobject_out;
	struct spdk_fsdev_file_handle *dst_fh = fsdev_io->u_in.copy_file_range.fhandle_out;
	off_t src_off = fsdev_io->u_in.copy_file_range.off_in;
	off_t dst_off = fsdev_io->u_in.copy_file_range.off_out;
	size_t len = fsdev_io->u_in.copy_file_range.len;
	uint32_t flags = fsdev_io->u_in.copy_file_range.flags;
	ssize_t copied;
	int rc;

	/* Source side first ... */
	if (!fsdev_aio_is_valid_fobject(vfsdev, src_obj)) {
		SPDK_ERRLOG("Invalid fobject_in: %p\n", src_obj);
		return -EINVAL;
	}

	if (!fsdev_aio_is_valid_fhandle(vfsdev, src_fh)) {
		SPDK_ERRLOG("Invalid fhandle_in: %p\n", src_fh);
		return -EINVAL;
	}

	/* ... then the destination side. */
	if (!fsdev_aio_is_valid_fobject(vfsdev, dst_obj)) {
		SPDK_ERRLOG("Invalid fobject_out: %p\n", dst_obj);
		return -EINVAL;
	}

	if (!fsdev_aio_is_valid_fhandle(vfsdev, dst_fh)) {
		SPDK_ERRLOG("Invalid fhandle_out: %p\n", dst_fh);
		return -EINVAL;
	}

	/* The offsets are local copies handed over by address. */
	copied = copy_file_range(src_fh->fd, &src_off, dst_fh->fd, &dst_off, len, flags);
	if (copied < 0) {
		/* Capture errno before logging can disturb it. */
		rc = -errno;
		SPDK_ERRLOG("copy_file_range failed with err=%d\n", rc);
		return rc;
	}

	/* The offsets printed here are the values left in the locals after the call. */
	SPDK_DEBUGLOG(fsdev_aio,
		      "COPY_FILE_RANGE succeeded for " FOBJECT_FMT " fh=%p offset=%" PRIu64 " -> " FOBJECT_FMT
		      " fh=%p offset=%" PRIu64 " (len-%zu flags=0x%" PRIx32 ")\n",
		      FOBJECT_ARGS(src_obj), src_fh, (uint64_t)src_off, FOBJECT_ARGS(dst_obj), dst_fh,
		      (uint64_t)dst_off, len, flags);

	return 0;
#endif
}
2075 
2076 static int
2077 lo_abort(struct spdk_io_channel *_ch, struct spdk_fsdev_io *fsdev_io)
2078 {
2079 	struct aio_io_channel *ch = spdk_io_channel_get_ctx(_ch);
2080 	struct aio_fsdev_io *vfsdev_io;
2081 	uint64_t unique_to_abort = fsdev_io->u_in.abort.unique_to_abort;
2082 
2083 	TAILQ_FOREACH(vfsdev_io, &ch->ios_in_progress, link) {
2084 		struct spdk_fsdev_io *_fsdev_io = aio_to_fsdev_io(vfsdev_io);
2085 		if (spdk_fsdev_io_get_unique(_fsdev_io) == unique_to_abort) {
2086 			spdk_aio_mgr_cancel(ch->mgr, vfsdev_io->aio);
2087 			return 0;
2088 		}
2089 	}
2090 
2091 	return 0;
2092 }
2093 
2094 static int
2095 aio_io_poll(void *arg)
2096 {
2097 	struct aio_fsdev_io *vfsdev_io, *tmp;
2098 	struct aio_io_channel *ch = arg;
2099 	int res = SPDK_POLLER_IDLE;
2100 
2101 	if (spdk_aio_mgr_poll(ch->mgr)) {
2102 		res = SPDK_POLLER_BUSY;
2103 	}
2104 
2105 	TAILQ_FOREACH_SAFE(vfsdev_io, &ch->ios_to_complete, link, tmp) {
2106 		struct spdk_fsdev_io *fsdev_io = aio_to_fsdev_io(vfsdev_io);
2107 
2108 		TAILQ_REMOVE(&ch->ios_to_complete, vfsdev_io, link);
2109 		spdk_fsdev_io_complete(fsdev_io, 0);
2110 		res = SPDK_POLLER_BUSY;
2111 	}
2112 
2113 	return res;
2114 }
2115 
2116 static int
2117 aio_fsdev_create_cb(void *io_device, void *ctx_buf)
2118 {
2119 	struct aio_io_channel *ch = ctx_buf;
2120 	struct spdk_thread *thread = spdk_get_thread();
2121 
2122 	ch->mgr = spdk_aio_mgr_create(MAX_AIOS);
2123 	if (!ch->mgr) {
2124 		SPDK_ERRLOG("aoi manager init for failed (thread=%s)\n", spdk_thread_get_name(thread));
2125 		return -ENOMEM;
2126 	}
2127 
2128 	ch->poller = SPDK_POLLER_REGISTER(aio_io_poll, ch, 0);
2129 	TAILQ_INIT(&ch->ios_in_progress);
2130 	TAILQ_INIT(&ch->ios_to_complete);
2131 
2132 	SPDK_DEBUGLOG(fsdev_aio, "Created aio fsdev IO channel: thread %s, thread id %" PRIu64
2133 		      "\n",
2134 		      spdk_thread_get_name(thread), spdk_thread_get_id(thread));
2135 	return 0;
2136 }
2137 
2138 static void
2139 aio_fsdev_destroy_cb(void *io_device, void *ctx_buf)
2140 {
2141 	struct aio_io_channel *ch = ctx_buf;
2142 	struct spdk_thread *thread = spdk_get_thread();
2143 
2144 	UNUSED(thread);
2145 
2146 	spdk_poller_unregister(&ch->poller);
2147 	spdk_aio_mgr_delete(ch->mgr);
2148 
2149 	SPDK_DEBUGLOG(fsdev_aio, "Destroyed aio fsdev IO channel: thread %s, thread id %" PRIu64
2150 		      "\n",
2151 		      spdk_thread_get_name(thread), spdk_thread_get_id(thread));
2152 }
2153 
2154 static int
2155 fsdev_aio_initialize(void)
2156 {
2157 	/*
2158 	 * We need to pick some unique address as our "io device" - so just use the
2159 	 *  address of the global tailq.
2160 	 */
2161 	spdk_io_device_register(&g_aio_fsdev_head,
2162 				aio_fsdev_create_cb, aio_fsdev_destroy_cb,
2163 				sizeof(struct aio_io_channel), "aio_fsdev");
2164 
2165 	return 0;
2166 }
2167 
/*
 * Callback passed to spdk_io_device_unregister() by fsdev_aio_finish().
 * Intentionally does nothing at the moment.
 */
static void
_fsdev_aio_finish_cb(void *arg)
{
	UNUSED(arg);

	/* @todo: handle async module fini */
	/* spdk_fsdev_module_fini_done(); */
}
2174 
2175 static void
2176 fsdev_aio_finish(void)
2177 {
2178 	spdk_io_device_unregister(&g_aio_fsdev_head, _fsdev_aio_finish_cb);
2179 }
2180 
2181 static int
2182 fsdev_aio_get_ctx_size(void)
2183 {
2184 	return sizeof(struct aio_fsdev_io);
2185 }
2186 
2187 static struct spdk_fsdev_module aio_fsdev_module = {
2188 	.name = "aio",
2189 	.module_init = fsdev_aio_initialize,
2190 	.module_fini = fsdev_aio_finish,
2191 	.get_ctx_size	= fsdev_aio_get_ctx_size,
2192 };
2193 
2194 SPDK_FSDEV_MODULE_REGISTER(aio, &aio_fsdev_module);
2195 
2196 static void
2197 fsdev_aio_free(struct aio_fsdev *vfsdev)
2198 {
2199 	if (vfsdev->proc_self_fd != -1) {
2200 		close(vfsdev->proc_self_fd);
2201 	}
2202 
2203 	if (vfsdev->root) {
2204 		int destroyed = file_object_unref(vfsdev->root, 1);
2205 		assert(destroyed == 0);
2206 		UNUSED(destroyed);
2207 
2208 	}
2209 
2210 	free(vfsdev->fsdev.name);
2211 	free(vfsdev->root_path);
2212 
2213 	free(vfsdev);
2214 }
2215 
2216 static int
2217 fsdev_aio_destruct(void *ctx)
2218 {
2219 	struct aio_fsdev *vfsdev = ctx;
2220 
2221 	TAILQ_REMOVE(&g_aio_fsdev_head, vfsdev, tailq);
2222 
2223 	fsdev_free_leafs(vfsdev->root, true);
2224 	vfsdev->root = NULL;
2225 
2226 	pthread_mutex_destroy(&vfsdev->mutex);
2227 
2228 	fsdev_aio_free(vfsdev);
2229 	return 0;
2230 }
2231 
2232 typedef int (*fsdev_op_handler_func)(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io);
2233 
2234 static fsdev_op_handler_func handlers[] = {
2235 	[SPDK_FSDEV_IO_MOUNT] = lo_mount,
2236 	[SPDK_FSDEV_IO_UMOUNT] = lo_umount,
2237 	[SPDK_FSDEV_IO_LOOKUP] = lo_lookup,
2238 	[SPDK_FSDEV_IO_FORGET] = lo_forget,
2239 	[SPDK_FSDEV_IO_GETATTR] = lo_getattr,
2240 	[SPDK_FSDEV_IO_SETATTR] = lo_setattr,
2241 	[SPDK_FSDEV_IO_READLINK] = lo_readlink,
2242 	[SPDK_FSDEV_IO_SYMLINK] = lo_symlink,
2243 	[SPDK_FSDEV_IO_MKNOD] = lo_mknod,
2244 	[SPDK_FSDEV_IO_MKDIR] = lo_mkdir,
2245 	[SPDK_FSDEV_IO_UNLINK] = lo_unlink,
2246 	[SPDK_FSDEV_IO_RMDIR] = lo_rmdir,
2247 	[SPDK_FSDEV_IO_RENAME] = lo_rename,
2248 	[SPDK_FSDEV_IO_LINK] = lo_link,
2249 	[SPDK_FSDEV_IO_OPEN] = lo_open,
2250 	[SPDK_FSDEV_IO_READ] = lo_read,
2251 	[SPDK_FSDEV_IO_WRITE] = lo_write,
2252 	[SPDK_FSDEV_IO_STATFS] =  lo_statfs,
2253 	[SPDK_FSDEV_IO_RELEASE] = lo_release,
2254 	[SPDK_FSDEV_IO_FSYNC] = lo_fsync,
2255 	[SPDK_FSDEV_IO_SETXATTR] =  lo_setxattr,
2256 	[SPDK_FSDEV_IO_GETXATTR] =  lo_getxattr,
2257 	[SPDK_FSDEV_IO_LISTXATTR] = lo_listxattr,
2258 	[SPDK_FSDEV_IO_REMOVEXATTR] =  lo_removexattr,
2259 	[SPDK_FSDEV_IO_FLUSH] =  lo_flush,
2260 	[SPDK_FSDEV_IO_OPENDIR] =  lo_opendir,
2261 	[SPDK_FSDEV_IO_READDIR] =  lo_readdir,
2262 	[SPDK_FSDEV_IO_RELEASEDIR] = lo_releasedir,
2263 	[SPDK_FSDEV_IO_FSYNCDIR] = lo_fsyncdir,
2264 	[SPDK_FSDEV_IO_FLOCK] = lo_flock,
2265 	[SPDK_FSDEV_IO_CREATE] = lo_create,
2266 	[SPDK_FSDEV_IO_ABORT] = lo_abort,
2267 	[SPDK_FSDEV_IO_FALLOCATE] = lo_fallocate,
2268 	[SPDK_FSDEV_IO_COPY_FILE_RANGE] = lo_copy_file_range,
2269 };
2270 
2271 static void
2272 fsdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
2273 {
2274 	int status;
2275 	enum spdk_fsdev_io_type type = spdk_fsdev_io_get_type(fsdev_io);
2276 
2277 	assert(type >= 0 && type < __SPDK_FSDEV_IO_LAST);
2278 
2279 	status = handlers[type](ch, fsdev_io);
2280 	if (status != IO_STATUS_ASYNC) {
2281 		spdk_fsdev_io_complete(fsdev_io, status);
2282 	}
2283 }
2284 
2285 static struct spdk_io_channel *
2286 fsdev_aio_get_io_channel(void *ctx)
2287 {
2288 	return spdk_get_io_channel(&g_aio_fsdev_head);
2289 }
2290 
2291 static void
2292 fsdev_aio_write_config_json(struct spdk_fsdev *fsdev, struct spdk_json_write_ctx *w)
2293 {
2294 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev);
2295 
2296 	spdk_json_write_object_begin(w);
2297 	spdk_json_write_named_string(w, "method", "fsdev_aio_create");
2298 	spdk_json_write_named_object_begin(w, "params");
2299 	spdk_json_write_named_string(w, "name", spdk_fsdev_get_name(&vfsdev->fsdev));
2300 	spdk_json_write_named_string(w, "root_path", vfsdev->root_path);
2301 	spdk_json_write_named_bool(w, "enable_xattr", vfsdev->xattr_enabled);
2302 	spdk_json_write_named_bool(w, "enable_writeback_cache",
2303 				   !!vfsdev->mount_opts.writeback_cache_enabled);
2304 	spdk_json_write_named_uint32(w, "max_write", vfsdev->mount_opts.max_write);
2305 	spdk_json_write_named_bool(w, "skip_rw", vfsdev->skip_rw);
2306 	spdk_json_write_object_end(w); /* params */
2307 	spdk_json_write_object_end(w);
2308 }
2309 
2310 static const struct spdk_fsdev_fn_table aio_fn_table = {
2311 	.destruct		= fsdev_aio_destruct,
2312 	.submit_request		= fsdev_aio_submit_request,
2313 	.get_io_channel		= fsdev_aio_get_io_channel,
2314 	.write_config_json	= fsdev_aio_write_config_json,
2315 };
2316 
2317 static int
2318 setup_root(struct aio_fsdev *vfsdev)
2319 {
2320 	int fd, res;
2321 	struct stat stat;
2322 
2323 	fd = open(vfsdev->root_path, O_PATH);
2324 	if (fd == -1) {
2325 		res = -errno;
2326 		SPDK_ERRLOG("Cannot open root %s (err=%d)\n", vfsdev->root_path, res);
2327 		return res;
2328 	}
2329 
2330 	res = fstatat(fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
2331 	if (res == -1) {
2332 		res = -errno;
2333 		SPDK_ERRLOG("Cannot get root fstatat of %s (err=%d)\n", vfsdev->root_path, res);
2334 		close(fd);
2335 		return res;
2336 	}
2337 
2338 	vfsdev->root = file_object_create_unsafe(NULL, fd, stat.st_ino, stat.st_dev, stat.st_mode);
2339 	if (!vfsdev->root) {
2340 		SPDK_ERRLOG("Cannot alloc root\n");
2341 		close(fd);
2342 		return -ENOMEM;
2343 	}
2344 
2345 	SPDK_INFOLOG(fsdev_aio, "root (%s) fd=%d\n", vfsdev->root_path, fd);
2346 	return 0;
2347 }
2348 
2349 static int
2350 setup_proc_self_fd(struct aio_fsdev *vfsdev)
2351 {
2352 	vfsdev->proc_self_fd = open("/proc/self/fd", O_PATH);
2353 	if (vfsdev->proc_self_fd == -1) {
2354 		int saverr = -errno;
2355 		SPDK_ERRLOG("Failed to open procfs fd dir with %d\n", saverr);
2356 		return saverr;
2357 	}
2358 
2359 	SPDK_DEBUGLOG(fsdev_aio, "procfs fd dir opened (fd=%d)\n", vfsdev->proc_self_fd);
2360 	return 0;
2361 }
2362 
2363 void
2364 spdk_fsdev_aio_get_default_opts(struct spdk_fsdev_aio_opts *opts)
2365 {
2366 	assert(opts);
2367 
2368 	memset(opts, 0, sizeof(*opts));
2369 
2370 	opts->xattr_enabled = DEFAULT_XATTR_ENABLED;
2371 	opts->writeback_cache_enabled = DEFAULT_WRITEBACK_CACHE;
2372 	opts->max_write = DEFAULT_MAX_WRITE;
2373 	opts->skip_rw = DEFAULT_SKIP_RW;
2374 }
2375 
2376 int
2377 spdk_fsdev_aio_create(struct spdk_fsdev **fsdev, const char *name, const char *root_path,
2378 		      const struct spdk_fsdev_aio_opts *opts)
2379 {
2380 	struct aio_fsdev *vfsdev;
2381 	int rc;
2382 
2383 	vfsdev = calloc(1, sizeof(*vfsdev));
2384 	if (!vfsdev) {
2385 		SPDK_ERRLOG("Could not allocate aio_fsdev\n");
2386 		return -ENOMEM;
2387 	}
2388 
2389 	vfsdev->proc_self_fd = -1;
2390 
2391 	vfsdev->fsdev.name = strdup(name);
2392 	if (!vfsdev->fsdev.name) {
2393 		SPDK_ERRLOG("Could not strdup fsdev name: %s\n", name);
2394 		fsdev_aio_free(vfsdev);
2395 		return -ENOMEM;
2396 	}
2397 
2398 	vfsdev->root_path = strdup(root_path);
2399 	if (!vfsdev->root_path) {
2400 		SPDK_ERRLOG("Could not strdup root path: %s\n", root_path);
2401 		fsdev_aio_free(vfsdev);
2402 		return -ENOMEM;
2403 	}
2404 
2405 	rc = setup_root(vfsdev);
2406 	if (rc) {
2407 		SPDK_ERRLOG("Could not setup root: %s (err=%d)\n", root_path, rc);
2408 		fsdev_aio_free(vfsdev);
2409 		return rc;
2410 	}
2411 
2412 	rc = setup_proc_self_fd(vfsdev);
2413 	if (rc) {
2414 		SPDK_ERRLOG("Could not setup proc_self_fd (err=%d)\n", rc);
2415 		fsdev_aio_free(vfsdev);
2416 		return rc;
2417 	}
2418 
2419 	if (opts->xattr_enabled) {
2420 		SPDK_ERRLOG("Extended attributes can only be enabled in Linux\n");
2421 		fsdev_aio_free(vfsdev);
2422 		return rc;
2423 	}
2424 
2425 	vfsdev->xattr_enabled = opts->xattr_enabled;
2426 	vfsdev->fsdev.ctxt = vfsdev;
2427 	vfsdev->fsdev.fn_table = &aio_fn_table;
2428 	vfsdev->fsdev.module = &aio_fsdev_module;
2429 
2430 	pthread_mutex_init(&vfsdev->mutex, NULL);
2431 
2432 	rc = spdk_fsdev_register(&vfsdev->fsdev);
2433 	if (rc) {
2434 		fsdev_aio_free(vfsdev);
2435 		return rc;
2436 	}
2437 
2438 	vfsdev->mount_opts.writeback_cache_enabled = DEFAULT_WRITEBACK_CACHE;
2439 	vfsdev->mount_opts.max_write = DEFAULT_MAX_WRITE;
2440 
2441 	vfsdev->skip_rw = opts->skip_rw;
2442 
2443 	*fsdev = &(vfsdev->fsdev);
2444 	TAILQ_INSERT_TAIL(&g_aio_fsdev_head, vfsdev, tailq);
2445 	SPDK_DEBUGLOG(fsdev_aio, "Created aio filesystem %s (xattr_enabled=%" PRIu8 " writeback_cache=%"
2446 		      PRIu8 " max_write=%" PRIu32 " skip_rw=%" PRIu8 ")\n",
2447 		      vfsdev->fsdev.name, vfsdev->xattr_enabled, vfsdev->mount_opts.writeback_cache_enabled,
2448 		      vfsdev->mount_opts.max_write, vfsdev->skip_rw);
2449 	return rc;
2450 }
2451 void
2452 spdk_fsdev_aio_delete(const char *name,
2453 		      spdk_delete_aio_fsdev_complete cb_fn, void *cb_arg)
2454 {
2455 	int rc;
2456 
2457 	rc = spdk_fsdev_unregister_by_name(name, &aio_fsdev_module, cb_fn, cb_arg);
2458 	if (rc != 0) {
2459 		cb_fn(cb_arg, rc);
2460 	}
2461 
2462 	SPDK_DEBUGLOG(fsdev_aio, "Deleted aio filesystem %s\n", name);
2463 }
2464 
2465 SPDK_LOG_REGISTER_COMPONENT(fsdev_aio)
2466