xref: /spdk/module/fsdev/aio/fsdev_aio.c (revision 95d6c9fac17572b107042103439aafd696d60b0e)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  */
4 #include "spdk/stdinc.h"
5 #include "spdk/event.h"
6 #include "spdk/log.h"
7 #include "spdk/string.h"
8 #include "spdk/config.h"
9 #include "spdk/util.h"
10 #include "spdk/thread.h"
11 #include "aio_mgr.h"
12 #include "fsdev_aio.h"
13 
/* Sentinel status returned by handlers in this file that queue an IO for
 * later completion (see lo_read) instead of producing a final result code.
 */
#define IO_STATUS_ASYNC INT_MIN

/* Explicitly mark a variable as intentionally unused. */
#ifndef UNUSED
#define UNUSED(x) (void)(x)
#endif

/* Default settings.
 * See https://libfuse.github.io/doxygen/structfuse__conn__info.html
 * NOTE(review): most of these are consumed outside this part of the file.
 */
#define MAX_BACKGROUND (100)
#define TIME_GRAN (1)
#define MAX_AIOS 256
#define DEFAULT_WRITEBACK_CACHE true
#define DEFAULT_MAX_WRITE 0x00020000
#define DEFAULT_XATTR_ENABLED false
#define DEFAULT_SKIP_RW false
/* Stored as spdk_fsdev_file_attr::valid_ms by file_object_fill_attr(). */
#define DEFAULT_TIMEOUT_MS 0 /* to prevent the attribute caching */
29 
/* Accessors for the nanosecond part of the struct stat timestamps.
 * The name of the underlying member depends on the platform; when neither
 * known layout is available the getters evaluate to 0 and the setters are
 * no-ops.
 */
#ifdef SPDK_CONFIG_HAVE_STRUCT_STAT_ST_ATIM
/* Linux */
#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atim.tv_nsec)
#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctim.tv_nsec)
#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtim.tv_nsec)
#define ST_ATIM_NSEC_SET(stbuf, val) (stbuf)->st_atim.tv_nsec = (val)
#define ST_CTIM_NSEC_SET(stbuf, val) (stbuf)->st_ctim.tv_nsec = (val)
#define ST_MTIM_NSEC_SET(stbuf, val) (stbuf)->st_mtim.tv_nsec = (val)
#elif defined(SPDK_CONFIG_HAVE_STRUCT_STAT_ST_ATIMESPEC)
/* FreeBSD */
#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atimespec.tv_nsec)
#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctimespec.tv_nsec)
#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtimespec.tv_nsec)
#define ST_ATIM_NSEC_SET(stbuf, val) (stbuf)->st_atimespec.tv_nsec = (val)
#define ST_CTIM_NSEC_SET(stbuf, val) (stbuf)->st_ctimespec.tv_nsec = (val)
#define ST_MTIM_NSEC_SET(stbuf, val) (stbuf)->st_mtimespec.tv_nsec = (val)
#else
/* No sub-second timestamp support detected */
#define ST_ATIM_NSEC(stbuf) 0
#define ST_CTIM_NSEC(stbuf) 0
#define ST_MTIM_NSEC(stbuf) 0
#define ST_ATIM_NSEC_SET(stbuf, val) do { } while (0)
#define ST_CTIM_NSEC_SET(stbuf, val) do { } while (0)
#define ST_MTIM_NSEC_SET(stbuf, val) do { } while (0)
#endif
54 
/* Effective user/group ID pair, as saved and applied by lo_change_cred()
 * and lo_restore_cred().
 */
struct lo_cred {
	uid_t euid;	/* effective user ID */
	gid_t egid;	/* effective group ID */
};
59 
/** Inode number type (64-bit unsigned integer) */
typedef uint64_t spdk_ino_t;
62 
/* Identity of a file in the underlying file system: inode number plus the
 * ID of the device that holds it. lo_find_leaf_unsafe() matches on both.
 */
struct lo_key {
	ino_t ino;	/* inode number (st_ino) */
	dev_t dev;	/* device ID (st_dev) */
};
67 
/* An open file or directory handle that belongs to a file object. */
struct spdk_fsdev_file_handle {
	/* Descriptor the handle was created around (see file_handle_create) */
	int fd;
	/* Directory iteration state; only set up by lo_opendir() */
	struct {
		/* Directory stream obtained with fdopendir(); NULL for non-directory handles */
		DIR *dp;
		/* Entry read by lo_readdir() but not yet consumed; NULL if none */
		struct dirent *entry;
		/* Offset the stream is currently positioned at */
		off_t offset;
	} dir;
	/* Owning file object; the handle counts as one reference on it */
	struct spdk_fsdev_file_object *fobject;
	/* Entry in fobject->handles */
	TAILQ_ENTRY(spdk_fsdev_file_handle) link;
};
78 
/* printf-style format and matching arguments used to identify a file object in logs */
#define FOBJECT_FMT "ino=%" PRIu64 " dev=%" PRIu64
#define FOBJECT_ARGS(fo) ((uint64_t)(fo)->key.ino), ((uint64_t)(fo)->key.dev)
/* In-memory representation of a looked-up file or directory.
 * Objects form a tree: each object is linked into its parent's `leafs` list
 * and holds one reference on that parent (see file_object_create_unsafe).
 */
struct spdk_fsdev_file_object {
	uint32_t is_symlink : 1;	/* S_ISLNK(mode) at creation time */
	uint32_t is_dir : 1;		/* S_ISDIR(mode) at creation time */
	uint32_t reserved : 30;
	/* Descriptor of the underlying file; closed when the object is destroyed */
	int fd;
	/* Decimal string form of `fd`; used as a path relative to aio_fsdev::proc_self_fd */
	char *fd_str;
	/* Inode/device pair identifying the underlying file */
	struct lo_key key;
	/* Number of references; the object is destroyed when it reaches zero */
	uint64_t refcount;
	/* Parent directory object, NULL if there is none */
	struct spdk_fsdev_file_object *parent_fobject;
	/* Entry in parent_fobject->leafs */
	TAILQ_ENTRY(spdk_fsdev_file_object) link;
	/* Child objects created under this one */
	TAILQ_HEAD(, spdk_fsdev_file_object) leafs;
	/* Handles currently opened on this object */
	TAILQ_HEAD(, spdk_fsdev_file_handle) handles;
	/* Protects refcount and the lists above */
	struct spdk_spinlock lock;
	/* NOTE(review): not written by the code visible here - confirm usage */
	char name[];
};
96 
/* Per-device state of an AIO fsdev. */
struct aio_fsdev {
	/* Generic fsdev; fsdev_to_aio_fsdev() maps it back to this structure */
	struct spdk_fsdev fsdev;
	/* NOTE(review): presumably the host path exposed as the root - confirm */
	char *root_path;
	/* Directory descriptor against which file_object::fd_str paths are resolved
	 * (presumably an open /proc/self/fd - confirm where it is opened)
	 */
	int proc_self_fd;
	/* NOTE(review): not used by the code visible here */
	pthread_mutex_t mutex;
	/* File object of the root directory */
	struct spdk_fsdev_file_object *root;
	/* Entry in g_aio_fsdev_head */
	TAILQ_ENTRY(aio_fsdev) tailq;
	/* NOTE(review): not used by the code visible here */
	bool xattr_enabled;
	/* When set, lo_read() submits no IO and reports the full iovec length as read */
	bool skip_rw;
};
107 
/* Driver-specific IO context, stored in spdk_fsdev_io::driver_ctx. */
struct aio_fsdev_io {
	/* In-flight AIO as returned by the AIO manager; see lo_read() */
	struct spdk_aio_mgr_io *aio;
	/* Channel the IO was submitted on */
	struct aio_io_channel *ch;
	/* Entry in one of the channel's IO lists */
	TAILQ_ENTRY(aio_fsdev_io) link;
};
113 
/* Per-IO-channel context. */
struct aio_io_channel {
	/* NOTE(review): presumably drives the AIO manager and the lists below - confirm */
	struct spdk_poller *poller;
	/* AIO manager used to submit reads (see lo_read) */
	struct spdk_aio_mgr *mgr;
	/* IOs submitted to the AIO manager and not yet completed */
	TAILQ_HEAD(, aio_fsdev_io) ios_in_progress;
	/* IOs whose result is already known and that only await completion */
	TAILQ_HEAD(, aio_fsdev_io) ios_to_complete;
};
120 
/* List of aio_fsdev instances, linked through aio_fsdev::tailq */
static TAILQ_HEAD(, aio_fsdev) g_aio_fsdev_head = TAILQ_HEAD_INITIALIZER(
			g_aio_fsdev_head);
123 
124 static inline struct aio_fsdev *
125 fsdev_to_aio_fsdev(struct spdk_fsdev *fsdev)
126 {
127 	return SPDK_CONTAINEROF(fsdev, struct aio_fsdev, fsdev);
128 }
129 
130 static inline struct spdk_fsdev_io *
131 aio_to_fsdev_io(const struct aio_fsdev_io *aio_io)
132 {
133 	return SPDK_CONTAINEROF(aio_io, struct spdk_fsdev_io, driver_ctx);
134 }
135 
136 static inline struct aio_fsdev_io *
137 fsdev_to_aio_io(const struct spdk_fsdev_io *fsdev_io)
138 {
139 	return (struct aio_fsdev_io *)fsdev_io->driver_ctx;
140 }
141 
/* Sanity-check a file object pointer received with an IO.
 * Only NULL is rejected; `vfsdev` is currently not consulted.
 */
static inline bool
fsdev_aio_is_valid_fobject(struct aio_fsdev *vfsdev, struct spdk_fsdev_file_object *fobject)
{
	if (fobject == NULL) {
		return false;
	}

	return true;
}
147 
/* Sanity-check a file handle pointer received with an IO.
 * Only NULL is rejected; `vfsdev` is currently not consulted.
 */
static inline bool
fsdev_aio_is_valid_fhandle(struct aio_fsdev *vfsdev, struct spdk_fsdev_file_handle *fhandle)
{
	if (fhandle == NULL) {
		return false;
	}

	return true;
}
153 
/* Return 1 if `name` is exactly "." or "..", 0 otherwise. */
static int
is_dot_or_dotdot(const char *name)
{
	if (name[0] != '.') {
		return 0;
	}

	/* "." */
	if (name[1] == '\0') {
		return 1;
	}

	/* ".." */
	return name[1] == '.' && name[2] == '\0';
}
160 
/* Return non-zero if `path` is a single path component (contains no '/')
 * and is neither "." nor ".."; return 0 otherwise.
 */
static int
is_safe_path_component(const char *path)
{
	return (strchr(path, '/') == NULL) ? !is_dot_or_dotdot(path) : 0;
}
171 
172 static struct spdk_fsdev_file_object *
173 lo_find_leaf_unsafe(struct spdk_fsdev_file_object *fobject, ino_t ino, dev_t dev)
174 {
175 	struct spdk_fsdev_file_object *leaf_fobject;
176 
177 	TAILQ_FOREACH(leaf_fobject, &fobject->leafs, link) {
178 		if (leaf_fobject->key.ino == ino && leaf_fobject->key.dev == dev) {
179 			return leaf_fobject;
180 		}
181 	}
182 
183 	return NULL;
184 }
185 
186 /* This function returns:
187  * 1 if the refcount is still non zero
188  * a negative  error number if the refcount became zero, the file object was deleted but the defered underlying file deletion failed
189  * 0 if the refcount became zero, the file object was deleted and eithr the underlying file deletion wasn't defered or succeeded
190  */
191 static int
192 file_object_unref(struct spdk_fsdev_file_object *fobject, uint32_t count)
193 {
194 	int res = 0;
195 
196 	spdk_spin_lock(&fobject->lock);
197 	assert(fobject->refcount >= count);
198 	fobject->refcount -= count;
199 	spdk_spin_unlock(&fobject->lock);
200 
201 	if (!fobject->refcount) {
202 		struct spdk_fsdev_file_object *parent_fobject = fobject->parent_fobject;
203 
204 		if (parent_fobject) {
205 			spdk_spin_lock(&parent_fobject->lock);
206 			TAILQ_REMOVE(&parent_fobject->leafs, fobject, link);
207 			spdk_spin_unlock(&parent_fobject->lock);
208 			file_object_unref(parent_fobject, 1); /* unref by the leaf */
209 		}
210 
211 		spdk_spin_destroy(&fobject->lock);
212 		close(fobject->fd);
213 		free(fobject->fd_str);
214 		free(fobject);
215 	}
216 
217 	return res;
218 }
219 
220 static void
221 file_object_ref(struct spdk_fsdev_file_object *fobject)
222 {
223 	spdk_spin_lock(&fobject->lock);
224 	fobject->refcount++;
225 	spdk_spin_unlock(&fobject->lock);
226 }
227 
228 static struct spdk_fsdev_file_object *
229 file_object_create_unsafe(struct spdk_fsdev_file_object *parent_fobject, int fd, ino_t ino,
230 			  dev_t dev, mode_t mode)
231 {
232 	struct spdk_fsdev_file_object *fobject;
233 
234 	fobject = calloc(1, sizeof(*fobject));
235 	if (!fobject) {
236 		SPDK_ERRLOG("Cannot alloc fobject\n");
237 		return NULL;
238 	}
239 
240 	fobject->fd_str = spdk_sprintf_alloc("%d", fd);
241 	if (!fobject->fd_str) {
242 		SPDK_ERRLOG("Cannot alloc fd_str\n");
243 		free(fobject);
244 		return NULL;
245 	}
246 
247 	fobject->fd = fd;
248 	fobject->key.ino = ino;
249 	fobject->key.dev = dev;
250 	fobject->refcount = 1;
251 	fobject->is_symlink = S_ISLNK(mode) ? 1 : 0;
252 	fobject->is_dir = S_ISDIR(mode) ? 1 : 0;
253 
254 	TAILQ_INIT(&fobject->handles);
255 	TAILQ_INIT(&fobject->leafs);
256 	spdk_spin_init(&fobject->lock);
257 
258 	if (parent_fobject) {
259 		fobject->parent_fobject = parent_fobject;
260 		TAILQ_INSERT_TAIL(&parent_fobject->leafs, fobject, link);
261 		parent_fobject->refcount++;
262 	}
263 
264 	return fobject;
265 }
266 
267 static struct spdk_fsdev_file_handle *
268 file_handle_create(struct spdk_fsdev_file_object *fobject, int fd)
269 {
270 	struct spdk_fsdev_file_handle *fhandle;
271 
272 	fhandle = calloc(1, sizeof(*fhandle));
273 	if (!fhandle) {
274 		SPDK_ERRLOG("Cannot alloc fhandle\n");
275 		return NULL;
276 	}
277 
278 	fhandle->fobject = fobject;
279 	fhandle->fd = fd;
280 
281 	spdk_spin_lock(&fobject->lock);
282 	fobject->refcount++;
283 	TAILQ_INSERT_TAIL(&fobject->handles, fhandle, link);
284 	spdk_spin_unlock(&fobject->lock);
285 
286 	return fhandle;
287 }
288 
289 static void
290 file_handle_delete(struct spdk_fsdev_file_handle *fhandle)
291 {
292 	struct spdk_fsdev_file_object *fobject = fhandle->fobject;
293 
294 	spdk_spin_lock(&fobject->lock);
295 	fobject->refcount--;
296 	TAILQ_REMOVE(&fobject->handles, fhandle, link);
297 	spdk_spin_unlock(&fobject->lock);
298 
299 	if (fhandle->dir.dp) {
300 		closedir(fhandle->dir.dp);
301 	}
302 
303 	close(fhandle->fd);
304 	free(fhandle);
305 }
306 
307 static int
308 file_object_fill_attr(struct spdk_fsdev_file_object *fobject, struct spdk_fsdev_file_attr *attr)
309 {
310 	struct stat stbuf;
311 	int res;
312 
313 	res = fstatat(fobject->fd, "", &stbuf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
314 	if (res == -1) {
315 		res = -errno;
316 		SPDK_ERRLOG("fstatat() failed with %d\n", res);
317 		return res;
318 	}
319 
320 	memset(attr, 0, sizeof(*attr));
321 
322 	attr->ino = stbuf.st_ino;
323 	attr->size = stbuf.st_size;
324 	attr->blocks = stbuf.st_blocks;
325 	attr->atime = stbuf.st_atime;
326 	attr->mtime = stbuf.st_mtime;
327 	attr->ctime = stbuf.st_ctime;
328 	attr->atimensec = ST_ATIM_NSEC(&stbuf);
329 	attr->mtimensec = ST_MTIM_NSEC(&stbuf);
330 	attr->ctimensec = ST_CTIM_NSEC(&stbuf);
331 	attr->mode = stbuf.st_mode;
332 	attr->nlink = stbuf.st_nlink;
333 	attr->uid = stbuf.st_uid;
334 	attr->gid = stbuf.st_gid;
335 	attr->rdev = stbuf.st_rdev;
336 	attr->blksize = stbuf.st_blksize;
337 	attr->valid_ms = DEFAULT_TIMEOUT_MS;
338 
339 	return 0;
340 }
341 
342 static int
343 utimensat_empty(struct aio_fsdev *vfsdev, struct spdk_fsdev_file_object *fobject,
344 		const struct timespec *tv)
345 {
346 	int res;
347 
348 	if (fobject->is_symlink) {
349 		res = utimensat(fobject->fd, "", tv, AT_EMPTY_PATH);
350 		if (res == -1 && errno == EINVAL) {
351 			/* Sorry, no race free way to set times on symlink. */
352 			errno = EPERM;
353 		}
354 	} else {
355 		res = utimensat(vfsdev->proc_self_fd, fobject->fd_str, tv, 0);
356 	}
357 
358 	return res;
359 }
360 
361 static int
362 lo_getattr(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
363 {
364 	int res;
365 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
366 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.getattr.fobject;
367 
368 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
369 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
370 		return -EINVAL;
371 	}
372 
373 	res = file_object_fill_attr(fobject, &fsdev_io->u_out.getattr.attr);
374 	if (res) {
375 		SPDK_ERRLOG("Cannot fill attr for " FOBJECT_FMT " (err=%d)\n", FOBJECT_ARGS(fobject), res);
376 		return res;
377 	}
378 
379 	SPDK_DEBUGLOG(fsdev_aio, "GETATTR succeeded for " FOBJECT_FMT "\n", FOBJECT_ARGS(fobject));
380 	return 0;
381 }
382 
383 static int
384 lo_opendir(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
385 {
386 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
387 	int error;
388 	int fd;
389 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.opendir.fobject;
390 	uint32_t flags = fsdev_io->u_in.opendir.flags;
391 	struct spdk_fsdev_file_handle *fhandle = NULL;
392 
393 	UNUSED(flags);
394 
395 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
396 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
397 		return -EINVAL;
398 	}
399 
400 	fd = openat(fobject->fd, ".", O_RDONLY);
401 	if (fd == -1) {
402 		error = -errno;
403 		SPDK_ERRLOG("openat failed for " FOBJECT_FMT " (err=%d)\n", FOBJECT_ARGS(fobject), error);
404 		goto out_err;
405 	}
406 
407 	fhandle = file_handle_create(fobject, fd);
408 	if (fhandle == NULL) {
409 		error = -ENOMEM;
410 		SPDK_ERRLOG("file_handle_create failed for " FOBJECT_FMT " (err=%d)\n", FOBJECT_ARGS(fobject),
411 			    error);
412 		goto out_err;
413 	}
414 
415 	fhandle->dir.dp = fdopendir(fd);
416 	if (fhandle->dir.dp == NULL) {
417 		error = -errno;
418 		SPDK_ERRLOG("fdopendir failed for " FOBJECT_FMT " (err=%d)\n", FOBJECT_ARGS(fobject), error);
419 		goto out_err;
420 	}
421 
422 	fhandle->dir.offset = 0;
423 	fhandle->dir.entry = NULL;
424 
425 	SPDK_DEBUGLOG(fsdev_aio, "OPENDIR succeeded for " FOBJECT_FMT " (fh=%p)\n",
426 		      FOBJECT_ARGS(fobject), fhandle);
427 
428 	fsdev_io->u_out.opendir.fhandle = fhandle;
429 
430 	return 0;
431 
432 out_err:
433 	if (fhandle) {
434 		file_handle_delete(fhandle);
435 	} else if (fd != -1) {
436 		close(fd);
437 	}
438 
439 	return error;
440 }
441 
442 static int
443 lo_releasedir(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
444 {
445 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
446 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.releasedir.fobject;
447 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.releasedir.fhandle;
448 
449 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
450 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
451 		return -EINVAL;
452 	}
453 
454 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
455 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
456 		return -EINVAL;
457 	}
458 
459 	SPDK_DEBUGLOG(fsdev_aio, "RELEASEDIR succeeded for " FOBJECT_FMT " (fh=%p)\n",
460 		      FOBJECT_ARGS(fobject), fhandle);
461 
462 	file_handle_delete(fhandle);
463 
464 	return 0;
465 }
466 
467 static int
468 lo_do_lookup(struct aio_fsdev *vfsdev, struct spdk_fsdev_file_object *parent_fobject,
469 	     const char *name, struct spdk_fsdev_file_object **pfobject,
470 	     struct spdk_fsdev_file_attr *attr)
471 {
472 	int newfd;
473 	int res;
474 	struct stat stat;
475 	struct spdk_fsdev_file_object *fobject;
476 
477 	/* Do not allow escaping root directory */
478 	if (parent_fobject == vfsdev->root && strcmp(name, "..") == 0) {
479 		name = ".";
480 	}
481 
482 	newfd = openat(parent_fobject->fd, name, O_PATH | O_NOFOLLOW);
483 	if (newfd == -1) {
484 		res = -errno;
485 		SPDK_DEBUGLOG(fsdev_aio, "openat( " FOBJECT_FMT " %s) failed with %d\n",
486 			      FOBJECT_ARGS(parent_fobject), name, res);
487 		return res;
488 	}
489 
490 	res = fstatat(newfd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
491 	if (res == -1) {
492 		res = -errno;
493 		SPDK_ERRLOG("fstatat(%s) failed with %d\n", name, res);
494 		close(newfd);
495 		return res;
496 	}
497 
498 	spdk_spin_lock(&parent_fobject->lock);
499 	fobject = lo_find_leaf_unsafe(parent_fobject, stat.st_ino, stat.st_dev);
500 	if (fobject) {
501 		close(newfd);
502 		newfd = -1;
503 		file_object_ref(fobject); /* reference by a lo_do_lookup caller */
504 	} else {
505 		fobject = file_object_create_unsafe(parent_fobject, newfd, stat.st_ino, stat.st_dev, stat.st_mode);
506 	}
507 	spdk_spin_unlock(&parent_fobject->lock);
508 
509 	if (!fobject) {
510 		SPDK_ERRLOG("Cannot create file object\n");
511 		close(newfd);
512 		return -ENOMEM;
513 	}
514 
515 	if (attr) {
516 		res = file_object_fill_attr(fobject, attr);
517 		if (res) {
518 			SPDK_ERRLOG("fill_attr(%s) failed with %d\n", name, res);
519 			file_object_unref(fobject, 1);
520 			if (newfd != -1) {
521 				close(newfd);
522 			}
523 			return res;
524 		}
525 	}
526 
527 	*pfobject = fobject;
528 
529 	SPDK_DEBUGLOG(fsdev_aio, "lookup(%s) in dir " FOBJECT_FMT ": "  FOBJECT_FMT " fd=%d\n",
530 		      name, FOBJECT_ARGS(parent_fobject), FOBJECT_ARGS(fobject), fobject->fd);
531 	return 0;
532 }
533 
534 static int
535 lo_lookup(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
536 {
537 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
538 	int err;
539 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.lookup.parent_fobject;
540 	char *name = fsdev_io->u_in.lookup.name;
541 
542 	if (!parent_fobject) {
543 		err = file_object_fill_attr(vfsdev->root, &fsdev_io->u_out.lookup.attr);
544 		if (err) {
545 			SPDK_DEBUGLOG(fsdev_aio, "file_object_fill_attr(root) failed with err=%d\n", err);
546 			return err;
547 		}
548 
549 		file_object_ref(vfsdev->root);
550 		fsdev_io->u_out.lookup.fobject = vfsdev->root;
551 		return 0;
552 	}
553 
554 	SPDK_DEBUGLOG(fsdev_aio, "  name %s\n", name);
555 
556 	/* Don't use is_safe_path_component(), allow "." and ".." for NFS export
557 	 * support.
558 	 */
559 	if (strchr(name, '/')) {
560 		return -EINVAL;
561 	}
562 
563 	err = lo_do_lookup(vfsdev, parent_fobject, name, &fsdev_io->u_out.lookup.fobject,
564 			   &fsdev_io->u_out.lookup.attr);
565 	if (err) {
566 		SPDK_DEBUGLOG(fsdev_aio, "lo_do_lookup(%s) failed with err=%d\n", name, err);
567 		return err;
568 	}
569 
570 	return 0;
571 }
572 
573 /*
574  * Change to uid/gid of caller so that file is created with ownership of caller.
575  */
576 static int
577 lo_change_cred(const struct lo_cred *new, struct lo_cred *old)
578 {
579 	int res;
580 
581 	old->euid = geteuid();
582 	old->egid = getegid();
583 
584 	res = syscall(SYS_setresgid, -1, new->egid, -1);
585 	if (res == -1) {
586 		return -errno;
587 	}
588 
589 	res = syscall(SYS_setresuid, -1, new->euid, -1);
590 	if (res == -1) {
591 		int errno_save = -errno;
592 
593 		syscall(SYS_setresgid, -1, old->egid, -1);
594 		return errno_save;
595 	}
596 
597 	return 0;
598 }
599 
600 /* Regain Privileges */
601 static void
602 lo_restore_cred(struct lo_cred *old)
603 {
604 	int res;
605 
606 	res = syscall(SYS_setresuid, -1, old->euid, -1);
607 	if (res == -1) {
608 		SPDK_ERRLOG("seteuid(%u)", old->euid);
609 	}
610 
611 	res = syscall(SYS_setresgid, -1, old->egid, -1);
612 	if (res == -1) {
613 		SPDK_ERRLOG("setegid(%u)", old->egid);
614 	}
615 }
616 
617 static int
618 lo_readdir(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
619 {
620 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
621 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.readdir.fobject;
622 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.readdir.fhandle;
623 	uint64_t offset = fsdev_io->u_in.readdir.offset;
624 
625 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
626 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
627 		return -EINVAL;
628 	}
629 
630 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
631 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
632 		return -EINVAL;
633 	}
634 
635 	if (((off_t)offset) != fhandle->dir.offset) {
636 		seekdir(fhandle->dir.dp, offset);
637 		fhandle->dir.entry = NULL;
638 		fhandle->dir.offset = offset;
639 	}
640 
641 	while (1) {
642 		off_t nextoff;
643 		const char *name;
644 		int res;
645 
646 		if (!fhandle->dir.entry) {
647 			errno = 0;
648 			fhandle->dir.entry = readdir(fhandle->dir.dp);
649 			if (!fhandle->dir.entry) {
650 				if (errno) {  /* Error */
651 					res = -errno;
652 					SPDK_ERRLOG("readdir failed with err=%d", res);
653 					return res;
654 				} else {  /* End of stream */
655 					break;
656 				}
657 			}
658 		}
659 
660 		nextoff = fhandle->dir.entry->d_off;
661 		name = fhandle->dir.entry->d_name;
662 
663 		/* Hide root's parent directory */
664 		if (fobject == vfsdev->root && strcmp(name, "..") == 0) {
665 			goto skip_entry;
666 		}
667 
668 		if (is_dot_or_dotdot(name)) {
669 			fsdev_io->u_out.readdir.fobject = NULL;
670 			memset(&fsdev_io->u_out.readdir.attr, 0, sizeof(fsdev_io->u_out.readdir.attr));
671 			fsdev_io->u_out.readdir.attr.ino = fhandle->dir.entry->d_ino;
672 			fsdev_io->u_out.readdir.attr.mode = DT_DIR << 12;
673 			goto skip_lookup;
674 		}
675 
676 		res = lo_do_lookup(vfsdev, fobject, name, &fsdev_io->u_out.readdir.fobject,
677 				   &fsdev_io->u_out.readdir.attr);
678 		if (res) {
679 			SPDK_DEBUGLOG(fsdev_aio, "lo_do_lookup(%s) failed with err=%d\n", name, res);
680 			return res;
681 		}
682 
683 skip_lookup:
684 		fsdev_io->u_out.readdir.name = name;
685 		fsdev_io->u_out.readdir.offset = nextoff;
686 
687 		res = fsdev_io->u_in.readdir.entry_cb_fn(fsdev_io, fsdev_io->internal.cb_arg);
688 		if (res) {
689 			if (fsdev_io->u_out.readdir.fobject) {
690 				file_object_unref(fsdev_io->u_out.readdir.fobject, 1);
691 			}
692 			break;
693 		}
694 
695 skip_entry:
696 		fhandle->dir.entry = NULL;
697 		fhandle->dir.offset = nextoff;
698 	}
699 
700 	SPDK_DEBUGLOG(fsdev_aio, "READDIR succeeded for " FOBJECT_FMT " (fh=%p, offset=%" PRIu64 ")\n",
701 		      FOBJECT_ARGS(fobject), fhandle, offset);
702 	return 0;
703 }
704 
705 static int
706 lo_forget(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
707 {
708 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
709 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.readdir.fobject;
710 	uint64_t nlookup = fsdev_io->u_in.forget.nlookup;
711 
712 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
713 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
714 		return -EINVAL;
715 	}
716 
717 	file_object_unref(fobject, nlookup);
718 
719 	return 0;
720 }
721 
722 static uint32_t
723 update_open_flags(struct aio_fsdev *vfsdev, uint32_t flags)
724 {
725 	/*
726 	 * With writeback cache, kernel may send read requests even
727 	 * when userspace opened write-only
728 	 */
729 	if (vfsdev->fsdev.opts.writeback_cache_enabled && (flags & O_ACCMODE) == O_WRONLY) {
730 		flags &= ~O_ACCMODE;
731 		flags |= O_RDWR;
732 	}
733 
734 	/*
735 	 * With writeback cache, O_APPEND is handled by the kernel.
736 	 * This breaks atomicity (since the file may change in the
737 	 * underlying filesystem, so that the kernel's idea of the
738 	 * end of the file isn't accurate anymore). In this example,
739 	 * we just accept that. A more rigorous filesystem may want
740 	 * to return an error here
741 	 */
742 	if (vfsdev->fsdev.opts.writeback_cache_enabled && (flags & O_APPEND)) {
743 		flags &= ~O_APPEND;
744 	}
745 
746 	/*
747 	 * O_DIRECT in guest should not necessarily mean bypassing page
748 	 * cache on host as well. If somebody needs that behavior, it
749 	 * probably should be a configuration knob in daemon.
750 	 */
751 	flags &= ~O_DIRECT;
752 
753 	return flags;
754 }
755 
756 static int
757 lo_open(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
758 {
759 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
760 	int fd, saverr;
761 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.open.fobject;
762 	uint32_t flags = fsdev_io->u_in.open.flags;
763 	struct spdk_fsdev_file_handle *fhandle;
764 
765 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
766 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
767 		return -EINVAL;
768 	}
769 
770 	flags = update_open_flags(vfsdev, flags);
771 
772 	fd = openat(vfsdev->proc_self_fd, fobject->fd_str, flags & ~O_NOFOLLOW);
773 	if (fd == -1) {
774 		saverr = -errno;
775 		SPDK_ERRLOG("openat(%d, %s, 0x%08" PRIx32 ") failed with err=%d\n",
776 			    vfsdev->proc_self_fd, fobject->fd_str, flags, saverr);
777 		return saverr;
778 	}
779 
780 	fhandle = file_handle_create(fobject, fd);
781 	if (!fhandle) {
782 		SPDK_ERRLOG("cannot create a file handle (fd=%d)\n", fd);
783 		close(fd);
784 		return -ENOMEM;
785 	}
786 
787 	fsdev_io->u_out.open.fhandle = fhandle;
788 
789 	SPDK_DEBUGLOG(fsdev_aio, "OPEN succeeded for " FOBJECT_FMT " (fh=%p, fd=%d)\n",
790 		      FOBJECT_ARGS(fobject), fhandle, fd);
791 
792 	return 0;
793 }
794 
795 static int
796 lo_flush(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
797 {
798 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
799 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.flush.fobject;
800 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.flush.fhandle;
801 	int res, saverr;
802 
803 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
804 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
805 		return -EINVAL;
806 	}
807 
808 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
809 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
810 		return -EINVAL;
811 	}
812 
813 	res = close(dup(fhandle->fd));
814 	if (res) {
815 		saverr = -errno;
816 		SPDK_ERRLOG("close(dup(%d)) failed for " FOBJECT_FMT " (fh=%p, err=%d)\n",
817 			    fhandle->fd, FOBJECT_ARGS(fobject), fhandle, saverr);
818 		return saverr;
819 	}
820 
821 	SPDK_DEBUGLOG(fsdev_aio, "FLUSH succeeded for " FOBJECT_FMT " (fh=%p)\n", FOBJECT_ARGS(fobject),
822 		      fhandle);
823 
824 	return 0;
825 }
826 
827 static int
828 lo_setattr(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
829 {
830 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
831 	int saverr;
832 	int res;
833 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.setattr.fobject;
834 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.setattr.fhandle;
835 	uint32_t to_set = fsdev_io->u_in.setattr.to_set;
836 	struct spdk_fsdev_file_attr *attr = &fsdev_io->u_in.setattr.attr;
837 
838 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
839 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
840 		return -EINVAL;
841 	}
842 
843 	if (to_set & FSDEV_SET_ATTR_MODE) {
844 		if (fhandle) {
845 			res = fchmod(fhandle->fd, attr->mode);
846 		} else {
847 			res = fchmodat(vfsdev->proc_self_fd, fobject->fd_str, attr->mode, 0);
848 		}
849 		if (res == -1) {
850 			saverr = -errno;
851 			SPDK_ERRLOG("fchmod failed for " FOBJECT_FMT "\n", FOBJECT_ARGS(fobject));
852 			return saverr;
853 		}
854 	}
855 
856 	if (to_set & (FSDEV_SET_ATTR_UID | FSDEV_SET_ATTR_GID)) {
857 		uid_t uid = (to_set & FSDEV_SET_ATTR_UID) ? attr->uid : (uid_t) -1;
858 		gid_t gid = (to_set & FSDEV_SET_ATTR_GID) ? attr->gid : (gid_t) -1;
859 
860 		res = fchownat(fobject->fd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
861 		if (res == -1) {
862 			saverr = -errno;
863 			SPDK_ERRLOG("fchownat failed for " FOBJECT_FMT "\n", FOBJECT_ARGS(fobject));
864 			return saverr;
865 		}
866 	}
867 
868 	if (to_set & FSDEV_SET_ATTR_SIZE) {
869 		int truncfd;
870 
871 		if (fhandle) {
872 			truncfd = fhandle->fd;
873 		} else {
874 			truncfd = openat(vfsdev->proc_self_fd, fobject->fd_str, O_RDWR);
875 			if (truncfd < 0) {
876 				saverr = -errno;
877 				SPDK_ERRLOG("openat failed for " FOBJECT_FMT "\n", FOBJECT_ARGS(fobject));
878 				return saverr;
879 			}
880 		}
881 
882 		res = ftruncate(truncfd, attr->size);
883 		if (!fhandle) {
884 			saverr = -errno;
885 			close(truncfd);
886 			errno = saverr;
887 		}
888 		if (res == -1) {
889 			saverr = -errno;
890 			SPDK_ERRLOG("ftruncate failed for " FOBJECT_FMT " (size=%" PRIu64 ")\n", FOBJECT_ARGS(fobject),
891 				    attr->size);
892 			return saverr;
893 		}
894 	}
895 
896 	if (to_set & (FSDEV_SET_ATTR_ATIME | FSDEV_SET_ATTR_MTIME)) {
897 		struct timespec tv[2];
898 
899 		tv[0].tv_sec = 0;
900 		tv[1].tv_sec = 0;
901 		tv[0].tv_nsec = UTIME_OMIT;
902 		tv[1].tv_nsec = UTIME_OMIT;
903 
904 		if (to_set & FSDEV_SET_ATTR_ATIME_NOW) {
905 			tv[0].tv_nsec = UTIME_NOW;
906 		} else if (to_set & FSDEV_SET_ATTR_ATIME) {
907 			tv[0].tv_sec = attr->atime;
908 			tv[0].tv_nsec = attr->atimensec;
909 		}
910 
911 		if (to_set & FSDEV_SET_ATTR_MTIME_NOW) {
912 			tv[1].tv_nsec = UTIME_NOW;
913 		} else if (to_set & FSDEV_SET_ATTR_MTIME) {
914 			tv[1].tv_sec = attr->mtime;
915 			tv[1].tv_nsec = attr->mtimensec;
916 		}
917 
918 		if (fhandle) {
919 			res = futimens(fhandle->fd, tv);
920 		} else {
921 			res = utimensat_empty(vfsdev, fobject, tv);
922 		}
923 		if (res == -1) {
924 			saverr = -errno;
925 			SPDK_ERRLOG("futimens/utimensat_empty failed for " FOBJECT_FMT "\n",
926 				    FOBJECT_ARGS(fobject));
927 			return saverr;
928 		}
929 	}
930 
931 	res = file_object_fill_attr(fobject, &fsdev_io->u_out.setattr.attr);
932 	if (res) {
933 		SPDK_ERRLOG("file_object_fill_attr failed for " FOBJECT_FMT "\n",
934 			    FOBJECT_ARGS(fobject));
935 		return res;
936 	}
937 
938 	SPDK_DEBUGLOG(fsdev_aio, "SETATTR succeeded for " FOBJECT_FMT "\n",
939 		      FOBJECT_ARGS(fobject));
940 
941 	return 0;
942 }
943 
944 static int
945 lo_create(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
946 {
947 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
948 	int fd;
949 	int err;
950 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.create.parent_fobject;
951 	const char *name = fsdev_io->u_in.create.name;
952 	uint32_t mode = fsdev_io->u_in.create.mode;
953 	uint32_t flags = fsdev_io->u_in.create.flags;
954 	uint32_t umask = fsdev_io->u_in.create.umask;
955 	struct lo_cred old_cred, new_cred = {
956 		.euid = fsdev_io->u_in.create.euid,
957 		.egid = fsdev_io->u_in.create.egid,
958 	};
959 	struct spdk_fsdev_file_object *fobject;
960 	struct spdk_fsdev_file_handle *fhandle;
961 	struct spdk_fsdev_file_attr *attr = &fsdev_io->u_out.create.attr;
962 
963 	if (!fsdev_aio_is_valid_fobject(vfsdev, parent_fobject)) {
964 		SPDK_ERRLOG("Invalid parent_fobject: %p\n", parent_fobject);
965 		return -EINVAL;
966 	}
967 
968 	UNUSED(umask);
969 
970 	if (!is_safe_path_component(name)) {
971 		SPDK_ERRLOG("CREATE: %s not a safe component\n", name);
972 		return -EINVAL;
973 	}
974 
975 	err = lo_change_cred(&new_cred, &old_cred);
976 	if (err) {
977 		SPDK_ERRLOG("CREATE: cannot change credentials\n");
978 		return err;
979 	}
980 
981 	flags = update_open_flags(vfsdev, flags);
982 
983 	fd = openat(parent_fobject->fd, name, (flags | O_CREAT) & ~O_NOFOLLOW, mode);
984 	err = fd == -1 ? -errno : 0;
985 	lo_restore_cred(&old_cred);
986 
987 	if (err) {
988 		SPDK_ERRLOG("CREATE: openat failed with %d\n", err);
989 		return err;
990 	}
991 
992 	err = lo_do_lookup(vfsdev, parent_fobject, name, &fobject, attr);
993 	if (err) {
994 		SPDK_ERRLOG("CREATE: lookup failed with %d\n", err);
995 		return err;
996 	}
997 
998 	fhandle = file_handle_create(fobject, fd);
999 	if (!fhandle) {
1000 		SPDK_ERRLOG("cannot create a file handle (fd=%d)\n", fd);
1001 		close(fd);
1002 		file_object_unref(fobject, 1);
1003 		return -ENOMEM;
1004 	}
1005 
1006 	SPDK_DEBUGLOG(fsdev_aio, "CREATE: succeeded (name=%s " FOBJECT_FMT " fh=%p)\n",
1007 		      name, FOBJECT_ARGS(fobject), fhandle);
1008 
1009 	fsdev_io->u_out.create.fobject = fobject;
1010 	fsdev_io->u_out.create.fhandle = fhandle;
1011 
1012 	return 0;
1013 }
1014 
1015 static int
1016 lo_release(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1017 {
1018 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1019 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.release.fobject;
1020 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.release.fhandle;
1021 
1022 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1023 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1024 		return -EINVAL;
1025 	}
1026 
1027 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
1028 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
1029 		return -EINVAL;
1030 	}
1031 
1032 	SPDK_DEBUGLOG(fsdev_aio, "RELEASE succeeded for " FOBJECT_FMT " fh=%p)\n",
1033 		      FOBJECT_ARGS(fobject), fhandle);
1034 
1035 	file_handle_delete(fhandle);
1036 
1037 	return 0;
1038 }
1039 
1040 static void
1041 lo_read_cb(void *ctx, uint32_t data_size, int error)
1042 {
1043 	struct spdk_fsdev_io *fsdev_io = ctx;
1044 	struct aio_fsdev_io *vfsdev_io = fsdev_to_aio_io(fsdev_io);
1045 
1046 	if (vfsdev_io->aio) {
1047 		TAILQ_REMOVE(&vfsdev_io->ch->ios_in_progress, vfsdev_io, link);
1048 	}
1049 
1050 	fsdev_io->u_out.read.data_size = data_size;
1051 
1052 	spdk_fsdev_io_complete(fsdev_io, error);
1053 }
1054 
1055 static int
1056 lo_read(struct spdk_io_channel *_ch, struct spdk_fsdev_io *fsdev_io)
1057 {
1058 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1059 	struct aio_io_channel *ch = spdk_io_channel_get_ctx(_ch);
1060 	struct aio_fsdev_io *vfsdev_io = fsdev_to_aio_io(fsdev_io);
1061 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.read.fobject;
1062 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.read.fhandle;
1063 	size_t size = fsdev_io->u_in.read.size;
1064 	uint64_t offs = fsdev_io->u_in.read.offs;
1065 	uint32_t flags = fsdev_io->u_in.read.flags;
1066 	struct iovec *outvec = fsdev_io->u_in.read.iov;
1067 	uint32_t outcnt = fsdev_io->u_in.read.iovcnt;
1068 
1069 	/* we don't suport the memory domains at the moment */
1070 	assert(!fsdev_io->u_in.read.opts || !fsdev_io->u_in.read.opts->memory_domain);
1071 
1072 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1073 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1074 		return -EINVAL;
1075 	}
1076 
1077 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
1078 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
1079 		return -EINVAL;
1080 	}
1081 
1082 	UNUSED(flags);
1083 
1084 	if (!outcnt || !outvec) {
1085 		SPDK_ERRLOG("bad outvec: iov=%p outcnt=%" PRIu32 "\n", outvec, outcnt);
1086 		return -EINVAL;
1087 	}
1088 
1089 	if (vfsdev->skip_rw) {
1090 		uint32_t i;
1091 
1092 		fsdev_io->u_out.read.data_size = 0;
1093 
1094 		for (i = 0; i < outcnt; i++, outvec++) {
1095 			fsdev_io->u_out.read.data_size += outvec->iov_len;
1096 		}
1097 
1098 		TAILQ_INSERT_TAIL(&ch->ios_to_complete, vfsdev_io, link);
1099 
1100 		return IO_STATUS_ASYNC;
1101 	}
1102 
1103 	vfsdev_io->aio = spdk_aio_mgr_read(ch->mgr, lo_read_cb, fsdev_io, fhandle->fd, offs, size, outvec,
1104 					   outcnt);
1105 	if (vfsdev_io->aio) {
1106 		vfsdev_io->ch = ch;
1107 		TAILQ_INSERT_TAIL(&ch->ios_in_progress, vfsdev_io, link);
1108 	}
1109 
1110 	return IO_STATUS_ASYNC;
1111 }
1112 
1113 static void
1114 lo_write_cb(void *ctx, uint32_t data_size, int error)
1115 {
1116 	struct spdk_fsdev_io *fsdev_io = ctx;
1117 	struct aio_fsdev_io *vfsdev_io = fsdev_to_aio_io(fsdev_io);
1118 
1119 	if (vfsdev_io->aio) {
1120 		TAILQ_REMOVE(&vfsdev_io->ch->ios_in_progress, vfsdev_io, link);
1121 	}
1122 
1123 	fsdev_io->u_out.write.data_size = data_size;
1124 
1125 	spdk_fsdev_io_complete(fsdev_io, error);
1126 }
1127 
1128 static int
1129 lo_write(struct spdk_io_channel *_ch, struct spdk_fsdev_io *fsdev_io)
1130 {
1131 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1132 	struct aio_io_channel *ch = spdk_io_channel_get_ctx(_ch);
1133 	struct aio_fsdev_io *vfsdev_io = fsdev_to_aio_io(fsdev_io);
1134 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.write.fobject;
1135 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.write.fhandle;
1136 	size_t size = fsdev_io->u_in.write.size;
1137 	uint64_t offs = fsdev_io->u_in.write.offs;
1138 	uint32_t flags = fsdev_io->u_in.write.flags;
1139 	const struct iovec *invec = fsdev_io->u_in.write.iov;
1140 	uint32_t incnt =  fsdev_io->u_in.write.iovcnt;
1141 
1142 	/* we don't suport the memory domains at the moment */
1143 	assert(!fsdev_io->u_in.write.opts || !fsdev_io->u_in.write.opts->memory_domain);
1144 
1145 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1146 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1147 		return -EINVAL;
1148 	}
1149 
1150 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
1151 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
1152 		return -EINVAL;
1153 	}
1154 
1155 	UNUSED(flags);
1156 
1157 	if (!incnt || !invec) { /* there should be at least one iovec with data */
1158 		SPDK_ERRLOG("bad invec: iov=%p cnt=%" PRIu32 "\n", invec, incnt);
1159 		return -EINVAL;
1160 	}
1161 
1162 	if (vfsdev->skip_rw) {
1163 		uint32_t i;
1164 
1165 		fsdev_io->u_out.write.data_size = 0;
1166 		for (i = 0; i < incnt; i++, invec++) {
1167 			fsdev_io->u_out.write.data_size += invec->iov_len;
1168 		}
1169 
1170 		TAILQ_INSERT_TAIL(&ch->ios_to_complete, vfsdev_io, link);
1171 
1172 		return IO_STATUS_ASYNC;
1173 	}
1174 
1175 	vfsdev_io->aio = spdk_aio_mgr_write(ch->mgr, lo_write_cb, fsdev_io,
1176 					    fhandle->fd, offs, size, invec, incnt);
1177 	if (vfsdev_io->aio) {
1178 		vfsdev_io->ch = ch;
1179 		TAILQ_INSERT_TAIL(&ch->ios_in_progress, vfsdev_io, link);
1180 	}
1181 
1182 	return IO_STATUS_ASYNC;
1183 }
1184 
1185 static int
1186 lo_readlink(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1187 {
1188 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1189 	int res;
1190 	char *buf;
1191 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.readlink.fobject;
1192 
1193 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1194 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1195 		return -EINVAL;
1196 	}
1197 
1198 	buf = malloc(PATH_MAX + 1);
1199 	if (!buf) {
1200 		SPDK_ERRLOG("malloc(%zu) failed\n", (size_t)(PATH_MAX + 1));
1201 		return -ENOMEM;
1202 	}
1203 
1204 	res = readlinkat(fobject->fd, "", buf, PATH_MAX + 1);
1205 	if (res == -1) {
1206 		int saverr = -errno;
1207 		SPDK_ERRLOG("readlinkat failed for " FOBJECT_FMT " with %d\n",
1208 			    FOBJECT_ARGS(fobject), saverr);
1209 		free(buf);
1210 		return saverr;
1211 	}
1212 
1213 	if (((uint32_t)res) == PATH_MAX + 1) {
1214 		SPDK_ERRLOG("buffer is too short\n");
1215 		free(buf);
1216 		return -ENAMETOOLONG;
1217 	}
1218 
1219 	buf[res] = 0;
1220 	fsdev_io->u_out.readlink.linkname = buf;
1221 
1222 	return 0;
1223 }
1224 
1225 static int
1226 lo_statfs(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1227 {
1228 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1229 	int res;
1230 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.statfs.fobject;
1231 	struct statvfs stbuf;
1232 
1233 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1234 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1235 		return -EINVAL;
1236 	}
1237 
1238 	res = fstatvfs(fobject->fd, &stbuf);
1239 	if (res == -1) {
1240 		int saverr = -errno;
1241 		SPDK_ERRLOG("fstatvfs failed with %d\n", saverr);
1242 		return saverr;
1243 	}
1244 
1245 	fsdev_io->u_out.statfs.statfs.blocks = stbuf.f_blocks;
1246 	fsdev_io->u_out.statfs.statfs.bfree = stbuf.f_bfree;
1247 	fsdev_io->u_out.statfs.statfs.bavail = stbuf.f_bavail;
1248 	fsdev_io->u_out.statfs.statfs.files = stbuf.f_files;
1249 	fsdev_io->u_out.statfs.statfs.ffree = stbuf.f_ffree;
1250 	fsdev_io->u_out.statfs.statfs.bsize = stbuf.f_bsize;
1251 	fsdev_io->u_out.statfs.statfs.namelen = stbuf.f_namemax;
1252 	fsdev_io->u_out.statfs.statfs.frsize = stbuf.f_frsize;
1253 
1254 	return 0;
1255 }
1256 
1257 static int
1258 lo_mknod_symlink(struct spdk_fsdev_io *fsdev_io, struct spdk_fsdev_file_object *parent_fobject,
1259 		 const char *name, mode_t mode, dev_t rdev, const char *link, uid_t euid, gid_t egid,
1260 		 struct spdk_fsdev_file_object **pfobject, struct spdk_fsdev_file_attr *attr)
1261 {
1262 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1263 	int res;
1264 	int saverr;
1265 	struct lo_cred old_cred, new_cred = {
1266 		.euid = euid,
1267 		.egid = egid,
1268 	};
1269 
1270 	if (!fsdev_aio_is_valid_fobject(vfsdev, parent_fobject)) {
1271 		SPDK_ERRLOG("Invalid parent_fobject: %p\n", parent_fobject);
1272 		return -EINVAL;
1273 	}
1274 
1275 	if (!is_safe_path_component(name)) {
1276 		SPDK_ERRLOG("%s isn'h safe\n", name);
1277 		return -EINVAL;
1278 	}
1279 
1280 	res = lo_change_cred(&new_cred, &old_cred);
1281 	if (res) {
1282 		SPDK_ERRLOG("cannot change cred (err=%d)\n", res);
1283 		return res;
1284 	}
1285 
1286 	if (S_ISDIR(mode)) {
1287 		res = mkdirat(parent_fobject->fd, name, mode);
1288 	} else if (S_ISLNK(mode)) {
1289 		if (link) {
1290 			res = symlinkat(link, parent_fobject->fd, name);
1291 		} else {
1292 			SPDK_ERRLOG("NULL link pointer\n");
1293 			errno = EINVAL;
1294 		}
1295 	} else {
1296 		res = mknodat(parent_fobject->fd, name, mode, rdev);
1297 	}
1298 	saverr = -errno;
1299 
1300 	lo_restore_cred(&old_cred);
1301 
1302 	if (res == -1) {
1303 		SPDK_ERRLOG("cannot mkdirat/symlinkat/mknodat (err=%d)\n", saverr);
1304 		return saverr;
1305 	}
1306 
1307 	res = lo_do_lookup(vfsdev, parent_fobject, name, pfobject, attr);
1308 	if (res) {
1309 		SPDK_ERRLOG("lookup failed (err=%d)\n", res);
1310 		return res;
1311 	}
1312 
1313 	SPDK_DEBUGLOG(fsdev_aio, "lo_mknod_symlink(" FOBJECT_FMT "/%s -> " FOBJECT_FMT "\n",
1314 		      FOBJECT_ARGS(parent_fobject), name, FOBJECT_ARGS(*pfobject));
1315 
1316 	return 0;
1317 }
1318 
1319 static int
1320 lo_mknod(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1321 {
1322 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.mknod.parent_fobject;
1323 	char *name = fsdev_io->u_in.mknod.name;
1324 	mode_t mode = fsdev_io->u_in.mknod.mode;
1325 	dev_t rdev = fsdev_io->u_in.mknod.rdev;
1326 	uid_t euid = fsdev_io->u_in.mknod.euid;
1327 	gid_t egid = fsdev_io->u_in.mknod.egid;
1328 
1329 	return lo_mknod_symlink(fsdev_io, parent_fobject, name, mode, rdev, NULL, euid, egid,
1330 				&fsdev_io->u_out.mknod.fobject, &fsdev_io->u_out.mknod.attr);
1331 }
1332 
1333 static int
1334 lo_mkdir(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1335 {
1336 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.mkdir.parent_fobject;
1337 	char *name = fsdev_io->u_in.mkdir.name;
1338 	mode_t mode = fsdev_io->u_in.mkdir.mode;
1339 	uid_t euid = fsdev_io->u_in.mkdir.euid;
1340 	gid_t egid = fsdev_io->u_in.mkdir.egid;
1341 
1342 	return lo_mknod_symlink(fsdev_io, parent_fobject, name, S_IFDIR | mode, 0, NULL, euid, egid,
1343 				&fsdev_io->u_out.mkdir.fobject, &fsdev_io->u_out.mkdir.attr);
1344 }
1345 
1346 static int
1347 lo_symlink(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1348 {
1349 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.symlink.parent_fobject;
1350 	char *target = fsdev_io->u_in.symlink.target;
1351 	char *linkpath = fsdev_io->u_in.symlink.linkpath;
1352 	uid_t euid = fsdev_io->u_in.symlink.euid;
1353 	gid_t egid = fsdev_io->u_in.symlink.egid;
1354 
1355 	return lo_mknod_symlink(fsdev_io, parent_fobject, target, S_IFLNK, 0, linkpath, euid, egid,
1356 				&fsdev_io->u_out.symlink.fobject, &fsdev_io->u_out.symlink.attr);
1357 }
1358 
1359 static int
1360 lo_do_unlink(struct aio_fsdev *vfsdev, struct spdk_fsdev_file_object *parent_fobject,
1361 	     const char *name, bool is_dir)
1362 {
1363 	/* fobject must be initialized to avoid a scan-build false positive */
1364 	struct spdk_fsdev_file_object *fobject = NULL;
1365 	int res;
1366 
1367 	if (!fsdev_aio_is_valid_fobject(vfsdev, parent_fobject)) {
1368 		SPDK_ERRLOG("Invalid parent_fobject: %p\n", parent_fobject);
1369 		return -EINVAL;
1370 	}
1371 
1372 	if (!is_safe_path_component(name)) {
1373 		SPDK_ERRLOG("%s isn't safe\n", name);
1374 		return -EINVAL;
1375 	}
1376 
1377 	res = lo_do_lookup(vfsdev, parent_fobject, name, &fobject, NULL);
1378 	if (res) {
1379 		SPDK_ERRLOG("can't find '%s' under " FOBJECT_FMT "\n", name, FOBJECT_ARGS(parent_fobject));
1380 		return -EIO;
1381 	}
1382 
1383 	res = unlinkat(parent_fobject->fd, name, is_dir ? AT_REMOVEDIR : 0);
1384 	if (res) {
1385 		res = -errno;
1386 		SPDK_WARNLOG("unlinkat(" FOBJECT_FMT " %s) failed (err=%d)\n",
1387 			     FOBJECT_ARGS(parent_fobject), name, res);
1388 	}
1389 
1390 	file_object_unref(fobject, 1);
1391 	return res;
1392 }
1393 
1394 static int
1395 lo_unlink(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1396 {
1397 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1398 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.unlink.parent_fobject;
1399 	char *name = fsdev_io->u_in.unlink.name;
1400 
1401 	return lo_do_unlink(vfsdev, parent_fobject, name, false);
1402 }
1403 
1404 static int
1405 lo_rmdir(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1406 {
1407 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1408 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.rmdir.parent_fobject;
1409 	char *name = fsdev_io->u_in.rmdir.name;
1410 
1411 	return lo_do_unlink(vfsdev, parent_fobject, name, true);
1412 }
1413 
1414 static int
1415 lo_rename(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1416 {
1417 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1418 	int res, saverr;
1419 	/* old_fobject must be initialized to avoid a scan-build false positive */
1420 	struct spdk_fsdev_file_object *old_fobject = NULL;
1421 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.rename.parent_fobject;
1422 	char *name = fsdev_io->u_in.rename.name;
1423 	struct spdk_fsdev_file_object *new_parent_fobject = fsdev_io->u_in.rename.new_parent_fobject;
1424 	char *new_name = fsdev_io->u_in.rename.new_name;
1425 	uint32_t flags = fsdev_io->u_in.rename.flags;
1426 
1427 	if (!fsdev_aio_is_valid_fobject(vfsdev, parent_fobject)) {
1428 		SPDK_ERRLOG("Invalid parent_fobject: %p\n", parent_fobject);
1429 		return -EINVAL;
1430 	}
1431 
1432 	if (!fsdev_aio_is_valid_fobject(vfsdev, new_parent_fobject)) {
1433 		SPDK_ERRLOG("Invalid new_parent_fobject: %p\n", new_parent_fobject);
1434 		return -EINVAL;
1435 	}
1436 
1437 	if (!is_safe_path_component(name)) {
1438 		SPDK_ERRLOG("name '%s' isn't safe\n", name);
1439 		return -EINVAL;
1440 	}
1441 
1442 	if (!is_safe_path_component(new_name)) {
1443 		SPDK_ERRLOG("newname '%s' isn't safe\n", new_name);
1444 		return -EINVAL;
1445 	}
1446 
1447 	res = lo_do_lookup(vfsdev, parent_fobject, name, &old_fobject, NULL);
1448 	if (res) {
1449 		SPDK_ERRLOG("can't find '%s' under " FOBJECT_FMT "\n", name, FOBJECT_ARGS(parent_fobject));
1450 		return -EIO;
1451 	}
1452 
1453 	saverr = 0;
1454 	if (flags) {
1455 #ifndef SYS_renameat2
1456 		SPDK_ERRLOG("flags are not supported\n");
1457 		return -ENOTSUP;
1458 #else
1459 		res = syscall(SYS_renameat2, parent_fobject->fd, name, new_parent_fobject->fd,
1460 			      new_name, flags);
1461 		if (res == -1 && errno == ENOSYS) {
1462 			SPDK_ERRLOG("SYS_renameat2 returned ENOSYS\n");
1463 			saverr = -EINVAL;
1464 		} else if (res == -1) {
1465 			saverr = -errno;
1466 			SPDK_ERRLOG("SYS_renameat2 failed (err=%d))\n", saverr);
1467 		}
1468 #endif
1469 	} else {
1470 		res = renameat(parent_fobject->fd, name, new_parent_fobject->fd, new_name);
1471 		if (res == -1) {
1472 			saverr = -errno;
1473 			SPDK_ERRLOG("renameat failed (err=%d)\n", saverr);
1474 		}
1475 	}
1476 
1477 	file_object_unref(old_fobject, 1);
1478 
1479 	return saverr;
1480 }
1481 
1482 static int
1483 linkat_empty_nofollow(struct aio_fsdev *vfsdev, struct spdk_fsdev_file_object *fobject, int dfd,
1484 		      const char *name)
1485 {
1486 	int res;
1487 
1488 	if (fobject->is_symlink) {
1489 		res = linkat(fobject->fd, "", dfd, name, AT_EMPTY_PATH);
1490 		if (res == -1 && (errno == ENOENT || errno == EINVAL)) {
1491 			/* Sorry, no race free way to hard-link a symlink. */
1492 			errno = EPERM;
1493 		}
1494 	} else {
1495 		res = linkat(vfsdev->proc_self_fd, fobject->fd_str, dfd, name, AT_SYMLINK_FOLLOW);
1496 	}
1497 
1498 	return res;
1499 }
1500 
1501 static int
1502 lo_link(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1503 {
1504 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1505 	int res;
1506 	int saverr;
1507 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.link.fobject;
1508 	struct spdk_fsdev_file_object *new_parent_fobject = fsdev_io->u_in.link.new_parent_fobject;
1509 	char *name = fsdev_io->u_in.link.name;
1510 
1511 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1512 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1513 		return -EINVAL;
1514 	}
1515 
1516 	if (!is_safe_path_component(name)) {
1517 		SPDK_ERRLOG("%s is not a safe component\n", name);
1518 		return -EINVAL;
1519 	}
1520 
1521 	res = linkat_empty_nofollow(vfsdev, fobject, new_parent_fobject->fd, name);
1522 	if (res == -1) {
1523 		saverr = -errno;
1524 		SPDK_ERRLOG("linkat_empty_nofollow failed " FOBJECT_FMT " -> " FOBJECT_FMT " name=%s (err=%d)\n",
1525 			    FOBJECT_ARGS(fobject), FOBJECT_ARGS(new_parent_fobject), name, saverr);
1526 		return saverr;
1527 	}
1528 
1529 	res = lo_do_lookup(vfsdev, new_parent_fobject, name, &fsdev_io->u_out.link.fobject,
1530 			   &fsdev_io->u_out.link.attr);
1531 	if (res) {
1532 		SPDK_ERRLOG("lookup failed (err=%d)\n", res);
1533 		return res;
1534 	}
1535 
1536 	SPDK_DEBUGLOG(fsdev_aio, "LINK succeeded for " FOBJECT_FMT " -> " FOBJECT_FMT " name=%s\n",
1537 		      FOBJECT_ARGS(fobject), FOBJECT_ARGS(fsdev_io->u_out.link.fobject), name);
1538 
1539 	return 0;
1540 }
1541 
1542 static int
1543 lo_fsync(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1544 {
1545 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1546 	int res, saverr, fd;
1547 	char *buf;
1548 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.fsync.fobject;
1549 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.fsync.fhandle;
1550 	bool datasync = fsdev_io->u_in.fsync.datasync;
1551 
1552 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1553 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1554 		return -EINVAL;
1555 	}
1556 
1557 	if (!fhandle) {
1558 		res = asprintf(&buf, "%i", fobject->fd);
1559 		if (res == -1) {
1560 			saverr = -errno;
1561 			SPDK_ERRLOG("asprintf failed (errno=%d)\n", saverr);
1562 			return saverr;
1563 		}
1564 
1565 		fd = openat(vfsdev->proc_self_fd, buf, O_RDWR);
1566 		saverr = -errno;
1567 		free(buf);
1568 		if (fd == -1) {
1569 			SPDK_ERRLOG("openat failed (errno=%d)\n", saverr);
1570 			return saverr;
1571 		}
1572 	} else {
1573 		fd = fhandle->fd;
1574 	}
1575 
1576 	if (datasync) {
1577 		res = fdatasync(fd);
1578 	} else {
1579 		res = fsync(fd);
1580 	}
1581 
1582 	saverr = -errno;
1583 	if (!fhandle) {
1584 		close(fd);
1585 	}
1586 
1587 	if (res == -1) {
1588 		SPDK_ERRLOG("fdatasync/fsync failed for " FOBJECT_FMT " fh=%p (err=%d)\n",
1589 			    FOBJECT_ARGS(fobject), fhandle, saverr);
1590 		return saverr;
1591 	}
1592 
1593 	SPDK_DEBUGLOG(fsdev_aio, "FSYNC succeeded for " FOBJECT_FMT " fh=%p\n",
1594 		      FOBJECT_ARGS(fobject), fhandle);
1595 
1596 	return 0;
1597 }
1598 
1599 static int
1600 lo_setxattr(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1601 {
1602 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1603 	ssize_t ret;
1604 	int saverr;
1605 	int fd = -1;
1606 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.setxattr.fobject;
1607 	char *name = fsdev_io->u_in.setxattr.name;
1608 	char *value = fsdev_io->u_in.setxattr.value;
1609 	uint32_t size = fsdev_io->u_in.setxattr.size;
1610 	uint32_t flags = fsdev_io->u_in.setxattr.flags;
1611 
1612 	if (!vfsdev->xattr_enabled) {
1613 		SPDK_INFOLOG(fsdev_aio, "xattr is disabled by config\n");
1614 		return -ENOSYS;
1615 	}
1616 
1617 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1618 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1619 		return -EINVAL;
1620 	}
1621 
1622 	if (fobject->is_symlink) {
1623 		/* Sorry, no race free way to removexattr on symlink. */
1624 		SPDK_ERRLOG("cannot set xattr for symlink\n");
1625 		return -EPERM;
1626 	}
1627 
1628 	fd = openat(vfsdev->proc_self_fd, fobject->fd_str, O_RDWR);
1629 	if (fd < 0) {
1630 		saverr = -errno;
1631 		SPDK_ERRLOG("openat failed with errno=%d\n", saverr);
1632 		return saverr;
1633 	}
1634 
1635 	ret = fsetxattr(fd, name, value, size, flags);
1636 	saverr = -errno;
1637 	close(fd);
1638 	if (ret == -1) {
1639 		if (saverr == -ENOTSUP) {
1640 			SPDK_INFOLOG(fsdev_aio, "flistxattr: extended attributes are not supported or disabled\n");
1641 		} else {
1642 			SPDK_ERRLOG("flistxattr failed with errno=%d\n", saverr);
1643 		}
1644 		return saverr;
1645 	}
1646 
1647 	SPDK_DEBUGLOG(fsdev_aio,
1648 		      "SETXATTR succeeded for " FOBJECT_FMT " name=%s value=%s size=%" PRIu32 "flags=0x%x" PRIx32 "\n",
1649 		      FOBJECT_ARGS(fobject), name, value, size, flags);
1650 
1651 	return 0;
1652 }
1653 
1654 static int
1655 lo_getxattr(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1656 {
1657 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1658 	ssize_t ret;
1659 	int saverr;
1660 	int fd = -1;
1661 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.getxattr.fobject;
1662 	char *name = fsdev_io->u_in.getxattr.name;
1663 	void *buffer = fsdev_io->u_in.getxattr.buffer;
1664 	size_t size = fsdev_io->u_in.getxattr.size;
1665 
1666 	if (!vfsdev->xattr_enabled) {
1667 		SPDK_INFOLOG(fsdev_aio, "xattr is disabled by config\n");
1668 		return -ENOSYS;
1669 	}
1670 
1671 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1672 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1673 		return -EINVAL;
1674 	}
1675 
1676 	if (fobject->is_symlink) {
1677 		/* Sorry, no race free way to getxattr on symlink. */
1678 		SPDK_ERRLOG("cannot get xattr for symlink\n");
1679 		return -EPERM;
1680 	}
1681 
1682 	fd = openat(vfsdev->proc_self_fd, fobject->fd_str, O_RDWR);
1683 	if (fd < 0) {
1684 		saverr = -errno;
1685 		SPDK_ERRLOG("openat failed with errno=%d\n", saverr);
1686 		return saverr;
1687 	}
1688 
1689 	ret = fgetxattr(fd, name, buffer, size);
1690 	saverr = -errno;
1691 	close(fd);
1692 	if (ret == -1) {
1693 		if (saverr == -ENODATA) {
1694 			SPDK_INFOLOG(fsdev_aio, "fgetxattr: no extended attribute '%s' found\n", name);
1695 		} else if (saverr == -ENOTSUP) {
1696 			SPDK_INFOLOG(fsdev_aio, "fgetxattr: extended attributes are not supported or disabled\n");
1697 		} else {
1698 			SPDK_ERRLOG("fgetxattr failed with errno=%d\n", saverr);
1699 		}
1700 		return saverr;
1701 	}
1702 
1703 	fsdev_io->u_out.getxattr.value_size = ret;
1704 
1705 	SPDK_DEBUGLOG(fsdev_aio,
1706 		      "GETXATTR succeeded for " FOBJECT_FMT " name=%s value=%s value_size=%zd\n",
1707 		      FOBJECT_ARGS(fobject), name, (char *)buffer, ret);
1708 
1709 	return 0;
1710 }
1711 
1712 static int
1713 lo_listxattr(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1714 {
1715 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1716 	ssize_t ret;
1717 	int saverr;
1718 	int fd = -1;
1719 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.listxattr.fobject;
1720 	char *buffer = fsdev_io->u_in.listxattr.buffer;
1721 	size_t size = fsdev_io->u_in.listxattr.size;
1722 
1723 	if (!vfsdev->xattr_enabled) {
1724 		SPDK_INFOLOG(fsdev_aio, "xattr is disabled by config\n");
1725 		return -ENOSYS;
1726 	}
1727 
1728 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1729 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1730 		return -EINVAL;
1731 	}
1732 
1733 	if (fobject->is_symlink) {
1734 		/* Sorry, no race free way to listxattr on symlink. */
1735 		SPDK_ERRLOG("cannot list xattr for symlink\n");
1736 		return -EPERM;
1737 	}
1738 
1739 	fd = openat(vfsdev->proc_self_fd, fobject->fd_str, O_RDONLY);
1740 	if (fd < 0) {
1741 		saverr = -errno;
1742 		SPDK_ERRLOG("openat failed with errno=%d\n", saverr);
1743 		return saverr;
1744 	}
1745 
1746 	ret = flistxattr(fd, buffer, size);
1747 	saverr = -errno;
1748 	close(fd);
1749 	if (ret == -1) {
1750 		if (saverr == -ENOTSUP) {
1751 			SPDK_INFOLOG(fsdev_aio, "flistxattr: extended attributes are not supported or disabled\n");
1752 		} else {
1753 			SPDK_ERRLOG("flistxattr failed with errno=%d\n", saverr);
1754 		}
1755 		return saverr;
1756 	}
1757 
1758 	fsdev_io->u_out.listxattr.data_size = ret;
1759 	fsdev_io->u_out.listxattr.size_only = (size == 0);
1760 
1761 	SPDK_DEBUGLOG(fsdev_aio, "LISTXATTR succeeded for " FOBJECT_FMT " data_size=%zu\n",
1762 		      FOBJECT_ARGS(fobject), ret);
1763 
1764 	return 0;
1765 }
1766 
1767 static int
1768 lo_removexattr(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1769 {
1770 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1771 	ssize_t ret;
1772 	int saverr;
1773 	int fd = -1;
1774 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.removexattr.fobject;
1775 	char *name = fsdev_io->u_in.removexattr.name;
1776 
1777 	if (!vfsdev->xattr_enabled) {
1778 		SPDK_INFOLOG(fsdev_aio, "xattr is disabled by config\n");
1779 		return -ENOSYS;
1780 	}
1781 
1782 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1783 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1784 		return -EINVAL;
1785 	}
1786 
1787 	if (fobject->is_symlink) {
1788 		/* Sorry, no race free way to setxattr on symlink. */
1789 		SPDK_ERRLOG("cannot list xattr for symlink\n");
1790 		return -EPERM;
1791 	}
1792 
1793 	fd = openat(vfsdev->proc_self_fd, fobject->fd_str, O_RDONLY);
1794 	if (fd < 0) {
1795 		saverr = -errno;
1796 		SPDK_ERRLOG("openat failed with errno=%d\n", saverr);
1797 		return saverr;
1798 	}
1799 
1800 	ret = fremovexattr(fd, name);
1801 	saverr = -errno;
1802 	close(fd);
1803 	if (ret == -1) {
1804 		if (saverr == -ENODATA) {
1805 			SPDK_INFOLOG(fsdev_aio, "fremovexattr: no extended attribute '%s' found\n", name);
1806 		} else if (saverr == -ENOTSUP) {
1807 			SPDK_INFOLOG(fsdev_aio, "fremovexattr: extended attributes are not supported or disabled\n");
1808 		} else {
1809 			SPDK_ERRLOG("fremovexattr failed with errno=%d\n", saverr);
1810 		}
1811 		return saverr;
1812 	}
1813 
1814 	SPDK_DEBUGLOG(fsdev_aio, "REMOVEXATTR succeeded for " FOBJECT_FMT " name=%s\n",
1815 		      FOBJECT_ARGS(fobject), name);
1816 
1817 	return 0;
1818 }
1819 
1820 static int
1821 lo_fsyncdir(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1822 {
1823 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1824 	int res;
1825 	int saverr = 0;
1826 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.fsyncdir.fobject;
1827 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.fsyncdir.fhandle;
1828 	bool datasync = fsdev_io->u_in.fsyncdir.datasync;
1829 
1830 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1831 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1832 		return -EINVAL;
1833 	}
1834 
1835 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
1836 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
1837 		return -EINVAL;
1838 	}
1839 
1840 	if (datasync) {
1841 		res = fdatasync(fhandle->fd);
1842 	} else {
1843 		res = fsync(fhandle->fd);
1844 	}
1845 
1846 	if (res == -1) {
1847 		saverr = -errno;
1848 		SPDK_ERRLOG("%s failed for fh=%p with err=%d\n",
1849 			    datasync ? "fdatasync" : "fsync", fhandle, saverr);
1850 		return saverr;
1851 	}
1852 
1853 	SPDK_DEBUGLOG(fsdev_aio, "FSYNCDIR succeeded for " FOBJECT_FMT " fh=%p datasync=%d\n",
1854 		      FOBJECT_ARGS(fobject), fhandle, datasync);
1855 
1856 	return 0;
1857 }
1858 
1859 static int
1860 lo_flock(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1861 {
1862 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1863 	int res;
1864 	int saverr = 0;
1865 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.flock.fobject;
1866 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.flock.fhandle;
1867 	int operation = fsdev_io->u_in.flock.operation;
1868 
1869 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1870 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1871 		return -EINVAL;
1872 	}
1873 
1874 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
1875 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
1876 		return -EINVAL;
1877 	}
1878 
1879 	res = flock(fhandle->fd, operation | LOCK_NB);
1880 	if (res == -1) {
1881 		saverr = -errno;
1882 		SPDK_ERRLOG("flock failed for fh=%p with err=%d\n", fhandle, saverr);
1883 		return saverr;
1884 	}
1885 
1886 	SPDK_DEBUGLOG(fsdev_aio, "FLOCK succeeded for " FOBJECT_FMT " fh=%p operation=%d\n",
1887 		      FOBJECT_ARGS(fobject), fhandle, operation);
1888 
1889 	return 0;
1890 }
1891 
1892 static int
1893 lo_fallocate(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1894 {
1895 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1896 	int err;
1897 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.fallocate.fobject;
1898 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.fallocate.fhandle;
1899 	uint32_t mode = fsdev_io->u_in.fallocate.mode;
1900 	uint64_t offset  = fsdev_io->u_in.fallocate.offset;
1901 	uint64_t length = fsdev_io->u_in.fallocate.length;
1902 
1903 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1904 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1905 		return -EINVAL;
1906 	}
1907 
1908 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
1909 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
1910 		return -EINVAL;
1911 	}
1912 
1913 	if (mode) {
1914 		SPDK_ERRLOG("non-zero mode is not suppored\n");
1915 		return -EOPNOTSUPP;
1916 	}
1917 
1918 	err = posix_fallocate(fhandle->fd, offset, length);
1919 	if (err) {
1920 		SPDK_ERRLOG("posix_fallocate failed for fh=%p with err=%d\n",
1921 			    fhandle, err);
1922 	}
1923 
1924 	SPDK_DEBUGLOG(fsdev_aio,
1925 		      "FALLOCATE returns %d for " FOBJECT_FMT " fh=%p offset=%" PRIu64 " length=%" PRIu64 "\n",
1926 		      err, FOBJECT_ARGS(fobject), fhandle, offset, length);
1927 	return err;
1928 }
1929 
/*
 * COPY_FILE_RANGE handler.
 *
 * When built with SPDK_CONFIG_COPY_FILE_RANGE, validates both file objects
 * and both file handles and calls copy_file_range() between the two handle
 * fds. Without that config option the handler returns -ENOSYS.
 *
 * Returns 0 on success, -EINVAL for any invalid object or handle, or the
 * negated errno of copy_file_range().
 */
static int
lo_copy_file_range(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
{
#ifdef SPDK_CONFIG_COPY_FILE_RANGE
	struct aio_fsdev *aio_dev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
	struct spdk_fsdev_file_object *src_obj = fsdev_io->u_in.copy_file_range.fobject_in;
	struct spdk_fsdev_file_handle *src_fh = fsdev_io->u_in.copy_file_range.fhandle_in;
	off_t src_off = fsdev_io->u_in.copy_file_range.off_in;
	struct spdk_fsdev_file_object *dst_obj = fsdev_io->u_in.copy_file_range.fobject_out;
	struct spdk_fsdev_file_handle *dst_fh = fsdev_io->u_in.copy_file_range.fhandle_out;
	off_t dst_off = fsdev_io->u_in.copy_file_range.off_out;
	size_t len = fsdev_io->u_in.copy_file_range.len;
	uint32_t flags = fsdev_io->u_in.copy_file_range.flags;
	ssize_t copied;

	/* Source side */
	if (!fsdev_aio_is_valid_fobject(aio_dev, src_obj)) {
		SPDK_ERRLOG("Invalid fobject_in: %p\n", src_obj);
		return -EINVAL;
	}

	if (!fsdev_aio_is_valid_fhandle(aio_dev, src_fh)) {
		SPDK_ERRLOG("Invalid fhandle_in: %p\n", src_fh);
		return -EINVAL;
	}

	/* Destination side */
	if (!fsdev_aio_is_valid_fobject(aio_dev, dst_obj)) {
		SPDK_ERRLOG("Invalid fobject_out: %p\n", dst_obj);
		return -EINVAL;
	}

	if (!fsdev_aio_is_valid_fhandle(aio_dev, dst_fh)) {
		SPDK_ERRLOG("Invalid fhandle_out: %p\n", dst_fh);
		return -EINVAL;
	}

	/*
	 * NOTE(review): the byte count returned by copy_file_range() is only
	 * checked for failure and is not stored in the request output here -
	 * confirm whether the caller expects it.
	 */
	copied = copy_file_range(src_fh->fd, &src_off, dst_fh->fd, &dst_off, len, flags);
	if (copied < 0) {
		int err = -errno;

		SPDK_ERRLOG("copy_file_range failed with err=%d\n", err);
		return err;
	}

	/* The offsets printed are the values left in place by copy_file_range() */
	SPDK_DEBUGLOG(fsdev_aio,
		      "COPY_FILE_RANGE succeeded for " FOBJECT_FMT " fh=%p offset=%" PRIu64 " -> " FOBJECT_FMT
		      " fh=%p offset=%" PRIu64 " (len-%zu flags=0x%" PRIx32 ")\n",
		      FOBJECT_ARGS(src_obj), src_fh, (uint64_t)src_off, FOBJECT_ARGS(dst_obj), dst_fh,
		      (uint64_t)dst_off, len, flags);

	return 0;
#else
	return -ENOSYS;
#endif
}
1984 
1985 static int
1986 lo_abort(struct spdk_io_channel *_ch, struct spdk_fsdev_io *fsdev_io)
1987 {
1988 	struct aio_io_channel *ch = spdk_io_channel_get_ctx(_ch);
1989 	struct aio_fsdev_io *vfsdev_io;
1990 	uint64_t unique_to_abort = fsdev_io->u_in.abort.unique_to_abort;
1991 
1992 	TAILQ_FOREACH(vfsdev_io, &ch->ios_in_progress, link) {
1993 		struct spdk_fsdev_io *_fsdev_io = aio_to_fsdev_io(vfsdev_io);
1994 		if (spdk_fsdev_io_get_unique(_fsdev_io) == unique_to_abort) {
1995 			spdk_aio_mgr_cancel(ch->mgr, vfsdev_io->aio);
1996 			return 0;
1997 		}
1998 	}
1999 
2000 	return 0;
2001 }
2002 
2003 static int
2004 aio_io_poll(void *arg)
2005 {
2006 	struct aio_fsdev_io *vfsdev_io, *tmp;
2007 	struct aio_io_channel *ch = arg;
2008 	int res = SPDK_POLLER_IDLE;
2009 
2010 	if (spdk_aio_mgr_poll(ch->mgr)) {
2011 		res = SPDK_POLLER_BUSY;
2012 	}
2013 
2014 	TAILQ_FOREACH_SAFE(vfsdev_io, &ch->ios_to_complete, link, tmp) {
2015 		struct spdk_fsdev_io *fsdev_io = aio_to_fsdev_io(vfsdev_io);
2016 
2017 		TAILQ_REMOVE(&ch->ios_to_complete, vfsdev_io, link);
2018 		spdk_fsdev_io_complete(fsdev_io, 0);
2019 		res = SPDK_POLLER_BUSY;
2020 	}
2021 
2022 	return res;
2023 }
2024 
2025 static int
2026 aio_fsdev_create_cb(void *io_device, void *ctx_buf)
2027 {
2028 	struct aio_io_channel *ch = ctx_buf;
2029 	struct spdk_thread *thread = spdk_get_thread();
2030 
2031 	ch->mgr = spdk_aio_mgr_create(MAX_AIOS);
2032 	if (!ch->mgr) {
2033 		SPDK_ERRLOG("aoi manager init for failed (thread=%s)\n", spdk_thread_get_name(thread));
2034 		return -ENOMEM;
2035 	}
2036 
2037 	ch->poller = SPDK_POLLER_REGISTER(aio_io_poll, ch, 0);
2038 	TAILQ_INIT(&ch->ios_in_progress);
2039 	TAILQ_INIT(&ch->ios_to_complete);
2040 
2041 	SPDK_DEBUGLOG(fsdev_aio, "Created aio fsdev IO channel: thread %s, thread id %" PRIu64
2042 		      "\n",
2043 		      spdk_thread_get_name(thread), spdk_thread_get_id(thread));
2044 	return 0;
2045 }
2046 
2047 static void
2048 aio_fsdev_destroy_cb(void *io_device, void *ctx_buf)
2049 {
2050 	struct aio_io_channel *ch = ctx_buf;
2051 	struct spdk_thread *thread = spdk_get_thread();
2052 
2053 	UNUSED(thread);
2054 
2055 	spdk_poller_unregister(&ch->poller);
2056 	spdk_aio_mgr_delete(ch->mgr);
2057 
2058 	SPDK_DEBUGLOG(fsdev_aio, "Destroyed aio fsdev IO channel: thread %s, thread id %" PRIu64
2059 		      "\n",
2060 		      spdk_thread_get_name(thread), spdk_thread_get_id(thread));
2061 }
2062 
2063 static int
2064 fsdev_aio_initialize(void)
2065 {
2066 	/*
2067 	 * We need to pick some unique address as our "io device" - so just use the
2068 	 *  address of the global tailq.
2069 	 */
2070 	spdk_io_device_register(&g_aio_fsdev_head,
2071 				aio_fsdev_create_cb, aio_fsdev_destroy_cb,
2072 				sizeof(struct aio_io_channel), "aio_fsdev");
2073 
2074 	return 0;
2075 }
2076 
/*
 * Completion callback passed to spdk_io_device_unregister() by fsdev_aio_finish().
 * Currently a no-op.
 */
static void
_fsdev_aio_finish_cb(void *arg)
{
	UNUSED(arg);

	/* @todo: handle async module fini */
	/* spdk_fsdev_module_fini_done(); */
}
2083 
2084 static void
2085 fsdev_aio_finish(void)
2086 {
2087 	spdk_io_device_unregister(&g_aio_fsdev_head, _fsdev_aio_finish_cb);
2088 }
2089 
2090 static int
2091 fsdev_aio_get_ctx_size(void)
2092 {
2093 	return sizeof(struct aio_fsdev_io);
2094 }
2095 
2096 static struct spdk_fsdev_module aio_fsdev_module = {
2097 	.name = "aio",
2098 	.module_init = fsdev_aio_initialize,
2099 	.module_fini = fsdev_aio_finish,
2100 	.get_ctx_size	= fsdev_aio_get_ctx_size,
2101 };
2102 
2103 SPDK_FSDEV_MODULE_REGISTER(aio, &aio_fsdev_module);
2104 
2105 static void
2106 fsdev_aio_free(struct aio_fsdev *vfsdev)
2107 {
2108 	if (vfsdev->proc_self_fd != -1) {
2109 		close(vfsdev->proc_self_fd);
2110 	}
2111 
2112 	if (vfsdev->root) {
2113 		int destroyed = file_object_unref(vfsdev->root, 1);
2114 		assert(destroyed == 0);
2115 		UNUSED(destroyed);
2116 
2117 	}
2118 
2119 	free(vfsdev->fsdev.name);
2120 	free(vfsdev->root_path);
2121 
2122 	free(vfsdev);
2123 }
2124 
2125 static void
2126 fsdev_free_leafs(struct spdk_fsdev_file_object *fobject)
2127 {
2128 	while (!TAILQ_EMPTY(&fobject->handles)) {
2129 		struct spdk_fsdev_file_handle *fhandle = TAILQ_FIRST(&fobject->handles);
2130 		file_handle_delete(fhandle);
2131 #ifdef __clang_analyzer__
2132 		/*
2133 		 * scan-build fails to comprehend that file_handle_delete() removes the fhandle
2134 		 * from the queue, so it thinks it's remained accessible and throws the "Use of
2135 		 * memory after it is freed" error here.
2136 		 * The loop below "teaches" the scan-build that the freed fhandle is not on the
2137 		 * list anymore and supresses the error in this way.
2138 		 */
2139 		struct spdk_fsdev_file_handle *tmp;
2140 		TAILQ_FOREACH(tmp, &fobject->handles, link) {
2141 			assert(tmp != fhandle);
2142 		}
2143 #endif
2144 	}
2145 
2146 	while (!TAILQ_EMPTY(&fobject->leafs)) {
2147 		struct spdk_fsdev_file_object *leaf_fobject = TAILQ_FIRST(&fobject->leafs);
2148 		fsdev_free_leafs(leaf_fobject);
2149 	}
2150 
2151 	if (fobject->refcount) {
2152 		/* if still referenced - zero refcount */
2153 		int res = file_object_unref(fobject, fobject->refcount);
2154 		assert(res == 0);
2155 		UNUSED(res);
2156 	}
2157 }
2158 
2159 static int
2160 fsdev_aio_destruct(void *ctx)
2161 {
2162 	struct aio_fsdev *vfsdev = ctx;
2163 
2164 	TAILQ_REMOVE(&g_aio_fsdev_head, vfsdev, tailq);
2165 
2166 	fsdev_free_leafs(vfsdev->root);
2167 	vfsdev->root = NULL;
2168 
2169 	pthread_mutex_destroy(&vfsdev->mutex);
2170 
2171 	fsdev_aio_free(vfsdev);
2172 	return 0;
2173 }
2174 
2175 typedef int (*fsdev_op_handler_func)(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io);
2176 
2177 static fsdev_op_handler_func handlers[] = {
2178 	[SPDK_FSDEV_IO_LOOKUP] = lo_lookup,
2179 	[SPDK_FSDEV_IO_FORGET] = lo_forget,
2180 	[SPDK_FSDEV_IO_GETATTR] = lo_getattr,
2181 	[SPDK_FSDEV_IO_SETATTR] = lo_setattr,
2182 	[SPDK_FSDEV_IO_READLINK] = lo_readlink,
2183 	[SPDK_FSDEV_IO_SYMLINK] = lo_symlink,
2184 	[SPDK_FSDEV_IO_MKNOD] = lo_mknod,
2185 	[SPDK_FSDEV_IO_MKDIR] = lo_mkdir,
2186 	[SPDK_FSDEV_IO_UNLINK] = lo_unlink,
2187 	[SPDK_FSDEV_IO_RMDIR] = lo_rmdir,
2188 	[SPDK_FSDEV_IO_RENAME] = lo_rename,
2189 	[SPDK_FSDEV_IO_LINK] = lo_link,
2190 	[SPDK_FSDEV_IO_OPEN] = lo_open,
2191 	[SPDK_FSDEV_IO_READ] = lo_read,
2192 	[SPDK_FSDEV_IO_WRITE] = lo_write,
2193 	[SPDK_FSDEV_IO_STATFS] =  lo_statfs,
2194 	[SPDK_FSDEV_IO_RELEASE] = lo_release,
2195 	[SPDK_FSDEV_IO_FSYNC] = lo_fsync,
2196 	[SPDK_FSDEV_IO_SETXATTR] =  lo_setxattr,
2197 	[SPDK_FSDEV_IO_GETXATTR] =  lo_getxattr,
2198 	[SPDK_FSDEV_IO_LISTXATTR] = lo_listxattr,
2199 	[SPDK_FSDEV_IO_REMOVEXATTR] =  lo_removexattr,
2200 	[SPDK_FSDEV_IO_FLUSH] =  lo_flush,
2201 	[SPDK_FSDEV_IO_OPENDIR] =  lo_opendir,
2202 	[SPDK_FSDEV_IO_READDIR] =  lo_readdir,
2203 	[SPDK_FSDEV_IO_RELEASEDIR] = lo_releasedir,
2204 	[SPDK_FSDEV_IO_FSYNCDIR] = lo_fsyncdir,
2205 	[SPDK_FSDEV_IO_FLOCK] = lo_flock,
2206 	[SPDK_FSDEV_IO_CREATE] = lo_create,
2207 	[SPDK_FSDEV_IO_ABORT] = lo_abort,
2208 	[SPDK_FSDEV_IO_FALLOCATE] = lo_fallocate,
2209 	[SPDK_FSDEV_IO_COPY_FILE_RANGE] = lo_copy_file_range,
2210 };
2211 
2212 static void
2213 fsdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
2214 {
2215 	int status;
2216 	enum spdk_fsdev_io_type type = spdk_fsdev_io_get_type(fsdev_io);
2217 
2218 	assert(type >= 0 && type < __SPDK_FSDEV_IO_LAST);
2219 
2220 	status = handlers[type](ch, fsdev_io);
2221 	if (status != IO_STATUS_ASYNC) {
2222 		spdk_fsdev_io_complete(fsdev_io, status);
2223 	}
2224 }
2225 
2226 static struct spdk_io_channel *
2227 fsdev_aio_get_io_channel(void *ctx)
2228 {
2229 	return spdk_get_io_channel(&g_aio_fsdev_head);
2230 }
2231 
2232 static int
2233 fsdev_aio_negotiate_opts(void *ctx, struct spdk_fsdev_open_opts *opts)
2234 {
2235 	struct aio_fsdev *vfsdev = ctx;
2236 
2237 	assert(opts != 0);
2238 	assert(opts->opts_size != 0);
2239 
2240 	UNUSED(vfsdev);
2241 
2242 	if (opts->opts_size > offsetof(struct spdk_fsdev_open_opts, max_write)) {
2243 		/* Set the value the aio fsdev was created with */
2244 		opts->max_write = vfsdev->fsdev.opts.max_write;
2245 	}
2246 
2247 	if (opts->opts_size > offsetof(struct spdk_fsdev_open_opts, writeback_cache_enabled)) {
2248 		if (vfsdev->fsdev.opts.writeback_cache_enabled) {
2249 			/* The writeback_cache_enabled was enabled upon creation => we follow the opts */
2250 			vfsdev->fsdev.opts.writeback_cache_enabled = opts->writeback_cache_enabled;
2251 		} else {
2252 			/* The writeback_cache_enabled was disabled upon creation => we reflect it in the opts */
2253 			opts->writeback_cache_enabled = false;
2254 		}
2255 	}
2256 
2257 	/* The AIO doesn't apply any additional restrictions, so we just accept the requested opts */
2258 	SPDK_DEBUGLOG(fsdev_aio,
2259 		      "aio filesystem %s: opts updated: max_write=%" PRIu32 ", writeback_cache=%" PRIu8 "\n",
2260 		      vfsdev->fsdev.name, vfsdev->fsdev.opts.max_write, vfsdev->fsdev.opts.writeback_cache_enabled);
2261 
2262 	return 0;
2263 }
2264 
2265 static void
2266 fsdev_aio_write_config_json(struct spdk_fsdev *fsdev, struct spdk_json_write_ctx *w)
2267 {
2268 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev);
2269 
2270 	spdk_json_write_object_begin(w);
2271 	spdk_json_write_named_string(w, "method", "fsdev_aio_create");
2272 	spdk_json_write_named_object_begin(w, "params");
2273 	spdk_json_write_named_string(w, "name", spdk_fsdev_get_name(&vfsdev->fsdev));
2274 	spdk_json_write_named_string(w, "root_path", vfsdev->root_path);
2275 	spdk_json_write_named_bool(w, "enable_xattr", vfsdev->xattr_enabled);
2276 	spdk_json_write_named_bool(w, "enable_writeback_cache",
2277 				   !!vfsdev->fsdev.opts.writeback_cache_enabled);
2278 	spdk_json_write_named_uint32(w, "max_write", vfsdev->fsdev.opts.max_write);
2279 	spdk_json_write_named_bool(w, "skip_rw", vfsdev->skip_rw);
2280 	spdk_json_write_object_end(w); /* params */
2281 	spdk_json_write_object_end(w);
2282 }
2283 
2284 static const struct spdk_fsdev_fn_table aio_fn_table = {
2285 	.destruct		= fsdev_aio_destruct,
2286 	.submit_request		= fsdev_aio_submit_request,
2287 	.get_io_channel		= fsdev_aio_get_io_channel,
2288 	.negotiate_opts		= fsdev_aio_negotiate_opts,
2289 	.write_config_json	= fsdev_aio_write_config_json,
2290 };
2291 
2292 static int
2293 setup_root(struct aio_fsdev *vfsdev)
2294 {
2295 	int fd, res;
2296 	struct stat stat;
2297 
2298 	fd = open(vfsdev->root_path, O_PATH);
2299 	if (fd == -1) {
2300 		res = -errno;
2301 		SPDK_ERRLOG("Cannot open root %s (err=%d)\n", vfsdev->root_path, res);
2302 		return res;
2303 	}
2304 
2305 	res = fstatat(fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
2306 	if (res == -1) {
2307 		res = -errno;
2308 		SPDK_ERRLOG("Cannot get root fstatat of %s (err=%d)\n", vfsdev->root_path, res);
2309 		close(fd);
2310 		return res;
2311 	}
2312 
2313 	vfsdev->root = file_object_create_unsafe(NULL, fd, stat.st_ino, stat.st_dev, stat.st_mode);
2314 	if (!vfsdev->root) {
2315 		SPDK_ERRLOG("Cannot alloc root\n");
2316 		close(fd);
2317 		return -ENOMEM;
2318 	}
2319 
2320 	SPDK_INFOLOG(fsdev_aio, "root (%s) fd=%d\n", vfsdev->root_path, fd);
2321 	return 0;
2322 }
2323 
2324 static int
2325 setup_proc_self_fd(struct aio_fsdev *vfsdev)
2326 {
2327 	vfsdev->proc_self_fd = open("/proc/self/fd", O_PATH);
2328 	if (vfsdev->proc_self_fd == -1) {
2329 		int saverr = -errno;
2330 		SPDK_ERRLOG("Failed to open procfs fd dir with %d\n", saverr);
2331 		return saverr;
2332 	}
2333 
2334 	SPDK_DEBUGLOG(fsdev_aio, "procfs fd dir opened (fd=%d)\n", vfsdev->proc_self_fd);
2335 	return 0;
2336 }
2337 
2338 void
2339 spdk_fsdev_aio_get_default_opts(struct spdk_fsdev_aio_opts *opts)
2340 {
2341 	assert(opts);
2342 
2343 	memset(opts, 0, sizeof(*opts));
2344 
2345 	opts->xattr_enabled = DEFAULT_XATTR_ENABLED;
2346 	opts->writeback_cache_enabled = DEFAULT_WRITEBACK_CACHE;
2347 	opts->max_write = DEFAULT_MAX_WRITE;
2348 	opts->skip_rw = DEFAULT_SKIP_RW;
2349 }
2350 
2351 int
2352 spdk_fsdev_aio_create(struct spdk_fsdev **fsdev, const char *name, const char *root_path,
2353 		      const struct spdk_fsdev_aio_opts *opts)
2354 {
2355 	struct aio_fsdev *vfsdev;
2356 	int rc;
2357 
2358 	vfsdev = calloc(1, sizeof(*vfsdev));
2359 	if (!vfsdev) {
2360 		SPDK_ERRLOG("Could not allocate aio_fsdev\n");
2361 		return -ENOMEM;
2362 	}
2363 
2364 	vfsdev->proc_self_fd = -1;
2365 
2366 	vfsdev->fsdev.name = strdup(name);
2367 	if (!vfsdev->fsdev.name) {
2368 		SPDK_ERRLOG("Could not strdup fsdev name: %s\n", name);
2369 		fsdev_aio_free(vfsdev);
2370 		return -ENOMEM;
2371 	}
2372 
2373 	vfsdev->root_path = strdup(root_path);
2374 	if (!vfsdev->root_path) {
2375 		SPDK_ERRLOG("Could not strdup root path: %s\n", root_path);
2376 		fsdev_aio_free(vfsdev);
2377 		return -ENOMEM;
2378 	}
2379 
2380 	rc = setup_root(vfsdev);
2381 	if (rc) {
2382 		SPDK_ERRLOG("Could not setup root: %s (err=%d)\n", root_path, rc);
2383 		fsdev_aio_free(vfsdev);
2384 		return rc;
2385 	}
2386 
2387 	rc = setup_proc_self_fd(vfsdev);
2388 	if (rc) {
2389 		SPDK_ERRLOG("Could not setup proc_self_fd (err=%d)\n", rc);
2390 		fsdev_aio_free(vfsdev);
2391 		return rc;
2392 	}
2393 
2394 	if (opts->xattr_enabled) {
2395 		SPDK_ERRLOG("Extended attributes can only be enabled in Linux\n");
2396 		fsdev_aio_free(vfsdev);
2397 		return rc;
2398 	}
2399 
2400 	vfsdev->xattr_enabled = opts->xattr_enabled;
2401 	vfsdev->fsdev.ctxt = vfsdev;
2402 	vfsdev->fsdev.fn_table = &aio_fn_table;
2403 	vfsdev->fsdev.module = &aio_fsdev_module;
2404 
2405 	pthread_mutex_init(&vfsdev->mutex, NULL);
2406 
2407 	rc = spdk_fsdev_register(&vfsdev->fsdev);
2408 	if (rc) {
2409 		fsdev_aio_free(vfsdev);
2410 		return rc;
2411 	}
2412 
2413 	vfsdev->fsdev.opts.writeback_cache_enabled = opts->writeback_cache_enabled;
2414 	vfsdev->fsdev.opts.max_write = opts->max_write;
2415 
2416 	vfsdev->skip_rw = opts->skip_rw;
2417 
2418 	*fsdev = &(vfsdev->fsdev);
2419 	TAILQ_INSERT_TAIL(&g_aio_fsdev_head, vfsdev, tailq);
2420 	SPDK_DEBUGLOG(fsdev_aio, "Created aio filesystem %s (xattr_enabled=%" PRIu8 " writeback_cache=%"
2421 		      PRIu8 " max_write=%" PRIu32 " skip_rw=%" PRIu8 ")\n",
2422 		      vfsdev->fsdev.name, vfsdev->xattr_enabled, vfsdev->fsdev.opts.writeback_cache_enabled,
2423 		      vfsdev->fsdev.opts.max_write, vfsdev->skip_rw);
2424 	return rc;
2425 }
2426 void
2427 spdk_fsdev_aio_delete(const char *name,
2428 		      spdk_delete_aio_fsdev_complete cb_fn, void *cb_arg)
2429 {
2430 	int rc;
2431 
2432 	rc = spdk_fsdev_unregister_by_name(name, &aio_fsdev_module, cb_fn, cb_arg);
2433 	if (rc != 0) {
2434 		cb_fn(cb_arg, rc);
2435 	}
2436 
2437 	SPDK_DEBUGLOG(fsdev_aio, "Deleted aio filesystem %s\n", name);
2438 }
2439 
2440 SPDK_LOG_REGISTER_COMPONENT(fsdev_aio)
2441