xref: /spdk/module/fsdev/aio/fsdev_aio.c (revision cfa0a248e28dc42bd51b24c4d4ab64e0b5dd7854)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  */
4 #include "spdk/stdinc.h"
5 #include "spdk/event.h"
6 #include "spdk/log.h"
7 #include "spdk/string.h"
8 #include "spdk/config.h"
9 #include "spdk/util.h"
10 #include "spdk/thread.h"
11 #include "aio_mgr.h"
12 #include "fsdev_aio.h"
13 
/*
 * Sentinel returned by lo_read() after it has submitted the request to the
 * aio manager; the request is then completed from the aio callback through
 * spdk_fsdev_io_complete() rather than through the return value.
 */
#define IO_STATUS_ASYNC INT_MIN

/* Marks a value as intentionally unused by casting it to void. */
#ifndef UNUSED
#define UNUSED(x) (void)(x)
#endif

/* See https://libfuse.github.io/doxygen/structfuse__conn__info.html */
/* NOTE(review): most of the defaults below are consumed outside this chunk. */
#define MAX_BACKGROUND (100)
#define TIME_GRAN (1)
#define MAX_AIOS 256
#define DEFAULT_WRITEBACK_CACHE true
#define DEFAULT_MAX_WRITE 0x00020000
#define DEFAULT_XATTR_ENABLED false
/* Reported as attr->valid_ms by file_object_fill_attr(). */
#define DEFAULT_TIMEOUT_MS 0 /* to prevent the attribute caching */
28 
/*
 * Portable access to the nanosecond part of the struct stat timestamps.
 *
 * ST_xTIM_NSEC(stbuf)          reads the nanoseconds (0 when unsupported).
 * ST_xTIM_NSEC_SET(stbuf, val) stores the nanoseconds (no-op when unsupported).
 *
 * The setters are fully parenthesized so that they expand safely inside a
 * larger expression or an unbraced if/else.
 */
#ifdef SPDK_CONFIG_HAVE_STRUCT_STAT_ST_ATIM
/* Linux */
#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atim.tv_nsec)
#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctim.tv_nsec)
#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtim.tv_nsec)
#define ST_ATIM_NSEC_SET(stbuf, val) ((stbuf)->st_atim.tv_nsec = (val))
#define ST_CTIM_NSEC_SET(stbuf, val) ((stbuf)->st_ctim.tv_nsec = (val))
#define ST_MTIM_NSEC_SET(stbuf, val) ((stbuf)->st_mtim.tv_nsec = (val))
#elif defined(SPDK_CONFIG_HAVE_STRUCT_STAT_ST_ATIMESPEC)
/* FreeBSD */
#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atimespec.tv_nsec)
#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctimespec.tv_nsec)
#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtimespec.tv_nsec)
#define ST_ATIM_NSEC_SET(stbuf, val) ((stbuf)->st_atimespec.tv_nsec = (val))
#define ST_CTIM_NSEC_SET(stbuf, val) ((stbuf)->st_ctimespec.tv_nsec = (val))
#define ST_MTIM_NSEC_SET(stbuf, val) ((stbuf)->st_mtimespec.tv_nsec = (val))
#else
/* No sub-second timestamps available: read as 0, writes are dropped. */
#define ST_ATIM_NSEC(stbuf) 0
#define ST_CTIM_NSEC(stbuf) 0
#define ST_MTIM_NSEC(stbuf) 0
#define ST_ATIM_NSEC_SET(stbuf, val) do { } while (0)
#define ST_CTIM_NSEC_SET(stbuf, val) do { } while (0)
#define ST_MTIM_NSEC_SET(stbuf, val) do { } while (0)
#endif
53 
/*
 * Effective user/group ID pair. lo_change_cred() stores the current pair in
 * one instance and switches to another; lo_restore_cred() switches back.
 */
struct lo_cred {
	uid_t euid;	/* effective user ID */
	gid_t egid;	/* effective group ID */
};
58 
/** Inode number type */
typedef uint64_t spdk_ino_t;

/*
 * Identity of a file object: lo_find_leaf_unsafe() treats two objects as the
 * same file when both the inode number and the device number match.
 */
struct lo_key {
	ino_t ino;	/* st_ino of the file */
	dev_t dev;	/* st_dev of the file */
};
66 
/*
 * An open instance of a file object, created by file_handle_create() and
 * destroyed by file_handle_delete().
 */
struct spdk_fsdev_file_handle {
	/* Open descriptor passed to file_handle_create() */
	int fd;
	/* Directory iteration state; only set up by lo_opendir() */
	struct {
		/* Stream obtained from fdopendir(fd); NULL for non-directory handles */
		DIR *dp;
		/* Entry read by lo_readdir() but not yet consumed, or NULL */
		struct dirent *entry;
		/* Stream offset as tracked by lo_readdir() */
		off_t offset;
	} dir;
	/* File object this handle belongs to (the handle holds one reference) */
	struct spdk_fsdev_file_object *fobject;
	/* Linkage in fobject->handles */
	TAILQ_ENTRY(spdk_fsdev_file_handle) link;
};
77 
/* printf-style format/argument pair used to log a file object's identity. */
#define FOBJECT_FMT "ino=%" PRIu64 " dev=%" PRIu64
#define FOBJECT_ARGS(fo) ((uint64_t)(fo)->key.ino), ((uint64_t)(fo)->key.dev)
/*
 * In-memory representation of a looked-up file. Objects form a tree: every
 * object created with a parent is linked into the parent's `leafs` list and
 * holds one reference on the parent.
 */
struct spdk_fsdev_file_object {
	/* Set from S_ISLNK()/S_ISDIR() of the mode given at creation time */
	uint32_t is_symlink : 1;
	uint32_t is_dir : 1;
	uint32_t reserved : 30;
	/* Descriptor given at creation time; closed when the object is freed */
	int fd;
	/* `fd` formatted as a decimal string; used as a path relative to proc_self_fd */
	char *fd_str;
	/* Inode/device identity used for lookups among the parent's leafs */
	struct lo_key key;
	/* Reference count; the object is freed when it drops to zero */
	uint64_t refcount;
	/* Parent object, or NULL when created without a parent */
	struct spdk_fsdev_file_object *parent_fobject;
	/* Linkage in parent_fobject->leafs */
	TAILQ_ENTRY(spdk_fsdev_file_object) link;
	/* Child objects */
	TAILQ_HEAD(, spdk_fsdev_file_object) leafs;
	/* Open handles created by file_handle_create() */
	TAILQ_HEAD(, spdk_fsdev_file_handle) handles;
	/* Protects refcount and the lists above */
	struct spdk_spinlock lock;
	/* NOTE(review): not referenced in the visible part of the file */
	char name[];
};
95 
/*
 * Per-filesystem state of the AIO fsdev module. The generic spdk_fsdev is
 * embedded so fsdev_to_aio_fsdev() can recover this structure from it.
 */
struct aio_fsdev {
	struct spdk_fsdev fsdev;
	/* NOTE(review): presumably the host directory exported as the root — set outside this chunk */
	char *root_path;
	/*
	 * Directory descriptor against which fobject->fd_str is resolved
	 * (openat/fchmodat/utimensat); presumably /proc/self/fd — TODO confirm.
	 */
	int proc_self_fd;
	/* NOTE(review): not used in the visible part of the file */
	pthread_mutex_t mutex;
	/* Root file object; returned by lo_lookup() when no parent is given */
	struct spdk_fsdev_file_object *root;
	/* Linkage in g_aio_fsdev_head */
	TAILQ_ENTRY(aio_fsdev) tailq;
	bool xattr_enabled;
};
105 
/* Per-request driver context, stored in spdk_fsdev_io::driver_ctx. */
struct aio_fsdev_io {
	/* Value returned by the aio manager on submission; NULL means the request is not tracked */
	struct spdk_aio_mgr_io *aio;
	/* Channel whose ios_in_progress list tracks this request */
	struct aio_io_channel *ch;
	/* Linkage in ch->ios_in_progress */
	TAILQ_ENTRY(aio_fsdev_io) link;
};
111 
/* Per-channel context of the AIO fsdev module. */
struct aio_io_channel {
	/* NOTE(review): registered outside this chunk; presumably polls `mgr` for completions */
	struct spdk_poller *poller;
	/* AIO manager used to submit the asynchronous reads/writes */
	struct spdk_aio_mgr *mgr;
	/* Requests submitted to `mgr` that have not completed yet */
	TAILQ_HEAD(, aio_fsdev_io) ios_in_progress;
};
117 
/* List of aio_fsdev instances, linked through aio_fsdev::tailq. */
static TAILQ_HEAD(, aio_fsdev) g_aio_fsdev_head = TAILQ_HEAD_INITIALIZER(
			g_aio_fsdev_head);
120 
121 static inline struct aio_fsdev *
122 fsdev_to_aio_fsdev(struct spdk_fsdev *fsdev)
123 {
124 	return SPDK_CONTAINEROF(fsdev, struct aio_fsdev, fsdev);
125 }
126 
127 static inline struct spdk_fsdev_io *
128 aio_to_fsdev_io(const struct aio_fsdev_io *aio_io)
129 {
130 	return SPDK_CONTAINEROF(aio_io, struct spdk_fsdev_io, driver_ctx);
131 }
132 
133 static inline struct aio_fsdev_io *
134 fsdev_to_aio_io(const struct spdk_fsdev_io *fsdev_io)
135 {
136 	return (struct aio_fsdev_io *)fsdev_io->driver_ctx;
137 }
138 
/* A file object is considered valid when it is non-NULL; vfsdev is not consulted. */
static inline bool
fsdev_aio_is_valid_fobject(struct aio_fsdev *vfsdev, struct spdk_fsdev_file_object *fobject)
{
	if (fobject == NULL) {
		return false;
	}

	return true;
}
144 
/* A file handle is considered valid when it is non-NULL; vfsdev is not consulted. */
static inline bool
fsdev_aio_is_valid_fhandle(struct aio_fsdev *vfsdev, struct spdk_fsdev_file_handle *fhandle)
{
	if (fhandle == NULL) {
		return false;
	}

	return true;
}
150 
/* Return non-zero when `name` is exactly "." or "..". */
static int
is_dot_or_dotdot(const char *name)
{
	if (name[0] != '.') {
		return 0;
	}

	if (name[1] == '\0') {
		return 1;	/* "." */
	}

	return name[1] == '.' && name[2] == '\0';	/* ".." */
}
157 
/*
 * Return 1 when `path` is a single path component (contains no '/') that is
 * neither "." nor "..", and 0 otherwise.
 */
static int
is_safe_path_component(const char *path)
{
	return strchr(path, '/') == NULL && !is_dot_or_dotdot(path);
}
168 
169 static struct spdk_fsdev_file_object *
170 lo_find_leaf_unsafe(struct spdk_fsdev_file_object *fobject, ino_t ino, dev_t dev)
171 {
172 	struct spdk_fsdev_file_object *leaf_fobject;
173 
174 	TAILQ_FOREACH(leaf_fobject, &fobject->leafs, link) {
175 		if (leaf_fobject->key.ino == ino && leaf_fobject->key.dev == dev) {
176 			return leaf_fobject;
177 		}
178 	}
179 
180 	return NULL;
181 }
182 
183 /* This function returns:
184  * 1 if the refcount is still non zero
185  * a negative  error number if the refcount became zero, the file object was deleted but the defered underlying file deletion failed
186  * 0 if the refcount became zero, the file object was deleted and eithr the underlying file deletion wasn't defered or succeeded
187  */
188 static int
189 file_object_unref(struct spdk_fsdev_file_object *fobject, uint32_t count)
190 {
191 	int res = 0;
192 
193 	spdk_spin_lock(&fobject->lock);
194 	assert(fobject->refcount >= count);
195 	fobject->refcount -= count;
196 	spdk_spin_unlock(&fobject->lock);
197 
198 	if (!fobject->refcount) {
199 		struct spdk_fsdev_file_object *parent_fobject = fobject->parent_fobject;
200 
201 		if (parent_fobject) {
202 			spdk_spin_lock(&parent_fobject->lock);
203 			TAILQ_REMOVE(&parent_fobject->leafs, fobject, link);
204 			spdk_spin_unlock(&parent_fobject->lock);
205 			file_object_unref(parent_fobject, 1); /* unref by the leaf */
206 		}
207 
208 		spdk_spin_destroy(&fobject->lock);
209 		close(fobject->fd);
210 		free(fobject->fd_str);
211 		free(fobject);
212 	}
213 
214 	return res;
215 }
216 
217 static void
218 file_object_ref(struct spdk_fsdev_file_object *fobject)
219 {
220 	spdk_spin_lock(&fobject->lock);
221 	fobject->refcount++;
222 	spdk_spin_unlock(&fobject->lock);
223 }
224 
225 static struct spdk_fsdev_file_object *
226 file_object_create_unsafe(struct spdk_fsdev_file_object *parent_fobject, int fd, ino_t ino,
227 			  dev_t dev, mode_t mode)
228 {
229 	struct spdk_fsdev_file_object *fobject;
230 
231 	fobject = calloc(1, sizeof(*fobject));
232 	if (!fobject) {
233 		SPDK_ERRLOG("Cannot alloc fobject\n");
234 		return NULL;
235 	}
236 
237 	fobject->fd_str = spdk_sprintf_alloc("%d", fd);
238 	if (!fobject->fd_str) {
239 		SPDK_ERRLOG("Cannot alloc fd_str\n");
240 		free(fobject);
241 		return NULL;
242 	}
243 
244 	fobject->fd = fd;
245 	fobject->key.ino = ino;
246 	fobject->key.dev = dev;
247 	fobject->refcount = 1;
248 	fobject->is_symlink = S_ISLNK(mode) ? 1 : 0;
249 	fobject->is_dir = S_ISDIR(mode) ? 1 : 0;
250 
251 	TAILQ_INIT(&fobject->handles);
252 	TAILQ_INIT(&fobject->leafs);
253 	spdk_spin_init(&fobject->lock);
254 
255 	if (parent_fobject) {
256 		fobject->parent_fobject = parent_fobject;
257 		TAILQ_INSERT_TAIL(&parent_fobject->leafs, fobject, link);
258 		parent_fobject->refcount++;
259 	}
260 
261 	return fobject;
262 }
263 
264 static struct spdk_fsdev_file_handle *
265 file_handle_create(struct spdk_fsdev_file_object *fobject, int fd)
266 {
267 	struct spdk_fsdev_file_handle *fhandle;
268 
269 	fhandle = calloc(1, sizeof(*fhandle));
270 	if (!fhandle) {
271 		SPDK_ERRLOG("Cannot alloc fhandle\n");
272 		return NULL;
273 	}
274 
275 	fhandle->fobject = fobject;
276 	fhandle->fd = fd;
277 
278 	spdk_spin_lock(&fobject->lock);
279 	fobject->refcount++;
280 	TAILQ_INSERT_TAIL(&fobject->handles, fhandle, link);
281 	spdk_spin_unlock(&fobject->lock);
282 
283 	return fhandle;
284 }
285 
286 static void
287 file_handle_delete(struct spdk_fsdev_file_handle *fhandle)
288 {
289 	struct spdk_fsdev_file_object *fobject = fhandle->fobject;
290 
291 	spdk_spin_lock(&fobject->lock);
292 	fobject->refcount--;
293 	TAILQ_REMOVE(&fobject->handles, fhandle, link);
294 	spdk_spin_unlock(&fobject->lock);
295 
296 	if (fhandle->dir.dp) {
297 		closedir(fhandle->dir.dp);
298 	}
299 
300 	close(fhandle->fd);
301 	free(fhandle);
302 }
303 
304 static int
305 file_object_fill_attr(struct spdk_fsdev_file_object *fobject, struct spdk_fsdev_file_attr *attr)
306 {
307 	struct stat stbuf;
308 	int res;
309 
310 	res = fstatat(fobject->fd, "", &stbuf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
311 	if (res == -1) {
312 		res = -errno;
313 		SPDK_ERRLOG("fstatat() failed with %d\n", res);
314 		return res;
315 	}
316 
317 	memset(attr, 0, sizeof(*attr));
318 
319 	attr->ino = stbuf.st_ino;
320 	attr->size = stbuf.st_size;
321 	attr->blocks = stbuf.st_blocks;
322 	attr->atime = stbuf.st_atime;
323 	attr->mtime = stbuf.st_mtime;
324 	attr->ctime = stbuf.st_ctime;
325 	attr->atimensec = ST_ATIM_NSEC(&stbuf);
326 	attr->mtimensec = ST_MTIM_NSEC(&stbuf);
327 	attr->ctimensec = ST_CTIM_NSEC(&stbuf);
328 	attr->mode = stbuf.st_mode;
329 	attr->nlink = stbuf.st_nlink;
330 	attr->uid = stbuf.st_uid;
331 	attr->gid = stbuf.st_gid;
332 	attr->rdev = stbuf.st_rdev;
333 	attr->blksize = stbuf.st_blksize;
334 	attr->valid_ms = DEFAULT_TIMEOUT_MS;
335 
336 	return 0;
337 }
338 
339 static int
340 utimensat_empty(struct aio_fsdev *vfsdev, struct spdk_fsdev_file_object *fobject,
341 		const struct timespec *tv)
342 {
343 	int res;
344 
345 	if (fobject->is_symlink) {
346 		res = utimensat(fobject->fd, "", tv, AT_EMPTY_PATH);
347 		if (res == -1 && errno == EINVAL) {
348 			/* Sorry, no race free way to set times on symlink. */
349 			errno = EPERM;
350 		}
351 	} else {
352 		res = utimensat(vfsdev->proc_self_fd, fobject->fd_str, tv, 0);
353 	}
354 
355 	return res;
356 }
357 
358 static int
359 lo_getattr(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
360 {
361 	int res;
362 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
363 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.getattr.fobject;
364 
365 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
366 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
367 		return -EINVAL;
368 	}
369 
370 	res = file_object_fill_attr(fobject, &fsdev_io->u_out.getattr.attr);
371 	if (res) {
372 		SPDK_ERRLOG("Cannot fill attr for " FOBJECT_FMT " (err=%d)\n", FOBJECT_ARGS(fobject), res);
373 		return res;
374 	}
375 
376 	SPDK_DEBUGLOG(fsdev_aio, "GETATTR succeeded for " FOBJECT_FMT "\n", FOBJECT_ARGS(fobject));
377 	return 0;
378 }
379 
380 static int
381 lo_opendir(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
382 {
383 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
384 	int error;
385 	int fd;
386 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.opendir.fobject;
387 	uint32_t flags = fsdev_io->u_in.opendir.flags;
388 	struct spdk_fsdev_file_handle *fhandle = NULL;
389 
390 	UNUSED(flags);
391 
392 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
393 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
394 		return -EINVAL;
395 	}
396 
397 	fd = openat(fobject->fd, ".", O_RDONLY);
398 	if (fd == -1) {
399 		error = -errno;
400 		SPDK_ERRLOG("openat failed for " FOBJECT_FMT " (err=%d)\n", FOBJECT_ARGS(fobject), error);
401 		goto out_err;
402 	}
403 
404 	fhandle = file_handle_create(fobject, fd);
405 	if (fhandle == NULL) {
406 		error = -ENOMEM;
407 		SPDK_ERRLOG("file_handle_create failed for " FOBJECT_FMT " (err=%d)\n", FOBJECT_ARGS(fobject),
408 			    error);
409 		goto out_err;
410 	}
411 
412 	fhandle->dir.dp = fdopendir(fd);
413 	if (fhandle->dir.dp == NULL) {
414 		error = -errno;
415 		SPDK_ERRLOG("fdopendir failed for " FOBJECT_FMT " (err=%d)\n", FOBJECT_ARGS(fobject), error);
416 		goto out_err;
417 	}
418 
419 	fhandle->dir.offset = 0;
420 	fhandle->dir.entry = NULL;
421 
422 	SPDK_DEBUGLOG(fsdev_aio, "OPENDIR succeeded for " FOBJECT_FMT " (fh=%p)\n",
423 		      FOBJECT_ARGS(fobject), fhandle);
424 
425 	fsdev_io->u_out.opendir.fhandle = fhandle;
426 
427 	return 0;
428 
429 out_err:
430 	if (fhandle) {
431 		file_handle_delete(fhandle);
432 	} else if (fd != -1) {
433 		close(fd);
434 	}
435 
436 	return error;
437 }
438 
439 static int
440 lo_releasedir(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
441 {
442 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
443 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.releasedir.fobject;
444 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.releasedir.fhandle;
445 
446 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
447 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
448 		return -EINVAL;
449 	}
450 
451 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
452 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
453 		return -EINVAL;
454 	}
455 
456 	SPDK_DEBUGLOG(fsdev_aio, "RELEASEDIR succeeded for " FOBJECT_FMT " (fh=%p)\n",
457 		      FOBJECT_ARGS(fobject), fhandle);
458 
459 	file_handle_delete(fhandle);
460 
461 	return 0;
462 }
463 
464 static int
465 lo_do_lookup(struct aio_fsdev *vfsdev, struct spdk_fsdev_file_object *parent_fobject,
466 	     const char *name, struct spdk_fsdev_file_object **pfobject,
467 	     struct spdk_fsdev_file_attr *attr)
468 {
469 	int newfd;
470 	int res;
471 	struct stat stat;
472 	struct spdk_fsdev_file_object *fobject;
473 
474 	/* Do not allow escaping root directory */
475 	if (parent_fobject == vfsdev->root && strcmp(name, "..") == 0) {
476 		name = ".";
477 	}
478 
479 	newfd = openat(parent_fobject->fd, name, O_PATH | O_NOFOLLOW);
480 	if (newfd == -1) {
481 		res = -errno;
482 		SPDK_DEBUGLOG(fsdev_aio, "openat( " FOBJECT_FMT " %s) failed with %d\n",
483 			      FOBJECT_ARGS(parent_fobject), name, res);
484 		return res;
485 	}
486 
487 	res = fstatat(newfd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
488 	if (res == -1) {
489 		res = -errno;
490 		SPDK_ERRLOG("fstatat(%s) failed with %d\n", name, res);
491 		close(newfd);
492 		return res;
493 	}
494 
495 	spdk_spin_lock(&parent_fobject->lock);
496 	fobject = lo_find_leaf_unsafe(parent_fobject, stat.st_ino, stat.st_dev);
497 	if (fobject) {
498 		close(newfd);
499 		newfd = -1;
500 		file_object_ref(fobject); /* reference by a lo_do_lookup caller */
501 	} else {
502 		fobject = file_object_create_unsafe(parent_fobject, newfd, stat.st_ino, stat.st_dev, stat.st_mode);
503 	}
504 	spdk_spin_unlock(&parent_fobject->lock);
505 
506 	if (!fobject) {
507 		SPDK_ERRLOG("Cannot create file object\n");
508 		close(newfd);
509 		return -ENOMEM;
510 	}
511 
512 	if (attr) {
513 		res = file_object_fill_attr(fobject, attr);
514 		if (res) {
515 			SPDK_ERRLOG("fill_attr(%s) failed with %d\n", name, res);
516 			file_object_unref(fobject, 1);
517 			if (newfd != -1) {
518 				close(newfd);
519 			}
520 			return res;
521 		}
522 	}
523 
524 	*pfobject = fobject;
525 
526 	SPDK_DEBUGLOG(fsdev_aio, "lookup(%s) in dir " FOBJECT_FMT ": "  FOBJECT_FMT " fd=%d\n",
527 		      name, FOBJECT_ARGS(parent_fobject), FOBJECT_ARGS(fobject), fobject->fd);
528 	return 0;
529 }
530 
531 static int
532 lo_lookup(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
533 {
534 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
535 	int err;
536 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.lookup.parent_fobject;
537 	char *name = fsdev_io->u_in.lookup.name;
538 
539 	if (!parent_fobject) {
540 		err = file_object_fill_attr(vfsdev->root, &fsdev_io->u_out.lookup.attr);
541 		if (err) {
542 			SPDK_DEBUGLOG(fsdev_aio, "file_object_fill_attr(root) failed with err=%d\n", err);
543 			return err;
544 		}
545 
546 		file_object_ref(vfsdev->root);
547 		fsdev_io->u_out.lookup.fobject = vfsdev->root;
548 		return 0;
549 	}
550 
551 	SPDK_DEBUGLOG(fsdev_aio, "  name %s\n", name);
552 
553 	/* Don't use is_safe_path_component(), allow "." and ".." for NFS export
554 	 * support.
555 	 */
556 	if (strchr(name, '/')) {
557 		return -EINVAL;
558 	}
559 
560 	err = lo_do_lookup(vfsdev, parent_fobject, name, &fsdev_io->u_out.lookup.fobject,
561 			   &fsdev_io->u_out.lookup.attr);
562 	if (err) {
563 		SPDK_DEBUGLOG(fsdev_aio, "lo_do_lookup(%s) failed with err=%d\n", name, err);
564 		return err;
565 	}
566 
567 	return 0;
568 }
569 
570 /*
571  * Change to uid/gid of caller so that file is created with ownership of caller.
572  */
573 static int
574 lo_change_cred(const struct lo_cred *new, struct lo_cred *old)
575 {
576 	int res;
577 
578 	old->euid = geteuid();
579 	old->egid = getegid();
580 
581 	res = syscall(SYS_setresgid, -1, new->egid, -1);
582 	if (res == -1) {
583 		return -errno;
584 	}
585 
586 	res = syscall(SYS_setresuid, -1, new->euid, -1);
587 	if (res == -1) {
588 		int errno_save = -errno;
589 
590 		syscall(SYS_setresgid, -1, old->egid, -1);
591 		return errno_save;
592 	}
593 
594 	return 0;
595 }
596 
597 /* Regain Privileges */
598 static void
599 lo_restore_cred(struct lo_cred *old)
600 {
601 	int res;
602 
603 	res = syscall(SYS_setresuid, -1, old->euid, -1);
604 	if (res == -1) {
605 		SPDK_ERRLOG("seteuid(%u)", old->euid);
606 	}
607 
608 	res = syscall(SYS_setresgid, -1, old->egid, -1);
609 	if (res == -1) {
610 		SPDK_ERRLOG("setegid(%u)", old->egid);
611 	}
612 }
613 
614 static int
615 lo_readdir(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
616 {
617 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
618 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.readdir.fobject;
619 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.readdir.fhandle;
620 	uint64_t offset = fsdev_io->u_in.readdir.offset;
621 
622 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
623 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
624 		return -EINVAL;
625 	}
626 
627 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
628 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
629 		return -EINVAL;
630 	}
631 
632 	if (((off_t)offset) != fhandle->dir.offset) {
633 		seekdir(fhandle->dir.dp, offset);
634 		fhandle->dir.entry = NULL;
635 		fhandle->dir.offset = offset;
636 	}
637 
638 	while (1) {
639 		off_t nextoff;
640 		const char *name;
641 		int res;
642 
643 		if (!fhandle->dir.entry) {
644 			errno = 0;
645 			fhandle->dir.entry = readdir(fhandle->dir.dp);
646 			if (!fhandle->dir.entry) {
647 				if (errno) {  /* Error */
648 					res = -errno;
649 					SPDK_ERRLOG("readdir failed with err=%d", res);
650 					return res;
651 				} else {  /* End of stream */
652 					break;
653 				}
654 			}
655 		}
656 
657 		nextoff = fhandle->dir.entry->d_off;
658 		name = fhandle->dir.entry->d_name;
659 
660 		/* Hide root's parent directory */
661 		if (fobject == vfsdev->root && strcmp(name, "..") == 0) {
662 			goto skip_entry;
663 		}
664 
665 		if (is_dot_or_dotdot(name)) {
666 			fsdev_io->u_out.readdir.fobject = NULL;
667 			memset(&fsdev_io->u_out.readdir.attr, 0, sizeof(fsdev_io->u_out.readdir.attr));
668 			fsdev_io->u_out.readdir.attr.ino = fhandle->dir.entry->d_ino;
669 			fsdev_io->u_out.readdir.attr.mode = DT_DIR << 12;
670 			goto skip_lookup;
671 		}
672 
673 		res = lo_do_lookup(vfsdev, fobject, name, &fsdev_io->u_out.readdir.fobject,
674 				   &fsdev_io->u_out.readdir.attr);
675 		if (res) {
676 			SPDK_DEBUGLOG(fsdev_aio, "lo_do_lookup(%s) failed with err=%d\n", name, res);
677 			return res;
678 		}
679 
680 skip_lookup:
681 		fsdev_io->u_out.readdir.name = name;
682 		fsdev_io->u_out.readdir.offset = nextoff;
683 
684 		res = fsdev_io->u_in.readdir.entry_cb_fn(fsdev_io, fsdev_io->internal.cb_arg);
685 		if (res) {
686 			if (fsdev_io->u_out.readdir.fobject) {
687 				file_object_unref(fsdev_io->u_out.readdir.fobject, 1);
688 			}
689 			break;
690 		}
691 
692 skip_entry:
693 		fhandle->dir.entry = NULL;
694 		fhandle->dir.offset = nextoff;
695 	}
696 
697 	SPDK_DEBUGLOG(fsdev_aio, "READDIR succeeded for " FOBJECT_FMT " (fh=%p, offset=%" PRIu64 ")\n",
698 		      FOBJECT_ARGS(fobject), fhandle, offset);
699 	return 0;
700 }
701 
702 static int
703 lo_forget(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
704 {
705 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
706 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.readdir.fobject;
707 	uint64_t nlookup = fsdev_io->u_in.forget.nlookup;
708 
709 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
710 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
711 		return -EINVAL;
712 	}
713 
714 	file_object_unref(fobject, nlookup);
715 
716 	return 0;
717 }
718 
719 static uint32_t
720 update_open_flags(struct aio_fsdev *vfsdev, uint32_t flags)
721 {
722 	/*
723 	 * With writeback cache, kernel may send read requests even
724 	 * when userspace opened write-only
725 	 */
726 	if (vfsdev->fsdev.opts.writeback_cache_enabled && (flags & O_ACCMODE) == O_WRONLY) {
727 		flags &= ~O_ACCMODE;
728 		flags |= O_RDWR;
729 	}
730 
731 	/*
732 	 * With writeback cache, O_APPEND is handled by the kernel.
733 	 * This breaks atomicity (since the file may change in the
734 	 * underlying filesystem, so that the kernel's idea of the
735 	 * end of the file isn't accurate anymore). In this example,
736 	 * we just accept that. A more rigorous filesystem may want
737 	 * to return an error here
738 	 */
739 	if (vfsdev->fsdev.opts.writeback_cache_enabled && (flags & O_APPEND)) {
740 		flags &= ~O_APPEND;
741 	}
742 
743 	/*
744 	 * O_DIRECT in guest should not necessarily mean bypassing page
745 	 * cache on host as well. If somebody needs that behavior, it
746 	 * probably should be a configuration knob in daemon.
747 	 */
748 	flags &= ~O_DIRECT;
749 
750 	return flags;
751 }
752 
753 static int
754 lo_open(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
755 {
756 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
757 	int fd, saverr;
758 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.open.fobject;
759 	uint32_t flags = fsdev_io->u_in.open.flags;
760 	struct spdk_fsdev_file_handle *fhandle;
761 
762 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
763 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
764 		return -EINVAL;
765 	}
766 
767 	flags = update_open_flags(vfsdev, flags);
768 
769 	fd = openat(vfsdev->proc_self_fd, fobject->fd_str, flags & ~O_NOFOLLOW);
770 	if (fd == -1) {
771 		saverr = -errno;
772 		SPDK_ERRLOG("openat(%d, %s, 0x%08" PRIx32 ") failed with err=%d\n",
773 			    vfsdev->proc_self_fd, fobject->fd_str, flags, saverr);
774 		return saverr;
775 	}
776 
777 	fhandle = file_handle_create(fobject, fd);
778 	if (!fhandle) {
779 		SPDK_ERRLOG("cannot create a file handle (fd=%d)\n", fd);
780 		close(fd);
781 		return -ENOMEM;
782 	}
783 
784 	fsdev_io->u_out.open.fhandle = fhandle;
785 
786 	SPDK_DEBUGLOG(fsdev_aio, "OPEN succeeded for " FOBJECT_FMT " (fh=%p, fd=%d)\n",
787 		      FOBJECT_ARGS(fobject), fhandle, fd);
788 
789 	return 0;
790 }
791 
792 static int
793 lo_flush(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
794 {
795 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
796 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.flush.fobject;
797 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.flush.fhandle;
798 	int res, saverr;
799 
800 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
801 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
802 		return -EINVAL;
803 	}
804 
805 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
806 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
807 		return -EINVAL;
808 	}
809 
810 	res = close(dup(fhandle->fd));
811 	if (res) {
812 		saverr = -errno;
813 		SPDK_ERRLOG("close(dup(%d)) failed for " FOBJECT_FMT " (fh=%p, err=%d)\n",
814 			    fhandle->fd, FOBJECT_ARGS(fobject), fhandle, saverr);
815 		return saverr;
816 	}
817 
818 	SPDK_DEBUGLOG(fsdev_aio, "FLUSH succeeded for " FOBJECT_FMT " (fh=%p)\n", FOBJECT_ARGS(fobject),
819 		      fhandle);
820 
821 	return 0;
822 }
823 
824 static int
825 lo_setattr(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
826 {
827 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
828 	int saverr;
829 	int res;
830 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.setattr.fobject;
831 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.setattr.fhandle;
832 	uint32_t to_set = fsdev_io->u_in.setattr.to_set;
833 	struct spdk_fsdev_file_attr *attr = &fsdev_io->u_in.setattr.attr;
834 
835 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
836 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
837 		return -EINVAL;
838 	}
839 
840 	if (to_set & FSDEV_SET_ATTR_MODE) {
841 		if (fhandle) {
842 			res = fchmod(fhandle->fd, attr->mode);
843 		} else {
844 			res = fchmodat(vfsdev->proc_self_fd, fobject->fd_str, attr->mode, 0);
845 		}
846 		if (res == -1) {
847 			saverr = -errno;
848 			SPDK_ERRLOG("fchmod failed for " FOBJECT_FMT "\n", FOBJECT_ARGS(fobject));
849 			return saverr;
850 		}
851 	}
852 
853 	if (to_set & (FSDEV_SET_ATTR_UID | FSDEV_SET_ATTR_GID)) {
854 		uid_t uid = (to_set & FSDEV_SET_ATTR_UID) ? attr->uid : (uid_t) -1;
855 		gid_t gid = (to_set & FSDEV_SET_ATTR_GID) ? attr->gid : (gid_t) -1;
856 
857 		res = fchownat(fobject->fd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
858 		if (res == -1) {
859 			saverr = -errno;
860 			SPDK_ERRLOG("fchownat failed for " FOBJECT_FMT "\n", FOBJECT_ARGS(fobject));
861 			return saverr;
862 		}
863 	}
864 
865 	if (to_set & FSDEV_SET_ATTR_SIZE) {
866 		int truncfd;
867 
868 		if (fhandle) {
869 			truncfd = fhandle->fd;
870 		} else {
871 			truncfd = openat(vfsdev->proc_self_fd, fobject->fd_str, O_RDWR);
872 			if (truncfd < 0) {
873 				saverr = -errno;
874 				SPDK_ERRLOG("openat failed for " FOBJECT_FMT "\n", FOBJECT_ARGS(fobject));
875 				return saverr;
876 			}
877 		}
878 
879 		res = ftruncate(truncfd, attr->size);
880 		if (!fhandle) {
881 			saverr = -errno;
882 			close(truncfd);
883 			errno = saverr;
884 		}
885 		if (res == -1) {
886 			saverr = -errno;
887 			SPDK_ERRLOG("ftruncate failed for " FOBJECT_FMT " (size=%" PRIu64 ")\n", FOBJECT_ARGS(fobject),
888 				    attr->size);
889 			return saverr;
890 		}
891 	}
892 
893 	if (to_set & (FSDEV_SET_ATTR_ATIME | FSDEV_SET_ATTR_MTIME)) {
894 		struct timespec tv[2];
895 
896 		tv[0].tv_sec = 0;
897 		tv[1].tv_sec = 0;
898 		tv[0].tv_nsec = UTIME_OMIT;
899 		tv[1].tv_nsec = UTIME_OMIT;
900 
901 		if (to_set & FSDEV_SET_ATTR_ATIME_NOW) {
902 			tv[0].tv_nsec = UTIME_NOW;
903 		} else if (to_set & FSDEV_SET_ATTR_ATIME) {
904 			tv[0].tv_sec = attr->atime;
905 			tv[0].tv_nsec = attr->atimensec;
906 		}
907 
908 		if (to_set & FSDEV_SET_ATTR_MTIME_NOW) {
909 			tv[1].tv_nsec = UTIME_NOW;
910 		} else if (to_set & FSDEV_SET_ATTR_MTIME) {
911 			tv[1].tv_sec = attr->mtime;
912 			tv[1].tv_nsec = attr->mtimensec;
913 		}
914 
915 		if (fhandle) {
916 			res = futimens(fhandle->fd, tv);
917 		} else {
918 			res = utimensat_empty(vfsdev, fobject, tv);
919 		}
920 		if (res == -1) {
921 			saverr = -errno;
922 			SPDK_ERRLOG("futimens/utimensat_empty failed for " FOBJECT_FMT "\n",
923 				    FOBJECT_ARGS(fobject));
924 			return saverr;
925 		}
926 	}
927 
928 	res = file_object_fill_attr(fobject, &fsdev_io->u_out.setattr.attr);
929 	if (res) {
930 		SPDK_ERRLOG("file_object_fill_attr failed for " FOBJECT_FMT "\n",
931 			    FOBJECT_ARGS(fobject));
932 		return res;
933 	}
934 
935 	SPDK_DEBUGLOG(fsdev_aio, "SETATTR succeeded for " FOBJECT_FMT "\n",
936 		      FOBJECT_ARGS(fobject));
937 
938 	return 0;
939 }
940 
941 static int
942 lo_create(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
943 {
944 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
945 	int fd;
946 	int err;
947 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.create.parent_fobject;
948 	const char *name = fsdev_io->u_in.create.name;
949 	uint32_t mode = fsdev_io->u_in.create.mode;
950 	uint32_t flags = fsdev_io->u_in.create.flags;
951 	uint32_t umask = fsdev_io->u_in.create.umask;
952 	struct lo_cred old_cred, new_cred = {
953 		.euid = fsdev_io->u_in.create.euid,
954 		.egid = fsdev_io->u_in.create.egid,
955 	};
956 	struct spdk_fsdev_file_object *fobject;
957 	struct spdk_fsdev_file_handle *fhandle;
958 	struct spdk_fsdev_file_attr *attr = &fsdev_io->u_out.create.attr;
959 
960 	if (!fsdev_aio_is_valid_fobject(vfsdev, parent_fobject)) {
961 		SPDK_ERRLOG("Invalid parent_fobject: %p\n", parent_fobject);
962 		return -EINVAL;
963 	}
964 
965 	UNUSED(umask);
966 
967 	if (!is_safe_path_component(name)) {
968 		SPDK_ERRLOG("CREATE: %s not a safe component\n", name);
969 		return -EINVAL;
970 	}
971 
972 	err = lo_change_cred(&new_cred, &old_cred);
973 	if (err) {
974 		SPDK_ERRLOG("CREATE: cannot change credentials\n");
975 		return err;
976 	}
977 
978 	flags = update_open_flags(vfsdev, flags);
979 
980 	fd = openat(parent_fobject->fd, name, (flags | O_CREAT) & ~O_NOFOLLOW, mode);
981 	err = fd == -1 ? -errno : 0;
982 	lo_restore_cred(&old_cred);
983 
984 	if (err) {
985 		SPDK_ERRLOG("CREATE: openat failed with %d\n", err);
986 		return err;
987 	}
988 
989 	err = lo_do_lookup(vfsdev, parent_fobject, name, &fobject, attr);
990 	if (err) {
991 		SPDK_ERRLOG("CREATE: lookup failed with %d\n", err);
992 		return err;
993 	}
994 
995 	fhandle = file_handle_create(fobject, fd);
996 	if (!fhandle) {
997 		SPDK_ERRLOG("cannot create a file handle (fd=%d)\n", fd);
998 		close(fd);
999 		file_object_unref(fobject, 1);
1000 		return -ENOMEM;
1001 	}
1002 
1003 	SPDK_DEBUGLOG(fsdev_aio, "CREATE: succeeded (name=%s " FOBJECT_FMT " fh=%p)\n",
1004 		      name, FOBJECT_ARGS(fobject), fhandle);
1005 
1006 	fsdev_io->u_out.create.fobject = fobject;
1007 	fsdev_io->u_out.create.fhandle = fhandle;
1008 
1009 	return 0;
1010 }
1011 
1012 static int
1013 lo_release(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1014 {
1015 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1016 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.release.fobject;
1017 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.release.fhandle;
1018 
1019 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1020 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1021 		return -EINVAL;
1022 	}
1023 
1024 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
1025 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
1026 		return -EINVAL;
1027 	}
1028 
1029 	SPDK_DEBUGLOG(fsdev_aio, "RELEASE succeeded for " FOBJECT_FMT " fh=%p)\n",
1030 		      FOBJECT_ARGS(fobject), fhandle);
1031 
1032 	file_handle_delete(fhandle);
1033 
1034 	return 0;
1035 }
1036 
1037 static void
1038 lo_read_cb(void *ctx, uint32_t data_size, int error)
1039 {
1040 	struct spdk_fsdev_io *fsdev_io = ctx;
1041 	struct aio_fsdev_io *vfsdev_io = fsdev_to_aio_io(fsdev_io);
1042 
1043 	if (vfsdev_io->aio) {
1044 		TAILQ_REMOVE(&vfsdev_io->ch->ios_in_progress, vfsdev_io, link);
1045 	}
1046 
1047 	fsdev_io->u_out.read.data_size = data_size;
1048 
1049 	spdk_fsdev_io_complete(fsdev_io, error);
1050 }
1051 
1052 static int
1053 lo_read(struct spdk_io_channel *_ch, struct spdk_fsdev_io *fsdev_io)
1054 {
1055 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1056 	struct aio_io_channel *ch = spdk_io_channel_get_ctx(_ch);
1057 	struct aio_fsdev_io *vfsdev_io = fsdev_to_aio_io(fsdev_io);
1058 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.read.fobject;
1059 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.read.fhandle;
1060 	size_t size = fsdev_io->u_in.read.size;
1061 	uint64_t offs = fsdev_io->u_in.read.offs;
1062 	uint32_t flags = fsdev_io->u_in.read.flags;
1063 	struct iovec *outvec = fsdev_io->u_in.read.iov;
1064 	uint32_t outcnt = fsdev_io->u_in.read.iovcnt;
1065 
1066 	/* we don't suport the memory domains at the moment */
1067 	assert(!fsdev_io->u_in.read.opts || !fsdev_io->u_in.read.opts->memory_domain);
1068 
1069 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1070 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1071 		return -EINVAL;
1072 	}
1073 
1074 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
1075 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
1076 		return -EINVAL;
1077 	}
1078 
1079 	UNUSED(flags);
1080 
1081 	if (!outcnt || !outvec) {
1082 		SPDK_ERRLOG("bad outvec: iov=%p outcnt=%" PRIu32 "\n", outvec, outcnt);
1083 		return -EINVAL;
1084 	}
1085 
1086 	vfsdev_io->aio = spdk_aio_mgr_read(ch->mgr, lo_read_cb, fsdev_io, fhandle->fd, offs, size, outvec,
1087 					   outcnt);
1088 	if (vfsdev_io->aio) {
1089 		vfsdev_io->ch = ch;
1090 		TAILQ_INSERT_TAIL(&ch->ios_in_progress, vfsdev_io, link);
1091 	}
1092 
1093 	return IO_STATUS_ASYNC;
1094 }
1095 
1096 static void
1097 lo_write_cb(void *ctx, uint32_t data_size, int error)
1098 {
1099 	struct spdk_fsdev_io *fsdev_io = ctx;
1100 	struct aio_fsdev_io *vfsdev_io = fsdev_to_aio_io(fsdev_io);
1101 
1102 	if (vfsdev_io->aio) {
1103 		TAILQ_REMOVE(&vfsdev_io->ch->ios_in_progress, vfsdev_io, link);
1104 	}
1105 
1106 	fsdev_io->u_out.write.data_size = data_size;
1107 
1108 	spdk_fsdev_io_complete(fsdev_io, error);
1109 }
1110 
1111 static int
1112 lo_write(struct spdk_io_channel *_ch, struct spdk_fsdev_io *fsdev_io)
1113 {
1114 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1115 	struct aio_io_channel *ch = spdk_io_channel_get_ctx(_ch);
1116 	struct aio_fsdev_io *vfsdev_io = fsdev_to_aio_io(fsdev_io);
1117 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.write.fobject;
1118 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.write.fhandle;
1119 	size_t size = fsdev_io->u_in.write.size;
1120 	uint64_t offs = fsdev_io->u_in.write.offs;
1121 	uint32_t flags = fsdev_io->u_in.write.flags;
1122 	const struct iovec *invec = fsdev_io->u_in.write.iov;
1123 	uint32_t incnt =  fsdev_io->u_in.write.iovcnt;
1124 
1125 	/* we don't suport the memory domains at the moment */
1126 	assert(!fsdev_io->u_in.write.opts || !fsdev_io->u_in.write.opts->memory_domain);
1127 
1128 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1129 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1130 		return -EINVAL;
1131 	}
1132 
1133 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
1134 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
1135 		return -EINVAL;
1136 	}
1137 
1138 	UNUSED(flags);
1139 
1140 	if (!incnt || !invec) { /* there should be at least one iovec with data */
1141 		SPDK_ERRLOG("bad invec: iov=%p cnt=%" PRIu32 "\n", invec, incnt);
1142 		return -EINVAL;
1143 	}
1144 
1145 	vfsdev_io->aio = spdk_aio_mgr_write(ch->mgr, lo_write_cb, fsdev_io,
1146 					    fhandle->fd, offs, size, invec, incnt);
1147 	if (vfsdev_io->aio) {
1148 		vfsdev_io->ch = ch;
1149 		TAILQ_INSERT_TAIL(&ch->ios_in_progress, vfsdev_io, link);
1150 	}
1151 
1152 	return IO_STATUS_ASYNC;
1153 }
1154 
1155 static int
1156 lo_readlink(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1157 {
1158 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1159 	int res;
1160 	char *buf;
1161 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.readlink.fobject;
1162 
1163 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1164 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1165 		return -EINVAL;
1166 	}
1167 
1168 	buf = malloc(PATH_MAX + 1);
1169 	if (!buf) {
1170 		SPDK_ERRLOG("malloc(%zu) failed\n", (size_t)(PATH_MAX + 1));
1171 		return -ENOMEM;
1172 	}
1173 
1174 	res = readlinkat(fobject->fd, "", buf, PATH_MAX + 1);
1175 	if (res == -1) {
1176 		int saverr = -errno;
1177 		SPDK_ERRLOG("readlinkat failed for " FOBJECT_FMT " with %d\n",
1178 			    FOBJECT_ARGS(fobject), saverr);
1179 		free(buf);
1180 		return saverr;
1181 	}
1182 
1183 	if (((uint32_t)res) == PATH_MAX + 1) {
1184 		SPDK_ERRLOG("buffer is too short\n");
1185 		free(buf);
1186 		return -ENAMETOOLONG;
1187 	}
1188 
1189 	buf[res] = 0;
1190 	fsdev_io->u_out.readlink.linkname = buf;
1191 
1192 	return 0;
1193 }
1194 
1195 static int
1196 lo_statfs(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1197 {
1198 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1199 	int res;
1200 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.statfs.fobject;
1201 	struct statvfs stbuf;
1202 
1203 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1204 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1205 		return -EINVAL;
1206 	}
1207 
1208 	res = fstatvfs(fobject->fd, &stbuf);
1209 	if (res == -1) {
1210 		int saverr = -errno;
1211 		SPDK_ERRLOG("fstatvfs failed with %d\n", saverr);
1212 		return saverr;
1213 	}
1214 
1215 	fsdev_io->u_out.statfs.statfs.blocks = stbuf.f_blocks;
1216 	fsdev_io->u_out.statfs.statfs.bfree = stbuf.f_bfree;
1217 	fsdev_io->u_out.statfs.statfs.bavail = stbuf.f_bavail;
1218 	fsdev_io->u_out.statfs.statfs.files = stbuf.f_files;
1219 	fsdev_io->u_out.statfs.statfs.ffree = stbuf.f_ffree;
1220 	fsdev_io->u_out.statfs.statfs.bsize = stbuf.f_bsize;
1221 	fsdev_io->u_out.statfs.statfs.namelen = stbuf.f_namemax;
1222 	fsdev_io->u_out.statfs.statfs.frsize = stbuf.f_frsize;
1223 
1224 	return 0;
1225 }
1226 
1227 static int
1228 lo_mknod_symlink(struct spdk_fsdev_io *fsdev_io, struct spdk_fsdev_file_object *parent_fobject,
1229 		 const char *name, mode_t mode, dev_t rdev, const char *link, uid_t euid, gid_t egid,
1230 		 struct spdk_fsdev_file_object **pfobject, struct spdk_fsdev_file_attr *attr)
1231 {
1232 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1233 	int res;
1234 	int saverr;
1235 	struct lo_cred old_cred, new_cred = {
1236 		.euid = euid,
1237 		.egid = egid,
1238 	};
1239 
1240 	if (!fsdev_aio_is_valid_fobject(vfsdev, parent_fobject)) {
1241 		SPDK_ERRLOG("Invalid parent_fobject: %p\n", parent_fobject);
1242 		return -EINVAL;
1243 	}
1244 
1245 	if (!is_safe_path_component(name)) {
1246 		SPDK_ERRLOG("%s isn'h safe\n", name);
1247 		return -EINVAL;
1248 	}
1249 
1250 	res = lo_change_cred(&new_cred, &old_cred);
1251 	if (res) {
1252 		SPDK_ERRLOG("cannot change cred (err=%d)\n", res);
1253 		return res;
1254 	}
1255 
1256 	if (S_ISDIR(mode)) {
1257 		res = mkdirat(parent_fobject->fd, name, mode);
1258 	} else if (S_ISLNK(mode)) {
1259 		if (link) {
1260 			res = symlinkat(link, parent_fobject->fd, name);
1261 		} else {
1262 			SPDK_ERRLOG("NULL link pointer\n");
1263 			errno = EINVAL;
1264 		}
1265 	} else {
1266 		res = mknodat(parent_fobject->fd, name, mode, rdev);
1267 	}
1268 	saverr = -errno;
1269 
1270 	lo_restore_cred(&old_cred);
1271 
1272 	if (res == -1) {
1273 		SPDK_ERRLOG("cannot mkdirat/symlinkat/mknodat (err=%d)\n", saverr);
1274 		return saverr;
1275 	}
1276 
1277 	res = lo_do_lookup(vfsdev, parent_fobject, name, pfobject, attr);
1278 	if (res) {
1279 		SPDK_ERRLOG("lookup failed (err=%d)\n", res);
1280 		return res;
1281 	}
1282 
1283 	SPDK_DEBUGLOG(fsdev_aio, "lo_mknod_symlink(" FOBJECT_FMT "/%s -> " FOBJECT_FMT "\n",
1284 		      FOBJECT_ARGS(parent_fobject), name, FOBJECT_ARGS(*pfobject));
1285 
1286 	return 0;
1287 }
1288 
1289 static int
1290 lo_mknod(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1291 {
1292 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.mknod.parent_fobject;
1293 	char *name = fsdev_io->u_in.mknod.name;
1294 	mode_t mode = fsdev_io->u_in.mknod.mode;
1295 	dev_t rdev = fsdev_io->u_in.mknod.rdev;
1296 	uid_t euid = fsdev_io->u_in.mknod.euid;
1297 	gid_t egid = fsdev_io->u_in.mknod.egid;
1298 
1299 	return lo_mknod_symlink(fsdev_io, parent_fobject, name, mode, rdev, NULL, euid, egid,
1300 				&fsdev_io->u_out.mknod.fobject, &fsdev_io->u_out.mknod.attr);
1301 }
1302 
1303 static int
1304 lo_mkdir(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1305 {
1306 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.mkdir.parent_fobject;
1307 	char *name = fsdev_io->u_in.mkdir.name;
1308 	mode_t mode = fsdev_io->u_in.mkdir.mode;
1309 	uid_t euid = fsdev_io->u_in.mkdir.euid;
1310 	gid_t egid = fsdev_io->u_in.mkdir.egid;
1311 
1312 	return lo_mknod_symlink(fsdev_io, parent_fobject, name, S_IFDIR | mode, 0, NULL, euid, egid,
1313 				&fsdev_io->u_out.mkdir.fobject, &fsdev_io->u_out.mkdir.attr);
1314 }
1315 
1316 static int
1317 lo_symlink(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1318 {
1319 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.symlink.parent_fobject;
1320 	char *target = fsdev_io->u_in.symlink.target;
1321 	char *linkpath = fsdev_io->u_in.symlink.linkpath;
1322 	uid_t euid = fsdev_io->u_in.symlink.euid;
1323 	gid_t egid = fsdev_io->u_in.symlink.egid;
1324 
1325 	return lo_mknod_symlink(fsdev_io, parent_fobject, target, S_IFLNK, 0, linkpath, euid, egid,
1326 				&fsdev_io->u_out.symlink.fobject, &fsdev_io->u_out.symlink.attr);
1327 }
1328 
1329 static int
1330 lo_do_unlink(struct aio_fsdev *vfsdev, struct spdk_fsdev_file_object *parent_fobject,
1331 	     const char *name, bool is_dir)
1332 {
1333 	/* fobject must be initialized to avoid a scan-build false positive */
1334 	struct spdk_fsdev_file_object *fobject = NULL;
1335 	int res;
1336 
1337 	if (!fsdev_aio_is_valid_fobject(vfsdev, parent_fobject)) {
1338 		SPDK_ERRLOG("Invalid parent_fobject: %p\n", parent_fobject);
1339 		return -EINVAL;
1340 	}
1341 
1342 	if (!is_safe_path_component(name)) {
1343 		SPDK_ERRLOG("%s isn't safe\n", name);
1344 		return -EINVAL;
1345 	}
1346 
1347 	res = lo_do_lookup(vfsdev, parent_fobject, name, &fobject, NULL);
1348 	if (res) {
1349 		SPDK_ERRLOG("can't find '%s' under " FOBJECT_FMT "\n", name, FOBJECT_ARGS(parent_fobject));
1350 		return -EIO;
1351 	}
1352 
1353 	res = unlinkat(parent_fobject->fd, name, is_dir ? AT_REMOVEDIR : 0);
1354 	if (res) {
1355 		res = -errno;
1356 		SPDK_WARNLOG("unlinkat(" FOBJECT_FMT " %s) failed (err=%d)\n",
1357 			     FOBJECT_ARGS(parent_fobject), name, res);
1358 	}
1359 
1360 	file_object_unref(fobject, 1);
1361 	return res;
1362 }
1363 
1364 static int
1365 lo_unlink(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1366 {
1367 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1368 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.unlink.parent_fobject;
1369 	char *name = fsdev_io->u_in.unlink.name;
1370 
1371 	return lo_do_unlink(vfsdev, parent_fobject, name, false);
1372 }
1373 
1374 static int
1375 lo_rmdir(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1376 {
1377 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1378 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.rmdir.parent_fobject;
1379 	char *name = fsdev_io->u_in.rmdir.name;
1380 
1381 	return lo_do_unlink(vfsdev, parent_fobject, name, true);
1382 }
1383 
1384 static int
1385 lo_rename(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1386 {
1387 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1388 	int res, saverr;
1389 	/* old_fobject must be initialized to avoid a scan-build false positive */
1390 	struct spdk_fsdev_file_object *old_fobject = NULL;
1391 	struct spdk_fsdev_file_object *parent_fobject = fsdev_io->u_in.rename.parent_fobject;
1392 	char *name = fsdev_io->u_in.rename.name;
1393 	struct spdk_fsdev_file_object *new_parent_fobject = fsdev_io->u_in.rename.new_parent_fobject;
1394 	char *new_name = fsdev_io->u_in.rename.new_name;
1395 	uint32_t flags = fsdev_io->u_in.rename.flags;
1396 
1397 	if (!fsdev_aio_is_valid_fobject(vfsdev, parent_fobject)) {
1398 		SPDK_ERRLOG("Invalid parent_fobject: %p\n", parent_fobject);
1399 		return -EINVAL;
1400 	}
1401 
1402 	if (!fsdev_aio_is_valid_fobject(vfsdev, new_parent_fobject)) {
1403 		SPDK_ERRLOG("Invalid new_parent_fobject: %p\n", new_parent_fobject);
1404 		return -EINVAL;
1405 	}
1406 
1407 	if (!is_safe_path_component(name)) {
1408 		SPDK_ERRLOG("name '%s' isn't safe\n", name);
1409 		return -EINVAL;
1410 	}
1411 
1412 	if (!is_safe_path_component(new_name)) {
1413 		SPDK_ERRLOG("newname '%s' isn't safe\n", new_name);
1414 		return -EINVAL;
1415 	}
1416 
1417 	res = lo_do_lookup(vfsdev, parent_fobject, name, &old_fobject, NULL);
1418 	if (res) {
1419 		SPDK_ERRLOG("can't find '%s' under " FOBJECT_FMT "\n", name, FOBJECT_ARGS(parent_fobject));
1420 		return -EIO;
1421 	}
1422 
1423 	saverr = 0;
1424 	if (flags) {
1425 #ifndef SYS_renameat2
1426 		SPDK_ERRLOG("flags are not supported\n");
1427 		return -ENOTSUP;
1428 #else
1429 		res = syscall(SYS_renameat2, parent_fobject->fd, name, new_parent_fobject->fd,
1430 			      new_name, flags);
1431 		if (res == -1 && errno == ENOSYS) {
1432 			SPDK_ERRLOG("SYS_renameat2 returned ENOSYS\n");
1433 			saverr = -EINVAL;
1434 		} else if (res == -1) {
1435 			saverr = -errno;
1436 			SPDK_ERRLOG("SYS_renameat2 failed (err=%d))\n", saverr);
1437 		}
1438 #endif
1439 	} else {
1440 		res = renameat(parent_fobject->fd, name, new_parent_fobject->fd, new_name);
1441 		if (res == -1) {
1442 			saverr = -errno;
1443 			SPDK_ERRLOG("renameat failed (err=%d)\n", saverr);
1444 		}
1445 	}
1446 
1447 	file_object_unref(old_fobject, 1);
1448 
1449 	return saverr;
1450 }
1451 
1452 static int
1453 linkat_empty_nofollow(struct aio_fsdev *vfsdev, struct spdk_fsdev_file_object *fobject, int dfd,
1454 		      const char *name)
1455 {
1456 	int res;
1457 
1458 	if (fobject->is_symlink) {
1459 		res = linkat(fobject->fd, "", dfd, name, AT_EMPTY_PATH);
1460 		if (res == -1 && (errno == ENOENT || errno == EINVAL)) {
1461 			/* Sorry, no race free way to hard-link a symlink. */
1462 			errno = EPERM;
1463 		}
1464 	} else {
1465 		res = linkat(vfsdev->proc_self_fd, fobject->fd_str, dfd, name, AT_SYMLINK_FOLLOW);
1466 	}
1467 
1468 	return res;
1469 }
1470 
1471 static int
1472 lo_link(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1473 {
1474 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1475 	int res;
1476 	int saverr;
1477 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.link.fobject;
1478 	struct spdk_fsdev_file_object *new_parent_fobject = fsdev_io->u_in.link.new_parent_fobject;
1479 	char *name = fsdev_io->u_in.link.name;
1480 
1481 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1482 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1483 		return -EINVAL;
1484 	}
1485 
1486 	if (!is_safe_path_component(name)) {
1487 		SPDK_ERRLOG("%s is not a safe component\n", name);
1488 		return -EINVAL;
1489 	}
1490 
1491 	res = linkat_empty_nofollow(vfsdev, fobject, new_parent_fobject->fd, name);
1492 	if (res == -1) {
1493 		saverr = -errno;
1494 		SPDK_ERRLOG("linkat_empty_nofollow failed " FOBJECT_FMT " -> " FOBJECT_FMT " name=%s (err=%d)\n",
1495 			    FOBJECT_ARGS(fobject), FOBJECT_ARGS(new_parent_fobject), name, saverr);
1496 		return saverr;
1497 	}
1498 
1499 	res = lo_do_lookup(vfsdev, new_parent_fobject, name, &fsdev_io->u_out.link.fobject,
1500 			   &fsdev_io->u_out.link.attr);
1501 	if (res) {
1502 		SPDK_ERRLOG("lookup failed (err=%d)\n", res);
1503 		return res;
1504 	}
1505 
1506 	SPDK_DEBUGLOG(fsdev_aio, "LINK succeeded for " FOBJECT_FMT " -> " FOBJECT_FMT " name=%s\n",
1507 		      FOBJECT_ARGS(fobject), FOBJECT_ARGS(fsdev_io->u_out.link.fobject), name);
1508 
1509 	return 0;
1510 }
1511 
1512 static int
1513 lo_fsync(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1514 {
1515 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1516 	int res, saverr, fd;
1517 	char *buf;
1518 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.fsync.fobject;
1519 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.fsync.fhandle;
1520 	bool datasync = fsdev_io->u_in.fsync.datasync;
1521 
1522 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1523 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1524 		return -EINVAL;
1525 	}
1526 
1527 	if (!fhandle) {
1528 		res = asprintf(&buf, "%i", fobject->fd);
1529 		if (res == -1) {
1530 			saverr = -errno;
1531 			SPDK_ERRLOG("asprintf failed (errno=%d)\n", saverr);
1532 			return saverr;
1533 		}
1534 
1535 		fd = openat(vfsdev->proc_self_fd, buf, O_RDWR);
1536 		saverr = -errno;
1537 		free(buf);
1538 		if (fd == -1) {
1539 			SPDK_ERRLOG("openat failed (errno=%d)\n", saverr);
1540 			return saverr;
1541 		}
1542 	} else {
1543 		fd = fhandle->fd;
1544 	}
1545 
1546 	if (datasync) {
1547 		res = fdatasync(fd);
1548 	} else {
1549 		res = fsync(fd);
1550 	}
1551 
1552 	saverr = -errno;
1553 	if (!fhandle) {
1554 		close(fd);
1555 	}
1556 
1557 	if (res == -1) {
1558 		SPDK_ERRLOG("fdatasync/fsync failed for " FOBJECT_FMT " fh=%p (err=%d)\n",
1559 			    FOBJECT_ARGS(fobject), fhandle, saverr);
1560 		return saverr;
1561 	}
1562 
1563 	SPDK_DEBUGLOG(fsdev_aio, "FSYNC succeeded for " FOBJECT_FMT " fh=%p\n",
1564 		      FOBJECT_ARGS(fobject), fhandle);
1565 
1566 	return 0;
1567 }
1568 
1569 static int
1570 lo_setxattr(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1571 {
1572 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1573 	ssize_t ret;
1574 	int saverr;
1575 	int fd = -1;
1576 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.setxattr.fobject;
1577 	char *name = fsdev_io->u_in.setxattr.name;
1578 	char *value = fsdev_io->u_in.setxattr.value;
1579 	uint32_t size = fsdev_io->u_in.setxattr.size;
1580 	uint32_t flags = fsdev_io->u_in.setxattr.flags;
1581 
1582 	if (!vfsdev->xattr_enabled) {
1583 		SPDK_INFOLOG(fsdev_aio, "xattr is disabled by config\n");
1584 		return -ENOSYS;
1585 	}
1586 
1587 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1588 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1589 		return -EINVAL;
1590 	}
1591 
1592 	if (fobject->is_symlink) {
1593 		/* Sorry, no race free way to removexattr on symlink. */
1594 		SPDK_ERRLOG("cannot set xattr for symlink\n");
1595 		return -EPERM;
1596 	}
1597 
1598 	fd = openat(vfsdev->proc_self_fd, fobject->fd_str, O_RDWR);
1599 	if (fd < 0) {
1600 		saverr = -errno;
1601 		SPDK_ERRLOG("openat failed with errno=%d\n", saverr);
1602 		return saverr;
1603 	}
1604 
1605 	ret = fsetxattr(fd, name, value, size, flags);
1606 	saverr = -errno;
1607 	close(fd);
1608 	if (ret == -1) {
1609 		if (saverr == -ENOTSUP) {
1610 			SPDK_INFOLOG(fsdev_aio, "flistxattr: extended attributes are not supported or disabled\n");
1611 		} else {
1612 			SPDK_ERRLOG("flistxattr failed with errno=%d\n", saverr);
1613 		}
1614 		return saverr;
1615 	}
1616 
1617 	SPDK_DEBUGLOG(fsdev_aio,
1618 		      "SETXATTR succeeded for " FOBJECT_FMT " name=%s value=%s size=%" PRIu32 "flags=0x%x" PRIx32 "\n",
1619 		      FOBJECT_ARGS(fobject), name, value, size, flags);
1620 
1621 	return 0;
1622 }
1623 
1624 static int
1625 lo_getxattr(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1626 {
1627 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1628 	ssize_t ret;
1629 	int saverr;
1630 	int fd = -1;
1631 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.getxattr.fobject;
1632 	char *name = fsdev_io->u_in.getxattr.name;
1633 	void *buffer = fsdev_io->u_in.getxattr.buffer;
1634 	size_t size = fsdev_io->u_in.getxattr.size;
1635 
1636 	if (!vfsdev->xattr_enabled) {
1637 		SPDK_INFOLOG(fsdev_aio, "xattr is disabled by config\n");
1638 		return -ENOSYS;
1639 	}
1640 
1641 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1642 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1643 		return -EINVAL;
1644 	}
1645 
1646 	if (fobject->is_symlink) {
1647 		/* Sorry, no race free way to getxattr on symlink. */
1648 		SPDK_ERRLOG("cannot get xattr for symlink\n");
1649 		return -EPERM;
1650 	}
1651 
1652 	fd = openat(vfsdev->proc_self_fd, fobject->fd_str, O_RDWR);
1653 	if (fd < 0) {
1654 		saverr = -errno;
1655 		SPDK_ERRLOG("openat failed with errno=%d\n", saverr);
1656 		return saverr;
1657 	}
1658 
1659 	ret = fgetxattr(fd, name, buffer, size);
1660 	saverr = -errno;
1661 	close(fd);
1662 	if (ret == -1) {
1663 		if (saverr == -ENODATA) {
1664 			SPDK_INFOLOG(fsdev_aio, "fgetxattr: no extended attribute '%s' found\n", name);
1665 		} else if (saverr == -ENOTSUP) {
1666 			SPDK_INFOLOG(fsdev_aio, "fgetxattr: extended attributes are not supported or disabled\n");
1667 		} else {
1668 			SPDK_ERRLOG("fgetxattr failed with errno=%d\n", saverr);
1669 		}
1670 		return saverr;
1671 	}
1672 
1673 	fsdev_io->u_out.getxattr.value_size = ret;
1674 
1675 	SPDK_DEBUGLOG(fsdev_aio,
1676 		      "GETXATTR succeeded for " FOBJECT_FMT " name=%s value=%s value_size=%zd\n",
1677 		      FOBJECT_ARGS(fobject), name, (char *)buffer, ret);
1678 
1679 	return 0;
1680 }
1681 
1682 static int
1683 lo_listxattr(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1684 {
1685 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1686 	ssize_t ret;
1687 	int saverr;
1688 	int fd = -1;
1689 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.listxattr.fobject;
1690 	char *buffer = fsdev_io->u_in.listxattr.buffer;
1691 	size_t size = fsdev_io->u_in.listxattr.size;
1692 
1693 	if (!vfsdev->xattr_enabled) {
1694 		SPDK_INFOLOG(fsdev_aio, "xattr is disabled by config\n");
1695 		return -ENOSYS;
1696 	}
1697 
1698 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1699 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1700 		return -EINVAL;
1701 	}
1702 
1703 	if (fobject->is_symlink) {
1704 		/* Sorry, no race free way to listxattr on symlink. */
1705 		SPDK_ERRLOG("cannot list xattr for symlink\n");
1706 		return -EPERM;
1707 	}
1708 
1709 	fd = openat(vfsdev->proc_self_fd, fobject->fd_str, O_RDONLY);
1710 	if (fd < 0) {
1711 		saverr = -errno;
1712 		SPDK_ERRLOG("openat failed with errno=%d\n", saverr);
1713 		return saverr;
1714 	}
1715 
1716 	ret = flistxattr(fd, buffer, size);
1717 	saverr = -errno;
1718 	close(fd);
1719 	if (ret == -1) {
1720 		if (saverr == -ENOTSUP) {
1721 			SPDK_INFOLOG(fsdev_aio, "flistxattr: extended attributes are not supported or disabled\n");
1722 		} else {
1723 			SPDK_ERRLOG("flistxattr failed with errno=%d\n", saverr);
1724 		}
1725 		return saverr;
1726 	}
1727 
1728 	fsdev_io->u_out.listxattr.data_size = ret;
1729 	fsdev_io->u_out.listxattr.size_only = (size == 0);
1730 
1731 	SPDK_DEBUGLOG(fsdev_aio, "LISTXATTR succeeded for " FOBJECT_FMT " data_size=%zu\n",
1732 		      FOBJECT_ARGS(fobject), ret);
1733 
1734 	return 0;
1735 }
1736 
1737 static int
1738 lo_removexattr(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1739 {
1740 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1741 	ssize_t ret;
1742 	int saverr;
1743 	int fd = -1;
1744 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.removexattr.fobject;
1745 	char *name = fsdev_io->u_in.removexattr.name;
1746 
1747 	if (!vfsdev->xattr_enabled) {
1748 		SPDK_INFOLOG(fsdev_aio, "xattr is disabled by config\n");
1749 		return -ENOSYS;
1750 	}
1751 
1752 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1753 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1754 		return -EINVAL;
1755 	}
1756 
1757 	if (fobject->is_symlink) {
1758 		/* Sorry, no race free way to setxattr on symlink. */
1759 		SPDK_ERRLOG("cannot list xattr for symlink\n");
1760 		return -EPERM;
1761 	}
1762 
1763 	fd = openat(vfsdev->proc_self_fd, fobject->fd_str, O_RDONLY);
1764 	if (fd < 0) {
1765 		saverr = -errno;
1766 		SPDK_ERRLOG("openat failed with errno=%d\n", saverr);
1767 		return saverr;
1768 	}
1769 
1770 	ret = fremovexattr(fd, name);
1771 	saverr = -errno;
1772 	close(fd);
1773 	if (ret == -1) {
1774 		if (saverr == -ENODATA) {
1775 			SPDK_INFOLOG(fsdev_aio, "fremovexattr: no extended attribute '%s' found\n", name);
1776 		} else if (saverr == -ENOTSUP) {
1777 			SPDK_INFOLOG(fsdev_aio, "fremovexattr: extended attributes are not supported or disabled\n");
1778 		} else {
1779 			SPDK_ERRLOG("fremovexattr failed with errno=%d\n", saverr);
1780 		}
1781 		return saverr;
1782 	}
1783 
1784 	SPDK_DEBUGLOG(fsdev_aio, "REMOVEXATTR succeeded for " FOBJECT_FMT " name=%s\n",
1785 		      FOBJECT_ARGS(fobject), name);
1786 
1787 	return 0;
1788 }
1789 
1790 static int
1791 lo_fsyncdir(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1792 {
1793 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1794 	int res;
1795 	int saverr = 0;
1796 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.fsyncdir.fobject;
1797 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.fsyncdir.fhandle;
1798 	bool datasync = fsdev_io->u_in.fsyncdir.datasync;
1799 
1800 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1801 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1802 		return -EINVAL;
1803 	}
1804 
1805 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
1806 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
1807 		return -EINVAL;
1808 	}
1809 
1810 	if (datasync) {
1811 		res = fdatasync(fhandle->fd);
1812 	} else {
1813 		res = fsync(fhandle->fd);
1814 	}
1815 
1816 	if (res == -1) {
1817 		saverr = -errno;
1818 		SPDK_ERRLOG("%s failed for fh=%p with err=%d\n",
1819 			    datasync ? "fdatasync" : "fsync", fhandle, saverr);
1820 		return saverr;
1821 	}
1822 
1823 	SPDK_DEBUGLOG(fsdev_aio, "FSYNCDIR succeeded for " FOBJECT_FMT " fh=%p datasync=%d\n",
1824 		      FOBJECT_ARGS(fobject), fhandle, datasync);
1825 
1826 	return 0;
1827 }
1828 
1829 static int
1830 lo_flock(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1831 {
1832 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1833 	int res;
1834 	int saverr = 0;
1835 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.flock.fobject;
1836 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.flock.fhandle;
1837 	int operation = fsdev_io->u_in.flock.operation;
1838 
1839 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1840 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1841 		return -EINVAL;
1842 	}
1843 
1844 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
1845 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
1846 		return -EINVAL;
1847 	}
1848 
1849 	res = flock(fhandle->fd, operation | LOCK_NB);
1850 	if (res == -1) {
1851 		saverr = -errno;
1852 		SPDK_ERRLOG("flock failed for fh=%p with err=%d\n", fhandle, saverr);
1853 		return saverr;
1854 	}
1855 
1856 	SPDK_DEBUGLOG(fsdev_aio, "FLOCK succeeded for " FOBJECT_FMT " fh=%p operation=%d\n",
1857 		      FOBJECT_ARGS(fobject), fhandle, operation);
1858 
1859 	return 0;
1860 }
1861 
1862 static int
1863 lo_fallocate(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
1864 {
1865 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
1866 	int err;
1867 	struct spdk_fsdev_file_object *fobject = fsdev_io->u_in.fallocate.fobject;
1868 	struct spdk_fsdev_file_handle *fhandle = fsdev_io->u_in.fallocate.fhandle;
1869 	uint32_t mode = fsdev_io->u_in.fallocate.mode;
1870 	uint64_t offset  = fsdev_io->u_in.fallocate.offset;
1871 	uint64_t length = fsdev_io->u_in.fallocate.length;
1872 
1873 	if (!fsdev_aio_is_valid_fobject(vfsdev, fobject)) {
1874 		SPDK_ERRLOG("Invalid fobject: %p\n", fobject);
1875 		return -EINVAL;
1876 	}
1877 
1878 	if (!fsdev_aio_is_valid_fhandle(vfsdev, fhandle)) {
1879 		SPDK_ERRLOG("Invalid fhandle: %p\n", fhandle);
1880 		return -EINVAL;
1881 	}
1882 
1883 	if (mode) {
1884 		SPDK_ERRLOG("non-zero mode is not suppored\n");
1885 		return -EOPNOTSUPP;
1886 	}
1887 
1888 	err = posix_fallocate(fhandle->fd, offset, length);
1889 	if (err) {
1890 		SPDK_ERRLOG("posix_fallocate failed for fh=%p with err=%d\n",
1891 			    fhandle, err);
1892 	}
1893 
1894 	SPDK_DEBUGLOG(fsdev_aio,
1895 		      "FALLOCATE returns %d for " FOBJECT_FMT " fh=%p offset=%" PRIu64 " length=%" PRIu64 "\n",
1896 		      err, FOBJECT_ARGS(fobject), fhandle, offset, length);
1897 	return err;
1898 }
1899 
/*
 * COPY_FILE_RANGE handler: copies `len` bytes from the input file handle at
 * `off_in` to the output file handle at `off_out` using copy_file_range().
 *
 * Only available when SPDK_CONFIG_COPY_FILE_RANGE is defined; otherwise the
 * handler returns -ENOSYS.
 *
 * Returns 0 on success, -EINVAL if any of the file objects or file handles
 * is invalid, or the negated errno of copy_file_range().
 */
static int
lo_copy_file_range(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
{
#ifndef SPDK_CONFIG_COPY_FILE_RANGE
	return -ENOSYS;
#else
	struct aio_fsdev *aio_dev = fsdev_to_aio_fsdev(fsdev_io->fsdev);
	struct spdk_fsdev_file_object *src_fobj = fsdev_io->u_in.copy_file_range.fobject_in;
	struct spdk_fsdev_file_handle *src_fh = fsdev_io->u_in.copy_file_range.fhandle_in;
	struct spdk_fsdev_file_object *dst_fobj = fsdev_io->u_in.copy_file_range.fobject_out;
	struct spdk_fsdev_file_handle *dst_fh = fsdev_io->u_in.copy_file_range.fhandle_out;
	off_t src_off = fsdev_io->u_in.copy_file_range.off_in;
	off_t dst_off = fsdev_io->u_in.copy_file_range.off_out;
	size_t count = fsdev_io->u_in.copy_file_range.len;
	uint32_t copy_flags = fsdev_io->u_in.copy_file_range.flags;
	ssize_t copied;

	if (!fsdev_aio_is_valid_fobject(aio_dev, src_fobj)) {
		SPDK_ERRLOG("Invalid fobject_in: %p\n", src_fobj);
		return -EINVAL;
	}

	if (!fsdev_aio_is_valid_fhandle(aio_dev, src_fh)) {
		SPDK_ERRLOG("Invalid fhandle_in: %p\n", src_fh);
		return -EINVAL;
	}

	if (!fsdev_aio_is_valid_fobject(aio_dev, dst_fobj)) {
		SPDK_ERRLOG("Invalid fobject_out: %p\n", dst_fobj);
		return -EINVAL;
	}

	if (!fsdev_aio_is_valid_fhandle(aio_dev, dst_fh)) {
		SPDK_ERRLOG("Invalid fhandle_out: %p\n", dst_fh);
		return -EINVAL;
	}

	/* The offsets are passed by pointer, so the call may update the local copies. */
	copied = copy_file_range(src_fh->fd, &src_off, dst_fh->fd, &dst_off, count, copy_flags);
	if (copied < 0) {
		int rc = -errno;

		SPDK_ERRLOG("copy_file_range failed with err=%d\n", rc);
		return rc;
	}

	SPDK_DEBUGLOG(fsdev_aio,
		      "COPY_FILE_RANGE succeeded for " FOBJECT_FMT " fh=%p offset=%" PRIu64 " -> " FOBJECT_FMT
		      " fh=%p offset=%" PRIu64 " (len-%zu flags=0x%" PRIx32 ")\n",
		      FOBJECT_ARGS(src_fobj), src_fh, (uint64_t)src_off, FOBJECT_ARGS(dst_fobj), dst_fh,
		      (uint64_t)dst_off, count, copy_flags);

	return 0;
#endif
}
1954 
1955 static int
1956 lo_abort(struct spdk_io_channel *_ch, struct spdk_fsdev_io *fsdev_io)
1957 {
1958 	struct aio_io_channel *ch = spdk_io_channel_get_ctx(_ch);
1959 	struct aio_fsdev_io *vfsdev_io;
1960 	uint64_t unique_to_abort = fsdev_io->u_in.abort.unique_to_abort;
1961 
1962 	TAILQ_FOREACH(vfsdev_io, &ch->ios_in_progress, link) {
1963 		struct spdk_fsdev_io *_fsdev_io = aio_to_fsdev_io(vfsdev_io);
1964 		if (spdk_fsdev_io_get_unique(_fsdev_io) == unique_to_abort) {
1965 			spdk_aio_mgr_cancel(ch->mgr, vfsdev_io->aio);
1966 			return 0;
1967 		}
1968 	}
1969 
1970 	return 0;
1971 }
1972 
1973 static int
1974 aio_io_poll(void *arg)
1975 {
1976 	struct aio_io_channel *ch = arg;
1977 
1978 	spdk_aio_mgr_poll(ch->mgr);
1979 
1980 	return SPDK_POLLER_IDLE;
1981 }
1982 
1983 static int
1984 aio_fsdev_create_cb(void *io_device, void *ctx_buf)
1985 {
1986 	struct aio_io_channel *ch = ctx_buf;
1987 	struct spdk_thread *thread = spdk_get_thread();
1988 
1989 	ch->mgr = spdk_aio_mgr_create(MAX_AIOS);
1990 	if (!ch->mgr) {
1991 		SPDK_ERRLOG("aoi manager init for failed (thread=%s)\n", spdk_thread_get_name(thread));
1992 		return -ENOMEM;
1993 	}
1994 
1995 	ch->poller = SPDK_POLLER_REGISTER(aio_io_poll, ch, 0);
1996 	TAILQ_INIT(&ch->ios_in_progress);
1997 
1998 	SPDK_DEBUGLOG(fsdev_aio, "Created aio fsdev IO channel: thread %s, thread id %" PRIu64
1999 		      "\n",
2000 		      spdk_thread_get_name(thread), spdk_thread_get_id(thread));
2001 	return 0;
2002 }
2003 
2004 static void
2005 aio_fsdev_destroy_cb(void *io_device, void *ctx_buf)
2006 {
2007 	struct aio_io_channel *ch = ctx_buf;
2008 	struct spdk_thread *thread = spdk_get_thread();
2009 
2010 	UNUSED(thread);
2011 
2012 	spdk_poller_unregister(&ch->poller);
2013 	spdk_aio_mgr_delete(ch->mgr);
2014 
2015 	SPDK_DEBUGLOG(fsdev_aio, "Destroyed aio fsdev IO channel: thread %s, thread id %" PRIu64
2016 		      "\n",
2017 		      spdk_thread_get_name(thread), spdk_thread_get_id(thread));
2018 }
2019 
2020 static int
2021 fsdev_aio_initialize(void)
2022 {
2023 	/*
2024 	 * We need to pick some unique address as our "io device" - so just use the
2025 	 *  address of the global tailq.
2026 	 */
2027 	spdk_io_device_register(&g_aio_fsdev_head,
2028 				aio_fsdev_create_cb, aio_fsdev_destroy_cb,
2029 				sizeof(struct aio_io_channel), "aio_fsdev");
2030 
2031 	return 0;
2032 }
2033 
/*
 * Completion callback passed to spdk_io_device_unregister().
 * Currently a no-op.
 */
static void
_fsdev_aio_finish_cb(void *arg)
{
	/*
	 * @todo: handle async module fini, i.e. call
	 * spdk_fsdev_module_fini_done() from here.
	 */
}
2040 
2041 static void
2042 fsdev_aio_finish(void)
2043 {
2044 	spdk_io_device_unregister(&g_aio_fsdev_head, _fsdev_aio_finish_cb);
2045 }
2046 
2047 static int
2048 fsdev_aio_get_ctx_size(void)
2049 {
2050 	return sizeof(struct aio_fsdev_io);
2051 }
2052 
2053 static struct spdk_fsdev_module aio_fsdev_module = {
2054 	.name = "aio",
2055 	.module_init = fsdev_aio_initialize,
2056 	.module_fini = fsdev_aio_finish,
2057 	.get_ctx_size	= fsdev_aio_get_ctx_size,
2058 };
2059 
2060 SPDK_FSDEV_MODULE_REGISTER(aio, &aio_fsdev_module);
2061 
2062 static void
2063 fsdev_aio_free(struct aio_fsdev *vfsdev)
2064 {
2065 	if (vfsdev->proc_self_fd != -1) {
2066 		close(vfsdev->proc_self_fd);
2067 	}
2068 
2069 	if (vfsdev->root) {
2070 		int destroyed = file_object_unref(vfsdev->root, 1);
2071 		assert(destroyed == 0);
2072 		UNUSED(destroyed);
2073 
2074 	}
2075 
2076 	free(vfsdev->fsdev.name);
2077 	free(vfsdev->root_path);
2078 
2079 	free(vfsdev);
2080 }
2081 
2082 static void
2083 fsdev_free_leafs(struct spdk_fsdev_file_object *fobject)
2084 {
2085 	while (!TAILQ_EMPTY(&fobject->handles)) {
2086 		struct spdk_fsdev_file_handle *fhandle = TAILQ_FIRST(&fobject->handles);
2087 		file_handle_delete(fhandle);
2088 #ifdef __clang_analyzer__
2089 		/*
2090 		 * scan-build fails to comprehend that file_handle_delete() removes the fhandle
2091 		 * from the queue, so it thinks it's remained accessible and throws the "Use of
2092 		 * memory after it is freed" error here.
2093 		 * The loop below "teaches" the scan-build that the freed fhandle is not on the
2094 		 * list anymore and supresses the error in this way.
2095 		 */
2096 		struct spdk_fsdev_file_handle *tmp;
2097 		TAILQ_FOREACH(tmp, &fobject->handles, link) {
2098 			assert(tmp != fhandle);
2099 		}
2100 #endif
2101 	}
2102 
2103 	while (!TAILQ_EMPTY(&fobject->leafs)) {
2104 		struct spdk_fsdev_file_object *leaf_fobject = TAILQ_FIRST(&fobject->leafs);
2105 		fsdev_free_leafs(leaf_fobject);
2106 	}
2107 
2108 	if (fobject->refcount) {
2109 		/* if still referenced - zero refcount */
2110 		int res = file_object_unref(fobject, fobject->refcount);
2111 		assert(res == 0);
2112 		UNUSED(res);
2113 	}
2114 }
2115 
2116 static int
2117 fsdev_aio_destruct(void *ctx)
2118 {
2119 	struct aio_fsdev *vfsdev = ctx;
2120 
2121 	TAILQ_REMOVE(&g_aio_fsdev_head, vfsdev, tailq);
2122 
2123 	fsdev_free_leafs(vfsdev->root);
2124 	vfsdev->root = NULL;
2125 
2126 	pthread_mutex_destroy(&vfsdev->mutex);
2127 
2128 	fsdev_aio_free(vfsdev);
2129 	return 0;
2130 }
2131 
2132 typedef int (*fsdev_op_handler_func)(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io);
2133 
2134 static fsdev_op_handler_func handlers[] = {
2135 	[SPDK_FSDEV_IO_LOOKUP] = lo_lookup,
2136 	[SPDK_FSDEV_IO_FORGET] = lo_forget,
2137 	[SPDK_FSDEV_IO_GETATTR] = lo_getattr,
2138 	[SPDK_FSDEV_IO_SETATTR] = lo_setattr,
2139 	[SPDK_FSDEV_IO_READLINK] = lo_readlink,
2140 	[SPDK_FSDEV_IO_SYMLINK] = lo_symlink,
2141 	[SPDK_FSDEV_IO_MKNOD] = lo_mknod,
2142 	[SPDK_FSDEV_IO_MKDIR] = lo_mkdir,
2143 	[SPDK_FSDEV_IO_UNLINK] = lo_unlink,
2144 	[SPDK_FSDEV_IO_RMDIR] = lo_rmdir,
2145 	[SPDK_FSDEV_IO_RENAME] = lo_rename,
2146 	[SPDK_FSDEV_IO_LINK] = lo_link,
2147 	[SPDK_FSDEV_IO_OPEN] = lo_open,
2148 	[SPDK_FSDEV_IO_READ] = lo_read,
2149 	[SPDK_FSDEV_IO_WRITE] = lo_write,
2150 	[SPDK_FSDEV_IO_STATFS] =  lo_statfs,
2151 	[SPDK_FSDEV_IO_RELEASE] = lo_release,
2152 	[SPDK_FSDEV_IO_FSYNC] = lo_fsync,
2153 	[SPDK_FSDEV_IO_SETXATTR] =  lo_setxattr,
2154 	[SPDK_FSDEV_IO_GETXATTR] =  lo_getxattr,
2155 	[SPDK_FSDEV_IO_LISTXATTR] = lo_listxattr,
2156 	[SPDK_FSDEV_IO_REMOVEXATTR] =  lo_removexattr,
2157 	[SPDK_FSDEV_IO_FLUSH] =  lo_flush,
2158 	[SPDK_FSDEV_IO_OPENDIR] =  lo_opendir,
2159 	[SPDK_FSDEV_IO_READDIR] =  lo_readdir,
2160 	[SPDK_FSDEV_IO_RELEASEDIR] = lo_releasedir,
2161 	[SPDK_FSDEV_IO_FSYNCDIR] = lo_fsyncdir,
2162 	[SPDK_FSDEV_IO_FLOCK] = lo_flock,
2163 	[SPDK_FSDEV_IO_CREATE] = lo_create,
2164 	[SPDK_FSDEV_IO_ABORT] = lo_abort,
2165 	[SPDK_FSDEV_IO_FALLOCATE] = lo_fallocate,
2166 	[SPDK_FSDEV_IO_COPY_FILE_RANGE] = lo_copy_file_range,
2167 };
2168 
2169 static void
2170 fsdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_fsdev_io *fsdev_io)
2171 {
2172 	int status;
2173 	enum spdk_fsdev_io_type type = spdk_fsdev_io_get_type(fsdev_io);
2174 
2175 	assert(type >= 0 && type < __SPDK_FSDEV_IO_LAST);
2176 
2177 	status = handlers[type](ch, fsdev_io);
2178 	if (status != IO_STATUS_ASYNC) {
2179 		spdk_fsdev_io_complete(fsdev_io, status);
2180 	}
2181 }
2182 
2183 static struct spdk_io_channel *
2184 fsdev_aio_get_io_channel(void *ctx)
2185 {
2186 	return spdk_get_io_channel(&g_aio_fsdev_head);
2187 }
2188 
2189 static int
2190 fsdev_aio_negotiate_opts(void *ctx, struct spdk_fsdev_open_opts *opts)
2191 {
2192 	struct aio_fsdev *vfsdev = ctx;
2193 
2194 	assert(opts != 0);
2195 	assert(opts->opts_size != 0);
2196 
2197 	UNUSED(vfsdev);
2198 
2199 	if (opts->opts_size > offsetof(struct spdk_fsdev_open_opts, max_write)) {
2200 		/* Set the value the aio fsdev was created with */
2201 		opts->max_write = vfsdev->fsdev.opts.max_write;
2202 	}
2203 
2204 	if (opts->opts_size > offsetof(struct spdk_fsdev_open_opts, writeback_cache_enabled)) {
2205 		if (vfsdev->fsdev.opts.writeback_cache_enabled) {
2206 			/* The writeback_cache_enabled was enabled upon creation => we follow the opts */
2207 			vfsdev->fsdev.opts.writeback_cache_enabled = opts->writeback_cache_enabled;
2208 		} else {
2209 			/* The writeback_cache_enabled was disabled upon creation => we reflect it in the opts */
2210 			opts->writeback_cache_enabled = false;
2211 		}
2212 	}
2213 
2214 	/* The AIO doesn't apply any additional restrictions, so we just accept the requested opts */
2215 	SPDK_DEBUGLOG(fsdev_aio,
2216 		      "aio filesystem %s: opts updated: max_write=%" PRIu32 ", writeback_cache=%" PRIu8 "\n",
2217 		      vfsdev->fsdev.name, vfsdev->fsdev.opts.max_write, vfsdev->fsdev.opts.writeback_cache_enabled);
2218 
2219 	return 0;
2220 }
2221 
2222 static void
2223 fsdev_aio_write_config_json(struct spdk_fsdev *fsdev, struct spdk_json_write_ctx *w)
2224 {
2225 	struct aio_fsdev *vfsdev = fsdev_to_aio_fsdev(fsdev);
2226 
2227 	spdk_json_write_object_begin(w);
2228 	spdk_json_write_named_string(w, "method", "fsdev_aio_create");
2229 	spdk_json_write_named_object_begin(w, "params");
2230 	spdk_json_write_named_string(w, "name", spdk_fsdev_get_name(&vfsdev->fsdev));
2231 	spdk_json_write_named_string(w, "root_path", vfsdev->root_path);
2232 	spdk_json_write_named_bool(w, "enable_xattr", vfsdev->xattr_enabled);
2233 	spdk_json_write_named_bool(w, "enable_writeback_cache",
2234 				   !!vfsdev->fsdev.opts.writeback_cache_enabled);
2235 	spdk_json_write_named_uint32(w, "max_write", vfsdev->fsdev.opts.max_write);
2236 	spdk_json_write_object_end(w); /* params */
2237 	spdk_json_write_object_end(w);
2238 }
2239 
2240 static const struct spdk_fsdev_fn_table aio_fn_table = {
2241 	.destruct		= fsdev_aio_destruct,
2242 	.submit_request		= fsdev_aio_submit_request,
2243 	.get_io_channel		= fsdev_aio_get_io_channel,
2244 	.negotiate_opts		= fsdev_aio_negotiate_opts,
2245 	.write_config_json	= fsdev_aio_write_config_json,
2246 };
2247 
2248 static int
2249 setup_root(struct aio_fsdev *vfsdev)
2250 {
2251 	int fd, res;
2252 	struct stat stat;
2253 
2254 	fd = open(vfsdev->root_path, O_PATH);
2255 	if (fd == -1) {
2256 		res = -errno;
2257 		SPDK_ERRLOG("Cannot open root %s (err=%d)\n", vfsdev->root_path, res);
2258 		return res;
2259 	}
2260 
2261 	res = fstatat(fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
2262 	if (res == -1) {
2263 		res = -errno;
2264 		SPDK_ERRLOG("Cannot get root fstatat of %s (err=%d)\n", vfsdev->root_path, res);
2265 		close(fd);
2266 		return res;
2267 	}
2268 
2269 	vfsdev->root = file_object_create_unsafe(NULL, fd, stat.st_ino, stat.st_dev, stat.st_mode);
2270 	if (!vfsdev->root) {
2271 		SPDK_ERRLOG("Cannot alloc root\n");
2272 		close(fd);
2273 		return -ENOMEM;
2274 	}
2275 
2276 	SPDK_INFOLOG(fsdev_aio, "root (%s) fd=%d\n", vfsdev->root_path, fd);
2277 	return 0;
2278 }
2279 
2280 static int
2281 setup_proc_self_fd(struct aio_fsdev *vfsdev)
2282 {
2283 	vfsdev->proc_self_fd = open("/proc/self/fd", O_PATH);
2284 	if (vfsdev->proc_self_fd == -1) {
2285 		int saverr = -errno;
2286 		SPDK_ERRLOG("Failed to open procfs fd dir with %d\n", saverr);
2287 		return saverr;
2288 	}
2289 
2290 	SPDK_DEBUGLOG(fsdev_aio, "procfs fd dir opened (fd=%d)\n", vfsdev->proc_self_fd);
2291 	return 0;
2292 }
2293 
2294 void
2295 spdk_fsdev_aio_get_default_opts(struct spdk_fsdev_aio_opts *opts)
2296 {
2297 	assert(opts);
2298 
2299 	memset(opts, 0, sizeof(*opts));
2300 
2301 	opts->xattr_enabled = DEFAULT_XATTR_ENABLED;
2302 	opts->writeback_cache_enabled = DEFAULT_WRITEBACK_CACHE;
2303 	opts->max_write = DEFAULT_MAX_WRITE;
2304 }
2305 
2306 int
2307 spdk_fsdev_aio_create(struct spdk_fsdev **fsdev, const char *name, const char *root_path,
2308 		      const struct spdk_fsdev_aio_opts *opts)
2309 {
2310 	struct aio_fsdev *vfsdev;
2311 	int rc;
2312 
2313 	vfsdev = calloc(1, sizeof(*vfsdev));
2314 	if (!vfsdev) {
2315 		SPDK_ERRLOG("Could not allocate aio_fsdev\n");
2316 		return -ENOMEM;
2317 	}
2318 
2319 	vfsdev->proc_self_fd = -1;
2320 
2321 	vfsdev->fsdev.name = strdup(name);
2322 	if (!vfsdev->fsdev.name) {
2323 		SPDK_ERRLOG("Could not strdup fsdev name: %s\n", name);
2324 		fsdev_aio_free(vfsdev);
2325 		return -ENOMEM;
2326 	}
2327 
2328 	vfsdev->root_path = strdup(root_path);
2329 	if (!vfsdev->root_path) {
2330 		SPDK_ERRLOG("Could not strdup root path: %s\n", root_path);
2331 		fsdev_aio_free(vfsdev);
2332 		return -ENOMEM;
2333 	}
2334 
2335 	rc = setup_root(vfsdev);
2336 	if (rc) {
2337 		SPDK_ERRLOG("Could not setup root: %s (err=%d)\n", root_path, rc);
2338 		fsdev_aio_free(vfsdev);
2339 		return rc;
2340 	}
2341 
2342 	rc = setup_proc_self_fd(vfsdev);
2343 	if (rc) {
2344 		SPDK_ERRLOG("Could not setup proc_self_fd (err=%d)\n", rc);
2345 		fsdev_aio_free(vfsdev);
2346 		return rc;
2347 	}
2348 
2349 	if (opts->xattr_enabled) {
2350 		SPDK_ERRLOG("Extended attributes can only be enabled in Linux\n");
2351 		fsdev_aio_free(vfsdev);
2352 		return rc;
2353 	}
2354 
2355 	vfsdev->xattr_enabled = opts->xattr_enabled;
2356 	vfsdev->fsdev.ctxt = vfsdev;
2357 	vfsdev->fsdev.fn_table = &aio_fn_table;
2358 	vfsdev->fsdev.module = &aio_fsdev_module;
2359 
2360 	pthread_mutex_init(&vfsdev->mutex, NULL);
2361 
2362 	rc = spdk_fsdev_register(&vfsdev->fsdev);
2363 	if (rc) {
2364 		fsdev_aio_free(vfsdev);
2365 		return rc;
2366 	}
2367 
2368 	vfsdev->fsdev.opts.writeback_cache_enabled = opts->writeback_cache_enabled;
2369 	vfsdev->fsdev.opts.max_write = opts->max_write;
2370 
2371 	*fsdev = &(vfsdev->fsdev);
2372 	TAILQ_INSERT_TAIL(&g_aio_fsdev_head, vfsdev, tailq);
2373 	SPDK_DEBUGLOG(fsdev_aio, "Created aio filesystem %s (xattr_enabled=%" PRIu8 " writeback_cache=%"
2374 		      PRIu8 " max_write=%" PRIu32 ")\n",
2375 		      vfsdev->fsdev.name, vfsdev->xattr_enabled, vfsdev->fsdev.opts.writeback_cache_enabled,
2376 		      vfsdev->fsdev.opts.max_write);
2377 	return rc;
2378 }
2379 void
2380 spdk_fsdev_aio_delete(const char *name,
2381 		      spdk_delete_aio_fsdev_complete cb_fn, void *cb_arg)
2382 {
2383 	int rc;
2384 
2385 	rc = spdk_fsdev_unregister_by_name(name, &aio_fsdev_module, cb_fn, cb_arg);
2386 	if (rc != 0) {
2387 		cb_fn(cb_arg, rc);
2388 	}
2389 
2390 	SPDK_DEBUGLOG(fsdev_aio, "Deleted aio filesystem %s\n", name);
2391 }
2392 
2393 SPDK_LOG_REGISTER_COMPONENT(fsdev_aio)
2394