xref: /netbsd-src/sys/kern/vfs_syscalls.c (revision 8b1c477bfb0e6273b4538f99383bf83270ac142c)
1 /*	$NetBSD: vfs_syscalls.c,v 1.570 2024/12/07 02:23:09 riastradh Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2009, 2019, 2020, 2023 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1989, 1993
34  *	The Regents of the University of California.  All rights reserved.
35  * (c) UNIX System Laboratories, Inc.
36  * All or some portions of this file are derived from material licensed
37  * to the University of California by American Telephone and Telegraph
38  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39  * the permission of UNIX System Laboratories, Inc.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.42 (Berkeley) 7/31/95
66  */
67 
68 /*
69  * Virtual File System System Calls
70  */
71 
72 #include <sys/cdefs.h>
73 __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.570 2024/12/07 02:23:09 riastradh Exp $");
74 
75 #ifdef _KERNEL_OPT
76 #include "opt_fileassoc.h"
77 #include "veriexec.h"
78 #endif
79 
80 #include <sys/param.h>
81 #include <sys/types.h>
82 
83 #include <sys/atomic.h>
84 #include <sys/buf.h>
85 #include <sys/compat_stub.h>
86 #include <sys/dirent.h>
87 #include <sys/event.h>
88 #include <sys/extattr.h>
89 #include <sys/fcntl.h>
90 #include <sys/file.h>
91 #ifdef FILEASSOC
92 #include <sys/fileassoc.h>
93 #endif /* FILEASSOC */
94 #include <sys/filedesc.h>
95 #include <sys/fstrans.h>
96 #include <sys/kauth.h>
97 #include <sys/kernel.h>
98 #include <sys/kmem.h>
99 #include <sys/ktrace.h>
100 #include <sys/module.h>
101 #include <sys/mount.h>
102 #include <sys/namei.h>
103 #include <sys/proc.h>
104 #include <sys/quota.h>
105 #include <sys/quotactl.h>
106 #include <sys/stat.h>
107 #include <sys/syscallargs.h>
108 #include <sys/sysctl.h>
109 #include <sys/systm.h>
110 #include <sys/uio.h>
111 #include <sys/verified_exec.h>
112 #include <sys/vfs_syscalls.h>
113 #include <sys/vnode.h>
114 
115 #include <miscfs/genfs/genfs.h>
116 #include <miscfs/specfs/specdev.h>
117 
118 #include <nfs/nfs.h>
119 #include <nfs/nfs_var.h>
120 #include <nfs/nfsproto.h>
121 #include <nfs/rpcv2.h>
122 
/* XXX: this definition does not belong here; only used if not yet defined. */
#if !defined(OFF_T_MAX)
#define OFF_T_MAX __type_max(off_t)
#endif
127 
128 static int change_flags(struct vnode *, u_long, struct lwp *);
129 static int change_mode(struct vnode *, int, struct lwp *);
130 static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
131 static int do_sys_openat(lwp_t *, int, const char *, int, int, int *);
132 static int do_sys_mkdirat(struct lwp *l, int, const char *, mode_t,
133     enum uio_seg);
134 static int do_sys_mkfifoat(struct lwp *, int, const char *, mode_t);
135 static int do_sys_symlinkat(struct lwp *, const char *, int, const char *,
136     enum uio_seg);
137 static int do_sys_renameat(struct lwp *l, int, const char *, int, const char *,
138     enum uio_seg, int);
139 static int do_sys_readlinkat(struct lwp *, int, const char *, char *,
140     size_t, register_t *);
141 static int do_sys_unlinkat(struct lwp *, int, const char *, int, enum uio_seg);
142 
143 static int fd_nameiat(struct lwp *, int, struct nameidata *);
144 static int fd_nameiat_simple_user(struct lwp *, int, const char *,
145     namei_simple_flags_t, struct vnode **);
146 
147 /*
148  * This table is used to maintain compatibility with 4.3BSD
149  * and NetBSD 0.9 mount syscalls - and possibly other systems.
150  * Note, the order is important!
151  *
152  * Do not modify this table. It should only contain filesystems
153  * supported by NetBSD 0.9 and 4.3BSD.
154  */
155 const char * const mountcompatnames[] = {
156 	NULL,		/* 0 = MOUNT_NONE */
157 	MOUNT_FFS,	/* 1 = MOUNT_UFS */
158 	MOUNT_NFS,	/* 2 */
159 	MOUNT_MFS,	/* 3 */
160 	MOUNT_MSDOS,	/* 4 */
161 	MOUNT_CD9660,	/* 5 = MOUNT_ISOFS */
162 	MOUNT_FDESC,	/* 6 */
163 	MOUNT_KERNFS,	/* 7 */
164 	NULL,		/* 8 = MOUNT_DEVFS */
165 	MOUNT_AFS,	/* 9 */
166 };
167 
168 const u_int nmountcompatnames = __arraycount(mountcompatnames);
169 
170 /*
171  * Filter event method for EVFILT_FS.
172  */
173 static struct klist fs_klist;
174 static kmutex_t fs_klist_lock;
175 
176 CTASSERT((NOTE_SUBMIT & VQ_MOUNT) == 0);
177 CTASSERT((NOTE_SUBMIT & VQ_UNMOUNT) == 0);
178 
179 void
180 vfs_evfilt_fs_init(void)
181 {
182 
183 	klist_init(&fs_klist);
184 	mutex_init(&fs_klist_lock, MUTEX_DEFAULT, IPL_NONE);
185 }
186 
187 static int
188 filt_fsattach(struct knote *kn)
189 {
190 
191 	mutex_enter(&fs_klist_lock);
192 	kn->kn_flags |= EV_CLEAR;
193 	klist_insert(&fs_klist, kn);
194 	mutex_exit(&fs_klist_lock);
195 
196 	return 0;
197 }
198 
199 static void
200 filt_fsdetach(struct knote *kn)
201 {
202 
203 	mutex_enter(&fs_klist_lock);
204 	klist_remove(&fs_klist, kn);
205 	mutex_exit(&fs_klist_lock);
206 }
207 
208 static int
209 filt_fs(struct knote *kn, long hint)
210 {
211 	int rv;
212 
213 	if (hint & NOTE_SUBMIT) {
214 		KASSERT(mutex_owned(&fs_klist_lock));
215 		kn->kn_fflags |= hint & ~NOTE_SUBMIT;
216 	} else {
217 		mutex_enter(&fs_klist_lock);
218 	}
219 
220 	rv = (kn->kn_fflags != 0);
221 
222 	if ((hint & NOTE_SUBMIT) == 0) {
223 		mutex_exit(&fs_klist_lock);
224 	}
225 
226 	return rv;
227 }
228 
229 /* referenced in kern_event.c */
230 const struct filterops fs_filtops = {
231 	.f_flags = FILTEROP_MPSAFE,
232 	.f_attach = filt_fsattach,
233 	.f_detach = filt_fsdetach,
234 	.f_event = filt_fs,
235 };
236 
237 static int
238 fd_nameiat(struct lwp *l, int fdat, struct nameidata *ndp)
239 {
240 	file_t *dfp;
241 	int error;
242 	const char *path = pathbuf_stringcopy_get(ndp->ni_pathbuf);
243 
244 	if (fdat != AT_FDCWD && path[0] != '/') {
245 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
246 			goto out;
247 
248 		NDAT(ndp, dfp->f_vnode);
249 	}
250 
251 	error = namei(ndp);
252 
253 	if (fdat != AT_FDCWD && path[0] != '/')
254 		fd_putfile(fdat);
255 out:
256 	pathbuf_stringcopy_put(ndp->ni_pathbuf, path);
257 	return error;
258 }
259 
260 static int
261 fd_nameiat_simple_user(struct lwp *l, int fdat, const char *path,
262     namei_simple_flags_t sflags, struct vnode **vp_ret)
263 {
264 	file_t *dfp;
265 	struct vnode *dvp;
266 	int error;
267 	struct pathbuf *pb;
268 	const char *p;
269 
270 	error = pathbuf_copyin(path, &pb);
271 	if (error) {
272 		return error;
273 	}
274 	p = pathbuf_stringcopy_get(pb);
275 
276 	if (fdat != AT_FDCWD && p[0] != '/') {
277 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
278 			goto out;
279 
280 		dvp = dfp->f_vnode;
281 	} else {
282 		dvp = NULL;
283 	}
284 
285 	error = nameiat_simple(dvp, pb, sflags, vp_ret);
286 
287 	if (fdat != AT_FDCWD && p[0] != '/')
288 		fd_putfile(fdat);
289 
290 out:
291 	pathbuf_stringcopy_put(pb, p);
292 	pathbuf_destroy(pb);
293 
294 	return error;
295 }
296 
297 static int
298 open_setfp(struct lwp *l, file_t *fp, struct vnode *vp, int indx, int flags)
299 {
300 	int error;
301 
302 	fp->f_flag = flags & FMASK;
303 	fp->f_type = DTYPE_VNODE;
304 	fp->f_ops = &vnops;
305 	fp->f_vnode = vp;
306 
307 	if (flags & (O_EXLOCK | O_SHLOCK)) {
308 		struct flock lf;
309 		int type;
310 
311 		lf.l_whence = SEEK_SET;
312 		lf.l_start = 0;
313 		lf.l_len = 0;
314 		if (flags & O_EXLOCK)
315 			lf.l_type = F_WRLCK;
316 		else
317 			lf.l_type = F_RDLCK;
318 		type = F_FLOCK;
319 		if ((flags & FNONBLOCK) == 0)
320 			type |= F_WAIT;
321 		VOP_UNLOCK(vp);
322 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
323 		if (error) {
324 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
325 			fd_abort(l->l_proc, fp, indx);
326 			return error;
327 		}
328 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
329 		atomic_or_uint(&fp->f_flag, FHASLOCK);
330 	}
331 	if (flags & O_CLOEXEC)
332 		fd_set_exclose(l, indx, true);
333 	return 0;
334 }
335 
336 static int
337 mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
338     void *data, size_t *data_len)
339 {
340 	struct mount *mp;
341 	int error = 0, saved_flags;
342 
343 	mp = vp->v_mount;
344 	saved_flags = mp->mnt_flag;
345 
346 	/* We can operate only on VV_ROOT nodes. */
347 	if ((vp->v_vflag & VV_ROOT) == 0) {
348 		error = EINVAL;
349 		goto out;
350 	}
351 
352 	/*
353 	 * We only allow the filesystem to be reloaded if it
354 	 * is currently mounted read-only.  Additionally, we
355 	 * prevent read-write to read-only downgrades.
356 	 */
357 	if ((flags & (MNT_RELOAD | MNT_RDONLY)) != 0 &&
358 	    (mp->mnt_flag & MNT_RDONLY) == 0 &&
359 	    (mp->mnt_iflag & IMNT_CAN_RWTORO) == 0) {
360 		error = EOPNOTSUPP;	/* Needs translation */
361 		goto out;
362 	}
363 
364 	/*
365 	 * Enabling MNT_UNION requires a covered mountpoint and
366 	 * must not happen on the root mount.
367 	 */
368 	if ((flags & MNT_UNION) != 0 && mp->mnt_vnodecovered == NULLVP) {
369 		error = EOPNOTSUPP;
370 		goto out;
371 	}
372 
373 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
374 	    KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
375 	if (error)
376 		goto out;
377 
378 	error = vfs_suspend(mp, 0);
379 	if (error)
380 		goto out;
381 
382 	mutex_enter(mp->mnt_updating);
383 
384 	mp->mnt_flag &= ~MNT_OP_FLAGS;
385 	mp->mnt_flag |= flags & MNT_OP_FLAGS;
386 
387 	/*
388 	 * Set the mount level flags.
389 	 */
390 	if ((flags & MNT_RDONLY) != (mp->mnt_flag & MNT_RDONLY)) {
391 		if ((flags & MNT_RDONLY))
392 			mp->mnt_iflag |= IMNT_WANTRDONLY;
393 		else
394 			mp->mnt_iflag |= IMNT_WANTRDWR;
395 	}
396 	mp->mnt_flag &= ~MNT_BASIC_FLAGS;
397 	mp->mnt_flag |= flags & MNT_BASIC_FLAGS;
398 	if ((mp->mnt_iflag & IMNT_WANTRDONLY))
399 		mp->mnt_flag &= ~MNT_RDONLY;
400 
401 	error = VFS_MOUNT(mp, path, data, data_len);
402 
403 	if (error && data != NULL) {
404 		int error2;
405 
406 		/*
407 		 * Update failed; let's try and see if it was an
408 		 * export request.  For compat with 3.0 and earlier.
409 		 */
410 		error2 = vfs_hooks_reexport(mp, path, data);
411 
412 		/*
413 		 * Only update error code if the export request was
414 		 * understood but some problem occurred while
415 		 * processing it.
416 		 */
417 		if (error2 != EJUSTRETURN)
418 			error = error2;
419 	}
420 
421 	if (error == 0 && (mp->mnt_iflag & IMNT_WANTRDONLY))
422 		mp->mnt_flag |= MNT_RDONLY;
423 	if (error)
424 		mp->mnt_flag = saved_flags;
425 	mp->mnt_flag &= ~MNT_OP_FLAGS;
426 	mp->mnt_iflag &= ~(IMNT_WANTRDONLY | IMNT_WANTRDWR);
427 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
428 		if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0)
429 			vfs_syncer_add_to_worklist(mp);
430 	} else {
431 		if ((mp->mnt_iflag & IMNT_ONWORKLIST) != 0)
432 			vfs_syncer_remove_from_worklist(mp);
433 	}
434 	mutex_exit(mp->mnt_updating);
435 	vfs_resume(mp);
436 
437 	if ((error == 0) && !(saved_flags & MNT_EXTATTR) &&
438 	    (flags & MNT_EXTATTR)) {
439 		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_START,
440 			NULL, 0, NULL) != 0) {
441 			printf("%s: failed to start extattr, error = %d",
442 			    mp->mnt_stat.f_mntonname, error);
443 			mp->mnt_flag &= ~MNT_EXTATTR;
444 		}
445 	}
446 
447 	if ((error == 0) && (saved_flags & MNT_EXTATTR) &&
448 	    !(flags & MNT_EXTATTR)) {
449 		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_STOP,
450 			NULL, 0, NULL) != 0) {
451 			printf("%s: failed to stop extattr, error = %d",
452 			    mp->mnt_stat.f_mntonname, error);
453 			mp->mnt_flag |= MNT_RDONLY;
454 		}
455 	}
456 out:
457 	return (error);
458 }
459 
460 static int
461 mount_get_vfsops(const char *fstype, enum uio_seg type_seg,
462     struct vfsops **vfsops)
463 {
464 	char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
465 	int error;
466 
467 	if (type_seg == UIO_USERSPACE) {
468 		/* Copy file-system type from userspace.  */
469 		error = copyinstr(fstype, fstypename, sizeof(fstypename),
470 		    NULL);
471 	} else {
472 		error = copystr(fstype, fstypename, sizeof(fstypename), NULL);
473 		KASSERT(error == 0);
474 	}
475 
476 	if (error) {
477 		/*
478 		 * Historically, filesystem types were identified by numbers.
479 		 * If we get an integer for the filesystem type instead of a
480 		 * string, we check to see if it matches one of the historic
481 		 * filesystem types.
482 		 */
483 		u_long fsindex = (u_long)fstype;
484 		if (fsindex >= nmountcompatnames ||
485 		    mountcompatnames[fsindex] == NULL)
486 			return ENODEV;
487 		strlcpy(fstypename, mountcompatnames[fsindex],
488 		    sizeof(fstypename));
489 	}
490 
491 	/* Accept `ufs' as an alias for `ffs', for compatibility. */
492 	if (strcmp(fstypename, "ufs") == 0)
493 		fstypename[0] = 'f';
494 
495 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
496 		return 0;
497 
498 	/* If we can autoload a vfs module, try again */
499 	(void)module_autoload(fstypename, MODULE_CLASS_VFS);
500 
501 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
502 		return 0;
503 
504 	return ENODEV;
505 }
506 
507 static int
508 mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
509     void *data, size_t *data_len)
510 {
511 	struct mount *mp;
512 	int error;
513 
514 	/* If MNT_GETARGS is specified, it should be the only flag. */
515 	if (flags & ~MNT_GETARGS)
516 		return EINVAL;
517 
518 	mp = vp->v_mount;
519 
520 	/* XXX: probably some notion of "can see" here if we want isolation. */
521 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
522 	    KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
523 	if (error)
524 		return error;
525 
526 	if ((vp->v_vflag & VV_ROOT) == 0)
527 		return EINVAL;
528 
529 	if (vfs_busy(mp))
530 		return EPERM;
531 
532 	mutex_enter(mp->mnt_updating);
533 	mp->mnt_flag &= ~MNT_OP_FLAGS;
534 	mp->mnt_flag |= MNT_GETARGS;
535 	error = VFS_MOUNT(mp, path, data, data_len);
536 	mp->mnt_flag &= ~MNT_OP_FLAGS;
537 	mutex_exit(mp->mnt_updating);
538 
539 	vfs_unbusy(mp);
540 	return (error);
541 }
542 
543 int
544 sys___mount50(struct lwp *l, const struct sys___mount50_args *uap,
545     register_t *retval)
546 {
547 	/* {
548 		syscallarg(const char *) type;
549 		syscallarg(const char *) path;
550 		syscallarg(int) flags;
551 		syscallarg(void *) data;
552 		syscallarg(size_t) data_len;
553 	} */
554 
555 	return do_sys_mount(l, SCARG(uap, type), UIO_USERSPACE,
556 	    SCARG(uap, path), SCARG(uap, flags),
557 	    SCARG(uap, data), UIO_USERSPACE, SCARG(uap, data_len),
558 	    retval);
559 }
560 
561 int
562 do_sys_mount(struct lwp *l, const char *type, enum uio_seg type_seg,
563     const char *path, int flags,
564     void *data, enum uio_seg data_seg, size_t data_len,
565     register_t *retval)
566 {
567 	struct vfsops *vfsops = NULL;	/* XXX gcc4.8 */
568 	struct vnode *vp;
569 	void *data_buf = data;
570 	bool vfsopsrele = false;
571 	size_t alloc_sz = 0;
572 	int error;
573 
574 	/*
575 	 * Get vnode to be covered
576 	 */
577 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
578 	if (error != 0) {
579 		vp = NULL;
580 		goto done;
581 	}
582 
583 	if (flags & (MNT_GETARGS | MNT_UPDATE)) {
584 		vfsops = vp->v_mount->mnt_op;
585 	} else {
586 		/* 'type' is userspace */
587 		error = mount_get_vfsops(type, type_seg, &vfsops);
588 		if (error != 0)
589 			goto done;
590 		vfsopsrele = true;
591 	}
592 
593 	/*
594 	 * We allow data to be NULL, even for userspace. Some fs's don't need
595 	 * it. The others will handle NULL.
596 	 */
597 	if (data != NULL && data_seg == UIO_USERSPACE) {
598 		if (data_len == 0) {
599 			/* No length supplied, use default for filesystem */
600 			data_len = vfsops->vfs_min_mount_data;
601 
602 			/*
603 			 * Hopefully a longer buffer won't make copyin() fail.
604 			 * For compatibility with 3.0 and earlier.
605 			 */
606 			if (flags & MNT_UPDATE
607 			    && data_len < sizeof (struct mnt_export_args30))
608 				data_len = sizeof (struct mnt_export_args30);
609 		}
610 		if ((data_len == 0) || (data_len > VFS_MAX_MOUNT_DATA)) {
611 			error = EINVAL;
612 			goto done;
613 		}
614 		alloc_sz = data_len;
615 		data_buf = kmem_alloc(alloc_sz, KM_SLEEP);
616 
617 		/* NFS needs the buffer even for mnt_getargs .... */
618 		error = copyin(data, data_buf, data_len);
619 		if (error != 0)
620 			goto done;
621 	}
622 
623 	if (flags & MNT_GETARGS) {
624 		if (data_len == 0) {
625 			error = EINVAL;
626 			goto done;
627 		}
628 		error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
629 		if (error != 0)
630 			goto done;
631 		if (data_seg == UIO_USERSPACE)
632 			error = copyout(data_buf, data, data_len);
633 		*retval = data_len;
634 	} else if (flags & MNT_UPDATE) {
635 		error = mount_update(l, vp, path, flags, data_buf, &data_len);
636 	} else {
637 		/* Locking is handled internally in mount_domount(). */
638 		KASSERT(vfsopsrele == true);
639 		error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
640 		    &data_len);
641 		vfsopsrele = false;
642 	}
643 	if (!error) {
644 		mutex_enter(&fs_klist_lock);
645 		KNOTE(&fs_klist, NOTE_SUBMIT | VQ_MOUNT);
646 		mutex_exit(&fs_klist_lock);
647 	}
648 
649 done:
650 	if (vfsopsrele)
651 		vfs_delref(vfsops);
652 	if (vp != NULL) {
653 		vrele(vp);
654 	}
655 	if (data_buf != data)
656 		kmem_free(data_buf, alloc_sz);
657 	return (error);
658 }
659 
660 /*
661  * Unmount a file system.
662  *
663  * Note: unmount takes a path to the vnode mounted on as argument,
664  * not special file (as before).
665  */
666 /* ARGSUSED */
667 int
668 sys_unmount(struct lwp *l, const struct sys_unmount_args *uap,
669     register_t *retval)
670 {
671 	/* {
672 		syscallarg(const char *) path;
673 		syscallarg(int) flags;
674 	} */
675 	struct vnode *vp;
676 	struct mount *mp;
677 	int error;
678 	struct pathbuf *pb;
679 	struct nameidata nd;
680 
681 	error = pathbuf_copyin(SCARG(uap, path), &pb);
682 	if (error) {
683 		return error;
684 	}
685 
686 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb);
687 	if ((error = namei(&nd)) != 0) {
688 		pathbuf_destroy(pb);
689 		return error;
690 	}
691 	vp = nd.ni_vp;
692 	pathbuf_destroy(pb);
693 
694 	mp = vp->v_mount;
695 	vfs_ref(mp);
696 	VOP_UNLOCK(vp);
697 
698 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
699 	    KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
700 	if (error) {
701 		vrele(vp);
702 		vfs_rele(mp);
703 		return (error);
704 	}
705 
706 	/*
707 	 * Don't allow unmounting the root file system.
708 	 */
709 	if (mp->mnt_flag & MNT_ROOTFS) {
710 		vrele(vp);
711 		vfs_rele(mp);
712 		return (EINVAL);
713 	}
714 
715 	/*
716 	 * Must be the root of the filesystem
717 	 */
718 	if ((vp->v_vflag & VV_ROOT) == 0) {
719 		vrele(vp);
720 		vfs_rele(mp);
721 		return (EINVAL);
722 	}
723 
724 	vrele(vp);
725 	error = dounmount(mp, SCARG(uap, flags), l);
726 	vfs_rele(mp);
727 	if (!error) {
728 		mutex_enter(&fs_klist_lock);
729 		KNOTE(&fs_klist, NOTE_SUBMIT | VQ_UNMOUNT);
730 		mutex_exit(&fs_klist_lock);
731 	}
732 	return error;
733 }
734 
735 /*
736  * Sync each mounted filesystem.
737  */
738 #ifdef DEBUG
739 int syncprt = 0;
740 struct ctldebug debug0 = { "syncprt", &syncprt };
741 #endif
742 
743 void
744 do_sys_sync(struct lwp *l)
745 {
746 	mount_iterator_t *iter;
747 	struct mount *mp;
748 	int asyncflag;
749 
750 	mountlist_iterator_init(&iter);
751 	while ((mp = mountlist_iterator_next(iter)) != NULL) {
752 		mutex_enter(mp->mnt_updating);
753 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
754 			/*
755 			 * Temporarily clear the MNT_ASYNC flags so that
756 			 * bwrite() doesnt convert the sync writes to
757 			 * delayed writes.
758 			 */
759 			asyncflag = mp->mnt_flag & MNT_ASYNC;
760 			mp->mnt_flag &= ~MNT_ASYNC;
761 			VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
762 			mp->mnt_flag |= asyncflag;
763 		}
764 		mutex_exit(mp->mnt_updating);
765 	}
766 	mountlist_iterator_destroy(iter);
767 #ifdef DEBUG
768 	if (syncprt)
769 		vfs_bufstats();
770 #endif /* DEBUG */
771 }
772 
773 static bool
774 sync_vnode_filter(void *cookie, vnode_t *vp)
775 {
776 
777 	if (vp->v_numoutput > 0) {
778 		++*(int *)cookie;
779 	}
780 	return false;
781 }
782 
783 int
784 vfs_syncwait(void)
785 {
786 	int nbusy, nbusy_prev, iter;
787 	struct vnode_iterator *vniter;
788 	mount_iterator_t *mpiter;
789 	struct mount *mp;
790 
791 	for (nbusy_prev = 0, iter = 0; iter < 20;) {
792 		nbusy = 0;
793 		mountlist_iterator_init(&mpiter);
794 		while ((mp = mountlist_iterator_next(mpiter)) != NULL) {
795 			vnode_t *vp __diagused;
796 			vfs_vnode_iterator_init(mp, &vniter);
797 			vp = vfs_vnode_iterator_next(vniter,
798 			    sync_vnode_filter, &nbusy);
799 			KASSERT(vp == NULL);
800 			vfs_vnode_iterator_destroy(vniter);
801 		}
802 		mountlist_iterator_destroy(mpiter);
803 
804 		if (nbusy == 0)
805 			break;
806 		if (nbusy_prev == 0)
807 			nbusy_prev = nbusy;
808 		printf("%d ", nbusy);
809 		kpause("syncwait", false, MAX(1, hz / 25 * iter), NULL);
810 		if (nbusy >= nbusy_prev) /* we didn't flush anything */
811 			iter++;
812 		else
813 			nbusy_prev = nbusy;
814 	}
815 
816 	if (nbusy) {
817 #if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
818 		printf("giving up\nPrinting vnodes for busy buffers\n");
819 		mountlist_iterator_init(&mpiter);
820 		while ((mp = mountlist_iterator_next(mpiter)) != NULL) {
821 			vnode_t *vp;
822 			vfs_vnode_iterator_init(mp, &vniter);
823 			vp = vfs_vnode_iterator_next(vniter,
824 			    NULL, NULL);
825 			mutex_enter(vp->v_interlock);
826 			if (vp->v_numoutput > 0)
827 				vprint(NULL, vp);
828 			mutex_exit(vp->v_interlock);
829 			vrele(vp);
830 			vfs_vnode_iterator_destroy(vniter);
831 		}
832 		mountlist_iterator_destroy(mpiter);
833 #endif
834 	}
835 
836 	return nbusy;
837 }
838 
839 /* ARGSUSED */
840 int
841 sys_sync(struct lwp *l, const void *v, register_t *retval)
842 {
843 
844 	do_sys_sync(l);
845 	return (0);
846 }
847 
848 /*
849  * Access or change filesystem quotas.
850  *
851  * (this is really 14 different calls bundled into one)
852  */
853 
854 static int
855 do_sys_quotactl_stat(struct mount *mp, struct quotastat *info_u)
856 {
857 	struct quotastat info_k;
858 	int error;
859 
860 	/* ensure any padding bytes are cleared */
861 	memset(&info_k, 0, sizeof(info_k));
862 
863 	error = vfs_quotactl_stat(mp, &info_k);
864 	if (error) {
865 		return error;
866 	}
867 
868 	return copyout(&info_k, info_u, sizeof(info_k));
869 }
870 
871 static int
872 do_sys_quotactl_idtypestat(struct mount *mp, int idtype,
873     struct quotaidtypestat *info_u)
874 {
875 	struct quotaidtypestat info_k;
876 	int error;
877 
878 	/* ensure any padding bytes are cleared */
879 	memset(&info_k, 0, sizeof(info_k));
880 
881 	error = vfs_quotactl_idtypestat(mp, idtype, &info_k);
882 	if (error) {
883 		return error;
884 	}
885 
886 	return copyout(&info_k, info_u, sizeof(info_k));
887 }
888 
889 static int
890 do_sys_quotactl_objtypestat(struct mount *mp, int objtype,
891     struct quotaobjtypestat *info_u)
892 {
893 	struct quotaobjtypestat info_k;
894 	int error;
895 
896 	/* ensure any padding bytes are cleared */
897 	memset(&info_k, 0, sizeof(info_k));
898 
899 	error = vfs_quotactl_objtypestat(mp, objtype, &info_k);
900 	if (error) {
901 		return error;
902 	}
903 
904 	return copyout(&info_k, info_u, sizeof(info_k));
905 }
906 
907 static int
908 do_sys_quotactl_get(struct mount *mp, const struct quotakey *key_u,
909     struct quotaval *val_u)
910 {
911 	struct quotakey key_k;
912 	struct quotaval val_k;
913 	int error;
914 
915 	/* ensure any padding bytes are cleared */
916 	memset(&val_k, 0, sizeof(val_k));
917 
918 	error = copyin(key_u, &key_k, sizeof(key_k));
919 	if (error) {
920 		return error;
921 	}
922 
923 	error = vfs_quotactl_get(mp, &key_k, &val_k);
924 	if (error) {
925 		return error;
926 	}
927 
928 	return copyout(&val_k, val_u, sizeof(val_k));
929 }
930 
931 static int
932 do_sys_quotactl_put(struct mount *mp, const struct quotakey *key_u,
933     const struct quotaval *val_u)
934 {
935 	struct quotakey key_k;
936 	struct quotaval val_k;
937 	int error;
938 
939 	error = copyin(key_u, &key_k, sizeof(key_k));
940 	if (error) {
941 		return error;
942 	}
943 
944 	error = copyin(val_u, &val_k, sizeof(val_k));
945 	if (error) {
946 		return error;
947 	}
948 
949 	return vfs_quotactl_put(mp, &key_k, &val_k);
950 }
951 
952 static int
953 do_sys_quotactl_del(struct mount *mp, const struct quotakey *key_u)
954 {
955 	struct quotakey key_k;
956 	int error;
957 
958 	error = copyin(key_u, &key_k, sizeof(key_k));
959 	if (error) {
960 		return error;
961 	}
962 
963 	return vfs_quotactl_del(mp, &key_k);
964 }
965 
966 static int
967 do_sys_quotactl_cursoropen(struct mount *mp, struct quotakcursor *cursor_u)
968 {
969 	struct quotakcursor cursor_k;
970 	int error;
971 
972 	/* ensure any padding bytes are cleared */
973 	memset(&cursor_k, 0, sizeof(cursor_k));
974 
975 	error = vfs_quotactl_cursoropen(mp, &cursor_k);
976 	if (error) {
977 		return error;
978 	}
979 
980 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
981 }
982 
983 static int
984 do_sys_quotactl_cursorclose(struct mount *mp, struct quotakcursor *cursor_u)
985 {
986 	struct quotakcursor cursor_k;
987 	int error;
988 
989 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
990 	if (error) {
991 		return error;
992 	}
993 
994 	return vfs_quotactl_cursorclose(mp, &cursor_k);
995 }
996 
997 static int
998 do_sys_quotactl_cursorskipidtype(struct mount *mp,
999     struct quotakcursor *cursor_u, int idtype)
1000 {
1001 	struct quotakcursor cursor_k;
1002 	int error;
1003 
1004 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1005 	if (error) {
1006 		return error;
1007 	}
1008 
1009 	error = vfs_quotactl_cursorskipidtype(mp, &cursor_k, idtype);
1010 	if (error) {
1011 		return error;
1012 	}
1013 
1014 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1015 }
1016 
1017 static int
1018 do_sys_quotactl_cursorget(struct mount *mp, struct quotakcursor *cursor_u,
1019     struct quotakey *keys_u, struct quotaval *vals_u, unsigned maxnum,
1020     unsigned *ret_u)
1021 {
1022 #define CGET_STACK_MAX 8
1023 	struct quotakcursor cursor_k;
1024 	struct quotakey stackkeys[CGET_STACK_MAX];
1025 	struct quotaval stackvals[CGET_STACK_MAX];
1026 	struct quotakey *keys_k;
1027 	struct quotaval *vals_k;
1028 	unsigned ret_k;
1029 	int error;
1030 
1031 	if (maxnum > 128) {
1032 		maxnum = 128;
1033 	}
1034 
1035 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1036 	if (error) {
1037 		return error;
1038 	}
1039 
1040 	if (maxnum <= CGET_STACK_MAX) {
1041 		keys_k = stackkeys;
1042 		vals_k = stackvals;
1043 		/* ensure any padding bytes are cleared */
1044 		memset(keys_k, 0, maxnum * sizeof(keys_k[0]));
1045 		memset(vals_k, 0, maxnum * sizeof(vals_k[0]));
1046 	} else {
1047 		keys_k = kmem_zalloc(maxnum * sizeof(keys_k[0]), KM_SLEEP);
1048 		vals_k = kmem_zalloc(maxnum * sizeof(vals_k[0]), KM_SLEEP);
1049 	}
1050 
1051 	error = vfs_quotactl_cursorget(mp, &cursor_k, keys_k, vals_k, maxnum,
1052 	    &ret_k);
1053 	if (error) {
1054 		goto fail;
1055 	}
1056 
1057 	error = copyout(keys_k, keys_u, ret_k * sizeof(keys_k[0]));
1058 	if (error) {
1059 		goto fail;
1060 	}
1061 
1062 	error = copyout(vals_k, vals_u, ret_k * sizeof(vals_k[0]));
1063 	if (error) {
1064 		goto fail;
1065 	}
1066 
1067 	error = copyout(&ret_k, ret_u, sizeof(ret_k));
1068 	if (error) {
1069 		goto fail;
1070 	}
1071 
1072 	/* do last to maximize the chance of being able to recover a failure */
1073 	error = copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1074 
1075 fail:
1076 	if (keys_k != stackkeys) {
1077 		kmem_free(keys_k, maxnum * sizeof(keys_k[0]));
1078 	}
1079 	if (vals_k != stackvals) {
1080 		kmem_free(vals_k, maxnum * sizeof(vals_k[0]));
1081 	}
1082 	return error;
1083 }
1084 
1085 static int
1086 do_sys_quotactl_cursoratend(struct mount *mp, struct quotakcursor *cursor_u,
1087     int *ret_u)
1088 {
1089 	struct quotakcursor cursor_k;
1090 	int ret_k;
1091 	int error;
1092 
1093 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1094 	if (error) {
1095 		return error;
1096 	}
1097 
1098 	error = vfs_quotactl_cursoratend(mp, &cursor_k, &ret_k);
1099 	if (error) {
1100 		return error;
1101 	}
1102 
1103 	error = copyout(&ret_k, ret_u, sizeof(ret_k));
1104 	if (error) {
1105 		return error;
1106 	}
1107 
1108 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1109 }
1110 
1111 static int
1112 do_sys_quotactl_cursorrewind(struct mount *mp, struct quotakcursor *cursor_u)
1113 {
1114 	struct quotakcursor cursor_k;
1115 	int error;
1116 
1117 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1118 	if (error) {
1119 		return error;
1120 	}
1121 
1122 	error = vfs_quotactl_cursorrewind(mp, &cursor_k);
1123 	if (error) {
1124 		return error;
1125 	}
1126 
1127 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1128 }
1129 
1130 static int
1131 do_sys_quotactl_quotaon(struct mount *mp, int idtype, const char *path_u)
1132 {
1133 	char *path_k;
1134 	int error;
1135 
1136 	/* XXX this should probably be a struct pathbuf */
1137 	path_k = PNBUF_GET();
1138 	error = copyin(path_u, path_k, PATH_MAX);
1139 	if (error) {
1140 		PNBUF_PUT(path_k);
1141 		return error;
1142 	}
1143 
1144 	error = vfs_quotactl_quotaon(mp, idtype, path_k);
1145 
1146 	PNBUF_PUT(path_k);
1147 	return error;
1148 }
1149 
/*
 * QUOTACTL_QUOTAOFF back end: forwards straight to
 * vfs_quotactl_quotaoff(); there are no user pointers to copy.
 */
static int
do_sys_quotactl_quotaoff(struct mount *mp, int idtype)
{
	int error;

	error = vfs_quotactl_quotaoff(mp, idtype);
	return error;
}
1156 
1157 int
1158 do_sys_quotactl(const char *path_u, const struct quotactl_args *args)
1159 {
1160 	struct mount *mp;
1161 	struct vnode *vp;
1162 	int error;
1163 
1164 	error = namei_simple_user(path_u, NSM_FOLLOW_TRYEMULROOT, &vp);
1165 	if (error != 0)
1166 		return (error);
1167 	mp = vp->v_mount;
1168 
1169 	switch (args->qc_op) {
1170 	case QUOTACTL_STAT:
1171 		error = do_sys_quotactl_stat(mp, args->u.stat.qc_info);
1172 		break;
1173 	case QUOTACTL_IDTYPESTAT:
1174 		error = do_sys_quotactl_idtypestat(mp,
1175 		    args->u.idtypestat.qc_idtype,
1176 		    args->u.idtypestat.qc_info);
1177 		break;
1178 	case QUOTACTL_OBJTYPESTAT:
1179 		error = do_sys_quotactl_objtypestat(mp,
1180 		    args->u.objtypestat.qc_objtype,
1181 		    args->u.objtypestat.qc_info);
1182 		break;
1183 	case QUOTACTL_GET:
1184 		error = do_sys_quotactl_get(mp,
1185 		    args->u.get.qc_key,
1186 		    args->u.get.qc_val);
1187 		break;
1188 	case QUOTACTL_PUT:
1189 		error = do_sys_quotactl_put(mp,
1190 		    args->u.put.qc_key,
1191 		    args->u.put.qc_val);
1192 		break;
1193 	case QUOTACTL_DEL:
1194 		error = do_sys_quotactl_del(mp, args->u.del.qc_key);
1195 		break;
1196 	case QUOTACTL_CURSOROPEN:
1197 		error = do_sys_quotactl_cursoropen(mp,
1198 		    args->u.cursoropen.qc_cursor);
1199 		break;
1200 	case QUOTACTL_CURSORCLOSE:
1201 		error = do_sys_quotactl_cursorclose(mp,
1202 		    args->u.cursorclose.qc_cursor);
1203 		break;
1204 	case QUOTACTL_CURSORSKIPIDTYPE:
1205 		error = do_sys_quotactl_cursorskipidtype(mp,
1206 		    args->u.cursorskipidtype.qc_cursor,
1207 		    args->u.cursorskipidtype.qc_idtype);
1208 		break;
1209 	case QUOTACTL_CURSORGET:
1210 		error = do_sys_quotactl_cursorget(mp,
1211 		    args->u.cursorget.qc_cursor,
1212 		    args->u.cursorget.qc_keys,
1213 		    args->u.cursorget.qc_vals,
1214 		    args->u.cursorget.qc_maxnum,
1215 		    args->u.cursorget.qc_ret);
1216 		break;
1217 	case QUOTACTL_CURSORATEND:
1218 		error = do_sys_quotactl_cursoratend(mp,
1219 		    args->u.cursoratend.qc_cursor,
1220 		    args->u.cursoratend.qc_ret);
1221 		break;
1222 	case QUOTACTL_CURSORREWIND:
1223 		error = do_sys_quotactl_cursorrewind(mp,
1224 		    args->u.cursorrewind.qc_cursor);
1225 		break;
1226 	case QUOTACTL_QUOTAON:
1227 		error = do_sys_quotactl_quotaon(mp,
1228 		    args->u.quotaon.qc_idtype,
1229 		    args->u.quotaon.qc_quotafile);
1230 		break;
1231 	case QUOTACTL_QUOTAOFF:
1232 		error = do_sys_quotactl_quotaoff(mp,
1233 		    args->u.quotaoff.qc_idtype);
1234 		break;
1235 	default:
1236 		error = EINVAL;
1237 		break;
1238 	}
1239 
1240 	vrele(vp);
1241 	return error;
1242 }
1243 
1244 /* ARGSUSED */
1245 int
1246 sys___quotactl(struct lwp *l, const struct sys___quotactl_args *uap,
1247     register_t *retval)
1248 {
1249 	/* {
1250 		syscallarg(const char *) path;
1251 		syscallarg(struct quotactl_args *) args;
1252 	} */
1253 	struct quotactl_args args;
1254 	int error;
1255 
1256 	error = copyin(SCARG(uap, args), &args, sizeof(args));
1257 	if (error) {
1258 		return error;
1259 	}
1260 
1261 	return do_sys_quotactl(SCARG(uap, path), &args);
1262 }
1263 
/*
 * Fill *sp with statistics for mount mp on behalf of lwp l.
 *
 * Depending on flags the data comes either from the cached copy in
 * mp->mnt_stat or from a fresh VFS_STATVFS() call.  If the calling
 * process has a root directory set (cwdi_rdir != NULL), f_mntonname is
 * rewritten relative to that root; a mount point that is not below the
 * root is reported as "/" when `root' is non-zero and rejected with
 * EPERM otherwise.
 *
 * l must be curlwp (asserted).  Returns 0 on success, the error from
 * VFS_STATVFS() or getcwd_common(), or EPERM as described above.
 * sp->f_flag is set from mp->mnt_flag on every path that reaches the
 * end of the function, including the EPERM one.
 */
int
dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
    int root)
{
	struct cwdinfo *cwdi = l->l_proc->p_cwdi;
	bool chrooted;
	int error = 0;

	KASSERT(l == curlwp);

	/*
	 * This is safe unlocked.  cwdi_rdir never goes non-NULL -> NULL,
	 * since it would imply chroots can be escaped.  Just make sure this
	 * routine is self-consistent.
	 */
	chrooted = (atomic_load_relaxed(&cwdi->cwdi_rdir) != NULL);

	/*
	 * If MNT_NOWAIT or MNT_LAZY is specified, do not
	 * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
	 * overrides MNT_NOWAIT.
	 *
	 * NOTE(review): only flags == MNT_WAIT and flags == 0 take the
	 * refresh branch below; the first two comparisons look subsumed
	 * by the third provided MNT_NOWAIT and MNT_LAZY differ from
	 * MNT_WAIT and 0 -- confirm against <sys/mount.h>.
	 */
	if (flags == MNT_NOWAIT	|| flags == MNT_LAZY ||
	    (flags != MNT_WAIT && flags != 0)) {
		/* Serve the caller from the cached statistics. */
		memcpy(sp, &mp->mnt_stat, sizeof(*sp));
	} else {
		/* Get the filesystem stats now */
		memset(sp, 0, sizeof(*sp));
		if ((error = VFS_STATVFS(mp, sp)) != 0)
			return error;
		/* The cache is only refreshed for non-chrooted callers. */
		if (!chrooted)
			(void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
	}

	if (chrooted) {
		size_t len;
		char *bp;
		char c;
		char *path = PNBUF_GET();

		/*
		 * getcwd_common() is handed bp pointing at a terminating
		 * NUL at the end of the buffer; on return bp is used as
		 * the start of the root directory's path.
		 */
		bp = path + MAXPATHLEN;
		*--bp = '\0';
		rw_enter(&cwdi->cwdi_lock, RW_READER);
		error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
		    MAXPATHLEN / 2, 0, l);
		rw_exit(&cwdi->cwdi_lock);
		if (error) {
			PNBUF_PUT(path);
			return error;
		}
		len = strlen(bp);
		/* A one-character root path needs no rewriting. */
		if (len != 1) {
			/*
			 * for mount points that are below our root, we can see
			 * them, so we fix up the pathname and return them. The
			 * rest we cannot see, so we don't allow viewing the
			 * data.
			 */
			if (strncmp(bp, sp->f_mntonname, len) == 0 &&
			    ((c = sp->f_mntonname[len]) == '/' || c == '\0')) {
				/*
				 * Strip the root prefix; the root itself
				 * becomes "/".
				 */
				(void)strlcpy(sp->f_mntonname,
				    c == '\0' ? "/" : &sp->f_mntonname[len],
				    sizeof(sp->f_mntonname));
			} else {
				if (root)
					(void)strlcpy(sp->f_mntonname, "/",
					    sizeof(sp->f_mntonname));
				else
					error = EPERM;
			}
		}
		PNBUF_PUT(path);
	}
	/* Report only the flag bits selected by MNT_VISFLAGMASK. */
	sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
	return error;
}
1340 
1341 /*
1342  * Get filesystem statistics by path.
1343  */
1344 int
1345 do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
1346 {
1347 	struct mount *mp;
1348 	int error;
1349 	struct vnode *vp;
1350 
1351 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
1352 	if (error != 0)
1353 		return error;
1354 	mp = vp->v_mount;
1355 	error = dostatvfs(mp, sb, l, flags, 1);
1356 	vrele(vp);
1357 	return error;
1358 }
1359 
1360 /* ARGSUSED */
1361 int
1362 sys___statvfs190(struct lwp *l, const struct sys___statvfs190_args *uap,
1363     register_t *retval)
1364 {
1365 	/* {
1366 		syscallarg(const char *) path;
1367 		syscallarg(struct statvfs *) buf;
1368 		syscallarg(int) flags;
1369 	} */
1370 	struct statvfs *sb;
1371 	int error;
1372 
1373 	sb = STATVFSBUF_GET();
1374 	error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
1375 	if (error == 0)
1376 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1377 	STATVFSBUF_PUT(sb);
1378 	return error;
1379 }
1380 
1381 /*
1382  * Get filesystem statistics by fd.
1383  */
1384 int
1385 do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
1386 {
1387 	file_t *fp;
1388 	struct mount *mp;
1389 	int error;
1390 
1391 	/* fd_getvnode() will use the descriptor for us */
1392 	if ((error = fd_getvnode(fd, &fp)) != 0)
1393 		return (error);
1394 	mp = fp->f_vnode->v_mount;
1395 	error = dostatvfs(mp, sb, curlwp, flags, 1);
1396 	fd_putfile(fd);
1397 	return error;
1398 }
1399 
1400 /* ARGSUSED */
1401 int
1402 sys___fstatvfs190(struct lwp *l, const struct sys___fstatvfs190_args *uap,
1403     register_t *retval)
1404 {
1405 	/* {
1406 		syscallarg(int) fd;
1407 		syscallarg(struct statvfs *) buf;
1408 		syscallarg(int) flags;
1409 	} */
1410 	struct statvfs *sb;
1411 	int error;
1412 
1413 	sb = STATVFSBUF_GET();
1414 	error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
1415 	if (error == 0)
1416 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1417 	STATVFSBUF_PUT(sb);
1418 	return error;
1419 }
1420 
1421 /*
1422  * Get statistics on all filesystems.
1423  */
1424 int
1425 do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
1426     int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
1427     register_t *retval)
1428 {
1429 	int root = 0;
1430 	mount_iterator_t *iter;
1431 	struct proc *p = l->l_proc;
1432 	struct mount *mp;
1433 	struct statvfs *sb;
1434 	size_t count, maxcount;
1435 	int error = 0;
1436 
1437 	sb = STATVFSBUF_GET();
1438 	maxcount = bufsize / entry_sz;
1439 	count = 0;
1440 	mountlist_iterator_init(&iter);
1441 	while ((mp = mountlist_iterator_next(iter)) != NULL) {
1442 		if (sfsp && count < maxcount) {
1443 			error = dostatvfs(mp, sb, l, flags, 0);
1444 			if (error) {
1445 				error = 0;
1446 				continue;
1447 			}
1448 			error = copyfn(sb, sfsp, entry_sz);
1449 			if (error)
1450 				goto out;
1451 			sfsp = (char *)sfsp + entry_sz;
1452 			root |= strcmp(sb->f_mntonname, "/") == 0;
1453 		}
1454 		count++;
1455 	}
1456 
1457 	if (root == 0 && p->p_cwdi->cwdi_rdir) {
1458 		/*
1459 		 * fake a root entry
1460 		 */
1461 		error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
1462 		    sb, l, flags, 1);
1463 		if (error != 0)
1464 			goto out;
1465 		if (sfsp) {
1466 			error = copyfn(sb, sfsp, entry_sz);
1467 			if (error != 0)
1468 				goto out;
1469 		}
1470 		count++;
1471 	}
1472 	if (sfsp && count > maxcount)
1473 		*retval = maxcount;
1474 	else
1475 		*retval = count;
1476 out:
1477 	mountlist_iterator_destroy(iter);
1478 	STATVFSBUF_PUT(sb);
1479 	return error;
1480 }
1481 
1482 int
1483 sys___getvfsstat90(struct lwp *l, const struct sys___getvfsstat90_args *uap,
1484     register_t *retval)
1485 {
1486 	/* {
1487 		syscallarg(struct statvfs *) buf;
1488 		syscallarg(size_t) bufsize;
1489 		syscallarg(int) flags;
1490 	} */
1491 
1492 	return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
1493 	    SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
1494 }
1495 
/*
 * Change current working directory to a given file descriptor.
 *
 * The vnode behind fd must be a directory the caller may search
 * (VOP_ACCESS with VEXEC).  If file systems are mounted on it, the
 * root of the topmost mounted file system is used instead.  When the
 * process has a root directory set, the target must pass vn_isunder()
 * or the call fails with EPERM.
 *
 * retval is not used.  Returns 0 on success or an errno value.
 */
int
do_sys_fchdir(struct lwp *l, int fd, register_t *retval)
{
	struct proc *p = l->l_proc;
	struct cwdinfo *cwdi;
	struct vnode *vp, *tdp;
	struct mount *mp;
	file_t *fp;
	int error;

	/* fd_getvnode() will use the descriptor for us */
	if ((error = fd_getvnode(fd, &fp)) != 0)
		return error;
	vp = fp->f_vnode;

	/*
	 * Take our own reference: on success it is the one stored in
	 * cwdi_cdir below.
	 */
	vref(vp);
	vn_lock(vp, LK_SHARED | LK_RETRY);
	if (vp->v_type != VDIR)
		error = ENOTDIR;
	else
		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
	if (error) {
		vput(vp);
		goto out;
	}
	/*
	 * Step onto the root of whatever is mounted here, repeating
	 * while that root is itself a mount point.  The covered vnode
	 * is let go with vput() before the mounted root is fetched.
	 */
	while ((mp = vp->v_mountedhere) != NULL) {
		error = vfs_busy(mp);
		vput(vp);
		if (error != 0)
			goto out;
		error = VFS_ROOT(mp, LK_SHARED, &tdp);
		vfs_unbusy(mp);
		if (error)
			goto out;
		vp = tdp;
	}
	VOP_UNLOCK(vp);

	/*
	 * Disallow changing to a directory not under the process's
	 * current root directory (if there is one).
	 */
	cwdi = p->p_cwdi;
	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
	if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
		vrele(vp);
		error = EPERM;	/* operation not permitted */
	} else {
		/* Swap in the new directory, releasing the old one. */
		vrele(cwdi->cwdi_cdir);
		cwdi->cwdi_cdir = vp;
	}
	rw_exit(&cwdi->cwdi_lock);

out:
	fd_putfile(fd);
	return error;
}
1556 
1557 /*
1558  * Change current working directory to a given file descriptor.
1559  */
1560 /* ARGSUSED */
1561 int
1562 sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap,
1563     register_t *retval)
1564 {
1565 	/* {
1566 		syscallarg(int) fd;
1567 	} */
1568 
1569 	return do_sys_fchdir(l, SCARG(uap, fd), retval);
1570 }
1571 
1572 /*
1573  * Change this process's notion of the root directory to a given file
1574  * descriptor.
1575  */
1576 int
1577 sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap,
1578     register_t *retval)
1579 {
1580 	struct vnode	*vp;
1581 	file_t	*fp;
1582 	int		 error, fd = SCARG(uap, fd);
1583 
1584 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1585 		    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
1586 		return error;
1587 	/* fd_getvnode() will use the descriptor for us */
1588 	if ((error = fd_getvnode(fd, &fp)) != 0)
1589 		return error;
1590 	vp = fp->f_vnode;
1591 	vn_lock(vp, LK_SHARED | LK_RETRY);
1592 	if (vp->v_type != VDIR)
1593 		error = ENOTDIR;
1594 	else
1595 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1596 	VOP_UNLOCK(vp);
1597 	if (error)
1598 		goto out;
1599 	vref(vp);
1600 	change_root(vp);
1601 
1602 out:
1603 	fd_putfile(fd);
1604 	return (error);
1605 }
1606 
1607 /*
1608  * Change current working directory (``.'').
1609  */
1610 int
1611 do_sys_chdir(struct lwp *l, const char *path, enum uio_seg seg,
1612     register_t *retval)
1613 {
1614 	struct proc *p = l->l_proc;
1615 	struct cwdinfo * cwdi;
1616 	int error;
1617 	struct vnode *vp;
1618 
1619 	if ((error = chdir_lookup(path, seg, &vp, l)) != 0)
1620 		return error;
1621 	cwdi = p->p_cwdi;
1622 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1623 	vrele(cwdi->cwdi_cdir);
1624 	cwdi->cwdi_cdir = vp;
1625 	rw_exit(&cwdi->cwdi_lock);
1626 	return 0;
1627 }
1628 
1629 /*
1630  * Change current working directory (``.'').
1631  */
1632 /* ARGSUSED */
1633 int
1634 sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
1635 {
1636 	/* {
1637 		syscallarg(const char *) path;
1638 	} */
1639 
1640 	return do_sys_chdir(l, SCARG(uap, path), UIO_USERSPACE, retval);
1641 }
1642 
1643 /*
1644  * Change notion of root (``/'') directory.
1645  */
1646 /* ARGSUSED */
1647 int
1648 sys_chroot(struct lwp *l, const struct sys_chroot_args *uap,
1649     register_t *retval)
1650 {
1651 	/* {
1652 		syscallarg(const char *) path;
1653 	} */
1654 	int error;
1655 	struct vnode *vp;
1656 
1657 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1658 		    KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
1659 		return (error);
1660 
1661 	error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE, &vp, l);
1662 	if (error == 0)
1663 		change_root(vp);
1664 	return error;
1665 }
1666 
/*
 * Common routine for chroot and fchroot.
 * NB: callers need to properly authorize the change root operation.
 *
 * Installs vp as the root directory of the current process, storing the
 * caller's vnode pointer in cwdi_rdir and releasing any previous root.
 * If the current working directory fails the vn_isunder() check against
 * the new root, it is replaced by the new root as well.  Afterwards a
 * fresh credential is cloned from p_cred, passed to kauth_proc_chroot()
 * and installed via proc_crmod_leave().
 */
void
change_root(struct vnode *vp)
{
	kauth_cred_t ncred;
	struct lwp *l = curlwp;
	struct proc *p = l->l_proc;
	struct cwdinfo *cwdi = p->p_cwdi;

	/* Allocated up front, before any lock is taken. */
	ncred = kauth_cred_alloc();

	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
	if (cwdi->cwdi_rdir != NULL)
		vrele(cwdi->cwdi_rdir);
	cwdi->cwdi_rdir = vp;

	/*
	 * Guard against the escape where the new root is placed below
	 * the working directory, leaving the working directory outside
	 * the chroot.  Silently chdir to / if we aren't already there.
	 */
	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
		/*
		 * XXX would be more failsafe to change directory to a
		 * deadfs node here instead
		 */
		vrele(cwdi->cwdi_cdir);
		/* cwdi_cdir gets its own reference to the new root. */
		vref(vp);
		cwdi->cwdi_cdir = vp;
	}
	rw_exit(&cwdi->cwdi_lock);

	/* Get a write lock on the process credential. */
	proc_crmod_enter();

	kauth_cred_clone(p->p_cred, ncred);
	kauth_proc_chroot(ncred, p->p_cwdi);

	/* Broadcast our credentials to the process and other LWPs. */
	proc_crmod_leave(ncred, p->p_cred, true);
}
1711 
1712 /*
1713  * Common routine for chroot and chdir.
1714  * XXX "where" should be enum uio_seg
1715  */
1716 int
1717 chdir_lookup(const char *path, int where, struct vnode **vpp, struct lwp *l)
1718 {
1719 	struct pathbuf *pb;
1720 	struct nameidata nd;
1721 	int error;
1722 
1723 	error = pathbuf_maybe_copyin(path, where, &pb);
1724 	if (error) {
1725 		return error;
1726 	}
1727 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb);
1728 	if ((error = namei(&nd)) != 0) {
1729 		pathbuf_destroy(pb);
1730 		return error;
1731 	}
1732 	*vpp = nd.ni_vp;
1733 	pathbuf_destroy(pb);
1734 
1735 	if ((*vpp)->v_type != VDIR)
1736 		error = ENOTDIR;
1737 	else
1738 		error = VOP_ACCESS(*vpp, VEXEC, l->l_cred);
1739 
1740 	if (error)
1741 		vput(*vpp);
1742 	else
1743 		VOP_UNLOCK(*vpp);
1744 	return (error);
1745 }
1746 
/*
 * Internals of sys_open - path has already been converted into a pathbuf
 * (so we can easily reuse this function from other parts of the kernel,
 * like posix_spawn post-processing).
 *
 * l          - calling lwp
 * dvp        - passed through to vn_open(); fd_open() in this file
 *              passes NULL
 * pb         - path to open
 * open_flags - O_* flags as given by the caller
 * open_mode  - creation mode, masked with cwdi_cmask below
 * fd         - out: the new descriptor number on success
 *
 * Returns 0 on success; EINVAL for a bad access-mode combination;
 * otherwise the error from fd_allocfile(), vn_open(), fd_dupopen() or
 * open_setfp().
 */
int
do_open(lwp_t *l, struct vnode *dvp, struct pathbuf *pb, int open_flags,
    int open_mode, int *fd)
{
	struct proc *p = l->l_proc;
	struct cwdinfo *cwdi = p->p_cwdi;
	file_t *fp;
	struct vnode *vp;
	int dupfd;
	bool dupfd_move;
	int flags, cmode;
	int indx, error;

	/* O_SEARCH is stripped from the flags before they are used. */
	if (open_flags & O_SEARCH) {
		open_flags &= ~(int)O_SEARCH;
	}

	/*
	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
	 * may be specified.
	 */
	if ((open_flags & O_EXEC) && (open_flags & O_ACCMODE))
		return EINVAL;

	/* Convert to kernel F* flags; read or write must be set. */
	flags = FFLAGS(open_flags);
	if ((flags & (FREAD | FWRITE)) == 0)
		return EINVAL;

	if ((error = fd_allocfile(&fp, &indx)) != 0) {
		return error;
	}

	/* We're going to read cwdi->cwdi_cmask unlocked here. */
	cmode = ((open_mode &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;

	error = vn_open(dvp, pb, TRYEMULROOT, flags, cmode,
	    &vp, &dupfd_move, &dupfd);
	if (error != 0) {
		fd_abort(p, fp, indx);
		return error;
	}

	if (vp == NULL) {
		/*
		 * vn_open() succeeded without a vnode: give back the
		 * file we allocated and let fd_dupopen() produce the
		 * descriptor from dupfd instead.
		 */
		fd_abort(p, fp, indx);
		error = fd_dupopen(dupfd, dupfd_move, flags, &indx);
		if (error)
			return error;
		*fd = indx;
	} else {
		/*
		 * NOTE(review): no fd_abort()/vnode cleanup here on
		 * failure; presumably open_setfp() cleans up after
		 * itself -- confirm against its definition.
		 */
		error = open_setfp(l, fp, vp, indx, flags);
		if (error)
			return error;
		VOP_UNLOCK(vp);
		*fd = indx;
		fd_affix(p, fp, indx);
	}

	return 0;
}
1811 
1812 int
1813 fd_open(const char *path, int open_flags, int open_mode, int *fd)
1814 {
1815 	struct pathbuf *pb;
1816 	int error, oflags;
1817 
1818 	oflags = FFLAGS(open_flags);
1819 	if ((oflags & (FREAD | FWRITE)) == 0)
1820 		return EINVAL;
1821 
1822 	pb = pathbuf_create(path);
1823 	if (pb == NULL)
1824 		return ENOMEM;
1825 
1826 	error = do_open(curlwp, NULL, pb, open_flags, open_mode, fd);
1827 	pathbuf_destroy(pb);
1828 
1829 	return error;
1830 }
1831 
1832 static int
1833 do_sys_openat(lwp_t *l, int fdat, const char *path, int flags,
1834     int mode, int *fd)
1835 {
1836 	file_t *dfp = NULL;
1837 	struct vnode *dvp = NULL;
1838 	struct pathbuf *pb;
1839 	const char *pathstring = NULL;
1840 	int error;
1841 
1842 	if (path == NULL) {
1843 		MODULE_HOOK_CALL(vfs_openat_10_hook, (&pb), enosys(), error);
1844 		if (error == ENOSYS)
1845 			goto no_compat;
1846 		if (error)
1847 			return error;
1848 	} else {
1849 no_compat:
1850 		error = pathbuf_copyin(path, &pb);
1851 		if (error)
1852 			return error;
1853 	}
1854 
1855 	pathstring = pathbuf_stringcopy_get(pb);
1856 
1857 	/*
1858 	 * fdat is ignored if:
1859 	 * 1) if fdat is AT_FDCWD, which means use current directory as base.
1860 	 * 2) if path is absolute, then fdat is useless.
1861 	 */
1862 	if (fdat != AT_FDCWD && pathstring[0] != '/') {
1863 		/* fd_getvnode() will use the descriptor for us */
1864 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
1865 			goto out;
1866 
1867 		dvp = dfp->f_vnode;
1868 	}
1869 
1870 	error = do_open(l, dvp, pb, flags, mode, fd);
1871 
1872 	if (dfp != NULL)
1873 		fd_putfile(fdat);
1874 out:
1875 	pathbuf_stringcopy_put(pb, pathstring);
1876 	pathbuf_destroy(pb);
1877 	return error;
1878 }
1879 
1880 int
1881 sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1882 {
1883 	/* {
1884 		syscallarg(const char *) path;
1885 		syscallarg(int) flags;
1886 		syscallarg(int) mode;
1887 	} */
1888 	int error;
1889 	int fd;
1890 
1891 	error = do_sys_openat(l, AT_FDCWD, SCARG(uap, path),
1892 			      SCARG(uap, flags), SCARG(uap, mode), &fd);
1893 
1894 	if (error == 0)
1895 		*retval = fd;
1896 
1897 	return error;
1898 }
1899 
1900 int
1901 sys_openat(struct lwp *l, const struct sys_openat_args *uap,
1902     register_t *retval)
1903 {
1904 	/* {
1905 		syscallarg(int) fd;
1906 		syscallarg(const char *) path;
1907 		syscallarg(int) oflags;
1908 		syscallarg(int) mode;
1909 	} */
1910 	int error;
1911 	int fd;
1912 
1913 	error = do_sys_openat(l, SCARG(uap, fd), SCARG(uap, path),
1914 	    SCARG(uap, oflags), SCARG(uap, mode), &fd);
1915 
1916 	if (error == 0)
1917 		*retval = fd;
1918 
1919 	return error;
1920 }
1921 
1922 static void
1923 vfs__fhfree(fhandle_t *fhp)
1924 {
1925 	size_t fhsize;
1926 
1927 	fhsize = FHANDLE_SIZE(fhp);
1928 	kmem_free(fhp, fhsize);
1929 }
1930 
1931 /*
1932  * vfs_composefh: compose a filehandle.
1933  */
1934 
1935 int
1936 vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1937 {
1938 	struct mount *mp;
1939 	struct fid *fidp;
1940 	int error;
1941 	size_t needfhsize;
1942 	size_t fidsize;
1943 
1944 	mp = vp->v_mount;
1945 	fidp = NULL;
1946 	if (*fh_size < FHANDLE_SIZE_MIN) {
1947 		fidsize = 0;
1948 	} else {
1949 		fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1950 		if (fhp != NULL) {
1951 			memset(fhp, 0, *fh_size);
1952 			fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1953 			fidp = &fhp->fh_fid;
1954 		}
1955 	}
1956 	error = VFS_VPTOFH(vp, fidp, &fidsize);
1957 	needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1958 	if (error == 0 && *fh_size < needfhsize) {
1959 		error = E2BIG;
1960 	}
1961 	*fh_size = needfhsize;
1962 	return error;
1963 }
1964 
1965 int
1966 vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1967 {
1968 	struct mount *mp;
1969 	fhandle_t *fhp;
1970 	size_t fhsize;
1971 	size_t fidsize;
1972 	int error;
1973 
1974 	mp = vp->v_mount;
1975 	fidsize = 0;
1976 	error = VFS_VPTOFH(vp, NULL, &fidsize);
1977 	KASSERT(error != 0);
1978 	if (error != E2BIG) {
1979 		goto out;
1980 	}
1981 	fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1982 	fhp = kmem_zalloc(fhsize, KM_SLEEP);
1983 	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1984 	error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1985 	if (error == 0) {
1986 		KASSERT(FHANDLE_SIZE(fhp) == fhsize);
1987 		KASSERT(FHANDLE_FILEID(fhp)->fid_len == fidsize);
1988 		*fhpp = fhp;
1989 	} else {
1990 		kmem_free(fhp, fhsize);
1991 	}
1992 out:
1993 	return error;
1994 }
1995 
1996 void
1997 vfs_composefh_free(fhandle_t *fhp)
1998 {
1999 
2000 	vfs__fhfree(fhp);
2001 }
2002 
2003 /*
2004  * vfs_fhtovp: lookup a vnode by a filehandle.
2005  */
2006 
2007 int
2008 vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
2009 {
2010 	struct mount *mp;
2011 	int error;
2012 
2013 	*vpp = NULL;
2014 	mp = vfs_getvfs(FHANDLE_FSID(fhp));
2015 	if (mp == NULL) {
2016 		error = ESTALE;
2017 		goto out;
2018 	}
2019 	if (mp->mnt_op->vfs_fhtovp == NULL) {
2020 		error = EOPNOTSUPP;
2021 		goto out;
2022 	}
2023 	error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), LK_EXCLUSIVE, vpp);
2024 out:
2025 	return error;
2026 }
2027 
/*
 * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
 * the needed size.
 *
 * ufhp   - user-space address of the handle
 * fhsize - size the caller claims the handle has
 * fhpp   - out: kernel copy, to be released with vfs_copyinfh_free()
 *
 * Returns 0 on success; EINVAL if fhsize is outside
 * [FHANDLE_SIZE_MIN, FHANDLE_SIZE_MAX] or disagrees with the size
 * encoded in the handle; or the error from copyin().
 */

int
vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
{
	fhandle_t *fhp;
	int error;

	if (fhsize > FHANDLE_SIZE_MAX) {
		return EINVAL;
	}
	if (fhsize < FHANDLE_SIZE_MIN) {
		return EINVAL;
	}
again:
	fhp = kmem_alloc(fhsize, KM_SLEEP);
	error = copyin(ufhp, fhp, fhsize);
	if (error == 0) {
		/* XXX this check shouldn't be here */
		if (FHANDLE_SIZE(fhp) == fhsize) {
			*fhpp = fhp;
			return 0;
		} else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
			/*
			 * a kludge for nfsv2 padded handles.
			 *
			 * Retry with the smaller size the handle itself
			 * reports.  The buffer is freed with the size it
			 * was allocated with before fhsize is updated.
			 */
			size_t sz;

			sz = FHANDLE_SIZE(fhp);
			kmem_free(fhp, fhsize);
			fhsize = sz;
			goto again;
		} else {
			/*
			 * userland told us wrong size.
			 */
			error = EINVAL;
		}
	}
	/* Failure: release the buffer from the current attempt. */
	kmem_free(fhp, fhsize);
	return error;
}
2073 
2074 void
2075 vfs_copyinfh_free(fhandle_t *fhp)
2076 {
2077 
2078 	vfs__fhfree(fhp);
2079 }
2080 
2081 /*
2082  * Get file handle system call
2083  */
2084 int
2085 sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap,
2086     register_t *retval)
2087 {
2088 	/* {
2089 		syscallarg(char *) fname;
2090 		syscallarg(fhandle_t *) fhp;
2091 		syscallarg(size_t *) fh_size;
2092 	} */
2093 	struct vnode *vp;
2094 	fhandle_t *fh;
2095 	int error;
2096 	struct pathbuf *pb;
2097 	struct nameidata nd;
2098 	size_t sz;
2099 	size_t usz;
2100 
2101 	/*
2102 	 * Must be super user
2103 	 */
2104 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2105 	    0, NULL, NULL, NULL);
2106 	if (error)
2107 		return (error);
2108 
2109 	error = pathbuf_copyin(SCARG(uap, fname), &pb);
2110 	if (error) {
2111 		return error;
2112 	}
2113 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
2114 	error = namei(&nd);
2115 	if (error) {
2116 		pathbuf_destroy(pb);
2117 		return error;
2118 	}
2119 	vp = nd.ni_vp;
2120 	pathbuf_destroy(pb);
2121 
2122 	error = vfs_composefh_alloc(vp, &fh);
2123 	vput(vp);
2124 	if (error != 0) {
2125 		return error;
2126 	}
2127 	error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
2128 	if (error != 0) {
2129 		goto out;
2130 	}
2131 	sz = FHANDLE_SIZE(fh);
2132 	error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
2133 	if (error != 0) {
2134 		goto out;
2135 	}
2136 	if (usz >= sz) {
2137 		error = copyout(fh, SCARG(uap, fhp), sz);
2138 	} else {
2139 		error = E2BIG;
2140 	}
2141 out:
2142 	vfs_composefh_free(fh);
2143 	return (error);
2144 }
2145 
/*
 * Open a file given a file handle.
 *
 * Check permissions, allocate an open file structure,
 * and call the device open routine if any.
 *
 * l      - calling lwp (credentials come from l->l_cred)
 * ufhp   - user-space address of the file handle
 * fhsize - size of the handle as claimed by the caller
 * oflags - open flags; O_CREAT is rejected with EINVAL
 * retval - out: new descriptor number on success
 *
 * Returns 0 on success or an errno value.  EDUPFD and EMOVEFD coming
 * out of the open path are reported as EOPNOTSUPP.
 */

int
dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
    register_t *retval)
{
	file_t *fp;
	struct vnode *vp = NULL;
	kauth_cred_t cred = l->l_cred;
	file_t *nfp;
	int indx, error;
	struct vattr va;
	fhandle_t *fh;
	int flags;
	proc_t *p;

	p = curproc;

	/*
	 * Must be super user
	 */
	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
		    0, NULL, NULL, NULL)))
		return (error);

	/* O_SEARCH is stripped from the flags before they are used. */
	if (oflags & O_SEARCH) {
		oflags &= ~(int)O_SEARCH;
	}

	flags = FFLAGS(oflags);
	if ((flags & (FREAD | FWRITE)) == 0)
		return (EINVAL);
	if ((flags & O_CREAT))
		return (EINVAL);
	if ((error = fd_allocfile(&nfp, &indx)) != 0)
		return (error);
	fp = nfp;
	/*
	 * From here on, failures go through `bad', which aborts the
	 * descriptor and disposes of vp if one was obtained.
	 */
	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
	if (error != 0) {
		goto bad;
	}
	/* The kernel copy of the handle is only needed for the lookup. */
	error = vfs_fhtovp(fh, &vp);
	vfs_copyinfh_free(fh);
	if (error != 0) {
		goto bad;
	}

	/* Now do an effective vn_open */

	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	error = vn_openchk(vp, cred, flags);
	if (error != 0)
		goto bad;
	if (flags & O_TRUNC) {
		VOP_UNLOCK(vp);			/* XXX */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
		/* Truncate: set only the size attribute, to zero. */
		vattr_null(&va);
		va.va_size = 0;
		error = VOP_SETATTR(vp, &va, cred);
		if (error)
			goto bad;
	}
	if ((error = VOP_OPEN(vp, flags, cred)) != 0)
		goto bad;
	if (flags & FWRITE) {
		/* Account for the new writer under the vnode interlock. */
		mutex_enter(vp->v_interlock);
		vp->v_writecount++;
		mutex_exit(vp->v_interlock);
	}

	/* done with modified vn_open, now finish what sys_open does. */
	/*
	 * NOTE(review): this failure path returns without going through
	 * `bad'; presumably open_setfp() cleans up fp and vp itself --
	 * confirm against its definition.
	 */
	if ((error = open_setfp(l, fp, vp, indx, flags)))
		return error;

	VOP_UNLOCK(vp);
	*retval = indx;
	fd_affix(p, fp, indx);
	return (0);

bad:
	fd_abort(p, fp, indx);
	if (vp != NULL)
		vput(vp);
	if (error == EDUPFD || error == EMOVEFD) {
		/* XXX should probably close curlwp->l_dupfd */
		error = EOPNOTSUPP;
	}
	return (error);
}
2243 
2244 int
2245 sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap,
2246     register_t *retval)
2247 {
2248 	/* {
2249 		syscallarg(const void *) fhp;
2250 		syscallarg(size_t) fh_size;
2251 		syscallarg(int) flags;
2252 	} */
2253 
2254 	return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
2255 	    SCARG(uap, flags), retval);
2256 }
2257 
2258 int
2259 do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
2260 {
2261 	int error;
2262 	fhandle_t *fh;
2263 	struct vnode *vp;
2264 
2265 	/*
2266 	 * Must be super user
2267 	 */
2268 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2269 		    0, NULL, NULL, NULL)))
2270 		return error;
2271 
2272 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2273 	if (error != 0)
2274 		return error;
2275 
2276 	error = vfs_fhtovp(fh, &vp);
2277 	vfs_copyinfh_free(fh);
2278 	if (error != 0)
2279 		return error;
2280 
2281 	error = vn_stat(vp, sb);
2282 	vput(vp);
2283 	return error;
2284 }
2285 
2286 /* ARGSUSED */
2287 int
2288 sys___fhstat50(struct lwp *l, const struct sys___fhstat50_args *uap,
2289     register_t *retval)
2290 {
2291 	/* {
2292 		syscallarg(const void *) fhp;
2293 		syscallarg(size_t) fh_size;
2294 		syscallarg(struct stat *) sb;
2295 	} */
2296 	struct stat sb;
2297 	int error;
2298 
2299 	error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
2300 	if (error)
2301 		return error;
2302 	return copyout(&sb, SCARG(uap, sb), sizeof(sb));
2303 }
2304 
2305 int
2306 do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize,
2307     struct statvfs *sb, int flags)
2308 {
2309 	fhandle_t *fh;
2310 	struct mount *mp;
2311 	struct vnode *vp;
2312 	int error;
2313 
2314 	/*
2315 	 * Must be super user
2316 	 */
2317 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2318 		    0, NULL, NULL, NULL)))
2319 		return error;
2320 
2321 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2322 	if (error != 0)
2323 		return error;
2324 
2325 	error = vfs_fhtovp(fh, &vp);
2326 	vfs_copyinfh_free(fh);
2327 	if (error != 0)
2328 		return error;
2329 
2330 	mp = vp->v_mount;
2331 	error = dostatvfs(mp, sb, l, flags, 1);
2332 	vput(vp);
2333 	return error;
2334 }
2335 
2336 /* ARGSUSED */
2337 int
2338 sys___fhstatvfs190(struct lwp *l, const struct sys___fhstatvfs190_args *uap,
2339     register_t *retval)
2340 {
2341 	/* {
2342 		syscallarg(const void *) fhp;
2343 		syscallarg(size_t) fh_size;
2344 		syscallarg(struct statvfs *) buf;
2345 		syscallarg(int)	flags;
2346 	} */
2347 	struct statvfs *sb = STATVFSBUF_GET();
2348 	int error;
2349 
2350 	error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
2351 	    SCARG(uap, flags));
2352 	if (error == 0)
2353 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
2354 	STATVFSBUF_PUT(sb);
2355 	return error;
2356 }
2357 
2358 int
2359 do_posix_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
2360     dev_t dev)
2361 {
2362 
2363 	/*
2364 	 * The POSIX mknod(2) call is an alias for mkfifo(2) for S_IFIFO
2365 	 * in mode and dev=0.
2366 	 *
2367 	 * In all the other cases it's implementation defined behavior.
2368 	 */
2369 
2370 	if ((mode & S_IFIFO) && dev == 0)
2371 		return do_sys_mkfifoat(l, fdat, pathname, mode);
2372 	else
2373 		return do_sys_mknodat(l, fdat, pathname, mode, dev,
2374 		    UIO_USERSPACE);
2375 }
2376 
2377 /*
2378  * Create a special file.
2379  */
2380 /* ARGSUSED */
2381 int
2382 sys___mknod50(struct lwp *l, const struct sys___mknod50_args *uap,
2383     register_t *retval)
2384 {
2385 	/* {
2386 		syscallarg(const char *) path;
2387 		syscallarg(mode_t) mode;
2388 		syscallarg(dev_t) dev;
2389 	} */
2390 	return do_posix_mknodat(l, AT_FDCWD, SCARG(uap, path),
2391 	    SCARG(uap, mode), SCARG(uap, dev));
2392 }
2393 
2394 int
2395 sys_mknodat(struct lwp *l, const struct sys_mknodat_args *uap,
2396     register_t *retval)
2397 {
2398 	/* {
2399 		syscallarg(int) fd;
2400 		syscallarg(const char *) path;
2401 		syscallarg(mode_t) mode;
2402 		syscallarg(int) pad;
2403 		syscallarg(dev_t) dev;
2404 	} */
2405 
2406 	return do_posix_mknodat(l, SCARG(uap, fd), SCARG(uap, path),
2407 	    SCARG(uap, mode), SCARG(uap, dev));
2408 }
2409 
2410 int
2411 do_sys_mknod(struct lwp *l, const char *pathname, mode_t mode, dev_t dev,
2412     enum uio_seg seg)
2413 {
2414 	return do_sys_mknodat(l, AT_FDCWD, pathname, mode, dev, seg);
2415 }
2416 
2417 int
2418 do_sys_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
2419     dev_t dev, enum uio_seg seg)
2420 {
2421 	struct proc *p = l->l_proc;
2422 	struct vnode *vp;
2423 	struct vattr vattr;
2424 	int error, optype;
2425 	struct pathbuf *pb;
2426 	struct nameidata nd;
2427 	const char *pathstring;
2428 
2429 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
2430 		    0, NULL, NULL, NULL)) != 0)
2431 		return (error);
2432 
2433 	optype = VOP_MKNOD_DESCOFFSET;
2434 
2435 	error = pathbuf_maybe_copyin(pathname, seg, &pb);
2436 	if (error) {
2437 		return error;
2438 	}
2439 	pathstring = pathbuf_stringcopy_get(pb);
2440 	if (pathstring == NULL) {
2441 		pathbuf_destroy(pb);
2442 		return ENOMEM;
2443 	}
2444 
2445 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2446 
2447 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2448 		goto out;
2449 	vp = nd.ni_vp;
2450 
2451 	if (vp != NULL)
2452 		error = EEXIST;
2453 	else {
2454 		vattr_null(&vattr);
2455 		/* We will read cwdi->cwdi_cmask unlocked. */
2456 		vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2457 		vattr.va_rdev = dev;
2458 
2459 		switch (mode & S_IFMT) {
2460 		case S_IFMT:	/* used by badsect to flag bad sectors */
2461 			vattr.va_type = VBAD;
2462 			break;
2463 		case S_IFCHR:
2464 			vattr.va_type = VCHR;
2465 			break;
2466 		case S_IFBLK:
2467 			vattr.va_type = VBLK;
2468 			break;
2469 		case S_IFWHT:
2470 			optype = VOP_WHITEOUT_DESCOFFSET;
2471 			break;
2472 		case S_IFREG:
2473 #if NVERIEXEC > 0
2474 			error = veriexec_openchk(l, nd.ni_vp, pathstring,
2475 			    O_CREAT);
2476 #endif /* NVERIEXEC > 0 */
2477 			vattr.va_type = VREG;
2478 			vattr.va_rdev = VNOVAL;
2479 			optype = VOP_CREATE_DESCOFFSET;
2480 			break;
2481 		default:
2482 			error = EINVAL;
2483 			break;
2484 		}
2485 
2486 		if (error == 0 && optype == VOP_MKNOD_DESCOFFSET &&
2487 		    vattr.va_rdev == VNOVAL)
2488 			error = EINVAL;
2489 	}
2490 
2491 	if (!error) {
2492 		switch (optype) {
2493 		case VOP_WHITEOUT_DESCOFFSET:
2494 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
2495 			if (error)
2496 				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2497 			vput(nd.ni_dvp);
2498 			break;
2499 
2500 		case VOP_MKNOD_DESCOFFSET:
2501 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
2502 			    &nd.ni_cnd, &vattr);
2503 			if (error == 0)
2504 				vrele(nd.ni_vp);
2505 			vput(nd.ni_dvp);
2506 			break;
2507 
2508 		case VOP_CREATE_DESCOFFSET:
2509 			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
2510 			    &nd.ni_cnd, &vattr);
2511 			if (error == 0)
2512 				vrele(nd.ni_vp);
2513 			vput(nd.ni_dvp);
2514 			break;
2515 		}
2516 	} else {
2517 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2518 		if (nd.ni_dvp == vp)
2519 			vrele(nd.ni_dvp);
2520 		else
2521 			vput(nd.ni_dvp);
2522 		if (vp)
2523 			vrele(vp);
2524 	}
2525 out:
2526 	pathbuf_stringcopy_put(pb, pathstring);
2527 	pathbuf_destroy(pb);
2528 	return (error);
2529 }
2530 
2531 /*
2532  * Create a named pipe.
2533  */
2534 /* ARGSUSED */
2535 int
2536 sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap,
2537     register_t *retval)
2538 {
2539 	/* {
2540 		syscallarg(const char *) path;
2541 		syscallarg(int) mode;
2542 	} */
2543 
2544 	return do_sys_mkfifoat(l, AT_FDCWD, SCARG(uap, path),
2545 	    SCARG(uap, mode));
2546 }
2547 
2548 int
2549 sys_mkfifoat(struct lwp *l, const struct sys_mkfifoat_args *uap,
2550     register_t *retval)
2551 {
2552 	/* {
2553 		syscallarg(int) fd;
2554 		syscallarg(const char *) path;
2555 		syscallarg(int) mode;
2556 	} */
2557 
2558 	return do_sys_mkfifoat(l, SCARG(uap, fd), SCARG(uap, path),
2559 	    SCARG(uap, mode));
2560 }
2561 
2562 static int
2563 do_sys_mkfifoat(struct lwp *l, int fdat, const char *path, mode_t mode)
2564 {
2565 	struct proc *p = l->l_proc;
2566 	struct vattr vattr;
2567 	int error;
2568 	struct pathbuf *pb;
2569 	struct nameidata nd;
2570 
2571 	error = pathbuf_copyin(path, &pb);
2572 	if (error) {
2573 		return error;
2574 	}
2575 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2576 
2577 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
2578 		pathbuf_destroy(pb);
2579 		return error;
2580 	}
2581 	if (nd.ni_vp != NULL) {
2582 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2583 		if (nd.ni_dvp == nd.ni_vp)
2584 			vrele(nd.ni_dvp);
2585 		else
2586 			vput(nd.ni_dvp);
2587 		vrele(nd.ni_vp);
2588 		pathbuf_destroy(pb);
2589 		return (EEXIST);
2590 	}
2591 	vattr_null(&vattr);
2592 	vattr.va_type = VFIFO;
2593 	/* We will read cwdi->cwdi_cmask unlocked. */
2594 	vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2595 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
2596 	if (error == 0)
2597 		vrele(nd.ni_vp);
2598 	vput(nd.ni_dvp);
2599 	pathbuf_destroy(pb);
2600 	return (error);
2601 }
2602 
2603 /*
2604  * Make a hard file link.
2605  */
2606 /* ARGSUSED */
2607 int
2608 do_sys_linkat(struct lwp *l, int fdpath, const char *path, int fdlink,
2609     const char *link, int follow, register_t *retval)
2610 {
2611 	struct vnode *vp;
2612 	struct pathbuf *linkpb;
2613 	struct nameidata nd;
2614 	namei_simple_flags_t ns_flags;
2615 	int error;
2616 
2617 	if (follow & AT_SYMLINK_FOLLOW)
2618 		ns_flags = NSM_FOLLOW_TRYEMULROOT;
2619 	else
2620 		ns_flags = NSM_NOFOLLOW_TRYEMULROOT;
2621 
2622 	error = fd_nameiat_simple_user(l, fdpath, path, ns_flags, &vp);
2623 	if (error != 0)
2624 		return (error);
2625 	error = pathbuf_copyin(link, &linkpb);
2626 	if (error) {
2627 		goto out1;
2628 	}
2629 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2630 	if ((error = fd_nameiat(l, fdlink, &nd)) != 0)
2631 		goto out2;
2632 	if (nd.ni_vp) {
2633 		error = EEXIST;
2634 		goto abortop;
2635 	}
2636 	/* Prevent hard links on directories. */
2637 	if (vp->v_type == VDIR) {
2638 		error = EPERM;
2639 		goto abortop;
2640 	}
2641 	/* Prevent cross-mount operation. */
2642 	if (nd.ni_dvp->v_mount != vp->v_mount) {
2643 		error = EXDEV;
2644 		goto abortop;
2645 	}
2646 	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
2647 	VOP_UNLOCK(nd.ni_dvp);
2648 	vrele(nd.ni_dvp);
2649 out2:
2650 	pathbuf_destroy(linkpb);
2651 out1:
2652 	vrele(vp);
2653 	return (error);
2654 abortop:
2655 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2656 	if (nd.ni_dvp == nd.ni_vp)
2657 		vrele(nd.ni_dvp);
2658 	else
2659 		vput(nd.ni_dvp);
2660 	if (nd.ni_vp != NULL)
2661 		vrele(nd.ni_vp);
2662 	goto out2;
2663 }
2664 
2665 int
2666 sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
2667 {
2668 	/* {
2669 		syscallarg(const char *) path;
2670 		syscallarg(const char *) link;
2671 	} */
2672 	const char *path = SCARG(uap, path);
2673 	const char *link = SCARG(uap, link);
2674 
2675 	return do_sys_linkat(l, AT_FDCWD, path, AT_FDCWD, link,
2676 	    AT_SYMLINK_FOLLOW, retval);
2677 }
2678 
2679 int
2680 sys_linkat(struct lwp *l, const struct sys_linkat_args *uap,
2681     register_t *retval)
2682 {
2683 	/* {
2684 		syscallarg(int) fd1;
2685 		syscallarg(const char *) name1;
2686 		syscallarg(int) fd2;
2687 		syscallarg(const char *) name2;
2688 		syscallarg(int) flags;
2689 	} */
2690 	int fd1 = SCARG(uap, fd1);
2691 	const char *name1 = SCARG(uap, name1);
2692 	int fd2 = SCARG(uap, fd2);
2693 	const char *name2 = SCARG(uap, name2);
2694 	int follow;
2695 
2696 	follow = SCARG(uap, flags) & AT_SYMLINK_FOLLOW;
2697 
2698 	return do_sys_linkat(l, fd1, name1, fd2, name2, follow, retval);
2699 }
2700 
2701 int
2702 do_sys_symlink(const char *patharg, const char *link, enum uio_seg seg)
2703 {
2704 
2705 	return do_sys_symlinkat(NULL, patharg, AT_FDCWD, link, seg);
2706 }
2707 
2708 static int
2709 do_sys_symlinkat(struct lwp *l, const char *patharg, int fdat,
2710     const char *link, enum uio_seg seg)
2711 {
2712 	struct proc *p = curproc;
2713 	struct vattr vattr;
2714 	char *path;
2715 	int error;
2716 	size_t len;
2717 	struct pathbuf *linkpb;
2718 	struct nameidata nd;
2719 
2720 	KASSERT(l != NULL || fdat == AT_FDCWD);
2721 
2722 	path = PNBUF_GET();
2723 	if (seg == UIO_USERSPACE) {
2724 		if ((error = copyinstr(patharg, path, MAXPATHLEN, &len)) != 0)
2725 			goto out1;
2726 		if ((error = pathbuf_copyin(link, &linkpb)) != 0)
2727 			goto out1;
2728 	} else {
2729 		len = strlen(patharg) + 1;
2730 		KASSERT(len <= MAXPATHLEN);
2731 		memcpy(path, patharg, len);
2732 		linkpb = pathbuf_create(link);
2733 		if (linkpb == NULL) {
2734 			error = ENOMEM;
2735 			goto out1;
2736 		}
2737 	}
2738 	ktrkuser("symlink-target", path, len - 1);
2739 
2740 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2741 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2742 		goto out2;
2743 	if (nd.ni_vp) {
2744 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2745 		if (nd.ni_dvp == nd.ni_vp)
2746 			vrele(nd.ni_dvp);
2747 		else
2748 			vput(nd.ni_dvp);
2749 		vrele(nd.ni_vp);
2750 		error = EEXIST;
2751 		goto out2;
2752 	}
2753 	vattr_null(&vattr);
2754 	vattr.va_type = VLNK;
2755 	/* We will read cwdi->cwdi_cmask unlocked. */
2756 	vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
2757 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
2758 	if (error == 0)
2759 		vrele(nd.ni_vp);
2760 	vput(nd.ni_dvp);
2761 out2:
2762 	pathbuf_destroy(linkpb);
2763 out1:
2764 	PNBUF_PUT(path);
2765 	return (error);
2766 }
2767 
2768 /*
2769  * Make a symbolic link.
2770  */
2771 /* ARGSUSED */
2772 int
2773 sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
2774 {
2775 	/* {
2776 		syscallarg(const char *) path;
2777 		syscallarg(const char *) link;
2778 	} */
2779 
2780 	return do_sys_symlinkat(l, SCARG(uap, path), AT_FDCWD, SCARG(uap, link),
2781 	    UIO_USERSPACE);
2782 }
2783 
2784 int
2785 sys_symlinkat(struct lwp *l, const struct sys_symlinkat_args *uap,
2786     register_t *retval)
2787 {
2788 	/* {
2789 		syscallarg(const char *) path1;
2790 		syscallarg(int) fd;
2791 		syscallarg(const char *) path2;
2792 	} */
2793 
2794 	return do_sys_symlinkat(l, SCARG(uap, path1), SCARG(uap, fd),
2795 	    SCARG(uap, path2), UIO_USERSPACE);
2796 }
2797 
2798 /*
2799  * Delete a whiteout from the filesystem.
2800  */
2801 /* ARGSUSED */
2802 int
2803 sys_undelete(struct lwp *l, const struct sys_undelete_args *uap,
2804     register_t *retval)
2805 {
2806 	/* {
2807 		syscallarg(const char *) path;
2808 	} */
2809 	int error;
2810 	struct pathbuf *pb;
2811 	struct nameidata nd;
2812 
2813 	error = pathbuf_copyin(SCARG(uap, path), &pb);
2814 	if (error) {
2815 		return error;
2816 	}
2817 
2818 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT, pb);
2819 	error = namei(&nd);
2820 	if (error) {
2821 		pathbuf_destroy(pb);
2822 		return (error);
2823 	}
2824 
2825 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2826 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2827 		if (nd.ni_dvp == nd.ni_vp)
2828 			vrele(nd.ni_dvp);
2829 		else
2830 			vput(nd.ni_dvp);
2831 		if (nd.ni_vp)
2832 			vrele(nd.ni_vp);
2833 		pathbuf_destroy(pb);
2834 		return (EEXIST);
2835 	}
2836 	if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
2837 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2838 	vput(nd.ni_dvp);
2839 	pathbuf_destroy(pb);
2840 	return (error);
2841 }
2842 
2843 /*
2844  * Delete a name from the filesystem.
2845  */
2846 /* ARGSUSED */
2847 int
2848 sys_unlink(struct lwp *l, const struct sys_unlink_args *uap,
2849     register_t *retval)
2850 {
2851 	/* {
2852 		syscallarg(const char *) path;
2853 	} */
2854 
2855 	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path), 0,
2856 	    UIO_USERSPACE);
2857 }
2858 
2859 int
2860 sys_unlinkat(struct lwp *l, const struct sys_unlinkat_args *uap,
2861     register_t *retval)
2862 {
2863 	/* {
2864 		syscallarg(int) fd;
2865 		syscallarg(const char *) path;
2866 		syscallarg(int) flag;
2867 	} */
2868 
2869 	return do_sys_unlinkat(l, SCARG(uap, fd), SCARG(uap, path),
2870 	    SCARG(uap, flag), UIO_USERSPACE);
2871 }
2872 
2873 int
2874 do_sys_unlink(const char *arg, enum uio_seg seg)
2875 {
2876 
2877 	return do_sys_unlinkat(NULL, AT_FDCWD, arg, 0, seg);
2878 }
2879 
2880 static int
2881 do_sys_unlinkat(struct lwp *l, int fdat, const char *arg, int flags,
2882     enum uio_seg seg)
2883 {
2884 	struct vnode *vp;
2885 	int error;
2886 	struct pathbuf *pb;
2887 	struct nameidata nd;
2888 	const char *pathstring;
2889 
2890 	KASSERT(l != NULL || fdat == AT_FDCWD);
2891 
2892 	error = pathbuf_maybe_copyin(arg, seg, &pb);
2893 	if (error) {
2894 		return error;
2895 	}
2896 	pathstring = pathbuf_stringcopy_get(pb);
2897 	if (pathstring == NULL) {
2898 		pathbuf_destroy(pb);
2899 		return ENOMEM;
2900 	}
2901 
2902 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, pb);
2903 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2904 		goto out;
2905 	vp = nd.ni_vp;
2906 
2907 	/*
2908 	 * The root of a mounted filesystem cannot be deleted.
2909 	 */
2910 	if ((vp->v_vflag & VV_ROOT) != 0) {
2911 		error = EBUSY;
2912 		goto abort;
2913 	}
2914 
2915 	if ((vp->v_type == VDIR) && (vp->v_mountedhere != NULL)) {
2916 		error = EBUSY;
2917 		goto abort;
2918 	}
2919 
2920 	/*
2921 	 * No rmdir "." please.
2922 	 */
2923 	if (nd.ni_dvp == vp) {
2924 		error = EINVAL;
2925 		goto abort;
2926 	}
2927 
2928 	/*
2929 	 * AT_REMOVEDIR is required to remove a directory
2930 	 */
2931 	if (vp->v_type == VDIR) {
2932 		if (!(flags & AT_REMOVEDIR)) {
2933 			error = EPERM;
2934 			goto abort;
2935 		} else {
2936 			error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2937 			vput(nd.ni_dvp);
2938 			goto out;
2939 		}
2940 	}
2941 
2942 	/*
2943 	 * Starting here we only deal with non directories.
2944 	 */
2945 	if (flags & AT_REMOVEDIR) {
2946 		error = ENOTDIR;
2947 		goto abort;
2948 	}
2949 
2950 #if NVERIEXEC > 0
2951 	/* Handle remove requests for veriexec entries. */
2952 	if ((error = veriexec_removechk(curlwp, nd.ni_vp, pathstring)) != 0) {
2953 		goto abort;
2954 	}
2955 #endif /* NVERIEXEC > 0 */
2956 
2957 #ifdef FILEASSOC
2958 	(void)fileassoc_file_delete(vp);
2959 #endif /* FILEASSOC */
2960 	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2961 	vput(nd.ni_dvp);
2962 	goto out;
2963 
2964 abort:
2965 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2966 	if (nd.ni_dvp == vp)
2967 		vrele(nd.ni_dvp);
2968 	else
2969 		vput(nd.ni_dvp);
2970 	vput(vp);
2971 
2972 out:
2973 	pathbuf_stringcopy_put(pb, pathstring);
2974 	pathbuf_destroy(pb);
2975 	return (error);
2976 }
2977 
2978 /*
2979  * Reposition read/write file offset.
2980  */
2981 int
2982 sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
2983 {
2984 	/* {
2985 		syscallarg(int) fd;
2986 		syscallarg(int) pad;
2987 		syscallarg(off_t) offset;
2988 		syscallarg(int) whence;
2989 	} */
2990 	file_t *fp;
2991 	int error, fd;
2992 
2993 	switch (SCARG(uap, whence)) {
2994 	case SEEK_CUR:
2995 	case SEEK_END:
2996 	case SEEK_SET:
2997 		break;
2998 	default:
2999 		return EINVAL;
3000 	}
3001 
3002 	fd = SCARG(uap, fd);
3003 
3004 	if ((fp = fd_getfile(fd)) == NULL)
3005 		return (EBADF);
3006 
3007 	if (fp->f_ops->fo_seek == NULL) {
3008 		error = ESPIPE;
3009 		goto out;
3010 	}
3011 
3012 	error = (*fp->f_ops->fo_seek)(fp, SCARG(uap, offset),
3013 	    SCARG(uap, whence), (off_t *)retval, FOF_UPDATE_OFFSET);
3014 out:
3015 	fd_putfile(fd);
3016 	return (error);
3017 }
3018 
3019 /*
3020  * Positional read system call.
3021  */
3022 int
3023 sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
3024 {
3025 	/* {
3026 		syscallarg(int) fd;
3027 		syscallarg(void *) buf;
3028 		syscallarg(size_t) nbyte;
3029 		syscallarg(off_t) offset;
3030 	} */
3031 	file_t *fp;
3032 	off_t offset;
3033 	int error, fd = SCARG(uap, fd);
3034 
3035 	if ((fp = fd_getfile(fd)) == NULL)
3036 		return (EBADF);
3037 
3038 	if ((fp->f_flag & FREAD) == 0) {
3039 		fd_putfile(fd);
3040 		return (EBADF);
3041 	}
3042 
3043 	if (fp->f_ops->fo_seek == NULL) {
3044 		error = ESPIPE;
3045 		goto out;
3046 	}
3047 
3048 	offset = SCARG(uap, offset);
3049 	error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0);
3050 	if (error)
3051 		goto out;
3052 
3053 	/* dofileread() will unuse the descriptor for us */
3054 	return dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
3055 	    &offset, 0, retval);
3056 
3057 out:
3058 	fd_putfile(fd);
3059 	return (error);
3060 }
3061 
3062 /*
3063  * Positional scatter read system call.
3064  */
3065 int
3066 sys_preadv(struct lwp *l, const struct sys_preadv_args *uap,
3067     register_t *retval)
3068 {
3069 	/* {
3070 		syscallarg(int) fd;
3071 		syscallarg(const struct iovec *) iovp;
3072 		syscallarg(int) iovcnt;
3073 		syscallarg(off_t) offset;
3074 	} */
3075 	off_t offset = SCARG(uap, offset);
3076 
3077 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
3078 	    SCARG(uap, iovcnt), &offset, 0, retval);
3079 }
3080 
3081 /*
3082  * Positional write system call.
3083  */
3084 int
3085 sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap,
3086     register_t *retval)
3087 {
3088 	/* {
3089 		syscallarg(int) fd;
3090 		syscallarg(const void *) buf;
3091 		syscallarg(size_t) nbyte;
3092 		syscallarg(off_t) offset;
3093 	} */
3094 	file_t *fp;
3095 	off_t offset;
3096 	int error, fd = SCARG(uap, fd);
3097 
3098 	if ((fp = fd_getfile(fd)) == NULL)
3099 		return (EBADF);
3100 
3101 	if ((fp->f_flag & FWRITE) == 0) {
3102 		fd_putfile(fd);
3103 		return (EBADF);
3104 	}
3105 
3106 	if (fp->f_ops->fo_seek == NULL) {
3107 		error = ESPIPE;
3108 		goto out;
3109 	}
3110 
3111 	offset = SCARG(uap, offset);
3112 	error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0);
3113 	if (error)
3114 		goto out;
3115 
3116 	/* dofilewrite() will unuse the descriptor for us */
3117 	return dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
3118 	    &offset, 0, retval);
3119 
3120 out:
3121 	fd_putfile(fd);
3122 	return (error);
3123 }
3124 
3125 /*
3126  * Positional gather write system call.
3127  */
3128 int
3129 sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap,
3130     register_t *retval)
3131 {
3132 	/* {
3133 		syscallarg(int) fd;
3134 		syscallarg(const struct iovec *) iovp;
3135 		syscallarg(int) iovcnt;
3136 		syscallarg(off_t) offset;
3137 	} */
3138 	off_t offset = SCARG(uap, offset);
3139 
3140 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
3141 	    SCARG(uap, iovcnt), &offset, 0, retval);
3142 }
3143 
3144 /*
3145  * Check access permissions.
3146  */
3147 int
3148 sys_access(struct lwp *l, const struct sys_access_args *uap,
3149     register_t *retval)
3150 {
3151 	/* {
3152 		syscallarg(const char *) path;
3153 		syscallarg(int) flags;
3154 	} */
3155 
3156 	return do_sys_accessat(l, AT_FDCWD, SCARG(uap, path),
3157 	    SCARG(uap, flags), 0);
3158 }
3159 
3160 int
3161 do_sys_accessat(struct lwp *l, int fdat, const char *path,
3162     int mode, int flags)
3163 {
3164 	kauth_cred_t cred;
3165 	struct vnode *vp;
3166 	int error, nd_flag, vmode;
3167 	struct pathbuf *pb;
3168 	struct nameidata nd;
3169 
3170 	CTASSERT(F_OK == 0);
3171 	if ((mode & ~(R_OK | W_OK | X_OK)) != 0) {
3172 		/* nonsense mode */
3173 		return EINVAL;
3174 	}
3175 
3176 	nd_flag = FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT;
3177 	if (flags & AT_SYMLINK_NOFOLLOW)
3178 		nd_flag &= ~FOLLOW;
3179 
3180 	error = pathbuf_copyin(path, &pb);
3181 	if (error)
3182 		return error;
3183 
3184 	NDINIT(&nd, LOOKUP, nd_flag, pb);
3185 
3186 	/* Override default credentials */
3187 	if (!(flags & AT_EACCESS)) {
3188 		cred = kauth_cred_dup(l->l_cred);
3189 		kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
3190 		kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
3191 	} else
3192 		cred = l->l_cred;
3193 	nd.ni_cnd.cn_cred = cred;
3194 
3195 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
3196 		pathbuf_destroy(pb);
3197 		goto out;
3198 	}
3199 	vp = nd.ni_vp;
3200 	pathbuf_destroy(pb);
3201 
3202 	/* Flags == 0 means only check for existence. */
3203 	if (mode) {
3204 		vmode = 0;
3205 		if (mode & R_OK)
3206 			vmode |= VREAD;
3207 		if (mode & W_OK)
3208 			vmode |= VWRITE;
3209 		if (mode & X_OK)
3210 			vmode |= VEXEC;
3211 
3212 		error = VOP_ACCESS(vp, vmode, cred);
3213 		if (!error && (vmode & VWRITE))
3214 			error = vn_writechk(vp);
3215 	}
3216 	vput(vp);
3217 out:
3218 	if (!(flags & AT_EACCESS))
3219 		kauth_cred_free(cred);
3220 	return (error);
3221 }
3222 
3223 int
3224 sys_faccessat(struct lwp *l, const struct sys_faccessat_args *uap,
3225     register_t *retval)
3226 {
3227 	/* {
3228 		syscallarg(int) fd;
3229 		syscallarg(const char *) path;
3230 		syscallarg(int) amode;
3231 		syscallarg(int) flag;
3232 	} */
3233 
3234 	return do_sys_accessat(l, SCARG(uap, fd), SCARG(uap, path),
3235 	    SCARG(uap, amode), SCARG(uap, flag));
3236 }
3237 
3238 /*
3239  * Common code for all sys_stat functions, including compat versions.
3240  */
3241 int
3242 do_sys_stat(const char *userpath, unsigned int nd_flag, struct stat *sb)
3243 {
3244 
3245 	return do_sys_statat(NULL, AT_FDCWD, userpath, nd_flag, sb);
3246 }
3247 
3248 int
3249 do_sys_statat(struct lwp *l, int fdat, const char *userpath,
3250     unsigned int nd_flag, struct stat *sb)
3251 {
3252 	int error;
3253 	struct pathbuf *pb;
3254 	struct nameidata nd;
3255 
3256 	KASSERT(l != NULL || fdat == AT_FDCWD);
3257 
3258 	error = pathbuf_copyin(userpath, &pb);
3259 	if (error) {
3260 		return error;
3261 	}
3262 
3263 	NDINIT(&nd, LOOKUP, nd_flag | LOCKLEAF | TRYEMULROOT, pb);
3264 
3265 	error = fd_nameiat(l, fdat, &nd);
3266 	if (error != 0) {
3267 		pathbuf_destroy(pb);
3268 		return error;
3269 	}
3270 	error = vn_stat(nd.ni_vp, sb);
3271 	vput(nd.ni_vp);
3272 	pathbuf_destroy(pb);
3273 	return error;
3274 }
3275 
3276 /*
3277  * Get file status; this version follows links.
3278  */
3279 /* ARGSUSED */
3280 int
3281 sys___stat50(struct lwp *l, const struct sys___stat50_args *uap,
3282     register_t *retval)
3283 {
3284 	/* {
3285 		syscallarg(const char *) path;
3286 		syscallarg(struct stat *) ub;
3287 	} */
3288 	struct stat sb;
3289 	int error;
3290 
3291 	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), FOLLOW, &sb);
3292 	if (error)
3293 		return error;
3294 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3295 }
3296 
3297 /*
3298  * Get file status; this version does not follow links.
3299  */
3300 /* ARGSUSED */
3301 int
3302 sys___lstat50(struct lwp *l, const struct sys___lstat50_args *uap,
3303     register_t *retval)
3304 {
3305 	/* {
3306 		syscallarg(const char *) path;
3307 		syscallarg(struct stat *) ub;
3308 	} */
3309 	struct stat sb;
3310 	int error;
3311 
3312 	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), NOFOLLOW, &sb);
3313 	if (error)
3314 		return error;
3315 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3316 }
3317 
3318 int
3319 sys_fstatat(struct lwp *l, const struct sys_fstatat_args *uap,
3320     register_t *retval)
3321 {
3322 	/* {
3323 		syscallarg(int) fd;
3324 		syscallarg(const char *) path;
3325 		syscallarg(struct stat *) buf;
3326 		syscallarg(int) flag;
3327 	} */
3328 	unsigned int nd_flag;
3329 	struct stat sb;
3330 	int error;
3331 
3332 	if (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW)
3333 		nd_flag = NOFOLLOW;
3334 	else
3335 		nd_flag = FOLLOW;
3336 
3337 	error = do_sys_statat(l, SCARG(uap, fd), SCARG(uap, path), nd_flag,
3338 	    &sb);
3339 	if (error)
3340 		return error;
3341 	return copyout(&sb, SCARG(uap, buf), sizeof(sb));
3342 }
3343 
3344 static int
3345 kern_pathconf(register_t *retval, const char *path, int name, int flag)
3346 {
3347 	int error;
3348 	struct pathbuf *pb;
3349 	struct nameidata nd;
3350 
3351 	error = pathbuf_copyin(path, &pb);
3352 	if (error) {
3353 		return error;
3354 	}
3355 	NDINIT(&nd, LOOKUP, flag | LOCKLEAF | TRYEMULROOT, pb);
3356 	if ((error = namei(&nd)) != 0) {
3357 		pathbuf_destroy(pb);
3358 		return error;
3359 	}
3360 	error = VOP_PATHCONF(nd.ni_vp, name, retval);
3361 	vput(nd.ni_vp);
3362 	pathbuf_destroy(pb);
3363 	return error;
3364 }
3365 
3366 /*
3367  * Get configurable pathname variables.
3368  */
3369 /* ARGSUSED */
3370 int
3371 sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap,
3372     register_t *retval)
3373 {
3374 	/* {
3375 		syscallarg(const char *) path;
3376 		syscallarg(int) name;
3377 	} */
3378 
3379 	return kern_pathconf(retval, SCARG(uap, path), SCARG(uap, name),
3380 	    FOLLOW);
3381 }
3382 
3383 /* ARGSUSED */
3384 int
3385 sys_lpathconf(struct lwp *l, const struct sys_lpathconf_args *uap,
3386     register_t *retval)
3387 {
3388 	/* {
3389 		syscallarg(const char *) path;
3390 		syscallarg(int) name;
3391 	} */
3392 
3393 	return kern_pathconf(retval, SCARG(uap, path), SCARG(uap, name),
3394 	    NOFOLLOW);
3395 }
3396 
3397 /*
3398  * Return target name of a symbolic link.
3399  */
3400 /* ARGSUSED */
3401 int
3402 sys_readlink(struct lwp *l, const struct sys_readlink_args *uap,
3403     register_t *retval)
3404 {
3405 	/* {
3406 		syscallarg(const char *) path;
3407 		syscallarg(char *) buf;
3408 		syscallarg(size_t) count;
3409 	} */
3410 
3411 	return do_sys_readlinkat(l, AT_FDCWD, SCARG(uap, path),
3412 	    SCARG(uap, buf), SCARG(uap, count), retval);
3413 }
3414 
3415 static int
3416 do_sys_readlinkat(struct lwp *l, int fdat, const char *path, char *buf,
3417     size_t count, register_t *retval)
3418 {
3419 	struct vnode *vp;
3420 	struct iovec aiov;
3421 	struct uio auio;
3422 	int error;
3423 	struct pathbuf *pb;
3424 	struct nameidata nd;
3425 
3426 	error = pathbuf_copyin(path, &pb);
3427 	if (error) {
3428 		return error;
3429 	}
3430 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT,
3431 	    pb);
3432 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
3433 		pathbuf_destroy(pb);
3434 		return error;
3435 	}
3436 	vp = nd.ni_vp;
3437 	pathbuf_destroy(pb);
3438 	if (vp->v_type != VLNK)
3439 		error = EINVAL;
3440 	else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
3441 	    (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
3442 		aiov.iov_base = buf;
3443 		aiov.iov_len = count;
3444 		auio.uio_iov = &aiov;
3445 		auio.uio_iovcnt = 1;
3446 		auio.uio_offset = 0;
3447 		auio.uio_rw = UIO_READ;
3448 		KASSERT(l == curlwp);
3449 		auio.uio_vmspace = l->l_proc->p_vmspace;
3450 		auio.uio_resid = count;
3451 		if ((error = VOP_READLINK(vp, &auio, l->l_cred)) == 0)
3452 			*retval = count - auio.uio_resid;
3453 	}
3454 	vput(vp);
3455 	return (error);
3456 }
3457 
3458 int
3459 sys_readlinkat(struct lwp *l, const struct sys_readlinkat_args *uap,
3460     register_t *retval)
3461 {
3462 	/* {
3463 		syscallarg(int) fd;
3464 		syscallarg(const char *) path;
3465 		syscallarg(char *) buf;
3466 		syscallarg(size_t) bufsize;
3467 	} */
3468 
3469 	return do_sys_readlinkat(l, SCARG(uap, fd), SCARG(uap, path),
3470 	    SCARG(uap, buf), SCARG(uap, bufsize), retval);
3471 }
3472 
3473 /*
3474  * Change flags of a file given a path name.
3475  */
3476 /* ARGSUSED */
3477 int
3478 sys_chflags(struct lwp *l, const struct sys_chflags_args *uap,
3479     register_t *retval)
3480 {
3481 	/* {
3482 		syscallarg(const char *) path;
3483 		syscallarg(u_long) flags;
3484 	} */
3485 	struct vnode *vp;
3486 	int error;
3487 
3488 	error = namei_simple_user(SCARG(uap, path),
3489 	    NSM_FOLLOW_TRYEMULROOT, &vp);
3490 	if (error != 0)
3491 		return (error);
3492 	error = change_flags(vp, SCARG(uap, flags), l);
3493 	vput(vp);
3494 	return (error);
3495 }
3496 
3497 /*
3498  * Change flags of a file given a file descriptor.
3499  */
3500 /* ARGSUSED */
3501 int
3502 sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap,
3503     register_t *retval)
3504 {
3505 	/* {
3506 		syscallarg(int) fd;
3507 		syscallarg(u_long) flags;
3508 	} */
3509 	struct vnode *vp;
3510 	file_t *fp;
3511 	int error;
3512 
3513 	/* fd_getvnode() will use the descriptor for us */
3514 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3515 		return (error);
3516 	vp = fp->f_vnode;
3517 	error = change_flags(vp, SCARG(uap, flags), l);
3518 	VOP_UNLOCK(vp);
3519 	fd_putfile(SCARG(uap, fd));
3520 	return (error);
3521 }
3522 
3523 /*
3524  * Change flags of a file given a path name; this version does
3525  * not follow links.
3526  */
3527 int
3528 sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap,
3529     register_t *retval)
3530 {
3531 	/* {
3532 		syscallarg(const char *) path;
3533 		syscallarg(u_long) flags;
3534 	} */
3535 	struct vnode *vp;
3536 	int error;
3537 
3538 	error = namei_simple_user(SCARG(uap, path),
3539 	    NSM_NOFOLLOW_TRYEMULROOT, &vp);
3540 	if (error != 0)
3541 		return (error);
3542 	error = change_flags(vp, SCARG(uap, flags), l);
3543 	vput(vp);
3544 	return (error);
3545 }
3546 
3547 /*
3548  * Common routine to change flags of a file.
3549  */
3550 int
3551 change_flags(struct vnode *vp, u_long flags, struct lwp *l)
3552 {
3553 	struct vattr vattr;
3554 	int error;
3555 
3556 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3557 
3558 	vattr_null(&vattr);
3559 	vattr.va_flags = flags;
3560 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3561 
3562 	return (error);
3563 }
3564 
3565 /*
3566  * Change mode of a file given path name; this version follows links.
3567  */
3568 /* ARGSUSED */
3569 int
3570 sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
3571 {
3572 	/* {
3573 		syscallarg(const char *) path;
3574 		syscallarg(int) mode;
3575 	} */
3576 
3577 	return do_sys_chmodat(l, AT_FDCWD, SCARG(uap, path),
3578 	    SCARG(uap, mode), 0);
3579 }
3580 
3581 int
3582 do_sys_chmodat(struct lwp *l, int fdat, const char *path, int mode, int flags)
3583 {
3584 	int error;
3585 	struct vnode *vp;
3586 	namei_simple_flags_t ns_flag;
3587 
3588 	if (flags & AT_SYMLINK_NOFOLLOW)
3589 		ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3590 	else
3591 		ns_flag = NSM_FOLLOW_TRYEMULROOT;
3592 
3593 	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3594 	if (error != 0)
3595 		return error;
3596 
3597 	error = change_mode(vp, mode, l);
3598 
3599 	vrele(vp);
3600 
3601 	return (error);
3602 }
3603 
3604 /*
3605  * Change mode of a file given a file descriptor.
3606  */
3607 /* ARGSUSED */
3608 int
3609 sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap,
3610     register_t *retval)
3611 {
3612 	/* {
3613 		syscallarg(int) fd;
3614 		syscallarg(int) mode;
3615 	} */
3616 	file_t *fp;
3617 	int error;
3618 
3619 	/* fd_getvnode() will use the descriptor for us */
3620 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3621 		return (error);
3622 	error = change_mode(fp->f_vnode, SCARG(uap, mode), l);
3623 	fd_putfile(SCARG(uap, fd));
3624 	return (error);
3625 }
3626 
3627 int
3628 sys_fchmodat(struct lwp *l, const struct sys_fchmodat_args *uap,
3629     register_t *retval)
3630 {
3631 	/* {
3632 		syscallarg(int) fd;
3633 		syscallarg(const char *) path;
3634 		syscallarg(int) mode;
3635 		syscallarg(int) flag;
3636 	} */
3637 
3638 	return do_sys_chmodat(l, SCARG(uap, fd), SCARG(uap, path),
3639 	    SCARG(uap, mode), SCARG(uap, flag));
3640 }
3641 
3642 /*
3643  * Change mode of a file given path name; this version does not follow links.
3644  */
3645 /* ARGSUSED */
3646 int
3647 sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap,
3648     register_t *retval)
3649 {
3650 	/* {
3651 		syscallarg(const char *) path;
3652 		syscallarg(int) mode;
3653 	} */
3654 	int error;
3655 	struct vnode *vp;
3656 
3657 	error = namei_simple_user(SCARG(uap, path),
3658 	    NSM_NOFOLLOW_TRYEMULROOT, &vp);
3659 	if (error != 0)
3660 		return (error);
3661 
3662 	error = change_mode(vp, SCARG(uap, mode), l);
3663 
3664 	vrele(vp);
3665 	return (error);
3666 }
3667 
3668 /*
3669  * Common routine to set mode given a vnode.
3670  */
3671 static int
3672 change_mode(struct vnode *vp, int mode, struct lwp *l)
3673 {
3674 	struct vattr vattr;
3675 	int error;
3676 
3677 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3678 	vattr_null(&vattr);
3679 	vattr.va_mode = mode & ALLPERMS;
3680 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3681 	VOP_UNLOCK(vp);
3682 	return (error);
3683 }
3684 
3685 /*
3686  * Set ownership given a path name; this version follows links.
3687  */
3688 /* ARGSUSED */
3689 int
3690 sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
3691 {
3692 	/* {
3693 		syscallarg(const char *) path;
3694 		syscallarg(uid_t) uid;
3695 		syscallarg(gid_t) gid;
3696 	} */
3697 	return do_sys_chownat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap,uid),
3698 	    SCARG(uap, gid), 0);
3699 }
3700 
3701 int
3702 do_sys_chownat(struct lwp *l, int fdat, const char *path, uid_t uid,
3703    gid_t gid, int flags)
3704 {
3705 	int error;
3706 	struct vnode *vp;
3707 	namei_simple_flags_t ns_flag;
3708 
3709 	if (flags & AT_SYMLINK_NOFOLLOW)
3710 		ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3711 	else
3712 		ns_flag = NSM_FOLLOW_TRYEMULROOT;
3713 
3714 	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3715 	if (error != 0)
3716 		return error;
3717 
3718 	error = change_owner(vp, uid, gid, l, 0);
3719 
3720 	vrele(vp);
3721 
3722 	return (error);
3723 }
3724 
3725 /*
3726  * Set ownership given a path name; this version follows links.
3727  * Provides POSIX semantics.
3728  */
3729 /* ARGSUSED */
3730 int
3731 sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap,
3732     register_t *retval)
3733 {
3734 	/* {
3735 		syscallarg(const char *) path;
3736 		syscallarg(uid_t) uid;
3737 		syscallarg(gid_t) gid;
3738 	} */
3739 	int error;
3740 	struct vnode *vp;
3741 
3742 	error = namei_simple_user(SCARG(uap, path),
3743 	    NSM_FOLLOW_TRYEMULROOT, &vp);
3744 	if (error != 0)
3745 		return (error);
3746 
3747 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3748 
3749 	vrele(vp);
3750 	return (error);
3751 }
3752 
3753 /*
3754  * Set ownership given a file descriptor.
3755  */
3756 /* ARGSUSED */
3757 int
3758 sys_fchown(struct lwp *l, const struct sys_fchown_args *uap,
3759     register_t *retval)
3760 {
3761 	/* {
3762 		syscallarg(int) fd;
3763 		syscallarg(uid_t) uid;
3764 		syscallarg(gid_t) gid;
3765 	} */
3766 	int error;
3767 	file_t *fp;
3768 
3769 	/* fd_getvnode() will use the descriptor for us */
3770 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3771 		return (error);
3772 	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3773 	    l, 0);
3774 	fd_putfile(SCARG(uap, fd));
3775 	return (error);
3776 }
3777 
3778 int
3779 sys_fchownat(struct lwp *l, const struct sys_fchownat_args *uap,
3780     register_t *retval)
3781 {
3782 	/* {
3783 		syscallarg(int) fd;
3784 		syscallarg(const char *) path;
3785 		syscallarg(uid_t) owner;
3786 		syscallarg(gid_t) group;
3787 		syscallarg(int) flag;
3788 	} */
3789 
3790 	return do_sys_chownat(l, SCARG(uap, fd), SCARG(uap, path),
3791 	    SCARG(uap, owner), SCARG(uap, group),
3792 	    SCARG(uap, flag));
3793 }
3794 
3795 /*
3796  * Set ownership given a file descriptor, providing POSIX/XPG semantics.
3797  */
3798 /* ARGSUSED */
3799 int
3800 sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap,
3801     register_t *retval)
3802 {
3803 	/* {
3804 		syscallarg(int) fd;
3805 		syscallarg(uid_t) uid;
3806 		syscallarg(gid_t) gid;
3807 	} */
3808 	int error;
3809 	file_t *fp;
3810 
3811 	/* fd_getvnode() will use the descriptor for us */
3812 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3813 		return (error);
3814 	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3815 	    l, 1);
3816 	fd_putfile(SCARG(uap, fd));
3817 	return (error);
3818 }
3819 
3820 /*
3821  * Set ownership given a path name; this version does not follow links.
3822  */
3823 /* ARGSUSED */
3824 int
3825 sys_lchown(struct lwp *l, const struct sys_lchown_args *uap,
3826     register_t *retval)
3827 {
3828 	/* {
3829 		syscallarg(const char *) path;
3830 		syscallarg(uid_t) uid;
3831 		syscallarg(gid_t) gid;
3832 	} */
3833 	int error;
3834 	struct vnode *vp;
3835 
3836 	error = namei_simple_user(SCARG(uap, path),
3837 	    NSM_NOFOLLOW_TRYEMULROOT, &vp);
3838 	if (error != 0)
3839 		return (error);
3840 
3841 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
3842 
3843 	vrele(vp);
3844 	return (error);
3845 }
3846 
3847 /*
3848  * Set ownership given a path name; this version does not follow links.
3849  * Provides POSIX/XPG semantics.
3850  */
3851 /* ARGSUSED */
3852 int
3853 sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap,
3854     register_t *retval)
3855 {
3856 	/* {
3857 		syscallarg(const char *) path;
3858 		syscallarg(uid_t) uid;
3859 		syscallarg(gid_t) gid;
3860 	} */
3861 	int error;
3862 	struct vnode *vp;
3863 
3864 	error = namei_simple_user(SCARG(uap, path),
3865 	    NSM_NOFOLLOW_TRYEMULROOT, &vp);
3866 	if (error != 0)
3867 		return (error);
3868 
3869 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3870 
3871 	vrele(vp);
3872 	return (error);
3873 }
3874 
3875 /*
3876  * Common routine to set ownership given a vnode.
3877  */
3878 static int
3879 change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
3880     int posix_semantics)
3881 {
3882 	struct vattr vattr;
3883 	mode_t newmode;
3884 	int error;
3885 
3886 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3887 	if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
3888 		goto out;
3889 
3890 #define CHANGED(x) ((int)(x) != -1)
3891 	newmode = vattr.va_mode;
3892 	if (posix_semantics) {
3893 		/*
3894 		 * POSIX/XPG semantics: if the caller is not the super-user,
3895 		 * clear set-user-id and set-group-id bits.  Both POSIX and
3896 		 * the XPG consider the behaviour for calls by the super-user
3897 		 * implementation-defined; we leave the set-user-id and set-
3898 		 * group-id settings intact in that case.
3899 		 */
3900 		if (vattr.va_mode & S_ISUID) {
3901 			if (kauth_authorize_vnode(l->l_cred,
3902 				KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0)
3903 				newmode &= ~S_ISUID;
3904 		}
3905 		if (vattr.va_mode & S_ISGID) {
3906 			if (kauth_authorize_vnode(l->l_cred,
3907 				KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0)
3908 				newmode &= ~S_ISGID;
3909 		}
3910 	} else {
3911 		/*
3912 		 * NetBSD semantics: when changing owner and/or group,
3913 		 * clear the respective bit(s).
3914 		 */
3915 		if (CHANGED(uid))
3916 			newmode &= ~S_ISUID;
3917 		if (CHANGED(gid))
3918 			newmode &= ~S_ISGID;
3919 	}
3920 	/* Update va_mode iff altered. */
3921 	if (vattr.va_mode == newmode)
3922 		newmode = VNOVAL;
3923 
3924 	vattr_null(&vattr);
3925 	vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
3926 	vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
3927 	vattr.va_mode = newmode;
3928 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3929 #undef CHANGED
3930 
3931 out:
3932 	VOP_UNLOCK(vp);
3933 	return (error);
3934 }
3935 
3936 /*
3937  * Set the access and modification times given a path name; this
3938  * version follows links.
3939  */
3940 /* ARGSUSED */
3941 int
3942 sys___utimes50(struct lwp *l, const struct sys___utimes50_args *uap,
3943     register_t *retval)
3944 {
3945 	/* {
3946 		syscallarg(const char *) path;
3947 		syscallarg(const struct timeval *) tptr;
3948 	} */
3949 
3950 	return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
3951 	    SCARG(uap, tptr), UIO_USERSPACE);
3952 }
3953 
3954 /*
3955  * Set the access and modification times given a file descriptor.
3956  */
3957 /* ARGSUSED */
3958 int
3959 sys___futimes50(struct lwp *l, const struct sys___futimes50_args *uap,
3960     register_t *retval)
3961 {
3962 	/* {
3963 		syscallarg(int) fd;
3964 		syscallarg(const struct timeval *) tptr;
3965 	} */
3966 	int error;
3967 	file_t *fp;
3968 
3969 	/* fd_getvnode() will use the descriptor for us */
3970 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3971 		return (error);
3972 	error = do_sys_utimes(l, fp->f_vnode, NULL, 0, SCARG(uap, tptr),
3973 	    UIO_USERSPACE);
3974 	fd_putfile(SCARG(uap, fd));
3975 	return (error);
3976 }
3977 
3978 int
3979 sys_futimens(struct lwp *l, const struct sys_futimens_args *uap,
3980     register_t *retval)
3981 {
3982 	/* {
3983 		syscallarg(int) fd;
3984 		syscallarg(const struct timespec *) tptr;
3985 	} */
3986 	int error;
3987 	file_t *fp;
3988 
3989 	/* fd_getvnode() will use the descriptor for us */
3990 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3991 		return (error);
3992 	error = do_sys_utimensat(l, AT_FDCWD, fp->f_vnode, NULL, 0,
3993 	    SCARG(uap, tptr), UIO_USERSPACE);
3994 	fd_putfile(SCARG(uap, fd));
3995 	return (error);
3996 }
3997 
3998 /*
3999  * Set the access and modification times given a path name; this
4000  * version does not follow links.
4001  */
4002 int
4003 sys___lutimes50(struct lwp *l, const struct sys___lutimes50_args *uap,
4004     register_t *retval)
4005 {
4006 	/* {
4007 		syscallarg(const char *) path;
4008 		syscallarg(const struct timeval *) tptr;
4009 	} */
4010 
4011 	return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
4012 	    SCARG(uap, tptr), UIO_USERSPACE);
4013 }
4014 
4015 int
4016 sys_utimensat(struct lwp *l, const struct sys_utimensat_args *uap,
4017     register_t *retval)
4018 {
4019 	/* {
4020 		syscallarg(int) fd;
4021 		syscallarg(const char *) path;
4022 		syscallarg(const struct timespec *) tptr;
4023 		syscallarg(int) flag;
4024 	} */
4025 	int follow;
4026 	const struct timespec *tptr;
4027 	int error;
4028 
4029 	tptr = SCARG(uap, tptr);
4030 	follow = (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
4031 
4032 	error = do_sys_utimensat(l, SCARG(uap, fd), NULL,
4033 	    SCARG(uap, path), follow, tptr, UIO_USERSPACE);
4034 
4035 	return error;
4036 }
4037 
4038 /*
4039  * Common routine to set access and modification times given a vnode.
4040  */
4041 int
4042 do_sys_utimens(struct lwp *l, struct vnode *vp, const char *path, int flag,
4043     const struct timespec *tptr, enum uio_seg seg)
4044 {
4045 
4046 	return do_sys_utimensat(l, AT_FDCWD, vp, path, flag, tptr, seg);
4047 }
4048 
4049 int
4050 do_sys_utimensat(struct lwp *l, int fdat, struct vnode *vp,
4051     const char *path, int flag, const struct timespec *tptr, enum uio_seg seg)
4052 {
4053 	struct vattr vattr;
4054 	int error, dorele = 0;
4055 	namei_simple_flags_t sflags;
4056 	bool vanull, setbirthtime;
4057 	struct timespec ts[2];
4058 
4059 	KASSERT(l != NULL || fdat == AT_FDCWD);
4060 
4061 	/*
4062 	 * I have checked all callers and they pass either FOLLOW,
4063 	 * NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW
4064 	 * is 0. More to the point, they don't pass anything else.
4065 	 * Let's keep it that way at least until the namei interfaces
4066 	 * are fully sanitized.
4067 	 */
4068 	KASSERT(flag == NOFOLLOW || flag == FOLLOW);
4069 	sflags = (flag == FOLLOW) ?
4070 	    NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT;
4071 
4072 	if (tptr == NULL) {
4073 		vanull = true;
4074 		nanotime(&ts[0]);
4075 		ts[1] = ts[0];
4076 	} else {
4077 		vanull = false;
4078 		if (seg != UIO_SYSSPACE) {
4079 			error = copyin(tptr, ts, sizeof (ts));
4080 			if (error != 0)
4081 				return error;
4082 		} else {
4083 			ts[0] = tptr[0];
4084 			ts[1] = tptr[1];
4085 		}
4086 	}
4087 
4088 	if (ts[0].tv_nsec == UTIME_NOW) {
4089 		nanotime(&ts[0]);
4090 		if (ts[1].tv_nsec == UTIME_NOW) {
4091 			vanull = true;
4092 			ts[1] = ts[0];
4093 		}
4094 	} else if (ts[1].tv_nsec == UTIME_NOW)
4095 		nanotime(&ts[1]);
4096 
4097 	if (vp == NULL) {
4098 		/* note: SEG describes TPTR, not PATH; PATH is always user */
4099 		error = fd_nameiat_simple_user(l, fdat, path, sflags, &vp);
4100 		if (error != 0)
4101 			return error;
4102 		dorele = 1;
4103 	}
4104 
4105 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4106 	setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 &&
4107 	    timespeccmp(&ts[1], &vattr.va_birthtime, <));
4108 	vattr_null(&vattr);
4109 
4110 	if (ts[0].tv_nsec != UTIME_OMIT)
4111 		vattr.va_atime = ts[0];
4112 
4113 	if (ts[1].tv_nsec != UTIME_OMIT) {
4114 		vattr.va_mtime = ts[1];
4115 		if (setbirthtime)
4116 			vattr.va_birthtime = ts[1];
4117 	}
4118 
4119 	if (vanull)
4120 		vattr.va_vaflags |= VA_UTIMES_NULL;
4121 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
4122 	VOP_UNLOCK(vp);
4123 
4124 	if (dorele != 0)
4125 		vrele(vp);
4126 
4127 	return error;
4128 }
4129 
4130 int
4131 do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
4132     const struct timeval *tptr, enum uio_seg seg)
4133 {
4134 	struct timespec ts[2];
4135 	struct timespec *tsptr = NULL;
4136 	int error;
4137 
4138 	if (tptr != NULL) {
4139 		struct timeval tv[2];
4140 
4141 		if (seg != UIO_SYSSPACE) {
4142 			error = copyin(tptr, tv, sizeof(tv));
4143 			if (error != 0)
4144 				return error;
4145 			tptr = tv;
4146 		}
4147 
4148 		if ((tptr[0].tv_usec == UTIME_NOW) ||
4149 		    (tptr[0].tv_usec == UTIME_OMIT))
4150 			ts[0].tv_nsec = tptr[0].tv_usec;
4151 		else {
4152 			if (tptr[0].tv_usec < 0 || tptr[0].tv_usec >= 1000000)
4153 				return EINVAL;
4154 
4155 			TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]);
4156 		}
4157 
4158 		if ((tptr[1].tv_usec == UTIME_NOW) ||
4159 		    (tptr[1].tv_usec == UTIME_OMIT))
4160 			ts[1].tv_nsec = tptr[1].tv_usec;
4161 		else {
4162 			if (tptr[1].tv_usec < 0 || tptr[1].tv_usec >= 1000000)
4163 				return EINVAL;
4164 
4165 			TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]);
4166 		}
4167 
4168 		tsptr = &ts[0];
4169 	}
4170 
4171 	return do_sys_utimens(l, vp, path, flag, tsptr, UIO_SYSSPACE);
4172 }
4173 
4174 /*
4175  * Truncate a file given its path name.
4176  */
4177 /* ARGSUSED */
4178 int
4179 sys_truncate(struct lwp *l, const struct sys_truncate_args *uap,
4180     register_t *retval)
4181 {
4182 	/* {
4183 		syscallarg(const char *) path;
4184 		syscallarg(int) pad;
4185 		syscallarg(off_t) length;
4186 	} */
4187 	struct vnode *vp;
4188 	struct vattr vattr;
4189 	int error;
4190 
4191 	if (SCARG(uap, length) < 0)
4192 		return EINVAL;
4193 
4194 	error = namei_simple_user(SCARG(uap, path),
4195 	    NSM_FOLLOW_TRYEMULROOT, &vp);
4196 	if (error != 0)
4197 		return (error);
4198 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4199 	if (vp->v_type == VDIR)
4200 		error = EISDIR;
4201 	else if ((error = vn_writechk(vp)) == 0 &&
4202 	    (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
4203 		vattr_null(&vattr);
4204 		vattr.va_size = SCARG(uap, length);
4205 		error = VOP_SETATTR(vp, &vattr, l->l_cred);
4206 	}
4207 	vput(vp);
4208 	return (error);
4209 }
4210 
4211 /*
4212  * Truncate a file given a file descriptor.
4213  */
4214 /* ARGSUSED */
4215 int
4216 sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap,
4217     register_t *retval)
4218 {
4219 	/* {
4220 		syscallarg(int) fd;
4221 		syscallarg(int) pad;
4222 		syscallarg(off_t) length;
4223 	} */
4224 	file_t *fp;
4225 	int error, fd = SCARG(uap, fd);
4226 
4227 	fp = fd_getfile(fd);
4228 	if (fp == NULL)
4229 		return EBADF;
4230 	if (fp->f_ops->fo_truncate == NULL)
4231 		error = EOPNOTSUPP;
4232 	else
4233 		error = (*fp->f_ops->fo_truncate)(fp, SCARG(uap, length));
4234 
4235 	fd_putfile(fd);
4236 	return error;
4237 }
4238 
4239 /*
4240  * Sync an open file.
4241  */
4242 /* ARGSUSED */
4243 int
4244 sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
4245 {
4246 	/* {
4247 		syscallarg(int) fd;
4248 	} */
4249 	struct vnode *vp;
4250 	file_t *fp;
4251 	int error;
4252 
4253 	/* fd_getvnode() will use the descriptor for us */
4254 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4255 		return (error);
4256 	vp = fp->f_vnode;
4257 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4258 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
4259 	VOP_UNLOCK(vp);
4260 	fd_putfile(SCARG(uap, fd));
4261 	return (error);
4262 }
4263 
4264 /*
4265  * Sync a range of file data.  API modeled after that found in AIX.
4266  *
4267  * FDATASYNC indicates that we need only save enough metadata to be able
4268  * to re-read the written data.
4269  */
4270 /* ARGSUSED */
4271 int
4272 sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap,
4273     register_t *retval)
4274 {
4275 	/* {
4276 		syscallarg(int) fd;
4277 		syscallarg(int) flags;
4278 		syscallarg(off_t) start;
4279 		syscallarg(off_t) length;
4280 	} */
4281 	struct vnode *vp;
4282 	file_t *fp;
4283 	int flags, nflags;
4284 	off_t s, e, len;
4285 	int error;
4286 
4287 	/* fd_getvnode() will use the descriptor for us */
4288 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4289 		return (error);
4290 
4291 	if ((fp->f_flag & FWRITE) == 0) {
4292 		error = EBADF;
4293 		goto out;
4294 	}
4295 
4296 	flags = SCARG(uap, flags);
4297 	if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
4298 	    ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
4299 		error = EINVAL;
4300 		goto out;
4301 	}
4302 	/* Now set up the flags for value(s) to pass to VOP_FSYNC() */
4303 	if (flags & FDATASYNC)
4304 		nflags = FSYNC_DATAONLY | FSYNC_WAIT;
4305 	else
4306 		nflags = FSYNC_WAIT;
4307 	if (flags & FDISKSYNC)
4308 		nflags |= FSYNC_CACHE;
4309 
4310 	len = SCARG(uap, length);
4311 	/* If length == 0, we do the whole file, and s = e = 0 will do that */
4312 	if (len) {
4313 		s = SCARG(uap, start);
4314 		if (s < 0 || len < 0 || len > OFF_T_MAX - s) {
4315 			error = EINVAL;
4316 			goto out;
4317 		}
4318 		e = s + len;
4319 		KASSERT(s <= e);
4320 	} else {
4321 		e = 0;
4322 		s = 0;
4323 	}
4324 
4325 	vp = fp->f_vnode;
4326 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4327 	error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
4328 	VOP_UNLOCK(vp);
4329 out:
4330 	fd_putfile(SCARG(uap, fd));
4331 	return (error);
4332 }
4333 
4334 /*
4335  * Sync the data of an open file.
4336  */
4337 /* ARGSUSED */
4338 int
4339 sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap,
4340     register_t *retval)
4341 {
4342 	/* {
4343 		syscallarg(int) fd;
4344 	} */
4345 	struct vnode *vp;
4346 	file_t *fp;
4347 	int error;
4348 
4349 	/* fd_getvnode() will use the descriptor for us */
4350 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4351 		return (error);
4352 	vp = fp->f_vnode;
4353 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4354 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
4355 	VOP_UNLOCK(vp);
4356 	fd_putfile(SCARG(uap, fd));
4357 	return (error);
4358 }
4359 
4360 /*
4361  * Rename files, (standard) BSD semantics frontend.
4362  */
4363 /* ARGSUSED */
4364 int
4365 sys_rename(struct lwp *l, const struct sys_rename_args *uap,
4366     register_t *retval)
4367 {
4368 	/* {
4369 		syscallarg(const char *) from;
4370 		syscallarg(const char *) to;
4371 	} */
4372 
4373 	return do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4374 	    SCARG(uap, to), UIO_USERSPACE, 0);
4375 }
4376 
4377 int
4378 sys_renameat(struct lwp *l, const struct sys_renameat_args *uap,
4379     register_t *retval)
4380 {
4381 	/* {
4382 		syscallarg(int) fromfd;
4383 		syscallarg(const char *) from;
4384 		syscallarg(int) tofd;
4385 		syscallarg(const char *) to;
4386 	} */
4387 
4388 	return do_sys_renameat(l, SCARG(uap, fromfd), SCARG(uap, from),
4389 	    SCARG(uap, tofd), SCARG(uap, to), UIO_USERSPACE, 0);
4390 }
4391 
4392 /*
4393  * Rename files, POSIX semantics frontend.
4394  */
4395 /* ARGSUSED */
4396 int
4397 sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap,
4398     register_t *retval)
4399 {
4400 	/* {
4401 		syscallarg(const char *) from;
4402 		syscallarg(const char *) to;
4403 	} */
4404 
4405 	return do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4406 	    SCARG(uap, to), UIO_USERSPACE, 1);
4407 }
4408 
4409 /*
4410  * Rename files.  Source and destination must either both be directories,
4411  * or both not be directories.  If target is a directory, it must be empty.
4412  * If `from' and `to' refer to the same object, the value of the `retain'
4413  * argument is used to determine whether `from' will be
4414  *
4415  * (retain == 0)	deleted unless `from' and `to' refer to the same
4416  *			object in the file system's name space (BSD).
4417  * (retain == 1)	always retained (POSIX).
4418  *
4419  * XXX Synchronize with nfsrv_rename in nfs_serv.c.
4420  */
4421 int
4422 do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
4423 {
4424 
4425 	return do_sys_renameat(NULL, AT_FDCWD, from, AT_FDCWD, to, seg,
4426 	    retain);
4427 }
4428 
4429 static int
4430 do_sys_renameat(struct lwp *l, int fromfd, const char *from, int tofd,
4431     const char *to, enum uio_seg seg, int retain)
4432 {
4433 	struct pathbuf *fpb, *tpb;
4434 	struct nameidata fnd, tnd;
4435 	struct vnode *fdvp, *fvp;
4436 	struct vnode *tdvp, *tvp;
4437 	struct mount *mp, *tmp;
4438 	int error;
4439 
4440 	KASSERT(l != NULL || fromfd == AT_FDCWD);
4441 	KASSERT(l != NULL || tofd == AT_FDCWD);
4442 
4443 	error = pathbuf_maybe_copyin(from, seg, &fpb);
4444 	if (error)
4445 		goto out0;
4446 	KASSERT(fpb != NULL);
4447 
4448 	error = pathbuf_maybe_copyin(to, seg, &tpb);
4449 	if (error)
4450 		goto out1;
4451 	KASSERT(tpb != NULL);
4452 
4453 	/*
4454 	 * Lookup from.
4455 	 *
4456 	 * XXX LOCKPARENT is wrong because we don't actually want it
4457 	 * locked yet, but (a) namei is insane, and (b) VOP_RENAME is
4458 	 * insane, so for the time being we need to leave it like this.
4459 	 */
4460 	NDINIT(&fnd, DELETE, (LOCKPARENT | TRYEMULROOT), fpb);
4461 	if ((error = fd_nameiat(l, fromfd, &fnd)) != 0)
4462 		goto out2;
4463 
4464 	/*
4465 	 * Pull out the important results of the lookup, fdvp and fvp.
4466 	 * Of course, fvp is bogus because we're about to unlock fdvp.
4467 	 */
4468 	fdvp = fnd.ni_dvp;
4469 	fvp = fnd.ni_vp;
4470 	mp = fdvp->v_mount;
4471 	KASSERT(fdvp != NULL);
4472 	KASSERT(fvp != NULL);
4473 	KASSERT(fdvp == fvp || VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
4474 	/*
4475 	 * Bracket the operation with fstrans_start()/fstrans_done().
4476 	 *
4477 	 * Inside the bracket this file system cannot be unmounted so
4478 	 * a vnode on this file system cannot change its v_mount.
4479 	 * A vnode on another file system may still change to dead mount.
4480 	 */
4481 	fstrans_start(mp);
4482 
4483 	/*
4484 	 * Make sure neither fdvp nor fvp is locked.
4485 	 */
4486 	if (fdvp != fvp)
4487 		VOP_UNLOCK(fdvp);
4488 	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4489 	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4490 
4491 	/*
4492 	 * Reject renaming `.' and `..'.  Can't do this until after
4493 	 * namei because we need namei's parsing to find the final
4494 	 * component name.  (namei should just leave us with the final
4495 	 * component name and not look it up itself, but anyway...)
4496 	 *
4497 	 * This was here before because we used to relookup from
4498 	 * instead of to and relookup requires the caller to check
4499 	 * this, but now file systems may depend on this check, so we
4500 	 * must retain it until the file systems are all rototilled.
4501 	 */
4502 	if ((fnd.ni_cnd.cn_namelen == 1 &&
4503 		fnd.ni_cnd.cn_nameptr[0] == '.') ||
4504 	    (fnd.ni_cnd.cn_namelen == 2 &&
4505 		fnd.ni_cnd.cn_nameptr[0] == '.' &&
4506 		fnd.ni_cnd.cn_nameptr[1] == '.')) {
4507 		error = EINVAL;	/* XXX EISDIR?  */
4508 		goto abort0;
4509 	}
4510 
4511 	/*
4512 	 * Lookup to.
4513 	 *
4514 	 * XXX LOCKPARENT is wrong, but...insanity, &c.  Also, using
4515 	 * fvp here to decide whether to add CREATEDIR is a load of
4516 	 * bollocks because fvp might be the wrong node by now, since
4517 	 * fdvp is unlocked.
4518 	 *
4519 	 * XXX Why not pass CREATEDIR always?
4520 	 */
4521 	NDINIT(&tnd, RENAME,
4522 	    (LOCKPARENT | NOCACHE | TRYEMULROOT |
4523 		((fvp->v_type == VDIR)? CREATEDIR : 0)),
4524 	    tpb);
4525 	if ((error = fd_nameiat(l, tofd, &tnd)) != 0)
4526 		goto abort0;
4527 
4528 	/*
4529 	 * Pull out the important results of the lookup, tdvp and tvp.
4530 	 * Of course, tvp is bogus because we're about to unlock tdvp.
4531 	 */
4532 	tdvp = tnd.ni_dvp;
4533 	tvp = tnd.ni_vp;
4534 	KASSERT(tdvp != NULL);
4535 	KASSERT(tdvp == tvp || VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4536 
4537 	if (fvp->v_type == VDIR)
4538 		tnd.ni_cnd.cn_flags |= WILLBEDIR;
4539 	/*
4540 	 * Make sure neither tdvp nor tvp is locked.
4541 	 */
4542 	if (tdvp != tvp)
4543 		VOP_UNLOCK(tdvp);
4544 	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
4545 	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
4546 
4547 	/*
4548 	 * Reject renaming onto `.' or `..'.  relookup is unhappy with
4549 	 * these, which is why we must do this here.  Once upon a time
4550 	 * we relooked up from instead of to, and consequently didn't
4551 	 * need this check, but now that we relookup to instead of
4552 	 * from, we need this; and we shall need it forever forward
4553 	 * until the VOP_RENAME protocol changes, because file systems
4554 	 * will no doubt begin to depend on this check.
4555 	 */
4556 	if (tnd.ni_cnd.cn_namelen == 1 && tnd.ni_cnd.cn_nameptr[0] == '.') {
4557 		error = EISDIR;
4558 		goto abort1;
4559 	}
4560 	if (tnd.ni_cnd.cn_namelen == 2 &&
4561 	    tnd.ni_cnd.cn_nameptr[0] == '.' &&
4562 	    tnd.ni_cnd.cn_nameptr[1] == '.') {
4563 		error = EINVAL;
4564 		goto abort1;
4565 	}
4566 
4567 	/*
4568 	 * Make sure the mount points match.  Although we don't hold
4569 	 * any vnode locks, the v_mount on fdvp file system are stable.
4570 	 *
4571 	 * Unmounting another file system at an inopportune moment may
4572 	 * cause tdvp to disappear and change its v_mount to dead.
4573 	 *
4574 	 * So in either case different v_mount means cross-device rename.
4575 	 */
4576 	KASSERT(mp != NULL);
4577 	tmp = tdvp->v_mount;
4578 
4579 	if (mp != tmp) {
4580 		error = EXDEV;
4581 		goto abort1;
4582 	}
4583 
4584 	/*
4585 	 * Take the vfs rename lock to avoid cross-directory screw cases.
4586 	 * Nothing is locked currently, so taking this lock is safe.
4587 	 */
4588 	error = VFS_RENAMELOCK_ENTER(mp);
4589 	if (error)
4590 		goto abort1;
4591 
4592 	/*
4593 	 * Now fdvp, fvp, tdvp, and (if nonnull) tvp are referenced,
4594 	 * and nothing is locked except for the vfs rename lock.
4595 	 *
4596 	 * The next step is a little rain dance to conform to the
4597 	 * insane lock protocol, even though it does nothing to ward
4598 	 * off race conditions.
4599 	 *
4600 	 * We need tdvp and tvp to be locked.  However, because we have
4601 	 * unlocked tdvp in order to hold no locks while we take the
4602 	 * vfs rename lock, tvp may be wrong here, and we can't safely
4603 	 * lock it even if the sensible file systems will just unlock
4604 	 * it straight away.  Consequently, we must lock tdvp and then
4605 	 * relookup tvp to get it locked.
4606 	 *
4607 	 * Finally, because the VOP_RENAME protocol is brain-damaged
4608 	 * and various file systems insanely depend on the semantics of
4609 	 * this brain damage, the lookup of to must be the last lookup
4610 	 * before VOP_RENAME.
4611 	 */
4612 	vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
4613 	error = relookup(tdvp, &tnd.ni_vp, &tnd.ni_cnd, 0);
4614 	if (error)
4615 		goto abort2;
4616 
4617 	/*
4618 	 * Drop the old tvp and pick up the new one -- which might be
4619 	 * the same, but that doesn't matter to us.  After this, tdvp
4620 	 * and tvp should both be locked.
4621 	 */
4622 	if (tvp != NULL)
4623 		vrele(tvp);
4624 	tvp = tnd.ni_vp;
4625 	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4626 	KASSERT(tvp == NULL || VOP_ISLOCKED(tvp) == LK_EXCLUSIVE);
4627 
4628 	/*
4629 	 * The old do_sys_rename had various consistency checks here
4630 	 * involving fvp and tvp.  fvp is bogus already here, and tvp
4631 	 * will become bogus soon in any sensible file system, so the
4632 	 * only purpose in putting these checks here is to give lip
4633 	 * service to these screw cases and to acknowledge that they
4634 	 * exist, not actually to handle them, but here you go
4635 	 * anyway...
4636 	 */
4637 
4638 	/*
4639 	 * Acknowledge that directories and non-directories aren't
4640 	 * supposed to mix.
4641 	 */
4642 	if (tvp != NULL) {
4643 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
4644 			error = ENOTDIR;
4645 			goto abort3;
4646 		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
4647 			error = EISDIR;
4648 			goto abort3;
4649 		}
4650 	}
4651 
4652 	/*
4653 	 * Acknowledge some random screw case, among the dozens that
4654 	 * might arise.
4655 	 */
4656 	if (fvp == tdvp) {
4657 		error = EINVAL;
4658 		goto abort3;
4659 	}
4660 
4661 	/*
4662 	 * Acknowledge that POSIX has a wacky screw case.
4663 	 *
4664 	 * XXX Eventually the retain flag needs to be passed on to
4665 	 * VOP_RENAME.
4666 	 */
4667 	if (fvp == tvp) {
4668 		if (retain) {
4669 			error = 0;
4670 			goto abort3;
4671 		} else if (fdvp == tdvp &&
4672 		    fnd.ni_cnd.cn_namelen == tnd.ni_cnd.cn_namelen &&
4673 		    0 == memcmp(fnd.ni_cnd.cn_nameptr, tnd.ni_cnd.cn_nameptr,
4674 			fnd.ni_cnd.cn_namelen)) {
4675 			error = 0;
4676 			goto abort3;
4677 		}
4678 	}
4679 
4680 	/*
4681 	 * Make sure veriexec can screw us up.  (But a race can screw
4682 	 * up veriexec, of course -- remember, fvp and (soon) tvp are
4683 	 * bogus.)
4684 	 */
4685 #if NVERIEXEC > 0
4686 	{
4687 		char *f1, *f2;
4688 		size_t f1_len;
4689 		size_t f2_len;
4690 
4691 		f1_len = fnd.ni_cnd.cn_namelen + 1;
4692 		f1 = kmem_alloc(f1_len, KM_SLEEP);
4693 		strlcpy(f1, fnd.ni_cnd.cn_nameptr, f1_len);
4694 
4695 		f2_len = tnd.ni_cnd.cn_namelen + 1;
4696 		f2 = kmem_alloc(f2_len, KM_SLEEP);
4697 		strlcpy(f2, tnd.ni_cnd.cn_nameptr, f2_len);
4698 
4699 		error = veriexec_renamechk(curlwp, fvp, f1, tvp, f2);
4700 
4701 		kmem_free(f1, f1_len);
4702 		kmem_free(f2, f2_len);
4703 
4704 		if (error)
4705 			goto abort3;
4706 	}
4707 #endif /* NVERIEXEC > 0 */
4708 
4709 	/*
4710 	 * All ready.  Incant the rename vop.
4711 	 */
4712 	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4713 	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4714 	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4715 	KASSERT(tvp == NULL || VOP_ISLOCKED(tvp) == LK_EXCLUSIVE);
4716 	error = VOP_RENAME(fdvp, fvp, &fnd.ni_cnd, tdvp, tvp, &tnd.ni_cnd);
4717 
4718 	/*
4719 	 * VOP_RENAME releases fdvp, fvp, tdvp, and tvp, and unlocks
4720 	 * tdvp and tvp.  But we can't assert any of that.
4721 	 */
4722 	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4723 	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4724 	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
4725 	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
4726 
4727 	/*
4728 	 * So all we have left to do is to drop the rename lock and
4729 	 * destroy the pathbufs.
4730 	 */
4731 	VFS_RENAMELOCK_EXIT(mp);
4732 	fstrans_done(mp);
4733 	goto out2;
4734 
4735 abort3:	if (tvp != NULL && tvp != tdvp)
4736 		VOP_UNLOCK(tvp);
4737 abort2:	VOP_UNLOCK(tdvp);
4738 	VFS_RENAMELOCK_EXIT(mp);
4739 abort1:	VOP_ABORTOP(tdvp, &tnd.ni_cnd);
4740 	vrele(tdvp);
4741 	if (tvp != NULL)
4742 		vrele(tvp);
4743 abort0:	VOP_ABORTOP(fdvp, &fnd.ni_cnd);
4744 	vrele(fdvp);
4745 	vrele(fvp);
4746 	fstrans_done(mp);
4747 out2:	pathbuf_destroy(tpb);
4748 out1:	pathbuf_destroy(fpb);
4749 out0:	return error;
4750 }
4751 
4752 /*
4753  * Make a directory file.
4754  */
4755 /* ARGSUSED */
4756 int
4757 sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
4758 {
4759 	/* {
4760 		syscallarg(const char *) path;
4761 		syscallarg(int) mode;
4762 	} */
4763 
4764 	return do_sys_mkdirat(l, AT_FDCWD, SCARG(uap, path),
4765 	    SCARG(uap, mode), UIO_USERSPACE);
4766 }
4767 
4768 int
4769 sys_mkdirat(struct lwp *l, const struct sys_mkdirat_args *uap,
4770     register_t *retval)
4771 {
4772 	/* {
4773 		syscallarg(int) fd;
4774 		syscallarg(const char *) path;
4775 		syscallarg(int) mode;
4776 	} */
4777 
4778 	return do_sys_mkdirat(l, SCARG(uap, fd), SCARG(uap, path),
4779 	    SCARG(uap, mode), UIO_USERSPACE);
4780 }
4781 
4782 int
4783 do_sys_mkdir(const char *path, mode_t mode, enum uio_seg seg)
4784 {
4785 
4786 	return do_sys_mkdirat(NULL, AT_FDCWD, path, mode, seg);
4787 }
4788 
4789 static int
4790 do_sys_mkdirat(struct lwp *l, int fdat, const char *path, mode_t mode,
4791     enum uio_seg seg)
4792 {
4793 	struct proc *p = curlwp->l_proc;
4794 	struct vnode *vp;
4795 	struct vattr vattr;
4796 	int error;
4797 	struct pathbuf *pb;
4798 	struct nameidata nd;
4799 
4800 	KASSERT(l != NULL || fdat == AT_FDCWD);
4801 
4802 	/* XXX bollocks, should pass in a pathbuf */
4803 	error = pathbuf_maybe_copyin(path, seg, &pb);
4804 	if (error) {
4805 		return error;
4806 	}
4807 
4808 	NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT, pb);
4809 
4810 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
4811 		pathbuf_destroy(pb);
4812 		return (error);
4813 	}
4814 	vp = nd.ni_vp;
4815 	if (vp != NULL) {
4816 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
4817 		if (nd.ni_dvp == vp)
4818 			vrele(nd.ni_dvp);
4819 		else
4820 			vput(nd.ni_dvp);
4821 		vrele(vp);
4822 		pathbuf_destroy(pb);
4823 		return (EEXIST);
4824 	}
4825 	vattr_null(&vattr);
4826 	vattr.va_type = VDIR;
4827 	/* We will read cwdi->cwdi_cmask unlocked. */
4828 	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
4829 	nd.ni_cnd.cn_flags |= WILLBEDIR;
4830 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
4831 	if (!error)
4832 		vrele(nd.ni_vp);
4833 	vput(nd.ni_dvp);
4834 	pathbuf_destroy(pb);
4835 	return (error);
4836 }
4837 
4838 /*
4839  * Remove a directory file.
4840  */
4841 /* ARGSUSED */
4842 int
4843 sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
4844 {
4845 	/* {
4846 		syscallarg(char *) path;
4847 	} */
4848 
4849 	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path), AT_REMOVEDIR,
4850 	    UIO_USERSPACE);
4851 }
4852 
4853 /*
4854  * Read a block of directory entries in a file system independent format.
4855  */
4856 int
4857 sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap,
4858     register_t *retval)
4859 {
4860 	/* {
4861 		syscallarg(int) fd;
4862 		syscallarg(char *) buf;
4863 		syscallarg(size_t) count;
4864 	} */
4865 	file_t *fp;
4866 	int error, done;
4867 
4868 	/* fd_getvnode() will use the descriptor for us */
4869 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4870 		return (error);
4871 	if ((fp->f_flag & FREAD) == 0) {
4872 		error = EBADF;
4873 		goto out;
4874 	}
4875 	error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
4876 	    SCARG(uap, count), &done, l, 0, 0);
4877 	ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
4878 	*retval = done;
4879 out:
4880 	fd_putfile(SCARG(uap, fd));
4881 	return (error);
4882 }
4883 
4884 /*
4885  * Set the mode mask for creation of filesystem nodes.
4886  */
4887 int
4888 sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
4889 {
4890 	/* {
4891 		syscallarg(mode_t) newmask;
4892 	} */
4893 
4894 	/*
4895 	 * cwdi->cwdi_cmask will be read unlocked elsewhere, and no kind of
4896 	 * serialization with those reads is required.  It's important to
4897 	 * return a coherent answer for the caller of umask() though, and
4898 	 * the atomic operation accomplishes that.
4899 	 */
4900 	*retval = atomic_swap_uint(&curproc->p_cwdi->cwdi_cmask,
4901 	    SCARG(uap, newmask) & ALLPERMS);
4902 
4903 	return (0);
4904 }
4905 
4906 int
4907 dorevoke(struct vnode *vp, kauth_cred_t cred)
4908 {
4909 	struct vattr vattr;
4910 	int error, fs_decision;
4911 
4912 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4913 	error = VOP_GETATTR(vp, &vattr, cred);
4914 	VOP_UNLOCK(vp);
4915 	if (error != 0)
4916 		return error;
4917 	fs_decision = (kauth_cred_geteuid(cred) == vattr.va_uid) ? 0 : EPERM;
4918 	error = kauth_authorize_vnode(cred, KAUTH_VNODE_REVOKE, vp, NULL,
4919 	    fs_decision);
4920 	if (!error)
4921 		VOP_REVOKE(vp, REVOKEALL);
4922 	return (error);
4923 }
4924 
4925 /*
4926  * Void all references to file by ripping underlying filesystem
4927  * away from vnode.
4928  */
4929 /* ARGSUSED */
4930 int
4931 sys_revoke(struct lwp *l, const struct sys_revoke_args *uap,
4932     register_t *retval)
4933 {
4934 	/* {
4935 		syscallarg(const char *) path;
4936 	} */
4937 	struct vnode *vp;
4938 	int error;
4939 
4940 	error = namei_simple_user(SCARG(uap, path), NSM_FOLLOW_TRYEMULROOT,
4941 	    &vp);
4942 	if (error != 0)
4943 		return (error);
4944 	error = dorevoke(vp, l->l_cred);
4945 	vrele(vp);
4946 	return (error);
4947 }
4948 
4949 /*
4950  * Allocate backing store for a file, filling a hole without having to
4951  * explicitly write anything out.
4952  */
4953 /* ARGSUSED */
4954 int
4955 sys_posix_fallocate(struct lwp *l, const struct sys_posix_fallocate_args *uap,
4956     register_t *retval)
4957 {
4958 	/* {
4959 		syscallarg(int) fd;
4960 		syscallarg(off_t) pos;
4961 		syscallarg(off_t) len;
4962 	} */
4963 	int fd;
4964 	off_t pos, len;
4965 	struct file *fp;
4966 	struct vnode *vp;
4967 	int error;
4968 
4969 	fd = SCARG(uap, fd);
4970 	pos = SCARG(uap, pos);
4971 	len = SCARG(uap, len);
4972 
4973 	if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
4974 		*retval = EINVAL;
4975 		return 0;
4976 	}
4977 
4978 	error = fd_getvnode(fd, &fp);
4979 	if (error) {
4980 		*retval = error;
4981 		return 0;
4982 	}
4983 	if ((fp->f_flag & FWRITE) == 0) {
4984 		error = EBADF;
4985 		goto fail;
4986 	}
4987 	vp = fp->f_vnode;
4988 
4989 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4990 	if (vp->v_type == VDIR) {
4991 		error = EISDIR;
4992 	} else {
4993 		error = VOP_FALLOCATE(vp, pos, len);
4994 	}
4995 	VOP_UNLOCK(vp);
4996 
4997 fail:
4998 	fd_putfile(fd);
4999 	*retval = error;
5000 	return 0;
5001 }
5002 
5003 /*
5004  * Deallocate backing store for a file, creating a hole. Also used for
5005  * invoking TRIM on disks.
5006  */
5007 /* ARGSUSED */
5008 int
5009 sys_fdiscard(struct lwp *l, const struct sys_fdiscard_args *uap,
5010     register_t *retval)
5011 {
5012 	/* {
5013 		syscallarg(int) fd;
5014 		syscallarg(off_t) pos;
5015 		syscallarg(off_t) len;
5016 	} */
5017 	int fd;
5018 	off_t pos, len;
5019 	struct file *fp;
5020 	struct vnode *vp;
5021 	int error;
5022 
5023 	fd = SCARG(uap, fd);
5024 	pos = SCARG(uap, pos);
5025 	len = SCARG(uap, len);
5026 
5027 	if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
5028 		return EINVAL;
5029 	}
5030 
5031 	error = fd_getvnode(fd, &fp);
5032 	if (error) {
5033 		return error;
5034 	}
5035 	if ((fp->f_flag & FWRITE) == 0) {
5036 		error = EBADF;
5037 		goto fail;
5038 	}
5039 	vp = fp->f_vnode;
5040 
5041 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
5042 	if (vp->v_type == VDIR) {
5043 		error = EISDIR;
5044 	} else {
5045 		error = VOP_FDISCARD(vp, pos, len);
5046 	}
5047 	VOP_UNLOCK(vp);
5048 
5049 fail:
5050 	fd_putfile(fd);
5051 	return error;
5052 }
5053