xref: /netbsd-src/sys/kern/vfs_syscalls.c (revision 122b5006ee1bd67145794b4cde92f4fe4781a5ec)
1 /*	$NetBSD: vfs_syscalls.c,v 1.553 2021/09/26 21:29:38 thorpej Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1989, 1993
34  *	The Regents of the University of California.  All rights reserved.
35  * (c) UNIX System Laboratories, Inc.
36  * All or some portions of this file are derived from material licensed
37  * to the University of California by American Telephone and Telegraph
38  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39  * the permission of UNIX System Laboratories, Inc.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.42 (Berkeley) 7/31/95
66  */
67 
68 /*
69  * Virtual File System System Calls
70  */
71 
72 #include <sys/cdefs.h>
73 __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.553 2021/09/26 21:29:38 thorpej Exp $");
74 
75 #ifdef _KERNEL_OPT
76 #include "opt_fileassoc.h"
77 #include "veriexec.h"
78 #endif
79 
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/namei.h>
83 #include <sys/filedesc.h>
84 #include <sys/kernel.h>
85 #include <sys/file.h>
86 #include <sys/fcntl.h>
87 #include <sys/stat.h>
88 #include <sys/vnode.h>
89 #include <sys/mount.h>
90 #include <sys/fstrans.h>
91 #include <sys/proc.h>
92 #include <sys/uio.h>
93 #include <sys/kmem.h>
94 #include <sys/dirent.h>
95 #include <sys/sysctl.h>
96 #include <sys/syscallargs.h>
97 #include <sys/vfs_syscalls.h>
98 #include <sys/quota.h>
99 #include <sys/quotactl.h>
100 #include <sys/ktrace.h>
101 #ifdef FILEASSOC
102 #include <sys/fileassoc.h>
103 #endif /* FILEASSOC */
104 #include <sys/extattr.h>
105 #include <sys/verified_exec.h>
106 #include <sys/kauth.h>
107 #include <sys/atomic.h>
108 #include <sys/module.h>
109 #include <sys/buf.h>
110 #include <sys/event.h>
111 #include <sys/compat_stub.h>
112 
113 #include <miscfs/genfs/genfs.h>
114 #include <miscfs/specfs/specdev.h>
115 
116 #include <nfs/rpcv2.h>
117 #include <nfs/nfsproto.h>
118 #include <nfs/nfs.h>
119 #include <nfs/nfs_var.h>
120 
121 /* XXX this shouldn't be here */
122 #ifndef OFF_T_MAX
123 #define OFF_T_MAX __type_max(off_t)
124 #endif
125 
126 static int change_flags(struct vnode *, u_long, struct lwp *);
127 static int change_mode(struct vnode *, int, struct lwp *);
128 static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
129 static int do_sys_openat(lwp_t *, int, const char *, int, int, int *);
130 static int do_sys_mkdirat(struct lwp *l, int, const char *, mode_t,
131     enum uio_seg);
132 static int do_sys_mkfifoat(struct lwp *, int, const char *, mode_t);
133 static int do_sys_symlinkat(struct lwp *, const char *, int, const char *,
134     enum uio_seg);
135 static int do_sys_renameat(struct lwp *l, int, const char *, int, const char *,
136     enum uio_seg, int);
137 static int do_sys_readlinkat(struct lwp *, int, const char *, char *,
138     size_t, register_t *);
139 static int do_sys_unlinkat(struct lwp *, int, const char *, int, enum uio_seg);
140 
141 static int fd_nameiat(struct lwp *, int, struct nameidata *);
142 static int fd_nameiat_simple_user(struct lwp *, int, const char *,
143     namei_simple_flags_t, struct vnode **);
144 
145 /*
146  * This table is used to maintain compatibility with 4.3BSD
147  * and NetBSD 0.9 mount syscalls - and possibly other systems.
148  * Note, the order is important!
149  *
150  * Do not modify this table. It should only contain filesystems
151  * supported by NetBSD 0.9 and 4.3BSD.
152  */
/* Indexed by the historic numeric filesystem type; NULL = no mapping. */
const char * const mountcompatnames[] = {
	NULL,		/* 0 = MOUNT_NONE */
	MOUNT_FFS,	/* 1 = MOUNT_UFS */
	MOUNT_NFS,	/* 2 */
	MOUNT_MFS,	/* 3 */
	MOUNT_MSDOS,	/* 4 */
	MOUNT_CD9660,	/* 5 = MOUNT_ISOFS */
	MOUNT_FDESC,	/* 6 */
	MOUNT_KERNFS,	/* 7 */
	NULL,		/* 8 = MOUNT_DEVFS */
	MOUNT_AFS,	/* 9 */
};

/* Number of slots in mountcompatnames[]; used to bounds-check an index. */
const u_int nmountcompatnames = __arraycount(mountcompatnames);
167 
168 /*
169  * Filter event method for EVFILT_FS.
170  */
/* Knotes attached through fs_filtops; manipulated under fs_klist_lock. */
static struct klist fs_klist = SLIST_HEAD_INITIALIZER(&fs_klist);
kmutex_t fs_klist_lock;

/*
 * The hint passed to filt_fs() carries NOTE_SUBMIT OR-ed together with
 * a VQ_* event bit, so the two must never share a bit.
 */
CTASSERT((NOTE_SUBMIT & VQ_MOUNT) == 0);
CTASSERT((NOTE_SUBMIT & VQ_UNMOUNT) == 0);
176 
177 static int
178 filt_fsattach(struct knote *kn)
179 {
180 	mutex_enter(&fs_klist_lock);
181 	kn->kn_flags |= EV_CLEAR;
182 	SLIST_INSERT_HEAD(&fs_klist, kn, kn_selnext);
183 	mutex_exit(&fs_klist_lock);
184 
185 	return 0;
186 }
187 
188 static void
189 filt_fsdetach(struct knote *kn)
190 {
191 	mutex_enter(&fs_klist_lock);
192 	SLIST_REMOVE(&fs_klist, kn, knote, kn_selnext);
193 	mutex_exit(&fs_klist_lock);
194 }
195 
196 static int
197 filt_fs(struct knote *kn, long hint)
198 {
199 	int rv;
200 
201 	if (hint & NOTE_SUBMIT) {
202 		KASSERT(mutex_owned(&fs_klist_lock));
203 		kn->kn_fflags |= hint & ~NOTE_SUBMIT;
204 	} else {
205 		mutex_enter(&fs_klist_lock);
206 	}
207 
208 	rv = (kn->kn_fflags != 0);
209 
210 	if ((hint & NOTE_SUBMIT) == 0) {
211 		mutex_exit(&fs_klist_lock);
212 	}
213 
214 	return rv;
215 }
216 
/*
 * Filter operations for filesystem (mount/unmount) events, wiring up the
 * filt_fs* methods defined above.
 */
/* referenced in kern_event.c */
const struct filterops fs_filtops = {
	.f_flags = FILTEROP_MPSAFE,
	.f_attach = filt_fsattach,
	.f_detach = filt_fsdetach,
	.f_event = filt_fs,
};
224 
225 static int
226 fd_nameiat(struct lwp *l, int fdat, struct nameidata *ndp)
227 {
228 	file_t *dfp;
229 	int error;
230 
231 	if (fdat != AT_FDCWD) {
232 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
233 			goto out;
234 
235 		NDAT(ndp, dfp->f_vnode);
236 	}
237 
238 	error = namei(ndp);
239 
240 	if (fdat != AT_FDCWD)
241 		fd_putfile(fdat);
242 out:
243 	return error;
244 }
245 
246 static int
247 fd_nameiat_simple_user(struct lwp *l, int fdat, const char *path,
248     namei_simple_flags_t sflags, struct vnode **vp_ret)
249 {
250 	file_t *dfp;
251 	struct vnode *dvp;
252 	int error;
253 
254 	if (fdat != AT_FDCWD) {
255 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
256 			goto out;
257 
258 		dvp = dfp->f_vnode;
259 	} else {
260 		dvp = NULL;
261 	}
262 
263 	error = nameiat_simple_user(dvp, path, sflags, vp_ret);
264 
265 	if (fdat != AT_FDCWD)
266 		fd_putfile(fdat);
267 out:
268 	return error;
269 }
270 
271 static int
272 open_setfp(struct lwp *l, file_t *fp, struct vnode *vp, int indx, int flags)
273 {
274 	int error;
275 
276 	fp->f_flag = flags & FMASK;
277 	fp->f_type = DTYPE_VNODE;
278 	fp->f_ops = &vnops;
279 	fp->f_vnode = vp;
280 
281 	if (flags & (O_EXLOCK | O_SHLOCK)) {
282 		struct flock lf;
283 		int type;
284 
285 		lf.l_whence = SEEK_SET;
286 		lf.l_start = 0;
287 		lf.l_len = 0;
288 		if (flags & O_EXLOCK)
289 			lf.l_type = F_WRLCK;
290 		else
291 			lf.l_type = F_RDLCK;
292 		type = F_FLOCK;
293 		if ((flags & FNONBLOCK) == 0)
294 			type |= F_WAIT;
295 		VOP_UNLOCK(vp);
296 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
297 		if (error) {
298 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
299 			fd_abort(l->l_proc, fp, indx);
300 			return error;
301 		}
302 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
303 		atomic_or_uint(&fp->f_flag, FHASLOCK);
304 	}
305 	if (flags & O_CLOEXEC)
306 		fd_set_exclose(l, indx, true);
307 	return 0;
308 }
309 
310 static int
311 mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
312     void *data, size_t *data_len)
313 {
314 	struct mount *mp;
315 	int error = 0, saved_flags;
316 
317 	mp = vp->v_mount;
318 	saved_flags = mp->mnt_flag;
319 
320 	/* We can operate only on VV_ROOT nodes. */
321 	if ((vp->v_vflag & VV_ROOT) == 0) {
322 		error = EINVAL;
323 		goto out;
324 	}
325 
326 	/*
327 	 * We only allow the filesystem to be reloaded if it
328 	 * is currently mounted read-only.  Additionally, we
329 	 * prevent read-write to read-only downgrades.
330 	 */
331 	if ((flags & (MNT_RELOAD | MNT_RDONLY)) != 0 &&
332 	    (mp->mnt_flag & MNT_RDONLY) == 0 &&
333 	    (mp->mnt_iflag & IMNT_CAN_RWTORO) == 0) {
334 		error = EOPNOTSUPP;	/* Needs translation */
335 		goto out;
336 	}
337 
338 	/*
339 	 * Enabling MNT_UNION requires a covered mountpoint and
340 	 * must not happen on the root mount.
341 	 */
342 	if ((flags & MNT_UNION) != 0 && mp->mnt_vnodecovered == NULLVP) {
343 		error = EOPNOTSUPP;
344 		goto out;
345 	}
346 
347 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
348 	    KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
349 	if (error)
350 		goto out;
351 
352 	error = vfs_suspend(mp, 0);
353 	if (error)
354 		goto out;
355 
356 	mutex_enter(mp->mnt_updating);
357 
358 	mp->mnt_flag &= ~MNT_OP_FLAGS;
359 	mp->mnt_flag |= flags & MNT_OP_FLAGS;
360 
361 	/*
362 	 * Set the mount level flags.
363 	 */
364 	if ((flags & MNT_RDONLY) != (mp->mnt_flag & MNT_RDONLY)) {
365 		if ((flags & MNT_RDONLY))
366 			mp->mnt_iflag |= IMNT_WANTRDONLY;
367 		else
368 			mp->mnt_iflag |= IMNT_WANTRDWR;
369 	}
370 	mp->mnt_flag &= ~MNT_BASIC_FLAGS;
371 	mp->mnt_flag |= flags & MNT_BASIC_FLAGS;
372 	if ((mp->mnt_iflag & IMNT_WANTRDONLY))
373 		mp->mnt_flag &= ~MNT_RDONLY;
374 
375 	error = VFS_MOUNT(mp, path, data, data_len);
376 
377 	if (error && data != NULL) {
378 		int error2;
379 
380 		/*
381 		 * Update failed; let's try and see if it was an
382 		 * export request.  For compat with 3.0 and earlier.
383 		 */
384 		error2 = vfs_hooks_reexport(mp, path, data);
385 
386 		/*
387 		 * Only update error code if the export request was
388 		 * understood but some problem occurred while
389 		 * processing it.
390 		 */
391 		if (error2 != EJUSTRETURN)
392 			error = error2;
393 	}
394 
395 	if (error == 0 && (mp->mnt_iflag & IMNT_WANTRDONLY))
396 		mp->mnt_flag |= MNT_RDONLY;
397 	if (error)
398 		mp->mnt_flag = saved_flags;
399 	mp->mnt_flag &= ~MNT_OP_FLAGS;
400 	mp->mnt_iflag &= ~(IMNT_WANTRDONLY | IMNT_WANTRDWR);
401 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
402 		if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0)
403 			vfs_syncer_add_to_worklist(mp);
404 	} else {
405 		if ((mp->mnt_iflag & IMNT_ONWORKLIST) != 0)
406 			vfs_syncer_remove_from_worklist(mp);
407 	}
408 	mutex_exit(mp->mnt_updating);
409 	vfs_resume(mp);
410 
411 	if ((error == 0) && !(saved_flags & MNT_EXTATTR) &&
412 	    (flags & MNT_EXTATTR)) {
413 		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_START,
414 				   NULL, 0, NULL) != 0) {
415 			printf("%s: failed to start extattr, error = %d",
416 			       mp->mnt_stat.f_mntonname, error);
417 			mp->mnt_flag &= ~MNT_EXTATTR;
418 		}
419 	}
420 
421 	if ((error == 0) && (saved_flags & MNT_EXTATTR) &&
422 	    !(flags & MNT_EXTATTR)) {
423 		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_STOP,
424 				   NULL, 0, NULL) != 0) {
425 			printf("%s: failed to stop extattr, error = %d",
426 			       mp->mnt_stat.f_mntonname, error);
427 			mp->mnt_flag |= MNT_RDONLY;
428 		}
429 	}
430  out:
431 	return (error);
432 }
433 
434 static int
435 mount_get_vfsops(const char *fstype, enum uio_seg type_seg,
436     struct vfsops **vfsops)
437 {
438 	char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
439 	int error;
440 
441 	if (type_seg == UIO_USERSPACE) {
442 		/* Copy file-system type from userspace.  */
443 		error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL);
444 	} else {
445 		error = copystr(fstype, fstypename, sizeof(fstypename), NULL);
446 		KASSERT(error == 0);
447 	}
448 
449 	if (error) {
450 		/*
451 		 * Historically, filesystem types were identified by numbers.
452 		 * If we get an integer for the filesystem type instead of a
453 		 * string, we check to see if it matches one of the historic
454 		 * filesystem types.
455 		 */
456 		u_long fsindex = (u_long)fstype;
457 		if (fsindex >= nmountcompatnames ||
458 		    mountcompatnames[fsindex] == NULL)
459 			return ENODEV;
460 		strlcpy(fstypename, mountcompatnames[fsindex],
461 		    sizeof(fstypename));
462 	}
463 
464 	/* Accept `ufs' as an alias for `ffs', for compatibility. */
465 	if (strcmp(fstypename, "ufs") == 0)
466 		fstypename[0] = 'f';
467 
468 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
469 		return 0;
470 
471 	/* If we can autoload a vfs module, try again */
472 	(void)module_autoload(fstypename, MODULE_CLASS_VFS);
473 
474 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
475 		return 0;
476 
477 	return ENODEV;
478 }
479 
480 static int
481 mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
482     void *data, size_t *data_len)
483 {
484 	struct mount *mp;
485 	int error;
486 
487 	/* If MNT_GETARGS is specified, it should be the only flag. */
488 	if (flags & ~MNT_GETARGS)
489 		return EINVAL;
490 
491 	mp = vp->v_mount;
492 
493 	/* XXX: probably some notion of "can see" here if we want isolation. */
494 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
495 	    KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
496 	if (error)
497 		return error;
498 
499 	if ((vp->v_vflag & VV_ROOT) == 0)
500 		return EINVAL;
501 
502 	if (vfs_busy(mp))
503 		return EPERM;
504 
505 	mutex_enter(mp->mnt_updating);
506 	mp->mnt_flag &= ~MNT_OP_FLAGS;
507 	mp->mnt_flag |= MNT_GETARGS;
508 	error = VFS_MOUNT(mp, path, data, data_len);
509 	mp->mnt_flag &= ~MNT_OP_FLAGS;
510 	mutex_exit(mp->mnt_updating);
511 
512 	vfs_unbusy(mp);
513 	return (error);
514 }
515 
516 int
517 sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval)
518 {
519 	/* {
520 		syscallarg(const char *) type;
521 		syscallarg(const char *) path;
522 		syscallarg(int) flags;
523 		syscallarg(void *) data;
524 		syscallarg(size_t) data_len;
525 	} */
526 
527 	return do_sys_mount(l, SCARG(uap, type), UIO_USERSPACE, SCARG(uap, path),
528 	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE,
529 	    SCARG(uap, data_len), retval);
530 }
531 
532 int
533 do_sys_mount(struct lwp *l, const char *type, enum uio_seg type_seg,
534     const char *path, int flags, void *data, enum uio_seg data_seg,
535     size_t data_len, register_t *retval)
536 {
537 	struct vfsops *vfsops = NULL;	/* XXX gcc4.8 */
538 	struct vnode *vp;
539 	void *data_buf = data;
540 	bool vfsopsrele = false;
541 	size_t alloc_sz = 0;
542 	int error;
543 
544 	/*
545 	 * Get vnode to be covered
546 	 */
547 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
548 	if (error != 0) {
549 		vp = NULL;
550 		goto done;
551 	}
552 
553 	if (flags & (MNT_GETARGS | MNT_UPDATE)) {
554 		vfsops = vp->v_mount->mnt_op;
555 	} else {
556 		/* 'type' is userspace */
557 		error = mount_get_vfsops(type, type_seg, &vfsops);
558 		if (error != 0)
559 			goto done;
560 		vfsopsrele = true;
561 	}
562 
563 	/*
564 	 * We allow data to be NULL, even for userspace. Some fs's don't need
565 	 * it. The others will handle NULL.
566 	 */
567 	if (data != NULL && data_seg == UIO_USERSPACE) {
568 		if (data_len == 0) {
569 			/* No length supplied, use default for filesystem */
570 			data_len = vfsops->vfs_min_mount_data;
571 
572 			/*
573 			 * Hopefully a longer buffer won't make copyin() fail.
574 			 * For compatibility with 3.0 and earlier.
575 			 */
576 			if (flags & MNT_UPDATE
577 			    && data_len < sizeof (struct mnt_export_args30))
578 				data_len = sizeof (struct mnt_export_args30);
579 		}
580 		if ((data_len == 0) || (data_len > VFS_MAX_MOUNT_DATA)) {
581 			error = EINVAL;
582 			goto done;
583 		}
584 		alloc_sz = data_len;
585 		data_buf = kmem_alloc(alloc_sz, KM_SLEEP);
586 
587 		/* NFS needs the buffer even for mnt_getargs .... */
588 		error = copyin(data, data_buf, data_len);
589 		if (error != 0)
590 			goto done;
591 	}
592 
593 	if (flags & MNT_GETARGS) {
594 		if (data_len == 0) {
595 			error = EINVAL;
596 			goto done;
597 		}
598 		error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
599 		if (error != 0)
600 			goto done;
601 		if (data_seg == UIO_USERSPACE)
602 			error = copyout(data_buf, data, data_len);
603 		*retval = data_len;
604 	} else if (flags & MNT_UPDATE) {
605 		error = mount_update(l, vp, path, flags, data_buf, &data_len);
606 	} else {
607 		/* Locking is handled internally in mount_domount(). */
608 		KASSERT(vfsopsrele == true);
609 		error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
610 		    &data_len);
611 		vfsopsrele = false;
612 	}
613 	if (!error) {
614 		mutex_enter(&fs_klist_lock);
615 		KNOTE(&fs_klist, NOTE_SUBMIT | VQ_MOUNT);
616 		mutex_exit(&fs_klist_lock);
617 	}
618 
619     done:
620 	if (vfsopsrele)
621 		vfs_delref(vfsops);
622     	if (vp != NULL) {
623 	    	vrele(vp);
624 	}
625 	if (data_buf != data)
626 		kmem_free(data_buf, alloc_sz);
627 	return (error);
628 }
629 
630 /*
631  * Unmount a file system.
632  *
633  * Note: unmount takes a path to the vnode mounted on as argument,
634  * not special file (as before).
635  */
636 /* ARGSUSED */
637 int
638 sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval)
639 {
640 	/* {
641 		syscallarg(const char *) path;
642 		syscallarg(int) flags;
643 	} */
644 	struct vnode *vp;
645 	struct mount *mp;
646 	int error;
647 	struct pathbuf *pb;
648 	struct nameidata nd;
649 
650 	error = pathbuf_copyin(SCARG(uap, path), &pb);
651 	if (error) {
652 		return error;
653 	}
654 
655 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb);
656 	if ((error = namei(&nd)) != 0) {
657 		pathbuf_destroy(pb);
658 		return error;
659 	}
660 	vp = nd.ni_vp;
661 	pathbuf_destroy(pb);
662 
663 	mp = vp->v_mount;
664 	vfs_ref(mp);
665 	VOP_UNLOCK(vp);
666 
667 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
668 	    KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
669 	if (error) {
670 		vrele(vp);
671 		vfs_rele(mp);
672 		return (error);
673 	}
674 
675 	/*
676 	 * Don't allow unmounting the root file system.
677 	 */
678 	if (mp->mnt_flag & MNT_ROOTFS) {
679 		vrele(vp);
680 		vfs_rele(mp);
681 		return (EINVAL);
682 	}
683 
684 	/*
685 	 * Must be the root of the filesystem
686 	 */
687 	if ((vp->v_vflag & VV_ROOT) == 0) {
688 		vrele(vp);
689 		vfs_rele(mp);
690 		return (EINVAL);
691 	}
692 
693 	vrele(vp);
694 	error = dounmount(mp, SCARG(uap, flags), l);
695 	vfs_rele(mp);
696 	if (!error) {
697 		mutex_enter(&fs_klist_lock);
698 		KNOTE(&fs_klist, NOTE_SUBMIT | VQ_UNMOUNT);
699 		mutex_exit(&fs_klist_lock);
700 	}
701 	return error;
702 }
703 
704 /*
705  * Sync each mounted filesystem.
706  */
#ifdef DEBUG
/* When nonzero, do_sys_sync() calls vfs_bufstats() after syncing. */
int syncprt = 0;
struct ctldebug debug0 = { "syncprt", &syncprt };
#endif
711 
712 void
713 do_sys_sync(struct lwp *l)
714 {
715 	mount_iterator_t *iter;
716 	struct mount *mp;
717 	int asyncflag;
718 
719 	mountlist_iterator_init(&iter);
720 	while ((mp = mountlist_iterator_next(iter)) != NULL) {
721 		mutex_enter(mp->mnt_updating);
722 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
723 			asyncflag = mp->mnt_flag & MNT_ASYNC;
724 			mp->mnt_flag &= ~MNT_ASYNC;
725 			VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
726 			if (asyncflag)
727 				 mp->mnt_flag |= MNT_ASYNC;
728 		}
729 		mutex_exit(mp->mnt_updating);
730 	}
731 	mountlist_iterator_destroy(iter);
732 #ifdef DEBUG
733 	if (syncprt)
734 		vfs_bufstats();
735 #endif /* DEBUG */
736 }
737 
738 static bool
739 sync_vnode_filter(void *cookie, vnode_t *vp)
740 {
741 
742 	if (vp->v_numoutput > 0) {
743 		++*(int *)cookie;
744 	}
745 	return false;
746 }
747 
748 int
749 vfs_syncwait(void)
750 {
751 	int nbusy, nbusy_prev, iter;
752 	struct vnode_iterator *vniter;
753 	mount_iterator_t *mpiter;
754 	struct mount *mp;
755 
756 	for (nbusy_prev = 0, iter = 0; iter < 20;) {
757 		nbusy = 0;
758 		mountlist_iterator_init(&mpiter);
759 		while ((mp = mountlist_iterator_next(mpiter)) != NULL) {
760 			vnode_t *vp __diagused;
761 			vfs_vnode_iterator_init(mp, &vniter);
762 			vp = vfs_vnode_iterator_next(vniter,
763 			    sync_vnode_filter, &nbusy);
764 			KASSERT(vp == NULL);
765 			vfs_vnode_iterator_destroy(vniter);
766 		}
767 		mountlist_iterator_destroy(mpiter);
768 
769 		if (nbusy == 0)
770 			break;
771 		if (nbusy_prev == 0)
772 			nbusy_prev = nbusy;
773 		printf("%d ", nbusy);
774 		kpause("syncwait", false, MAX(1, hz / 25 * iter), NULL);
775 		if (nbusy >= nbusy_prev) /* we didn't flush anything */
776 			iter++;
777 		else
778 			nbusy_prev = nbusy;
779 	}
780 
781 	if (nbusy) {
782 #if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
783 		printf("giving up\nPrinting vnodes for busy buffers\n");
784 		mountlist_iterator_init(&mpiter);
785 		while ((mp = mountlist_iterator_next(mpiter)) != NULL) {
786 			vnode_t *vp;
787 			vfs_vnode_iterator_init(mp, &vniter);
788 			vp = vfs_vnode_iterator_next(vniter,
789 			    NULL, NULL);
790 			mutex_enter(vp->v_interlock);
791 			if (vp->v_numoutput > 0)
792 				vprint(NULL, vp);
793 			mutex_exit(vp->v_interlock);
794 			vrele(vp);
795 			vfs_vnode_iterator_destroy(vniter);
796 		}
797 		mountlist_iterator_destroy(mpiter);
798 #endif
799 	}
800 
801 	return nbusy;
802 }
803 
804 /* ARGSUSED */
805 int
806 sys_sync(struct lwp *l, const void *v, register_t *retval)
807 {
808 	do_sys_sync(l);
809 	return (0);
810 }
811 
812 
813 /*
814  * Access or change filesystem quotas.
815  *
816  * (this is really 14 different calls bundled into one)
817  */
818 
819 static int
820 do_sys_quotactl_stat(struct mount *mp, struct quotastat *info_u)
821 {
822 	struct quotastat info_k;
823 	int error;
824 
825 	/* ensure any padding bytes are cleared */
826 	memset(&info_k, 0, sizeof(info_k));
827 
828 	error = vfs_quotactl_stat(mp, &info_k);
829 	if (error) {
830 		return error;
831 	}
832 
833 	return copyout(&info_k, info_u, sizeof(info_k));
834 }
835 
836 static int
837 do_sys_quotactl_idtypestat(struct mount *mp, int idtype,
838     struct quotaidtypestat *info_u)
839 {
840 	struct quotaidtypestat info_k;
841 	int error;
842 
843 	/* ensure any padding bytes are cleared */
844 	memset(&info_k, 0, sizeof(info_k));
845 
846 	error = vfs_quotactl_idtypestat(mp, idtype, &info_k);
847 	if (error) {
848 		return error;
849 	}
850 
851 	return copyout(&info_k, info_u, sizeof(info_k));
852 }
853 
854 static int
855 do_sys_quotactl_objtypestat(struct mount *mp, int objtype,
856     struct quotaobjtypestat *info_u)
857 {
858 	struct quotaobjtypestat info_k;
859 	int error;
860 
861 	/* ensure any padding bytes are cleared */
862 	memset(&info_k, 0, sizeof(info_k));
863 
864 	error = vfs_quotactl_objtypestat(mp, objtype, &info_k);
865 	if (error) {
866 		return error;
867 	}
868 
869 	return copyout(&info_k, info_u, sizeof(info_k));
870 }
871 
872 static int
873 do_sys_quotactl_get(struct mount *mp, const struct quotakey *key_u,
874     struct quotaval *val_u)
875 {
876 	struct quotakey key_k;
877 	struct quotaval val_k;
878 	int error;
879 
880 	/* ensure any padding bytes are cleared */
881 	memset(&val_k, 0, sizeof(val_k));
882 
883 	error = copyin(key_u, &key_k, sizeof(key_k));
884 	if (error) {
885 		return error;
886 	}
887 
888 	error = vfs_quotactl_get(mp, &key_k, &val_k);
889 	if (error) {
890 		return error;
891 	}
892 
893 	return copyout(&val_k, val_u, sizeof(val_k));
894 }
895 
896 static int
897 do_sys_quotactl_put(struct mount *mp, const struct quotakey *key_u,
898     const struct quotaval *val_u)
899 {
900 	struct quotakey key_k;
901 	struct quotaval val_k;
902 	int error;
903 
904 	error = copyin(key_u, &key_k, sizeof(key_k));
905 	if (error) {
906 		return error;
907 	}
908 
909 	error = copyin(val_u, &val_k, sizeof(val_k));
910 	if (error) {
911 		return error;
912 	}
913 
914 	return vfs_quotactl_put(mp, &key_k, &val_k);
915 }
916 
917 static int
918 do_sys_quotactl_del(struct mount *mp, const struct quotakey *key_u)
919 {
920 	struct quotakey key_k;
921 	int error;
922 
923 	error = copyin(key_u, &key_k, sizeof(key_k));
924 	if (error) {
925 		return error;
926 	}
927 
928 	return vfs_quotactl_del(mp, &key_k);
929 }
930 
931 static int
932 do_sys_quotactl_cursoropen(struct mount *mp, struct quotakcursor *cursor_u)
933 {
934 	struct quotakcursor cursor_k;
935 	int error;
936 
937 	/* ensure any padding bytes are cleared */
938 	memset(&cursor_k, 0, sizeof(cursor_k));
939 
940 	error = vfs_quotactl_cursoropen(mp, &cursor_k);
941 	if (error) {
942 		return error;
943 	}
944 
945 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
946 }
947 
948 static int
949 do_sys_quotactl_cursorclose(struct mount *mp, struct quotakcursor *cursor_u)
950 {
951 	struct quotakcursor cursor_k;
952 	int error;
953 
954 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
955 	if (error) {
956 		return error;
957 	}
958 
959 	return vfs_quotactl_cursorclose(mp, &cursor_k);
960 }
961 
962 static int
963 do_sys_quotactl_cursorskipidtype(struct mount *mp,
964     struct quotakcursor *cursor_u, int idtype)
965 {
966 	struct quotakcursor cursor_k;
967 	int error;
968 
969 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
970 	if (error) {
971 		return error;
972 	}
973 
974 	error = vfs_quotactl_cursorskipidtype(mp, &cursor_k, idtype);
975 	if (error) {
976 		return error;
977 	}
978 
979 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
980 }
981 
982 static int
983 do_sys_quotactl_cursorget(struct mount *mp, struct quotakcursor *cursor_u,
984     struct quotakey *keys_u, struct quotaval *vals_u, unsigned maxnum,
985     unsigned *ret_u)
986 {
987 #define CGET_STACK_MAX 8
988 	struct quotakcursor cursor_k;
989 	struct quotakey stackkeys[CGET_STACK_MAX];
990 	struct quotaval stackvals[CGET_STACK_MAX];
991 	struct quotakey *keys_k;
992 	struct quotaval *vals_k;
993 	unsigned ret_k;
994 	int error;
995 
996 	if (maxnum > 128) {
997 		maxnum = 128;
998 	}
999 
1000 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1001 	if (error) {
1002 		return error;
1003 	}
1004 
1005 	if (maxnum <= CGET_STACK_MAX) {
1006 		keys_k = stackkeys;
1007 		vals_k = stackvals;
1008 		/* ensure any padding bytes are cleared */
1009 		memset(keys_k, 0, maxnum * sizeof(keys_k[0]));
1010 		memset(vals_k, 0, maxnum * sizeof(vals_k[0]));
1011 	} else {
1012 		keys_k = kmem_zalloc(maxnum * sizeof(keys_k[0]), KM_SLEEP);
1013 		vals_k = kmem_zalloc(maxnum * sizeof(vals_k[0]), KM_SLEEP);
1014 	}
1015 
1016 	error = vfs_quotactl_cursorget(mp, &cursor_k, keys_k, vals_k, maxnum,
1017 				       &ret_k);
1018 	if (error) {
1019 		goto fail;
1020 	}
1021 
1022 	error = copyout(keys_k, keys_u, ret_k * sizeof(keys_k[0]));
1023 	if (error) {
1024 		goto fail;
1025 	}
1026 
1027 	error = copyout(vals_k, vals_u, ret_k * sizeof(vals_k[0]));
1028 	if (error) {
1029 		goto fail;
1030 	}
1031 
1032 	error = copyout(&ret_k, ret_u, sizeof(ret_k));
1033 	if (error) {
1034 		goto fail;
1035 	}
1036 
1037 	/* do last to maximize the chance of being able to recover a failure */
1038 	error = copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1039 
1040 fail:
1041 	if (keys_k != stackkeys) {
1042 		kmem_free(keys_k, maxnum * sizeof(keys_k[0]));
1043 	}
1044 	if (vals_k != stackvals) {
1045 		kmem_free(vals_k, maxnum * sizeof(vals_k[0]));
1046 	}
1047 	return error;
1048 }
1049 
1050 static int
1051 do_sys_quotactl_cursoratend(struct mount *mp, struct quotakcursor *cursor_u,
1052     int *ret_u)
1053 {
1054 	struct quotakcursor cursor_k;
1055 	int ret_k;
1056 	int error;
1057 
1058 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1059 	if (error) {
1060 		return error;
1061 	}
1062 
1063 	error = vfs_quotactl_cursoratend(mp, &cursor_k, &ret_k);
1064 	if (error) {
1065 		return error;
1066 	}
1067 
1068 	error = copyout(&ret_k, ret_u, sizeof(ret_k));
1069 	if (error) {
1070 		return error;
1071 	}
1072 
1073 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1074 }
1075 
1076 static int
1077 do_sys_quotactl_cursorrewind(struct mount *mp, struct quotakcursor *cursor_u)
1078 {
1079 	struct quotakcursor cursor_k;
1080 	int error;
1081 
1082 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1083 	if (error) {
1084 		return error;
1085 	}
1086 
1087 	error = vfs_quotactl_cursorrewind(mp, &cursor_k);
1088 	if (error) {
1089 		return error;
1090 	}
1091 
1092 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1093 }
1094 
1095 static int
1096 do_sys_quotactl_quotaon(struct mount *mp, int idtype, const char *path_u)
1097 {
1098 	char *path_k;
1099 	int error;
1100 
1101 	/* XXX this should probably be a struct pathbuf */
1102 	path_k = PNBUF_GET();
1103 	error = copyin(path_u, path_k, PATH_MAX);
1104 	if (error) {
1105 		PNBUF_PUT(path_k);
1106 		return error;
1107 	}
1108 
1109 	error = vfs_quotactl_quotaon(mp, idtype, path_k);
1110 
1111 	PNBUF_PUT(path_k);
1112 	return error;
1113 }
1114 
/*
 * QUOTACTL_QUOTAOFF: no user-space data is involved; forward directly
 * to vfs_quotactl_quotaoff().
 */
static int
do_sys_quotactl_quotaoff(struct mount *mp, int idtype)
{

	return vfs_quotactl_quotaoff(mp, idtype);
}
1120 
1121 int
1122 do_sys_quotactl(const char *path_u, const struct quotactl_args *args)
1123 {
1124 	struct mount *mp;
1125 	struct vnode *vp;
1126 	int error;
1127 
1128 	error = namei_simple_user(path_u, NSM_FOLLOW_TRYEMULROOT, &vp);
1129 	if (error != 0)
1130 		return (error);
1131 	mp = vp->v_mount;
1132 
1133 	switch (args->qc_op) {
1134 	    case QUOTACTL_STAT:
1135 		error = do_sys_quotactl_stat(mp, args->u.stat.qc_info);
1136 		break;
1137 	    case QUOTACTL_IDTYPESTAT:
1138 		error = do_sys_quotactl_idtypestat(mp,
1139 				args->u.idtypestat.qc_idtype,
1140 				args->u.idtypestat.qc_info);
1141 		break;
1142 	    case QUOTACTL_OBJTYPESTAT:
1143 		error = do_sys_quotactl_objtypestat(mp,
1144 				args->u.objtypestat.qc_objtype,
1145 				args->u.objtypestat.qc_info);
1146 		break;
1147 	    case QUOTACTL_GET:
1148 		error = do_sys_quotactl_get(mp,
1149 				args->u.get.qc_key,
1150 				args->u.get.qc_val);
1151 		break;
1152 	    case QUOTACTL_PUT:
1153 		error = do_sys_quotactl_put(mp,
1154 				args->u.put.qc_key,
1155 				args->u.put.qc_val);
1156 		break;
1157 	    case QUOTACTL_DEL:
1158 		error = do_sys_quotactl_del(mp, args->u.del.qc_key);
1159 		break;
1160 	    case QUOTACTL_CURSOROPEN:
1161 		error = do_sys_quotactl_cursoropen(mp,
1162 				args->u.cursoropen.qc_cursor);
1163 		break;
1164 	    case QUOTACTL_CURSORCLOSE:
1165 		error = do_sys_quotactl_cursorclose(mp,
1166 				args->u.cursorclose.qc_cursor);
1167 		break;
1168 	    case QUOTACTL_CURSORSKIPIDTYPE:
1169 		error = do_sys_quotactl_cursorskipidtype(mp,
1170 				args->u.cursorskipidtype.qc_cursor,
1171 				args->u.cursorskipidtype.qc_idtype);
1172 		break;
1173 	    case QUOTACTL_CURSORGET:
1174 		error = do_sys_quotactl_cursorget(mp,
1175 				args->u.cursorget.qc_cursor,
1176 				args->u.cursorget.qc_keys,
1177 				args->u.cursorget.qc_vals,
1178 				args->u.cursorget.qc_maxnum,
1179 				args->u.cursorget.qc_ret);
1180 		break;
1181 	    case QUOTACTL_CURSORATEND:
1182 		error = do_sys_quotactl_cursoratend(mp,
1183 				args->u.cursoratend.qc_cursor,
1184 				args->u.cursoratend.qc_ret);
1185 		break;
1186 	    case QUOTACTL_CURSORREWIND:
1187 		error = do_sys_quotactl_cursorrewind(mp,
1188 				args->u.cursorrewind.qc_cursor);
1189 		break;
1190 	    case QUOTACTL_QUOTAON:
1191 		error = do_sys_quotactl_quotaon(mp,
1192 				args->u.quotaon.qc_idtype,
1193 				args->u.quotaon.qc_quotafile);
1194 		break;
1195 	    case QUOTACTL_QUOTAOFF:
1196 		error = do_sys_quotactl_quotaoff(mp,
1197 				args->u.quotaoff.qc_idtype);
1198 		break;
1199 	    default:
1200 		error = EINVAL;
1201 		break;
1202 	}
1203 
1204 	vrele(vp);
1205 	return error;
1206 }
1207 
1208 /* ARGSUSED */
1209 int
1210 sys___quotactl(struct lwp *l, const struct sys___quotactl_args *uap,
1211     register_t *retval)
1212 {
1213 	/* {
1214 		syscallarg(const char *) path;
1215 		syscallarg(struct quotactl_args *) args;
1216 	} */
1217 	struct quotactl_args args;
1218 	int error;
1219 
1220 	error = copyin(SCARG(uap, args), &args, sizeof(args));
1221 	if (error) {
1222 		return error;
1223 	}
1224 
1225 	return do_sys_quotactl(SCARG(uap, path), &args);
1226 }
1227 
1228 int
1229 dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
1230     int root)
1231 {
1232 	struct cwdinfo *cwdi = l->l_proc->p_cwdi;
1233 	bool chrooted;
1234 	int error = 0;
1235 
1236 	KASSERT(l == curlwp);
1237 
1238 	/*
1239 	 * This is safe unlocked.  cwdi_rdir never goes non-NULL -> NULL,
1240 	 * since it would imply chroots can be escaped.  Just make sure this
1241 	 * routine is self-consistent.
1242 	 */
1243 	chrooted = (atomic_load_relaxed(&cwdi->cwdi_rdir) != NULL);
1244 
1245 	/*
1246 	 * If MNT_NOWAIT or MNT_LAZY is specified, do not
1247 	 * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
1248 	 * overrides MNT_NOWAIT.
1249 	 */
1250 	if (flags == MNT_NOWAIT	|| flags == MNT_LAZY ||
1251 	    (flags != MNT_WAIT && flags != 0)) {
1252 		memcpy(sp, &mp->mnt_stat, sizeof(*sp));
1253 	} else {
1254 		/* Get the filesystem stats now */
1255 		memset(sp, 0, sizeof(*sp));
1256 		if ((error = VFS_STATVFS(mp, sp)) != 0)
1257 			return error;
1258 		if (!chrooted)
1259 			(void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
1260 	}
1261 
1262 	if (chrooted) {
1263 		size_t len;
1264 		char *bp;
1265 		char c;
1266 		char *path = PNBUF_GET();
1267 
1268 		bp = path + MAXPATHLEN;
1269 		*--bp = '\0';
1270 		rw_enter(&cwdi->cwdi_lock, RW_READER);
1271 		error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
1272 		    MAXPATHLEN / 2, 0, l);
1273 		rw_exit(&cwdi->cwdi_lock);
1274 		if (error) {
1275 			PNBUF_PUT(path);
1276 			return error;
1277 		}
1278 		len = strlen(bp);
1279 		if (len != 1) {
1280 			/*
1281 			 * for mount points that are below our root, we can see
1282 			 * them, so we fix up the pathname and return them. The
1283 			 * rest we cannot see, so we don't allow viewing the
1284 			 * data.
1285 			 */
1286 			if (strncmp(bp, sp->f_mntonname, len) == 0 &&
1287 			    ((c = sp->f_mntonname[len]) == '/' || c == '\0')) {
1288 				(void)strlcpy(sp->f_mntonname,
1289 				    c == '\0' ? "/" : &sp->f_mntonname[len],
1290 				    sizeof(sp->f_mntonname));
1291 			} else {
1292 				if (root)
1293 					(void)strlcpy(sp->f_mntonname, "/",
1294 					    sizeof(sp->f_mntonname));
1295 				else
1296 					error = EPERM;
1297 			}
1298 		}
1299 		PNBUF_PUT(path);
1300 	}
1301 	sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
1302 	return error;
1303 }
1304 
1305 /*
1306  * Get filesystem statistics by path.
1307  */
1308 int
1309 do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
1310 {
1311 	struct mount *mp;
1312 	int error;
1313 	struct vnode *vp;
1314 
1315 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
1316 	if (error != 0)
1317 		return error;
1318 	mp = vp->v_mount;
1319 	error = dostatvfs(mp, sb, l, flags, 1);
1320 	vrele(vp);
1321 	return error;
1322 }
1323 
1324 /* ARGSUSED */
1325 int
1326 sys___statvfs190(struct lwp *l, const struct sys___statvfs190_args *uap, register_t *retval)
1327 {
1328 	/* {
1329 		syscallarg(const char *) path;
1330 		syscallarg(struct statvfs *) buf;
1331 		syscallarg(int) flags;
1332 	} */
1333 	struct statvfs *sb;
1334 	int error;
1335 
1336 	sb = STATVFSBUF_GET();
1337 	error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
1338 	if (error == 0)
1339 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1340 	STATVFSBUF_PUT(sb);
1341 	return error;
1342 }
1343 
1344 /*
1345  * Get filesystem statistics by fd.
1346  */
1347 int
1348 do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
1349 {
1350 	file_t *fp;
1351 	struct mount *mp;
1352 	int error;
1353 
1354 	/* fd_getvnode() will use the descriptor for us */
1355 	if ((error = fd_getvnode(fd, &fp)) != 0)
1356 		return (error);
1357 	mp = fp->f_vnode->v_mount;
1358 	error = dostatvfs(mp, sb, curlwp, flags, 1);
1359 	fd_putfile(fd);
1360 	return error;
1361 }
1362 
1363 /* ARGSUSED */
1364 int
1365 sys___fstatvfs190(struct lwp *l, const struct sys___fstatvfs190_args *uap, register_t *retval)
1366 {
1367 	/* {
1368 		syscallarg(int) fd;
1369 		syscallarg(struct statvfs *) buf;
1370 		syscallarg(int) flags;
1371 	} */
1372 	struct statvfs *sb;
1373 	int error;
1374 
1375 	sb = STATVFSBUF_GET();
1376 	error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
1377 	if (error == 0)
1378 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1379 	STATVFSBUF_PUT(sb);
1380 	return error;
1381 }
1382 
1383 
1384 /*
1385  * Get statistics on all filesystems.
1386  */
1387 int
1388 do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
1389     int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
1390     register_t *retval)
1391 {
1392 	int root = 0;
1393 	mount_iterator_t *iter;
1394 	struct proc *p = l->l_proc;
1395 	struct mount *mp;
1396 	struct statvfs *sb;
1397 	size_t count, maxcount;
1398 	int error = 0;
1399 
1400 	sb = STATVFSBUF_GET();
1401 	maxcount = bufsize / entry_sz;
1402 	count = 0;
1403 	mountlist_iterator_init(&iter);
1404 	while ((mp = mountlist_iterator_next(iter)) != NULL) {
1405 		if (sfsp && count < maxcount) {
1406 			error = dostatvfs(mp, sb, l, flags, 0);
1407 			if (error) {
1408 				error = 0;
1409 				continue;
1410 			}
1411 			error = copyfn(sb, sfsp, entry_sz);
1412 			if (error)
1413 				goto out;
1414 			sfsp = (char *)sfsp + entry_sz;
1415 			root |= strcmp(sb->f_mntonname, "/") == 0;
1416 		}
1417 		count++;
1418 	}
1419 
1420 	if (root == 0 && p->p_cwdi->cwdi_rdir) {
1421 		/*
1422 		 * fake a root entry
1423 		 */
1424 		error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
1425 		    sb, l, flags, 1);
1426 		if (error != 0)
1427 			goto out;
1428 		if (sfsp) {
1429 			error = copyfn(sb, sfsp, entry_sz);
1430 			if (error != 0)
1431 				goto out;
1432 		}
1433 		count++;
1434 	}
1435 	if (sfsp && count > maxcount)
1436 		*retval = maxcount;
1437 	else
1438 		*retval = count;
1439 out:
1440 	mountlist_iterator_destroy(iter);
1441 	STATVFSBUF_PUT(sb);
1442 	return error;
1443 }
1444 
1445 int
1446 sys___getvfsstat90(struct lwp *l, const struct sys___getvfsstat90_args *uap,
1447     register_t *retval)
1448 {
1449 	/* {
1450 		syscallarg(struct statvfs *) buf;
1451 		syscallarg(size_t) bufsize;
1452 		syscallarg(int) flags;
1453 	} */
1454 
1455 	return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
1456 	    SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
1457 }
1458 
1459 /*
1460  * Change current working directory to a given file descriptor.
1461  */
1462 /* ARGSUSED */
1463 int
1464 sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval)
1465 {
1466 	/* {
1467 		syscallarg(int) fd;
1468 	} */
1469 	struct proc *p = l->l_proc;
1470 	struct cwdinfo *cwdi;
1471 	struct vnode *vp, *tdp;
1472 	struct mount *mp;
1473 	file_t *fp;
1474 	int error, fd;
1475 
1476 	/* fd_getvnode() will use the descriptor for us */
1477 	fd = SCARG(uap, fd);
1478 	if ((error = fd_getvnode(fd, &fp)) != 0)
1479 		return (error);
1480 	vp = fp->f_vnode;
1481 
1482 	vref(vp);
1483 	vn_lock(vp, LK_SHARED | LK_RETRY);
1484 	if (vp->v_type != VDIR)
1485 		error = ENOTDIR;
1486 	else
1487 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1488 	if (error) {
1489 		vput(vp);
1490 		goto out;
1491 	}
1492 	while ((mp = vp->v_mountedhere) != NULL) {
1493 		error = vfs_busy(mp);
1494 		vput(vp);
1495 		if (error != 0)
1496 			goto out;
1497 		error = VFS_ROOT(mp, LK_SHARED, &tdp);
1498 		vfs_unbusy(mp);
1499 		if (error)
1500 			goto out;
1501 		vp = tdp;
1502 	}
1503 	VOP_UNLOCK(vp);
1504 
1505 	/*
1506 	 * Disallow changing to a directory not under the process's
1507 	 * current root directory (if there is one).
1508 	 */
1509 	cwdi = p->p_cwdi;
1510 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1511 	if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
1512 		vrele(vp);
1513 		error = EPERM;	/* operation not permitted */
1514 	} else {
1515 		vrele(cwdi->cwdi_cdir);
1516 		cwdi->cwdi_cdir = vp;
1517 	}
1518 	rw_exit(&cwdi->cwdi_lock);
1519 
1520  out:
1521 	fd_putfile(fd);
1522 	return (error);
1523 }
1524 
1525 /*
1526  * Change this process's notion of the root directory to a given file
1527  * descriptor.
1528  */
1529 int
1530 sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval)
1531 {
1532 	struct vnode	*vp;
1533 	file_t	*fp;
1534 	int		 error, fd = SCARG(uap, fd);
1535 
1536 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1537  	    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
1538 		return error;
1539 	/* fd_getvnode() will use the descriptor for us */
1540 	if ((error = fd_getvnode(fd, &fp)) != 0)
1541 		return error;
1542 	vp = fp->f_vnode;
1543 	vn_lock(vp, LK_SHARED | LK_RETRY);
1544 	if (vp->v_type != VDIR)
1545 		error = ENOTDIR;
1546 	else
1547 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1548 	VOP_UNLOCK(vp);
1549 	if (error)
1550 		goto out;
1551 	vref(vp);
1552 	change_root(vp);
1553 
1554  out:
1555 	fd_putfile(fd);
1556 	return (error);
1557 }
1558 
1559 /*
1560  * Change current working directory (``.'').
1561  */
1562 /* ARGSUSED */
1563 int
1564 sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
1565 {
1566 	/* {
1567 		syscallarg(const char *) path;
1568 	} */
1569 	struct proc *p = l->l_proc;
1570 	struct cwdinfo *cwdi;
1571 	int error;
1572 	struct vnode *vp;
1573 
1574 	if ((error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE,
1575 				  &vp, l)) != 0)
1576 		return (error);
1577 	cwdi = p->p_cwdi;
1578 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1579 	vrele(cwdi->cwdi_cdir);
1580 	cwdi->cwdi_cdir = vp;
1581 	rw_exit(&cwdi->cwdi_lock);
1582 	return (0);
1583 }
1584 
1585 /*
1586  * Change notion of root (``/'') directory.
1587  */
1588 /* ARGSUSED */
1589 int
1590 sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval)
1591 {
1592 	/* {
1593 		syscallarg(const char *) path;
1594 	} */
1595 	int error;
1596 	struct vnode *vp;
1597 
1598 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1599 	    KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
1600 		return (error);
1601 
1602 	error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE, &vp, l);
1603 	if (error == 0)
1604 		change_root(vp);
1605 	return error;
1606 }
1607 
1608 /*
1609  * Common routine for chroot and fchroot.
1610  * NB: callers need to properly authorize the change root operation.
1611  */
1612 void
1613 change_root(struct vnode *vp)
1614 {
1615 	kauth_cred_t ncred;
1616 	struct lwp *l = curlwp;
1617 	struct proc *p = l->l_proc;
1618 	struct cwdinfo *cwdi = p->p_cwdi;
1619 
1620 	ncred = kauth_cred_alloc();
1621 
1622 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1623 	if (cwdi->cwdi_rdir != NULL)
1624 		vrele(cwdi->cwdi_rdir);
1625 	cwdi->cwdi_rdir = vp;
1626 
1627 	/*
1628 	 * Prevent escaping from chroot by putting the root under
1629 	 * the working directory.  Silently chdir to / if we aren't
1630 	 * already there.
1631 	 */
1632 	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1633 		/*
1634 		 * XXX would be more failsafe to change directory to a
1635 		 * deadfs node here instead
1636 		 */
1637 		vrele(cwdi->cwdi_cdir);
1638 		vref(vp);
1639 		cwdi->cwdi_cdir = vp;
1640 	}
1641 	rw_exit(&cwdi->cwdi_lock);
1642 
1643 	/* Get a write lock on the process credential. */
1644 	proc_crmod_enter();
1645 
1646 	kauth_cred_clone(p->p_cred, ncred);
1647 	kauth_proc_chroot(ncred, p->p_cwdi);
1648 
1649 	/* Broadcast our credentials to the process and other LWPs. */
1650  	proc_crmod_leave(ncred, p->p_cred, true);
1651 }
1652 
1653 /*
1654  * Common routine for chroot and chdir.
1655  * XXX "where" should be enum uio_seg
1656  */
1657 int
1658 chdir_lookup(const char *path, int where, struct vnode **vpp, struct lwp *l)
1659 {
1660 	struct pathbuf *pb;
1661 	struct nameidata nd;
1662 	int error;
1663 
1664 	error = pathbuf_maybe_copyin(path, where, &pb);
1665 	if (error) {
1666 		return error;
1667 	}
1668 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb);
1669 	if ((error = namei(&nd)) != 0) {
1670 		pathbuf_destroy(pb);
1671 		return error;
1672 	}
1673 	*vpp = nd.ni_vp;
1674 	pathbuf_destroy(pb);
1675 
1676 	if ((*vpp)->v_type != VDIR)
1677 		error = ENOTDIR;
1678 	else
1679 		error = VOP_ACCESS(*vpp, VEXEC, l->l_cred);
1680 
1681 	if (error)
1682 		vput(*vpp);
1683 	else
1684 		VOP_UNLOCK(*vpp);
1685 	return (error);
1686 }
1687 
1688 /*
1689  * Internals of sys_open - path has already been converted into a pathbuf
1690  * (so we can easily reuse this function from other parts of the kernel,
1691  * like posix_spawn post-processing).
1692  */
1693 int
1694 do_open(lwp_t *l, struct vnode *dvp, struct pathbuf *pb, int open_flags,
1695 	int open_mode, int *fd)
1696 {
1697 	struct proc *p = l->l_proc;
1698 	struct cwdinfo *cwdi = p->p_cwdi;
1699 	file_t *fp;
1700 	struct vnode *vp;
1701 	int dupfd;
1702 	bool dupfd_move;
1703 	int flags, cmode;
1704 	int indx, error;
1705 
1706 	if (open_flags & O_SEARCH) {
1707 		open_flags &= ~(int)O_SEARCH;
1708 	}
1709 
1710 	/*
1711 	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
1712 	 * may be specified.
1713 	 */
1714 	if ((open_flags & O_EXEC) && (open_flags & O_ACCMODE))
1715 		return EINVAL;
1716 
1717 	flags = FFLAGS(open_flags);
1718 	if ((flags & (FREAD | FWRITE)) == 0)
1719 		return EINVAL;
1720 
1721 	if ((error = fd_allocfile(&fp, &indx)) != 0) {
1722 		return error;
1723 	}
1724 
1725 	/* We're going to read cwdi->cwdi_cmask unlocked here. */
1726 	cmode = ((open_mode &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1727 
1728 	error = vn_open(dvp, pb, TRYEMULROOT, flags, cmode,
1729 	    &vp, &dupfd_move, &dupfd);
1730 	if (error != 0) {
1731 		fd_abort(p, fp, indx);
1732 		if (error == ERESTART)
1733 			error = EINTR;
1734 		return error;
1735 	}
1736 
1737 	if (vp == NULL) {
1738 		fd_abort(p, fp, indx);
1739 		error = fd_dupopen(dupfd, dupfd_move, flags, &indx);
1740 		if (error)
1741 			return error;
1742 		*fd = indx;
1743 	} else {
1744 		error = open_setfp(l, fp, vp, indx, flags);
1745 		if (error)
1746 			return error;
1747 		VOP_UNLOCK(vp);
1748 		*fd = indx;
1749 		fd_affix(p, fp, indx);
1750 	}
1751 
1752 	return 0;
1753 }
1754 
1755 int
1756 fd_open(const char *path, int open_flags, int open_mode, int *fd)
1757 {
1758 	struct pathbuf *pb;
1759 	int error, oflags;
1760 
1761 	oflags = FFLAGS(open_flags);
1762 	if ((oflags & (FREAD | FWRITE)) == 0)
1763 		return EINVAL;
1764 
1765 	pb = pathbuf_create(path);
1766 	if (pb == NULL)
1767 		return ENOMEM;
1768 
1769 	error = do_open(curlwp, NULL, pb, open_flags, open_mode, fd);
1770 	pathbuf_destroy(pb);
1771 
1772 	return error;
1773 }
1774 
1775 static int
1776 do_sys_openat(lwp_t *l, int fdat, const char *path, int flags,
1777     int mode, int *fd)
1778 {
1779 	file_t *dfp = NULL;
1780 	struct vnode *dvp = NULL;
1781 	struct pathbuf *pb;
1782 	const char *pathstring = NULL;
1783 	int error;
1784 
1785 	if (path == NULL) {
1786 		MODULE_HOOK_CALL(vfs_openat_10_hook, (&pb), enosys(), error);
1787 		if (error == ENOSYS)
1788 			goto no_compat;
1789 		if (error)
1790 			return error;
1791 	} else {
1792 no_compat:
1793 		error = pathbuf_copyin(path, &pb);
1794 		if (error)
1795 			return error;
1796 	}
1797 
1798 	pathstring = pathbuf_stringcopy_get(pb);
1799 
1800 	/*
1801 	 * fdat is ignored if:
1802 	 * 1) if fdat is AT_FDCWD, which means use current directory as base.
1803 	 * 2) if path is absolute, then fdat is useless.
1804 	 */
1805 	if (fdat != AT_FDCWD && pathstring[0] != '/') {
1806 		/* fd_getvnode() will use the descriptor for us */
1807 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
1808 			goto out;
1809 
1810 		dvp = dfp->f_vnode;
1811 	}
1812 
1813 	error = do_open(l, dvp, pb, flags, mode, fd);
1814 
1815 	if (dfp != NULL)
1816 		fd_putfile(fdat);
1817 out:
1818 	pathbuf_stringcopy_put(pb, pathstring);
1819 	pathbuf_destroy(pb);
1820 	return error;
1821 }
1822 
1823 int
1824 sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1825 {
1826 	/* {
1827 		syscallarg(const char *) path;
1828 		syscallarg(int) flags;
1829 		syscallarg(int) mode;
1830 	} */
1831 	int error;
1832 	int fd;
1833 
1834 	error = do_sys_openat(l, AT_FDCWD, SCARG(uap, path),
1835 			      SCARG(uap, flags), SCARG(uap, mode), &fd);
1836 
1837 	if (error == 0)
1838 		*retval = fd;
1839 
1840 	return error;
1841 }
1842 
1843 int
1844 sys_openat(struct lwp *l, const struct sys_openat_args *uap, register_t *retval)
1845 {
1846 	/* {
1847 		syscallarg(int) fd;
1848 		syscallarg(const char *) path;
1849 		syscallarg(int) oflags;
1850 		syscallarg(int) mode;
1851 	} */
1852 	int error;
1853 	int fd;
1854 
1855 	error = do_sys_openat(l, SCARG(uap, fd), SCARG(uap, path),
1856 			      SCARG(uap, oflags), SCARG(uap, mode), &fd);
1857 
1858 	if (error == 0)
1859 		*retval = fd;
1860 
1861 	return error;
1862 }
1863 
1864 static void
1865 vfs__fhfree(fhandle_t *fhp)
1866 {
1867 	size_t fhsize;
1868 
1869 	fhsize = FHANDLE_SIZE(fhp);
1870 	kmem_free(fhp, fhsize);
1871 }
1872 
1873 /*
1874  * vfs_composefh: compose a filehandle.
1875  */
1876 
1877 int
1878 vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1879 {
1880 	struct mount *mp;
1881 	struct fid *fidp;
1882 	int error;
1883 	size_t needfhsize;
1884 	size_t fidsize;
1885 
1886 	mp = vp->v_mount;
1887 	fidp = NULL;
1888 	if (*fh_size < FHANDLE_SIZE_MIN) {
1889 		fidsize = 0;
1890 	} else {
1891 		fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1892 		if (fhp != NULL) {
1893 			memset(fhp, 0, *fh_size);
1894 			fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1895 			fidp = &fhp->fh_fid;
1896 		}
1897 	}
1898 	error = VFS_VPTOFH(vp, fidp, &fidsize);
1899 	needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1900 	if (error == 0 && *fh_size < needfhsize) {
1901 		error = E2BIG;
1902 	}
1903 	*fh_size = needfhsize;
1904 	return error;
1905 }
1906 
1907 int
1908 vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1909 {
1910 	struct mount *mp;
1911 	fhandle_t *fhp;
1912 	size_t fhsize;
1913 	size_t fidsize;
1914 	int error;
1915 
1916 	mp = vp->v_mount;
1917 	fidsize = 0;
1918 	error = VFS_VPTOFH(vp, NULL, &fidsize);
1919 	KASSERT(error != 0);
1920 	if (error != E2BIG) {
1921 		goto out;
1922 	}
1923 	fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1924 	fhp = kmem_zalloc(fhsize, KM_SLEEP);
1925 	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1926 	error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1927 	if (error == 0) {
1928 		KASSERT((FHANDLE_SIZE(fhp) == fhsize &&
1929 		    FHANDLE_FILEID(fhp)->fid_len == fidsize));
1930 		*fhpp = fhp;
1931 	} else {
1932 		kmem_free(fhp, fhsize);
1933 	}
1934 out:
1935 	return error;
1936 }
1937 
1938 void
1939 vfs_composefh_free(fhandle_t *fhp)
1940 {
1941 
1942 	vfs__fhfree(fhp);
1943 }
1944 
1945 /*
1946  * vfs_fhtovp: lookup a vnode by a filehandle.
1947  */
1948 
1949 int
1950 vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
1951 {
1952 	struct mount *mp;
1953 	int error;
1954 
1955 	*vpp = NULL;
1956 	mp = vfs_getvfs(FHANDLE_FSID(fhp));
1957 	if (mp == NULL) {
1958 		error = ESTALE;
1959 		goto out;
1960 	}
1961 	if (mp->mnt_op->vfs_fhtovp == NULL) {
1962 		error = EOPNOTSUPP;
1963 		goto out;
1964 	}
1965 	error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), LK_EXCLUSIVE, vpp);
1966 out:
1967 	return error;
1968 }
1969 
1970 /*
1971  * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
1972  * the needed size.
1973  */
1974 
1975 int
1976 vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
1977 {
1978 	fhandle_t *fhp;
1979 	int error;
1980 
1981 	if (fhsize > FHANDLE_SIZE_MAX) {
1982 		return EINVAL;
1983 	}
1984 	if (fhsize < FHANDLE_SIZE_MIN) {
1985 		return EINVAL;
1986 	}
1987 again:
1988 	fhp = kmem_alloc(fhsize, KM_SLEEP);
1989 	error = copyin(ufhp, fhp, fhsize);
1990 	if (error == 0) {
1991 		/* XXX this check shouldn't be here */
1992 		if (FHANDLE_SIZE(fhp) == fhsize) {
1993 			*fhpp = fhp;
1994 			return 0;
1995 		} else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
1996 			/*
1997 			 * a kludge for nfsv2 padded handles.
1998 			 */
1999 			size_t sz;
2000 
2001 			sz = FHANDLE_SIZE(fhp);
2002 			kmem_free(fhp, fhsize);
2003 			fhsize = sz;
2004 			goto again;
2005 		} else {
2006 			/*
2007 			 * userland told us wrong size.
2008 			 */
2009 		    	error = EINVAL;
2010 		}
2011 	}
2012 	kmem_free(fhp, fhsize);
2013 	return error;
2014 }
2015 
2016 void
2017 vfs_copyinfh_free(fhandle_t *fhp)
2018 {
2019 
2020 	vfs__fhfree(fhp);
2021 }
2022 
2023 /*
2024  * Get file handle system call
2025  */
2026 int
2027 sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval)
2028 {
2029 	/* {
2030 		syscallarg(char *) fname;
2031 		syscallarg(fhandle_t *) fhp;
2032 		syscallarg(size_t *) fh_size;
2033 	} */
2034 	struct vnode *vp;
2035 	fhandle_t *fh;
2036 	int error;
2037 	struct pathbuf *pb;
2038 	struct nameidata nd;
2039 	size_t sz;
2040 	size_t usz;
2041 
2042 	/*
2043 	 * Must be super user
2044 	 */
2045 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2046 	    0, NULL, NULL, NULL);
2047 	if (error)
2048 		return (error);
2049 
2050 	error = pathbuf_copyin(SCARG(uap, fname), &pb);
2051 	if (error) {
2052 		return error;
2053 	}
2054 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
2055 	error = namei(&nd);
2056 	if (error) {
2057 		pathbuf_destroy(pb);
2058 		return error;
2059 	}
2060 	vp = nd.ni_vp;
2061 	pathbuf_destroy(pb);
2062 
2063 	error = vfs_composefh_alloc(vp, &fh);
2064 	vput(vp);
2065 	if (error != 0) {
2066 		return error;
2067 	}
2068 	error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
2069 	if (error != 0) {
2070 		goto out;
2071 	}
2072 	sz = FHANDLE_SIZE(fh);
2073 	error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
2074 	if (error != 0) {
2075 		goto out;
2076 	}
2077 	if (usz >= sz) {
2078 		error = copyout(fh, SCARG(uap, fhp), sz);
2079 	} else {
2080 		error = E2BIG;
2081 	}
2082 out:
2083 	vfs_composefh_free(fh);
2084 	return (error);
2085 }
2086 
2087 /*
2088  * Open a file given a file handle.
2089  *
2090  * Check permissions, allocate an open file structure,
2091  * and call the device open routine if any.
2092  */
2093 
2094 int
2095 dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
2096     register_t *retval)
2097 {
2098 	file_t *fp;
2099 	struct vnode *vp = NULL;
2100 	kauth_cred_t cred = l->l_cred;
2101 	file_t *nfp;
2102 	int indx, error;
2103 	struct vattr va;
2104 	fhandle_t *fh;
2105 	int flags;
2106 	proc_t *p;
2107 
2108 	p = curproc;
2109 
2110 	/*
2111 	 * Must be super user
2112 	 */
2113 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2114 	    0, NULL, NULL, NULL)))
2115 		return (error);
2116 
2117 	if (oflags & O_SEARCH) {
2118 		oflags &= ~(int)O_SEARCH;
2119 	}
2120 
2121 	flags = FFLAGS(oflags);
2122 	if ((flags & (FREAD | FWRITE)) == 0)
2123 		return (EINVAL);
2124 	if ((flags & O_CREAT))
2125 		return (EINVAL);
2126 	if ((error = fd_allocfile(&nfp, &indx)) != 0)
2127 		return (error);
2128 	fp = nfp;
2129 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2130 	if (error != 0) {
2131 		goto bad;
2132 	}
2133 	error = vfs_fhtovp(fh, &vp);
2134 	vfs_copyinfh_free(fh);
2135 	if (error != 0) {
2136 		goto bad;
2137 	}
2138 
2139 	/* Now do an effective vn_open */
2140 
2141 	if (vp->v_type == VSOCK) {
2142 		error = EOPNOTSUPP;
2143 		goto bad;
2144 	}
2145 	error = vn_openchk(vp, cred, flags);
2146 	if (error != 0)
2147 		goto bad;
2148 	if (flags & O_TRUNC) {
2149 		VOP_UNLOCK(vp);			/* XXX */
2150 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
2151 		vattr_null(&va);
2152 		va.va_size = 0;
2153 		error = VOP_SETATTR(vp, &va, cred);
2154 		if (error)
2155 			goto bad;
2156 	}
2157 	if ((error = VOP_OPEN(vp, flags, cred)) != 0)
2158 		goto bad;
2159 	if (flags & FWRITE) {
2160 		mutex_enter(vp->v_interlock);
2161 		vp->v_writecount++;
2162 		mutex_exit(vp->v_interlock);
2163 	}
2164 
2165 	/* done with modified vn_open, now finish what sys_open does. */
2166 	if ((error = open_setfp(l, fp, vp, indx, flags)))
2167 		return error;
2168 
2169 	VOP_UNLOCK(vp);
2170 	*retval = indx;
2171 	fd_affix(p, fp, indx);
2172 	return (0);
2173 
2174 bad:
2175 	fd_abort(p, fp, indx);
2176 	if (vp != NULL)
2177 		vput(vp);
2178 	if (error == EDUPFD || error == EMOVEFD) {
2179 		/* XXX should probably close curlwp->l_dupfd */
2180 		error = EOPNOTSUPP;
2181 	}
2182 	return (error);
2183 }
2184 
2185 int
2186 sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval)
2187 {
2188 	/* {
2189 		syscallarg(const void *) fhp;
2190 		syscallarg(size_t) fh_size;
2191 		syscallarg(int) flags;
2192 	} */
2193 
2194 	return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
2195 	    SCARG(uap, flags), retval);
2196 }
2197 
2198 int
2199 do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
2200 {
2201 	int error;
2202 	fhandle_t *fh;
2203 	struct vnode *vp;
2204 
2205 	/*
2206 	 * Must be super user
2207 	 */
2208 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2209 	    0, NULL, NULL, NULL)))
2210 		return (error);
2211 
2212 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2213 	if (error != 0)
2214 		return error;
2215 
2216 	error = vfs_fhtovp(fh, &vp);
2217 	vfs_copyinfh_free(fh);
2218 	if (error != 0)
2219 		return error;
2220 
2221 	error = vn_stat(vp, sb);
2222 	vput(vp);
2223 	return error;
2224 }
2225 
2226 
2227 /* ARGSUSED */
2228 int
2229 sys___fhstat50(struct lwp *l, const struct sys___fhstat50_args *uap, register_t *retval)
2230 {
2231 	/* {
2232 		syscallarg(const void *) fhp;
2233 		syscallarg(size_t) fh_size;
2234 		syscallarg(struct stat *) sb;
2235 	} */
2236 	struct stat sb;
2237 	int error;
2238 
2239 	error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
2240 	if (error)
2241 		return error;
2242 	return copyout(&sb, SCARG(uap, sb), sizeof(sb));
2243 }
2244 
2245 int
2246 do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb,
2247     int flags)
2248 {
2249 	fhandle_t *fh;
2250 	struct mount *mp;
2251 	struct vnode *vp;
2252 	int error;
2253 
2254 	/*
2255 	 * Must be super user
2256 	 */
2257 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2258 	    0, NULL, NULL, NULL)))
2259 		return error;
2260 
2261 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2262 	if (error != 0)
2263 		return error;
2264 
2265 	error = vfs_fhtovp(fh, &vp);
2266 	vfs_copyinfh_free(fh);
2267 	if (error != 0)
2268 		return error;
2269 
2270 	mp = vp->v_mount;
2271 	error = dostatvfs(mp, sb, l, flags, 1);
2272 	vput(vp);
2273 	return error;
2274 }
2275 
2276 /* ARGSUSED */
2277 int
2278 sys___fhstatvfs190(struct lwp *l, const struct sys___fhstatvfs190_args *uap, register_t *retval)
2279 {
2280 	/* {
2281 		syscallarg(const void *) fhp;
2282 		syscallarg(size_t) fh_size;
2283 		syscallarg(struct statvfs *) buf;
2284 		syscallarg(int)	flags;
2285 	} */
2286 	struct statvfs *sb = STATVFSBUF_GET();
2287 	int error;
2288 
2289 	error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
2290 	    SCARG(uap, flags));
2291 	if (error == 0)
2292 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
2293 	STATVFSBUF_PUT(sb);
2294 	return error;
2295 }
2296 
2297 int
2298 do_posix_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
2299     dev_t dev)
2300 {
2301 
2302 	/*
2303 	 * The POSIX mknod(2) call is an alias for mkfifo(2) for S_IFIFO
2304 	 * in mode and dev=0.
2305 	 *
2306 	 * In all the other cases it's implementation defined behavior.
2307 	 */
2308 
2309 	if ((mode & S_IFIFO) && dev == 0)
2310 		return do_sys_mkfifoat(l, fdat, pathname, mode);
2311 	else
2312 		return do_sys_mknodat(l, fdat, pathname, mode, dev,
2313 		    UIO_USERSPACE);
2314 }
2315 
2316 /*
2317  * Create a special file.
2318  */
2319 /* ARGSUSED */
2320 int
2321 sys___mknod50(struct lwp *l, const struct sys___mknod50_args *uap,
2322     register_t *retval)
2323 {
2324 	/* {
2325 		syscallarg(const char *) path;
2326 		syscallarg(mode_t) mode;
2327 		syscallarg(dev_t) dev;
2328 	} */
2329 	return do_posix_mknodat(l, AT_FDCWD, SCARG(uap, path),
2330 	    SCARG(uap, mode), SCARG(uap, dev));
2331 }
2332 
2333 int
2334 sys_mknodat(struct lwp *l, const struct sys_mknodat_args *uap,
2335     register_t *retval)
2336 {
2337 	/* {
2338 		syscallarg(int) fd;
2339 		syscallarg(const char *) path;
2340 		syscallarg(mode_t) mode;
2341 		syscallarg(int) pad;
2342 		syscallarg(dev_t) dev;
2343 	} */
2344 
2345 	return do_posix_mknodat(l, SCARG(uap, fd), SCARG(uap, path),
2346 	    SCARG(uap, mode), SCARG(uap, dev));
2347 }
2348 
2349 int
2350 do_sys_mknod(struct lwp *l, const char *pathname, mode_t mode, dev_t dev,
2351     enum uio_seg seg)
2352 {
2353 	return do_sys_mknodat(l, AT_FDCWD, pathname, mode, dev, seg);
2354 }
2355 
2356 int
2357 do_sys_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
2358     dev_t dev, enum uio_seg seg)
2359 {
2360 	struct proc *p = l->l_proc;
2361 	struct vnode *vp;
2362 	struct vattr vattr;
2363 	int error, optype;
2364 	struct pathbuf *pb;
2365 	struct nameidata nd;
2366 	const char *pathstring;
2367 
2368 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
2369 	    0, NULL, NULL, NULL)) != 0)
2370 		return (error);
2371 
2372 	optype = VOP_MKNOD_DESCOFFSET;
2373 
2374 	error = pathbuf_maybe_copyin(pathname, seg, &pb);
2375 	if (error) {
2376 		return error;
2377 	}
2378 	pathstring = pathbuf_stringcopy_get(pb);
2379 	if (pathstring == NULL) {
2380 		pathbuf_destroy(pb);
2381 		return ENOMEM;
2382 	}
2383 
2384 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2385 
2386 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2387 		goto out;
2388 	vp = nd.ni_vp;
2389 
2390 	if (vp != NULL)
2391 		error = EEXIST;
2392 	else {
2393 		vattr_null(&vattr);
2394 		/* We will read cwdi->cwdi_cmask unlocked. */
2395 		vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2396 		vattr.va_rdev = dev;
2397 
2398 		switch (mode & S_IFMT) {
2399 		case S_IFMT:	/* used by badsect to flag bad sectors */
2400 			vattr.va_type = VBAD;
2401 			break;
2402 		case S_IFCHR:
2403 			vattr.va_type = VCHR;
2404 			break;
2405 		case S_IFBLK:
2406 			vattr.va_type = VBLK;
2407 			break;
2408 		case S_IFWHT:
2409 			optype = VOP_WHITEOUT_DESCOFFSET;
2410 			break;
2411 		case S_IFREG:
2412 #if NVERIEXEC > 0
2413 			error = veriexec_openchk(l, nd.ni_vp, pathstring,
2414 			    O_CREAT);
2415 #endif /* NVERIEXEC > 0 */
2416 			vattr.va_type = VREG;
2417 			vattr.va_rdev = VNOVAL;
2418 			optype = VOP_CREATE_DESCOFFSET;
2419 			break;
2420 		default:
2421 			error = EINVAL;
2422 			break;
2423 		}
2424 
2425 		if (error == 0 && optype == VOP_MKNOD_DESCOFFSET &&
2426 		    vattr.va_rdev == VNOVAL)
2427 			error = EINVAL;
2428 	}
2429 
2430 	if (!error) {
2431 		switch (optype) {
2432 		case VOP_WHITEOUT_DESCOFFSET:
2433 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
2434 			if (error)
2435 				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2436 			vput(nd.ni_dvp);
2437 			break;
2438 
2439 		case VOP_MKNOD_DESCOFFSET:
2440 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
2441 						&nd.ni_cnd, &vattr);
2442 			if (error == 0)
2443 				vrele(nd.ni_vp);
2444 			vput(nd.ni_dvp);
2445 			break;
2446 
2447 		case VOP_CREATE_DESCOFFSET:
2448 			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
2449 						&nd.ni_cnd, &vattr);
2450 			if (error == 0)
2451 				vrele(nd.ni_vp);
2452 			vput(nd.ni_dvp);
2453 			break;
2454 		}
2455 	} else {
2456 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2457 		if (nd.ni_dvp == vp)
2458 			vrele(nd.ni_dvp);
2459 		else
2460 			vput(nd.ni_dvp);
2461 		if (vp)
2462 			vrele(vp);
2463 	}
2464 out:
2465 	pathbuf_stringcopy_put(pb, pathstring);
2466 	pathbuf_destroy(pb);
2467 	return (error);
2468 }
2469 
2470 /*
2471  * Create a named pipe.
2472  */
2473 /* ARGSUSED */
2474 int
2475 sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval)
2476 {
2477 	/* {
2478 		syscallarg(const char *) path;
2479 		syscallarg(int) mode;
2480 	} */
2481 	return do_sys_mkfifoat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, mode));
2482 }
2483 
2484 int
2485 sys_mkfifoat(struct lwp *l, const struct sys_mkfifoat_args *uap,
2486     register_t *retval)
2487 {
2488 	/* {
2489 		syscallarg(int) fd;
2490 		syscallarg(const char *) path;
2491 		syscallarg(int) mode;
2492 	} */
2493 
2494 	return do_sys_mkfifoat(l, SCARG(uap, fd), SCARG(uap, path),
2495 	    SCARG(uap, mode));
2496 }
2497 
2498 static int
2499 do_sys_mkfifoat(struct lwp *l, int fdat, const char *path, mode_t mode)
2500 {
2501 	struct proc *p = l->l_proc;
2502 	struct vattr vattr;
2503 	int error;
2504 	struct pathbuf *pb;
2505 	struct nameidata nd;
2506 
2507 	error = pathbuf_copyin(path, &pb);
2508 	if (error) {
2509 		return error;
2510 	}
2511 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2512 
2513 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
2514 		pathbuf_destroy(pb);
2515 		return error;
2516 	}
2517 	if (nd.ni_vp != NULL) {
2518 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2519 		if (nd.ni_dvp == nd.ni_vp)
2520 			vrele(nd.ni_dvp);
2521 		else
2522 			vput(nd.ni_dvp);
2523 		vrele(nd.ni_vp);
2524 		pathbuf_destroy(pb);
2525 		return (EEXIST);
2526 	}
2527 	vattr_null(&vattr);
2528 	vattr.va_type = VFIFO;
2529 	/* We will read cwdi->cwdi_cmask unlocked. */
2530 	vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2531 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
2532 	if (error == 0)
2533 		vrele(nd.ni_vp);
2534 	vput(nd.ni_dvp);
2535 	pathbuf_destroy(pb);
2536 	return (error);
2537 }
2538 
2539 /*
2540  * Make a hard file link.
2541  */
2542 /* ARGSUSED */
2543 int
2544 do_sys_linkat(struct lwp *l, int fdpath, const char *path, int fdlink,
2545     const char *link, int follow, register_t *retval)
2546 {
2547 	struct vnode *vp;
2548 	struct pathbuf *linkpb;
2549 	struct nameidata nd;
2550 	namei_simple_flags_t ns_flags;
2551 	int error;
2552 
2553 	if (follow & AT_SYMLINK_FOLLOW)
2554 		ns_flags = NSM_FOLLOW_TRYEMULROOT;
2555 	else
2556 		ns_flags = NSM_NOFOLLOW_TRYEMULROOT;
2557 
2558 	error = fd_nameiat_simple_user(l, fdpath, path, ns_flags, &vp);
2559 	if (error != 0)
2560 		return (error);
2561 	error = pathbuf_copyin(link, &linkpb);
2562 	if (error) {
2563 		goto out1;
2564 	}
2565 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2566 	if ((error = fd_nameiat(l, fdlink, &nd)) != 0)
2567 		goto out2;
2568 	if (nd.ni_vp) {
2569 		error = EEXIST;
2570 		goto abortop;
2571 	}
2572 	/* Prevent hard links on directories. */
2573 	if (vp->v_type == VDIR) {
2574 		error = EPERM;
2575 		goto abortop;
2576 	}
2577 	/* Prevent cross-mount operation. */
2578 	if (nd.ni_dvp->v_mount != vp->v_mount) {
2579 		error = EXDEV;
2580 		goto abortop;
2581 	}
2582 	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
2583 	VOP_UNLOCK(nd.ni_dvp);
2584 	vrele(nd.ni_dvp);
2585 out2:
2586 	pathbuf_destroy(linkpb);
2587 out1:
2588 	vrele(vp);
2589 	return (error);
2590 abortop:
2591 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2592 	if (nd.ni_dvp == nd.ni_vp)
2593 		vrele(nd.ni_dvp);
2594 	else
2595 		vput(nd.ni_dvp);
2596 	if (nd.ni_vp != NULL)
2597 		vrele(nd.ni_vp);
2598 	goto out2;
2599 }
2600 
2601 int
2602 sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
2603 {
2604 	/* {
2605 		syscallarg(const char *) path;
2606 		syscallarg(const char *) link;
2607 	} */
2608 	const char *path = SCARG(uap, path);
2609 	const char *link = SCARG(uap, link);
2610 
2611 	return do_sys_linkat(l, AT_FDCWD, path, AT_FDCWD, link,
2612 	    AT_SYMLINK_FOLLOW, retval);
2613 }
2614 
2615 int
2616 sys_linkat(struct lwp *l, const struct sys_linkat_args *uap,
2617     register_t *retval)
2618 {
2619 	/* {
2620 		syscallarg(int) fd1;
2621 		syscallarg(const char *) name1;
2622 		syscallarg(int) fd2;
2623 		syscallarg(const char *) name2;
2624 		syscallarg(int) flags;
2625 	} */
2626 	int fd1 = SCARG(uap, fd1);
2627 	const char *name1 = SCARG(uap, name1);
2628 	int fd2 = SCARG(uap, fd2);
2629 	const char *name2 = SCARG(uap, name2);
2630 	int follow;
2631 
2632 	follow = SCARG(uap, flags) & AT_SYMLINK_FOLLOW;
2633 
2634 	return do_sys_linkat(l, fd1, name1, fd2, name2, follow, retval);
2635 }
2636 
2637 
2638 int
2639 do_sys_symlink(const char *patharg, const char *link, enum uio_seg seg)
2640 {
2641 	return do_sys_symlinkat(NULL, patharg, AT_FDCWD, link, seg);
2642 }
2643 
2644 static int
2645 do_sys_symlinkat(struct lwp *l, const char *patharg, int fdat,
2646     const char *link, enum uio_seg seg)
2647 {
2648 	struct proc *p = curproc;
2649 	struct vattr vattr;
2650 	char *path;
2651 	int error;
2652 	size_t len;
2653 	struct pathbuf *linkpb;
2654 	struct nameidata nd;
2655 
2656 	KASSERT(l != NULL || fdat == AT_FDCWD);
2657 
2658 	path = PNBUF_GET();
2659 	if (seg == UIO_USERSPACE) {
2660 		if ((error = copyinstr(patharg, path, MAXPATHLEN, &len)) != 0)
2661 			goto out1;
2662 		if ((error = pathbuf_copyin(link, &linkpb)) != 0)
2663 			goto out1;
2664 	} else {
2665 		len = strlen(patharg) + 1;
2666 		KASSERT(len <= MAXPATHLEN);
2667 		memcpy(path, patharg, len);
2668 		linkpb = pathbuf_create(link);
2669 		if (linkpb == NULL) {
2670 			error = ENOMEM;
2671 			goto out1;
2672 		}
2673 	}
2674 	ktrkuser("symlink-target", path, len - 1);
2675 
2676 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2677 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2678 		goto out2;
2679 	if (nd.ni_vp) {
2680 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2681 		if (nd.ni_dvp == nd.ni_vp)
2682 			vrele(nd.ni_dvp);
2683 		else
2684 			vput(nd.ni_dvp);
2685 		vrele(nd.ni_vp);
2686 		error = EEXIST;
2687 		goto out2;
2688 	}
2689 	vattr_null(&vattr);
2690 	vattr.va_type = VLNK;
2691 	/* We will read cwdi->cwdi_cmask unlocked. */
2692 	vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
2693 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
2694 	if (error == 0)
2695 		vrele(nd.ni_vp);
2696 	vput(nd.ni_dvp);
2697 out2:
2698 	pathbuf_destroy(linkpb);
2699 out1:
2700 	PNBUF_PUT(path);
2701 	return (error);
2702 }
2703 
2704 /*
2705  * Make a symbolic link.
2706  */
2707 /* ARGSUSED */
2708 int
2709 sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
2710 {
2711 	/* {
2712 		syscallarg(const char *) path;
2713 		syscallarg(const char *) link;
2714 	} */
2715 
2716 	return do_sys_symlinkat(l, SCARG(uap, path), AT_FDCWD, SCARG(uap, link),
2717 	    UIO_USERSPACE);
2718 }
2719 
2720 int
2721 sys_symlinkat(struct lwp *l, const struct sys_symlinkat_args *uap,
2722     register_t *retval)
2723 {
2724 	/* {
2725 		syscallarg(const char *) path1;
2726 		syscallarg(int) fd;
2727 		syscallarg(const char *) path2;
2728 	} */
2729 
2730 	return do_sys_symlinkat(l, SCARG(uap, path1), SCARG(uap, fd),
2731 	    SCARG(uap, path2), UIO_USERSPACE);
2732 }
2733 
2734 /*
2735  * Delete a whiteout from the filesystem.
2736  */
2737 /* ARGSUSED */
2738 int
2739 sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval)
2740 {
2741 	/* {
2742 		syscallarg(const char *) path;
2743 	} */
2744 	int error;
2745 	struct pathbuf *pb;
2746 	struct nameidata nd;
2747 
2748 	error = pathbuf_copyin(SCARG(uap, path), &pb);
2749 	if (error) {
2750 		return error;
2751 	}
2752 
2753 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT, pb);
2754 	error = namei(&nd);
2755 	if (error) {
2756 		pathbuf_destroy(pb);
2757 		return (error);
2758 	}
2759 
2760 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2761 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2762 		if (nd.ni_dvp == nd.ni_vp)
2763 			vrele(nd.ni_dvp);
2764 		else
2765 			vput(nd.ni_dvp);
2766 		if (nd.ni_vp)
2767 			vrele(nd.ni_vp);
2768 		pathbuf_destroy(pb);
2769 		return (EEXIST);
2770 	}
2771 	if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
2772 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2773 	vput(nd.ni_dvp);
2774 	pathbuf_destroy(pb);
2775 	return (error);
2776 }
2777 
2778 /*
2779  * Delete a name from the filesystem.
2780  */
2781 /* ARGSUSED */
2782 int
2783 sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval)
2784 {
2785 	/* {
2786 		syscallarg(const char *) path;
2787 	} */
2788 
2789 	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path), 0, UIO_USERSPACE);
2790 }
2791 
2792 int
2793 sys_unlinkat(struct lwp *l, const struct sys_unlinkat_args *uap,
2794     register_t *retval)
2795 {
2796 	/* {
2797 		syscallarg(int) fd;
2798 		syscallarg(const char *) path;
2799 		syscallarg(int) flag;
2800 	} */
2801 
2802 	return do_sys_unlinkat(l, SCARG(uap, fd), SCARG(uap, path),
2803 	    SCARG(uap, flag), UIO_USERSPACE);
2804 }
2805 
2806 int
2807 do_sys_unlink(const char *arg, enum uio_seg seg)
2808 {
2809 	return do_sys_unlinkat(NULL, AT_FDCWD, arg, 0, seg);
2810 }
2811 
2812 static int
2813 do_sys_unlinkat(struct lwp *l, int fdat, const char *arg, int flags,
2814     enum uio_seg seg)
2815 {
2816 	struct vnode *vp;
2817 	int error;
2818 	struct pathbuf *pb;
2819 	struct nameidata nd;
2820 	const char *pathstring;
2821 
2822 	KASSERT(l != NULL || fdat == AT_FDCWD);
2823 
2824 	error = pathbuf_maybe_copyin(arg, seg, &pb);
2825 	if (error) {
2826 		return error;
2827 	}
2828 	pathstring = pathbuf_stringcopy_get(pb);
2829 	if (pathstring == NULL) {
2830 		pathbuf_destroy(pb);
2831 		return ENOMEM;
2832 	}
2833 
2834 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, pb);
2835 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2836 		goto out;
2837 	vp = nd.ni_vp;
2838 
2839 	/*
2840 	 * The root of a mounted filesystem cannot be deleted.
2841 	 */
2842 	if ((vp->v_vflag & VV_ROOT) != 0) {
2843 		error = EBUSY;
2844 		goto abort;
2845 	}
2846 
2847 	if ((vp->v_type == VDIR) && (vp->v_mountedhere != NULL)) {
2848 		error = EBUSY;
2849 		goto abort;
2850 	}
2851 
2852 	/*
2853 	 * No rmdir "." please.
2854 	 */
2855 	if (nd.ni_dvp == vp) {
2856 		error = EINVAL;
2857 		goto abort;
2858 	}
2859 
2860 	/*
2861 	 * AT_REMOVEDIR is required to remove a directory
2862 	 */
2863 	if (vp->v_type == VDIR) {
2864 		if (!(flags & AT_REMOVEDIR)) {
2865 			error = EPERM;
2866 			goto abort;
2867 		} else {
2868 			error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2869 			vput(nd.ni_dvp);
2870 			goto out;
2871 		}
2872 	}
2873 
2874 	/*
2875 	 * Starting here we only deal with non directories.
2876 	 */
2877 	if (flags & AT_REMOVEDIR) {
2878 		error = ENOTDIR;
2879 		goto abort;
2880 	}
2881 
2882 #if NVERIEXEC > 0
2883 	/* Handle remove requests for veriexec entries. */
2884 	if ((error = veriexec_removechk(curlwp, nd.ni_vp, pathstring)) != 0) {
2885 		goto abort;
2886 	}
2887 #endif /* NVERIEXEC > 0 */
2888 
2889 #ifdef FILEASSOC
2890 	(void)fileassoc_file_delete(vp);
2891 #endif /* FILEASSOC */
2892 	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2893 	vput(nd.ni_dvp);
2894 	goto out;
2895 
2896 abort:
2897 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2898 	if (nd.ni_dvp == vp)
2899 		vrele(nd.ni_dvp);
2900 	else
2901 		vput(nd.ni_dvp);
2902 	vput(vp);
2903 
2904 out:
2905 	pathbuf_stringcopy_put(pb, pathstring);
2906 	pathbuf_destroy(pb);
2907 	return (error);
2908 }
2909 
2910 /*
2911  * Reposition read/write file offset.
2912  */
2913 int
2914 sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
2915 {
2916 	/* {
2917 		syscallarg(int) fd;
2918 		syscallarg(int) pad;
2919 		syscallarg(off_t) offset;
2920 		syscallarg(int) whence;
2921 	} */
2922 	file_t *fp;
2923 	int error, fd;
2924 
2925 	switch (SCARG(uap, whence)) {
2926 	case SEEK_CUR:
2927 	case SEEK_END:
2928 	case SEEK_SET:
2929 		break;
2930 	default:
2931 		return EINVAL;
2932 	}
2933 
2934 	fd = SCARG(uap, fd);
2935 
2936 	if ((fp = fd_getfile(fd)) == NULL)
2937 		return (EBADF);
2938 
2939 	if (fp->f_ops->fo_seek == NULL) {
2940 		error = ESPIPE;
2941 		goto out;
2942 	}
2943 
2944 	error = (*fp->f_ops->fo_seek)(fp, SCARG(uap, offset),
2945 	    SCARG(uap, whence), (off_t *)retval, FOF_UPDATE_OFFSET);
2946  out:
2947  	fd_putfile(fd);
2948 	return (error);
2949 }
2950 
2951 /*
2952  * Positional read system call.
2953  */
2954 int
2955 sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
2956 {
2957 	/* {
2958 		syscallarg(int) fd;
2959 		syscallarg(void *) buf;
2960 		syscallarg(size_t) nbyte;
2961 		syscallarg(off_t) offset;
2962 	} */
2963 	file_t *fp;
2964 	off_t offset;
2965 	int error, fd = SCARG(uap, fd);
2966 
2967 	if ((fp = fd_getfile(fd)) == NULL)
2968 		return (EBADF);
2969 
2970 	if ((fp->f_flag & FREAD) == 0) {
2971 		fd_putfile(fd);
2972 		return (EBADF);
2973 	}
2974 
2975 	if (fp->f_ops->fo_seek == NULL) {
2976 		error = ESPIPE;
2977 		goto out;
2978 	}
2979 
2980 	offset = SCARG(uap, offset);
2981 	error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0);
2982 	if (error)
2983 		goto out;
2984 
2985 	/* dofileread() will unuse the descriptor for us */
2986 	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2987 	    &offset, 0, retval));
2988 
2989  out:
2990 	fd_putfile(fd);
2991 	return (error);
2992 }
2993 
2994 /*
2995  * Positional scatter read system call.
2996  */
2997 int
2998 sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval)
2999 {
3000 	/* {
3001 		syscallarg(int) fd;
3002 		syscallarg(const struct iovec *) iovp;
3003 		syscallarg(int) iovcnt;
3004 		syscallarg(off_t) offset;
3005 	} */
3006 	off_t offset = SCARG(uap, offset);
3007 
3008 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
3009 	    SCARG(uap, iovcnt), &offset, 0, retval);
3010 }
3011 
3012 /*
3013  * Positional write system call.
3014  */
3015 int
3016 sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval)
3017 {
3018 	/* {
3019 		syscallarg(int) fd;
3020 		syscallarg(const void *) buf;
3021 		syscallarg(size_t) nbyte;
3022 		syscallarg(off_t) offset;
3023 	} */
3024 	file_t *fp;
3025 	off_t offset;
3026 	int error, fd = SCARG(uap, fd);
3027 
3028 	if ((fp = fd_getfile(fd)) == NULL)
3029 		return (EBADF);
3030 
3031 	if ((fp->f_flag & FWRITE) == 0) {
3032 		fd_putfile(fd);
3033 		return (EBADF);
3034 	}
3035 
3036 	if (fp->f_ops->fo_seek == NULL) {
3037 		error = ESPIPE;
3038 		goto out;
3039 	}
3040 
3041 	offset = SCARG(uap, offset);
3042 	error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0);
3043 	if (error)
3044 		goto out;
3045 
3046 	/* dofilewrite() will unuse the descriptor for us */
3047 	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
3048 	    &offset, 0, retval));
3049 
3050  out:
3051 	fd_putfile(fd);
3052 	return (error);
3053 }
3054 
3055 /*
3056  * Positional gather write system call.
3057  */
3058 int
3059 sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval)
3060 {
3061 	/* {
3062 		syscallarg(int) fd;
3063 		syscallarg(const struct iovec *) iovp;
3064 		syscallarg(int) iovcnt;
3065 		syscallarg(off_t) offset;
3066 	} */
3067 	off_t offset = SCARG(uap, offset);
3068 
3069 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
3070 	    SCARG(uap, iovcnt), &offset, 0, retval);
3071 }
3072 
3073 /*
3074  * Check access permissions.
3075  */
3076 int
3077 sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval)
3078 {
3079 	/* {
3080 		syscallarg(const char *) path;
3081 		syscallarg(int) flags;
3082 	} */
3083 
3084 	return do_sys_accessat(l, AT_FDCWD, SCARG(uap, path),
3085 	     SCARG(uap, flags), 0);
3086 }
3087 
3088 int
3089 do_sys_accessat(struct lwp *l, int fdat, const char *path,
3090     int mode, int flags)
3091 {
3092 	kauth_cred_t cred;
3093 	struct vnode *vp;
3094 	int error, nd_flag, vmode;
3095 	struct pathbuf *pb;
3096 	struct nameidata nd;
3097 
3098 	CTASSERT(F_OK == 0);
3099 	if ((mode & ~(R_OK | W_OK | X_OK)) != 0) {
3100 		/* nonsense mode */
3101 		return EINVAL;
3102 	}
3103 
3104 	nd_flag = FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT;
3105 	if (flags & AT_SYMLINK_NOFOLLOW)
3106 		nd_flag &= ~FOLLOW;
3107 
3108 	error = pathbuf_copyin(path, &pb);
3109 	if (error)
3110 		return error;
3111 
3112 	NDINIT(&nd, LOOKUP, nd_flag, pb);
3113 
3114 	/* Override default credentials */
3115 	cred = kauth_cred_dup(l->l_cred);
3116 	if (!(flags & AT_EACCESS)) {
3117 		kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
3118 		kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
3119 	}
3120 	nd.ni_cnd.cn_cred = cred;
3121 
3122 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
3123 		pathbuf_destroy(pb);
3124 		goto out;
3125 	}
3126 	vp = nd.ni_vp;
3127 	pathbuf_destroy(pb);
3128 
3129 	/* Flags == 0 means only check for existence. */
3130 	if (mode) {
3131 		vmode = 0;
3132 		if (mode & R_OK)
3133 			vmode |= VREAD;
3134 		if (mode & W_OK)
3135 			vmode |= VWRITE;
3136 		if (mode & X_OK)
3137 			vmode |= VEXEC;
3138 
3139 		error = VOP_ACCESS(vp, vmode, cred);
3140 		if (!error && (vmode & VWRITE))
3141 			error = vn_writechk(vp);
3142 	}
3143 	vput(vp);
3144 out:
3145 	kauth_cred_free(cred);
3146 	return (error);
3147 }
3148 
3149 int
3150 sys_faccessat(struct lwp *l, const struct sys_faccessat_args *uap,
3151     register_t *retval)
3152 {
3153 	/* {
3154 		syscallarg(int) fd;
3155 		syscallarg(const char *) path;
3156 		syscallarg(int) amode;
3157 		syscallarg(int) flag;
3158 	} */
3159 
3160 	return do_sys_accessat(l, SCARG(uap, fd), SCARG(uap, path),
3161 	     SCARG(uap, amode), SCARG(uap, flag));
3162 }
3163 
3164 /*
3165  * Common code for all sys_stat functions, including compat versions.
3166  */
3167 int
3168 do_sys_stat(const char *userpath, unsigned int nd_flag,
3169     struct stat *sb)
3170 {
3171 	return do_sys_statat(NULL, AT_FDCWD, userpath, nd_flag, sb);
3172 }
3173 
3174 int
3175 do_sys_statat(struct lwp *l, int fdat, const char *userpath,
3176     unsigned int nd_flag, struct stat *sb)
3177 {
3178 	int error;
3179 	struct pathbuf *pb;
3180 	struct nameidata nd;
3181 
3182 	KASSERT(l != NULL || fdat == AT_FDCWD);
3183 
3184 	error = pathbuf_copyin(userpath, &pb);
3185 	if (error) {
3186 		return error;
3187 	}
3188 
3189 	NDINIT(&nd, LOOKUP, nd_flag | LOCKLEAF | TRYEMULROOT, pb);
3190 
3191 	error = fd_nameiat(l, fdat, &nd);
3192 	if (error != 0) {
3193 		pathbuf_destroy(pb);
3194 		return error;
3195 	}
3196 	error = vn_stat(nd.ni_vp, sb);
3197 	vput(nd.ni_vp);
3198 	pathbuf_destroy(pb);
3199 	return error;
3200 }
3201 
3202 /*
3203  * Get file status; this version follows links.
3204  */
3205 /* ARGSUSED */
3206 int
3207 sys___stat50(struct lwp *l, const struct sys___stat50_args *uap, register_t *retval)
3208 {
3209 	/* {
3210 		syscallarg(const char *) path;
3211 		syscallarg(struct stat *) ub;
3212 	} */
3213 	struct stat sb;
3214 	int error;
3215 
3216 	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), FOLLOW, &sb);
3217 	if (error)
3218 		return error;
3219 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3220 }
3221 
3222 /*
3223  * Get file status; this version does not follow links.
3224  */
3225 /* ARGSUSED */
3226 int
3227 sys___lstat50(struct lwp *l, const struct sys___lstat50_args *uap, register_t *retval)
3228 {
3229 	/* {
3230 		syscallarg(const char *) path;
3231 		syscallarg(struct stat *) ub;
3232 	} */
3233 	struct stat sb;
3234 	int error;
3235 
3236 	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), NOFOLLOW, &sb);
3237 	if (error)
3238 		return error;
3239 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3240 }
3241 
3242 int
3243 sys_fstatat(struct lwp *l, const struct sys_fstatat_args *uap,
3244     register_t *retval)
3245 {
3246 	/* {
3247 		syscallarg(int) fd;
3248 		syscallarg(const char *) path;
3249 		syscallarg(struct stat *) buf;
3250 		syscallarg(int) flag;
3251 	} */
3252 	unsigned int nd_flag;
3253 	struct stat sb;
3254 	int error;
3255 
3256 	if (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW)
3257 		nd_flag = NOFOLLOW;
3258 	else
3259 		nd_flag = FOLLOW;
3260 
3261 	error = do_sys_statat(l, SCARG(uap, fd), SCARG(uap, path), nd_flag,
3262 	    &sb);
3263 	if (error)
3264 		return error;
3265 	return copyout(&sb, SCARG(uap, buf), sizeof(sb));
3266 }
3267 
3268 static int
3269 kern_pathconf(register_t *retval, const char *path, int name, int flag)
3270 {
3271 	int error;
3272 	struct pathbuf *pb;
3273 	struct nameidata nd;
3274 
3275 	error = pathbuf_copyin(path, &pb);
3276 	if (error) {
3277 		return error;
3278 	}
3279 	NDINIT(&nd, LOOKUP, flag | LOCKLEAF | TRYEMULROOT, pb);
3280 	if ((error = namei(&nd)) != 0) {
3281 		pathbuf_destroy(pb);
3282 		return error;
3283 	}
3284 	error = VOP_PATHCONF(nd.ni_vp, name, retval);
3285 	vput(nd.ni_vp);
3286 	pathbuf_destroy(pb);
3287 	return error;
3288 }
3289 
3290 /*
3291  * Get configurable pathname variables.
3292  */
3293 /* ARGSUSED */
3294 int
3295 sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap,
3296     register_t *retval)
3297 {
3298 	/* {
3299 		syscallarg(const char *) path;
3300 		syscallarg(int) name;
3301 	} */
3302 	return kern_pathconf(retval, SCARG(uap, path), SCARG(uap, name),
3303 	    FOLLOW);
3304 }
3305 
3306 /* ARGSUSED */
3307 int
3308 sys_lpathconf(struct lwp *l, const struct sys_lpathconf_args *uap,
3309     register_t *retval)
3310 {
3311 	/* {
3312 		syscallarg(const char *) path;
3313 		syscallarg(int) name;
3314 	} */
3315 	return kern_pathconf(retval, SCARG(uap, path), SCARG(uap, name),
3316 	    NOFOLLOW);
3317 }
3318 
3319 /*
3320  * Return target name of a symbolic link.
3321  */
3322 /* ARGSUSED */
3323 int
3324 sys_readlink(struct lwp *l, const struct sys_readlink_args *uap,
3325     register_t *retval)
3326 {
3327 	/* {
3328 		syscallarg(const char *) path;
3329 		syscallarg(char *) buf;
3330 		syscallarg(size_t) count;
3331 	} */
3332 	return do_sys_readlinkat(l, AT_FDCWD, SCARG(uap, path),
3333 	    SCARG(uap, buf), SCARG(uap, count), retval);
3334 }
3335 
3336 static int
3337 do_sys_readlinkat(struct lwp *l, int fdat, const char *path, char *buf,
3338     size_t count, register_t *retval)
3339 {
3340 	struct vnode *vp;
3341 	struct iovec aiov;
3342 	struct uio auio;
3343 	int error;
3344 	struct pathbuf *pb;
3345 	struct nameidata nd;
3346 
3347 	error = pathbuf_copyin(path, &pb);
3348 	if (error) {
3349 		return error;
3350 	}
3351 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb);
3352 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
3353 		pathbuf_destroy(pb);
3354 		return error;
3355 	}
3356 	vp = nd.ni_vp;
3357 	pathbuf_destroy(pb);
3358 	if (vp->v_type != VLNK)
3359 		error = EINVAL;
3360 	else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
3361 	    (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
3362 		aiov.iov_base = buf;
3363 		aiov.iov_len = count;
3364 		auio.uio_iov = &aiov;
3365 		auio.uio_iovcnt = 1;
3366 		auio.uio_offset = 0;
3367 		auio.uio_rw = UIO_READ;
3368 		KASSERT(l == curlwp);
3369 		auio.uio_vmspace = l->l_proc->p_vmspace;
3370 		auio.uio_resid = count;
3371 		if ((error = VOP_READLINK(vp, &auio, l->l_cred)) == 0)
3372 			*retval = count - auio.uio_resid;
3373 	}
3374 	vput(vp);
3375 	return (error);
3376 }
3377 
3378 int
3379 sys_readlinkat(struct lwp *l, const struct sys_readlinkat_args *uap,
3380     register_t *retval)
3381 {
3382 	/* {
3383 		syscallarg(int) fd;
3384 		syscallarg(const char *) path;
3385 		syscallarg(char *) buf;
3386 		syscallarg(size_t) bufsize;
3387 	} */
3388 
3389 	return do_sys_readlinkat(l, SCARG(uap, fd), SCARG(uap, path),
3390 	    SCARG(uap, buf), SCARG(uap, bufsize), retval);
3391 }
3392 
3393 /*
3394  * Change flags of a file given a path name.
3395  */
3396 /* ARGSUSED */
3397 int
3398 sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval)
3399 {
3400 	/* {
3401 		syscallarg(const char *) path;
3402 		syscallarg(u_long) flags;
3403 	} */
3404 	struct vnode *vp;
3405 	int error;
3406 
3407 	error = namei_simple_user(SCARG(uap, path),
3408 				NSM_FOLLOW_TRYEMULROOT, &vp);
3409 	if (error != 0)
3410 		return (error);
3411 	error = change_flags(vp, SCARG(uap, flags), l);
3412 	vput(vp);
3413 	return (error);
3414 }
3415 
3416 /*
3417  * Change flags of a file given a file descriptor.
3418  */
3419 /* ARGSUSED */
3420 int
3421 sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval)
3422 {
3423 	/* {
3424 		syscallarg(int) fd;
3425 		syscallarg(u_long) flags;
3426 	} */
3427 	struct vnode *vp;
3428 	file_t *fp;
3429 	int error;
3430 
3431 	/* fd_getvnode() will use the descriptor for us */
3432 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3433 		return (error);
3434 	vp = fp->f_vnode;
3435 	error = change_flags(vp, SCARG(uap, flags), l);
3436 	VOP_UNLOCK(vp);
3437 	fd_putfile(SCARG(uap, fd));
3438 	return (error);
3439 }
3440 
3441 /*
3442  * Change flags of a file given a path name; this version does
3443  * not follow links.
3444  */
3445 int
3446 sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval)
3447 {
3448 	/* {
3449 		syscallarg(const char *) path;
3450 		syscallarg(u_long) flags;
3451 	} */
3452 	struct vnode *vp;
3453 	int error;
3454 
3455 	error = namei_simple_user(SCARG(uap, path),
3456 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3457 	if (error != 0)
3458 		return (error);
3459 	error = change_flags(vp, SCARG(uap, flags), l);
3460 	vput(vp);
3461 	return (error);
3462 }
3463 
3464 /*
3465  * Common routine to change flags of a file.
3466  */
3467 int
3468 change_flags(struct vnode *vp, u_long flags, struct lwp *l)
3469 {
3470 	struct vattr vattr;
3471 	int error;
3472 
3473 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3474 
3475 	vattr_null(&vattr);
3476 	vattr.va_flags = flags;
3477 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3478 
3479 	return (error);
3480 }
3481 
3482 /*
3483  * Change mode of a file given path name; this version follows links.
3484  */
3485 /* ARGSUSED */
3486 int
3487 sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
3488 {
3489 	/* {
3490 		syscallarg(const char *) path;
3491 		syscallarg(int) mode;
3492 	} */
3493 	return do_sys_chmodat(l, AT_FDCWD, SCARG(uap, path),
3494 			      SCARG(uap, mode), 0);
3495 }
3496 
3497 int
3498 do_sys_chmodat(struct lwp *l, int fdat, const char *path, int mode, int flags)
3499 {
3500 	int error;
3501 	struct vnode *vp;
3502 	namei_simple_flags_t ns_flag;
3503 
3504 	if (flags & AT_SYMLINK_NOFOLLOW)
3505 		ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3506 	else
3507 		ns_flag = NSM_FOLLOW_TRYEMULROOT;
3508 
3509 	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3510 	if (error != 0)
3511 		return error;
3512 
3513 	error = change_mode(vp, mode, l);
3514 
3515 	vrele(vp);
3516 
3517 	return (error);
3518 }
3519 
3520 /*
3521  * Change mode of a file given a file descriptor.
3522  */
3523 /* ARGSUSED */
3524 int
3525 sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval)
3526 {
3527 	/* {
3528 		syscallarg(int) fd;
3529 		syscallarg(int) mode;
3530 	} */
3531 	file_t *fp;
3532 	int error;
3533 
3534 	/* fd_getvnode() will use the descriptor for us */
3535 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3536 		return (error);
3537 	error = change_mode(fp->f_vnode, SCARG(uap, mode), l);
3538 	fd_putfile(SCARG(uap, fd));
3539 	return (error);
3540 }
3541 
3542 int
3543 sys_fchmodat(struct lwp *l, const struct sys_fchmodat_args *uap,
3544     register_t *retval)
3545 {
3546 	/* {
3547 		syscallarg(int) fd;
3548 		syscallarg(const char *) path;
3549 		syscallarg(int) mode;
3550 		syscallarg(int) flag;
3551 	} */
3552 
3553 	return do_sys_chmodat(l, SCARG(uap, fd), SCARG(uap, path),
3554 			      SCARG(uap, mode), SCARG(uap, flag));
3555 }
3556 
3557 /*
3558  * Change mode of a file given path name; this version does not follow links.
3559  */
3560 /* ARGSUSED */
3561 int
3562 sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval)
3563 {
3564 	/* {
3565 		syscallarg(const char *) path;
3566 		syscallarg(int) mode;
3567 	} */
3568 	int error;
3569 	struct vnode *vp;
3570 
3571 	error = namei_simple_user(SCARG(uap, path),
3572 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3573 	if (error != 0)
3574 		return (error);
3575 
3576 	error = change_mode(vp, SCARG(uap, mode), l);
3577 
3578 	vrele(vp);
3579 	return (error);
3580 }
3581 
3582 /*
3583  * Common routine to set mode given a vnode.
3584  */
3585 static int
3586 change_mode(struct vnode *vp, int mode, struct lwp *l)
3587 {
3588 	struct vattr vattr;
3589 	int error;
3590 
3591 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3592 	vattr_null(&vattr);
3593 	vattr.va_mode = mode & ALLPERMS;
3594 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3595 	VOP_UNLOCK(vp);
3596 	return (error);
3597 }
3598 
3599 /*
3600  * Set ownership given a path name; this version follows links.
3601  */
3602 /* ARGSUSED */
3603 int
3604 sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
3605 {
3606 	/* {
3607 		syscallarg(const char *) path;
3608 		syscallarg(uid_t) uid;
3609 		syscallarg(gid_t) gid;
3610 	} */
3611 	return do_sys_chownat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap,uid),
3612 			      SCARG(uap, gid), 0);
3613 }
3614 
3615 int
3616 do_sys_chownat(struct lwp *l, int fdat, const char *path, uid_t uid,
3617    gid_t gid, int flags)
3618 {
3619 	int error;
3620 	struct vnode *vp;
3621 	namei_simple_flags_t ns_flag;
3622 
3623 	if (flags & AT_SYMLINK_NOFOLLOW)
3624 		ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3625 	else
3626 		ns_flag = NSM_FOLLOW_TRYEMULROOT;
3627 
3628 	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3629 	if (error != 0)
3630 		return error;
3631 
3632 	error = change_owner(vp, uid, gid, l, 0);
3633 
3634 	vrele(vp);
3635 
3636 	return (error);
3637 }
3638 
3639 /*
3640  * Set ownership given a path name; this version follows links.
3641  * Provides POSIX semantics.
3642  */
3643 /* ARGSUSED */
3644 int
3645 sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval)
3646 {
3647 	/* {
3648 		syscallarg(const char *) path;
3649 		syscallarg(uid_t) uid;
3650 		syscallarg(gid_t) gid;
3651 	} */
3652 	int error;
3653 	struct vnode *vp;
3654 
3655 	error = namei_simple_user(SCARG(uap, path),
3656 				NSM_FOLLOW_TRYEMULROOT, &vp);
3657 	if (error != 0)
3658 		return (error);
3659 
3660 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3661 
3662 	vrele(vp);
3663 	return (error);
3664 }
3665 
3666 /*
3667  * Set ownership given a file descriptor.
3668  */
3669 /* ARGSUSED */
3670 int
3671 sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval)
3672 {
3673 	/* {
3674 		syscallarg(int) fd;
3675 		syscallarg(uid_t) uid;
3676 		syscallarg(gid_t) gid;
3677 	} */
3678 	int error;
3679 	file_t *fp;
3680 
3681 	/* fd_getvnode() will use the descriptor for us */
3682 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3683 		return (error);
3684 	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3685 	    l, 0);
3686 	fd_putfile(SCARG(uap, fd));
3687 	return (error);
3688 }
3689 
3690 int
3691 sys_fchownat(struct lwp *l, const struct sys_fchownat_args *uap,
3692     register_t *retval)
3693 {
3694 	/* {
3695 		syscallarg(int) fd;
3696 		syscallarg(const char *) path;
3697 		syscallarg(uid_t) owner;
3698 		syscallarg(gid_t) group;
3699 		syscallarg(int) flag;
3700 	} */
3701 
3702 	return do_sys_chownat(l, SCARG(uap, fd), SCARG(uap, path),
3703 			      SCARG(uap, owner), SCARG(uap, group),
3704 			      SCARG(uap, flag));
3705 }
3706 
3707 /*
3708  * Set ownership given a file descriptor, providing POSIX/XPG semantics.
3709  */
3710 /* ARGSUSED */
3711 int
3712 sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval)
3713 {
3714 	/* {
3715 		syscallarg(int) fd;
3716 		syscallarg(uid_t) uid;
3717 		syscallarg(gid_t) gid;
3718 	} */
3719 	int error;
3720 	file_t *fp;
3721 
3722 	/* fd_getvnode() will use the descriptor for us */
3723 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3724 		return (error);
3725 	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3726 	    l, 1);
3727 	fd_putfile(SCARG(uap, fd));
3728 	return (error);
3729 }
3730 
3731 /*
3732  * Set ownership given a path name; this version does not follow links.
3733  */
3734 /* ARGSUSED */
3735 int
3736 sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval)
3737 {
3738 	/* {
3739 		syscallarg(const char *) path;
3740 		syscallarg(uid_t) uid;
3741 		syscallarg(gid_t) gid;
3742 	} */
3743 	int error;
3744 	struct vnode *vp;
3745 
3746 	error = namei_simple_user(SCARG(uap, path),
3747 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3748 	if (error != 0)
3749 		return (error);
3750 
3751 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
3752 
3753 	vrele(vp);
3754 	return (error);
3755 }
3756 
3757 /*
3758  * Set ownership given a path name; this version does not follow links.
3759  * Provides POSIX/XPG semantics.
3760  */
3761 /* ARGSUSED */
3762 int
3763 sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval)
3764 {
3765 	/* {
3766 		syscallarg(const char *) path;
3767 		syscallarg(uid_t) uid;
3768 		syscallarg(gid_t) gid;
3769 	} */
3770 	int error;
3771 	struct vnode *vp;
3772 
3773 	error = namei_simple_user(SCARG(uap, path),
3774 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3775 	if (error != 0)
3776 		return (error);
3777 
3778 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3779 
3780 	vrele(vp);
3781 	return (error);
3782 }
3783 
3784 /*
3785  * Common routine to set ownership given a vnode.
3786  */
3787 static int
3788 change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
3789     int posix_semantics)
3790 {
3791 	struct vattr vattr;
3792 	mode_t newmode;
3793 	int error;
3794 
3795 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3796 	if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
3797 		goto out;
3798 
3799 #define CHANGED(x) ((int)(x) != -1)
3800 	newmode = vattr.va_mode;
3801 	if (posix_semantics) {
3802 		/*
3803 		 * POSIX/XPG semantics: if the caller is not the super-user,
3804 		 * clear set-user-id and set-group-id bits.  Both POSIX and
3805 		 * the XPG consider the behaviour for calls by the super-user
3806 		 * implementation-defined; we leave the set-user-id and set-
3807 		 * group-id settings intact in that case.
3808 		 */
3809 		if (vattr.va_mode & S_ISUID) {
3810 			if (kauth_authorize_vnode(l->l_cred,
3811 			    KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0)
3812 				newmode &= ~S_ISUID;
3813 		}
3814 		if (vattr.va_mode & S_ISGID) {
3815 			if (kauth_authorize_vnode(l->l_cred,
3816 			    KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0)
3817 				newmode &= ~S_ISGID;
3818 		}
3819 	} else {
3820 		/*
3821 		 * NetBSD semantics: when changing owner and/or group,
3822 		 * clear the respective bit(s).
3823 		 */
3824 		if (CHANGED(uid))
3825 			newmode &= ~S_ISUID;
3826 		if (CHANGED(gid))
3827 			newmode &= ~S_ISGID;
3828 	}
3829 	/* Update va_mode iff altered. */
3830 	if (vattr.va_mode == newmode)
3831 		newmode = VNOVAL;
3832 
3833 	vattr_null(&vattr);
3834 	vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
3835 	vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
3836 	vattr.va_mode = newmode;
3837 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3838 #undef CHANGED
3839 
3840 out:
3841 	VOP_UNLOCK(vp);
3842 	return (error);
3843 }
3844 
3845 /*
3846  * Set the access and modification times given a path name; this
3847  * version follows links.
3848  */
3849 /* ARGSUSED */
3850 int
3851 sys___utimes50(struct lwp *l, const struct sys___utimes50_args *uap,
3852     register_t *retval)
3853 {
3854 	/* {
3855 		syscallarg(const char *) path;
3856 		syscallarg(const struct timeval *) tptr;
3857 	} */
3858 
3859 	return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
3860 	    SCARG(uap, tptr), UIO_USERSPACE);
3861 }
3862 
3863 /*
3864  * Set the access and modification times given a file descriptor.
3865  */
3866 /* ARGSUSED */
3867 int
3868 sys___futimes50(struct lwp *l, const struct sys___futimes50_args *uap,
3869     register_t *retval)
3870 {
3871 	/* {
3872 		syscallarg(int) fd;
3873 		syscallarg(const struct timeval *) tptr;
3874 	} */
3875 	int error;
3876 	file_t *fp;
3877 
3878 	/* fd_getvnode() will use the descriptor for us */
3879 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3880 		return (error);
3881 	error = do_sys_utimes(l, fp->f_vnode, NULL, 0, SCARG(uap, tptr),
3882 	    UIO_USERSPACE);
3883 	fd_putfile(SCARG(uap, fd));
3884 	return (error);
3885 }
3886 
3887 int
3888 sys_futimens(struct lwp *l, const struct sys_futimens_args *uap,
3889     register_t *retval)
3890 {
3891 	/* {
3892 		syscallarg(int) fd;
3893 		syscallarg(const struct timespec *) tptr;
3894 	} */
3895 	int error;
3896 	file_t *fp;
3897 
3898 	/* fd_getvnode() will use the descriptor for us */
3899 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3900 		return (error);
3901 	error = do_sys_utimensat(l, AT_FDCWD, fp->f_vnode, NULL, 0,
3902 	    SCARG(uap, tptr), UIO_USERSPACE);
3903 	fd_putfile(SCARG(uap, fd));
3904 	return (error);
3905 }
3906 
3907 /*
3908  * Set the access and modification times given a path name; this
3909  * version does not follow links.
3910  */
3911 int
3912 sys___lutimes50(struct lwp *l, const struct sys___lutimes50_args *uap,
3913     register_t *retval)
3914 {
3915 	/* {
3916 		syscallarg(const char *) path;
3917 		syscallarg(const struct timeval *) tptr;
3918 	} */
3919 
3920 	return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
3921 	    SCARG(uap, tptr), UIO_USERSPACE);
3922 }
3923 
3924 int
3925 sys_utimensat(struct lwp *l, const struct sys_utimensat_args *uap,
3926     register_t *retval)
3927 {
3928 	/* {
3929 		syscallarg(int) fd;
3930 		syscallarg(const char *) path;
3931 		syscallarg(const struct timespec *) tptr;
3932 		syscallarg(int) flag;
3933 	} */
3934 	int follow;
3935 	const struct timespec *tptr;
3936 	int error;
3937 
3938 	tptr = SCARG(uap, tptr);
3939 	follow = (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3940 
3941 	error = do_sys_utimensat(l, SCARG(uap, fd), NULL,
3942 	    SCARG(uap, path), follow, tptr, UIO_USERSPACE);
3943 
3944 	return error;
3945 }
3946 
3947 /*
3948  * Common routine to set access and modification times given a vnode.
3949  */
3950 int
3951 do_sys_utimens(struct lwp *l, struct vnode *vp, const char *path, int flag,
3952     const struct timespec *tptr, enum uio_seg seg)
3953 {
3954 	return do_sys_utimensat(l, AT_FDCWD, vp, path, flag, tptr, seg);
3955 }
3956 
3957 int
3958 do_sys_utimensat(struct lwp *l, int fdat, struct vnode *vp,
3959     const char *path, int flag, const struct timespec *tptr, enum uio_seg seg)
3960 {
3961 	struct vattr vattr;
3962 	int error, dorele = 0;
3963 	namei_simple_flags_t sflags;
3964 	bool vanull, setbirthtime;
3965 	struct timespec ts[2];
3966 
3967 	KASSERT(l != NULL || fdat == AT_FDCWD);
3968 
3969 	/*
3970 	 * I have checked all callers and they pass either FOLLOW,
3971 	 * NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW
3972 	 * is 0. More to the point, they don't pass anything else.
3973 	 * Let's keep it that way at least until the namei interfaces
3974 	 * are fully sanitized.
3975 	 */
3976 	KASSERT(flag == NOFOLLOW || flag == FOLLOW);
3977 	sflags = (flag == FOLLOW) ?
3978 		NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT;
3979 
3980 	if (tptr == NULL) {
3981 		vanull = true;
3982 		nanotime(&ts[0]);
3983 		ts[1] = ts[0];
3984 	} else {
3985 		vanull = false;
3986 		if (seg != UIO_SYSSPACE) {
3987 			error = copyin(tptr, ts, sizeof (ts));
3988 			if (error != 0)
3989 				return error;
3990 		} else {
3991 			ts[0] = tptr[0];
3992 			ts[1] = tptr[1];
3993 		}
3994 	}
3995 
3996 	if (ts[0].tv_nsec == UTIME_NOW) {
3997 		nanotime(&ts[0]);
3998 		if (ts[1].tv_nsec == UTIME_NOW) {
3999 			vanull = true;
4000 			ts[1] = ts[0];
4001 		}
4002 	} else if (ts[1].tv_nsec == UTIME_NOW)
4003 		nanotime(&ts[1]);
4004 
4005 	if (vp == NULL) {
4006 		/* note: SEG describes TPTR, not PATH; PATH is always user */
4007 		error = fd_nameiat_simple_user(l, fdat, path, sflags, &vp);
4008 		if (error != 0)
4009 			return error;
4010 		dorele = 1;
4011 	}
4012 
4013 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4014 	setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 &&
4015 	    timespeccmp(&ts[1], &vattr.va_birthtime, <));
4016 	vattr_null(&vattr);
4017 
4018 	if (ts[0].tv_nsec != UTIME_OMIT)
4019 		vattr.va_atime = ts[0];
4020 
4021 	if (ts[1].tv_nsec != UTIME_OMIT) {
4022 		vattr.va_mtime = ts[1];
4023 		if (setbirthtime)
4024 			vattr.va_birthtime = ts[1];
4025 	}
4026 
4027 	if (vanull)
4028 		vattr.va_vaflags |= VA_UTIMES_NULL;
4029 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
4030 	VOP_UNLOCK(vp);
4031 
4032 	if (dorele != 0)
4033 		vrele(vp);
4034 
4035 	return error;
4036 }
4037 
4038 int
4039 do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
4040     const struct timeval *tptr, enum uio_seg seg)
4041 {
4042 	struct timespec ts[2];
4043 	struct timespec *tsptr = NULL;
4044 	int error;
4045 
4046 	if (tptr != NULL) {
4047 		struct timeval tv[2];
4048 
4049 		if (seg != UIO_SYSSPACE) {
4050 			error = copyin(tptr, tv, sizeof(tv));
4051 			if (error != 0)
4052 				return error;
4053 			tptr = tv;
4054 		}
4055 
4056 		if ((tptr[0].tv_usec == UTIME_NOW) ||
4057 		    (tptr[0].tv_usec == UTIME_OMIT))
4058 			ts[0].tv_nsec = tptr[0].tv_usec;
4059 		else {
4060 			if (tptr[0].tv_usec < 0 || tptr[0].tv_usec >= 1000000)
4061 				return EINVAL;
4062 
4063 			TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]);
4064 		}
4065 
4066 		if ((tptr[1].tv_usec == UTIME_NOW) ||
4067 		    (tptr[1].tv_usec == UTIME_OMIT))
4068 			ts[1].tv_nsec = tptr[1].tv_usec;
4069 		else {
4070 			if (tptr[1].tv_usec < 0 || tptr[1].tv_usec >= 1000000)
4071 				return EINVAL;
4072 
4073 			TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]);
4074 		}
4075 
4076 		tsptr = &ts[0];
4077 	}
4078 
4079 	return do_sys_utimens(l, vp, path, flag, tsptr, UIO_SYSSPACE);
4080 }
4081 
4082 /*
4083  * Truncate a file given its path name.
4084  */
4085 /* ARGSUSED */
4086 int
4087 sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval)
4088 {
4089 	/* {
4090 		syscallarg(const char *) path;
4091 		syscallarg(int) pad;
4092 		syscallarg(off_t) length;
4093 	} */
4094 	struct vnode *vp;
4095 	struct vattr vattr;
4096 	int error;
4097 
4098 	if (SCARG(uap, length) < 0)
4099 		return EINVAL;
4100 
4101 	error = namei_simple_user(SCARG(uap, path),
4102 				NSM_FOLLOW_TRYEMULROOT, &vp);
4103 	if (error != 0)
4104 		return (error);
4105 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4106 	if (vp->v_type == VDIR)
4107 		error = EISDIR;
4108 	else if ((error = vn_writechk(vp)) == 0 &&
4109 	    (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
4110 		vattr_null(&vattr);
4111 		vattr.va_size = SCARG(uap, length);
4112 		error = VOP_SETATTR(vp, &vattr, l->l_cred);
4113 	}
4114 	vput(vp);
4115 	return (error);
4116 }
4117 
4118 /*
4119  * Truncate a file given a file descriptor.
4120  */
4121 /* ARGSUSED */
4122 int
4123 sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval)
4124 {
4125 	/* {
4126 		syscallarg(int) fd;
4127 		syscallarg(int) pad;
4128 		syscallarg(off_t) length;
4129 	} */
4130 	struct vattr vattr;
4131 	struct vnode *vp;
4132 	file_t *fp;
4133 	int error;
4134 
4135 	if (SCARG(uap, length) < 0)
4136 		return EINVAL;
4137 
4138 	/* fd_getvnode() will use the descriptor for us */
4139 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4140 		return (error);
4141 	if ((fp->f_flag & FWRITE) == 0) {
4142 		error = EINVAL;
4143 		goto out;
4144 	}
4145 	vp = fp->f_vnode;
4146 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4147 	if (vp->v_type == VDIR)
4148 		error = EISDIR;
4149 	else if ((error = vn_writechk(vp)) == 0) {
4150 		vattr_null(&vattr);
4151 		vattr.va_size = SCARG(uap, length);
4152 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
4153 	}
4154 	VOP_UNLOCK(vp);
4155  out:
4156 	fd_putfile(SCARG(uap, fd));
4157 	return (error);
4158 }
4159 
4160 /*
4161  * Sync an open file.
4162  */
4163 /* ARGSUSED */
4164 int
4165 sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
4166 {
4167 	/* {
4168 		syscallarg(int) fd;
4169 	} */
4170 	struct vnode *vp;
4171 	file_t *fp;
4172 	int error;
4173 
4174 	/* fd_getvnode() will use the descriptor for us */
4175 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4176 		return (error);
4177 	vp = fp->f_vnode;
4178 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4179 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
4180 	VOP_UNLOCK(vp);
4181 	fd_putfile(SCARG(uap, fd));
4182 	return (error);
4183 }
4184 
4185 /*
4186  * Sync a range of file data.  API modeled after that found in AIX.
4187  *
4188  * FDATASYNC indicates that we need only save enough metadata to be able
4189  * to re-read the written data.
4190  */
4191 /* ARGSUSED */
4192 int
4193 sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval)
4194 {
4195 	/* {
4196 		syscallarg(int) fd;
4197 		syscallarg(int) flags;
4198 		syscallarg(off_t) start;
4199 		syscallarg(off_t) length;
4200 	} */
4201 	struct vnode *vp;
4202 	file_t *fp;
4203 	int flags, nflags;
4204 	off_t s, e, len;
4205 	int error;
4206 
4207 	/* fd_getvnode() will use the descriptor for us */
4208 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4209 		return (error);
4210 
4211 	if ((fp->f_flag & FWRITE) == 0) {
4212 		error = EBADF;
4213 		goto out;
4214 	}
4215 
4216 	flags = SCARG(uap, flags);
4217 	if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
4218 	    ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
4219 		error = EINVAL;
4220 		goto out;
4221 	}
4222 	/* Now set up the flags for value(s) to pass to VOP_FSYNC() */
4223 	if (flags & FDATASYNC)
4224 		nflags = FSYNC_DATAONLY | FSYNC_WAIT;
4225 	else
4226 		nflags = FSYNC_WAIT;
4227 	if (flags & FDISKSYNC)
4228 		nflags |= FSYNC_CACHE;
4229 
4230 	len = SCARG(uap, length);
4231 	/* If length == 0, we do the whole file, and s = e = 0 will do that */
4232 	if (len) {
4233 		s = SCARG(uap, start);
4234 		if (s < 0 || len < 0 || len > OFF_T_MAX - s) {
4235 			error = EINVAL;
4236 			goto out;
4237 		}
4238 		e = s + len;
4239 		KASSERT(s <= e);
4240 	} else {
4241 		e = 0;
4242 		s = 0;
4243 	}
4244 
4245 	vp = fp->f_vnode;
4246 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4247 	error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
4248 	VOP_UNLOCK(vp);
4249 out:
4250 	fd_putfile(SCARG(uap, fd));
4251 	return (error);
4252 }
4253 
4254 /*
4255  * Sync the data of an open file.
4256  */
4257 /* ARGSUSED */
4258 int
4259 sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval)
4260 {
4261 	/* {
4262 		syscallarg(int) fd;
4263 	} */
4264 	struct vnode *vp;
4265 	file_t *fp;
4266 	int error;
4267 
4268 	/* fd_getvnode() will use the descriptor for us */
4269 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4270 		return (error);
4271 	vp = fp->f_vnode;
4272 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4273 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
4274 	VOP_UNLOCK(vp);
4275 	fd_putfile(SCARG(uap, fd));
4276 	return (error);
4277 }
4278 
4279 /*
4280  * Rename files, (standard) BSD semantics frontend.
4281  */
4282 /* ARGSUSED */
4283 int
4284 sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval)
4285 {
4286 	/* {
4287 		syscallarg(const char *) from;
4288 		syscallarg(const char *) to;
4289 	} */
4290 
4291 	return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4292 	    SCARG(uap, to), UIO_USERSPACE, 0));
4293 }
4294 
4295 int
4296 sys_renameat(struct lwp *l, const struct sys_renameat_args *uap,
4297     register_t *retval)
4298 {
4299 	/* {
4300 		syscallarg(int) fromfd;
4301 		syscallarg(const char *) from;
4302 		syscallarg(int) tofd;
4303 		syscallarg(const char *) to;
4304 	} */
4305 
4306 	return (do_sys_renameat(l, SCARG(uap, fromfd), SCARG(uap, from),
4307 	    SCARG(uap, tofd), SCARG(uap, to), UIO_USERSPACE, 0));
4308 }
4309 
4310 /*
4311  * Rename files, POSIX semantics frontend.
4312  */
4313 /* ARGSUSED */
4314 int
4315 sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval)
4316 {
4317 	/* {
4318 		syscallarg(const char *) from;
4319 		syscallarg(const char *) to;
4320 	} */
4321 
4322 	return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4323 	    SCARG(uap, to), UIO_USERSPACE, 1));
4324 }
4325 
4326 /*
4327  * Rename files.  Source and destination must either both be directories,
4328  * or both not be directories.  If target is a directory, it must be empty.
4329  * If `from' and `to' refer to the same object, the value of the `retain'
4330  * argument is used to determine whether `from' will be
4331  *
4332  * (retain == 0)	deleted unless `from' and `to' refer to the same
4333  *			object in the file system's name space (BSD).
4334  * (retain == 1)	always retained (POSIX).
4335  *
4336  * XXX Synchronize with nfsrv_rename in nfs_serv.c.
4337  */
4338 int
4339 do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
4340 {
4341 	return do_sys_renameat(NULL, AT_FDCWD, from, AT_FDCWD, to, seg, retain);
4342 }
4343 
4344 static int
4345 do_sys_renameat(struct lwp *l, int fromfd, const char *from, int tofd,
4346     const char *to, enum uio_seg seg, int retain)
4347 {
4348 	struct pathbuf *fpb, *tpb;
4349 	struct nameidata fnd, tnd;
4350 	struct vnode *fdvp, *fvp;
4351 	struct vnode *tdvp, *tvp;
4352 	struct mount *mp, *tmp;
4353 	int error;
4354 
4355 	KASSERT(l != NULL || (fromfd == AT_FDCWD && tofd == AT_FDCWD));
4356 
4357 	error = pathbuf_maybe_copyin(from, seg, &fpb);
4358 	if (error)
4359 		goto out0;
4360 	KASSERT(fpb != NULL);
4361 
4362 	error = pathbuf_maybe_copyin(to, seg, &tpb);
4363 	if (error)
4364 		goto out1;
4365 	KASSERT(tpb != NULL);
4366 
4367 	/*
4368 	 * Lookup from.
4369 	 *
4370 	 * XXX LOCKPARENT is wrong because we don't actually want it
4371 	 * locked yet, but (a) namei is insane, and (b) VOP_RENAME is
4372 	 * insane, so for the time being we need to leave it like this.
4373 	 */
4374 	NDINIT(&fnd, DELETE, (LOCKPARENT | TRYEMULROOT), fpb);
4375 	if ((error = fd_nameiat(l, fromfd, &fnd)) != 0)
4376 		goto out2;
4377 
4378 	/*
4379 	 * Pull out the important results of the lookup, fdvp and fvp.
4380 	 * Of course, fvp is bogus because we're about to unlock fdvp.
4381 	 */
4382 	fdvp = fnd.ni_dvp;
4383 	fvp = fnd.ni_vp;
4384 	mp = fdvp->v_mount;
4385 	KASSERT(fdvp != NULL);
4386 	KASSERT(fvp != NULL);
4387 	KASSERT((fdvp == fvp) || (VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE));
4388 	/*
4389 	 * Bracket the operation with fstrans_start()/fstrans_done().
4390 	 *
4391 	 * Inside the bracket this file system cannot be unmounted so
4392 	 * a vnode on this file system cannot change its v_mount.
4393 	 * A vnode on another file system may still change to dead mount.
4394 	 */
4395 	fstrans_start(mp);
4396 
4397 	/*
4398 	 * Make sure neither fdvp nor fvp is locked.
4399 	 */
4400 	if (fdvp != fvp)
4401 		VOP_UNLOCK(fdvp);
4402 	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4403 	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4404 
4405 	/*
4406 	 * Reject renaming `.' and `..'.  Can't do this until after
4407 	 * namei because we need namei's parsing to find the final
4408 	 * component name.  (namei should just leave us with the final
4409 	 * component name and not look it up itself, but anyway...)
4410 	 *
4411 	 * This was here before because we used to relookup from
4412 	 * instead of to and relookup requires the caller to check
4413 	 * this, but now file systems may depend on this check, so we
4414 	 * must retain it until the file systems are all rototilled.
4415 	 */
4416 	if (((fnd.ni_cnd.cn_namelen == 1) &&
4417 		(fnd.ni_cnd.cn_nameptr[0] == '.')) ||
4418 	    ((fnd.ni_cnd.cn_namelen == 2) &&
4419 		(fnd.ni_cnd.cn_nameptr[0] == '.') &&
4420 		(fnd.ni_cnd.cn_nameptr[1] == '.'))) {
4421 		error = EINVAL;	/* XXX EISDIR?  */
4422 		goto abort0;
4423 	}
4424 
4425 	/*
4426 	 * Lookup to.
4427 	 *
4428 	 * XXX LOCKPARENT is wrong, but...insanity, &c.  Also, using
4429 	 * fvp here to decide whether to add CREATEDIR is a load of
4430 	 * bollocks because fvp might be the wrong node by now, since
4431 	 * fdvp is unlocked.
4432 	 *
4433 	 * XXX Why not pass CREATEDIR always?
4434 	 */
4435 	NDINIT(&tnd, RENAME,
4436 	    (LOCKPARENT | NOCACHE | TRYEMULROOT |
4437 		((fvp->v_type == VDIR)? CREATEDIR : 0)),
4438 	    tpb);
4439 	if ((error = fd_nameiat(l, tofd, &tnd)) != 0)
4440 		goto abort0;
4441 
4442 	/*
4443 	 * Pull out the important results of the lookup, tdvp and tvp.
4444 	 * Of course, tvp is bogus because we're about to unlock tdvp.
4445 	 */
4446 	tdvp = tnd.ni_dvp;
4447 	tvp = tnd.ni_vp;
4448 	KASSERT(tdvp != NULL);
4449 	KASSERT((tdvp == tvp) || (VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE));
4450 
4451 	if (fvp->v_type == VDIR)
4452 		tnd.ni_cnd.cn_flags |= WILLBEDIR;
4453 	/*
4454 	 * Make sure neither tdvp nor tvp is locked.
4455 	 */
4456 	if (tdvp != tvp)
4457 		VOP_UNLOCK(tdvp);
4458 	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
4459 	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
4460 
4461 	/*
4462 	 * Reject renaming onto `.' or `..'.  relookup is unhappy with
4463 	 * these, which is why we must do this here.  Once upon a time
4464 	 * we relooked up from instead of to, and consequently didn't
4465 	 * need this check, but now that we relookup to instead of
4466 	 * from, we need this; and we shall need it forever forward
4467 	 * until the VOP_RENAME protocol changes, because file systems
4468 	 * will no doubt begin to depend on this check.
4469 	 */
4470 	if ((tnd.ni_cnd.cn_namelen == 1) && (tnd.ni_cnd.cn_nameptr[0] == '.')) {
4471 		error = EISDIR;
4472 		goto abort1;
4473 	}
4474 	if ((tnd.ni_cnd.cn_namelen == 2) &&
4475 	    (tnd.ni_cnd.cn_nameptr[0] == '.') &&
4476 	    (tnd.ni_cnd.cn_nameptr[1] == '.')) {
4477 		error = EINVAL;
4478 		goto abort1;
4479 	}
4480 
4481 	/*
4482 	 * Make sure the mount points match.  Although we don't hold
4483 	 * any vnode locks, the v_mount on fdvp file system are stable.
4484 	 *
4485 	 * Unmounting another file system at an inopportune moment may
4486 	 * cause tdvp to disappear and change its v_mount to dead.
4487 	 *
4488 	 * So in either case different v_mount means cross-device rename.
4489 	 */
4490 	KASSERT(mp != NULL);
4491 	tmp = tdvp->v_mount;
4492 
4493 	if (mp != tmp) {
4494 		error = EXDEV;
4495 		goto abort1;
4496 	}
4497 
4498 	/*
4499 	 * Take the vfs rename lock to avoid cross-directory screw cases.
4500 	 * Nothing is locked currently, so taking this lock is safe.
4501 	 */
4502 	error = VFS_RENAMELOCK_ENTER(mp);
4503 	if (error)
4504 		goto abort1;
4505 
4506 	/*
4507 	 * Now fdvp, fvp, tdvp, and (if nonnull) tvp are referenced,
4508 	 * and nothing is locked except for the vfs rename lock.
4509 	 *
4510 	 * The next step is a little rain dance to conform to the
4511 	 * insane lock protocol, even though it does nothing to ward
4512 	 * off race conditions.
4513 	 *
4514 	 * We need tdvp and tvp to be locked.  However, because we have
4515 	 * unlocked tdvp in order to hold no locks while we take the
4516 	 * vfs rename lock, tvp may be wrong here, and we can't safely
4517 	 * lock it even if the sensible file systems will just unlock
4518 	 * it straight away.  Consequently, we must lock tdvp and then
4519 	 * relookup tvp to get it locked.
4520 	 *
4521 	 * Finally, because the VOP_RENAME protocol is brain-damaged
4522 	 * and various file systems insanely depend on the semantics of
4523 	 * this brain damage, the lookup of to must be the last lookup
4524 	 * before VOP_RENAME.
4525 	 */
4526 	vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
4527 	error = relookup(tdvp, &tnd.ni_vp, &tnd.ni_cnd, 0);
4528 	if (error)
4529 		goto abort2;
4530 
4531 	/*
4532 	 * Drop the old tvp and pick up the new one -- which might be
4533 	 * the same, but that doesn't matter to us.  After this, tdvp
4534 	 * and tvp should both be locked.
4535 	 */
4536 	if (tvp != NULL)
4537 		vrele(tvp);
4538 	tvp = tnd.ni_vp;
4539 	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4540 	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
4541 
4542 	/*
4543 	 * The old do_sys_rename had various consistency checks here
4544 	 * involving fvp and tvp.  fvp is bogus already here, and tvp
4545 	 * will become bogus soon in any sensible file system, so the
4546 	 * only purpose in putting these checks here is to give lip
4547 	 * service to these screw cases and to acknowledge that they
4548 	 * exist, not actually to handle them, but here you go
4549 	 * anyway...
4550 	 */
4551 
4552 	/*
4553 	 * Acknowledge that directories and non-directories aren't
4554 	 * suposed to mix.
4555 	 */
4556 	if (tvp != NULL) {
4557 		if ((fvp->v_type == VDIR) && (tvp->v_type != VDIR)) {
4558 			error = ENOTDIR;
4559 			goto abort3;
4560 		} else if ((fvp->v_type != VDIR) && (tvp->v_type == VDIR)) {
4561 			error = EISDIR;
4562 			goto abort3;
4563 		}
4564 	}
4565 
4566 	/*
4567 	 * Acknowledge some random screw case, among the dozens that
4568 	 * might arise.
4569 	 */
4570 	if (fvp == tdvp) {
4571 		error = EINVAL;
4572 		goto abort3;
4573 	}
4574 
4575 	/*
4576 	 * Acknowledge that POSIX has a wacky screw case.
4577 	 *
4578 	 * XXX Eventually the retain flag needs to be passed on to
4579 	 * VOP_RENAME.
4580 	 */
4581 	if (fvp == tvp) {
4582 		if (retain) {
4583 			error = 0;
4584 			goto abort3;
4585 		} else if ((fdvp == tdvp) &&
4586 		    (fnd.ni_cnd.cn_namelen == tnd.ni_cnd.cn_namelen) &&
4587 		    (0 == memcmp(fnd.ni_cnd.cn_nameptr, tnd.ni_cnd.cn_nameptr,
4588 			fnd.ni_cnd.cn_namelen))) {
4589 			error = 0;
4590 			goto abort3;
4591 		}
4592 	}
4593 
4594 	/*
4595 	 * Make sure veriexec can screw us up.  (But a race can screw
4596 	 * up veriexec, of course -- remember, fvp and (soon) tvp are
4597 	 * bogus.)
4598 	 */
4599 #if NVERIEXEC > 0
4600 	{
4601 		char *f1, *f2;
4602 		size_t f1_len;
4603 		size_t f2_len;
4604 
4605 		f1_len = fnd.ni_cnd.cn_namelen + 1;
4606 		f1 = kmem_alloc(f1_len, KM_SLEEP);
4607 		strlcpy(f1, fnd.ni_cnd.cn_nameptr, f1_len);
4608 
4609 		f2_len = tnd.ni_cnd.cn_namelen + 1;
4610 		f2 = kmem_alloc(f2_len, KM_SLEEP);
4611 		strlcpy(f2, tnd.ni_cnd.cn_nameptr, f2_len);
4612 
4613 		error = veriexec_renamechk(curlwp, fvp, f1, tvp, f2);
4614 
4615 		kmem_free(f1, f1_len);
4616 		kmem_free(f2, f2_len);
4617 
4618 		if (error)
4619 			goto abort3;
4620 	}
4621 #endif /* NVERIEXEC > 0 */
4622 
4623 	/*
4624 	 * All ready.  Incant the rename vop.
4625 	 */
4626 	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4627 	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4628 	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4629 	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
4630 	error = VOP_RENAME(fdvp, fvp, &fnd.ni_cnd, tdvp, tvp, &tnd.ni_cnd);
4631 
4632 	/*
4633 	 * VOP_RENAME releases fdvp, fvp, tdvp, and tvp, and unlocks
4634 	 * tdvp and tvp.  But we can't assert any of that.
4635 	 */
4636 	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4637 	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4638 	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
4639 	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
4640 
4641 	/*
4642 	 * So all we have left to do is to drop the rename lock and
4643 	 * destroy the pathbufs.
4644 	 */
4645 	VFS_RENAMELOCK_EXIT(mp);
4646 	fstrans_done(mp);
4647 	goto out2;
4648 
4649 abort3:	if ((tvp != NULL) && (tvp != tdvp))
4650 		VOP_UNLOCK(tvp);
4651 abort2:	VOP_UNLOCK(tdvp);
4652 	VFS_RENAMELOCK_EXIT(mp);
4653 abort1:	VOP_ABORTOP(tdvp, &tnd.ni_cnd);
4654 	vrele(tdvp);
4655 	if (tvp != NULL)
4656 		vrele(tvp);
4657 abort0:	VOP_ABORTOP(fdvp, &fnd.ni_cnd);
4658 	vrele(fdvp);
4659 	vrele(fvp);
4660 	fstrans_done(mp);
4661 out2:	pathbuf_destroy(tpb);
4662 out1:	pathbuf_destroy(fpb);
4663 out0:	return error;
4664 }
4665 
4666 /*
4667  * Make a directory file.
4668  */
4669 /* ARGSUSED */
4670 int
4671 sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
4672 {
4673 	/* {
4674 		syscallarg(const char *) path;
4675 		syscallarg(int) mode;
4676 	} */
4677 
4678 	return do_sys_mkdirat(l, AT_FDCWD, SCARG(uap, path),
4679 	    SCARG(uap, mode), UIO_USERSPACE);
4680 }
4681 
4682 int
4683 sys_mkdirat(struct lwp *l, const struct sys_mkdirat_args *uap,
4684     register_t *retval)
4685 {
4686 	/* {
4687 		syscallarg(int) fd;
4688 		syscallarg(const char *) path;
4689 		syscallarg(int) mode;
4690 	} */
4691 
4692 	return do_sys_mkdirat(l, SCARG(uap, fd), SCARG(uap, path),
4693 	    SCARG(uap, mode), UIO_USERSPACE);
4694 }
4695 
4696 
4697 int
4698 do_sys_mkdir(const char *path, mode_t mode, enum uio_seg seg)
4699 {
4700 	return do_sys_mkdirat(NULL, AT_FDCWD, path, mode, seg);
4701 }
4702 
4703 static int
4704 do_sys_mkdirat(struct lwp *l, int fdat, const char *path, mode_t mode,
4705     enum uio_seg seg)
4706 {
4707 	struct proc *p = curlwp->l_proc;
4708 	struct vnode *vp;
4709 	struct vattr vattr;
4710 	int error;
4711 	struct pathbuf *pb;
4712 	struct nameidata nd;
4713 
4714 	KASSERT(l != NULL || fdat == AT_FDCWD);
4715 
4716 	/* XXX bollocks, should pass in a pathbuf */
4717 	error = pathbuf_maybe_copyin(path, seg, &pb);
4718 	if (error) {
4719 		return error;
4720 	}
4721 
4722 	NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT, pb);
4723 
4724 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
4725 		pathbuf_destroy(pb);
4726 		return (error);
4727 	}
4728 	vp = nd.ni_vp;
4729 	if (vp != NULL) {
4730 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
4731 		if (nd.ni_dvp == vp)
4732 			vrele(nd.ni_dvp);
4733 		else
4734 			vput(nd.ni_dvp);
4735 		vrele(vp);
4736 		pathbuf_destroy(pb);
4737 		return (EEXIST);
4738 	}
4739 	vattr_null(&vattr);
4740 	vattr.va_type = VDIR;
4741 	/* We will read cwdi->cwdi_cmask unlocked. */
4742 	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
4743 	nd.ni_cnd.cn_flags |= WILLBEDIR;
4744 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
4745 	if (!error)
4746 		vrele(nd.ni_vp);
4747 	vput(nd.ni_dvp);
4748 	pathbuf_destroy(pb);
4749 	return (error);
4750 }
4751 
4752 /*
4753  * Remove a directory file.
4754  */
4755 /* ARGSUSED */
4756 int
4757 sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
4758 {
4759 	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path),
4760 	    AT_REMOVEDIR, UIO_USERSPACE);
4761 }
4762 
4763 /*
4764  * Read a block of directory entries in a file system independent format.
4765  */
4766 int
4767 sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval)
4768 {
4769 	/* {
4770 		syscallarg(int) fd;
4771 		syscallarg(char *) buf;
4772 		syscallarg(size_t) count;
4773 	} */
4774 	file_t *fp;
4775 	int error, done;
4776 
4777 	/* fd_getvnode() will use the descriptor for us */
4778 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4779 		return (error);
4780 	if ((fp->f_flag & FREAD) == 0) {
4781 		error = EBADF;
4782 		goto out;
4783 	}
4784 	error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
4785 			SCARG(uap, count), &done, l, 0, 0);
4786 	ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
4787 	*retval = done;
4788  out:
4789 	fd_putfile(SCARG(uap, fd));
4790 	return (error);
4791 }
4792 
4793 /*
4794  * Set the mode mask for creation of filesystem nodes.
4795  */
4796 int
4797 sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
4798 {
4799 	/* {
4800 		syscallarg(mode_t) newmask;
4801 	} */
4802 
4803 	/*
4804 	 * cwdi->cwdi_cmask will be read unlocked elsewhere, and no kind of
4805 	 * serialization with those reads is required.  It's important to
4806 	 * return a coherent answer for the caller of umask() though, and
4807 	 * the atomic operation accomplishes that.
4808 	 */
4809 	*retval = atomic_swap_uint(&curproc->p_cwdi->cwdi_cmask,
4810 	    SCARG(uap, newmask) & ALLPERMS);
4811 
4812 	return (0);
4813 }
4814 
4815 int
4816 dorevoke(struct vnode *vp, kauth_cred_t cred)
4817 {
4818 	struct vattr vattr;
4819 	int error, fs_decision;
4820 
4821 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4822 	error = VOP_GETATTR(vp, &vattr, cred);
4823 	VOP_UNLOCK(vp);
4824 	if (error != 0)
4825 		return error;
4826 	fs_decision = (kauth_cred_geteuid(cred) == vattr.va_uid) ? 0 : EPERM;
4827 	error = kauth_authorize_vnode(cred, KAUTH_VNODE_REVOKE, vp, NULL,
4828 	    fs_decision);
4829 	if (!error)
4830 		VOP_REVOKE(vp, REVOKEALL);
4831 	return (error);
4832 }
4833 
4834 /*
4835  * Void all references to file by ripping underlying filesystem
4836  * away from vnode.
4837  */
4838 /* ARGSUSED */
4839 int
4840 sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval)
4841 {
4842 	/* {
4843 		syscallarg(const char *) path;
4844 	} */
4845 	struct vnode *vp;
4846 	int error;
4847 
4848 	error = namei_simple_user(SCARG(uap, path),
4849 				NSM_FOLLOW_TRYEMULROOT, &vp);
4850 	if (error != 0)
4851 		return (error);
4852 	error = dorevoke(vp, l->l_cred);
4853 	vrele(vp);
4854 	return (error);
4855 }
4856 
4857 /*
4858  * Allocate backing store for a file, filling a hole without having to
4859  * explicitly write anything out.
4860  */
4861 /* ARGSUSED */
4862 int
4863 sys_posix_fallocate(struct lwp *l, const struct sys_posix_fallocate_args *uap,
4864 		register_t *retval)
4865 {
4866 	/* {
4867 		syscallarg(int) fd;
4868 		syscallarg(off_t) pos;
4869 		syscallarg(off_t) len;
4870 	} */
4871 	int fd;
4872 	off_t pos, len;
4873 	struct file *fp;
4874 	struct vnode *vp;
4875 	int error;
4876 
4877 	fd = SCARG(uap, fd);
4878 	pos = SCARG(uap, pos);
4879 	len = SCARG(uap, len);
4880 
4881 	if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
4882 		*retval = EINVAL;
4883 		return 0;
4884 	}
4885 
4886 	error = fd_getvnode(fd, &fp);
4887 	if (error) {
4888 		*retval = error;
4889 		return 0;
4890 	}
4891 	if ((fp->f_flag & FWRITE) == 0) {
4892 		error = EBADF;
4893 		goto fail;
4894 	}
4895 	vp = fp->f_vnode;
4896 
4897 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4898 	if (vp->v_type == VDIR) {
4899 		error = EISDIR;
4900 	} else {
4901 		error = VOP_FALLOCATE(vp, pos, len);
4902 	}
4903 	VOP_UNLOCK(vp);
4904 
4905 fail:
4906 	fd_putfile(fd);
4907 	*retval = error;
4908 	return 0;
4909 }
4910 
4911 /*
4912  * Deallocate backing store for a file, creating a hole. Also used for
4913  * invoking TRIM on disks.
4914  */
4915 /* ARGSUSED */
4916 int
4917 sys_fdiscard(struct lwp *l, const struct sys_fdiscard_args *uap,
4918 		register_t *retval)
4919 {
4920 	/* {
4921 		syscallarg(int) fd;
4922 		syscallarg(off_t) pos;
4923 		syscallarg(off_t) len;
4924 	} */
4925 	int fd;
4926 	off_t pos, len;
4927 	struct file *fp;
4928 	struct vnode *vp;
4929 	int error;
4930 
4931 	fd = SCARG(uap, fd);
4932 	pos = SCARG(uap, pos);
4933 	len = SCARG(uap, len);
4934 
4935 	if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
4936 		return EINVAL;
4937 	}
4938 
4939 	error = fd_getvnode(fd, &fp);
4940 	if (error) {
4941 		return error;
4942 	}
4943 	if ((fp->f_flag & FWRITE) == 0) {
4944 		error = EBADF;
4945 		goto fail;
4946 	}
4947 	vp = fp->f_vnode;
4948 
4949 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4950 	if (vp->v_type == VDIR) {
4951 		error = EISDIR;
4952 	} else {
4953 		error = VOP_FDISCARD(vp, pos, len);
4954 	}
4955 	VOP_UNLOCK(vp);
4956 
4957 fail:
4958 	fd_putfile(fd);
4959 	return error;
4960 }
4961