xref: /netbsd-src/sys/kern/vfs_syscalls.c (revision a6f3f22f245acb8ee3bbf6871d7dce989204fa97)
1 /*	$NetBSD: vfs_syscalls.c,v 1.500 2015/07/24 13:02:52 maxv Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1989, 1993
34  *	The Regents of the University of California.  All rights reserved.
35  * (c) UNIX System Laboratories, Inc.
36  * All or some portions of this file are derived from material licensed
37  * to the University of California by American Telephone and Telegraph
38  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39  * the permission of UNIX System Laboratories, Inc.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.42 (Berkeley) 7/31/95
66  */
67 
68 /*
69  * Virtual File System System Calls
70  */
71 
72 #include <sys/cdefs.h>
73 __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.500 2015/07/24 13:02:52 maxv Exp $");
74 
75 #ifdef _KERNEL_OPT
76 #include "opt_fileassoc.h"
77 #include "veriexec.h"
78 #endif
79 
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/namei.h>
83 #include <sys/filedesc.h>
84 #include <sys/kernel.h>
85 #include <sys/file.h>
86 #include <sys/fcntl.h>
87 #include <sys/stat.h>
88 #include <sys/vnode.h>
89 #include <sys/mount.h>
90 #include <sys/proc.h>
91 #include <sys/uio.h>
92 #include <sys/kmem.h>
93 #include <sys/dirent.h>
94 #include <sys/sysctl.h>
95 #include <sys/syscallargs.h>
96 #include <sys/vfs_syscalls.h>
97 #include <sys/quota.h>
98 #include <sys/quotactl.h>
99 #include <sys/ktrace.h>
100 #ifdef FILEASSOC
101 #include <sys/fileassoc.h>
102 #endif /* FILEASSOC */
103 #include <sys/extattr.h>
104 #include <sys/verified_exec.h>
105 #include <sys/kauth.h>
106 #include <sys/atomic.h>
107 #include <sys/module.h>
108 #include <sys/buf.h>
109 
110 #include <miscfs/genfs/genfs.h>
111 #include <miscfs/specfs/specdev.h>
112 
113 #include <nfs/rpcv2.h>
114 #include <nfs/nfsproto.h>
115 #include <nfs/nfs.h>
116 #include <nfs/nfs_var.h>
117 
118 /* XXX this shouldn't be here */
119 #ifndef OFF_T_MAX
120 #define OFF_T_MAX __type_max(off_t)
121 #endif
122 
123 static int change_flags(struct vnode *, u_long, struct lwp *);
124 static int change_mode(struct vnode *, int, struct lwp *);
125 static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
126 static int do_sys_openat(lwp_t *, int, const char *, int, int, int *);
127 static int do_sys_mkdirat(struct lwp *l, int, const char *, mode_t,
128     enum uio_seg);
129 static int do_sys_mkfifoat(struct lwp *, int, const char *, mode_t);
130 static int do_sys_symlinkat(struct lwp *, const char *, int, const char *,
131     enum uio_seg);
132 static int do_sys_renameat(struct lwp *l, int, const char *, int, const char *,
133     enum uio_seg, int);
134 static int do_sys_readlinkat(struct lwp *, int, const char *, char *,
135     size_t, register_t *);
136 static int do_sys_unlinkat(struct lwp *, int, const char *, int, enum uio_seg);
137 
138 static int fd_nameiat(struct lwp *, int, struct nameidata *);
139 static int fd_nameiat_simple_user(struct lwp *, int, const char *,
140     namei_simple_flags_t, struct vnode **);
141 
142 
143 /*
144  * This table is used to maintain compatibility with 4.3BSD
145  * and NetBSD 0.9 mount syscalls - and possibly other systems.
146  * Note, the order is important!
147  *
148  * Do not modify this table. It should only contain filesystems
149  * supported by NetBSD 0.9 and 4.3BSD.
150  */
151 const char * const mountcompatnames[] = {
152 	NULL,		/* 0 = MOUNT_NONE */
153 	MOUNT_FFS,	/* 1 = MOUNT_UFS */
154 	MOUNT_NFS,	/* 2 */
155 	MOUNT_MFS,	/* 3 */
156 	MOUNT_MSDOS,	/* 4 */
157 	MOUNT_CD9660,	/* 5 = MOUNT_ISOFS */
158 	MOUNT_FDESC,	/* 6 */
159 	MOUNT_KERNFS,	/* 7 */
160 	NULL,		/* 8 = MOUNT_DEVFS */
161 	MOUNT_AFS,	/* 9 */
162 };
163 
164 const int nmountcompatnames = __arraycount(mountcompatnames);
165 
166 static int
167 fd_nameiat(struct lwp *l, int fdat, struct nameidata *ndp)
168 {
169 	file_t *dfp;
170 	int error;
171 
172 	if (fdat != AT_FDCWD) {
173 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
174 			goto out;
175 
176 		NDAT(ndp, dfp->f_vnode);
177 	}
178 
179 	error = namei(ndp);
180 
181 	if (fdat != AT_FDCWD)
182 		fd_putfile(fdat);
183 out:
184 	return error;
185 }
186 
187 static int
188 fd_nameiat_simple_user(struct lwp *l, int fdat, const char *path,
189     namei_simple_flags_t sflags, struct vnode **vp_ret)
190 {
191 	file_t *dfp;
192 	struct vnode *dvp;
193 	int error;
194 
195 	if (fdat != AT_FDCWD) {
196 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
197 			goto out;
198 
199 		dvp = dfp->f_vnode;
200 	} else {
201 		dvp = NULL;
202 	}
203 
204 	error = nameiat_simple_user(dvp, path, sflags, vp_ret);
205 
206 	if (fdat != AT_FDCWD)
207 		fd_putfile(fdat);
208 out:
209 	return error;
210 }
211 
212 static int
213 open_setfp(struct lwp *l, file_t *fp, struct vnode *vp, int indx, int flags)
214 {
215 	int error;
216 
217 	fp->f_flag = flags & FMASK;
218 	fp->f_type = DTYPE_VNODE;
219 	fp->f_ops = &vnops;
220 	fp->f_vnode = vp;
221 
222 	if (flags & (O_EXLOCK | O_SHLOCK)) {
223 		struct flock lf;
224 		int type;
225 
226 		lf.l_whence = SEEK_SET;
227 		lf.l_start = 0;
228 		lf.l_len = 0;
229 		if (flags & O_EXLOCK)
230 			lf.l_type = F_WRLCK;
231 		else
232 			lf.l_type = F_RDLCK;
233 		type = F_FLOCK;
234 		if ((flags & FNONBLOCK) == 0)
235 			type |= F_WAIT;
236 		VOP_UNLOCK(vp);
237 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
238 		if (error) {
239 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
240 			fd_abort(l->l_proc, fp, indx);
241 			return error;
242 		}
243 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
244 		atomic_or_uint(&fp->f_flag, FHASLOCK);
245 	}
246 	if (flags & O_CLOEXEC)
247 		fd_set_exclose(l, indx, true);
248 	return 0;
249 }
250 
251 static int
252 mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
253     void *data, size_t *data_len)
254 {
255 	struct mount *mp;
256 	int error = 0, saved_flags;
257 
258 	mp = vp->v_mount;
259 	saved_flags = mp->mnt_flag;
260 
261 	/* We can operate only on VV_ROOT nodes. */
262 	if ((vp->v_vflag & VV_ROOT) == 0) {
263 		error = EINVAL;
264 		goto out;
265 	}
266 
267 	/*
268 	 * We only allow the filesystem to be reloaded if it
269 	 * is currently mounted read-only.  Additionally, we
270 	 * prevent read-write to read-only downgrades.
271 	 */
272 	if ((flags & (MNT_RELOAD | MNT_RDONLY)) != 0 &&
273 	    (mp->mnt_flag & MNT_RDONLY) == 0 &&
274 	    (mp->mnt_iflag & IMNT_CAN_RWTORO) == 0) {
275 		error = EOPNOTSUPP;	/* Needs translation */
276 		goto out;
277 	}
278 
279 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
280 	    KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
281 	if (error)
282 		goto out;
283 
284 	if (vfs_busy(mp, NULL)) {
285 		error = EPERM;
286 		goto out;
287 	}
288 
289 	mutex_enter(&mp->mnt_updating);
290 
291 	mp->mnt_flag &= ~MNT_OP_FLAGS;
292 	mp->mnt_flag |= flags & MNT_OP_FLAGS;
293 
294 	/*
295 	 * Set the mount level flags.
296 	 */
297 	if (flags & MNT_RDONLY)
298 		mp->mnt_flag |= MNT_RDONLY;
299 	else if (mp->mnt_flag & MNT_RDONLY)
300 		mp->mnt_iflag |= IMNT_WANTRDWR;
301 	mp->mnt_flag &= ~MNT_BASIC_FLAGS;
302 	mp->mnt_flag |= flags & MNT_BASIC_FLAGS;
303 	error = VFS_MOUNT(mp, path, data, data_len);
304 
305 	if (error && data != NULL) {
306 		int error2;
307 
308 		/*
309 		 * Update failed; let's try and see if it was an
310 		 * export request.  For compat with 3.0 and earlier.
311 		 */
312 		error2 = vfs_hooks_reexport(mp, path, data);
313 
314 		/*
315 		 * Only update error code if the export request was
316 		 * understood but some problem occurred while
317 		 * processing it.
318 		 */
319 		if (error2 != EJUSTRETURN)
320 			error = error2;
321 	}
322 
323 	if (mp->mnt_iflag & IMNT_WANTRDWR)
324 		mp->mnt_flag &= ~MNT_RDONLY;
325 	if (error)
326 		mp->mnt_flag = saved_flags;
327 	mp->mnt_flag &= ~MNT_OP_FLAGS;
328 	mp->mnt_iflag &= ~IMNT_WANTRDWR;
329 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
330 		if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0)
331 			vfs_syncer_add_to_worklist(mp);
332 	} else {
333 		if ((mp->mnt_iflag & IMNT_ONWORKLIST) != 0)
334 			vfs_syncer_remove_from_worklist(mp);
335 	}
336 	mutex_exit(&mp->mnt_updating);
337 	vfs_unbusy(mp, false, NULL);
338 
339 	if ((error == 0) && !(saved_flags & MNT_EXTATTR) &&
340 	    (flags & MNT_EXTATTR)) {
341 		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_START,
342 				   NULL, 0, NULL) != 0) {
343 			printf("%s: failed to start extattr, error = %d",
344 			       mp->mnt_stat.f_mntonname, error);
345 			mp->mnt_flag &= ~MNT_EXTATTR;
346 		}
347 	}
348 
349 	if ((error == 0) && (saved_flags & MNT_EXTATTR) &&
350 	    !(flags & MNT_EXTATTR)) {
351 		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_STOP,
352 				   NULL, 0, NULL) != 0) {
353 			printf("%s: failed to stop extattr, error = %d",
354 			       mp->mnt_stat.f_mntonname, error);
355 			mp->mnt_flag |= MNT_RDONLY;
356 		}
357 	}
358  out:
359 	return (error);
360 }
361 
362 static int
363 mount_get_vfsops(const char *fstype, struct vfsops **vfsops)
364 {
365 	char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
366 	int error;
367 
368 	/* Copy file-system type from userspace.  */
369 	error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL);
370 	if (error) {
371 		/*
372 		 * Historically, filesystem types were identified by numbers.
373 		 * If we get an integer for the filesystem type instead of a
374 		 * string, we check to see if it matches one of the historic
375 		 * filesystem types.
376 		 */
377 		u_long fsindex = (u_long)fstype;
378 		if (fsindex >= nmountcompatnames ||
379 		    mountcompatnames[fsindex] == NULL)
380 			return ENODEV;
381 		strlcpy(fstypename, mountcompatnames[fsindex],
382 		    sizeof(fstypename));
383 	}
384 
385 	/* Accept `ufs' as an alias for `ffs', for compatibility. */
386 	if (strcmp(fstypename, "ufs") == 0)
387 		fstypename[0] = 'f';
388 
389 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
390 		return 0;
391 
392 	/* If we can autoload a vfs module, try again */
393 	(void)module_autoload(fstypename, MODULE_CLASS_VFS);
394 
395 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
396 		return 0;
397 
398 	return ENODEV;
399 }
400 
401 static int
402 mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
403     void *data, size_t *data_len)
404 {
405 	struct mount *mp;
406 	int error;
407 
408 	/* If MNT_GETARGS is specified, it should be the only flag. */
409 	if (flags & ~MNT_GETARGS)
410 		return EINVAL;
411 
412 	mp = vp->v_mount;
413 
414 	/* XXX: probably some notion of "can see" here if we want isolation. */
415 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
416 	    KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
417 	if (error)
418 		return error;
419 
420 	if ((vp->v_vflag & VV_ROOT) == 0)
421 		return EINVAL;
422 
423 	if (vfs_busy(mp, NULL))
424 		return EPERM;
425 
426 	mutex_enter(&mp->mnt_updating);
427 	mp->mnt_flag &= ~MNT_OP_FLAGS;
428 	mp->mnt_flag |= MNT_GETARGS;
429 	error = VFS_MOUNT(mp, path, data, data_len);
430 	mp->mnt_flag &= ~MNT_OP_FLAGS;
431 	mutex_exit(&mp->mnt_updating);
432 
433 	vfs_unbusy(mp, false, NULL);
434 	return (error);
435 }
436 
437 int
438 sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval)
439 {
440 	/* {
441 		syscallarg(const char *) type;
442 		syscallarg(const char *) path;
443 		syscallarg(int) flags;
444 		syscallarg(void *) data;
445 		syscallarg(size_t) data_len;
446 	} */
447 
448 	return do_sys_mount(l, NULL, SCARG(uap, type), SCARG(uap, path),
449 	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE,
450 	    SCARG(uap, data_len), retval);
451 }
452 
453 int
454 do_sys_mount(struct lwp *l, struct vfsops *vfsops, const char *type,
455     const char *path, int flags, void *data, enum uio_seg data_seg,
456     size_t data_len, register_t *retval)
457 {
458 	struct vnode *vp;
459 	void *data_buf = data;
460 	bool vfsopsrele = false;
461 	size_t alloc_sz = 0;
462 	int error;
463 
464 	/* XXX: The calling convention of this routine is totally bizarre */
465 	if (vfsops)
466 		vfsopsrele = true;
467 
468 	/*
469 	 * Get vnode to be covered
470 	 */
471 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
472 	if (error != 0) {
473 		vp = NULL;
474 		goto done;
475 	}
476 
477 	if (vfsops == NULL) {
478 		if (flags & (MNT_GETARGS | MNT_UPDATE)) {
479 			vfsops = vp->v_mount->mnt_op;
480 		} else {
481 			/* 'type' is userspace */
482 			error = mount_get_vfsops(type, &vfsops);
483 			if (error != 0)
484 				goto done;
485 			vfsopsrele = true;
486 		}
487 	}
488 
489 	/*
490 	 * We allow data to be NULL, even for userspace. Some fs's don't need
491 	 * it. The others will handle NULL.
492 	 */
493 	if (data != NULL && data_seg == UIO_USERSPACE) {
494 		if (data_len == 0) {
495 			/* No length supplied, use default for filesystem */
496 			data_len = vfsops->vfs_min_mount_data;
497 
498 			/*
499 			 * Hopefully a longer buffer won't make copyin() fail.
500 			 * For compatibility with 3.0 and earlier.
501 			 */
502 			if (flags & MNT_UPDATE
503 			    && data_len < sizeof (struct mnt_export_args30))
504 				data_len = sizeof (struct mnt_export_args30);
505 		}
506 		if ((data_len == 0) || (data_len > VFS_MAX_MOUNT_DATA)) {
507 			error = EINVAL;
508 			goto done;
509 		}
510 		alloc_sz = data_len;
511 		data_buf = kmem_alloc(alloc_sz, KM_SLEEP);
512 
513 		/* NFS needs the buffer even for mnt_getargs .... */
514 		error = copyin(data, data_buf, data_len);
515 		if (error != 0)
516 			goto done;
517 	}
518 
519 	if (flags & MNT_GETARGS) {
520 		if (data_len == 0) {
521 			error = EINVAL;
522 			goto done;
523 		}
524 		error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
525 		if (error != 0)
526 			goto done;
527 		if (data_seg == UIO_USERSPACE)
528 			error = copyout(data_buf, data, data_len);
529 		*retval = data_len;
530 	} else if (flags & MNT_UPDATE) {
531 		error = mount_update(l, vp, path, flags, data_buf, &data_len);
532 	} else {
533 		/* Locking is handled internally in mount_domount(). */
534 		KASSERT(vfsopsrele == true);
535 		error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
536 		    &data_len);
537 		vfsopsrele = false;
538 	}
539 
540     done:
541 	if (vfsopsrele)
542 		vfs_delref(vfsops);
543     	if (vp != NULL) {
544 	    	vrele(vp);
545 	}
546 	if (data_buf != data)
547 		kmem_free(data_buf, alloc_sz);
548 	return (error);
549 }
550 
551 /*
552  * Unmount a file system.
553  *
554  * Note: unmount takes a path to the vnode mounted on as argument,
555  * not special file (as before).
556  */
557 /* ARGSUSED */
558 int
559 sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval)
560 {
561 	/* {
562 		syscallarg(const char *) path;
563 		syscallarg(int) flags;
564 	} */
565 	struct vnode *vp;
566 	struct mount *mp;
567 	int error;
568 	struct pathbuf *pb;
569 	struct nameidata nd;
570 
571 	error = pathbuf_copyin(SCARG(uap, path), &pb);
572 	if (error) {
573 		return error;
574 	}
575 
576 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb);
577 	if ((error = namei(&nd)) != 0) {
578 		pathbuf_destroy(pb);
579 		return error;
580 	}
581 	vp = nd.ni_vp;
582 	pathbuf_destroy(pb);
583 
584 	mp = vp->v_mount;
585 	atomic_inc_uint(&mp->mnt_refcnt);
586 	VOP_UNLOCK(vp);
587 
588 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
589 	    KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
590 	if (error) {
591 		vrele(vp);
592 		vfs_destroy(mp);
593 		return (error);
594 	}
595 
596 	/*
597 	 * Don't allow unmounting the root file system.
598 	 */
599 	if (mp->mnt_flag & MNT_ROOTFS) {
600 		vrele(vp);
601 		vfs_destroy(mp);
602 		return (EINVAL);
603 	}
604 
605 	/*
606 	 * Must be the root of the filesystem
607 	 */
608 	if ((vp->v_vflag & VV_ROOT) == 0) {
609 		vrele(vp);
610 		vfs_destroy(mp);
611 		return (EINVAL);
612 	}
613 
614 	vrele(vp);
615 	error = dounmount(mp, SCARG(uap, flags), l);
616 	vfs_destroy(mp);
617 	return error;
618 }
619 
620 /*
621  * Sync each mounted filesystem.
622  */
623 #ifdef DEBUG
624 int syncprt = 0;
625 struct ctldebug debug0 = { "syncprt", &syncprt };
626 #endif
627 
628 void
629 do_sys_sync(struct lwp *l)
630 {
631 	struct mount *mp, *nmp;
632 	int asyncflag;
633 
634 	mutex_enter(&mountlist_lock);
635 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
636 		if (vfs_busy(mp, &nmp)) {
637 			continue;
638 		}
639 		mutex_enter(&mp->mnt_updating);
640 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
641 			asyncflag = mp->mnt_flag & MNT_ASYNC;
642 			mp->mnt_flag &= ~MNT_ASYNC;
643 			VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
644 			if (asyncflag)
645 				 mp->mnt_flag |= MNT_ASYNC;
646 		}
647 		mutex_exit(&mp->mnt_updating);
648 		vfs_unbusy(mp, false, &nmp);
649 	}
650 	mutex_exit(&mountlist_lock);
651 #ifdef DEBUG
652 	if (syncprt)
653 		vfs_bufstats();
654 #endif /* DEBUG */
655 }
656 
657 /* ARGSUSED */
658 int
659 sys_sync(struct lwp *l, const void *v, register_t *retval)
660 {
661 	do_sys_sync(l);
662 	return (0);
663 }
664 
665 
666 /*
667  * Access or change filesystem quotas.
668  *
669  * (this is really 14 different calls bundled into one)
670  */
671 
672 static int
673 do_sys_quotactl_stat(struct mount *mp, struct quotastat *info_u)
674 {
675 	struct quotastat info_k;
676 	int error;
677 
678 	/* ensure any padding bytes are cleared */
679 	memset(&info_k, 0, sizeof(info_k));
680 
681 	error = vfs_quotactl_stat(mp, &info_k);
682 	if (error) {
683 		return error;
684 	}
685 
686 	return copyout(&info_k, info_u, sizeof(info_k));
687 }
688 
689 static int
690 do_sys_quotactl_idtypestat(struct mount *mp, int idtype,
691     struct quotaidtypestat *info_u)
692 {
693 	struct quotaidtypestat info_k;
694 	int error;
695 
696 	/* ensure any padding bytes are cleared */
697 	memset(&info_k, 0, sizeof(info_k));
698 
699 	error = vfs_quotactl_idtypestat(mp, idtype, &info_k);
700 	if (error) {
701 		return error;
702 	}
703 
704 	return copyout(&info_k, info_u, sizeof(info_k));
705 }
706 
707 static int
708 do_sys_quotactl_objtypestat(struct mount *mp, int objtype,
709     struct quotaobjtypestat *info_u)
710 {
711 	struct quotaobjtypestat info_k;
712 	int error;
713 
714 	/* ensure any padding bytes are cleared */
715 	memset(&info_k, 0, sizeof(info_k));
716 
717 	error = vfs_quotactl_objtypestat(mp, objtype, &info_k);
718 	if (error) {
719 		return error;
720 	}
721 
722 	return copyout(&info_k, info_u, sizeof(info_k));
723 }
724 
725 static int
726 do_sys_quotactl_get(struct mount *mp, const struct quotakey *key_u,
727     struct quotaval *val_u)
728 {
729 	struct quotakey key_k;
730 	struct quotaval val_k;
731 	int error;
732 
733 	/* ensure any padding bytes are cleared */
734 	memset(&val_k, 0, sizeof(val_k));
735 
736 	error = copyin(key_u, &key_k, sizeof(key_k));
737 	if (error) {
738 		return error;
739 	}
740 
741 	error = vfs_quotactl_get(mp, &key_k, &val_k);
742 	if (error) {
743 		return error;
744 	}
745 
746 	return copyout(&val_k, val_u, sizeof(val_k));
747 }
748 
749 static int
750 do_sys_quotactl_put(struct mount *mp, const struct quotakey *key_u,
751     const struct quotaval *val_u)
752 {
753 	struct quotakey key_k;
754 	struct quotaval val_k;
755 	int error;
756 
757 	error = copyin(key_u, &key_k, sizeof(key_k));
758 	if (error) {
759 		return error;
760 	}
761 
762 	error = copyin(val_u, &val_k, sizeof(val_k));
763 	if (error) {
764 		return error;
765 	}
766 
767 	return vfs_quotactl_put(mp, &key_k, &val_k);
768 }
769 
770 static int
771 do_sys_quotactl_del(struct mount *mp, const struct quotakey *key_u)
772 {
773 	struct quotakey key_k;
774 	int error;
775 
776 	error = copyin(key_u, &key_k, sizeof(key_k));
777 	if (error) {
778 		return error;
779 	}
780 
781 	return vfs_quotactl_del(mp, &key_k);
782 }
783 
784 static int
785 do_sys_quotactl_cursoropen(struct mount *mp, struct quotakcursor *cursor_u)
786 {
787 	struct quotakcursor cursor_k;
788 	int error;
789 
790 	/* ensure any padding bytes are cleared */
791 	memset(&cursor_k, 0, sizeof(cursor_k));
792 
793 	error = vfs_quotactl_cursoropen(mp, &cursor_k);
794 	if (error) {
795 		return error;
796 	}
797 
798 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
799 }
800 
801 static int
802 do_sys_quotactl_cursorclose(struct mount *mp, struct quotakcursor *cursor_u)
803 {
804 	struct quotakcursor cursor_k;
805 	int error;
806 
807 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
808 	if (error) {
809 		return error;
810 	}
811 
812 	return vfs_quotactl_cursorclose(mp, &cursor_k);
813 }
814 
815 static int
816 do_sys_quotactl_cursorskipidtype(struct mount *mp,
817     struct quotakcursor *cursor_u, int idtype)
818 {
819 	struct quotakcursor cursor_k;
820 	int error;
821 
822 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
823 	if (error) {
824 		return error;
825 	}
826 
827 	error = vfs_quotactl_cursorskipidtype(mp, &cursor_k, idtype);
828 	if (error) {
829 		return error;
830 	}
831 
832 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
833 }
834 
835 static int
836 do_sys_quotactl_cursorget(struct mount *mp, struct quotakcursor *cursor_u,
837     struct quotakey *keys_u, struct quotaval *vals_u, unsigned maxnum,
838     unsigned *ret_u)
839 {
840 #define CGET_STACK_MAX 8
841 	struct quotakcursor cursor_k;
842 	struct quotakey stackkeys[CGET_STACK_MAX];
843 	struct quotaval stackvals[CGET_STACK_MAX];
844 	struct quotakey *keys_k;
845 	struct quotaval *vals_k;
846 	unsigned ret_k;
847 	int error;
848 
849 	if (maxnum > 128) {
850 		maxnum = 128;
851 	}
852 
853 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
854 	if (error) {
855 		return error;
856 	}
857 
858 	if (maxnum <= CGET_STACK_MAX) {
859 		keys_k = stackkeys;
860 		vals_k = stackvals;
861 		/* ensure any padding bytes are cleared */
862 		memset(keys_k, 0, maxnum * sizeof(keys_k[0]));
863 		memset(vals_k, 0, maxnum * sizeof(vals_k[0]));
864 	} else {
865 		keys_k = kmem_zalloc(maxnum * sizeof(keys_k[0]), KM_SLEEP);
866 		vals_k = kmem_zalloc(maxnum * sizeof(vals_k[0]), KM_SLEEP);
867 	}
868 
869 	error = vfs_quotactl_cursorget(mp, &cursor_k, keys_k, vals_k, maxnum,
870 				       &ret_k);
871 	if (error) {
872 		goto fail;
873 	}
874 
875 	error = copyout(keys_k, keys_u, ret_k * sizeof(keys_k[0]));
876 	if (error) {
877 		goto fail;
878 	}
879 
880 	error = copyout(vals_k, vals_u, ret_k * sizeof(vals_k[0]));
881 	if (error) {
882 		goto fail;
883 	}
884 
885 	error = copyout(&ret_k, ret_u, sizeof(ret_k));
886 	if (error) {
887 		goto fail;
888 	}
889 
890 	/* do last to maximize the chance of being able to recover a failure */
891 	error = copyout(&cursor_k, cursor_u, sizeof(cursor_k));
892 
893 fail:
894 	if (keys_k != stackkeys) {
895 		kmem_free(keys_k, maxnum * sizeof(keys_k[0]));
896 	}
897 	if (vals_k != stackvals) {
898 		kmem_free(vals_k, maxnum * sizeof(vals_k[0]));
899 	}
900 	return error;
901 }
902 
903 static int
904 do_sys_quotactl_cursoratend(struct mount *mp, struct quotakcursor *cursor_u,
905     int *ret_u)
906 {
907 	struct quotakcursor cursor_k;
908 	int ret_k;
909 	int error;
910 
911 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
912 	if (error) {
913 		return error;
914 	}
915 
916 	error = vfs_quotactl_cursoratend(mp, &cursor_k, &ret_k);
917 	if (error) {
918 		return error;
919 	}
920 
921 	error = copyout(&ret_k, ret_u, sizeof(ret_k));
922 	if (error) {
923 		return error;
924 	}
925 
926 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
927 }
928 
929 static int
930 do_sys_quotactl_cursorrewind(struct mount *mp, struct quotakcursor *cursor_u)
931 {
932 	struct quotakcursor cursor_k;
933 	int error;
934 
935 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
936 	if (error) {
937 		return error;
938 	}
939 
940 	error = vfs_quotactl_cursorrewind(mp, &cursor_k);
941 	if (error) {
942 		return error;
943 	}
944 
945 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
946 }
947 
948 static int
949 do_sys_quotactl_quotaon(struct mount *mp, int idtype, const char *path_u)
950 {
951 	char *path_k;
952 	int error;
953 
954 	/* XXX this should probably be a struct pathbuf */
955 	path_k = PNBUF_GET();
956 	error = copyin(path_u, path_k, PATH_MAX);
957 	if (error) {
958 		PNBUF_PUT(path_k);
959 		return error;
960 	}
961 
962 	error = vfs_quotactl_quotaon(mp, idtype, path_k);
963 
964 	PNBUF_PUT(path_k);
965 	return error;
966 }
967 
/*
 * QUOTACTL_QUOTAOFF: no user data to transfer; forward straight to
 * vfs_quotactl_quotaoff() and return its result.
 */
static int
do_sys_quotactl_quotaoff(struct mount *mp, int idtype)
{
	const int rv = vfs_quotactl_quotaoff(mp, idtype);

	return rv;
}
973 
974 int
975 do_sys_quotactl(const char *path_u, const struct quotactl_args *args)
976 {
977 	struct mount *mp;
978 	struct vnode *vp;
979 	int error;
980 
981 	error = namei_simple_user(path_u, NSM_FOLLOW_TRYEMULROOT, &vp);
982 	if (error != 0)
983 		return (error);
984 	mp = vp->v_mount;
985 
986 	switch (args->qc_op) {
987 	    case QUOTACTL_STAT:
988 		error = do_sys_quotactl_stat(mp, args->u.stat.qc_info);
989 		break;
990 	    case QUOTACTL_IDTYPESTAT:
991 		error = do_sys_quotactl_idtypestat(mp,
992 				args->u.idtypestat.qc_idtype,
993 				args->u.idtypestat.qc_info);
994 		break;
995 	    case QUOTACTL_OBJTYPESTAT:
996 		error = do_sys_quotactl_objtypestat(mp,
997 				args->u.objtypestat.qc_objtype,
998 				args->u.objtypestat.qc_info);
999 		break;
1000 	    case QUOTACTL_GET:
1001 		error = do_sys_quotactl_get(mp,
1002 				args->u.get.qc_key,
1003 				args->u.get.qc_val);
1004 		break;
1005 	    case QUOTACTL_PUT:
1006 		error = do_sys_quotactl_put(mp,
1007 				args->u.put.qc_key,
1008 				args->u.put.qc_val);
1009 		break;
1010 	    case QUOTACTL_DEL:
1011 		error = do_sys_quotactl_del(mp, args->u.del.qc_key);
1012 		break;
1013 	    case QUOTACTL_CURSOROPEN:
1014 		error = do_sys_quotactl_cursoropen(mp,
1015 				args->u.cursoropen.qc_cursor);
1016 		break;
1017 	    case QUOTACTL_CURSORCLOSE:
1018 		error = do_sys_quotactl_cursorclose(mp,
1019 				args->u.cursorclose.qc_cursor);
1020 		break;
1021 	    case QUOTACTL_CURSORSKIPIDTYPE:
1022 		error = do_sys_quotactl_cursorskipidtype(mp,
1023 				args->u.cursorskipidtype.qc_cursor,
1024 				args->u.cursorskipidtype.qc_idtype);
1025 		break;
1026 	    case QUOTACTL_CURSORGET:
1027 		error = do_sys_quotactl_cursorget(mp,
1028 				args->u.cursorget.qc_cursor,
1029 				args->u.cursorget.qc_keys,
1030 				args->u.cursorget.qc_vals,
1031 				args->u.cursorget.qc_maxnum,
1032 				args->u.cursorget.qc_ret);
1033 		break;
1034 	    case QUOTACTL_CURSORATEND:
1035 		error = do_sys_quotactl_cursoratend(mp,
1036 				args->u.cursoratend.qc_cursor,
1037 				args->u.cursoratend.qc_ret);
1038 		break;
1039 	    case QUOTACTL_CURSORREWIND:
1040 		error = do_sys_quotactl_cursorrewind(mp,
1041 				args->u.cursorrewind.qc_cursor);
1042 		break;
1043 	    case QUOTACTL_QUOTAON:
1044 		error = do_sys_quotactl_quotaon(mp,
1045 				args->u.quotaon.qc_idtype,
1046 				args->u.quotaon.qc_quotafile);
1047 		break;
1048 	    case QUOTACTL_QUOTAOFF:
1049 		error = do_sys_quotactl_quotaoff(mp,
1050 				args->u.quotaoff.qc_idtype);
1051 		break;
1052 	    default:
1053 		error = EINVAL;
1054 		break;
1055 	}
1056 
1057 	vrele(vp);
1058 	return error;
1059 }
1060 
1061 /* ARGSUSED */
1062 int
1063 sys___quotactl(struct lwp *l, const struct sys___quotactl_args *uap,
1064     register_t *retval)
1065 {
1066 	/* {
1067 		syscallarg(const char *) path;
1068 		syscallarg(struct quotactl_args *) args;
1069 	} */
1070 	struct quotactl_args args;
1071 	int error;
1072 
1073 	error = copyin(SCARG(uap, args), &args, sizeof(args));
1074 	if (error) {
1075 		return error;
1076 	}
1077 
1078 	return do_sys_quotactl(SCARG(uap, path), &args);
1079 }
1080 
1081 int
1082 dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
1083     int root)
1084 {
1085 	struct cwdinfo *cwdi = l->l_proc->p_cwdi;
1086 	int error = 0;
1087 
1088 	/*
1089 	 * If MNT_NOWAIT or MNT_LAZY is specified, do not
1090 	 * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
1091 	 * overrides MNT_NOWAIT.
1092 	 */
1093 	if (flags == MNT_NOWAIT	|| flags == MNT_LAZY ||
1094 	    (flags != MNT_WAIT && flags != 0)) {
1095 		memcpy(sp, &mp->mnt_stat, sizeof(*sp));
1096 		goto done;
1097 	}
1098 
1099 	/* Get the filesystem stats now */
1100 	memset(sp, 0, sizeof(*sp));
1101 	if ((error = VFS_STATVFS(mp, sp)) != 0) {
1102 		return error;
1103 	}
1104 
1105 	if (cwdi->cwdi_rdir == NULL)
1106 		(void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
1107 done:
1108 	if (cwdi->cwdi_rdir != NULL) {
1109 		size_t len;
1110 		char *bp;
1111 		char c;
1112 		char *path = PNBUF_GET();
1113 
1114 		bp = path + MAXPATHLEN;
1115 		*--bp = '\0';
1116 		rw_enter(&cwdi->cwdi_lock, RW_READER);
1117 		error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
1118 		    MAXPATHLEN / 2, 0, l);
1119 		rw_exit(&cwdi->cwdi_lock);
1120 		if (error) {
1121 			PNBUF_PUT(path);
1122 			return error;
1123 		}
1124 		len = strlen(bp);
1125 		if (len != 1) {
1126 			/*
1127 			 * for mount points that are below our root, we can see
1128 			 * them, so we fix up the pathname and return them. The
1129 			 * rest we cannot see, so we don't allow viewing the
1130 			 * data.
1131 			 */
1132 			if (strncmp(bp, sp->f_mntonname, len) == 0 &&
1133 			    ((c = sp->f_mntonname[len]) == '/' || c == '\0')) {
1134 				(void)strlcpy(sp->f_mntonname,
1135 				    c == '\0' ? "/" : &sp->f_mntonname[len],
1136 				    sizeof(sp->f_mntonname));
1137 			} else {
1138 				if (root)
1139 					(void)strlcpy(sp->f_mntonname, "/",
1140 					    sizeof(sp->f_mntonname));
1141 				else
1142 					error = EPERM;
1143 			}
1144 		}
1145 		PNBUF_PUT(path);
1146 	}
1147 	sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
1148 	return error;
1149 }
1150 
1151 /*
1152  * Get filesystem statistics by path.
1153  */
1154 int
1155 do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
1156 {
1157 	struct mount *mp;
1158 	int error;
1159 	struct vnode *vp;
1160 
1161 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
1162 	if (error != 0)
1163 		return error;
1164 	mp = vp->v_mount;
1165 	error = dostatvfs(mp, sb, l, flags, 1);
1166 	vrele(vp);
1167 	return error;
1168 }
1169 
1170 /* ARGSUSED */
1171 int
1172 sys_statvfs1(struct lwp *l, const struct sys_statvfs1_args *uap, register_t *retval)
1173 {
1174 	/* {
1175 		syscallarg(const char *) path;
1176 		syscallarg(struct statvfs *) buf;
1177 		syscallarg(int) flags;
1178 	} */
1179 	struct statvfs *sb;
1180 	int error;
1181 
1182 	sb = STATVFSBUF_GET();
1183 	error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
1184 	if (error == 0)
1185 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1186 	STATVFSBUF_PUT(sb);
1187 	return error;
1188 }
1189 
1190 /*
1191  * Get filesystem statistics by fd.
1192  */
1193 int
1194 do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
1195 {
1196 	file_t *fp;
1197 	struct mount *mp;
1198 	int error;
1199 
1200 	/* fd_getvnode() will use the descriptor for us */
1201 	if ((error = fd_getvnode(fd, &fp)) != 0)
1202 		return (error);
1203 	mp = fp->f_vnode->v_mount;
1204 	error = dostatvfs(mp, sb, curlwp, flags, 1);
1205 	fd_putfile(fd);
1206 	return error;
1207 }
1208 
1209 /* ARGSUSED */
1210 int
1211 sys_fstatvfs1(struct lwp *l, const struct sys_fstatvfs1_args *uap, register_t *retval)
1212 {
1213 	/* {
1214 		syscallarg(int) fd;
1215 		syscallarg(struct statvfs *) buf;
1216 		syscallarg(int) flags;
1217 	} */
1218 	struct statvfs *sb;
1219 	int error;
1220 
1221 	sb = STATVFSBUF_GET();
1222 	error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
1223 	if (error == 0)
1224 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1225 	STATVFSBUF_PUT(sb);
1226 	return error;
1227 }
1228 
1229 
1230 /*
1231  * Get statistics on all filesystems.
1232  */
1233 int
1234 do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
1235     int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
1236     register_t *retval)
1237 {
1238 	int root = 0;
1239 	struct proc *p = l->l_proc;
1240 	struct mount *mp, *nmp;
1241 	struct statvfs *sb;
1242 	size_t count, maxcount;
1243 	int error = 0;
1244 
1245 	sb = STATVFSBUF_GET();
1246 	maxcount = bufsize / entry_sz;
1247 	mutex_enter(&mountlist_lock);
1248 	count = 0;
1249 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
1250 		if (vfs_busy(mp, &nmp)) {
1251 			continue;
1252 		}
1253 		if (sfsp && count < maxcount) {
1254 			error = dostatvfs(mp, sb, l, flags, 0);
1255 			if (error) {
1256 				vfs_unbusy(mp, false, &nmp);
1257 				error = 0;
1258 				continue;
1259 			}
1260 			error = copyfn(sb, sfsp, entry_sz);
1261 			if (error) {
1262 				vfs_unbusy(mp, false, NULL);
1263 				goto out;
1264 			}
1265 			sfsp = (char *)sfsp + entry_sz;
1266 			root |= strcmp(sb->f_mntonname, "/") == 0;
1267 		}
1268 		count++;
1269 		vfs_unbusy(mp, false, &nmp);
1270 	}
1271 	mutex_exit(&mountlist_lock);
1272 
1273 	if (root == 0 && p->p_cwdi->cwdi_rdir) {
1274 		/*
1275 		 * fake a root entry
1276 		 */
1277 		error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
1278 		    sb, l, flags, 1);
1279 		if (error != 0)
1280 			goto out;
1281 		if (sfsp) {
1282 			error = copyfn(sb, sfsp, entry_sz);
1283 			if (error != 0)
1284 				goto out;
1285 		}
1286 		count++;
1287 	}
1288 	if (sfsp && count > maxcount)
1289 		*retval = maxcount;
1290 	else
1291 		*retval = count;
1292 out:
1293 	STATVFSBUF_PUT(sb);
1294 	return error;
1295 }
1296 
1297 int
1298 sys_getvfsstat(struct lwp *l, const struct sys_getvfsstat_args *uap, register_t *retval)
1299 {
1300 	/* {
1301 		syscallarg(struct statvfs *) buf;
1302 		syscallarg(size_t) bufsize;
1303 		syscallarg(int) flags;
1304 	} */
1305 
1306 	return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
1307 	    SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
1308 }
1309 
1310 /*
1311  * Change current working directory to a given file descriptor.
1312  */
1313 /* ARGSUSED */
1314 int
1315 sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval)
1316 {
1317 	/* {
1318 		syscallarg(int) fd;
1319 	} */
1320 	struct proc *p = l->l_proc;
1321 	struct cwdinfo *cwdi;
1322 	struct vnode *vp, *tdp;
1323 	struct mount *mp;
1324 	file_t *fp;
1325 	int error, fd;
1326 
1327 	/* fd_getvnode() will use the descriptor for us */
1328 	fd = SCARG(uap, fd);
1329 	if ((error = fd_getvnode(fd, &fp)) != 0)
1330 		return (error);
1331 	vp = fp->f_vnode;
1332 
1333 	vref(vp);
1334 	vn_lock(vp,  LK_EXCLUSIVE | LK_RETRY);
1335 	if (vp->v_type != VDIR)
1336 		error = ENOTDIR;
1337 	else
1338 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1339 	if (error) {
1340 		vput(vp);
1341 		goto out;
1342 	}
1343 	while ((mp = vp->v_mountedhere) != NULL) {
1344 		error = vfs_busy(mp, NULL);
1345 		vput(vp);
1346 		if (error != 0)
1347 			goto out;
1348 		error = VFS_ROOT(mp, &tdp);
1349 		vfs_unbusy(mp, false, NULL);
1350 		if (error)
1351 			goto out;
1352 		vp = tdp;
1353 	}
1354 	VOP_UNLOCK(vp);
1355 
1356 	/*
1357 	 * Disallow changing to a directory not under the process's
1358 	 * current root directory (if there is one).
1359 	 */
1360 	cwdi = p->p_cwdi;
1361 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1362 	if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
1363 		vrele(vp);
1364 		error = EPERM;	/* operation not permitted */
1365 	} else {
1366 		vrele(cwdi->cwdi_cdir);
1367 		cwdi->cwdi_cdir = vp;
1368 	}
1369 	rw_exit(&cwdi->cwdi_lock);
1370 
1371  out:
1372 	fd_putfile(fd);
1373 	return (error);
1374 }
1375 
1376 /*
1377  * Change this process's notion of the root directory to a given file
1378  * descriptor.
1379  */
1380 int
1381 sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval)
1382 {
1383 	struct proc *p = l->l_proc;
1384 	struct vnode	*vp;
1385 	file_t	*fp;
1386 	int		 error, fd = SCARG(uap, fd);
1387 
1388 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1389  	    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
1390 		return error;
1391 	/* fd_getvnode() will use the descriptor for us */
1392 	if ((error = fd_getvnode(fd, &fp)) != 0)
1393 		return error;
1394 	vp = fp->f_vnode;
1395 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1396 	if (vp->v_type != VDIR)
1397 		error = ENOTDIR;
1398 	else
1399 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1400 	VOP_UNLOCK(vp);
1401 	if (error)
1402 		goto out;
1403 	vref(vp);
1404 
1405 	change_root(p->p_cwdi, vp, l);
1406 
1407  out:
1408 	fd_putfile(fd);
1409 	return (error);
1410 }
1411 
1412 /*
1413  * Change current working directory (``.'').
1414  */
1415 /* ARGSUSED */
1416 int
1417 sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
1418 {
1419 	/* {
1420 		syscallarg(const char *) path;
1421 	} */
1422 	struct proc *p = l->l_proc;
1423 	struct cwdinfo *cwdi;
1424 	int error;
1425 	struct vnode *vp;
1426 
1427 	if ((error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE,
1428 				  &vp, l)) != 0)
1429 		return (error);
1430 	cwdi = p->p_cwdi;
1431 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1432 	vrele(cwdi->cwdi_cdir);
1433 	cwdi->cwdi_cdir = vp;
1434 	rw_exit(&cwdi->cwdi_lock);
1435 	return (0);
1436 }
1437 
1438 /*
1439  * Change notion of root (``/'') directory.
1440  */
1441 /* ARGSUSED */
1442 int
1443 sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval)
1444 {
1445 	/* {
1446 		syscallarg(const char *) path;
1447 	} */
1448 	struct proc *p = l->l_proc;
1449 	int error;
1450 	struct vnode *vp;
1451 
1452 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1453 	    KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
1454 		return (error);
1455 	if ((error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE,
1456 				  &vp, l)) != 0)
1457 		return (error);
1458 
1459 	change_root(p->p_cwdi, vp, l);
1460 
1461 	return (0);
1462 }
1463 
1464 /*
1465  * Common routine for chroot and fchroot.
1466  * NB: callers need to properly authorize the change root operation.
1467  */
1468 void
1469 change_root(struct cwdinfo *cwdi, struct vnode *vp, struct lwp *l)
1470 {
1471 	struct proc *p = l->l_proc;
1472 	kauth_cred_t ncred;
1473 
1474 	ncred = kauth_cred_alloc();
1475 
1476 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1477 	if (cwdi->cwdi_rdir != NULL)
1478 		vrele(cwdi->cwdi_rdir);
1479 	cwdi->cwdi_rdir = vp;
1480 
1481 	/*
1482 	 * Prevent escaping from chroot by putting the root under
1483 	 * the working directory.  Silently chdir to / if we aren't
1484 	 * already there.
1485 	 */
1486 	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1487 		/*
1488 		 * XXX would be more failsafe to change directory to a
1489 		 * deadfs node here instead
1490 		 */
1491 		vrele(cwdi->cwdi_cdir);
1492 		vref(vp);
1493 		cwdi->cwdi_cdir = vp;
1494 	}
1495 	rw_exit(&cwdi->cwdi_lock);
1496 
1497 	/* Get a write lock on the process credential. */
1498 	proc_crmod_enter();
1499 
1500 	kauth_cred_clone(p->p_cred, ncred);
1501 	kauth_proc_chroot(ncred, p->p_cwdi);
1502 
1503 	/* Broadcast our credentials to the process and other LWPs. */
1504  	proc_crmod_leave(ncred, p->p_cred, true);
1505 }
1506 
1507 /*
1508  * Common routine for chroot and chdir.
1509  * XXX "where" should be enum uio_seg
1510  */
1511 int
1512 chdir_lookup(const char *path, int where, struct vnode **vpp, struct lwp *l)
1513 {
1514 	struct pathbuf *pb;
1515 	struct nameidata nd;
1516 	int error;
1517 
1518 	error = pathbuf_maybe_copyin(path, where, &pb);
1519 	if (error) {
1520 		return error;
1521 	}
1522 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
1523 	if ((error = namei(&nd)) != 0) {
1524 		pathbuf_destroy(pb);
1525 		return error;
1526 	}
1527 	*vpp = nd.ni_vp;
1528 	pathbuf_destroy(pb);
1529 
1530 	if ((*vpp)->v_type != VDIR)
1531 		error = ENOTDIR;
1532 	else
1533 		error = VOP_ACCESS(*vpp, VEXEC, l->l_cred);
1534 
1535 	if (error)
1536 		vput(*vpp);
1537 	else
1538 		VOP_UNLOCK(*vpp);
1539 	return (error);
1540 }
1541 
1542 /*
1543  * Internals of sys_open - path has already been converted into a pathbuf
1544  * (so we can easily reuse this function from other parts of the kernel,
1545  * like posix_spawn post-processing).
1546  */
1547 int
1548 do_open(lwp_t *l, struct vnode *dvp, struct pathbuf *pb, int open_flags,
1549 	int open_mode, int *fd)
1550 {
1551 	struct proc *p = l->l_proc;
1552 	struct cwdinfo *cwdi = p->p_cwdi;
1553 	file_t *fp;
1554 	struct vnode *vp;
1555 	int flags, cmode;
1556 	int indx, error;
1557 	struct nameidata nd;
1558 
1559 	if (open_flags & O_SEARCH) {
1560 		open_flags &= ~(int)O_SEARCH;
1561 	}
1562 
1563 	flags = FFLAGS(open_flags);
1564 	if ((flags & (FREAD | FWRITE)) == 0)
1565 		return EINVAL;
1566 
1567 	if ((error = fd_allocfile(&fp, &indx)) != 0) {
1568 		return error;
1569 	}
1570 
1571 	/* We're going to read cwdi->cwdi_cmask unlocked here. */
1572 	cmode = ((open_mode &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1573 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, pb);
1574 	if (dvp != NULL)
1575 		NDAT(&nd, dvp);
1576 
1577 	l->l_dupfd = -indx - 1;			/* XXX check for fdopen */
1578 	if ((error = vn_open(&nd, flags, cmode)) != 0) {
1579 		fd_abort(p, fp, indx);
1580 		if ((error == EDUPFD || error == EMOVEFD) &&
1581 		    l->l_dupfd >= 0 &&			/* XXX from fdopen */
1582 		    (error =
1583 			fd_dupopen(l->l_dupfd, &indx, flags, error)) == 0) {
1584 			*fd = indx;
1585 			return 0;
1586 		}
1587 		if (error == ERESTART)
1588 			error = EINTR;
1589 		return error;
1590 	}
1591 
1592 	l->l_dupfd = 0;
1593 	vp = nd.ni_vp;
1594 
1595 	if ((error = open_setfp(l, fp, vp, indx, flags)))
1596 		return error;
1597 
1598 	VOP_UNLOCK(vp);
1599 	*fd = indx;
1600 	fd_affix(p, fp, indx);
1601 	return 0;
1602 }
1603 
1604 int
1605 fd_open(const char *path, int open_flags, int open_mode, int *fd)
1606 {
1607 	struct pathbuf *pb;
1608 	int error, oflags;
1609 
1610 	oflags = FFLAGS(open_flags);
1611 	if ((oflags & (FREAD | FWRITE)) == 0)
1612 		return EINVAL;
1613 
1614 	pb = pathbuf_create(path);
1615 	if (pb == NULL)
1616 		return ENOMEM;
1617 
1618 	error = do_open(curlwp, NULL, pb, open_flags, open_mode, fd);
1619 	pathbuf_destroy(pb);
1620 
1621 	return error;
1622 }
1623 
1624 /*
1625  * Check permissions, allocate an open file structure,
1626  * and call the device open routine if any.
1627  */
1628 static int
1629 do_sys_openat(lwp_t *l, int fdat, const char *path, int flags,
1630     int mode, int *fd)
1631 {
1632 	file_t *dfp = NULL;
1633 	struct vnode *dvp = NULL;
1634 	struct pathbuf *pb;
1635 	int error;
1636 
1637 #ifdef COMPAT_10	/* XXX: and perhaps later */
1638 	if (path == NULL) {
1639 		pb = pathbuf_create(".");
1640 		if (pb == NULL)
1641 			return ENOMEM;
1642 	} else
1643 #endif
1644 	{
1645 		error = pathbuf_copyin(path, &pb);
1646 		if (error)
1647 			return error;
1648 	}
1649 
1650 	if (fdat != AT_FDCWD) {
1651 		/* fd_getvnode() will use the descriptor for us */
1652 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
1653 			goto out;
1654 
1655 		dvp = dfp->f_vnode;
1656 	}
1657 
1658 	error = do_open(l, dvp, pb, flags, mode, fd);
1659 
1660 	if (dfp != NULL)
1661 		fd_putfile(fdat);
1662 out:
1663 	pathbuf_destroy(pb);
1664 	return error;
1665 }
1666 
1667 int
1668 sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1669 {
1670 	/* {
1671 		syscallarg(const char *) path;
1672 		syscallarg(int) flags;
1673 		syscallarg(int) mode;
1674 	} */
1675 	int error;
1676 	int fd;
1677 
1678 	error = do_sys_openat(l, AT_FDCWD, SCARG(uap, path),
1679 			      SCARG(uap, flags), SCARG(uap, mode), &fd);
1680 
1681 	if (error == 0)
1682 		*retval = fd;
1683 
1684 	return error;
1685 }
1686 
1687 int
1688 sys_openat(struct lwp *l, const struct sys_openat_args *uap, register_t *retval)
1689 {
1690 	/* {
1691 		syscallarg(int) fd;
1692 		syscallarg(const char *) path;
1693 		syscallarg(int) oflags;
1694 		syscallarg(int) mode;
1695 	} */
1696 	int error;
1697 	int fd;
1698 
1699 	error = do_sys_openat(l, SCARG(uap, fd), SCARG(uap, path),
1700 			      SCARG(uap, oflags), SCARG(uap, mode), &fd);
1701 
1702 	if (error == 0)
1703 		*retval = fd;
1704 
1705 	return error;
1706 }
1707 
1708 static void
1709 vfs__fhfree(fhandle_t *fhp)
1710 {
1711 	size_t fhsize;
1712 
1713 	fhsize = FHANDLE_SIZE(fhp);
1714 	kmem_free(fhp, fhsize);
1715 }
1716 
1717 /*
1718  * vfs_composefh: compose a filehandle.
1719  */
1720 
1721 int
1722 vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1723 {
1724 	struct mount *mp;
1725 	struct fid *fidp;
1726 	int error;
1727 	size_t needfhsize;
1728 	size_t fidsize;
1729 
1730 	mp = vp->v_mount;
1731 	fidp = NULL;
1732 	if (*fh_size < FHANDLE_SIZE_MIN) {
1733 		fidsize = 0;
1734 	} else {
1735 		fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1736 		if (fhp != NULL) {
1737 			memset(fhp, 0, *fh_size);
1738 			fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1739 			fidp = &fhp->fh_fid;
1740 		}
1741 	}
1742 	error = VFS_VPTOFH(vp, fidp, &fidsize);
1743 	needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1744 	if (error == 0 && *fh_size < needfhsize) {
1745 		error = E2BIG;
1746 	}
1747 	*fh_size = needfhsize;
1748 	return error;
1749 }
1750 
1751 int
1752 vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1753 {
1754 	struct mount *mp;
1755 	fhandle_t *fhp;
1756 	size_t fhsize;
1757 	size_t fidsize;
1758 	int error;
1759 
1760 	mp = vp->v_mount;
1761 	fidsize = 0;
1762 	error = VFS_VPTOFH(vp, NULL, &fidsize);
1763 	KASSERT(error != 0);
1764 	if (error != E2BIG) {
1765 		goto out;
1766 	}
1767 	fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1768 	fhp = kmem_zalloc(fhsize, KM_SLEEP);
1769 	if (fhp == NULL) {
1770 		error = ENOMEM;
1771 		goto out;
1772 	}
1773 	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1774 	error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1775 	if (error == 0) {
1776 		KASSERT((FHANDLE_SIZE(fhp) == fhsize &&
1777 		    FHANDLE_FILEID(fhp)->fid_len == fidsize));
1778 		*fhpp = fhp;
1779 	} else {
1780 		kmem_free(fhp, fhsize);
1781 	}
1782 out:
1783 	return error;
1784 }
1785 
1786 void
1787 vfs_composefh_free(fhandle_t *fhp)
1788 {
1789 
1790 	vfs__fhfree(fhp);
1791 }
1792 
1793 /*
1794  * vfs_fhtovp: lookup a vnode by a filehandle.
1795  */
1796 
1797 int
1798 vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
1799 {
1800 	struct mount *mp;
1801 	int error;
1802 
1803 	*vpp = NULL;
1804 	mp = vfs_getvfs(FHANDLE_FSID(fhp));
1805 	if (mp == NULL) {
1806 		error = ESTALE;
1807 		goto out;
1808 	}
1809 	if (mp->mnt_op->vfs_fhtovp == NULL) {
1810 		error = EOPNOTSUPP;
1811 		goto out;
1812 	}
1813 	error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), vpp);
1814 out:
1815 	return error;
1816 }
1817 
1818 /*
1819  * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
1820  * the needed size.
1821  */
1822 
1823 int
1824 vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
1825 {
1826 	fhandle_t *fhp;
1827 	int error;
1828 
1829 	if (fhsize > FHANDLE_SIZE_MAX) {
1830 		return EINVAL;
1831 	}
1832 	if (fhsize < FHANDLE_SIZE_MIN) {
1833 		return EINVAL;
1834 	}
1835 again:
1836 	fhp = kmem_alloc(fhsize, KM_SLEEP);
1837 	if (fhp == NULL) {
1838 		return ENOMEM;
1839 	}
1840 	error = copyin(ufhp, fhp, fhsize);
1841 	if (error == 0) {
1842 		/* XXX this check shouldn't be here */
1843 		if (FHANDLE_SIZE(fhp) == fhsize) {
1844 			*fhpp = fhp;
1845 			return 0;
1846 		} else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
1847 			/*
1848 			 * a kludge for nfsv2 padded handles.
1849 			 */
1850 			size_t sz;
1851 
1852 			sz = FHANDLE_SIZE(fhp);
1853 			kmem_free(fhp, fhsize);
1854 			fhsize = sz;
1855 			goto again;
1856 		} else {
1857 			/*
1858 			 * userland told us wrong size.
1859 			 */
1860 		    	error = EINVAL;
1861 		}
1862 	}
1863 	kmem_free(fhp, fhsize);
1864 	return error;
1865 }
1866 
1867 void
1868 vfs_copyinfh_free(fhandle_t *fhp)
1869 {
1870 
1871 	vfs__fhfree(fhp);
1872 }
1873 
1874 /*
1875  * Get file handle system call
1876  */
1877 int
1878 sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval)
1879 {
1880 	/* {
1881 		syscallarg(char *) fname;
1882 		syscallarg(fhandle_t *) fhp;
1883 		syscallarg(size_t *) fh_size;
1884 	} */
1885 	struct vnode *vp;
1886 	fhandle_t *fh;
1887 	int error;
1888 	struct pathbuf *pb;
1889 	struct nameidata nd;
1890 	size_t sz;
1891 	size_t usz;
1892 
1893 	/*
1894 	 * Must be super user
1895 	 */
1896 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1897 	    0, NULL, NULL, NULL);
1898 	if (error)
1899 		return (error);
1900 
1901 	error = pathbuf_copyin(SCARG(uap, fname), &pb);
1902 	if (error) {
1903 		return error;
1904 	}
1905 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
1906 	error = namei(&nd);
1907 	if (error) {
1908 		pathbuf_destroy(pb);
1909 		return error;
1910 	}
1911 	vp = nd.ni_vp;
1912 	pathbuf_destroy(pb);
1913 
1914 	error = vfs_composefh_alloc(vp, &fh);
1915 	vput(vp);
1916 	if (error != 0) {
1917 		return error;
1918 	}
1919 	error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
1920 	if (error != 0) {
1921 		goto out;
1922 	}
1923 	sz = FHANDLE_SIZE(fh);
1924 	error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
1925 	if (error != 0) {
1926 		goto out;
1927 	}
1928 	if (usz >= sz) {
1929 		error = copyout(fh, SCARG(uap, fhp), sz);
1930 	} else {
1931 		error = E2BIG;
1932 	}
1933 out:
1934 	vfs_composefh_free(fh);
1935 	return (error);
1936 }
1937 
1938 /*
1939  * Open a file given a file handle.
1940  *
1941  * Check permissions, allocate an open file structure,
1942  * and call the device open routine if any.
1943  */
1944 
1945 int
1946 dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
1947     register_t *retval)
1948 {
1949 	file_t *fp;
1950 	struct vnode *vp = NULL;
1951 	kauth_cred_t cred = l->l_cred;
1952 	file_t *nfp;
1953 	int indx, error;
1954 	struct vattr va;
1955 	fhandle_t *fh;
1956 	int flags;
1957 	proc_t *p;
1958 
1959 	p = curproc;
1960 
1961 	/*
1962 	 * Must be super user
1963 	 */
1964 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1965 	    0, NULL, NULL, NULL)))
1966 		return (error);
1967 
1968 	if (oflags & O_SEARCH) {
1969 		oflags &= ~(int)O_SEARCH;
1970 	}
1971 
1972 	flags = FFLAGS(oflags);
1973 	if ((flags & (FREAD | FWRITE)) == 0)
1974 		return (EINVAL);
1975 	if ((flags & O_CREAT))
1976 		return (EINVAL);
1977 	if ((error = fd_allocfile(&nfp, &indx)) != 0)
1978 		return (error);
1979 	fp = nfp;
1980 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1981 	if (error != 0) {
1982 		goto bad;
1983 	}
1984 	error = vfs_fhtovp(fh, &vp);
1985 	vfs_copyinfh_free(fh);
1986 	if (error != 0) {
1987 		goto bad;
1988 	}
1989 
1990 	/* Now do an effective vn_open */
1991 
1992 	if (vp->v_type == VSOCK) {
1993 		error = EOPNOTSUPP;
1994 		goto bad;
1995 	}
1996 	error = vn_openchk(vp, cred, flags);
1997 	if (error != 0)
1998 		goto bad;
1999 	if (flags & O_TRUNC) {
2000 		VOP_UNLOCK(vp);			/* XXX */
2001 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
2002 		vattr_null(&va);
2003 		va.va_size = 0;
2004 		error = VOP_SETATTR(vp, &va, cred);
2005 		if (error)
2006 			goto bad;
2007 	}
2008 	if ((error = VOP_OPEN(vp, flags, cred)) != 0)
2009 		goto bad;
2010 	if (flags & FWRITE) {
2011 		mutex_enter(vp->v_interlock);
2012 		vp->v_writecount++;
2013 		mutex_exit(vp->v_interlock);
2014 	}
2015 
2016 	/* done with modified vn_open, now finish what sys_open does. */
2017 	if ((error = open_setfp(l, fp, vp, indx, flags)))
2018 		return error;
2019 
2020 	VOP_UNLOCK(vp);
2021 	*retval = indx;
2022 	fd_affix(p, fp, indx);
2023 	return (0);
2024 
2025 bad:
2026 	fd_abort(p, fp, indx);
2027 	if (vp != NULL)
2028 		vput(vp);
2029 	return (error);
2030 }
2031 
2032 int
2033 sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval)
2034 {
2035 	/* {
2036 		syscallarg(const void *) fhp;
2037 		syscallarg(size_t) fh_size;
2038 		syscallarg(int) flags;
2039 	} */
2040 
2041 	return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
2042 	    SCARG(uap, flags), retval);
2043 }
2044 
2045 int
2046 do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
2047 {
2048 	int error;
2049 	fhandle_t *fh;
2050 	struct vnode *vp;
2051 
2052 	/*
2053 	 * Must be super user
2054 	 */
2055 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2056 	    0, NULL, NULL, NULL)))
2057 		return (error);
2058 
2059 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2060 	if (error != 0)
2061 		return error;
2062 
2063 	error = vfs_fhtovp(fh, &vp);
2064 	vfs_copyinfh_free(fh);
2065 	if (error != 0)
2066 		return error;
2067 
2068 	error = vn_stat(vp, sb);
2069 	vput(vp);
2070 	return error;
2071 }
2072 
2073 
2074 /* ARGSUSED */
2075 int
2076 sys___fhstat50(struct lwp *l, const struct sys___fhstat50_args *uap, register_t *retval)
2077 {
2078 	/* {
2079 		syscallarg(const void *) fhp;
2080 		syscallarg(size_t) fh_size;
2081 		syscallarg(struct stat *) sb;
2082 	} */
2083 	struct stat sb;
2084 	int error;
2085 
2086 	error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
2087 	if (error)
2088 		return error;
2089 	return copyout(&sb, SCARG(uap, sb), sizeof(sb));
2090 }
2091 
2092 int
2093 do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb,
2094     int flags)
2095 {
2096 	fhandle_t *fh;
2097 	struct mount *mp;
2098 	struct vnode *vp;
2099 	int error;
2100 
2101 	/*
2102 	 * Must be super user
2103 	 */
2104 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2105 	    0, NULL, NULL, NULL)))
2106 		return error;
2107 
2108 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2109 	if (error != 0)
2110 		return error;
2111 
2112 	error = vfs_fhtovp(fh, &vp);
2113 	vfs_copyinfh_free(fh);
2114 	if (error != 0)
2115 		return error;
2116 
2117 	mp = vp->v_mount;
2118 	error = dostatvfs(mp, sb, l, flags, 1);
2119 	vput(vp);
2120 	return error;
2121 }
2122 
2123 /* ARGSUSED */
2124 int
2125 sys___fhstatvfs140(struct lwp *l, const struct sys___fhstatvfs140_args *uap, register_t *retval)
2126 {
2127 	/* {
2128 		syscallarg(const void *) fhp;
2129 		syscallarg(size_t) fh_size;
2130 		syscallarg(struct statvfs *) buf;
2131 		syscallarg(int)	flags;
2132 	} */
2133 	struct statvfs *sb = STATVFSBUF_GET();
2134 	int error;
2135 
2136 	error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
2137 	    SCARG(uap, flags));
2138 	if (error == 0)
2139 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
2140 	STATVFSBUF_PUT(sb);
2141 	return error;
2142 }
2143 
2144 /*
2145  * Create a special file.
2146  */
2147 /* ARGSUSED */
2148 int
2149 sys___mknod50(struct lwp *l, const struct sys___mknod50_args *uap,
2150     register_t *retval)
2151 {
2152 	/* {
2153 		syscallarg(const char *) path;
2154 		syscallarg(mode_t) mode;
2155 		syscallarg(dev_t) dev;
2156 	} */
2157 	return do_sys_mknodat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, mode),
2158 	    SCARG(uap, dev), retval, UIO_USERSPACE);
2159 }
2160 
2161 int
2162 sys_mknodat(struct lwp *l, const struct sys_mknodat_args *uap,
2163     register_t *retval)
2164 {
2165 	/* {
2166 		syscallarg(int) fd;
2167 		syscallarg(const char *) path;
2168 		syscallarg(mode_t) mode;
2169 		syscallarg(int) pad;
2170 		syscallarg(dev_t) dev;
2171 	} */
2172 
2173 	return do_sys_mknodat(l, SCARG(uap, fd), SCARG(uap, path),
2174 	    SCARG(uap, mode), SCARG(uap, dev), retval, UIO_USERSPACE);
2175 }
2176 
2177 int
2178 do_sys_mknod(struct lwp *l, const char *pathname, mode_t mode, dev_t dev,
2179     register_t *retval, enum uio_seg seg)
2180 {
2181 	return do_sys_mknodat(l, AT_FDCWD, pathname, mode, dev, retval, seg);
2182 }
2183 
2184 int
2185 do_sys_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
2186     dev_t dev, register_t *retval, enum uio_seg seg)
2187 {
2188 	struct proc *p = l->l_proc;
2189 	struct vnode *vp;
2190 	struct vattr vattr;
2191 	int error, optype;
2192 	struct pathbuf *pb;
2193 	struct nameidata nd;
2194 	const char *pathstring;
2195 
2196 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
2197 	    0, NULL, NULL, NULL)) != 0)
2198 		return (error);
2199 
2200 	optype = VOP_MKNOD_DESCOFFSET;
2201 
2202 	error = pathbuf_maybe_copyin(pathname, seg, &pb);
2203 	if (error) {
2204 		return error;
2205 	}
2206 	pathstring = pathbuf_stringcopy_get(pb);
2207 	if (pathstring == NULL) {
2208 		pathbuf_destroy(pb);
2209 		return ENOMEM;
2210 	}
2211 
2212 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2213 
2214 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2215 		goto out;
2216 	vp = nd.ni_vp;
2217 
2218 	if (vp != NULL)
2219 		error = EEXIST;
2220 	else {
2221 		vattr_null(&vattr);
2222 		/* We will read cwdi->cwdi_cmask unlocked. */
2223 		vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2224 		vattr.va_rdev = dev;
2225 
2226 		switch (mode & S_IFMT) {
2227 		case S_IFMT:	/* used by badsect to flag bad sectors */
2228 			vattr.va_type = VBAD;
2229 			break;
2230 		case S_IFCHR:
2231 			vattr.va_type = VCHR;
2232 			break;
2233 		case S_IFBLK:
2234 			vattr.va_type = VBLK;
2235 			break;
2236 		case S_IFWHT:
2237 			optype = VOP_WHITEOUT_DESCOFFSET;
2238 			break;
2239 		case S_IFREG:
2240 #if NVERIEXEC > 0
2241 			error = veriexec_openchk(l, nd.ni_vp, pathstring,
2242 			    O_CREAT);
2243 #endif /* NVERIEXEC > 0 */
2244 			vattr.va_type = VREG;
2245 			vattr.va_rdev = VNOVAL;
2246 			optype = VOP_CREATE_DESCOFFSET;
2247 			break;
2248 		default:
2249 			error = EINVAL;
2250 			break;
2251 		}
2252 	}
2253 	if (error == 0 && optype == VOP_MKNOD_DESCOFFSET
2254 	    && vattr.va_rdev == VNOVAL)
2255 		error = EINVAL;
2256 	if (!error) {
2257 		switch (optype) {
2258 		case VOP_WHITEOUT_DESCOFFSET:
2259 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
2260 			if (error)
2261 				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2262 			vput(nd.ni_dvp);
2263 			break;
2264 
2265 		case VOP_MKNOD_DESCOFFSET:
2266 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
2267 						&nd.ni_cnd, &vattr);
2268 			if (error == 0)
2269 				vrele(nd.ni_vp);
2270 			vput(nd.ni_dvp);
2271 			break;
2272 
2273 		case VOP_CREATE_DESCOFFSET:
2274 			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
2275 						&nd.ni_cnd, &vattr);
2276 			if (error == 0)
2277 				vrele(nd.ni_vp);
2278 			vput(nd.ni_dvp);
2279 			break;
2280 		}
2281 	} else {
2282 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2283 		if (nd.ni_dvp == vp)
2284 			vrele(nd.ni_dvp);
2285 		else
2286 			vput(nd.ni_dvp);
2287 		if (vp)
2288 			vrele(vp);
2289 	}
2290 out:
2291 	pathbuf_stringcopy_put(pb, pathstring);
2292 	pathbuf_destroy(pb);
2293 	return (error);
2294 }
2295 
2296 /*
2297  * Create a named pipe.
2298  */
2299 /* ARGSUSED */
2300 int
2301 sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval)
2302 {
2303 	/* {
2304 		syscallarg(const char *) path;
2305 		syscallarg(int) mode;
2306 	} */
2307 	return do_sys_mkfifoat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, mode));
2308 }
2309 
2310 int
2311 sys_mkfifoat(struct lwp *l, const struct sys_mkfifoat_args *uap,
2312     register_t *retval)
2313 {
2314 	/* {
2315 		syscallarg(int) fd;
2316 		syscallarg(const char *) path;
2317 		syscallarg(int) mode;
2318 	} */
2319 
2320 	return do_sys_mkfifoat(l, SCARG(uap, fd), SCARG(uap, path),
2321 	    SCARG(uap, mode));
2322 }
2323 
2324 static int
2325 do_sys_mkfifoat(struct lwp *l, int fdat, const char *path, mode_t mode)
2326 {
2327 	struct proc *p = l->l_proc;
2328 	struct vattr vattr;
2329 	int error;
2330 	struct pathbuf *pb;
2331 	struct nameidata nd;
2332 
2333 	error = pathbuf_copyin(path, &pb);
2334 	if (error) {
2335 		return error;
2336 	}
2337 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2338 
2339 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
2340 		pathbuf_destroy(pb);
2341 		return error;
2342 	}
2343 	if (nd.ni_vp != NULL) {
2344 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2345 		if (nd.ni_dvp == nd.ni_vp)
2346 			vrele(nd.ni_dvp);
2347 		else
2348 			vput(nd.ni_dvp);
2349 		vrele(nd.ni_vp);
2350 		pathbuf_destroy(pb);
2351 		return (EEXIST);
2352 	}
2353 	vattr_null(&vattr);
2354 	vattr.va_type = VFIFO;
2355 	/* We will read cwdi->cwdi_cmask unlocked. */
2356 	vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2357 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
2358 	if (error == 0)
2359 		vrele(nd.ni_vp);
2360 	vput(nd.ni_dvp);
2361 	pathbuf_destroy(pb);
2362 	return (error);
2363 }
2364 
2365 /*
2366  * Make a hard file link.
2367  */
2368 /* ARGSUSED */
2369 int
2370 do_sys_linkat(struct lwp *l, int fdpath, const char *path, int fdlink,
2371     const char *link, int follow, register_t *retval)
2372 {
2373 	struct vnode *vp;
2374 	struct pathbuf *linkpb;
2375 	struct nameidata nd;
2376 	namei_simple_flags_t ns_flags;
2377 	int error;
2378 
2379 	if (follow & AT_SYMLINK_FOLLOW)
2380 		ns_flags = NSM_FOLLOW_TRYEMULROOT;
2381 	else
2382 		ns_flags = NSM_NOFOLLOW_TRYEMULROOT;
2383 
2384 	error = fd_nameiat_simple_user(l, fdpath, path, ns_flags, &vp);
2385 	if (error != 0)
2386 		return (error);
2387 	error = pathbuf_copyin(link, &linkpb);
2388 	if (error) {
2389 		goto out1;
2390 	}
2391 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2392 	if ((error = fd_nameiat(l, fdlink, &nd)) != 0)
2393 		goto out2;
2394 	if (nd.ni_vp) {
2395 		error = EEXIST;
2396 		goto abortop;
2397 	}
2398 	/* Prevent hard links on directories. */
2399 	if (vp->v_type == VDIR) {
2400 		error = EPERM;
2401 		goto abortop;
2402 	}
2403 	/* Prevent cross-mount operation. */
2404 	if (nd.ni_dvp->v_mount != vp->v_mount) {
2405 		error = EXDEV;
2406 		goto abortop;
2407 	}
2408 	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
2409 	VOP_UNLOCK(nd.ni_dvp);
2410 	vrele(nd.ni_dvp);
2411 out2:
2412 	pathbuf_destroy(linkpb);
2413 out1:
2414 	vrele(vp);
2415 	return (error);
2416 abortop:
2417 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2418 	if (nd.ni_dvp == nd.ni_vp)
2419 		vrele(nd.ni_dvp);
2420 	else
2421 		vput(nd.ni_dvp);
2422 	if (nd.ni_vp != NULL)
2423 		vrele(nd.ni_vp);
2424 	goto out2;
2425 }
2426 
2427 int
2428 sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
2429 {
2430 	/* {
2431 		syscallarg(const char *) path;
2432 		syscallarg(const char *) link;
2433 	} */
2434 	const char *path = SCARG(uap, path);
2435 	const char *link = SCARG(uap, link);
2436 
2437 	return do_sys_linkat(l, AT_FDCWD, path, AT_FDCWD, link,
2438 	    AT_SYMLINK_FOLLOW, retval);
2439 }
2440 
2441 int
2442 sys_linkat(struct lwp *l, const struct sys_linkat_args *uap,
2443     register_t *retval)
2444 {
2445 	/* {
2446 		syscallarg(int) fd1;
2447 		syscallarg(const char *) name1;
2448 		syscallarg(int) fd2;
2449 		syscallarg(const char *) name2;
2450 		syscallarg(int) flags;
2451 	} */
2452 	int fd1 = SCARG(uap, fd1);
2453 	const char *name1 = SCARG(uap, name1);
2454 	int fd2 = SCARG(uap, fd2);
2455 	const char *name2 = SCARG(uap, name2);
2456 	int follow;
2457 
2458 	follow = SCARG(uap, flags) & AT_SYMLINK_FOLLOW;
2459 
2460 	return do_sys_linkat(l, fd1, name1, fd2, name2, follow, retval);
2461 }
2462 
2463 
2464 int
2465 do_sys_symlink(const char *patharg, const char *link, enum uio_seg seg)
2466 {
2467 	return do_sys_symlinkat(NULL, patharg, AT_FDCWD, link, seg);
2468 }
2469 
2470 static int
2471 do_sys_symlinkat(struct lwp *l, const char *patharg, int fdat,
2472     const char *link, enum uio_seg seg)
2473 {
2474 	struct proc *p = curproc;
2475 	struct vattr vattr;
2476 	char *path;
2477 	int error;
2478 	struct pathbuf *linkpb;
2479 	struct nameidata nd;
2480 
2481 	KASSERT(l != NULL || fdat == AT_FDCWD);
2482 
2483 	path = PNBUF_GET();
2484 	if (seg == UIO_USERSPACE) {
2485 		if ((error = copyinstr(patharg, path, MAXPATHLEN, NULL)) != 0)
2486 			goto out1;
2487 		if ((error = pathbuf_copyin(link, &linkpb)) != 0)
2488 			goto out1;
2489 	} else {
2490 		KASSERT(strlen(patharg) < MAXPATHLEN);
2491 		strcpy(path, patharg);
2492 		linkpb = pathbuf_create(link);
2493 		if (linkpb == NULL) {
2494 			error = ENOMEM;
2495 			goto out1;
2496 		}
2497 	}
2498 	ktrkuser("symlink-target", path, strlen(path));
2499 
2500 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2501 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2502 		goto out2;
2503 	if (nd.ni_vp) {
2504 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2505 		if (nd.ni_dvp == nd.ni_vp)
2506 			vrele(nd.ni_dvp);
2507 		else
2508 			vput(nd.ni_dvp);
2509 		vrele(nd.ni_vp);
2510 		error = EEXIST;
2511 		goto out2;
2512 	}
2513 	vattr_null(&vattr);
2514 	vattr.va_type = VLNK;
2515 	/* We will read cwdi->cwdi_cmask unlocked. */
2516 	vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
2517 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
2518 	if (error == 0)
2519 		vrele(nd.ni_vp);
2520 	vput(nd.ni_dvp);
2521 out2:
2522 	pathbuf_destroy(linkpb);
2523 out1:
2524 	PNBUF_PUT(path);
2525 	return (error);
2526 }
2527 
2528 /*
2529  * Make a symbolic link.
2530  */
2531 /* ARGSUSED */
2532 int
2533 sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
2534 {
2535 	/* {
2536 		syscallarg(const char *) path;
2537 		syscallarg(const char *) link;
2538 	} */
2539 
2540 	return do_sys_symlinkat(l, SCARG(uap, path), AT_FDCWD, SCARG(uap, link),
2541 	    UIO_USERSPACE);
2542 }
2543 
2544 int
2545 sys_symlinkat(struct lwp *l, const struct sys_symlinkat_args *uap,
2546     register_t *retval)
2547 {
2548 	/* {
2549 		syscallarg(const char *) path1;
2550 		syscallarg(int) fd;
2551 		syscallarg(const char *) path2;
2552 	} */
2553 
2554 	return do_sys_symlinkat(l, SCARG(uap, path1), SCARG(uap, fd),
2555 	    SCARG(uap, path2), UIO_USERSPACE);
2556 }
2557 
2558 /*
2559  * Delete a whiteout from the filesystem.
2560  */
2561 /* ARGSUSED */
2562 int
2563 sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval)
2564 {
2565 	/* {
2566 		syscallarg(const char *) path;
2567 	} */
2568 	int error;
2569 	struct pathbuf *pb;
2570 	struct nameidata nd;
2571 
2572 	error = pathbuf_copyin(SCARG(uap, path), &pb);
2573 	if (error) {
2574 		return error;
2575 	}
2576 
2577 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT, pb);
2578 	error = namei(&nd);
2579 	if (error) {
2580 		pathbuf_destroy(pb);
2581 		return (error);
2582 	}
2583 
2584 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2585 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2586 		if (nd.ni_dvp == nd.ni_vp)
2587 			vrele(nd.ni_dvp);
2588 		else
2589 			vput(nd.ni_dvp);
2590 		if (nd.ni_vp)
2591 			vrele(nd.ni_vp);
2592 		pathbuf_destroy(pb);
2593 		return (EEXIST);
2594 	}
2595 	if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
2596 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2597 	vput(nd.ni_dvp);
2598 	pathbuf_destroy(pb);
2599 	return (error);
2600 }
2601 
2602 /*
2603  * Delete a name from the filesystem.
2604  */
2605 /* ARGSUSED */
2606 int
2607 sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval)
2608 {
2609 	/* {
2610 		syscallarg(const char *) path;
2611 	} */
2612 
2613 	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path), 0, UIO_USERSPACE);
2614 }
2615 
2616 int
2617 sys_unlinkat(struct lwp *l, const struct sys_unlinkat_args *uap,
2618     register_t *retval)
2619 {
2620 	/* {
2621 		syscallarg(int) fd;
2622 		syscallarg(const char *) path;
2623 		syscallarg(int) flag;
2624 	} */
2625 
2626 	return do_sys_unlinkat(l, SCARG(uap, fd), SCARG(uap, path),
2627 	    SCARG(uap, flag), UIO_USERSPACE);
2628 }
2629 
2630 int
2631 do_sys_unlink(const char *arg, enum uio_seg seg)
2632 {
2633 	return do_sys_unlinkat(NULL, AT_FDCWD, arg, 0, seg);
2634 }
2635 
2636 static int
2637 do_sys_unlinkat(struct lwp *l, int fdat, const char *arg, int flags,
2638     enum uio_seg seg)
2639 {
2640 	struct vnode *vp;
2641 	int error;
2642 	struct pathbuf *pb;
2643 	struct nameidata nd;
2644 	const char *pathstring;
2645 
2646 	KASSERT(l != NULL || fdat == AT_FDCWD);
2647 
2648 	error = pathbuf_maybe_copyin(arg, seg, &pb);
2649 	if (error) {
2650 		return error;
2651 	}
2652 	pathstring = pathbuf_stringcopy_get(pb);
2653 	if (pathstring == NULL) {
2654 		pathbuf_destroy(pb);
2655 		return ENOMEM;
2656 	}
2657 
2658 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, pb);
2659 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2660 		goto out;
2661 	vp = nd.ni_vp;
2662 
2663 	/*
2664 	 * The root of a mounted filesystem cannot be deleted.
2665 	 */
2666 	if ((vp->v_vflag & VV_ROOT) != 0) {
2667 		error = EBUSY;
2668 		goto abort;
2669 	}
2670 
2671 	if ((vp->v_type == VDIR) && (vp->v_mountedhere != NULL)) {
2672 		error = EBUSY;
2673 		goto abort;
2674 	}
2675 
2676 	/*
2677 	 * No rmdir "." please.
2678 	 */
2679 	if (nd.ni_dvp == vp) {
2680 		error = EINVAL;
2681 		goto abort;
2682 	}
2683 
2684 	/*
2685 	 * AT_REMOVEDIR is required to remove a directory
2686 	 */
2687 	if (vp->v_type == VDIR) {
2688 		if (!(flags & AT_REMOVEDIR)) {
2689 			error = EPERM;
2690 			goto abort;
2691 		} else {
2692 			error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2693 			goto out;
2694 		}
2695 	}
2696 
2697 	/*
2698 	 * Starting here we only deal with non directories.
2699 	 */
2700 	if (flags & AT_REMOVEDIR) {
2701 		error = ENOTDIR;
2702 		goto abort;
2703 	}
2704 
2705 #if NVERIEXEC > 0
2706 	/* Handle remove requests for veriexec entries. */
2707 	if ((error = veriexec_removechk(curlwp, nd.ni_vp, pathstring)) != 0) {
2708 		goto abort;
2709 	}
2710 #endif /* NVERIEXEC > 0 */
2711 
2712 #ifdef FILEASSOC
2713 	(void)fileassoc_file_delete(vp);
2714 #endif /* FILEASSOC */
2715 	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2716 	goto out;
2717 
2718 abort:
2719 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2720 	if (nd.ni_dvp == vp)
2721 		vrele(nd.ni_dvp);
2722 	else
2723 		vput(nd.ni_dvp);
2724 	vput(vp);
2725 
2726 out:
2727 	pathbuf_stringcopy_put(pb, pathstring);
2728 	pathbuf_destroy(pb);
2729 	return (error);
2730 }
2731 
2732 /*
2733  * Reposition read/write file offset.
2734  */
2735 int
2736 sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
2737 {
2738 	/* {
2739 		syscallarg(int) fd;
2740 		syscallarg(int) pad;
2741 		syscallarg(off_t) offset;
2742 		syscallarg(int) whence;
2743 	} */
2744 	kauth_cred_t cred = l->l_cred;
2745 	file_t *fp;
2746 	struct vnode *vp;
2747 	struct vattr vattr;
2748 	off_t newoff;
2749 	int error, fd;
2750 
2751 	fd = SCARG(uap, fd);
2752 
2753 	if ((fp = fd_getfile(fd)) == NULL)
2754 		return (EBADF);
2755 
2756 	vp = fp->f_vnode;
2757 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2758 		error = ESPIPE;
2759 		goto out;
2760 	}
2761 
2762 	switch (SCARG(uap, whence)) {
2763 	case SEEK_CUR:
2764 		newoff = fp->f_offset + SCARG(uap, offset);
2765 		break;
2766 	case SEEK_END:
2767 		vn_lock(vp, LK_SHARED | LK_RETRY);
2768 		error = VOP_GETATTR(vp, &vattr, cred);
2769 		VOP_UNLOCK(vp);
2770 		if (error) {
2771 			goto out;
2772 		}
2773 		newoff = SCARG(uap, offset) + vattr.va_size;
2774 		break;
2775 	case SEEK_SET:
2776 		newoff = SCARG(uap, offset);
2777 		break;
2778 	default:
2779 		error = EINVAL;
2780 		goto out;
2781 	}
2782 	if ((error = VOP_SEEK(vp, fp->f_offset, newoff, cred)) == 0) {
2783 		*(off_t *)retval = fp->f_offset = newoff;
2784 	}
2785  out:
2786  	fd_putfile(fd);
2787 	return (error);
2788 }
2789 
2790 /*
2791  * Positional read system call.
2792  */
2793 int
2794 sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
2795 {
2796 	/* {
2797 		syscallarg(int) fd;
2798 		syscallarg(void *) buf;
2799 		syscallarg(size_t) nbyte;
2800 		syscallarg(off_t) offset;
2801 	} */
2802 	file_t *fp;
2803 	struct vnode *vp;
2804 	off_t offset;
2805 	int error, fd = SCARG(uap, fd);
2806 
2807 	if ((fp = fd_getfile(fd)) == NULL)
2808 		return (EBADF);
2809 
2810 	if ((fp->f_flag & FREAD) == 0) {
2811 		fd_putfile(fd);
2812 		return (EBADF);
2813 	}
2814 
2815 	vp = fp->f_vnode;
2816 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2817 		error = ESPIPE;
2818 		goto out;
2819 	}
2820 
2821 	offset = SCARG(uap, offset);
2822 
2823 	/*
2824 	 * XXX This works because no file systems actually
2825 	 * XXX take any action on the seek operation.
2826 	 */
2827 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2828 		goto out;
2829 
2830 	/* dofileread() will unuse the descriptor for us */
2831 	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2832 	    &offset, 0, retval));
2833 
2834  out:
2835 	fd_putfile(fd);
2836 	return (error);
2837 }
2838 
2839 /*
2840  * Positional scatter read system call.
2841  */
2842 int
2843 sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval)
2844 {
2845 	/* {
2846 		syscallarg(int) fd;
2847 		syscallarg(const struct iovec *) iovp;
2848 		syscallarg(int) iovcnt;
2849 		syscallarg(off_t) offset;
2850 	} */
2851 	off_t offset = SCARG(uap, offset);
2852 
2853 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
2854 	    SCARG(uap, iovcnt), &offset, 0, retval);
2855 }
2856 
2857 /*
2858  * Positional write system call.
2859  */
2860 int
2861 sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval)
2862 {
2863 	/* {
2864 		syscallarg(int) fd;
2865 		syscallarg(const void *) buf;
2866 		syscallarg(size_t) nbyte;
2867 		syscallarg(off_t) offset;
2868 	} */
2869 	file_t *fp;
2870 	struct vnode *vp;
2871 	off_t offset;
2872 	int error, fd = SCARG(uap, fd);
2873 
2874 	if ((fp = fd_getfile(fd)) == NULL)
2875 		return (EBADF);
2876 
2877 	if ((fp->f_flag & FWRITE) == 0) {
2878 		fd_putfile(fd);
2879 		return (EBADF);
2880 	}
2881 
2882 	vp = fp->f_vnode;
2883 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2884 		error = ESPIPE;
2885 		goto out;
2886 	}
2887 
2888 	offset = SCARG(uap, offset);
2889 
2890 	/*
2891 	 * XXX This works because no file systems actually
2892 	 * XXX take any action on the seek operation.
2893 	 */
2894 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2895 		goto out;
2896 
2897 	/* dofilewrite() will unuse the descriptor for us */
2898 	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2899 	    &offset, 0, retval));
2900 
2901  out:
2902 	fd_putfile(fd);
2903 	return (error);
2904 }
2905 
2906 /*
2907  * Positional gather write system call.
2908  */
2909 int
2910 sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval)
2911 {
2912 	/* {
2913 		syscallarg(int) fd;
2914 		syscallarg(const struct iovec *) iovp;
2915 		syscallarg(int) iovcnt;
2916 		syscallarg(off_t) offset;
2917 	} */
2918 	off_t offset = SCARG(uap, offset);
2919 
2920 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
2921 	    SCARG(uap, iovcnt), &offset, 0, retval);
2922 }
2923 
2924 /*
2925  * Check access permissions.
2926  */
2927 int
2928 sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval)
2929 {
2930 	/* {
2931 		syscallarg(const char *) path;
2932 		syscallarg(int) flags;
2933 	} */
2934 
2935 	return do_sys_accessat(l, AT_FDCWD, SCARG(uap, path),
2936 	     SCARG(uap, flags), 0);
2937 }
2938 
2939 int
2940 do_sys_accessat(struct lwp *l, int fdat, const char *path,
2941     int mode, int flags)
2942 {
2943 	kauth_cred_t cred;
2944 	struct vnode *vp;
2945 	int error, nd_flag, vmode;
2946 	struct pathbuf *pb;
2947 	struct nameidata nd;
2948 
2949 	CTASSERT(F_OK == 0);
2950 	if ((mode & ~(R_OK | W_OK | X_OK)) != 0) {
2951 		/* nonsense mode */
2952 		return EINVAL;
2953 	}
2954 
2955 	nd_flag = FOLLOW | LOCKLEAF | TRYEMULROOT;
2956 	if (flags & AT_SYMLINK_NOFOLLOW)
2957 		nd_flag &= ~FOLLOW;
2958 
2959 	error = pathbuf_copyin(path, &pb);
2960 	if (error)
2961 		return error;
2962 
2963 	NDINIT(&nd, LOOKUP, nd_flag, pb);
2964 
2965 	/* Override default credentials */
2966 	cred = kauth_cred_dup(l->l_cred);
2967 	if (!(flags & AT_EACCESS)) {
2968 		kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
2969 		kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
2970 	}
2971 	nd.ni_cnd.cn_cred = cred;
2972 
2973 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
2974 		pathbuf_destroy(pb);
2975 		goto out;
2976 	}
2977 	vp = nd.ni_vp;
2978 	pathbuf_destroy(pb);
2979 
2980 	/* Flags == 0 means only check for existence. */
2981 	if (mode) {
2982 		vmode = 0;
2983 		if (mode & R_OK)
2984 			vmode |= VREAD;
2985 		if (mode & W_OK)
2986 			vmode |= VWRITE;
2987 		if (mode & X_OK)
2988 			vmode |= VEXEC;
2989 
2990 		error = VOP_ACCESS(vp, vmode, cred);
2991 		if (!error && (vmode & VWRITE))
2992 			error = vn_writechk(vp);
2993 	}
2994 	vput(vp);
2995 out:
2996 	kauth_cred_free(cred);
2997 	return (error);
2998 }
2999 
3000 int
3001 sys_faccessat(struct lwp *l, const struct sys_faccessat_args *uap,
3002     register_t *retval)
3003 {
3004 	/* {
3005 		syscallarg(int) fd;
3006 		syscallarg(const char *) path;
3007 		syscallarg(int) amode;
3008 		syscallarg(int) flag;
3009 	} */
3010 
3011 	return do_sys_accessat(l, SCARG(uap, fd), SCARG(uap, path),
3012 	     SCARG(uap, amode), SCARG(uap, flag));
3013 }
3014 
3015 /*
3016  * Common code for all sys_stat functions, including compat versions.
3017  */
3018 int
3019 do_sys_stat(const char *userpath, unsigned int nd_flag,
3020     struct stat *sb)
3021 {
3022 	return do_sys_statat(NULL, AT_FDCWD, userpath, nd_flag, sb);
3023 }
3024 
3025 int
3026 do_sys_statat(struct lwp *l, int fdat, const char *userpath,
3027     unsigned int nd_flag, struct stat *sb)
3028 {
3029 	int error;
3030 	struct pathbuf *pb;
3031 	struct nameidata nd;
3032 
3033 	KASSERT(l != NULL || fdat == AT_FDCWD);
3034 
3035 	error = pathbuf_copyin(userpath, &pb);
3036 	if (error) {
3037 		return error;
3038 	}
3039 
3040 	NDINIT(&nd, LOOKUP, nd_flag | LOCKLEAF | TRYEMULROOT, pb);
3041 
3042 	error = fd_nameiat(l, fdat, &nd);
3043 	if (error != 0) {
3044 		pathbuf_destroy(pb);
3045 		return error;
3046 	}
3047 	error = vn_stat(nd.ni_vp, sb);
3048 	vput(nd.ni_vp);
3049 	pathbuf_destroy(pb);
3050 	return error;
3051 }
3052 
3053 /*
3054  * Get file status; this version follows links.
3055  */
3056 /* ARGSUSED */
3057 int
3058 sys___stat50(struct lwp *l, const struct sys___stat50_args *uap, register_t *retval)
3059 {
3060 	/* {
3061 		syscallarg(const char *) path;
3062 		syscallarg(struct stat *) ub;
3063 	} */
3064 	struct stat sb;
3065 	int error;
3066 
3067 	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), FOLLOW, &sb);
3068 	if (error)
3069 		return error;
3070 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3071 }
3072 
3073 /*
3074  * Get file status; this version does not follow links.
3075  */
3076 /* ARGSUSED */
3077 int
3078 sys___lstat50(struct lwp *l, const struct sys___lstat50_args *uap, register_t *retval)
3079 {
3080 	/* {
3081 		syscallarg(const char *) path;
3082 		syscallarg(struct stat *) ub;
3083 	} */
3084 	struct stat sb;
3085 	int error;
3086 
3087 	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), NOFOLLOW, &sb);
3088 	if (error)
3089 		return error;
3090 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3091 }
3092 
3093 int
3094 sys_fstatat(struct lwp *l, const struct sys_fstatat_args *uap,
3095     register_t *retval)
3096 {
3097 	/* {
3098 		syscallarg(int) fd;
3099 		syscallarg(const char *) path;
3100 		syscallarg(struct stat *) buf;
3101 		syscallarg(int) flag;
3102 	} */
3103 	unsigned int nd_flag;
3104 	struct stat sb;
3105 	int error;
3106 
3107 	if (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW)
3108 		nd_flag = NOFOLLOW;
3109 	else
3110 		nd_flag = FOLLOW;
3111 
3112 	error = do_sys_statat(l, SCARG(uap, fd), SCARG(uap, path), nd_flag,
3113 	    &sb);
3114 	if (error)
3115 		return error;
3116 	return copyout(&sb, SCARG(uap, buf), sizeof(sb));
3117 }
3118 
3119 /*
3120  * Get configurable pathname variables.
3121  */
3122 /* ARGSUSED */
3123 int
3124 sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap, register_t *retval)
3125 {
3126 	/* {
3127 		syscallarg(const char *) path;
3128 		syscallarg(int) name;
3129 	} */
3130 	int error;
3131 	struct pathbuf *pb;
3132 	struct nameidata nd;
3133 
3134 	error = pathbuf_copyin(SCARG(uap, path), &pb);
3135 	if (error) {
3136 		return error;
3137 	}
3138 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
3139 	if ((error = namei(&nd)) != 0) {
3140 		pathbuf_destroy(pb);
3141 		return (error);
3142 	}
3143 	error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), retval);
3144 	vput(nd.ni_vp);
3145 	pathbuf_destroy(pb);
3146 	return (error);
3147 }
3148 
3149 /*
3150  * Return target name of a symbolic link.
3151  */
3152 /* ARGSUSED */
3153 int
3154 sys_readlink(struct lwp *l, const struct sys_readlink_args *uap,
3155     register_t *retval)
3156 {
3157 	/* {
3158 		syscallarg(const char *) path;
3159 		syscallarg(char *) buf;
3160 		syscallarg(size_t) count;
3161 	} */
3162 	return do_sys_readlinkat(l, AT_FDCWD, SCARG(uap, path),
3163 	    SCARG(uap, buf), SCARG(uap, count), retval);
3164 }
3165 
3166 static int
3167 do_sys_readlinkat(struct lwp *l, int fdat, const char *path, char *buf,
3168     size_t count, register_t *retval)
3169 {
3170 	struct vnode *vp;
3171 	struct iovec aiov;
3172 	struct uio auio;
3173 	int error;
3174 	struct pathbuf *pb;
3175 	struct nameidata nd;
3176 
3177 	error = pathbuf_copyin(path, &pb);
3178 	if (error) {
3179 		return error;
3180 	}
3181 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb);
3182 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
3183 		pathbuf_destroy(pb);
3184 		return error;
3185 	}
3186 	vp = nd.ni_vp;
3187 	pathbuf_destroy(pb);
3188 	if (vp->v_type != VLNK)
3189 		error = EINVAL;
3190 	else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
3191 	    (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
3192 		aiov.iov_base = buf;
3193 		aiov.iov_len = count;
3194 		auio.uio_iov = &aiov;
3195 		auio.uio_iovcnt = 1;
3196 		auio.uio_offset = 0;
3197 		auio.uio_rw = UIO_READ;
3198 		KASSERT(l == curlwp);
3199 		auio.uio_vmspace = l->l_proc->p_vmspace;
3200 		auio.uio_resid = count;
3201 		if ((error = VOP_READLINK(vp, &auio, l->l_cred)) == 0)
3202 			*retval = count - auio.uio_resid;
3203 	}
3204 	vput(vp);
3205 	return (error);
3206 }
3207 
3208 int
3209 sys_readlinkat(struct lwp *l, const struct sys_readlinkat_args *uap,
3210     register_t *retval)
3211 {
3212 	/* {
3213 		syscallarg(int) fd;
3214 		syscallarg(const char *) path;
3215 		syscallarg(char *) buf;
3216 		syscallarg(size_t) bufsize;
3217 	} */
3218 
3219 	return do_sys_readlinkat(l, SCARG(uap, fd), SCARG(uap, path),
3220 	    SCARG(uap, buf), SCARG(uap, bufsize), retval);
3221 }
3222 
3223 /*
3224  * Change flags of a file given a path name.
3225  */
3226 /* ARGSUSED */
3227 int
3228 sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval)
3229 {
3230 	/* {
3231 		syscallarg(const char *) path;
3232 		syscallarg(u_long) flags;
3233 	} */
3234 	struct vnode *vp;
3235 	int error;
3236 
3237 	error = namei_simple_user(SCARG(uap, path),
3238 				NSM_FOLLOW_TRYEMULROOT, &vp);
3239 	if (error != 0)
3240 		return (error);
3241 	error = change_flags(vp, SCARG(uap, flags), l);
3242 	vput(vp);
3243 	return (error);
3244 }
3245 
3246 /*
3247  * Change flags of a file given a file descriptor.
3248  */
3249 /* ARGSUSED */
3250 int
3251 sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval)
3252 {
3253 	/* {
3254 		syscallarg(int) fd;
3255 		syscallarg(u_long) flags;
3256 	} */
3257 	struct vnode *vp;
3258 	file_t *fp;
3259 	int error;
3260 
3261 	/* fd_getvnode() will use the descriptor for us */
3262 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3263 		return (error);
3264 	vp = fp->f_vnode;
3265 	error = change_flags(vp, SCARG(uap, flags), l);
3266 	VOP_UNLOCK(vp);
3267 	fd_putfile(SCARG(uap, fd));
3268 	return (error);
3269 }
3270 
3271 /*
3272  * Change flags of a file given a path name; this version does
3273  * not follow links.
3274  */
3275 int
3276 sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval)
3277 {
3278 	/* {
3279 		syscallarg(const char *) path;
3280 		syscallarg(u_long) flags;
3281 	} */
3282 	struct vnode *vp;
3283 	int error;
3284 
3285 	error = namei_simple_user(SCARG(uap, path),
3286 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3287 	if (error != 0)
3288 		return (error);
3289 	error = change_flags(vp, SCARG(uap, flags), l);
3290 	vput(vp);
3291 	return (error);
3292 }
3293 
3294 /*
3295  * Common routine to change flags of a file.
3296  */
3297 int
3298 change_flags(struct vnode *vp, u_long flags, struct lwp *l)
3299 {
3300 	struct vattr vattr;
3301 	int error;
3302 
3303 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3304 
3305 	vattr_null(&vattr);
3306 	vattr.va_flags = flags;
3307 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3308 
3309 	return (error);
3310 }
3311 
3312 /*
3313  * Change mode of a file given path name; this version follows links.
3314  */
3315 /* ARGSUSED */
3316 int
3317 sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
3318 {
3319 	/* {
3320 		syscallarg(const char *) path;
3321 		syscallarg(int) mode;
3322 	} */
3323 	return do_sys_chmodat(l, AT_FDCWD, SCARG(uap, path),
3324 			      SCARG(uap, mode), 0);
3325 }
3326 
3327 int
3328 do_sys_chmodat(struct lwp *l, int fdat, const char *path, int mode, int flags)
3329 {
3330 	int error;
3331 	struct vnode *vp;
3332 	namei_simple_flags_t ns_flag;
3333 
3334 	if (flags & AT_SYMLINK_NOFOLLOW)
3335 		ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3336 	else
3337 		ns_flag = NSM_FOLLOW_TRYEMULROOT;
3338 
3339 	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3340 	if (error != 0)
3341 		return error;
3342 
3343 	error = change_mode(vp, mode, l);
3344 
3345 	vrele(vp);
3346 
3347 	return (error);
3348 }
3349 
3350 /*
3351  * Change mode of a file given a file descriptor.
3352  */
3353 /* ARGSUSED */
3354 int
3355 sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval)
3356 {
3357 	/* {
3358 		syscallarg(int) fd;
3359 		syscallarg(int) mode;
3360 	} */
3361 	file_t *fp;
3362 	int error;
3363 
3364 	/* fd_getvnode() will use the descriptor for us */
3365 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3366 		return (error);
3367 	error = change_mode(fp->f_vnode, SCARG(uap, mode), l);
3368 	fd_putfile(SCARG(uap, fd));
3369 	return (error);
3370 }
3371 
3372 int
3373 sys_fchmodat(struct lwp *l, const struct sys_fchmodat_args *uap,
3374     register_t *retval)
3375 {
3376 	/* {
3377 		syscallarg(int) fd;
3378 		syscallarg(const char *) path;
3379 		syscallarg(int) mode;
3380 		syscallarg(int) flag;
3381 	} */
3382 
3383 	return do_sys_chmodat(l, SCARG(uap, fd), SCARG(uap, path),
3384 			      SCARG(uap, mode), SCARG(uap, flag));
3385 }
3386 
3387 /*
3388  * Change mode of a file given path name; this version does not follow links.
3389  */
3390 /* ARGSUSED */
3391 int
3392 sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval)
3393 {
3394 	/* {
3395 		syscallarg(const char *) path;
3396 		syscallarg(int) mode;
3397 	} */
3398 	int error;
3399 	struct vnode *vp;
3400 
3401 	error = namei_simple_user(SCARG(uap, path),
3402 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3403 	if (error != 0)
3404 		return (error);
3405 
3406 	error = change_mode(vp, SCARG(uap, mode), l);
3407 
3408 	vrele(vp);
3409 	return (error);
3410 }
3411 
3412 /*
3413  * Common routine to set mode given a vnode.
3414  */
3415 static int
3416 change_mode(struct vnode *vp, int mode, struct lwp *l)
3417 {
3418 	struct vattr vattr;
3419 	int error;
3420 
3421 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3422 	vattr_null(&vattr);
3423 	vattr.va_mode = mode & ALLPERMS;
3424 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3425 	VOP_UNLOCK(vp);
3426 	return (error);
3427 }
3428 
3429 /*
3430  * Set ownership given a path name; this version follows links.
3431  */
3432 /* ARGSUSED */
3433 int
3434 sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
3435 {
3436 	/* {
3437 		syscallarg(const char *) path;
3438 		syscallarg(uid_t) uid;
3439 		syscallarg(gid_t) gid;
3440 	} */
3441 	return do_sys_chownat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap,uid),
3442 			      SCARG(uap, gid), 0);
3443 }
3444 
3445 int
3446 do_sys_chownat(struct lwp *l, int fdat, const char *path, uid_t uid,
3447    gid_t gid, int flags)
3448 {
3449 	int error;
3450 	struct vnode *vp;
3451 	namei_simple_flags_t ns_flag;
3452 
3453 	if (flags & AT_SYMLINK_NOFOLLOW)
3454 		ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3455 	else
3456 		ns_flag = NSM_FOLLOW_TRYEMULROOT;
3457 
3458 	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3459 	if (error != 0)
3460 		return error;
3461 
3462 	error = change_owner(vp, uid, gid, l, 0);
3463 
3464 	vrele(vp);
3465 
3466 	return (error);
3467 }
3468 
3469 /*
3470  * Set ownership given a path name; this version follows links.
3471  * Provides POSIX semantics.
3472  */
3473 /* ARGSUSED */
3474 int
3475 sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval)
3476 {
3477 	/* {
3478 		syscallarg(const char *) path;
3479 		syscallarg(uid_t) uid;
3480 		syscallarg(gid_t) gid;
3481 	} */
3482 	int error;
3483 	struct vnode *vp;
3484 
3485 	error = namei_simple_user(SCARG(uap, path),
3486 				NSM_FOLLOW_TRYEMULROOT, &vp);
3487 	if (error != 0)
3488 		return (error);
3489 
3490 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3491 
3492 	vrele(vp);
3493 	return (error);
3494 }
3495 
3496 /*
3497  * Set ownership given a file descriptor.
3498  */
3499 /* ARGSUSED */
3500 int
3501 sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval)
3502 {
3503 	/* {
3504 		syscallarg(int) fd;
3505 		syscallarg(uid_t) uid;
3506 		syscallarg(gid_t) gid;
3507 	} */
3508 	int error;
3509 	file_t *fp;
3510 
3511 	/* fd_getvnode() will use the descriptor for us */
3512 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3513 		return (error);
3514 	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3515 	    l, 0);
3516 	fd_putfile(SCARG(uap, fd));
3517 	return (error);
3518 }
3519 
3520 int
3521 sys_fchownat(struct lwp *l, const struct sys_fchownat_args *uap,
3522     register_t *retval)
3523 {
3524 	/* {
3525 		syscallarg(int) fd;
3526 		syscallarg(const char *) path;
3527 		syscallarg(uid_t) owner;
3528 		syscallarg(gid_t) group;
3529 		syscallarg(int) flag;
3530 	} */
3531 
3532 	return do_sys_chownat(l, SCARG(uap, fd), SCARG(uap, path),
3533 			      SCARG(uap, owner), SCARG(uap, group),
3534 			      SCARG(uap, flag));
3535 }
3536 
3537 /*
3538  * Set ownership given a file descriptor, providing POSIX/XPG semantics.
3539  */
3540 /* ARGSUSED */
3541 int
3542 sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval)
3543 {
3544 	/* {
3545 		syscallarg(int) fd;
3546 		syscallarg(uid_t) uid;
3547 		syscallarg(gid_t) gid;
3548 	} */
3549 	int error;
3550 	file_t *fp;
3551 
3552 	/* fd_getvnode() will use the descriptor for us */
3553 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3554 		return (error);
3555 	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3556 	    l, 1);
3557 	fd_putfile(SCARG(uap, fd));
3558 	return (error);
3559 }
3560 
3561 /*
3562  * Set ownership given a path name; this version does not follow links.
3563  */
3564 /* ARGSUSED */
3565 int
3566 sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval)
3567 {
3568 	/* {
3569 		syscallarg(const char *) path;
3570 		syscallarg(uid_t) uid;
3571 		syscallarg(gid_t) gid;
3572 	} */
3573 	int error;
3574 	struct vnode *vp;
3575 
3576 	error = namei_simple_user(SCARG(uap, path),
3577 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3578 	if (error != 0)
3579 		return (error);
3580 
3581 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
3582 
3583 	vrele(vp);
3584 	return (error);
3585 }
3586 
3587 /*
3588  * Set ownership given a path name; this version does not follow links.
3589  * Provides POSIX/XPG semantics.
3590  */
3591 /* ARGSUSED */
3592 int
3593 sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval)
3594 {
3595 	/* {
3596 		syscallarg(const char *) path;
3597 		syscallarg(uid_t) uid;
3598 		syscallarg(gid_t) gid;
3599 	} */
3600 	int error;
3601 	struct vnode *vp;
3602 
3603 	error = namei_simple_user(SCARG(uap, path),
3604 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3605 	if (error != 0)
3606 		return (error);
3607 
3608 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3609 
3610 	vrele(vp);
3611 	return (error);
3612 }
3613 
3614 /*
3615  * Common routine to set ownership given a vnode.
3616  */
3617 static int
3618 change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
3619     int posix_semantics)
3620 {
3621 	struct vattr vattr;
3622 	mode_t newmode;
3623 	int error;
3624 
3625 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3626 	if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
3627 		goto out;
3628 
3629 #define CHANGED(x) ((int)(x) != -1)
3630 	newmode = vattr.va_mode;
3631 	if (posix_semantics) {
3632 		/*
3633 		 * POSIX/XPG semantics: if the caller is not the super-user,
3634 		 * clear set-user-id and set-group-id bits.  Both POSIX and
3635 		 * the XPG consider the behaviour for calls by the super-user
3636 		 * implementation-defined; we leave the set-user-id and set-
3637 		 * group-id settings intact in that case.
3638 		 */
3639 		if (vattr.va_mode & S_ISUID) {
3640 			if (kauth_authorize_vnode(l->l_cred,
3641 			    KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0)
3642 				newmode &= ~S_ISUID;
3643 		}
3644 		if (vattr.va_mode & S_ISGID) {
3645 			if (kauth_authorize_vnode(l->l_cred,
3646 			    KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0)
3647 				newmode &= ~S_ISGID;
3648 		}
3649 	} else {
3650 		/*
3651 		 * NetBSD semantics: when changing owner and/or group,
3652 		 * clear the respective bit(s).
3653 		 */
3654 		if (CHANGED(uid))
3655 			newmode &= ~S_ISUID;
3656 		if (CHANGED(gid))
3657 			newmode &= ~S_ISGID;
3658 	}
3659 	/* Update va_mode iff altered. */
3660 	if (vattr.va_mode == newmode)
3661 		newmode = VNOVAL;
3662 
3663 	vattr_null(&vattr);
3664 	vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
3665 	vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
3666 	vattr.va_mode = newmode;
3667 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3668 #undef CHANGED
3669 
3670 out:
3671 	VOP_UNLOCK(vp);
3672 	return (error);
3673 }
3674 
3675 /*
3676  * Set the access and modification times given a path name; this
3677  * version follows links.
3678  */
3679 /* ARGSUSED */
3680 int
3681 sys___utimes50(struct lwp *l, const struct sys___utimes50_args *uap,
3682     register_t *retval)
3683 {
3684 	/* {
3685 		syscallarg(const char *) path;
3686 		syscallarg(const struct timeval *) tptr;
3687 	} */
3688 
3689 	return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
3690 	    SCARG(uap, tptr), UIO_USERSPACE);
3691 }
3692 
3693 /*
3694  * Set the access and modification times given a file descriptor.
3695  */
3696 /* ARGSUSED */
3697 int
3698 sys___futimes50(struct lwp *l, const struct sys___futimes50_args *uap,
3699     register_t *retval)
3700 {
3701 	/* {
3702 		syscallarg(int) fd;
3703 		syscallarg(const struct timeval *) tptr;
3704 	} */
3705 	int error;
3706 	file_t *fp;
3707 
3708 	/* fd_getvnode() will use the descriptor for us */
3709 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3710 		return (error);
3711 	error = do_sys_utimes(l, fp->f_vnode, NULL, 0, SCARG(uap, tptr),
3712 	    UIO_USERSPACE);
3713 	fd_putfile(SCARG(uap, fd));
3714 	return (error);
3715 }
3716 
3717 int
3718 sys_futimens(struct lwp *l, const struct sys_futimens_args *uap,
3719     register_t *retval)
3720 {
3721 	/* {
3722 		syscallarg(int) fd;
3723 		syscallarg(const struct timespec *) tptr;
3724 	} */
3725 	int error;
3726 	file_t *fp;
3727 
3728 	/* fd_getvnode() will use the descriptor for us */
3729 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3730 		return (error);
3731 	error = do_sys_utimensat(l, AT_FDCWD, fp->f_vnode, NULL, 0,
3732 	    SCARG(uap, tptr), UIO_USERSPACE);
3733 	fd_putfile(SCARG(uap, fd));
3734 	return (error);
3735 }
3736 
3737 /*
3738  * Set the access and modification times given a path name; this
3739  * version does not follow links.
3740  */
3741 int
3742 sys___lutimes50(struct lwp *l, const struct sys___lutimes50_args *uap,
3743     register_t *retval)
3744 {
3745 	/* {
3746 		syscallarg(const char *) path;
3747 		syscallarg(const struct timeval *) tptr;
3748 	} */
3749 
3750 	return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
3751 	    SCARG(uap, tptr), UIO_USERSPACE);
3752 }
3753 
3754 int
3755 sys_utimensat(struct lwp *l, const struct sys_utimensat_args *uap,
3756     register_t *retval)
3757 {
3758 	/* {
3759 		syscallarg(int) fd;
3760 		syscallarg(const char *) path;
3761 		syscallarg(const struct timespec *) tptr;
3762 		syscallarg(int) flag;
3763 	} */
3764 	int follow;
3765 	const struct timespec *tptr;
3766 	int error;
3767 
3768 	tptr = SCARG(uap, tptr);
3769 	follow = (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3770 
3771 	error = do_sys_utimensat(l, SCARG(uap, fd), NULL,
3772 	    SCARG(uap, path), follow, tptr, UIO_USERSPACE);
3773 
3774 	return error;
3775 }
3776 
3777 /*
3778  * Common routine to set access and modification times given a vnode.
3779  */
3780 int
3781 do_sys_utimens(struct lwp *l, struct vnode *vp, const char *path, int flag,
3782     const struct timespec *tptr, enum uio_seg seg)
3783 {
3784 	return do_sys_utimensat(l, AT_FDCWD, vp, path, flag, tptr, seg);
3785 }
3786 
3787 int
3788 do_sys_utimensat(struct lwp *l, int fdat, struct vnode *vp,
3789     const char *path, int flag, const struct timespec *tptr, enum uio_seg seg)
3790 {
3791 	struct vattr vattr;
3792 	int error, dorele = 0;
3793 	namei_simple_flags_t sflags;
3794 	bool vanull, setbirthtime;
3795 	struct timespec ts[2];
3796 
3797 	KASSERT(l != NULL || fdat == AT_FDCWD);
3798 
3799 	/*
3800 	 * I have checked all callers and they pass either FOLLOW,
3801 	 * NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW
3802 	 * is 0. More to the point, they don't pass anything else.
3803 	 * Let's keep it that way at least until the namei interfaces
3804 	 * are fully sanitized.
3805 	 */
3806 	KASSERT(flag == NOFOLLOW || flag == FOLLOW);
3807 	sflags = (flag == FOLLOW) ?
3808 		NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT;
3809 
3810 	if (tptr == NULL) {
3811 		vanull = true;
3812 		nanotime(&ts[0]);
3813 		ts[1] = ts[0];
3814 	} else {
3815 		vanull = false;
3816 		if (seg != UIO_SYSSPACE) {
3817 			error = copyin(tptr, ts, sizeof (ts));
3818 			if (error != 0)
3819 				return error;
3820 		} else {
3821 			ts[0] = tptr[0];
3822 			ts[1] = tptr[1];
3823 		}
3824 	}
3825 
3826 	if (ts[0].tv_nsec == UTIME_NOW) {
3827 		nanotime(&ts[0]);
3828 		if (ts[1].tv_nsec == UTIME_NOW) {
3829 			vanull = true;
3830 			ts[1] = ts[0];
3831 		}
3832 	} else if (ts[1].tv_nsec == UTIME_NOW)
3833 		nanotime(&ts[1]);
3834 
3835 	if (vp == NULL) {
3836 		/* note: SEG describes TPTR, not PATH; PATH is always user */
3837 		error = fd_nameiat_simple_user(l, fdat, path, sflags, &vp);
3838 		if (error != 0)
3839 			return error;
3840 		dorele = 1;
3841 	}
3842 
3843 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3844 	setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 &&
3845 	    timespeccmp(&ts[1], &vattr.va_birthtime, <));
3846 	vattr_null(&vattr);
3847 
3848 	if (ts[0].tv_nsec != UTIME_OMIT)
3849 		vattr.va_atime = ts[0];
3850 
3851 	if (ts[1].tv_nsec != UTIME_OMIT) {
3852 		vattr.va_mtime = ts[1];
3853 		if (setbirthtime)
3854 			vattr.va_birthtime = ts[1];
3855 	}
3856 
3857 	if (vanull)
3858 		vattr.va_vaflags |= VA_UTIMES_NULL;
3859 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3860 	VOP_UNLOCK(vp);
3861 
3862 	if (dorele != 0)
3863 		vrele(vp);
3864 
3865 	return error;
3866 }
3867 
3868 int
3869 do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
3870     const struct timeval *tptr, enum uio_seg seg)
3871 {
3872 	struct timespec ts[2];
3873 	struct timespec *tsptr = NULL;
3874 	int error;
3875 
3876 	if (tptr != NULL) {
3877 		struct timeval tv[2];
3878 
3879 		if (seg != UIO_SYSSPACE) {
3880 			error = copyin(tptr, tv, sizeof (tv));
3881 			if (error != 0)
3882 				return error;
3883 			tptr = tv;
3884 		}
3885 
3886 		if ((tv[0].tv_usec == UTIME_NOW) ||
3887 		    (tv[0].tv_usec == UTIME_OMIT))
3888 			ts[0].tv_nsec = tv[0].tv_usec;
3889 		else
3890 			TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]);
3891 
3892 		if ((tv[1].tv_usec == UTIME_NOW) ||
3893 		    (tv[1].tv_usec == UTIME_OMIT))
3894 			ts[1].tv_nsec = tv[1].tv_usec;
3895 		else
3896 			TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]);
3897 
3898 		tsptr = &ts[0];
3899 	}
3900 
3901 	return do_sys_utimens(l, vp, path, flag, tsptr, UIO_SYSSPACE);
3902 }
3903 
3904 /*
3905  * Truncate a file given its path name.
3906  */
3907 /* ARGSUSED */
3908 int
3909 sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval)
3910 {
3911 	/* {
3912 		syscallarg(const char *) path;
3913 		syscallarg(int) pad;
3914 		syscallarg(off_t) length;
3915 	} */
3916 	struct vnode *vp;
3917 	struct vattr vattr;
3918 	int error;
3919 
3920 	if (SCARG(uap, length) < 0)
3921 		return EINVAL;
3922 
3923 	error = namei_simple_user(SCARG(uap, path),
3924 				NSM_FOLLOW_TRYEMULROOT, &vp);
3925 	if (error != 0)
3926 		return (error);
3927 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3928 	if (vp->v_type == VDIR)
3929 		error = EISDIR;
3930 	else if ((error = vn_writechk(vp)) == 0 &&
3931 	    (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
3932 		vattr_null(&vattr);
3933 		vattr.va_size = SCARG(uap, length);
3934 		error = VOP_SETATTR(vp, &vattr, l->l_cred);
3935 	}
3936 	vput(vp);
3937 	return (error);
3938 }
3939 
3940 /*
3941  * Truncate a file given a file descriptor.
3942  */
3943 /* ARGSUSED */
3944 int
3945 sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval)
3946 {
3947 	/* {
3948 		syscallarg(int) fd;
3949 		syscallarg(int) pad;
3950 		syscallarg(off_t) length;
3951 	} */
3952 	struct vattr vattr;
3953 	struct vnode *vp;
3954 	file_t *fp;
3955 	int error;
3956 
3957 	if (SCARG(uap, length) < 0)
3958 		return EINVAL;
3959 
3960 	/* fd_getvnode() will use the descriptor for us */
3961 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3962 		return (error);
3963 	if ((fp->f_flag & FWRITE) == 0) {
3964 		error = EINVAL;
3965 		goto out;
3966 	}
3967 	vp = fp->f_vnode;
3968 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3969 	if (vp->v_type == VDIR)
3970 		error = EISDIR;
3971 	else if ((error = vn_writechk(vp)) == 0) {
3972 		vattr_null(&vattr);
3973 		vattr.va_size = SCARG(uap, length);
3974 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
3975 	}
3976 	VOP_UNLOCK(vp);
3977  out:
3978 	fd_putfile(SCARG(uap, fd));
3979 	return (error);
3980 }
3981 
3982 /*
3983  * Sync an open file.
3984  */
3985 /* ARGSUSED */
3986 int
3987 sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
3988 {
3989 	/* {
3990 		syscallarg(int) fd;
3991 	} */
3992 	struct vnode *vp;
3993 	file_t *fp;
3994 	int error;
3995 
3996 	/* fd_getvnode() will use the descriptor for us */
3997 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3998 		return (error);
3999 	vp = fp->f_vnode;
4000 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4001 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
4002 	VOP_UNLOCK(vp);
4003 	fd_putfile(SCARG(uap, fd));
4004 	return (error);
4005 }
4006 
4007 /*
4008  * Sync a range of file data.  API modeled after that found in AIX.
4009  *
4010  * FDATASYNC indicates that we need only save enough metadata to be able
4011  * to re-read the written data.  Note we duplicate AIX's requirement that
4012  * the file be open for writing.
4013  */
4014 /* ARGSUSED */
4015 int
4016 sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval)
4017 {
4018 	/* {
4019 		syscallarg(int) fd;
4020 		syscallarg(int) flags;
4021 		syscallarg(off_t) start;
4022 		syscallarg(off_t) length;
4023 	} */
4024 	struct vnode *vp;
4025 	file_t *fp;
4026 	int flags, nflags;
4027 	off_t s, e, len;
4028 	int error;
4029 
4030 	/* fd_getvnode() will use the descriptor for us */
4031 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4032 		return (error);
4033 
4034 	if ((fp->f_flag & FWRITE) == 0) {
4035 		error = EBADF;
4036 		goto out;
4037 	}
4038 
4039 	flags = SCARG(uap, flags);
4040 	if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
4041 	    ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
4042 		error = EINVAL;
4043 		goto out;
4044 	}
4045 	/* Now set up the flags for value(s) to pass to VOP_FSYNC() */
4046 	if (flags & FDATASYNC)
4047 		nflags = FSYNC_DATAONLY | FSYNC_WAIT;
4048 	else
4049 		nflags = FSYNC_WAIT;
4050 	if (flags & FDISKSYNC)
4051 		nflags |= FSYNC_CACHE;
4052 
4053 	len = SCARG(uap, length);
4054 	/* If length == 0, we do the whole file, and s = e = 0 will do that */
4055 	if (len) {
4056 		s = SCARG(uap, start);
4057 		e = s + len;
4058 		if (e < s) {
4059 			error = EINVAL;
4060 			goto out;
4061 		}
4062 	} else {
4063 		e = 0;
4064 		s = 0;
4065 	}
4066 
4067 	vp = fp->f_vnode;
4068 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4069 	error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
4070 	VOP_UNLOCK(vp);
4071 out:
4072 	fd_putfile(SCARG(uap, fd));
4073 	return (error);
4074 }
4075 
4076 /*
4077  * Sync the data of an open file.
4078  */
4079 /* ARGSUSED */
4080 int
4081 sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval)
4082 {
4083 	/* {
4084 		syscallarg(int) fd;
4085 	} */
4086 	struct vnode *vp;
4087 	file_t *fp;
4088 	int error;
4089 
4090 	/* fd_getvnode() will use the descriptor for us */
4091 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4092 		return (error);
4093 	if ((fp->f_flag & FWRITE) == 0) {
4094 		fd_putfile(SCARG(uap, fd));
4095 		return (EBADF);
4096 	}
4097 	vp = fp->f_vnode;
4098 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4099 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
4100 	VOP_UNLOCK(vp);
4101 	fd_putfile(SCARG(uap, fd));
4102 	return (error);
4103 }
4104 
4105 /*
4106  * Rename files, (standard) BSD semantics frontend.
4107  */
4108 /* ARGSUSED */
4109 int
4110 sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval)
4111 {
4112 	/* {
4113 		syscallarg(const char *) from;
4114 		syscallarg(const char *) to;
4115 	} */
4116 
4117 	return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4118 	    SCARG(uap, to), UIO_USERSPACE, 0));
4119 }
4120 
4121 int
4122 sys_renameat(struct lwp *l, const struct sys_renameat_args *uap,
4123     register_t *retval)
4124 {
4125 	/* {
4126 		syscallarg(int) fromfd;
4127 		syscallarg(const char *) from;
4128 		syscallarg(int) tofd;
4129 		syscallarg(const char *) to;
4130 	} */
4131 
4132 	return (do_sys_renameat(l, SCARG(uap, fromfd), SCARG(uap, from),
4133 	    SCARG(uap, tofd), SCARG(uap, to), UIO_USERSPACE, 0));
4134 }
4135 
4136 /*
4137  * Rename files, POSIX semantics frontend.
4138  */
4139 /* ARGSUSED */
4140 int
4141 sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval)
4142 {
4143 	/* {
4144 		syscallarg(const char *) from;
4145 		syscallarg(const char *) to;
4146 	} */
4147 
4148 	return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4149 	    SCARG(uap, to), UIO_USERSPACE, 1));
4150 }
4151 
4152 /*
4153  * Rename files.  Source and destination must either both be directories,
4154  * or both not be directories.  If target is a directory, it must be empty.
4155  * If `from' and `to' refer to the same object, the value of the `retain'
4156  * argument is used to determine whether `from' will be
4157  *
4158  * (retain == 0)	deleted unless `from' and `to' refer to the same
4159  *			object in the file system's name space (BSD).
4160  * (retain == 1)	always retained (POSIX).
4161  *
4162  * XXX Synchronize with nfsrv_rename in nfs_serv.c.
4163  */
4164 int
4165 do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
4166 {
4167 	return do_sys_renameat(NULL, AT_FDCWD, from, AT_FDCWD, to, seg, retain);
4168 }
4169 
/*
 * Common implementation of the rename system calls.
 *
 * l		calling lwp; may be NULL only if both descriptors are
 *		AT_FDCWD (asserted below)
 * fromfd/from	directory descriptor and path of the source
 * tofd/to	directory descriptor and path of the destination
 * seg		address space the two path strings live in
 * retain	non-zero: if source and destination are the same object,
 *		succeed without doing anything (POSIX); zero: do so only
 *		if they are also the same name in the same directory
 *
 * Outline: copy in both paths, look up `from' and then `to', reject `.'
 * and `..' as either final component, check that both directories are on
 * the same mount, take the per-mount rename lock, lock tdvp and relookup
 * `to', run the consistency checks, and finally call VOP_RENAME().
 *
 * Returns 0 on success or an errno value.
 */
static int
do_sys_renameat(struct lwp *l, int fromfd, const char *from, int tofd,
    const char *to, enum uio_seg seg, int retain)
{
	struct pathbuf *fpb, *tpb;
	struct nameidata fnd, tnd;
	struct vnode *fdvp, *fvp;
	struct vnode *tdvp, *tvp;
	struct mount *mp, *tmp;
	int error;

	KASSERT(l != NULL || (fromfd == AT_FDCWD && tofd == AT_FDCWD));

	error = pathbuf_maybe_copyin(from, seg, &fpb);
	if (error)
		goto out0;
	KASSERT(fpb != NULL);

	error = pathbuf_maybe_copyin(to, seg, &tpb);
	if (error)
		goto out1;
	KASSERT(tpb != NULL);

	/*
	 * Lookup from.
	 *
	 * XXX LOCKPARENT is wrong because we don't actually want it
	 * locked yet, but (a) namei is insane, and (b) VOP_RENAME is
	 * insane, so for the time being we need to leave it like this.
	 */
	NDINIT(&fnd, DELETE, (LOCKPARENT | TRYEMULROOT), fpb);
	if ((error = fd_nameiat(l, fromfd, &fnd)) != 0)
		goto out2;

	/*
	 * Pull out the important results of the lookup, fdvp and fvp.
	 * Of course, fvp is bogus because we're about to unlock fdvp.
	 */
	fdvp = fnd.ni_dvp;
	fvp = fnd.ni_vp;
	KASSERT(fdvp != NULL);
	KASSERT(fvp != NULL);
	KASSERT((fdvp == fvp) || (VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE));

	/*
	 * Make sure neither fdvp nor fvp is locked.
	 */
	if (fdvp != fvp)
		VOP_UNLOCK(fdvp);
	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */

	/*
	 * Reject renaming `.' and `..'.  Can't do this until after
	 * namei because we need namei's parsing to find the final
	 * component name.  (namei should just leave us with the final
	 * component name and not look it up itself, but anyway...)
	 *
	 * This was here before because we used to relookup from
	 * instead of to and relookup requires the caller to check
	 * this, but now file systems may depend on this check, so we
	 * must retain it until the file systems are all rototilled.
	 */
	if (((fnd.ni_cnd.cn_namelen == 1) &&
		(fnd.ni_cnd.cn_nameptr[0] == '.')) ||
	    ((fnd.ni_cnd.cn_namelen == 2) &&
		(fnd.ni_cnd.cn_nameptr[0] == '.') &&
		(fnd.ni_cnd.cn_nameptr[1] == '.'))) {
		error = EINVAL;	/* XXX EISDIR?  */
		goto abort0;
	}

	/*
	 * Lookup to.
	 *
	 * XXX LOCKPARENT is wrong, but...insanity, &c.  Also, using
	 * fvp here to decide whether to add CREATEDIR is a load of
	 * bollocks because fvp might be the wrong node by now, since
	 * fdvp is unlocked.
	 *
	 * XXX Why not pass CREATEDIR always?
	 */
	NDINIT(&tnd, RENAME,
	    (LOCKPARENT | NOCACHE | TRYEMULROOT |
		((fvp->v_type == VDIR)? CREATEDIR : 0)),
	    tpb);
	if ((error = fd_nameiat(l, tofd, &tnd)) != 0)
		goto abort0;

	/*
	 * Pull out the important results of the lookup, tdvp and tvp.
	 * Of course, tvp is bogus because we're about to unlock tdvp.
	 * Unlike fvp, tvp may be NULL here.
	 */
	tdvp = tnd.ni_dvp;
	tvp = tnd.ni_vp;
	KASSERT(tdvp != NULL);
	KASSERT((tdvp == tvp) || (VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE));

	/*
	 * Make sure neither tdvp nor tvp is locked.
	 */
	if (tdvp != tvp)
		VOP_UNLOCK(tdvp);
	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */

	/*
	 * Reject renaming onto `.' or `..'.  relookup is unhappy with
	 * these, which is why we must do this here.  Once upon a time
	 * we relooked up from instead of to, and consequently didn't
	 * need this check, but now that we relookup to instead of
	 * from, we need this; and we shall need it forever forward
	 * until the VOP_RENAME protocol changes, because file systems
	 * will no doubt begin to depend on this check.
	 */
	if ((tnd.ni_cnd.cn_namelen == 1) && (tnd.ni_cnd.cn_nameptr[0] == '.')) {
		error = EISDIR;
		goto abort1;
	}
	if ((tnd.ni_cnd.cn_namelen == 2) &&
	    (tnd.ni_cnd.cn_nameptr[0] == '.') &&
	    (tnd.ni_cnd.cn_nameptr[1] == '.')) {
		error = EINVAL;
		goto abort1;
	}

	/*
	 * Get the mount point.  If the file system has been unmounted,
	 * which it may be because we're not holding any vnode locks,
	 * then v_mount will be NULL.  We're not really supposed to
	 * read v_mount without holding the vnode lock, but since we
	 * have fdvp referenced, if fdvp->v_mount changes then at worst
	 * it will be set to NULL, not changed to another mount point.
	 * And, of course, since it is up to the file system to
	 * determine the real lock order, we can't lock both fdvp and
	 * tdvp at the same time.
	 */
	mp = fdvp->v_mount;
	if (mp == NULL) {
		error = ENOENT;
		goto abort1;
	}

	/*
	 * Make sure the mount points match.  Again, although we don't
	 * hold any vnode locks, the v_mount fields may change -- but
	 * at worst they will change to NULL, so this will never become
	 * a cross-device rename, because we hold vnode references.
	 *
	 * XXX Because nothing is locked and the compiler may reorder
	 * things here, unmounting the file system at an inopportune
	 * moment may cause rename to fail with EXDEV when it really
	 * should fail with ENOENT.
	 */
	tmp = tdvp->v_mount;
	if (tmp == NULL) {
		error = ENOENT;
		goto abort1;
	}

	if (mp != tmp) {
		error = EXDEV;
		goto abort1;
	}

	/*
	 * Take the vfs rename lock to avoid cross-directory screw cases.
	 * Nothing is locked currently, so taking this lock is safe.
	 */
	error = VFS_RENAMELOCK_ENTER(mp);
	if (error)
		goto abort1;

	/*
	 * Now fdvp, fvp, tdvp, and (if nonnull) tvp are referenced,
	 * and nothing is locked except for the vfs rename lock.
	 *
	 * The next step is a little rain dance to conform to the
	 * insane lock protocol, even though it does nothing to ward
	 * off race conditions.
	 *
	 * We need tdvp and tvp to be locked.  However, because we have
	 * unlocked tdvp in order to hold no locks while we take the
	 * vfs rename lock, tvp may be wrong here, and we can't safely
	 * lock it even if the sensible file systems will just unlock
	 * it straight away.  Consequently, we must lock tdvp and then
	 * relookup tvp to get it locked.
	 *
	 * Finally, because the VOP_RENAME protocol is brain-damaged
	 * and various file systems insanely depend on the semantics of
	 * this brain damage, the lookup of to must be the last lookup
	 * before VOP_RENAME.
	 */
	vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
	error = relookup(tdvp, &tnd.ni_vp, &tnd.ni_cnd, 0);
	if (error)
		goto abort2;

	/*
	 * Drop the old tvp and pick up the new one -- which might be
	 * the same, but that doesn't matter to us.  After this, tdvp
	 * and tvp should both be locked.
	 */
	if (tvp != NULL)
		vrele(tvp);
	tvp = tnd.ni_vp;
	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));

	/*
	 * The old do_sys_rename had various consistency checks here
	 * involving fvp and tvp.  fvp is bogus already here, and tvp
	 * will become bogus soon in any sensible file system, so the
	 * only purpose in putting these checks here is to give lip
	 * service to these screw cases and to acknowledge that they
	 * exist, not actually to handle them, but here you go
	 * anyway...
	 */

	/*
	 * Acknowledge that directories and non-directories aren't
	 * supposed to mix.
	 */
	if (tvp != NULL) {
		if ((fvp->v_type == VDIR) && (tvp->v_type != VDIR)) {
			error = ENOTDIR;
			goto abort3;
		} else if ((fvp->v_type != VDIR) && (tvp->v_type == VDIR)) {
			error = EISDIR;
			goto abort3;
		}
	}

	/*
	 * Acknowledge some random screw case, among the dozens that
	 * might arise.
	 */
	if (fvp == tdvp) {
		error = EINVAL;
		goto abort3;
	}

	/*
	 * Acknowledge that POSIX has a wacky screw case.
	 *
	 * If source and destination are the same vnode: with retain
	 * set, succeed without renaming; otherwise succeed without
	 * renaming only when both names are identical within the same
	 * directory.  In every other case fall through to VOP_RENAME.
	 *
	 * XXX Eventually the retain flag needs to be passed on to
	 * VOP_RENAME.
	 */
	if (fvp == tvp) {
		if (retain) {
			error = 0;
			goto abort3;
		} else if ((fdvp == tdvp) &&
		    (fnd.ni_cnd.cn_namelen == tnd.ni_cnd.cn_namelen) &&
		    (0 == memcmp(fnd.ni_cnd.cn_nameptr, tnd.ni_cnd.cn_nameptr,
			fnd.ni_cnd.cn_namelen))) {
			error = 0;
			goto abort3;
		}
	}

	/*
	 * Make sure veriexec can screw us up.  (But a race can screw
	 * up veriexec, of course -- remember, fvp and (soon) tvp are
	 * bogus.)
	 */
#if NVERIEXEC > 0
	{
		char *f1, *f2;
		size_t f1_len;
		size_t f2_len;

		/*
		 * Hand veriexec NUL-terminated copies of the two final
		 * path components.
		 */
		f1_len = fnd.ni_cnd.cn_namelen + 1;
		f1 = kmem_alloc(f1_len, KM_SLEEP);
		strlcpy(f1, fnd.ni_cnd.cn_nameptr, f1_len);

		f2_len = tnd.ni_cnd.cn_namelen + 1;
		f2 = kmem_alloc(f2_len, KM_SLEEP);
		strlcpy(f2, tnd.ni_cnd.cn_nameptr, f2_len);

		error = veriexec_renamechk(curlwp, fvp, f1, tvp, f2);

		kmem_free(f1, f1_len);
		kmem_free(f2, f2_len);

		if (error)
			goto abort3;
	}
#endif /* NVERIEXEC > 0 */

	/*
	 * All ready.  Incant the rename vop.
	 */
	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
	error = VOP_RENAME(fdvp, fvp, &fnd.ni_cnd, tdvp, tvp, &tnd.ni_cnd);

	/*
	 * VOP_RENAME releases fdvp, fvp, tdvp, and tvp, and unlocks
	 * tdvp and tvp.  But we can't assert any of that.
	 */
	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */

	/*
	 * So all we have left to do is to drop the rename lock and
	 * destroy the pathbufs.
	 */
	VFS_RENAMELOCK_EXIT(mp);
	goto out2;

	/*
	 * Error unwinding; each label falls through into the next:
	 *   abort3: tdvp and tvp locked, rename lock held
	 *   abort2: tdvp locked, rename lock held
	 *   abort1: nothing locked; both lookups still to be undone
	 *   abort0: only the `from' lookup to be undone
	 *   out2/out1/out0: pathbuf teardown and return
	 */
abort3:	if ((tvp != NULL) && (tvp != tdvp))
		VOP_UNLOCK(tvp);
abort2:	VOP_UNLOCK(tdvp);
	VFS_RENAMELOCK_EXIT(mp);
abort1:	VOP_ABORTOP(tdvp, &tnd.ni_cnd);
	vrele(tdvp);
	if (tvp != NULL)
		vrele(tvp);
abort0:	VOP_ABORTOP(fdvp, &fnd.ni_cnd);
	vrele(fdvp);
	vrele(fvp);
out2:	pathbuf_destroy(tpb);
out1:	pathbuf_destroy(fpb);
out0:	return error;
}
4500 
4501 /*
4502  * Make a directory file.
4503  */
4504 /* ARGSUSED */
4505 int
4506 sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
4507 {
4508 	/* {
4509 		syscallarg(const char *) path;
4510 		syscallarg(int) mode;
4511 	} */
4512 
4513 	return do_sys_mkdirat(l, AT_FDCWD, SCARG(uap, path),
4514 	    SCARG(uap, mode), UIO_USERSPACE);
4515 }
4516 
4517 int
4518 sys_mkdirat(struct lwp *l, const struct sys_mkdirat_args *uap,
4519     register_t *retval)
4520 {
4521 	/* {
4522 		syscallarg(int) fd;
4523 		syscallarg(const char *) path;
4524 		syscallarg(int) mode;
4525 	} */
4526 
4527 	return do_sys_mkdirat(l, SCARG(uap, fd), SCARG(uap, path),
4528 	    SCARG(uap, mode), UIO_USERSPACE);
4529 }
4530 
4531 
4532 int
4533 do_sys_mkdir(const char *path, mode_t mode, enum uio_seg seg)
4534 {
4535 	return do_sys_mkdirat(NULL, AT_FDCWD, path, mode, UIO_USERSPACE);
4536 }
4537 
4538 static int
4539 do_sys_mkdirat(struct lwp *l, int fdat, const char *path, mode_t mode,
4540     enum uio_seg seg)
4541 {
4542 	struct proc *p = curlwp->l_proc;
4543 	struct vnode *vp;
4544 	struct vattr vattr;
4545 	int error;
4546 	struct pathbuf *pb;
4547 	struct nameidata nd;
4548 
4549 	KASSERT(l != NULL || fdat == AT_FDCWD);
4550 
4551 	/* XXX bollocks, should pass in a pathbuf */
4552 	error = pathbuf_maybe_copyin(path, seg, &pb);
4553 	if (error) {
4554 		return error;
4555 	}
4556 
4557 	NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT, pb);
4558 
4559 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
4560 		pathbuf_destroy(pb);
4561 		return (error);
4562 	}
4563 	vp = nd.ni_vp;
4564 	if (vp != NULL) {
4565 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
4566 		if (nd.ni_dvp == vp)
4567 			vrele(nd.ni_dvp);
4568 		else
4569 			vput(nd.ni_dvp);
4570 		vrele(vp);
4571 		pathbuf_destroy(pb);
4572 		return (EEXIST);
4573 	}
4574 	vattr_null(&vattr);
4575 	vattr.va_type = VDIR;
4576 	/* We will read cwdi->cwdi_cmask unlocked. */
4577 	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
4578 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
4579 	if (!error)
4580 		vrele(nd.ni_vp);
4581 	vput(nd.ni_dvp);
4582 	pathbuf_destroy(pb);
4583 	return (error);
4584 }
4585 
4586 /*
4587  * Remove a directory file.
4588  */
4589 /* ARGSUSED */
4590 int
4591 sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
4592 {
4593 	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path),
4594 	    AT_REMOVEDIR, UIO_USERSPACE);
4595 }
4596 
4597 /*
4598  * Read a block of directory entries in a file system independent format.
4599  */
4600 int
4601 sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval)
4602 {
4603 	/* {
4604 		syscallarg(int) fd;
4605 		syscallarg(char *) buf;
4606 		syscallarg(size_t) count;
4607 	} */
4608 	file_t *fp;
4609 	int error, done;
4610 
4611 	/* fd_getvnode() will use the descriptor for us */
4612 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4613 		return (error);
4614 	if ((fp->f_flag & FREAD) == 0) {
4615 		error = EBADF;
4616 		goto out;
4617 	}
4618 	error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
4619 			SCARG(uap, count), &done, l, 0, 0);
4620 	ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
4621 	*retval = done;
4622  out:
4623 	fd_putfile(SCARG(uap, fd));
4624 	return (error);
4625 }
4626 
4627 /*
4628  * Set the mode mask for creation of filesystem nodes.
4629  */
4630 int
4631 sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
4632 {
4633 	/* {
4634 		syscallarg(mode_t) newmask;
4635 	} */
4636 	struct proc *p = l->l_proc;
4637 	struct cwdinfo *cwdi;
4638 
4639 	/*
4640 	 * cwdi->cwdi_cmask will be read unlocked elsewhere.  What's
4641 	 * important is that we serialize changes to the mask.  The
4642 	 * rw_exit() will issue a write memory barrier on our behalf,
4643 	 * and force the changes out to other CPUs (as it must use an
4644 	 * atomic operation, draining the local CPU's store buffers).
4645 	 */
4646 	cwdi = p->p_cwdi;
4647 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
4648 	*retval = cwdi->cwdi_cmask;
4649 	cwdi->cwdi_cmask = SCARG(uap, newmask) & ALLPERMS;
4650 	rw_exit(&cwdi->cwdi_lock);
4651 
4652 	return (0);
4653 }
4654 
4655 int
4656 dorevoke(struct vnode *vp, kauth_cred_t cred)
4657 {
4658 	struct vattr vattr;
4659 	int error, fs_decision;
4660 
4661 	vn_lock(vp, LK_SHARED | LK_RETRY);
4662 	error = VOP_GETATTR(vp, &vattr, cred);
4663 	VOP_UNLOCK(vp);
4664 	if (error != 0)
4665 		return error;
4666 	fs_decision = (kauth_cred_geteuid(cred) == vattr.va_uid) ? 0 : EPERM;
4667 	error = kauth_authorize_vnode(cred, KAUTH_VNODE_REVOKE, vp, NULL,
4668 	    fs_decision);
4669 	if (!error)
4670 		VOP_REVOKE(vp, REVOKEALL);
4671 	return (error);
4672 }
4673 
4674 /*
4675  * Void all references to file by ripping underlying filesystem
4676  * away from vnode.
4677  */
4678 /* ARGSUSED */
4679 int
4680 sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval)
4681 {
4682 	/* {
4683 		syscallarg(const char *) path;
4684 	} */
4685 	struct vnode *vp;
4686 	int error;
4687 
4688 	error = namei_simple_user(SCARG(uap, path),
4689 				NSM_FOLLOW_TRYEMULROOT, &vp);
4690 	if (error != 0)
4691 		return (error);
4692 	error = dorevoke(vp, l->l_cred);
4693 	vrele(vp);
4694 	return (error);
4695 }
4696 
4697 /*
4698  * Allocate backing store for a file, filling a hole without having to
4699  * explicitly write anything out.
4700  */
4701 /* ARGSUSED */
4702 int
4703 sys_posix_fallocate(struct lwp *l, const struct sys_posix_fallocate_args *uap,
4704 		register_t *retval)
4705 {
4706 	/* {
4707 		syscallarg(int) fd;
4708 		syscallarg(off_t) pos;
4709 		syscallarg(off_t) len;
4710 	} */
4711 	int fd;
4712 	off_t pos, len;
4713 	struct file *fp;
4714 	struct vnode *vp;
4715 	int error;
4716 
4717 	fd = SCARG(uap, fd);
4718 	pos = SCARG(uap, pos);
4719 	len = SCARG(uap, len);
4720 
4721 	if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
4722 		*retval = EINVAL;
4723 		return 0;
4724 	}
4725 
4726 	error = fd_getvnode(fd, &fp);
4727 	if (error) {
4728 		*retval = error;
4729 		return 0;
4730 	}
4731 	if ((fp->f_flag & FWRITE) == 0) {
4732 		error = EBADF;
4733 		goto fail;
4734 	}
4735 	vp = fp->f_vnode;
4736 
4737 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4738 	if (vp->v_type == VDIR) {
4739 		error = EISDIR;
4740 	} else {
4741 		error = VOP_FALLOCATE(vp, pos, len);
4742 	}
4743 	VOP_UNLOCK(vp);
4744 
4745 fail:
4746 	fd_putfile(fd);
4747 	*retval = error;
4748 	return 0;
4749 }
4750 
4751 /*
4752  * Deallocate backing store for a file, creating a hole. Also used for
4753  * invoking TRIM on disks.
4754  */
4755 /* ARGSUSED */
4756 int
4757 sys_fdiscard(struct lwp *l, const struct sys_fdiscard_args *uap,
4758 		register_t *retval)
4759 {
4760 	/* {
4761 		syscallarg(int) fd;
4762 		syscallarg(off_t) pos;
4763 		syscallarg(off_t) len;
4764 	} */
4765 	int fd;
4766 	off_t pos, len;
4767 	struct file *fp;
4768 	struct vnode *vp;
4769 	int error;
4770 
4771 	fd = SCARG(uap, fd);
4772 	pos = SCARG(uap, pos);
4773 	len = SCARG(uap, len);
4774 
4775 	if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
4776 		return EINVAL;
4777 	}
4778 
4779 	error = fd_getvnode(fd, &fp);
4780 	if (error) {
4781 		return error;
4782 	}
4783 	if ((fp->f_flag & FWRITE) == 0) {
4784 		error = EBADF;
4785 		goto fail;
4786 	}
4787 	vp = fp->f_vnode;
4788 
4789 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4790 	if (vp->v_type == VDIR) {
4791 		error = EISDIR;
4792 	} else {
4793 		error = VOP_FDISCARD(vp, pos, len);
4794 	}
4795 	VOP_UNLOCK(vp);
4796 
4797 fail:
4798 	fd_putfile(fd);
4799 	return error;
4800 }
4801