xref: /netbsd-src/sys/kern/vfs_syscalls.c (revision 80d9064ac03cbb6a4174695f0d5b237c8766d3d0)
1 /*	$NetBSD: vfs_syscalls.c,v 1.491 2014/09/05 09:20:59 matt Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1989, 1993
34  *	The Regents of the University of California.  All rights reserved.
35  * (c) UNIX System Laboratories, Inc.
36  * All or some portions of this file are derived from material licensed
37  * to the University of California by American Telephone and Telegraph
38  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39  * the permission of UNIX System Laboratories, Inc.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.42 (Berkeley) 7/31/95
66  */
67 
68 /*
69  * Virtual File System System Calls
70  */
71 
72 #include <sys/cdefs.h>
73 __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.491 2014/09/05 09:20:59 matt Exp $");
74 
75 #ifdef _KERNEL_OPT
76 #include "opt_fileassoc.h"
77 #include "veriexec.h"
78 #endif
79 
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/namei.h>
83 #include <sys/filedesc.h>
84 #include <sys/kernel.h>
85 #include <sys/file.h>
86 #include <sys/fcntl.h>
87 #include <sys/stat.h>
88 #include <sys/vnode.h>
89 #include <sys/mount.h>
90 #include <sys/proc.h>
91 #include <sys/uio.h>
92 #include <sys/kmem.h>
93 #include <sys/dirent.h>
94 #include <sys/sysctl.h>
95 #include <sys/syscallargs.h>
96 #include <sys/vfs_syscalls.h>
97 #include <sys/quota.h>
98 #include <sys/quotactl.h>
99 #include <sys/ktrace.h>
100 #ifdef FILEASSOC
101 #include <sys/fileassoc.h>
102 #endif /* FILEASSOC */
103 #include <sys/extattr.h>
104 #include <sys/verified_exec.h>
105 #include <sys/kauth.h>
106 #include <sys/atomic.h>
107 #include <sys/module.h>
108 #include <sys/buf.h>
109 
110 #include <miscfs/genfs/genfs.h>
111 #include <miscfs/syncfs/syncfs.h>
112 #include <miscfs/specfs/specdev.h>
113 
114 #include <nfs/rpcv2.h>
115 #include <nfs/nfsproto.h>
116 #include <nfs/nfs.h>
117 #include <nfs/nfs_var.h>
118 
119 /* XXX this shouldn't be here */
120 #ifndef OFF_T_MAX
121 #define OFF_T_MAX __type_max(off_t)
122 #endif
123 
124 static int change_flags(struct vnode *, u_long, struct lwp *);
125 static int change_mode(struct vnode *, int, struct lwp *);
126 static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
127 static int do_sys_openat(lwp_t *, int, const char *, int, int, int *);
128 static int do_sys_mkdirat(struct lwp *l, int, const char *, mode_t,
129     enum uio_seg);
130 static int do_sys_mkfifoat(struct lwp *, int, const char *, mode_t);
131 static int do_sys_symlinkat(struct lwp *, const char *, int, const char *,
132     enum uio_seg);
133 static int do_sys_renameat(struct lwp *l, int, const char *, int, const char *,
134     enum uio_seg, int);
135 static int do_sys_readlinkat(struct lwp *, int, const char *, char *,
136     size_t, register_t *);
137 static int do_sys_unlinkat(struct lwp *, int, const char *, int, enum uio_seg);
138 
139 static int fd_nameiat(struct lwp *, int, struct nameidata *);
140 static int fd_nameiat_simple_user(struct lwp *, int, const char *,
141     namei_simple_flags_t, struct vnode **);
142 
143 
144 /*
145  * This table is used to maintain compatibility with 4.3BSD
146  * and NetBSD 0.9 mount syscalls - and possibly other systems.
147  * Note, the order is important!
148  *
149  * Do not modify this table. It should only contain filesystems
150  * supported by NetBSD 0.9 and 4.3BSD.
151  */
152 const char * const mountcompatnames[] = {
153 	NULL,		/* 0 = MOUNT_NONE */
154 	MOUNT_FFS,	/* 1 = MOUNT_UFS */
155 	MOUNT_NFS,	/* 2 */
156 	MOUNT_MFS,	/* 3 */
157 	MOUNT_MSDOS,	/* 4 */
158 	MOUNT_CD9660,	/* 5 = MOUNT_ISOFS */
159 	MOUNT_FDESC,	/* 6 */
160 	MOUNT_KERNFS,	/* 7 */
161 	NULL,		/* 8 = MOUNT_DEVFS */
162 	MOUNT_AFS,	/* 9 */
163 };
164 
165 const int nmountcompatnames = __arraycount(mountcompatnames);
166 
167 static int
168 fd_nameiat(struct lwp *l, int fdat, struct nameidata *ndp)
169 {
170 	file_t *dfp;
171 	int error;
172 
173 	if (fdat != AT_FDCWD) {
174 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
175 			goto out;
176 
177 		NDAT(ndp, dfp->f_vnode);
178 	}
179 
180 	error = namei(ndp);
181 
182 	if (fdat != AT_FDCWD)
183 		fd_putfile(fdat);
184 out:
185 	return error;
186 }
187 
188 static int
189 fd_nameiat_simple_user(struct lwp *l, int fdat, const char *path,
190     namei_simple_flags_t sflags, struct vnode **vp_ret)
191 {
192 	file_t *dfp;
193 	struct vnode *dvp;
194 	int error;
195 
196 	if (fdat != AT_FDCWD) {
197 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
198 			goto out;
199 
200 		dvp = dfp->f_vnode;
201 	} else {
202 		dvp = NULL;
203 	}
204 
205 	error = nameiat_simple_user(dvp, path, sflags, vp_ret);
206 
207 	if (fdat != AT_FDCWD)
208 		fd_putfile(fdat);
209 out:
210 	return error;
211 }
212 
213 static int
214 open_setfp(struct lwp *l, file_t *fp, struct vnode *vp, int indx, int flags)
215 {
216 	int error;
217 
218 	fp->f_flag = flags & FMASK;
219 	fp->f_type = DTYPE_VNODE;
220 	fp->f_ops = &vnops;
221 	fp->f_vnode = vp;
222 
223 	if (flags & (O_EXLOCK | O_SHLOCK)) {
224 		struct flock lf;
225 		int type;
226 
227 		lf.l_whence = SEEK_SET;
228 		lf.l_start = 0;
229 		lf.l_len = 0;
230 		if (flags & O_EXLOCK)
231 			lf.l_type = F_WRLCK;
232 		else
233 			lf.l_type = F_RDLCK;
234 		type = F_FLOCK;
235 		if ((flags & FNONBLOCK) == 0)
236 			type |= F_WAIT;
237 		VOP_UNLOCK(vp);
238 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
239 		if (error) {
240 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
241 			fd_abort(l->l_proc, fp, indx);
242 			return error;
243 		}
244 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
245 		atomic_or_uint(&fp->f_flag, FHASLOCK);
246 	}
247 	if (flags & O_CLOEXEC)
248 		fd_set_exclose(l, indx, true);
249 	return 0;
250 }
251 
252 static int
253 mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
254     void *data, size_t *data_len)
255 {
256 	struct mount *mp;
257 	int error = 0, saved_flags;
258 
259 	mp = vp->v_mount;
260 	saved_flags = mp->mnt_flag;
261 
262 	/* We can operate only on VV_ROOT nodes. */
263 	if ((vp->v_vflag & VV_ROOT) == 0) {
264 		error = EINVAL;
265 		goto out;
266 	}
267 
268 	/*
269 	 * We only allow the filesystem to be reloaded if it
270 	 * is currently mounted read-only.  Additionally, we
271 	 * prevent read-write to read-only downgrades.
272 	 */
273 	if ((flags & (MNT_RELOAD | MNT_RDONLY)) != 0 &&
274 	    (mp->mnt_flag & MNT_RDONLY) == 0 &&
275 	    (mp->mnt_iflag & IMNT_CAN_RWTORO) == 0) {
276 		error = EOPNOTSUPP;	/* Needs translation */
277 		goto out;
278 	}
279 
280 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
281 	    KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
282 	if (error)
283 		goto out;
284 
285 	if (vfs_busy(mp, NULL)) {
286 		error = EPERM;
287 		goto out;
288 	}
289 
290 	mutex_enter(&mp->mnt_updating);
291 
292 	mp->mnt_flag &= ~MNT_OP_FLAGS;
293 	mp->mnt_flag |= flags & MNT_OP_FLAGS;
294 
295 	/*
296 	 * Set the mount level flags.
297 	 */
298 	if (flags & MNT_RDONLY)
299 		mp->mnt_flag |= MNT_RDONLY;
300 	else if (mp->mnt_flag & MNT_RDONLY)
301 		mp->mnt_iflag |= IMNT_WANTRDWR;
302 	mp->mnt_flag &= ~MNT_BASIC_FLAGS;
303 	mp->mnt_flag |= flags & MNT_BASIC_FLAGS;
304 	error = VFS_MOUNT(mp, path, data, data_len);
305 
306 	if (error && data != NULL) {
307 		int error2;
308 
309 		/*
310 		 * Update failed; let's try and see if it was an
311 		 * export request.  For compat with 3.0 and earlier.
312 		 */
313 		error2 = vfs_hooks_reexport(mp, path, data);
314 
315 		/*
316 		 * Only update error code if the export request was
317 		 * understood but some problem occurred while
318 		 * processing it.
319 		 */
320 		if (error2 != EJUSTRETURN)
321 			error = error2;
322 	}
323 
324 	if (mp->mnt_iflag & IMNT_WANTRDWR)
325 		mp->mnt_flag &= ~MNT_RDONLY;
326 	if (error)
327 		mp->mnt_flag = saved_flags;
328 	mp->mnt_flag &= ~MNT_OP_FLAGS;
329 	mp->mnt_iflag &= ~IMNT_WANTRDWR;
330 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
331 		if (mp->mnt_syncer == NULL)
332 			error = vfs_allocate_syncvnode(mp);
333 	} else {
334 		if (mp->mnt_syncer != NULL)
335 			vfs_deallocate_syncvnode(mp);
336 	}
337 	mutex_exit(&mp->mnt_updating);
338 	vfs_unbusy(mp, false, NULL);
339 
340 	if ((error == 0) && !(saved_flags & MNT_EXTATTR) &&
341 	    (flags & MNT_EXTATTR)) {
342 		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_START,
343 				   NULL, 0, NULL) != 0) {
344 			printf("%s: failed to start extattr, error = %d",
345 			       mp->mnt_stat.f_mntonname, error);
346 			mp->mnt_flag &= ~MNT_EXTATTR;
347 		}
348 	}
349 
350 	if ((error == 0) && (saved_flags & MNT_EXTATTR) &&
351 	    !(flags & MNT_EXTATTR)) {
352 		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_STOP,
353 				   NULL, 0, NULL) != 0) {
354 			printf("%s: failed to stop extattr, error = %d",
355 			       mp->mnt_stat.f_mntonname, error);
356 			mp->mnt_flag |= MNT_RDONLY;
357 		}
358 	}
359  out:
360 	return (error);
361 }
362 
363 static int
364 mount_get_vfsops(const char *fstype, struct vfsops **vfsops)
365 {
366 	char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
367 	int error;
368 
369 	/* Copy file-system type from userspace.  */
370 	error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL);
371 	if (error) {
372 		/*
373 		 * Historically, filesystem types were identified by numbers.
374 		 * If we get an integer for the filesystem type instead of a
375 		 * string, we check to see if it matches one of the historic
376 		 * filesystem types.
377 		 */
378 		u_long fsindex = (u_long)fstype;
379 		if (fsindex >= nmountcompatnames ||
380 		    mountcompatnames[fsindex] == NULL)
381 			return ENODEV;
382 		strlcpy(fstypename, mountcompatnames[fsindex],
383 		    sizeof(fstypename));
384 	}
385 
386 	/* Accept `ufs' as an alias for `ffs', for compatibility. */
387 	if (strcmp(fstypename, "ufs") == 0)
388 		fstypename[0] = 'f';
389 
390 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
391 		return 0;
392 
393 	/* If we can autoload a vfs module, try again */
394 	(void)module_autoload(fstypename, MODULE_CLASS_VFS);
395 
396 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
397 		return 0;
398 
399 	return ENODEV;
400 }
401 
402 static int
403 mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
404     void *data, size_t *data_len)
405 {
406 	struct mount *mp;
407 	int error;
408 
409 	/* If MNT_GETARGS is specified, it should be the only flag. */
410 	if (flags & ~MNT_GETARGS)
411 		return EINVAL;
412 
413 	mp = vp->v_mount;
414 
415 	/* XXX: probably some notion of "can see" here if we want isolation. */
416 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
417 	    KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
418 	if (error)
419 		return error;
420 
421 	if ((vp->v_vflag & VV_ROOT) == 0)
422 		return EINVAL;
423 
424 	if (vfs_busy(mp, NULL))
425 		return EPERM;
426 
427 	mutex_enter(&mp->mnt_updating);
428 	mp->mnt_flag &= ~MNT_OP_FLAGS;
429 	mp->mnt_flag |= MNT_GETARGS;
430 	error = VFS_MOUNT(mp, path, data, data_len);
431 	mp->mnt_flag &= ~MNT_OP_FLAGS;
432 	mutex_exit(&mp->mnt_updating);
433 
434 	vfs_unbusy(mp, false, NULL);
435 	return (error);
436 }
437 
438 int
439 sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval)
440 {
441 	/* {
442 		syscallarg(const char *) type;
443 		syscallarg(const char *) path;
444 		syscallarg(int) flags;
445 		syscallarg(void *) data;
446 		syscallarg(size_t) data_len;
447 	} */
448 
449 	return do_sys_mount(l, NULL, SCARG(uap, type), SCARG(uap, path),
450 	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE,
451 	    SCARG(uap, data_len), retval);
452 }
453 
454 int
455 do_sys_mount(struct lwp *l, struct vfsops *vfsops, const char *type,
456     const char *path, int flags, void *data, enum uio_seg data_seg,
457     size_t data_len, register_t *retval)
458 {
459 	struct vnode *vp;
460 	void *data_buf = data;
461 	bool vfsopsrele = false;
462 	size_t alloc_sz = 0;
463 	int error;
464 
465 	/* XXX: The calling convention of this routine is totally bizarre */
466 	if (vfsops)
467 		vfsopsrele = true;
468 
469 	/*
470 	 * Get vnode to be covered
471 	 */
472 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
473 	if (error != 0) {
474 		vp = NULL;
475 		goto done;
476 	}
477 
478 	if (vfsops == NULL) {
479 		if (flags & (MNT_GETARGS | MNT_UPDATE)) {
480 			vfsops = vp->v_mount->mnt_op;
481 		} else {
482 			/* 'type' is userspace */
483 			error = mount_get_vfsops(type, &vfsops);
484 			if (error != 0)
485 				goto done;
486 			vfsopsrele = true;
487 		}
488 	}
489 
490 	/*
491 	 * We allow data to be NULL, even for userspace. Some fs's don't need
492 	 * it. The others will handle NULL.
493 	 */
494 	if (data != NULL && data_seg == UIO_USERSPACE) {
495 		if (data_len == 0) {
496 			/* No length supplied, use default for filesystem */
497 			data_len = vfsops->vfs_min_mount_data;
498 
499 			/*
500 			 * Hopefully a longer buffer won't make copyin() fail.
501 			 * For compatibility with 3.0 and earlier.
502 			 */
503 			if (flags & MNT_UPDATE
504 			    && data_len < sizeof (struct mnt_export_args30))
505 				data_len = sizeof (struct mnt_export_args30);
506 		}
507 		if ((data_len == 0) || (data_len > VFS_MAX_MOUNT_DATA)) {
508 			error = EINVAL;
509 			goto done;
510 		}
511 		alloc_sz = data_len;
512 		data_buf = kmem_alloc(alloc_sz, KM_SLEEP);
513 
514 		/* NFS needs the buffer even for mnt_getargs .... */
515 		error = copyin(data, data_buf, data_len);
516 		if (error != 0)
517 			goto done;
518 	}
519 
520 	if (flags & MNT_GETARGS) {
521 		if (data_len == 0) {
522 			error = EINVAL;
523 			goto done;
524 		}
525 		error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
526 		if (error != 0)
527 			goto done;
528 		if (data_seg == UIO_USERSPACE)
529 			error = copyout(data_buf, data, data_len);
530 		*retval = data_len;
531 	} else if (flags & MNT_UPDATE) {
532 		error = mount_update(l, vp, path, flags, data_buf, &data_len);
533 	} else {
534 		/* Locking is handled internally in mount_domount(). */
535 		KASSERT(vfsopsrele == true);
536 		error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
537 		    &data_len);
538 		vfsopsrele = false;
539 	}
540 
541     done:
542 	if (vfsopsrele)
543 		vfs_delref(vfsops);
544     	if (vp != NULL) {
545 	    	vrele(vp);
546 	}
547 	if (data_buf != data)
548 		kmem_free(data_buf, alloc_sz);
549 	return (error);
550 }
551 
552 /*
553  * Unmount a file system.
554  *
555  * Note: unmount takes a path to the vnode mounted on as argument,
556  * not special file (as before).
557  */
558 /* ARGSUSED */
559 int
560 sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval)
561 {
562 	/* {
563 		syscallarg(const char *) path;
564 		syscallarg(int) flags;
565 	} */
566 	struct vnode *vp;
567 	struct mount *mp;
568 	int error;
569 	struct pathbuf *pb;
570 	struct nameidata nd;
571 
572 	error = pathbuf_copyin(SCARG(uap, path), &pb);
573 	if (error) {
574 		return error;
575 	}
576 
577 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
578 	if ((error = namei(&nd)) != 0) {
579 		pathbuf_destroy(pb);
580 		return error;
581 	}
582 	vp = nd.ni_vp;
583 	pathbuf_destroy(pb);
584 
585 	mp = vp->v_mount;
586 	atomic_inc_uint(&mp->mnt_refcnt);
587 	VOP_UNLOCK(vp);
588 
589 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
590 	    KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
591 	if (error) {
592 		vrele(vp);
593 		vfs_destroy(mp);
594 		return (error);
595 	}
596 
597 	/*
598 	 * Don't allow unmounting the root file system.
599 	 */
600 	if (mp->mnt_flag & MNT_ROOTFS) {
601 		vrele(vp);
602 		vfs_destroy(mp);
603 		return (EINVAL);
604 	}
605 
606 	/*
607 	 * Must be the root of the filesystem
608 	 */
609 	if ((vp->v_vflag & VV_ROOT) == 0) {
610 		vrele(vp);
611 		vfs_destroy(mp);
612 		return (EINVAL);
613 	}
614 
615 	vrele(vp);
616 	error = dounmount(mp, SCARG(uap, flags), l);
617 	vfs_destroy(mp);
618 	return error;
619 }
620 
621 /*
622  * Sync each mounted filesystem.
623  */
624 #ifdef DEBUG
625 int syncprt = 0;
626 struct ctldebug debug0 = { "syncprt", &syncprt };
627 #endif
628 
629 void
630 do_sys_sync(struct lwp *l)
631 {
632 	struct mount *mp, *nmp;
633 	int asyncflag;
634 
635 	mutex_enter(&mountlist_lock);
636 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
637 		if (vfs_busy(mp, &nmp)) {
638 			continue;
639 		}
640 		mutex_enter(&mp->mnt_updating);
641 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
642 			asyncflag = mp->mnt_flag & MNT_ASYNC;
643 			mp->mnt_flag &= ~MNT_ASYNC;
644 			VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
645 			if (asyncflag)
646 				 mp->mnt_flag |= MNT_ASYNC;
647 		}
648 		mutex_exit(&mp->mnt_updating);
649 		vfs_unbusy(mp, false, &nmp);
650 	}
651 	mutex_exit(&mountlist_lock);
652 #ifdef DEBUG
653 	if (syncprt)
654 		vfs_bufstats();
655 #endif /* DEBUG */
656 }
657 
658 /* ARGSUSED */
659 int
660 sys_sync(struct lwp *l, const void *v, register_t *retval)
661 {
662 	do_sys_sync(l);
663 	return (0);
664 }
665 
666 
667 /*
668  * Access or change filesystem quotas.
669  *
670  * (this is really 14 different calls bundled into one)
671  */
672 
673 static int
674 do_sys_quotactl_stat(struct mount *mp, struct quotastat *info_u)
675 {
676 	struct quotastat info_k;
677 	int error;
678 
679 	/* ensure any padding bytes are cleared */
680 	memset(&info_k, 0, sizeof(info_k));
681 
682 	error = vfs_quotactl_stat(mp, &info_k);
683 	if (error) {
684 		return error;
685 	}
686 
687 	return copyout(&info_k, info_u, sizeof(info_k));
688 }
689 
690 static int
691 do_sys_quotactl_idtypestat(struct mount *mp, int idtype,
692     struct quotaidtypestat *info_u)
693 {
694 	struct quotaidtypestat info_k;
695 	int error;
696 
697 	/* ensure any padding bytes are cleared */
698 	memset(&info_k, 0, sizeof(info_k));
699 
700 	error = vfs_quotactl_idtypestat(mp, idtype, &info_k);
701 	if (error) {
702 		return error;
703 	}
704 
705 	return copyout(&info_k, info_u, sizeof(info_k));
706 }
707 
708 static int
709 do_sys_quotactl_objtypestat(struct mount *mp, int objtype,
710     struct quotaobjtypestat *info_u)
711 {
712 	struct quotaobjtypestat info_k;
713 	int error;
714 
715 	/* ensure any padding bytes are cleared */
716 	memset(&info_k, 0, sizeof(info_k));
717 
718 	error = vfs_quotactl_objtypestat(mp, objtype, &info_k);
719 	if (error) {
720 		return error;
721 	}
722 
723 	return copyout(&info_k, info_u, sizeof(info_k));
724 }
725 
726 static int
727 do_sys_quotactl_get(struct mount *mp, const struct quotakey *key_u,
728     struct quotaval *val_u)
729 {
730 	struct quotakey key_k;
731 	struct quotaval val_k;
732 	int error;
733 
734 	/* ensure any padding bytes are cleared */
735 	memset(&val_k, 0, sizeof(val_k));
736 
737 	error = copyin(key_u, &key_k, sizeof(key_k));
738 	if (error) {
739 		return error;
740 	}
741 
742 	error = vfs_quotactl_get(mp, &key_k, &val_k);
743 	if (error) {
744 		return error;
745 	}
746 
747 	return copyout(&val_k, val_u, sizeof(val_k));
748 }
749 
750 static int
751 do_sys_quotactl_put(struct mount *mp, const struct quotakey *key_u,
752     const struct quotaval *val_u)
753 {
754 	struct quotakey key_k;
755 	struct quotaval val_k;
756 	int error;
757 
758 	error = copyin(key_u, &key_k, sizeof(key_k));
759 	if (error) {
760 		return error;
761 	}
762 
763 	error = copyin(val_u, &val_k, sizeof(val_k));
764 	if (error) {
765 		return error;
766 	}
767 
768 	return vfs_quotactl_put(mp, &key_k, &val_k);
769 }
770 
771 static int
772 do_sys_quotactl_del(struct mount *mp, const struct quotakey *key_u)
773 {
774 	struct quotakey key_k;
775 	int error;
776 
777 	error = copyin(key_u, &key_k, sizeof(key_k));
778 	if (error) {
779 		return error;
780 	}
781 
782 	return vfs_quotactl_del(mp, &key_k);
783 }
784 
785 static int
786 do_sys_quotactl_cursoropen(struct mount *mp, struct quotakcursor *cursor_u)
787 {
788 	struct quotakcursor cursor_k;
789 	int error;
790 
791 	/* ensure any padding bytes are cleared */
792 	memset(&cursor_k, 0, sizeof(cursor_k));
793 
794 	error = vfs_quotactl_cursoropen(mp, &cursor_k);
795 	if (error) {
796 		return error;
797 	}
798 
799 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
800 }
801 
802 static int
803 do_sys_quotactl_cursorclose(struct mount *mp, struct quotakcursor *cursor_u)
804 {
805 	struct quotakcursor cursor_k;
806 	int error;
807 
808 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
809 	if (error) {
810 		return error;
811 	}
812 
813 	return vfs_quotactl_cursorclose(mp, &cursor_k);
814 }
815 
816 static int
817 do_sys_quotactl_cursorskipidtype(struct mount *mp,
818     struct quotakcursor *cursor_u, int idtype)
819 {
820 	struct quotakcursor cursor_k;
821 	int error;
822 
823 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
824 	if (error) {
825 		return error;
826 	}
827 
828 	error = vfs_quotactl_cursorskipidtype(mp, &cursor_k, idtype);
829 	if (error) {
830 		return error;
831 	}
832 
833 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
834 }
835 
836 static int
837 do_sys_quotactl_cursorget(struct mount *mp, struct quotakcursor *cursor_u,
838     struct quotakey *keys_u, struct quotaval *vals_u, unsigned maxnum,
839     unsigned *ret_u)
840 {
841 #define CGET_STACK_MAX 8
842 	struct quotakcursor cursor_k;
843 	struct quotakey stackkeys[CGET_STACK_MAX];
844 	struct quotaval stackvals[CGET_STACK_MAX];
845 	struct quotakey *keys_k;
846 	struct quotaval *vals_k;
847 	unsigned ret_k;
848 	int error;
849 
850 	if (maxnum > 128) {
851 		maxnum = 128;
852 	}
853 
854 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
855 	if (error) {
856 		return error;
857 	}
858 
859 	if (maxnum <= CGET_STACK_MAX) {
860 		keys_k = stackkeys;
861 		vals_k = stackvals;
862 		/* ensure any padding bytes are cleared */
863 		memset(keys_k, 0, maxnum * sizeof(keys_k[0]));
864 		memset(vals_k, 0, maxnum * sizeof(vals_k[0]));
865 	} else {
866 		keys_k = kmem_zalloc(maxnum * sizeof(keys_k[0]), KM_SLEEP);
867 		vals_k = kmem_zalloc(maxnum * sizeof(vals_k[0]), KM_SLEEP);
868 	}
869 
870 	error = vfs_quotactl_cursorget(mp, &cursor_k, keys_k, vals_k, maxnum,
871 				       &ret_k);
872 	if (error) {
873 		goto fail;
874 	}
875 
876 	error = copyout(keys_k, keys_u, ret_k * sizeof(keys_k[0]));
877 	if (error) {
878 		goto fail;
879 	}
880 
881 	error = copyout(vals_k, vals_u, ret_k * sizeof(vals_k[0]));
882 	if (error) {
883 		goto fail;
884 	}
885 
886 	error = copyout(&ret_k, ret_u, sizeof(ret_k));
887 	if (error) {
888 		goto fail;
889 	}
890 
891 	/* do last to maximize the chance of being able to recover a failure */
892 	error = copyout(&cursor_k, cursor_u, sizeof(cursor_k));
893 
894 fail:
895 	if (keys_k != stackkeys) {
896 		kmem_free(keys_k, maxnum * sizeof(keys_k[0]));
897 	}
898 	if (vals_k != stackvals) {
899 		kmem_free(vals_k, maxnum * sizeof(vals_k[0]));
900 	}
901 	return error;
902 }
903 
904 static int
905 do_sys_quotactl_cursoratend(struct mount *mp, struct quotakcursor *cursor_u,
906     int *ret_u)
907 {
908 	struct quotakcursor cursor_k;
909 	int ret_k;
910 	int error;
911 
912 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
913 	if (error) {
914 		return error;
915 	}
916 
917 	error = vfs_quotactl_cursoratend(mp, &cursor_k, &ret_k);
918 	if (error) {
919 		return error;
920 	}
921 
922 	error = copyout(&ret_k, ret_u, sizeof(ret_k));
923 	if (error) {
924 		return error;
925 	}
926 
927 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
928 }
929 
930 static int
931 do_sys_quotactl_cursorrewind(struct mount *mp, struct quotakcursor *cursor_u)
932 {
933 	struct quotakcursor cursor_k;
934 	int error;
935 
936 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
937 	if (error) {
938 		return error;
939 	}
940 
941 	error = vfs_quotactl_cursorrewind(mp, &cursor_k);
942 	if (error) {
943 		return error;
944 	}
945 
946 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
947 }
948 
949 static int
950 do_sys_quotactl_quotaon(struct mount *mp, int idtype, const char *path_u)
951 {
952 	char *path_k;
953 	int error;
954 
955 	/* XXX this should probably be a struct pathbuf */
956 	path_k = PNBUF_GET();
957 	error = copyin(path_u, path_k, PATH_MAX);
958 	if (error) {
959 		PNBUF_PUT(path_k);
960 		return error;
961 	}
962 
963 	error = vfs_quotactl_quotaon(mp, idtype, path_k);
964 
965 	PNBUF_PUT(path_k);
966 	return error;
967 }
968 
/*
 * QUOTACTL_QUOTAOFF: nothing to copy in or out; pass the request to
 * vfs_quotactl_quotaoff().
 */
static int
do_sys_quotactl_quotaoff(struct mount *mp, int idtype)
{
	int rv;

	rv = vfs_quotactl_quotaoff(mp, idtype);
	return rv;
}
974 
975 int
976 do_sys_quotactl(const char *path_u, const struct quotactl_args *args)
977 {
978 	struct mount *mp;
979 	struct vnode *vp;
980 	int error;
981 
982 	error = namei_simple_user(path_u, NSM_FOLLOW_TRYEMULROOT, &vp);
983 	if (error != 0)
984 		return (error);
985 	mp = vp->v_mount;
986 
987 	switch (args->qc_op) {
988 	    case QUOTACTL_STAT:
989 		error = do_sys_quotactl_stat(mp, args->u.stat.qc_info);
990 		break;
991 	    case QUOTACTL_IDTYPESTAT:
992 		error = do_sys_quotactl_idtypestat(mp,
993 				args->u.idtypestat.qc_idtype,
994 				args->u.idtypestat.qc_info);
995 		break;
996 	    case QUOTACTL_OBJTYPESTAT:
997 		error = do_sys_quotactl_objtypestat(mp,
998 				args->u.objtypestat.qc_objtype,
999 				args->u.objtypestat.qc_info);
1000 		break;
1001 	    case QUOTACTL_GET:
1002 		error = do_sys_quotactl_get(mp,
1003 				args->u.get.qc_key,
1004 				args->u.get.qc_val);
1005 		break;
1006 	    case QUOTACTL_PUT:
1007 		error = do_sys_quotactl_put(mp,
1008 				args->u.put.qc_key,
1009 				args->u.put.qc_val);
1010 		break;
1011 	    case QUOTACTL_DEL:
1012 		error = do_sys_quotactl_del(mp, args->u.del.qc_key);
1013 		break;
1014 	    case QUOTACTL_CURSOROPEN:
1015 		error = do_sys_quotactl_cursoropen(mp,
1016 				args->u.cursoropen.qc_cursor);
1017 		break;
1018 	    case QUOTACTL_CURSORCLOSE:
1019 		error = do_sys_quotactl_cursorclose(mp,
1020 				args->u.cursorclose.qc_cursor);
1021 		break;
1022 	    case QUOTACTL_CURSORSKIPIDTYPE:
1023 		error = do_sys_quotactl_cursorskipidtype(mp,
1024 				args->u.cursorskipidtype.qc_cursor,
1025 				args->u.cursorskipidtype.qc_idtype);
1026 		break;
1027 	    case QUOTACTL_CURSORGET:
1028 		error = do_sys_quotactl_cursorget(mp,
1029 				args->u.cursorget.qc_cursor,
1030 				args->u.cursorget.qc_keys,
1031 				args->u.cursorget.qc_vals,
1032 				args->u.cursorget.qc_maxnum,
1033 				args->u.cursorget.qc_ret);
1034 		break;
1035 	    case QUOTACTL_CURSORATEND:
1036 		error = do_sys_quotactl_cursoratend(mp,
1037 				args->u.cursoratend.qc_cursor,
1038 				args->u.cursoratend.qc_ret);
1039 		break;
1040 	    case QUOTACTL_CURSORREWIND:
1041 		error = do_sys_quotactl_cursorrewind(mp,
1042 				args->u.cursorrewind.qc_cursor);
1043 		break;
1044 	    case QUOTACTL_QUOTAON:
1045 		error = do_sys_quotactl_quotaon(mp,
1046 				args->u.quotaon.qc_idtype,
1047 				args->u.quotaon.qc_quotafile);
1048 		break;
1049 	    case QUOTACTL_QUOTAOFF:
1050 		error = do_sys_quotactl_quotaoff(mp,
1051 				args->u.quotaoff.qc_idtype);
1052 		break;
1053 	    default:
1054 		error = EINVAL;
1055 		break;
1056 	}
1057 
1058 	vrele(vp);
1059 	return error;
1060 }
1061 
1062 /* ARGSUSED */
1063 int
1064 sys___quotactl(struct lwp *l, const struct sys___quotactl_args *uap,
1065     register_t *retval)
1066 {
1067 	/* {
1068 		syscallarg(const char *) path;
1069 		syscallarg(struct quotactl_args *) args;
1070 	} */
1071 	struct quotactl_args args;
1072 	int error;
1073 
1074 	error = copyin(SCARG(uap, args), &args, sizeof(args));
1075 	if (error) {
1076 		return error;
1077 	}
1078 
1079 	return do_sys_quotactl(SCARG(uap, path), &args);
1080 }
1081 
1082 int
1083 dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
1084     int root)
1085 {
1086 	struct cwdinfo *cwdi = l->l_proc->p_cwdi;
1087 	int error = 0;
1088 
1089 	/*
1090 	 * If MNT_NOWAIT or MNT_LAZY is specified, do not
1091 	 * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
1092 	 * overrides MNT_NOWAIT.
1093 	 */
1094 	if (flags == MNT_NOWAIT	|| flags == MNT_LAZY ||
1095 	    (flags != MNT_WAIT && flags != 0)) {
1096 		memcpy(sp, &mp->mnt_stat, sizeof(*sp));
1097 		goto done;
1098 	}
1099 
1100 	/* Get the filesystem stats now */
1101 	memset(sp, 0, sizeof(*sp));
1102 	if ((error = VFS_STATVFS(mp, sp)) != 0) {
1103 		return error;
1104 	}
1105 
1106 	if (cwdi->cwdi_rdir == NULL)
1107 		(void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
1108 done:
1109 	if (cwdi->cwdi_rdir != NULL) {
1110 		size_t len;
1111 		char *bp;
1112 		char c;
1113 		char *path = PNBUF_GET();
1114 
1115 		bp = path + MAXPATHLEN;
1116 		*--bp = '\0';
1117 		rw_enter(&cwdi->cwdi_lock, RW_READER);
1118 		error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
1119 		    MAXPATHLEN / 2, 0, l);
1120 		rw_exit(&cwdi->cwdi_lock);
1121 		if (error) {
1122 			PNBUF_PUT(path);
1123 			return error;
1124 		}
1125 		len = strlen(bp);
1126 		if (len != 1) {
1127 			/*
1128 			 * for mount points that are below our root, we can see
1129 			 * them, so we fix up the pathname and return them. The
1130 			 * rest we cannot see, so we don't allow viewing the
1131 			 * data.
1132 			 */
1133 			if (strncmp(bp, sp->f_mntonname, len) == 0 &&
1134 			    ((c = sp->f_mntonname[len]) == '/' || c == '\0')) {
1135 				(void)strlcpy(sp->f_mntonname,
1136 				    c == '\0' ? "/" : &sp->f_mntonname[len],
1137 				    sizeof(sp->f_mntonname));
1138 			} else {
1139 				if (root)
1140 					(void)strlcpy(sp->f_mntonname, "/",
1141 					    sizeof(sp->f_mntonname));
1142 				else
1143 					error = EPERM;
1144 			}
1145 		}
1146 		PNBUF_PUT(path);
1147 	}
1148 	sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
1149 	return error;
1150 }
1151 
1152 /*
1153  * Get filesystem statistics by path.
1154  */
1155 int
1156 do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
1157 {
1158 	struct mount *mp;
1159 	int error;
1160 	struct vnode *vp;
1161 
1162 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
1163 	if (error != 0)
1164 		return error;
1165 	mp = vp->v_mount;
1166 	error = dostatvfs(mp, sb, l, flags, 1);
1167 	vrele(vp);
1168 	return error;
1169 }
1170 
1171 /* ARGSUSED */
1172 int
1173 sys_statvfs1(struct lwp *l, const struct sys_statvfs1_args *uap, register_t *retval)
1174 {
1175 	/* {
1176 		syscallarg(const char *) path;
1177 		syscallarg(struct statvfs *) buf;
1178 		syscallarg(int) flags;
1179 	} */
1180 	struct statvfs *sb;
1181 	int error;
1182 
1183 	sb = STATVFSBUF_GET();
1184 	error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
1185 	if (error == 0)
1186 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1187 	STATVFSBUF_PUT(sb);
1188 	return error;
1189 }
1190 
1191 /*
1192  * Get filesystem statistics by fd.
1193  */
1194 int
1195 do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
1196 {
1197 	file_t *fp;
1198 	struct mount *mp;
1199 	int error;
1200 
1201 	/* fd_getvnode() will use the descriptor for us */
1202 	if ((error = fd_getvnode(fd, &fp)) != 0)
1203 		return (error);
1204 	mp = fp->f_vnode->v_mount;
1205 	error = dostatvfs(mp, sb, curlwp, flags, 1);
1206 	fd_putfile(fd);
1207 	return error;
1208 }
1209 
1210 /* ARGSUSED */
1211 int
1212 sys_fstatvfs1(struct lwp *l, const struct sys_fstatvfs1_args *uap, register_t *retval)
1213 {
1214 	/* {
1215 		syscallarg(int) fd;
1216 		syscallarg(struct statvfs *) buf;
1217 		syscallarg(int) flags;
1218 	} */
1219 	struct statvfs *sb;
1220 	int error;
1221 
1222 	sb = STATVFSBUF_GET();
1223 	error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
1224 	if (error == 0)
1225 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1226 	STATVFSBUF_PUT(sb);
1227 	return error;
1228 }
1229 
1230 
1231 /*
1232  * Get statistics on all filesystems.
1233  */
1234 int
1235 do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
1236     int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
1237     register_t *retval)
1238 {
1239 	int root = 0;
1240 	struct proc *p = l->l_proc;
1241 	struct mount *mp, *nmp;
1242 	struct statvfs *sb;
1243 	size_t count, maxcount;
1244 	int error = 0;
1245 
1246 	sb = STATVFSBUF_GET();
1247 	maxcount = bufsize / entry_sz;
1248 	mutex_enter(&mountlist_lock);
1249 	count = 0;
1250 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
1251 		if (vfs_busy(mp, &nmp)) {
1252 			continue;
1253 		}
1254 		if (sfsp && count < maxcount) {
1255 			error = dostatvfs(mp, sb, l, flags, 0);
1256 			if (error) {
1257 				vfs_unbusy(mp, false, &nmp);
1258 				error = 0;
1259 				continue;
1260 			}
1261 			error = copyfn(sb, sfsp, entry_sz);
1262 			if (error) {
1263 				vfs_unbusy(mp, false, NULL);
1264 				goto out;
1265 			}
1266 			sfsp = (char *)sfsp + entry_sz;
1267 			root |= strcmp(sb->f_mntonname, "/") == 0;
1268 		}
1269 		count++;
1270 		vfs_unbusy(mp, false, &nmp);
1271 	}
1272 	mutex_exit(&mountlist_lock);
1273 
1274 	if (root == 0 && p->p_cwdi->cwdi_rdir) {
1275 		/*
1276 		 * fake a root entry
1277 		 */
1278 		error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
1279 		    sb, l, flags, 1);
1280 		if (error != 0)
1281 			goto out;
1282 		if (sfsp) {
1283 			error = copyfn(sb, sfsp, entry_sz);
1284 			if (error != 0)
1285 				goto out;
1286 		}
1287 		count++;
1288 	}
1289 	if (sfsp && count > maxcount)
1290 		*retval = maxcount;
1291 	else
1292 		*retval = count;
1293 out:
1294 	STATVFSBUF_PUT(sb);
1295 	return error;
1296 }
1297 
1298 int
1299 sys_getvfsstat(struct lwp *l, const struct sys_getvfsstat_args *uap, register_t *retval)
1300 {
1301 	/* {
1302 		syscallarg(struct statvfs *) buf;
1303 		syscallarg(size_t) bufsize;
1304 		syscallarg(int) flags;
1305 	} */
1306 
1307 	return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
1308 	    SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
1309 }
1310 
1311 /*
1312  * Change current working directory to a given file descriptor.
1313  */
1314 /* ARGSUSED */
1315 int
1316 sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval)
1317 {
1318 	/* {
1319 		syscallarg(int) fd;
1320 	} */
1321 	struct proc *p = l->l_proc;
1322 	struct cwdinfo *cwdi;
1323 	struct vnode *vp, *tdp;
1324 	struct mount *mp;
1325 	file_t *fp;
1326 	int error, fd;
1327 
1328 	/* fd_getvnode() will use the descriptor for us */
1329 	fd = SCARG(uap, fd);
1330 	if ((error = fd_getvnode(fd, &fp)) != 0)
1331 		return (error);
1332 	vp = fp->f_vnode;
1333 
1334 	vref(vp);
1335 	vn_lock(vp,  LK_EXCLUSIVE | LK_RETRY);
1336 	if (vp->v_type != VDIR)
1337 		error = ENOTDIR;
1338 	else
1339 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1340 	if (error) {
1341 		vput(vp);
1342 		goto out;
1343 	}
1344 	while ((mp = vp->v_mountedhere) != NULL) {
1345 		error = vfs_busy(mp, NULL);
1346 		vput(vp);
1347 		if (error != 0)
1348 			goto out;
1349 		error = VFS_ROOT(mp, &tdp);
1350 		vfs_unbusy(mp, false, NULL);
1351 		if (error)
1352 			goto out;
1353 		vp = tdp;
1354 	}
1355 	VOP_UNLOCK(vp);
1356 
1357 	/*
1358 	 * Disallow changing to a directory not under the process's
1359 	 * current root directory (if there is one).
1360 	 */
1361 	cwdi = p->p_cwdi;
1362 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1363 	if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
1364 		vrele(vp);
1365 		error = EPERM;	/* operation not permitted */
1366 	} else {
1367 		vrele(cwdi->cwdi_cdir);
1368 		cwdi->cwdi_cdir = vp;
1369 	}
1370 	rw_exit(&cwdi->cwdi_lock);
1371 
1372  out:
1373 	fd_putfile(fd);
1374 	return (error);
1375 }
1376 
1377 /*
1378  * Change this process's notion of the root directory to a given file
1379  * descriptor.
1380  */
1381 int
1382 sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval)
1383 {
1384 	struct proc *p = l->l_proc;
1385 	struct vnode	*vp;
1386 	file_t	*fp;
1387 	int		 error, fd = SCARG(uap, fd);
1388 
1389 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1390  	    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
1391 		return error;
1392 	/* fd_getvnode() will use the descriptor for us */
1393 	if ((error = fd_getvnode(fd, &fp)) != 0)
1394 		return error;
1395 	vp = fp->f_vnode;
1396 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1397 	if (vp->v_type != VDIR)
1398 		error = ENOTDIR;
1399 	else
1400 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1401 	VOP_UNLOCK(vp);
1402 	if (error)
1403 		goto out;
1404 	vref(vp);
1405 
1406 	change_root(p->p_cwdi, vp, l);
1407 
1408  out:
1409 	fd_putfile(fd);
1410 	return (error);
1411 }
1412 
1413 /*
1414  * Change current working directory (``.'').
1415  */
1416 /* ARGSUSED */
1417 int
1418 sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
1419 {
1420 	/* {
1421 		syscallarg(const char *) path;
1422 	} */
1423 	struct proc *p = l->l_proc;
1424 	struct cwdinfo *cwdi;
1425 	int error;
1426 	struct vnode *vp;
1427 
1428 	if ((error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE,
1429 				  &vp, l)) != 0)
1430 		return (error);
1431 	cwdi = p->p_cwdi;
1432 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1433 	vrele(cwdi->cwdi_cdir);
1434 	cwdi->cwdi_cdir = vp;
1435 	rw_exit(&cwdi->cwdi_lock);
1436 	return (0);
1437 }
1438 
1439 /*
1440  * Change notion of root (``/'') directory.
1441  */
1442 /* ARGSUSED */
1443 int
1444 sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval)
1445 {
1446 	/* {
1447 		syscallarg(const char *) path;
1448 	} */
1449 	struct proc *p = l->l_proc;
1450 	int error;
1451 	struct vnode *vp;
1452 
1453 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1454 	    KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
1455 		return (error);
1456 	if ((error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE,
1457 				  &vp, l)) != 0)
1458 		return (error);
1459 
1460 	change_root(p->p_cwdi, vp, l);
1461 
1462 	return (0);
1463 }
1464 
1465 /*
1466  * Common routine for chroot and fchroot.
1467  * NB: callers need to properly authorize the change root operation.
1468  */
1469 void
1470 change_root(struct cwdinfo *cwdi, struct vnode *vp, struct lwp *l)
1471 {
1472 	struct proc *p = l->l_proc;
1473 	kauth_cred_t ncred;
1474 
1475 	ncred = kauth_cred_alloc();
1476 
1477 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1478 	if (cwdi->cwdi_rdir != NULL)
1479 		vrele(cwdi->cwdi_rdir);
1480 	cwdi->cwdi_rdir = vp;
1481 
1482 	/*
1483 	 * Prevent escaping from chroot by putting the root under
1484 	 * the working directory.  Silently chdir to / if we aren't
1485 	 * already there.
1486 	 */
1487 	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1488 		/*
1489 		 * XXX would be more failsafe to change directory to a
1490 		 * deadfs node here instead
1491 		 */
1492 		vrele(cwdi->cwdi_cdir);
1493 		vref(vp);
1494 		cwdi->cwdi_cdir = vp;
1495 	}
1496 	rw_exit(&cwdi->cwdi_lock);
1497 
1498 	/* Get a write lock on the process credential. */
1499 	proc_crmod_enter();
1500 
1501 	kauth_cred_clone(p->p_cred, ncred);
1502 	kauth_proc_chroot(ncred, p->p_cwdi);
1503 
1504 	/* Broadcast our credentials to the process and other LWPs. */
1505  	proc_crmod_leave(ncred, p->p_cred, true);
1506 }
1507 
1508 /*
1509  * Common routine for chroot and chdir.
1510  * XXX "where" should be enum uio_seg
1511  */
1512 int
1513 chdir_lookup(const char *path, int where, struct vnode **vpp, struct lwp *l)
1514 {
1515 	struct pathbuf *pb;
1516 	struct nameidata nd;
1517 	int error;
1518 
1519 	error = pathbuf_maybe_copyin(path, where, &pb);
1520 	if (error) {
1521 		return error;
1522 	}
1523 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
1524 	if ((error = namei(&nd)) != 0) {
1525 		pathbuf_destroy(pb);
1526 		return error;
1527 	}
1528 	*vpp = nd.ni_vp;
1529 	pathbuf_destroy(pb);
1530 
1531 	if ((*vpp)->v_type != VDIR)
1532 		error = ENOTDIR;
1533 	else
1534 		error = VOP_ACCESS(*vpp, VEXEC, l->l_cred);
1535 
1536 	if (error)
1537 		vput(*vpp);
1538 	else
1539 		VOP_UNLOCK(*vpp);
1540 	return (error);
1541 }
1542 
1543 /*
1544  * Internals of sys_open - path has already been converted into a pathbuf
1545  * (so we can easily reuse this function from other parts of the kernel,
1546  * like posix_spawn post-processing).
1547  */
1548 int
1549 do_open(lwp_t *l, struct vnode *dvp, struct pathbuf *pb, int open_flags,
1550 	int open_mode, int *fd)
1551 {
1552 	struct proc *p = l->l_proc;
1553 	struct cwdinfo *cwdi = p->p_cwdi;
1554 	file_t *fp;
1555 	struct vnode *vp;
1556 	int flags, cmode;
1557 	int indx, error;
1558 	struct nameidata nd;
1559 
1560 	if (open_flags & O_SEARCH) {
1561 		open_flags &= ~(int)O_SEARCH;
1562 	}
1563 
1564 	flags = FFLAGS(open_flags);
1565 	if ((flags & (FREAD | FWRITE)) == 0)
1566 		return EINVAL;
1567 
1568 	if ((error = fd_allocfile(&fp, &indx)) != 0) {
1569 		return error;
1570 	}
1571 
1572 	/* We're going to read cwdi->cwdi_cmask unlocked here. */
1573 	cmode = ((open_mode &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1574 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, pb);
1575 	if (dvp != NULL)
1576 		NDAT(&nd, dvp);
1577 
1578 	l->l_dupfd = -indx - 1;			/* XXX check for fdopen */
1579 	if ((error = vn_open(&nd, flags, cmode)) != 0) {
1580 		fd_abort(p, fp, indx);
1581 		if ((error == EDUPFD || error == EMOVEFD) &&
1582 		    l->l_dupfd >= 0 &&			/* XXX from fdopen */
1583 		    (error =
1584 			fd_dupopen(l->l_dupfd, &indx, flags, error)) == 0) {
1585 			*fd = indx;
1586 			return 0;
1587 		}
1588 		if (error == ERESTART)
1589 			error = EINTR;
1590 		return error;
1591 	}
1592 
1593 	l->l_dupfd = 0;
1594 	vp = nd.ni_vp;
1595 
1596 	if ((error = open_setfp(l, fp, vp, indx, flags)))
1597 		return error;
1598 
1599 	VOP_UNLOCK(vp);
1600 	*fd = indx;
1601 	fd_affix(p, fp, indx);
1602 	return 0;
1603 }
1604 
1605 int
1606 fd_open(const char *path, int open_flags, int open_mode, int *fd)
1607 {
1608 	struct pathbuf *pb;
1609 	int error, oflags;
1610 
1611 	oflags = FFLAGS(open_flags);
1612 	if ((oflags & (FREAD | FWRITE)) == 0)
1613 		return EINVAL;
1614 
1615 	pb = pathbuf_create(path);
1616 	if (pb == NULL)
1617 		return ENOMEM;
1618 
1619 	error = do_open(curlwp, NULL, pb, open_flags, open_mode, fd);
1620 	pathbuf_destroy(pb);
1621 
1622 	return error;
1623 }
1624 
1625 /*
1626  * Check permissions, allocate an open file structure,
1627  * and call the device open routine if any.
1628  */
1629 static int
1630 do_sys_openat(lwp_t *l, int fdat, const char *path, int flags,
1631     int mode, int *fd)
1632 {
1633 	file_t *dfp = NULL;
1634 	struct vnode *dvp = NULL;
1635 	struct pathbuf *pb;
1636 	int error;
1637 
1638 #ifdef COMPAT_10	/* XXX: and perhaps later */
1639 	if (path == NULL) {
1640 		pb = pathbuf_create(".");
1641 		if (pb == NULL)
1642 			return ENOMEM;
1643 	} else
1644 #endif
1645 	{
1646 		error = pathbuf_copyin(path, &pb);
1647 		if (error)
1648 			return error;
1649 	}
1650 
1651 	if (fdat != AT_FDCWD) {
1652 		/* fd_getvnode() will use the descriptor for us */
1653 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
1654 			goto out;
1655 
1656 		dvp = dfp->f_vnode;
1657 	}
1658 
1659 	error = do_open(l, dvp, pb, flags, mode, fd);
1660 
1661 	if (dfp != NULL)
1662 		fd_putfile(fdat);
1663 out:
1664 	pathbuf_destroy(pb);
1665 	return error;
1666 }
1667 
1668 int
1669 sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1670 {
1671 	/* {
1672 		syscallarg(const char *) path;
1673 		syscallarg(int) flags;
1674 		syscallarg(int) mode;
1675 	} */
1676 	int error;
1677 	int fd;
1678 
1679 	error = do_sys_openat(l, AT_FDCWD, SCARG(uap, path),
1680 			      SCARG(uap, flags), SCARG(uap, mode), &fd);
1681 
1682 	if (error == 0)
1683 		*retval = fd;
1684 
1685 	return error;
1686 }
1687 
1688 int
1689 sys_openat(struct lwp *l, const struct sys_openat_args *uap, register_t *retval)
1690 {
1691 	/* {
1692 		syscallarg(int) fd;
1693 		syscallarg(const char *) path;
1694 		syscallarg(int) oflags;
1695 		syscallarg(int) mode;
1696 	} */
1697 	int error;
1698 	int fd;
1699 
1700 	error = do_sys_openat(l, SCARG(uap, fd), SCARG(uap, path),
1701 			      SCARG(uap, oflags), SCARG(uap, mode), &fd);
1702 
1703 	if (error == 0)
1704 		*retval = fd;
1705 
1706 	return error;
1707 }
1708 
1709 static void
1710 vfs__fhfree(fhandle_t *fhp)
1711 {
1712 	size_t fhsize;
1713 
1714 	fhsize = FHANDLE_SIZE(fhp);
1715 	kmem_free(fhp, fhsize);
1716 }
1717 
1718 /*
1719  * vfs_composefh: compose a filehandle.
1720  */
1721 
1722 int
1723 vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1724 {
1725 	struct mount *mp;
1726 	struct fid *fidp;
1727 	int error;
1728 	size_t needfhsize;
1729 	size_t fidsize;
1730 
1731 	mp = vp->v_mount;
1732 	fidp = NULL;
1733 	if (*fh_size < FHANDLE_SIZE_MIN) {
1734 		fidsize = 0;
1735 	} else {
1736 		fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1737 		if (fhp != NULL) {
1738 			memset(fhp, 0, *fh_size);
1739 			fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1740 			fidp = &fhp->fh_fid;
1741 		}
1742 	}
1743 	error = VFS_VPTOFH(vp, fidp, &fidsize);
1744 	needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1745 	if (error == 0 && *fh_size < needfhsize) {
1746 		error = E2BIG;
1747 	}
1748 	*fh_size = needfhsize;
1749 	return error;
1750 }
1751 
1752 int
1753 vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1754 {
1755 	struct mount *mp;
1756 	fhandle_t *fhp;
1757 	size_t fhsize;
1758 	size_t fidsize;
1759 	int error;
1760 
1761 	mp = vp->v_mount;
1762 	fidsize = 0;
1763 	error = VFS_VPTOFH(vp, NULL, &fidsize);
1764 	KASSERT(error != 0);
1765 	if (error != E2BIG) {
1766 		goto out;
1767 	}
1768 	fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1769 	fhp = kmem_zalloc(fhsize, KM_SLEEP);
1770 	if (fhp == NULL) {
1771 		error = ENOMEM;
1772 		goto out;
1773 	}
1774 	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1775 	error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1776 	if (error == 0) {
1777 		KASSERT((FHANDLE_SIZE(fhp) == fhsize &&
1778 		    FHANDLE_FILEID(fhp)->fid_len == fidsize));
1779 		*fhpp = fhp;
1780 	} else {
1781 		kmem_free(fhp, fhsize);
1782 	}
1783 out:
1784 	return error;
1785 }
1786 
1787 void
1788 vfs_composefh_free(fhandle_t *fhp)
1789 {
1790 
1791 	vfs__fhfree(fhp);
1792 }
1793 
1794 /*
1795  * vfs_fhtovp: lookup a vnode by a filehandle.
1796  */
1797 
1798 int
1799 vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
1800 {
1801 	struct mount *mp;
1802 	int error;
1803 
1804 	*vpp = NULL;
1805 	mp = vfs_getvfs(FHANDLE_FSID(fhp));
1806 	if (mp == NULL) {
1807 		error = ESTALE;
1808 		goto out;
1809 	}
1810 	if (mp->mnt_op->vfs_fhtovp == NULL) {
1811 		error = EOPNOTSUPP;
1812 		goto out;
1813 	}
1814 	error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), vpp);
1815 out:
1816 	return error;
1817 }
1818 
1819 /*
1820  * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
1821  * the needed size.
1822  */
1823 
1824 int
1825 vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
1826 {
1827 	fhandle_t *fhp;
1828 	int error;
1829 
1830 	if (fhsize > FHANDLE_SIZE_MAX) {
1831 		return EINVAL;
1832 	}
1833 	if (fhsize < FHANDLE_SIZE_MIN) {
1834 		return EINVAL;
1835 	}
1836 again:
1837 	fhp = kmem_alloc(fhsize, KM_SLEEP);
1838 	if (fhp == NULL) {
1839 		return ENOMEM;
1840 	}
1841 	error = copyin(ufhp, fhp, fhsize);
1842 	if (error == 0) {
1843 		/* XXX this check shouldn't be here */
1844 		if (FHANDLE_SIZE(fhp) == fhsize) {
1845 			*fhpp = fhp;
1846 			return 0;
1847 		} else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
1848 			/*
1849 			 * a kludge for nfsv2 padded handles.
1850 			 */
1851 			size_t sz;
1852 
1853 			sz = FHANDLE_SIZE(fhp);
1854 			kmem_free(fhp, fhsize);
1855 			fhsize = sz;
1856 			goto again;
1857 		} else {
1858 			/*
1859 			 * userland told us wrong size.
1860 			 */
1861 		    	error = EINVAL;
1862 		}
1863 	}
1864 	kmem_free(fhp, fhsize);
1865 	return error;
1866 }
1867 
1868 void
1869 vfs_copyinfh_free(fhandle_t *fhp)
1870 {
1871 
1872 	vfs__fhfree(fhp);
1873 }
1874 
1875 /*
1876  * Get file handle system call
1877  */
1878 int
1879 sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval)
1880 {
1881 	/* {
1882 		syscallarg(char *) fname;
1883 		syscallarg(fhandle_t *) fhp;
1884 		syscallarg(size_t *) fh_size;
1885 	} */
1886 	struct vnode *vp;
1887 	fhandle_t *fh;
1888 	int error;
1889 	struct pathbuf *pb;
1890 	struct nameidata nd;
1891 	size_t sz;
1892 	size_t usz;
1893 
1894 	/*
1895 	 * Must be super user
1896 	 */
1897 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1898 	    0, NULL, NULL, NULL);
1899 	if (error)
1900 		return (error);
1901 
1902 	error = pathbuf_copyin(SCARG(uap, fname), &pb);
1903 	if (error) {
1904 		return error;
1905 	}
1906 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
1907 	error = namei(&nd);
1908 	if (error) {
1909 		pathbuf_destroy(pb);
1910 		return error;
1911 	}
1912 	vp = nd.ni_vp;
1913 	pathbuf_destroy(pb);
1914 
1915 	error = vfs_composefh_alloc(vp, &fh);
1916 	vput(vp);
1917 	if (error != 0) {
1918 		return error;
1919 	}
1920 	error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
1921 	if (error != 0) {
1922 		goto out;
1923 	}
1924 	sz = FHANDLE_SIZE(fh);
1925 	error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
1926 	if (error != 0) {
1927 		goto out;
1928 	}
1929 	if (usz >= sz) {
1930 		error = copyout(fh, SCARG(uap, fhp), sz);
1931 	} else {
1932 		error = E2BIG;
1933 	}
1934 out:
1935 	vfs_composefh_free(fh);
1936 	return (error);
1937 }
1938 
1939 /*
1940  * Open a file given a file handle.
1941  *
1942  * Check permissions, allocate an open file structure,
1943  * and call the device open routine if any.
1944  */
1945 
1946 int
1947 dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
1948     register_t *retval)
1949 {
1950 	file_t *fp;
1951 	struct vnode *vp = NULL;
1952 	kauth_cred_t cred = l->l_cred;
1953 	file_t *nfp;
1954 	int indx, error = 0;
1955 	struct vattr va;
1956 	fhandle_t *fh;
1957 	int flags;
1958 	proc_t *p;
1959 
1960 	p = curproc;
1961 
1962 	/*
1963 	 * Must be super user
1964 	 */
1965 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1966 	    0, NULL, NULL, NULL)))
1967 		return (error);
1968 
1969 	if (oflags & O_SEARCH) {
1970 		oflags &= ~(int)O_SEARCH;
1971 	}
1972 
1973 	flags = FFLAGS(oflags);
1974 	if ((flags & (FREAD | FWRITE)) == 0)
1975 		return (EINVAL);
1976 	if ((flags & O_CREAT))
1977 		return (EINVAL);
1978 	if ((error = fd_allocfile(&nfp, &indx)) != 0)
1979 		return (error);
1980 	fp = nfp;
1981 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1982 	if (error != 0) {
1983 		goto bad;
1984 	}
1985 	error = vfs_fhtovp(fh, &vp);
1986 	vfs_copyinfh_free(fh);
1987 	if (error != 0) {
1988 		goto bad;
1989 	}
1990 
1991 	/* Now do an effective vn_open */
1992 
1993 	if (vp->v_type == VSOCK) {
1994 		error = EOPNOTSUPP;
1995 		goto bad;
1996 	}
1997 	error = vn_openchk(vp, cred, flags);
1998 	if (error != 0)
1999 		goto bad;
2000 	if (flags & O_TRUNC) {
2001 		VOP_UNLOCK(vp);			/* XXX */
2002 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
2003 		vattr_null(&va);
2004 		va.va_size = 0;
2005 		error = VOP_SETATTR(vp, &va, cred);
2006 		if (error)
2007 			goto bad;
2008 	}
2009 	if ((error = VOP_OPEN(vp, flags, cred)) != 0)
2010 		goto bad;
2011 	if (flags & FWRITE) {
2012 		mutex_enter(vp->v_interlock);
2013 		vp->v_writecount++;
2014 		mutex_exit(vp->v_interlock);
2015 	}
2016 
2017 	/* done with modified vn_open, now finish what sys_open does. */
2018 	if ((error = open_setfp(l, fp, vp, indx, flags)))
2019 		return error;
2020 
2021 	VOP_UNLOCK(vp);
2022 	*retval = indx;
2023 	fd_affix(p, fp, indx);
2024 	return (0);
2025 
2026 bad:
2027 	fd_abort(p, fp, indx);
2028 	if (vp != NULL)
2029 		vput(vp);
2030 	return (error);
2031 }
2032 
2033 int
2034 sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval)
2035 {
2036 	/* {
2037 		syscallarg(const void *) fhp;
2038 		syscallarg(size_t) fh_size;
2039 		syscallarg(int) flags;
2040 	} */
2041 
2042 	return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
2043 	    SCARG(uap, flags), retval);
2044 }
2045 
2046 int
2047 do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
2048 {
2049 	int error;
2050 	fhandle_t *fh;
2051 	struct vnode *vp;
2052 
2053 	/*
2054 	 * Must be super user
2055 	 */
2056 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2057 	    0, NULL, NULL, NULL)))
2058 		return (error);
2059 
2060 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2061 	if (error != 0)
2062 		return error;
2063 
2064 	error = vfs_fhtovp(fh, &vp);
2065 	vfs_copyinfh_free(fh);
2066 	if (error != 0)
2067 		return error;
2068 
2069 	error = vn_stat(vp, sb);
2070 	vput(vp);
2071 	return error;
2072 }
2073 
2074 
2075 /* ARGSUSED */
2076 int
2077 sys___fhstat50(struct lwp *l, const struct sys___fhstat50_args *uap, register_t *retval)
2078 {
2079 	/* {
2080 		syscallarg(const void *) fhp;
2081 		syscallarg(size_t) fh_size;
2082 		syscallarg(struct stat *) sb;
2083 	} */
2084 	struct stat sb;
2085 	int error;
2086 
2087 	error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
2088 	if (error)
2089 		return error;
2090 	return copyout(&sb, SCARG(uap, sb), sizeof(sb));
2091 }
2092 
2093 int
2094 do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb,
2095     int flags)
2096 {
2097 	fhandle_t *fh;
2098 	struct mount *mp;
2099 	struct vnode *vp;
2100 	int error;
2101 
2102 	/*
2103 	 * Must be super user
2104 	 */
2105 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2106 	    0, NULL, NULL, NULL)))
2107 		return error;
2108 
2109 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2110 	if (error != 0)
2111 		return error;
2112 
2113 	error = vfs_fhtovp(fh, &vp);
2114 	vfs_copyinfh_free(fh);
2115 	if (error != 0)
2116 		return error;
2117 
2118 	mp = vp->v_mount;
2119 	error = dostatvfs(mp, sb, l, flags, 1);
2120 	vput(vp);
2121 	return error;
2122 }
2123 
2124 /* ARGSUSED */
2125 int
2126 sys___fhstatvfs140(struct lwp *l, const struct sys___fhstatvfs140_args *uap, register_t *retval)
2127 {
2128 	/* {
2129 		syscallarg(const void *) fhp;
2130 		syscallarg(size_t) fh_size;
2131 		syscallarg(struct statvfs *) buf;
2132 		syscallarg(int)	flags;
2133 	} */
2134 	struct statvfs *sb = STATVFSBUF_GET();
2135 	int error;
2136 
2137 	error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
2138 	    SCARG(uap, flags));
2139 	if (error == 0)
2140 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
2141 	STATVFSBUF_PUT(sb);
2142 	return error;
2143 }
2144 
2145 /*
2146  * Create a special file.
2147  */
2148 /* ARGSUSED */
2149 int
2150 sys___mknod50(struct lwp *l, const struct sys___mknod50_args *uap,
2151     register_t *retval)
2152 {
2153 	/* {
2154 		syscallarg(const char *) path;
2155 		syscallarg(mode_t) mode;
2156 		syscallarg(dev_t) dev;
2157 	} */
2158 	return do_sys_mknodat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, mode),
2159 	    SCARG(uap, dev), retval, UIO_USERSPACE);
2160 }
2161 
2162 int
2163 sys_mknodat(struct lwp *l, const struct sys_mknodat_args *uap,
2164     register_t *retval)
2165 {
2166 	/* {
2167 		syscallarg(int) fd;
2168 		syscallarg(const char *) path;
2169 		syscallarg(mode_t) mode;
2170 		syscallarg(int) pad;
2171 		syscallarg(dev_t) dev;
2172 	} */
2173 
2174 	return do_sys_mknodat(l, SCARG(uap, fd), SCARG(uap, path),
2175 	    SCARG(uap, mode), SCARG(uap, dev), retval, UIO_USERSPACE);
2176 }
2177 
2178 int
2179 do_sys_mknod(struct lwp *l, const char *pathname, mode_t mode, dev_t dev,
2180     register_t *retval, enum uio_seg seg)
2181 {
2182 	return do_sys_mknodat(l, AT_FDCWD, pathname, mode, dev, retval, seg);
2183 }
2184 
2185 int
2186 do_sys_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
2187     dev_t dev, register_t *retval, enum uio_seg seg)
2188 {
2189 	struct proc *p = l->l_proc;
2190 	struct vnode *vp;
2191 	struct vattr vattr;
2192 	int error, optype;
2193 	struct pathbuf *pb;
2194 	struct nameidata nd;
2195 	const char *pathstring;
2196 
2197 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
2198 	    0, NULL, NULL, NULL)) != 0)
2199 		return (error);
2200 
2201 	optype = VOP_MKNOD_DESCOFFSET;
2202 
2203 	error = pathbuf_maybe_copyin(pathname, seg, &pb);
2204 	if (error) {
2205 		return error;
2206 	}
2207 	pathstring = pathbuf_stringcopy_get(pb);
2208 	if (pathstring == NULL) {
2209 		pathbuf_destroy(pb);
2210 		return ENOMEM;
2211 	}
2212 
2213 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2214 
2215 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2216 		goto out;
2217 	vp = nd.ni_vp;
2218 
2219 	if (vp != NULL)
2220 		error = EEXIST;
2221 	else {
2222 		vattr_null(&vattr);
2223 		/* We will read cwdi->cwdi_cmask unlocked. */
2224 		vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2225 		vattr.va_rdev = dev;
2226 
2227 		switch (mode & S_IFMT) {
2228 		case S_IFMT:	/* used by badsect to flag bad sectors */
2229 			vattr.va_type = VBAD;
2230 			break;
2231 		case S_IFCHR:
2232 			vattr.va_type = VCHR;
2233 			break;
2234 		case S_IFBLK:
2235 			vattr.va_type = VBLK;
2236 			break;
2237 		case S_IFWHT:
2238 			optype = VOP_WHITEOUT_DESCOFFSET;
2239 			break;
2240 		case S_IFREG:
2241 #if NVERIEXEC > 0
2242 			error = veriexec_openchk(l, nd.ni_vp, pathstring,
2243 			    O_CREAT);
2244 #endif /* NVERIEXEC > 0 */
2245 			vattr.va_type = VREG;
2246 			vattr.va_rdev = VNOVAL;
2247 			optype = VOP_CREATE_DESCOFFSET;
2248 			break;
2249 		default:
2250 			error = EINVAL;
2251 			break;
2252 		}
2253 	}
2254 	if (error == 0 && optype == VOP_MKNOD_DESCOFFSET
2255 	    && vattr.va_rdev == VNOVAL)
2256 		error = EINVAL;
2257 	if (!error) {
2258 		switch (optype) {
2259 		case VOP_WHITEOUT_DESCOFFSET:
2260 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
2261 			if (error)
2262 				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2263 			vput(nd.ni_dvp);
2264 			break;
2265 
2266 		case VOP_MKNOD_DESCOFFSET:
2267 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
2268 						&nd.ni_cnd, &vattr);
2269 			if (error == 0)
2270 				vrele(nd.ni_vp);
2271 			vput(nd.ni_dvp);
2272 			break;
2273 
2274 		case VOP_CREATE_DESCOFFSET:
2275 			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
2276 						&nd.ni_cnd, &vattr);
2277 			if (error == 0)
2278 				vrele(nd.ni_vp);
2279 			vput(nd.ni_dvp);
2280 			break;
2281 		}
2282 	} else {
2283 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2284 		if (nd.ni_dvp == vp)
2285 			vrele(nd.ni_dvp);
2286 		else
2287 			vput(nd.ni_dvp);
2288 		if (vp)
2289 			vrele(vp);
2290 	}
2291 out:
2292 	pathbuf_stringcopy_put(pb, pathstring);
2293 	pathbuf_destroy(pb);
2294 	return (error);
2295 }
2296 
2297 /*
2298  * Create a named pipe.
2299  */
2300 /* ARGSUSED */
2301 int
2302 sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval)
2303 {
2304 	/* {
2305 		syscallarg(const char *) path;
2306 		syscallarg(int) mode;
2307 	} */
2308 	return do_sys_mkfifoat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, mode));
2309 }
2310 
2311 int
2312 sys_mkfifoat(struct lwp *l, const struct sys_mkfifoat_args *uap,
2313     register_t *retval)
2314 {
2315 	/* {
2316 		syscallarg(int) fd;
2317 		syscallarg(const char *) path;
2318 		syscallarg(int) mode;
2319 	} */
2320 
2321 	return do_sys_mkfifoat(l, SCARG(uap, fd), SCARG(uap, path),
2322 	    SCARG(uap, mode));
2323 }
2324 
2325 static int
2326 do_sys_mkfifoat(struct lwp *l, int fdat, const char *path, mode_t mode)
2327 {
2328 	struct proc *p = l->l_proc;
2329 	struct vattr vattr;
2330 	int error;
2331 	struct pathbuf *pb;
2332 	struct nameidata nd;
2333 
2334 	error = pathbuf_copyin(path, &pb);
2335 	if (error) {
2336 		return error;
2337 	}
2338 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2339 
2340 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
2341 		pathbuf_destroy(pb);
2342 		return error;
2343 	}
2344 	if (nd.ni_vp != NULL) {
2345 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2346 		if (nd.ni_dvp == nd.ni_vp)
2347 			vrele(nd.ni_dvp);
2348 		else
2349 			vput(nd.ni_dvp);
2350 		vrele(nd.ni_vp);
2351 		pathbuf_destroy(pb);
2352 		return (EEXIST);
2353 	}
2354 	vattr_null(&vattr);
2355 	vattr.va_type = VFIFO;
2356 	/* We will read cwdi->cwdi_cmask unlocked. */
2357 	vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2358 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
2359 	if (error == 0)
2360 		vrele(nd.ni_vp);
2361 	vput(nd.ni_dvp);
2362 	pathbuf_destroy(pb);
2363 	return (error);
2364 }
2365 
2366 /*
2367  * Make a hard file link.
2368  */
2369 /* ARGSUSED */
2370 int
2371 do_sys_linkat(struct lwp *l, int fdpath, const char *path, int fdlink,
2372     const char *link, int follow, register_t *retval)
2373 {
2374 	struct vnode *vp;
2375 	struct pathbuf *linkpb;
2376 	struct nameidata nd;
2377 	namei_simple_flags_t ns_flags;
2378 	int error;
2379 
2380 	if (follow & AT_SYMLINK_FOLLOW)
2381 		ns_flags = NSM_FOLLOW_TRYEMULROOT;
2382 	else
2383 		ns_flags = NSM_NOFOLLOW_TRYEMULROOT;
2384 
2385 	error = fd_nameiat_simple_user(l, fdpath, path, ns_flags, &vp);
2386 	if (error != 0)
2387 		return (error);
2388 	error = pathbuf_copyin(link, &linkpb);
2389 	if (error) {
2390 		goto out1;
2391 	}
2392 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2393 	if ((error = fd_nameiat(l, fdlink, &nd)) != 0)
2394 		goto out2;
2395 	if (nd.ni_vp) {
2396 		error = EEXIST;
2397 		goto abortop;
2398 	}
2399 	/* Prevent hard links on directories. */
2400 	if (vp->v_type == VDIR) {
2401 		error = EPERM;
2402 		goto abortop;
2403 	}
2404 	/* Prevent cross-mount operation. */
2405 	if (nd.ni_dvp->v_mount != vp->v_mount) {
2406 		error = EXDEV;
2407 		goto abortop;
2408 	}
2409 	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
2410 out2:
2411 	pathbuf_destroy(linkpb);
2412 out1:
2413 	vrele(vp);
2414 	return (error);
2415 abortop:
2416 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2417 	if (nd.ni_dvp == nd.ni_vp)
2418 		vrele(nd.ni_dvp);
2419 	else
2420 		vput(nd.ni_dvp);
2421 	if (nd.ni_vp != NULL)
2422 		vrele(nd.ni_vp);
2423 	goto out2;
2424 }
2425 
2426 int
2427 sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
2428 {
2429 	/* {
2430 		syscallarg(const char *) path;
2431 		syscallarg(const char *) link;
2432 	} */
2433 	const char *path = SCARG(uap, path);
2434 	const char *link = SCARG(uap, link);
2435 
2436 	return do_sys_linkat(l, AT_FDCWD, path, AT_FDCWD, link,
2437 	    AT_SYMLINK_FOLLOW, retval);
2438 }
2439 
2440 int
2441 sys_linkat(struct lwp *l, const struct sys_linkat_args *uap,
2442     register_t *retval)
2443 {
2444 	/* {
2445 		syscallarg(int) fd1;
2446 		syscallarg(const char *) name1;
2447 		syscallarg(int) fd2;
2448 		syscallarg(const char *) name2;
2449 		syscallarg(int) flags;
2450 	} */
2451 	int fd1 = SCARG(uap, fd1);
2452 	const char *name1 = SCARG(uap, name1);
2453 	int fd2 = SCARG(uap, fd2);
2454 	const char *name2 = SCARG(uap, name2);
2455 	int follow;
2456 
2457 	follow = SCARG(uap, flags) & AT_SYMLINK_FOLLOW;
2458 
2459 	return do_sys_linkat(l, fd1, name1, fd2, name2, follow, retval);
2460 }
2461 
2462 
2463 int
2464 do_sys_symlink(const char *patharg, const char *link, enum uio_seg seg)
2465 {
2466 	return do_sys_symlinkat(NULL, patharg, AT_FDCWD, link, seg);
2467 }
2468 
2469 static int
2470 do_sys_symlinkat(struct lwp *l, const char *patharg, int fdat,
2471     const char *link, enum uio_seg seg)
2472 {
2473 	struct proc *p = curproc;
2474 	struct vattr vattr;
2475 	char *path;
2476 	int error;
2477 	struct pathbuf *linkpb;
2478 	struct nameidata nd;
2479 
2480 	KASSERT(l != NULL || fdat == AT_FDCWD);
2481 
2482 	path = PNBUF_GET();
2483 	if (seg == UIO_USERSPACE) {
2484 		if ((error = copyinstr(patharg, path, MAXPATHLEN, NULL)) != 0)
2485 			goto out1;
2486 		if ((error = pathbuf_copyin(link, &linkpb)) != 0)
2487 			goto out1;
2488 	} else {
2489 		KASSERT(strlen(patharg) < MAXPATHLEN);
2490 		strcpy(path, patharg);
2491 		linkpb = pathbuf_create(link);
2492 		if (linkpb == NULL) {
2493 			error = ENOMEM;
2494 			goto out1;
2495 		}
2496 	}
2497 	ktrkuser("symlink-target", path, strlen(path));
2498 
2499 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2500 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2501 		goto out2;
2502 	if (nd.ni_vp) {
2503 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2504 		if (nd.ni_dvp == nd.ni_vp)
2505 			vrele(nd.ni_dvp);
2506 		else
2507 			vput(nd.ni_dvp);
2508 		vrele(nd.ni_vp);
2509 		error = EEXIST;
2510 		goto out2;
2511 	}
2512 	vattr_null(&vattr);
2513 	vattr.va_type = VLNK;
2514 	/* We will read cwdi->cwdi_cmask unlocked. */
2515 	vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
2516 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
2517 	if (error == 0)
2518 		vrele(nd.ni_vp);
2519 	vput(nd.ni_dvp);
2520 out2:
2521 	pathbuf_destroy(linkpb);
2522 out1:
2523 	PNBUF_PUT(path);
2524 	return (error);
2525 }
2526 
2527 /*
2528  * Make a symbolic link.
2529  */
2530 /* ARGSUSED */
2531 int
2532 sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
2533 {
2534 	/* {
2535 		syscallarg(const char *) path;
2536 		syscallarg(const char *) link;
2537 	} */
2538 
2539 	return do_sys_symlinkat(l, SCARG(uap, path), AT_FDCWD, SCARG(uap, link),
2540 	    UIO_USERSPACE);
2541 }
2542 
2543 int
2544 sys_symlinkat(struct lwp *l, const struct sys_symlinkat_args *uap,
2545     register_t *retval)
2546 {
2547 	/* {
2548 		syscallarg(const char *) path1;
2549 		syscallarg(int) fd;
2550 		syscallarg(const char *) path2;
2551 	} */
2552 
2553 	return do_sys_symlinkat(l, SCARG(uap, path1), SCARG(uap, fd),
2554 	    SCARG(uap, path2), UIO_USERSPACE);
2555 }
2556 
2557 /*
2558  * Delete a whiteout from the filesystem.
2559  */
2560 /* ARGSUSED */
2561 int
2562 sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval)
2563 {
2564 	/* {
2565 		syscallarg(const char *) path;
2566 	} */
2567 	int error;
2568 	struct pathbuf *pb;
2569 	struct nameidata nd;
2570 
2571 	error = pathbuf_copyin(SCARG(uap, path), &pb);
2572 	if (error) {
2573 		return error;
2574 	}
2575 
2576 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT, pb);
2577 	error = namei(&nd);
2578 	if (error) {
2579 		pathbuf_destroy(pb);
2580 		return (error);
2581 	}
2582 
2583 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2584 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2585 		if (nd.ni_dvp == nd.ni_vp)
2586 			vrele(nd.ni_dvp);
2587 		else
2588 			vput(nd.ni_dvp);
2589 		if (nd.ni_vp)
2590 			vrele(nd.ni_vp);
2591 		pathbuf_destroy(pb);
2592 		return (EEXIST);
2593 	}
2594 	if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
2595 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2596 	vput(nd.ni_dvp);
2597 	pathbuf_destroy(pb);
2598 	return (error);
2599 }
2600 
2601 /*
2602  * Delete a name from the filesystem.
2603  */
2604 /* ARGSUSED */
2605 int
2606 sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval)
2607 {
2608 	/* {
2609 		syscallarg(const char *) path;
2610 	} */
2611 
2612 	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path), 0, UIO_USERSPACE);
2613 }
2614 
2615 int
2616 sys_unlinkat(struct lwp *l, const struct sys_unlinkat_args *uap,
2617     register_t *retval)
2618 {
2619 	/* {
2620 		syscallarg(int) fd;
2621 		syscallarg(const char *) path;
2622 		syscallarg(int) flag;
2623 	} */
2624 
2625 	return do_sys_unlinkat(l, SCARG(uap, fd), SCARG(uap, path),
2626 	    SCARG(uap, flag), UIO_USERSPACE);
2627 }
2628 
2629 int
2630 do_sys_unlink(const char *arg, enum uio_seg seg)
2631 {
2632 	return do_sys_unlinkat(NULL, AT_FDCWD, arg, 0, seg);
2633 }
2634 
2635 static int
2636 do_sys_unlinkat(struct lwp *l, int fdat, const char *arg, int flags,
2637     enum uio_seg seg)
2638 {
2639 	struct vnode *vp;
2640 	int error;
2641 	struct pathbuf *pb;
2642 	struct nameidata nd;
2643 	const char *pathstring;
2644 
2645 	KASSERT(l != NULL || fdat == AT_FDCWD);
2646 
2647 	error = pathbuf_maybe_copyin(arg, seg, &pb);
2648 	if (error) {
2649 		return error;
2650 	}
2651 	pathstring = pathbuf_stringcopy_get(pb);
2652 	if (pathstring == NULL) {
2653 		pathbuf_destroy(pb);
2654 		return ENOMEM;
2655 	}
2656 
2657 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, pb);
2658 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2659 		goto out;
2660 	vp = nd.ni_vp;
2661 
2662 	/*
2663 	 * The root of a mounted filesystem cannot be deleted.
2664 	 */
2665 	if ((vp->v_vflag & VV_ROOT) != 0) {
2666 		error = EBUSY;
2667 		goto abort;
2668 	}
2669 
2670 	if ((vp->v_type == VDIR) && (vp->v_mountedhere != NULL)) {
2671 		error = EBUSY;
2672 		goto abort;
2673 	}
2674 
2675 	/*
2676 	 * No rmdir "." please.
2677 	 */
2678 	if (nd.ni_dvp == vp) {
2679 		error = EINVAL;
2680 		goto abort;
2681 	}
2682 
2683 	/*
2684 	 * AT_REMOVEDIR is required to remove a directory
2685 	 */
2686 	if (vp->v_type == VDIR) {
2687 		if (!(flags & AT_REMOVEDIR)) {
2688 			error = EPERM;
2689 			goto abort;
2690 		} else {
2691 			error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2692 			goto out;
2693 		}
2694 	}
2695 
2696 	/*
2697 	 * Starting here we only deal with non directories.
2698 	 */
2699 	if (flags & AT_REMOVEDIR) {
2700 		error = ENOTDIR;
2701 		goto abort;
2702 	}
2703 
2704 #if NVERIEXEC > 0
2705 	/* Handle remove requests for veriexec entries. */
2706 	if ((error = veriexec_removechk(curlwp, nd.ni_vp, pathstring)) != 0) {
2707 		goto abort;
2708 	}
2709 #endif /* NVERIEXEC > 0 */
2710 
2711 #ifdef FILEASSOC
2712 	(void)fileassoc_file_delete(vp);
2713 #endif /* FILEASSOC */
2714 	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2715 	goto out;
2716 
2717 abort:
2718 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2719 	if (nd.ni_dvp == vp)
2720 		vrele(nd.ni_dvp);
2721 	else
2722 		vput(nd.ni_dvp);
2723 	vput(vp);
2724 
2725 out:
2726 	pathbuf_stringcopy_put(pb, pathstring);
2727 	pathbuf_destroy(pb);
2728 	return (error);
2729 }
2730 
2731 /*
2732  * Reposition read/write file offset.
2733  */
2734 int
2735 sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
2736 {
2737 	/* {
2738 		syscallarg(int) fd;
2739 		syscallarg(int) pad;
2740 		syscallarg(off_t) offset;
2741 		syscallarg(int) whence;
2742 	} */
2743 	kauth_cred_t cred = l->l_cred;
2744 	file_t *fp;
2745 	struct vnode *vp;
2746 	struct vattr vattr;
2747 	off_t newoff;
2748 	int error, fd;
2749 
2750 	fd = SCARG(uap, fd);
2751 
2752 	if ((fp = fd_getfile(fd)) == NULL)
2753 		return (EBADF);
2754 
2755 	vp = fp->f_vnode;
2756 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2757 		error = ESPIPE;
2758 		goto out;
2759 	}
2760 
2761 	switch (SCARG(uap, whence)) {
2762 	case SEEK_CUR:
2763 		newoff = fp->f_offset + SCARG(uap, offset);
2764 		break;
2765 	case SEEK_END:
2766 		vn_lock(vp, LK_SHARED | LK_RETRY);
2767 		error = VOP_GETATTR(vp, &vattr, cred);
2768 		VOP_UNLOCK(vp);
2769 		if (error) {
2770 			goto out;
2771 		}
2772 		newoff = SCARG(uap, offset) + vattr.va_size;
2773 		break;
2774 	case SEEK_SET:
2775 		newoff = SCARG(uap, offset);
2776 		break;
2777 	default:
2778 		error = EINVAL;
2779 		goto out;
2780 	}
2781 	if ((error = VOP_SEEK(vp, fp->f_offset, newoff, cred)) == 0) {
2782 		*(off_t *)retval = fp->f_offset = newoff;
2783 	}
2784  out:
2785  	fd_putfile(fd);
2786 	return (error);
2787 }
2788 
2789 /*
2790  * Positional read system call.
2791  */
2792 int
2793 sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
2794 {
2795 	/* {
2796 		syscallarg(int) fd;
2797 		syscallarg(void *) buf;
2798 		syscallarg(size_t) nbyte;
2799 		syscallarg(off_t) offset;
2800 	} */
2801 	file_t *fp;
2802 	struct vnode *vp;
2803 	off_t offset;
2804 	int error, fd = SCARG(uap, fd);
2805 
2806 	if ((fp = fd_getfile(fd)) == NULL)
2807 		return (EBADF);
2808 
2809 	if ((fp->f_flag & FREAD) == 0) {
2810 		fd_putfile(fd);
2811 		return (EBADF);
2812 	}
2813 
2814 	vp = fp->f_vnode;
2815 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2816 		error = ESPIPE;
2817 		goto out;
2818 	}
2819 
2820 	offset = SCARG(uap, offset);
2821 
2822 	/*
2823 	 * XXX This works because no file systems actually
2824 	 * XXX take any action on the seek operation.
2825 	 */
2826 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2827 		goto out;
2828 
2829 	/* dofileread() will unuse the descriptor for us */
2830 	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2831 	    &offset, 0, retval));
2832 
2833  out:
2834 	fd_putfile(fd);
2835 	return (error);
2836 }
2837 
2838 /*
2839  * Positional scatter read system call.
2840  */
2841 int
2842 sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval)
2843 {
2844 	/* {
2845 		syscallarg(int) fd;
2846 		syscallarg(const struct iovec *) iovp;
2847 		syscallarg(int) iovcnt;
2848 		syscallarg(off_t) offset;
2849 	} */
2850 	off_t offset = SCARG(uap, offset);
2851 
2852 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
2853 	    SCARG(uap, iovcnt), &offset, 0, retval);
2854 }
2855 
2856 /*
2857  * Positional write system call.
2858  */
2859 int
2860 sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval)
2861 {
2862 	/* {
2863 		syscallarg(int) fd;
2864 		syscallarg(const void *) buf;
2865 		syscallarg(size_t) nbyte;
2866 		syscallarg(off_t) offset;
2867 	} */
2868 	file_t *fp;
2869 	struct vnode *vp;
2870 	off_t offset;
2871 	int error, fd = SCARG(uap, fd);
2872 
2873 	if ((fp = fd_getfile(fd)) == NULL)
2874 		return (EBADF);
2875 
2876 	if ((fp->f_flag & FWRITE) == 0) {
2877 		fd_putfile(fd);
2878 		return (EBADF);
2879 	}
2880 
2881 	vp = fp->f_vnode;
2882 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2883 		error = ESPIPE;
2884 		goto out;
2885 	}
2886 
2887 	offset = SCARG(uap, offset);
2888 
2889 	/*
2890 	 * XXX This works because no file systems actually
2891 	 * XXX take any action on the seek operation.
2892 	 */
2893 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2894 		goto out;
2895 
2896 	/* dofilewrite() will unuse the descriptor for us */
2897 	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2898 	    &offset, 0, retval));
2899 
2900  out:
2901 	fd_putfile(fd);
2902 	return (error);
2903 }
2904 
2905 /*
2906  * Positional gather write system call.
2907  */
2908 int
2909 sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval)
2910 {
2911 	/* {
2912 		syscallarg(int) fd;
2913 		syscallarg(const struct iovec *) iovp;
2914 		syscallarg(int) iovcnt;
2915 		syscallarg(off_t) offset;
2916 	} */
2917 	off_t offset = SCARG(uap, offset);
2918 
2919 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
2920 	    SCARG(uap, iovcnt), &offset, 0, retval);
2921 }
2922 
2923 /*
2924  * Check access permissions.
2925  */
2926 int
2927 sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval)
2928 {
2929 	/* {
2930 		syscallarg(const char *) path;
2931 		syscallarg(int) flags;
2932 	} */
2933 
2934 	return do_sys_accessat(l, AT_FDCWD, SCARG(uap, path),
2935 	     SCARG(uap, flags), 0);
2936 }
2937 
2938 int
2939 do_sys_accessat(struct lwp *l, int fdat, const char *path,
2940     int mode, int flags)
2941 {
2942 	kauth_cred_t cred;
2943 	struct vnode *vp;
2944 	int error, nd_flag, vmode;
2945 	struct pathbuf *pb;
2946 	struct nameidata nd;
2947 
2948 	CTASSERT(F_OK == 0);
2949 	if ((mode & ~(R_OK | W_OK | X_OK)) != 0) {
2950 		/* nonsense mode */
2951 		return EINVAL;
2952 	}
2953 
2954 	nd_flag = FOLLOW | LOCKLEAF | TRYEMULROOT;
2955 	if (flags & AT_SYMLINK_NOFOLLOW)
2956 		nd_flag &= ~FOLLOW;
2957 
2958 	error = pathbuf_copyin(path, &pb);
2959 	if (error)
2960 		return error;
2961 
2962 	NDINIT(&nd, LOOKUP, nd_flag, pb);
2963 
2964 	/* Override default credentials */
2965 	cred = kauth_cred_dup(l->l_cred);
2966 	if (!(flags & AT_EACCESS)) {
2967 		kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
2968 		kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
2969 	}
2970 	nd.ni_cnd.cn_cred = cred;
2971 
2972 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
2973 		pathbuf_destroy(pb);
2974 		goto out;
2975 	}
2976 	vp = nd.ni_vp;
2977 	pathbuf_destroy(pb);
2978 
2979 	/* Flags == 0 means only check for existence. */
2980 	if (mode) {
2981 		vmode = 0;
2982 		if (mode & R_OK)
2983 			vmode |= VREAD;
2984 		if (mode & W_OK)
2985 			vmode |= VWRITE;
2986 		if (mode & X_OK)
2987 			vmode |= VEXEC;
2988 
2989 		error = VOP_ACCESS(vp, vmode, cred);
2990 		if (!error && (vmode & VWRITE))
2991 			error = vn_writechk(vp);
2992 	}
2993 	vput(vp);
2994 out:
2995 	kauth_cred_free(cred);
2996 	return (error);
2997 }
2998 
2999 int
3000 sys_faccessat(struct lwp *l, const struct sys_faccessat_args *uap,
3001     register_t *retval)
3002 {
3003 	/* {
3004 		syscallarg(int) fd;
3005 		syscallarg(const char *) path;
3006 		syscallarg(int) amode;
3007 		syscallarg(int) flag;
3008 	} */
3009 
3010 	return do_sys_accessat(l, SCARG(uap, fd), SCARG(uap, path),
3011 	     SCARG(uap, amode), SCARG(uap, flag));
3012 }
3013 
3014 /*
3015  * Common code for all sys_stat functions, including compat versions.
3016  */
3017 int
3018 do_sys_stat(const char *userpath, unsigned int nd_flag,
3019     struct stat *sb)
3020 {
3021 	return do_sys_statat(NULL, AT_FDCWD, userpath, nd_flag, sb);
3022 }
3023 
3024 int
3025 do_sys_statat(struct lwp *l, int fdat, const char *userpath,
3026     unsigned int nd_flag, struct stat *sb)
3027 {
3028 	int error;
3029 	struct pathbuf *pb;
3030 	struct nameidata nd;
3031 
3032 	KASSERT(l != NULL || fdat == AT_FDCWD);
3033 
3034 	error = pathbuf_copyin(userpath, &pb);
3035 	if (error) {
3036 		return error;
3037 	}
3038 
3039 	NDINIT(&nd, LOOKUP, nd_flag | LOCKLEAF | TRYEMULROOT, pb);
3040 
3041 	error = fd_nameiat(l, fdat, &nd);
3042 	if (error != 0) {
3043 		pathbuf_destroy(pb);
3044 		return error;
3045 	}
3046 	error = vn_stat(nd.ni_vp, sb);
3047 	vput(nd.ni_vp);
3048 	pathbuf_destroy(pb);
3049 	return error;
3050 }
3051 
3052 /*
3053  * Get file status; this version follows links.
3054  */
3055 /* ARGSUSED */
3056 int
3057 sys___stat50(struct lwp *l, const struct sys___stat50_args *uap, register_t *retval)
3058 {
3059 	/* {
3060 		syscallarg(const char *) path;
3061 		syscallarg(struct stat *) ub;
3062 	} */
3063 	struct stat sb;
3064 	int error;
3065 
3066 	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), FOLLOW, &sb);
3067 	if (error)
3068 		return error;
3069 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3070 }
3071 
3072 /*
3073  * Get file status; this version does not follow links.
3074  */
3075 /* ARGSUSED */
3076 int
3077 sys___lstat50(struct lwp *l, const struct sys___lstat50_args *uap, register_t *retval)
3078 {
3079 	/* {
3080 		syscallarg(const char *) path;
3081 		syscallarg(struct stat *) ub;
3082 	} */
3083 	struct stat sb;
3084 	int error;
3085 
3086 	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), NOFOLLOW, &sb);
3087 	if (error)
3088 		return error;
3089 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3090 }
3091 
3092 int
3093 sys_fstatat(struct lwp *l, const struct sys_fstatat_args *uap,
3094     register_t *retval)
3095 {
3096 	/* {
3097 		syscallarg(int) fd;
3098 		syscallarg(const char *) path;
3099 		syscallarg(struct stat *) buf;
3100 		syscallarg(int) flag;
3101 	} */
3102 	unsigned int nd_flag;
3103 	struct stat sb;
3104 	int error;
3105 
3106 	if (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW)
3107 		nd_flag = NOFOLLOW;
3108 	else
3109 		nd_flag = FOLLOW;
3110 
3111 	error = do_sys_statat(l, SCARG(uap, fd), SCARG(uap, path), nd_flag,
3112 	    &sb);
3113 	if (error)
3114 		return error;
3115 	return copyout(&sb, SCARG(uap, buf), sizeof(sb));
3116 }
3117 
3118 /*
3119  * Get configurable pathname variables.
3120  */
3121 /* ARGSUSED */
3122 int
3123 sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap, register_t *retval)
3124 {
3125 	/* {
3126 		syscallarg(const char *) path;
3127 		syscallarg(int) name;
3128 	} */
3129 	int error;
3130 	struct pathbuf *pb;
3131 	struct nameidata nd;
3132 
3133 	error = pathbuf_copyin(SCARG(uap, path), &pb);
3134 	if (error) {
3135 		return error;
3136 	}
3137 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
3138 	if ((error = namei(&nd)) != 0) {
3139 		pathbuf_destroy(pb);
3140 		return (error);
3141 	}
3142 	error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), retval);
3143 	vput(nd.ni_vp);
3144 	pathbuf_destroy(pb);
3145 	return (error);
3146 }
3147 
3148 /*
3149  * Return target name of a symbolic link.
3150  */
3151 /* ARGSUSED */
3152 int
3153 sys_readlink(struct lwp *l, const struct sys_readlink_args *uap,
3154     register_t *retval)
3155 {
3156 	/* {
3157 		syscallarg(const char *) path;
3158 		syscallarg(char *) buf;
3159 		syscallarg(size_t) count;
3160 	} */
3161 	return do_sys_readlinkat(l, AT_FDCWD, SCARG(uap, path),
3162 	    SCARG(uap, buf), SCARG(uap, count), retval);
3163 }
3164 
3165 static int
3166 do_sys_readlinkat(struct lwp *l, int fdat, const char *path, char *buf,
3167     size_t count, register_t *retval)
3168 {
3169 	struct vnode *vp;
3170 	struct iovec aiov;
3171 	struct uio auio;
3172 	int error;
3173 	struct pathbuf *pb;
3174 	struct nameidata nd;
3175 
3176 	error = pathbuf_copyin(path, &pb);
3177 	if (error) {
3178 		return error;
3179 	}
3180 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb);
3181 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
3182 		pathbuf_destroy(pb);
3183 		return error;
3184 	}
3185 	vp = nd.ni_vp;
3186 	pathbuf_destroy(pb);
3187 	if (vp->v_type != VLNK)
3188 		error = EINVAL;
3189 	else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
3190 	    (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
3191 		aiov.iov_base = buf;
3192 		aiov.iov_len = count;
3193 		auio.uio_iov = &aiov;
3194 		auio.uio_iovcnt = 1;
3195 		auio.uio_offset = 0;
3196 		auio.uio_rw = UIO_READ;
3197 		KASSERT(l == curlwp);
3198 		auio.uio_vmspace = l->l_proc->p_vmspace;
3199 		auio.uio_resid = count;
3200 		if ((error = VOP_READLINK(vp, &auio, l->l_cred)) == 0)
3201 			*retval = count - auio.uio_resid;
3202 	}
3203 	vput(vp);
3204 	return (error);
3205 }
3206 
3207 int
3208 sys_readlinkat(struct lwp *l, const struct sys_readlinkat_args *uap,
3209     register_t *retval)
3210 {
3211 	/* {
3212 		syscallarg(int) fd;
3213 		syscallarg(const char *) path;
3214 		syscallarg(char *) buf;
3215 		syscallarg(size_t) bufsize;
3216 	} */
3217 
3218 	return do_sys_readlinkat(l, SCARG(uap, fd), SCARG(uap, path),
3219 	    SCARG(uap, buf), SCARG(uap, bufsize), retval);
3220 }
3221 
3222 /*
3223  * Change flags of a file given a path name.
3224  */
3225 /* ARGSUSED */
3226 int
3227 sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval)
3228 {
3229 	/* {
3230 		syscallarg(const char *) path;
3231 		syscallarg(u_long) flags;
3232 	} */
3233 	struct vnode *vp;
3234 	int error;
3235 
3236 	error = namei_simple_user(SCARG(uap, path),
3237 				NSM_FOLLOW_TRYEMULROOT, &vp);
3238 	if (error != 0)
3239 		return (error);
3240 	error = change_flags(vp, SCARG(uap, flags), l);
3241 	vput(vp);
3242 	return (error);
3243 }
3244 
3245 /*
3246  * Change flags of a file given a file descriptor.
3247  */
3248 /* ARGSUSED */
3249 int
3250 sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval)
3251 {
3252 	/* {
3253 		syscallarg(int) fd;
3254 		syscallarg(u_long) flags;
3255 	} */
3256 	struct vnode *vp;
3257 	file_t *fp;
3258 	int error;
3259 
3260 	/* fd_getvnode() will use the descriptor for us */
3261 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3262 		return (error);
3263 	vp = fp->f_vnode;
3264 	error = change_flags(vp, SCARG(uap, flags), l);
3265 	VOP_UNLOCK(vp);
3266 	fd_putfile(SCARG(uap, fd));
3267 	return (error);
3268 }
3269 
3270 /*
3271  * Change flags of a file given a path name; this version does
3272  * not follow links.
3273  */
3274 int
3275 sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval)
3276 {
3277 	/* {
3278 		syscallarg(const char *) path;
3279 		syscallarg(u_long) flags;
3280 	} */
3281 	struct vnode *vp;
3282 	int error;
3283 
3284 	error = namei_simple_user(SCARG(uap, path),
3285 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3286 	if (error != 0)
3287 		return (error);
3288 	error = change_flags(vp, SCARG(uap, flags), l);
3289 	vput(vp);
3290 	return (error);
3291 }
3292 
3293 /*
3294  * Common routine to change flags of a file.
3295  */
3296 int
3297 change_flags(struct vnode *vp, u_long flags, struct lwp *l)
3298 {
3299 	struct vattr vattr;
3300 	int error;
3301 
3302 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3303 
3304 	vattr_null(&vattr);
3305 	vattr.va_flags = flags;
3306 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3307 
3308 	return (error);
3309 }
3310 
3311 /*
3312  * Change mode of a file given path name; this version follows links.
3313  */
3314 /* ARGSUSED */
3315 int
3316 sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
3317 {
3318 	/* {
3319 		syscallarg(const char *) path;
3320 		syscallarg(int) mode;
3321 	} */
3322 	return do_sys_chmodat(l, AT_FDCWD, SCARG(uap, path),
3323 			      SCARG(uap, mode), 0);
3324 }
3325 
3326 int
3327 do_sys_chmodat(struct lwp *l, int fdat, const char *path, int mode, int flags)
3328 {
3329 	int error;
3330 	struct vnode *vp;
3331 	namei_simple_flags_t ns_flag;
3332 
3333 	if (flags & AT_SYMLINK_NOFOLLOW)
3334 		ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3335 	else
3336 		ns_flag = NSM_FOLLOW_TRYEMULROOT;
3337 
3338 	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3339 	if (error != 0)
3340 		return error;
3341 
3342 	error = change_mode(vp, mode, l);
3343 
3344 	vrele(vp);
3345 
3346 	return (error);
3347 }
3348 
3349 /*
3350  * Change mode of a file given a file descriptor.
3351  */
3352 /* ARGSUSED */
3353 int
3354 sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval)
3355 {
3356 	/* {
3357 		syscallarg(int) fd;
3358 		syscallarg(int) mode;
3359 	} */
3360 	file_t *fp;
3361 	int error;
3362 
3363 	/* fd_getvnode() will use the descriptor for us */
3364 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3365 		return (error);
3366 	error = change_mode(fp->f_vnode, SCARG(uap, mode), l);
3367 	fd_putfile(SCARG(uap, fd));
3368 	return (error);
3369 }
3370 
3371 int
3372 sys_fchmodat(struct lwp *l, const struct sys_fchmodat_args *uap,
3373     register_t *retval)
3374 {
3375 	/* {
3376 		syscallarg(int) fd;
3377 		syscallarg(const char *) path;
3378 		syscallarg(int) mode;
3379 		syscallarg(int) flag;
3380 	} */
3381 
3382 	return do_sys_chmodat(l, SCARG(uap, fd), SCARG(uap, path),
3383 			      SCARG(uap, mode), SCARG(uap, flag));
3384 }
3385 
3386 /*
3387  * Change mode of a file given path name; this version does not follow links.
3388  */
3389 /* ARGSUSED */
3390 int
3391 sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval)
3392 {
3393 	/* {
3394 		syscallarg(const char *) path;
3395 		syscallarg(int) mode;
3396 	} */
3397 	int error;
3398 	struct vnode *vp;
3399 
3400 	error = namei_simple_user(SCARG(uap, path),
3401 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3402 	if (error != 0)
3403 		return (error);
3404 
3405 	error = change_mode(vp, SCARG(uap, mode), l);
3406 
3407 	vrele(vp);
3408 	return (error);
3409 }
3410 
3411 /*
3412  * Common routine to set mode given a vnode.
3413  */
3414 static int
3415 change_mode(struct vnode *vp, int mode, struct lwp *l)
3416 {
3417 	struct vattr vattr;
3418 	int error;
3419 
3420 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3421 	vattr_null(&vattr);
3422 	vattr.va_mode = mode & ALLPERMS;
3423 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3424 	VOP_UNLOCK(vp);
3425 	return (error);
3426 }
3427 
3428 /*
3429  * Set ownership given a path name; this version follows links.
3430  */
3431 /* ARGSUSED */
3432 int
3433 sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
3434 {
3435 	/* {
3436 		syscallarg(const char *) path;
3437 		syscallarg(uid_t) uid;
3438 		syscallarg(gid_t) gid;
3439 	} */
3440 	return do_sys_chownat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap,uid),
3441 			      SCARG(uap, gid), 0);
3442 }
3443 
3444 int
3445 do_sys_chownat(struct lwp *l, int fdat, const char *path, uid_t uid,
3446    gid_t gid, int flags)
3447 {
3448 	int error;
3449 	struct vnode *vp;
3450 	namei_simple_flags_t ns_flag;
3451 
3452 	if (flags & AT_SYMLINK_NOFOLLOW)
3453 		ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3454 	else
3455 		ns_flag = NSM_FOLLOW_TRYEMULROOT;
3456 
3457 	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3458 	if (error != 0)
3459 		return error;
3460 
3461 	error = change_owner(vp, uid, gid, l, 0);
3462 
3463 	vrele(vp);
3464 
3465 	return (error);
3466 }
3467 
3468 /*
3469  * Set ownership given a path name; this version follows links.
3470  * Provides POSIX semantics.
3471  */
3472 /* ARGSUSED */
3473 int
3474 sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval)
3475 {
3476 	/* {
3477 		syscallarg(const char *) path;
3478 		syscallarg(uid_t) uid;
3479 		syscallarg(gid_t) gid;
3480 	} */
3481 	int error;
3482 	struct vnode *vp;
3483 
3484 	error = namei_simple_user(SCARG(uap, path),
3485 				NSM_FOLLOW_TRYEMULROOT, &vp);
3486 	if (error != 0)
3487 		return (error);
3488 
3489 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3490 
3491 	vrele(vp);
3492 	return (error);
3493 }
3494 
3495 /*
3496  * Set ownership given a file descriptor.
3497  */
3498 /* ARGSUSED */
3499 int
3500 sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval)
3501 {
3502 	/* {
3503 		syscallarg(int) fd;
3504 		syscallarg(uid_t) uid;
3505 		syscallarg(gid_t) gid;
3506 	} */
3507 	int error;
3508 	file_t *fp;
3509 
3510 	/* fd_getvnode() will use the descriptor for us */
3511 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3512 		return (error);
3513 	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3514 	    l, 0);
3515 	fd_putfile(SCARG(uap, fd));
3516 	return (error);
3517 }
3518 
3519 int
3520 sys_fchownat(struct lwp *l, const struct sys_fchownat_args *uap,
3521     register_t *retval)
3522 {
3523 	/* {
3524 		syscallarg(int) fd;
3525 		syscallarg(const char *) path;
3526 		syscallarg(uid_t) owner;
3527 		syscallarg(gid_t) group;
3528 		syscallarg(int) flag;
3529 	} */
3530 
3531 	return do_sys_chownat(l, SCARG(uap, fd), SCARG(uap, path),
3532 			      SCARG(uap, owner), SCARG(uap, group),
3533 			      SCARG(uap, flag));
3534 }
3535 
3536 /*
3537  * Set ownership given a file descriptor, providing POSIX/XPG semantics.
3538  */
3539 /* ARGSUSED */
3540 int
3541 sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval)
3542 {
3543 	/* {
3544 		syscallarg(int) fd;
3545 		syscallarg(uid_t) uid;
3546 		syscallarg(gid_t) gid;
3547 	} */
3548 	int error;
3549 	file_t *fp;
3550 
3551 	/* fd_getvnode() will use the descriptor for us */
3552 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3553 		return (error);
3554 	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3555 	    l, 1);
3556 	fd_putfile(SCARG(uap, fd));
3557 	return (error);
3558 }
3559 
3560 /*
3561  * Set ownership given a path name; this version does not follow links.
3562  */
3563 /* ARGSUSED */
3564 int
3565 sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval)
3566 {
3567 	/* {
3568 		syscallarg(const char *) path;
3569 		syscallarg(uid_t) uid;
3570 		syscallarg(gid_t) gid;
3571 	} */
3572 	int error;
3573 	struct vnode *vp;
3574 
3575 	error = namei_simple_user(SCARG(uap, path),
3576 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3577 	if (error != 0)
3578 		return (error);
3579 
3580 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
3581 
3582 	vrele(vp);
3583 	return (error);
3584 }
3585 
3586 /*
3587  * Set ownership given a path name; this version does not follow links.
3588  * Provides POSIX/XPG semantics.
3589  */
3590 /* ARGSUSED */
3591 int
3592 sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval)
3593 {
3594 	/* {
3595 		syscallarg(const char *) path;
3596 		syscallarg(uid_t) uid;
3597 		syscallarg(gid_t) gid;
3598 	} */
3599 	int error;
3600 	struct vnode *vp;
3601 
3602 	error = namei_simple_user(SCARG(uap, path),
3603 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3604 	if (error != 0)
3605 		return (error);
3606 
3607 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3608 
3609 	vrele(vp);
3610 	return (error);
3611 }
3612 
3613 /*
3614  * Common routine to set ownership given a vnode.
3615  */
3616 static int
3617 change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
3618     int posix_semantics)
3619 {
3620 	struct vattr vattr;
3621 	mode_t newmode;
3622 	int error;
3623 
3624 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3625 	if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
3626 		goto out;
3627 
3628 #define CHANGED(x) ((int)(x) != -1)
3629 	newmode = vattr.va_mode;
3630 	if (posix_semantics) {
3631 		/*
3632 		 * POSIX/XPG semantics: if the caller is not the super-user,
3633 		 * clear set-user-id and set-group-id bits.  Both POSIX and
3634 		 * the XPG consider the behaviour for calls by the super-user
3635 		 * implementation-defined; we leave the set-user-id and set-
3636 		 * group-id settings intact in that case.
3637 		 */
3638 		if (vattr.va_mode & S_ISUID) {
3639 			if (kauth_authorize_vnode(l->l_cred,
3640 			    KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0)
3641 				newmode &= ~S_ISUID;
3642 		}
3643 		if (vattr.va_mode & S_ISGID) {
3644 			if (kauth_authorize_vnode(l->l_cred,
3645 			    KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0)
3646 				newmode &= ~S_ISGID;
3647 		}
3648 	} else {
3649 		/*
3650 		 * NetBSD semantics: when changing owner and/or group,
3651 		 * clear the respective bit(s).
3652 		 */
3653 		if (CHANGED(uid))
3654 			newmode &= ~S_ISUID;
3655 		if (CHANGED(gid))
3656 			newmode &= ~S_ISGID;
3657 	}
3658 	/* Update va_mode iff altered. */
3659 	if (vattr.va_mode == newmode)
3660 		newmode = VNOVAL;
3661 
3662 	vattr_null(&vattr);
3663 	vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
3664 	vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
3665 	vattr.va_mode = newmode;
3666 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3667 #undef CHANGED
3668 
3669 out:
3670 	VOP_UNLOCK(vp);
3671 	return (error);
3672 }
3673 
3674 /*
3675  * Set the access and modification times given a path name; this
3676  * version follows links.
3677  */
3678 /* ARGSUSED */
3679 int
3680 sys___utimes50(struct lwp *l, const struct sys___utimes50_args *uap,
3681     register_t *retval)
3682 {
3683 	/* {
3684 		syscallarg(const char *) path;
3685 		syscallarg(const struct timeval *) tptr;
3686 	} */
3687 
3688 	return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
3689 	    SCARG(uap, tptr), UIO_USERSPACE);
3690 }
3691 
3692 /*
3693  * Set the access and modification times given a file descriptor.
3694  */
3695 /* ARGSUSED */
3696 int
3697 sys___futimes50(struct lwp *l, const struct sys___futimes50_args *uap,
3698     register_t *retval)
3699 {
3700 	/* {
3701 		syscallarg(int) fd;
3702 		syscallarg(const struct timeval *) tptr;
3703 	} */
3704 	int error;
3705 	file_t *fp;
3706 
3707 	/* fd_getvnode() will use the descriptor for us */
3708 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3709 		return (error);
3710 	error = do_sys_utimes(l, fp->f_vnode, NULL, 0, SCARG(uap, tptr),
3711 	    UIO_USERSPACE);
3712 	fd_putfile(SCARG(uap, fd));
3713 	return (error);
3714 }
3715 
3716 int
3717 sys_futimens(struct lwp *l, const struct sys_futimens_args *uap,
3718     register_t *retval)
3719 {
3720 	/* {
3721 		syscallarg(int) fd;
3722 		syscallarg(const struct timespec *) tptr;
3723 	} */
3724 	int error;
3725 	file_t *fp;
3726 
3727 	/* fd_getvnode() will use the descriptor for us */
3728 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3729 		return (error);
3730 	error = do_sys_utimensat(l, AT_FDCWD, fp->f_vnode, NULL, 0,
3731 	    SCARG(uap, tptr), UIO_USERSPACE);
3732 	fd_putfile(SCARG(uap, fd));
3733 	return (error);
3734 }
3735 
3736 /*
3737  * Set the access and modification times given a path name; this
3738  * version does not follow links.
3739  */
3740 int
3741 sys___lutimes50(struct lwp *l, const struct sys___lutimes50_args *uap,
3742     register_t *retval)
3743 {
3744 	/* {
3745 		syscallarg(const char *) path;
3746 		syscallarg(const struct timeval *) tptr;
3747 	} */
3748 
3749 	return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
3750 	    SCARG(uap, tptr), UIO_USERSPACE);
3751 }
3752 
3753 int
3754 sys_utimensat(struct lwp *l, const struct sys_utimensat_args *uap,
3755     register_t *retval)
3756 {
3757 	/* {
3758 		syscallarg(int) fd;
3759 		syscallarg(const char *) path;
3760 		syscallarg(const struct timespec *) tptr;
3761 		syscallarg(int) flag;
3762 	} */
3763 	int follow;
3764 	const struct timespec *tptr;
3765 	int error;
3766 
3767 	tptr = SCARG(uap, tptr);
3768 	follow = (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3769 
3770 	error = do_sys_utimensat(l, SCARG(uap, fd), NULL,
3771 	    SCARG(uap, path), follow, tptr, UIO_USERSPACE);
3772 
3773 	return error;
3774 }
3775 
3776 /*
3777  * Common routine to set access and modification times given a vnode.
3778  */
3779 int
3780 do_sys_utimens(struct lwp *l, struct vnode *vp, const char *path, int flag,
3781     const struct timespec *tptr, enum uio_seg seg)
3782 {
3783 	return do_sys_utimensat(l, AT_FDCWD, vp, path, flag, tptr, seg);
3784 }
3785 
3786 int
3787 do_sys_utimensat(struct lwp *l, int fdat, struct vnode *vp,
3788     const char *path, int flag, const struct timespec *tptr, enum uio_seg seg)
3789 {
3790 	struct vattr vattr;
3791 	int error, dorele = 0;
3792 	namei_simple_flags_t sflags;
3793 	bool vanull, setbirthtime;
3794 	struct timespec ts[2];
3795 
3796 	KASSERT(l != NULL || fdat == AT_FDCWD);
3797 
3798 	/*
3799 	 * I have checked all callers and they pass either FOLLOW,
3800 	 * NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW
3801 	 * is 0. More to the point, they don't pass anything else.
3802 	 * Let's keep it that way at least until the namei interfaces
3803 	 * are fully sanitized.
3804 	 */
3805 	KASSERT(flag == NOFOLLOW || flag == FOLLOW);
3806 	sflags = (flag == FOLLOW) ?
3807 		NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT;
3808 
3809 	if (tptr == NULL) {
3810 		vanull = true;
3811 		nanotime(&ts[0]);
3812 		ts[1] = ts[0];
3813 	} else {
3814 		vanull = false;
3815 		if (seg != UIO_SYSSPACE) {
3816 			error = copyin(tptr, ts, sizeof (ts));
3817 			if (error != 0)
3818 				return error;
3819 		} else {
3820 			ts[0] = tptr[0];
3821 			ts[1] = tptr[1];
3822 		}
3823 	}
3824 
3825 	if (ts[0].tv_nsec == UTIME_NOW) {
3826 		nanotime(&ts[0]);
3827 		if (ts[1].tv_nsec == UTIME_NOW) {
3828 			vanull = true;
3829 			ts[1] = ts[0];
3830 		}
3831 	} else if (ts[1].tv_nsec == UTIME_NOW)
3832 		nanotime(&ts[1]);
3833 
3834 	if (vp == NULL) {
3835 		/* note: SEG describes TPTR, not PATH; PATH is always user */
3836 		error = fd_nameiat_simple_user(l, fdat, path, sflags, &vp);
3837 		if (error != 0)
3838 			return error;
3839 		dorele = 1;
3840 	}
3841 
3842 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3843 	setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 &&
3844 	    timespeccmp(&ts[1], &vattr.va_birthtime, <));
3845 	vattr_null(&vattr);
3846 
3847 	if (ts[0].tv_nsec != UTIME_OMIT)
3848 		vattr.va_atime = ts[0];
3849 
3850 	if (ts[1].tv_nsec != UTIME_OMIT) {
3851 		vattr.va_mtime = ts[1];
3852 		if (setbirthtime)
3853 			vattr.va_birthtime = ts[1];
3854 	}
3855 
3856 	if (vanull)
3857 		vattr.va_vaflags |= VA_UTIMES_NULL;
3858 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3859 	VOP_UNLOCK(vp);
3860 
3861 	if (dorele != 0)
3862 		vrele(vp);
3863 
3864 	return error;
3865 }
3866 
3867 int
3868 do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
3869     const struct timeval *tptr, enum uio_seg seg)
3870 {
3871 	struct timespec ts[2];
3872 	struct timespec *tsptr = NULL;
3873 	int error;
3874 
3875 	if (tptr != NULL) {
3876 		struct timeval tv[2];
3877 
3878 		if (seg != UIO_SYSSPACE) {
3879 			error = copyin(tptr, tv, sizeof (tv));
3880 			if (error != 0)
3881 				return error;
3882 			tptr = tv;
3883 		}
3884 
3885 		if ((tv[0].tv_usec == UTIME_NOW) ||
3886 		    (tv[0].tv_usec == UTIME_OMIT))
3887 			ts[0].tv_nsec = tv[0].tv_usec;
3888 		else
3889 			TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]);
3890 
3891 		if ((tv[1].tv_usec == UTIME_NOW) ||
3892 		    (tv[1].tv_usec == UTIME_OMIT))
3893 			ts[1].tv_nsec = tv[1].tv_usec;
3894 		else
3895 			TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]);
3896 
3897 		tsptr = &ts[0];
3898 	}
3899 
3900 	return do_sys_utimens(l, vp, path, flag, tsptr, UIO_SYSSPACE);
3901 }
3902 
3903 /*
3904  * Truncate a file given its path name.
3905  */
3906 /* ARGSUSED */
3907 int
3908 sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval)
3909 {
3910 	/* {
3911 		syscallarg(const char *) path;
3912 		syscallarg(int) pad;
3913 		syscallarg(off_t) length;
3914 	} */
3915 	struct vnode *vp;
3916 	struct vattr vattr;
3917 	int error;
3918 
3919 	if (SCARG(uap, length) < 0)
3920 		return EINVAL;
3921 
3922 	error = namei_simple_user(SCARG(uap, path),
3923 				NSM_FOLLOW_TRYEMULROOT, &vp);
3924 	if (error != 0)
3925 		return (error);
3926 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3927 	if (vp->v_type == VDIR)
3928 		error = EISDIR;
3929 	else if ((error = vn_writechk(vp)) == 0 &&
3930 	    (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
3931 		vattr_null(&vattr);
3932 		vattr.va_size = SCARG(uap, length);
3933 		error = VOP_SETATTR(vp, &vattr, l->l_cred);
3934 	}
3935 	vput(vp);
3936 	return (error);
3937 }
3938 
3939 /*
3940  * Truncate a file given a file descriptor.
3941  */
3942 /* ARGSUSED */
3943 int
3944 sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval)
3945 {
3946 	/* {
3947 		syscallarg(int) fd;
3948 		syscallarg(int) pad;
3949 		syscallarg(off_t) length;
3950 	} */
3951 	struct vattr vattr;
3952 	struct vnode *vp;
3953 	file_t *fp;
3954 	int error;
3955 
3956 	if (SCARG(uap, length) < 0)
3957 		return EINVAL;
3958 
3959 	/* fd_getvnode() will use the descriptor for us */
3960 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3961 		return (error);
3962 	if ((fp->f_flag & FWRITE) == 0) {
3963 		error = EINVAL;
3964 		goto out;
3965 	}
3966 	vp = fp->f_vnode;
3967 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3968 	if (vp->v_type == VDIR)
3969 		error = EISDIR;
3970 	else if ((error = vn_writechk(vp)) == 0) {
3971 		vattr_null(&vattr);
3972 		vattr.va_size = SCARG(uap, length);
3973 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
3974 	}
3975 	VOP_UNLOCK(vp);
3976  out:
3977 	fd_putfile(SCARG(uap, fd));
3978 	return (error);
3979 }
3980 
3981 /*
3982  * Sync an open file.
3983  */
3984 /* ARGSUSED */
3985 int
3986 sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
3987 {
3988 	/* {
3989 		syscallarg(int) fd;
3990 	} */
3991 	struct vnode *vp;
3992 	file_t *fp;
3993 	int error;
3994 
3995 	/* fd_getvnode() will use the descriptor for us */
3996 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3997 		return (error);
3998 	vp = fp->f_vnode;
3999 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4000 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
4001 	VOP_UNLOCK(vp);
4002 	fd_putfile(SCARG(uap, fd));
4003 	return (error);
4004 }
4005 
4006 /*
4007  * Sync a range of file data.  API modeled after that found in AIX.
4008  *
4009  * FDATASYNC indicates that we need only save enough metadata to be able
4010  * to re-read the written data.  Note we duplicate AIX's requirement that
4011  * the file be open for writing.
4012  */
4013 /* ARGSUSED */
4014 int
4015 sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval)
4016 {
4017 	/* {
4018 		syscallarg(int) fd;
4019 		syscallarg(int) flags;
4020 		syscallarg(off_t) start;
4021 		syscallarg(off_t) length;
4022 	} */
4023 	struct vnode *vp;
4024 	file_t *fp;
4025 	int flags, nflags;
4026 	off_t s, e, len;
4027 	int error;
4028 
4029 	/* fd_getvnode() will use the descriptor for us */
4030 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4031 		return (error);
4032 
4033 	if ((fp->f_flag & FWRITE) == 0) {
4034 		error = EBADF;
4035 		goto out;
4036 	}
4037 
4038 	flags = SCARG(uap, flags);
4039 	if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
4040 	    ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
4041 		error = EINVAL;
4042 		goto out;
4043 	}
4044 	/* Now set up the flags for value(s) to pass to VOP_FSYNC() */
4045 	if (flags & FDATASYNC)
4046 		nflags = FSYNC_DATAONLY | FSYNC_WAIT;
4047 	else
4048 		nflags = FSYNC_WAIT;
4049 	if (flags & FDISKSYNC)
4050 		nflags |= FSYNC_CACHE;
4051 
4052 	len = SCARG(uap, length);
4053 	/* If length == 0, we do the whole file, and s = e = 0 will do that */
4054 	if (len) {
4055 		s = SCARG(uap, start);
4056 		e = s + len;
4057 		if (e < s) {
4058 			error = EINVAL;
4059 			goto out;
4060 		}
4061 	} else {
4062 		e = 0;
4063 		s = 0;
4064 	}
4065 
4066 	vp = fp->f_vnode;
4067 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4068 	error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
4069 	VOP_UNLOCK(vp);
4070 out:
4071 	fd_putfile(SCARG(uap, fd));
4072 	return (error);
4073 }
4074 
4075 /*
4076  * Sync the data of an open file.
4077  */
4078 /* ARGSUSED */
4079 int
4080 sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval)
4081 {
4082 	/* {
4083 		syscallarg(int) fd;
4084 	} */
4085 	struct vnode *vp;
4086 	file_t *fp;
4087 	int error;
4088 
4089 	/* fd_getvnode() will use the descriptor for us */
4090 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4091 		return (error);
4092 	if ((fp->f_flag & FWRITE) == 0) {
4093 		fd_putfile(SCARG(uap, fd));
4094 		return (EBADF);
4095 	}
4096 	vp = fp->f_vnode;
4097 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4098 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
4099 	VOP_UNLOCK(vp);
4100 	fd_putfile(SCARG(uap, fd));
4101 	return (error);
4102 }
4103 
4104 /*
4105  * Rename files, (standard) BSD semantics frontend.
4106  */
4107 /* ARGSUSED */
4108 int
4109 sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval)
4110 {
4111 	/* {
4112 		syscallarg(const char *) from;
4113 		syscallarg(const char *) to;
4114 	} */
4115 
4116 	return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4117 	    SCARG(uap, to), UIO_USERSPACE, 0));
4118 }
4119 
4120 int
4121 sys_renameat(struct lwp *l, const struct sys_renameat_args *uap,
4122     register_t *retval)
4123 {
4124 	/* {
4125 		syscallarg(int) fromfd;
4126 		syscallarg(const char *) from;
4127 		syscallarg(int) tofd;
4128 		syscallarg(const char *) to;
4129 	} */
4130 
4131 	return (do_sys_renameat(l, SCARG(uap, fromfd), SCARG(uap, from),
4132 	    SCARG(uap, tofd), SCARG(uap, to), UIO_USERSPACE, 0));
4133 }
4134 
4135 /*
4136  * Rename files, POSIX semantics frontend.
4137  */
4138 /* ARGSUSED */
4139 int
4140 sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval)
4141 {
4142 	/* {
4143 		syscallarg(const char *) from;
4144 		syscallarg(const char *) to;
4145 	} */
4146 
4147 	return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4148 	    SCARG(uap, to), UIO_USERSPACE, 1));
4149 }
4150 
4151 /*
4152  * Rename files.  Source and destination must either both be directories,
4153  * or both not be directories.  If target is a directory, it must be empty.
4154  * If `from' and `to' refer to the same object, the value of the `retain'
4155  * argument is used to determine whether `from' will be
4156  *
4157  * (retain == 0)	deleted unless `from' and `to' refer to the same
4158  *			object in the file system's name space (BSD).
4159  * (retain == 1)	always retained (POSIX).
4160  *
4161  * XXX Synchronize with nfsrv_rename in nfs_serv.c.
4162  */
4163 int
4164 do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
4165 {
4166 	return do_sys_renameat(NULL, AT_FDCWD, from, AT_FDCWD, to, seg, retain);
4167 }
4168 
/*
 * Worker for the rename family.
 *
 * `from' is looked up relative to `fromfd' and `to' relative to `tofd';
 * both strings live in the address space described by `seg'.  l may be
 * NULL only when both descriptors are AT_FDCWD (asserted below).
 * `retain' selects POSIX (non-zero) or BSD (zero) behaviour when the
 * source and target vnodes are the same.
 *
 * Outline:
 *   1. copy in both paths;
 *   2. look up `from' (DELETE) and `to' (RENAME), unlocking each parent
 *      directory again straight afterwards;
 *   3. refuse `.' and `..' as the last component of either name (EINVAL);
 *   4. require both parents to be on the same, still-mounted file system
 *      (ENOENT / EXDEV);
 *   5. take the per-mount rename lock, lock tdvp and relookup `to';
 *   6. run the directory/non-directory, fvp == tdvp and fvp == tvp
 *      checks, plus the veriexec check when configured;
 *   7. call VOP_RENAME(), which consumes the vnode references and locks.
 *
 * The abortN/outN labels at the bottom undo the steps above in reverse
 * order; each label releases exactly what had been acquired when the
 * corresponding failure was detected.
 */
static int
do_sys_renameat(struct lwp *l, int fromfd, const char *from, int tofd,
    const char *to, enum uio_seg seg, int retain)
{
	struct pathbuf *fpb, *tpb;
	struct nameidata fnd, tnd;
	struct vnode *fdvp, *fvp;
	struct vnode *tdvp, *tvp;
	struct mount *mp, *tmp;
	int error;

	KASSERT(l != NULL || (fromfd == AT_FDCWD && tofd == AT_FDCWD));

	error = pathbuf_maybe_copyin(from, seg, &fpb);
	if (error)
		goto out0;
	KASSERT(fpb != NULL);

	error = pathbuf_maybe_copyin(to, seg, &tpb);
	if (error)
		goto out1;
	KASSERT(tpb != NULL);

	/*
	 * Lookup from.
	 *
	 * XXX LOCKPARENT is wrong because we don't actually want it
	 * locked yet, but (a) namei is insane, and (b) VOP_RENAME is
	 * insane, so for the time being we need to leave it like this.
	 */
	NDINIT(&fnd, DELETE, (LOCKPARENT | TRYEMULROOT | INRENAME), fpb);
	if ((error = fd_nameiat(l, fromfd, &fnd)) != 0)
		goto out2;

	/*
	 * Pull out the important results of the lookup, fdvp and fvp.
	 * Of course, fvp is bogus because we're about to unlock fdvp.
	 */
	fdvp = fnd.ni_dvp;
	fvp = fnd.ni_vp;
	KASSERT(fdvp != NULL);
	KASSERT(fvp != NULL);
	KASSERT((fdvp == fvp) || (VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE));

	/*
	 * Make sure neither fdvp nor fvp is locked.
	 */
	if (fdvp != fvp)
		VOP_UNLOCK(fdvp);
	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */

	/*
	 * Reject renaming `.' and `..'.  Can't do this until after
	 * namei because we need namei's parsing to find the final
	 * component name.  (namei should just leave us with the final
	 * component name and not look it up itself, but anyway...)
	 *
	 * This was here before because we used to relookup from
	 * instead of to and relookup requires the caller to check
	 * this, but now file systems may depend on this check, so we
	 * must retain it until the file systems are all rototilled.
	 */
	if (((fnd.ni_cnd.cn_namelen == 1) &&
		(fnd.ni_cnd.cn_nameptr[0] == '.')) ||
	    ((fnd.ni_cnd.cn_namelen == 2) &&
		(fnd.ni_cnd.cn_nameptr[0] == '.') &&
		(fnd.ni_cnd.cn_nameptr[1] == '.'))) {
		error = EINVAL;	/* XXX EISDIR?  */
		goto abort0;
	}

	/*
	 * Lookup to.
	 *
	 * XXX LOCKPARENT is wrong, but...insanity, &c.  Also, using
	 * fvp here to decide whether to add CREATEDIR is a load of
	 * bollocks because fvp might be the wrong node by now, since
	 * fdvp is unlocked.
	 *
	 * XXX Why not pass CREATEDIR always?
	 */
	NDINIT(&tnd, RENAME,
	    (LOCKPARENT | NOCACHE | TRYEMULROOT | INRENAME |
		((fvp->v_type == VDIR)? CREATEDIR : 0)),
	    tpb);
	if ((error = fd_nameiat(l, tofd, &tnd)) != 0)
		goto abort0;

	/*
	 * Pull out the important results of the lookup, tdvp and tvp.
	 * Of course, tvp is bogus because we're about to unlock tdvp.
	 */
	tdvp = tnd.ni_dvp;
	tvp = tnd.ni_vp;
	KASSERT(tdvp != NULL);
	KASSERT((tdvp == tvp) || (VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE));

	/*
	 * Make sure neither tdvp nor tvp is locked.
	 */
	if (tdvp != tvp)
		VOP_UNLOCK(tdvp);
	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */

	/*
	 * Reject renaming onto `.' or `..'.  relookup is unhappy with
	 * these, which is why we must do this here.  Once upon a time
	 * we relooked up from instead of to, and consequently didn't
	 * need this check, but now that we relookup to instead of
	 * from, we need this; and we shall need it forever forward
	 * until the VOP_RENAME protocol changes, because file systems
	 * will no doubt begin to depend on this check.
	 */
	if (((tnd.ni_cnd.cn_namelen == 1) &&
		(tnd.ni_cnd.cn_nameptr[0] == '.')) ||
	    ((tnd.ni_cnd.cn_namelen == 2) &&
		(tnd.ni_cnd.cn_nameptr[0] == '.') &&
		(tnd.ni_cnd.cn_nameptr[1] == '.'))) {
		error = EINVAL;	/* XXX EISDIR?  */
		goto abort1;
	}

	/*
	 * Get the mount point.  If the file system has been unmounted,
	 * which it may be because we're not holding any vnode locks,
	 * then v_mount will be NULL.  We're not really supposed to
	 * read v_mount without holding the vnode lock, but since we
	 * have fdvp referenced, if fdvp->v_mount changes then at worst
	 * it will be set to NULL, not changed to another mount point.
	 * And, of course, since it is up to the file system to
	 * determine the real lock order, we can't lock both fdvp and
	 * tdvp at the same time.
	 */
	mp = fdvp->v_mount;
	if (mp == NULL) {
		error = ENOENT;
		goto abort1;
	}

	/*
	 * Make sure the mount points match.  Again, although we don't
	 * hold any vnode locks, the v_mount fields may change -- but
	 * at worst they will change to NULL, so this will never become
	 * a cross-device rename, because we hold vnode references.
	 *
	 * XXX Because nothing is locked and the compiler may reorder
	 * things here, unmounting the file system at an inopportune
	 * moment may cause rename to fail with EXDEV when it really
	 * should fail with ENOENT.
	 */
	tmp = tdvp->v_mount;
	if (tmp == NULL) {
		error = ENOENT;
		goto abort1;
	}

	if (mp != tmp) {
		error = EXDEV;
		goto abort1;
	}

	/*
	 * Take the vfs rename lock to avoid cross-directory screw cases.
	 * Nothing is locked currently, so taking this lock is safe.
	 */
	error = VFS_RENAMELOCK_ENTER(mp);
	if (error)
		goto abort1;

	/*
	 * Now fdvp, fvp, tdvp, and (if nonnull) tvp are referenced,
	 * and nothing is locked except for the vfs rename lock.
	 *
	 * The next step is a little rain dance to conform to the
	 * insane lock protocol, even though it does nothing to ward
	 * off race conditions.
	 *
	 * We need tdvp and tvp to be locked.  However, because we have
	 * unlocked tdvp in order to hold no locks while we take the
	 * vfs rename lock, tvp may be wrong here, and we can't safely
	 * lock it even if the sensible file systems will just unlock
	 * it straight away.  Consequently, we must lock tdvp and then
	 * relookup tvp to get it locked.
	 *
	 * Finally, because the VOP_RENAME protocol is brain-damaged
	 * and various file systems insanely depend on the semantics of
	 * this brain damage, the lookup of to must be the last lookup
	 * before VOP_RENAME.
	 */
	vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
	error = relookup(tdvp, &tnd.ni_vp, &tnd.ni_cnd, 0);
	if (error)
		goto abort2;

	/*
	 * Drop the old tvp and pick up the new one -- which might be
	 * the same, but that doesn't matter to us.  After this, tdvp
	 * and tvp should both be locked.
	 */
	if (tvp != NULL)
		vrele(tvp);
	tvp = tnd.ni_vp;
	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));

	/*
	 * The old do_sys_rename had various consistency checks here
	 * involving fvp and tvp.  fvp is bogus already here, and tvp
	 * will become bogus soon in any sensible file system, so the
	 * only purpose in putting these checks here is to give lip
	 * service to these screw cases and to acknowledge that they
	 * exist, not actually to handle them, but here you go
	 * anyway...
	 */

	/*
	 * Acknowledge that directories and non-directories aren't
	 * supposed to mix.
	 */
	if (tvp != NULL) {
		if ((fvp->v_type == VDIR) && (tvp->v_type != VDIR)) {
			error = ENOTDIR;
			goto abort3;
		} else if ((fvp->v_type != VDIR) && (tvp->v_type == VDIR)) {
			error = EISDIR;
			goto abort3;
		}
	}

	/*
	 * Acknowledge some random screw case, among the dozens that
	 * might arise.
	 */
	if (fvp == tdvp) {
		error = EINVAL;
		goto abort3;
	}

	/*
	 * Acknowledge that POSIX has a wacky screw case.
	 *
	 * XXX Eventually the retain flag needs to be passed on to
	 * VOP_RENAME.
	 */
	if (fvp == tvp) {
		if (retain) {
			/* POSIX: same object, succeed without doing anything. */
			error = 0;
			goto abort3;
		} else if ((fdvp == tdvp) &&
		    (fnd.ni_cnd.cn_namelen == tnd.ni_cnd.cn_namelen) &&
		    (0 == memcmp(fnd.ni_cnd.cn_nameptr, tnd.ni_cnd.cn_nameptr,
			fnd.ni_cnd.cn_namelen))) {
			/* BSD: same name in the same directory, nothing to do. */
			error = 0;
			goto abort3;
		}
	}

	/*
	 * Make sure veriexec can screw us up.  (But a race can screw
	 * up veriexec, of course -- remember, fvp and (soon) tvp are
	 * bogus.)
	 */
#if NVERIEXEC > 0
	{
		char *f1, *f2;
		size_t f1_len;
		size_t f2_len;

		/* NUL-terminated copies of the two final component names. */
		f1_len = fnd.ni_cnd.cn_namelen + 1;
		f1 = kmem_alloc(f1_len, KM_SLEEP);
		strlcpy(f1, fnd.ni_cnd.cn_nameptr, f1_len);

		f2_len = tnd.ni_cnd.cn_namelen + 1;
		f2 = kmem_alloc(f2_len, KM_SLEEP);
		strlcpy(f2, tnd.ni_cnd.cn_nameptr, f2_len);

		error = veriexec_renamechk(curlwp, fvp, f1, tvp, f2);

		kmem_free(f1, f1_len);
		kmem_free(f2, f2_len);

		if (error)
			goto abort3;
	}
#endif /* NVERIEXEC > 0 */

	/*
	 * All ready.  Incant the rename vop.
	 */
	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
	error = VOP_RENAME(fdvp, fvp, &fnd.ni_cnd, tdvp, tvp, &tnd.ni_cnd);

	/*
	 * VOP_RENAME releases fdvp, fvp, tdvp, and tvp, and unlocks
	 * tdvp and tvp.  But we can't assert any of that.
	 */
	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */

	/*
	 * So all we have left to do is to drop the rename lock and
	 * destroy the pathbufs.
	 */
	VFS_RENAMELOCK_EXIT(mp);
	goto out2;

	/*
	 * Failure unwinding, innermost first:
	 *   abort3: tvp (if any, and distinct from tdvp) is locked
	 *   abort2: tdvp is locked and the rename lock is held
	 *   abort1: the `to' lookup succeeded; tdvp/tvp are referenced
	 *   abort0: the `from' lookup succeeded; fdvp/fvp are referenced
	 *   out2:   both pathbufs exist
	 *   out1:   only the `from' pathbuf exists
	 */
abort3:	if ((tvp != NULL) && (tvp != tdvp))
		VOP_UNLOCK(tvp);
abort2:	VOP_UNLOCK(tdvp);
	VFS_RENAMELOCK_EXIT(mp);
abort1:	VOP_ABORTOP(tdvp, &tnd.ni_cnd);
	vrele(tdvp);
	if (tvp != NULL)
		vrele(tvp);
abort0:	VOP_ABORTOP(fdvp, &fnd.ni_cnd);
	vrele(fdvp);
	vrele(fvp);
out2:	pathbuf_destroy(tpb);
out1:	pathbuf_destroy(fpb);
out0:	return error;
}
4497 
4498 /*
4499  * Make a directory file.
4500  */
4501 /* ARGSUSED */
4502 int
4503 sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
4504 {
4505 	/* {
4506 		syscallarg(const char *) path;
4507 		syscallarg(int) mode;
4508 	} */
4509 
4510 	return do_sys_mkdirat(l, AT_FDCWD, SCARG(uap, path),
4511 	    SCARG(uap, mode), UIO_USERSPACE);
4512 }
4513 
4514 int
4515 sys_mkdirat(struct lwp *l, const struct sys_mkdirat_args *uap,
4516     register_t *retval)
4517 {
4518 	/* {
4519 		syscallarg(int) fd;
4520 		syscallarg(const char *) path;
4521 		syscallarg(int) mode;
4522 	} */
4523 
4524 	return do_sys_mkdirat(l, SCARG(uap, fd), SCARG(uap, path),
4525 	    SCARG(uap, mode), UIO_USERSPACE);
4526 }
4527 
4528 
4529 int
4530 do_sys_mkdir(const char *path, mode_t mode, enum uio_seg seg)
4531 {
4532 	return do_sys_mkdirat(NULL, AT_FDCWD, path, mode, UIO_USERSPACE);
4533 }
4534 
4535 static int
4536 do_sys_mkdirat(struct lwp *l, int fdat, const char *path, mode_t mode,
4537     enum uio_seg seg)
4538 {
4539 	struct proc *p = curlwp->l_proc;
4540 	struct vnode *vp;
4541 	struct vattr vattr;
4542 	int error;
4543 	struct pathbuf *pb;
4544 	struct nameidata nd;
4545 
4546 	KASSERT(l != NULL || fdat == AT_FDCWD);
4547 
4548 	/* XXX bollocks, should pass in a pathbuf */
4549 	error = pathbuf_maybe_copyin(path, seg, &pb);
4550 	if (error) {
4551 		return error;
4552 	}
4553 
4554 	NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT, pb);
4555 
4556 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
4557 		pathbuf_destroy(pb);
4558 		return (error);
4559 	}
4560 	vp = nd.ni_vp;
4561 	if (vp != NULL) {
4562 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
4563 		if (nd.ni_dvp == vp)
4564 			vrele(nd.ni_dvp);
4565 		else
4566 			vput(nd.ni_dvp);
4567 		vrele(vp);
4568 		pathbuf_destroy(pb);
4569 		return (EEXIST);
4570 	}
4571 	vattr_null(&vattr);
4572 	vattr.va_type = VDIR;
4573 	/* We will read cwdi->cwdi_cmask unlocked. */
4574 	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
4575 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
4576 	if (!error)
4577 		vrele(nd.ni_vp);
4578 	vput(nd.ni_dvp);
4579 	pathbuf_destroy(pb);
4580 	return (error);
4581 }
4582 
4583 /*
4584  * Remove a directory file.
4585  */
4586 /* ARGSUSED */
4587 int
4588 sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
4589 {
4590 	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path),
4591 	    AT_REMOVEDIR, UIO_USERSPACE);
4592 }
4593 
4594 /*
4595  * Read a block of directory entries in a file system independent format.
4596  */
4597 int
4598 sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval)
4599 {
4600 	/* {
4601 		syscallarg(int) fd;
4602 		syscallarg(char *) buf;
4603 		syscallarg(size_t) count;
4604 	} */
4605 	file_t *fp;
4606 	int error, done;
4607 
4608 	/* fd_getvnode() will use the descriptor for us */
4609 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4610 		return (error);
4611 	if ((fp->f_flag & FREAD) == 0) {
4612 		error = EBADF;
4613 		goto out;
4614 	}
4615 	error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
4616 			SCARG(uap, count), &done, l, 0, 0);
4617 	ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
4618 	*retval = done;
4619  out:
4620 	fd_putfile(SCARG(uap, fd));
4621 	return (error);
4622 }
4623 
4624 /*
4625  * Set the mode mask for creation of filesystem nodes.
4626  */
4627 int
4628 sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
4629 {
4630 	/* {
4631 		syscallarg(mode_t) newmask;
4632 	} */
4633 	struct proc *p = l->l_proc;
4634 	struct cwdinfo *cwdi;
4635 
4636 	/*
4637 	 * cwdi->cwdi_cmask will be read unlocked elsewhere.  What's
4638 	 * important is that we serialize changes to the mask.  The
4639 	 * rw_exit() will issue a write memory barrier on our behalf,
4640 	 * and force the changes out to other CPUs (as it must use an
4641 	 * atomic operation, draining the local CPU's store buffers).
4642 	 */
4643 	cwdi = p->p_cwdi;
4644 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
4645 	*retval = cwdi->cwdi_cmask;
4646 	cwdi->cwdi_cmask = SCARG(uap, newmask) & ALLPERMS;
4647 	rw_exit(&cwdi->cwdi_lock);
4648 
4649 	return (0);
4650 }
4651 
4652 int
4653 dorevoke(struct vnode *vp, kauth_cred_t cred)
4654 {
4655 	struct vattr vattr;
4656 	int error, fs_decision;
4657 
4658 	vn_lock(vp, LK_SHARED | LK_RETRY);
4659 	error = VOP_GETATTR(vp, &vattr, cred);
4660 	VOP_UNLOCK(vp);
4661 	if (error != 0)
4662 		return error;
4663 	fs_decision = (kauth_cred_geteuid(cred) == vattr.va_uid) ? 0 : EPERM;
4664 	error = kauth_authorize_vnode(cred, KAUTH_VNODE_REVOKE, vp, NULL,
4665 	    fs_decision);
4666 	if (!error)
4667 		VOP_REVOKE(vp, REVOKEALL);
4668 	return (error);
4669 }
4670 
4671 /*
4672  * Void all references to file by ripping underlying filesystem
4673  * away from vnode.
4674  */
4675 /* ARGSUSED */
4676 int
4677 sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval)
4678 {
4679 	/* {
4680 		syscallarg(const char *) path;
4681 	} */
4682 	struct vnode *vp;
4683 	int error;
4684 
4685 	error = namei_simple_user(SCARG(uap, path),
4686 				NSM_FOLLOW_TRYEMULROOT, &vp);
4687 	if (error != 0)
4688 		return (error);
4689 	error = dorevoke(vp, l->l_cred);
4690 	vrele(vp);
4691 	return (error);
4692 }
4693 
4694 /*
4695  * Allocate backing store for a file, filling a hole without having to
4696  * explicitly write anything out.
4697  */
4698 /* ARGSUSED */
4699 int
4700 sys_posix_fallocate(struct lwp *l, const struct sys_posix_fallocate_args *uap,
4701 		register_t *retval)
4702 {
4703 	/* {
4704 		syscallarg(int) fd;
4705 		syscallarg(off_t) pos;
4706 		syscallarg(off_t) len;
4707 	} */
4708 	int fd;
4709 	off_t pos, len;
4710 	struct file *fp;
4711 	struct vnode *vp;
4712 	int error;
4713 
4714 	fd = SCARG(uap, fd);
4715 	pos = SCARG(uap, pos);
4716 	len = SCARG(uap, len);
4717 
4718 	if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
4719 		return EINVAL;
4720 	}
4721 
4722 	error = fd_getvnode(fd, &fp);
4723 	if (error) {
4724 		return error;
4725 	}
4726 	if ((fp->f_flag & FWRITE) == 0) {
4727 		error = EBADF;
4728 		goto fail;
4729 	}
4730 	vp = fp->f_vnode;
4731 
4732 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4733 	if (vp->v_type == VDIR) {
4734 		error = EISDIR;
4735 	} else {
4736 		error = VOP_FALLOCATE(vp, pos, len);
4737 	}
4738 	VOP_UNLOCK(vp);
4739 
4740 fail:
4741 	fd_putfile(fd);
4742 	return error;
4743 }
4744 
4745 /*
4746  * Deallocate backing store for a file, creating a hole. Also used for
4747  * invoking TRIM on disks.
4748  */
4749 /* ARGSUSED */
4750 int
4751 sys_fdiscard(struct lwp *l, const struct sys_fdiscard_args *uap,
4752 		register_t *retval)
4753 {
4754 	/* {
4755 		syscallarg(int) fd;
4756 		syscallarg(off_t) pos;
4757 		syscallarg(off_t) len;
4758 	} */
4759 	int fd;
4760 	off_t pos, len;
4761 	struct file *fp;
4762 	struct vnode *vp;
4763 	int error;
4764 
4765 	fd = SCARG(uap, fd);
4766 	pos = SCARG(uap, pos);
4767 	len = SCARG(uap, len);
4768 
4769 	if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
4770 		return EINVAL;
4771 	}
4772 
4773 	error = fd_getvnode(fd, &fp);
4774 	if (error) {
4775 		return error;
4776 	}
4777 	if ((fp->f_flag & FWRITE) == 0) {
4778 		error = EBADF;
4779 		goto fail;
4780 	}
4781 	vp = fp->f_vnode;
4782 
4783 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4784 	if (vp->v_type == VDIR) {
4785 		error = EISDIR;
4786 	} else {
4787 		error = VOP_FDISCARD(vp, pos, len);
4788 	}
4789 	VOP_UNLOCK(vp);
4790 
4791 fail:
4792 	fd_putfile(fd);
4793 	return error;
4794 }
4795