xref: /netbsd-src/sys/kern/vfs_syscalls.c (revision 181254a7b1bdde6873432bffef2d2decc4b5c22f)
1 /*	$NetBSD: vfs_syscalls.c,v 1.548 2020/05/16 18:31:50 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1989, 1993
34  *	The Regents of the University of California.  All rights reserved.
35  * (c) UNIX System Laboratories, Inc.
36  * All or some portions of this file are derived from material licensed
37  * to the University of California by American Telephone and Telegraph
38  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39  * the permission of UNIX System Laboratories, Inc.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.42 (Berkeley) 7/31/95
66  */
67 
68 /*
69  * Virtual File System System Calls
70  */
71 
72 #include <sys/cdefs.h>
73 __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.548 2020/05/16 18:31:50 christos Exp $");
74 
75 #ifdef _KERNEL_OPT
76 #include "opt_fileassoc.h"
77 #include "veriexec.h"
78 #endif
79 
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/namei.h>
83 #include <sys/filedesc.h>
84 #include <sys/kernel.h>
85 #include <sys/file.h>
86 #include <sys/fcntl.h>
87 #include <sys/stat.h>
88 #include <sys/vnode.h>
89 #include <sys/mount.h>
90 #include <sys/fstrans.h>
91 #include <sys/proc.h>
92 #include <sys/uio.h>
93 #include <sys/kmem.h>
94 #include <sys/dirent.h>
95 #include <sys/sysctl.h>
96 #include <sys/syscallargs.h>
97 #include <sys/vfs_syscalls.h>
98 #include <sys/quota.h>
99 #include <sys/quotactl.h>
100 #include <sys/ktrace.h>
101 #ifdef FILEASSOC
102 #include <sys/fileassoc.h>
103 #endif /* FILEASSOC */
104 #include <sys/extattr.h>
105 #include <sys/verified_exec.h>
106 #include <sys/kauth.h>
107 #include <sys/atomic.h>
108 #include <sys/module.h>
109 #include <sys/buf.h>
110 #include <sys/event.h>
111 #include <sys/compat_stub.h>
112 
113 #include <miscfs/genfs/genfs.h>
114 #include <miscfs/specfs/specdev.h>
115 
116 #include <nfs/rpcv2.h>
117 #include <nfs/nfsproto.h>
118 #include <nfs/nfs.h>
119 #include <nfs/nfs_var.h>
120 
121 /* XXX this shouldn't be here */
122 #ifndef OFF_T_MAX
123 #define OFF_T_MAX __type_max(off_t)
124 #endif
125 
126 static int change_flags(struct vnode *, u_long, struct lwp *);
127 static int change_mode(struct vnode *, int, struct lwp *);
128 static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
129 static int do_sys_openat(lwp_t *, int, const char *, int, int, int *);
130 static int do_sys_mkdirat(struct lwp *l, int, const char *, mode_t,
131     enum uio_seg);
132 static int do_sys_mkfifoat(struct lwp *, int, const char *, mode_t);
133 static int do_sys_symlinkat(struct lwp *, const char *, int, const char *,
134     enum uio_seg);
135 static int do_sys_renameat(struct lwp *l, int, const char *, int, const char *,
136     enum uio_seg, int);
137 static int do_sys_readlinkat(struct lwp *, int, const char *, char *,
138     size_t, register_t *);
139 static int do_sys_unlinkat(struct lwp *, int, const char *, int, enum uio_seg);
140 
141 static int fd_nameiat(struct lwp *, int, struct nameidata *);
142 static int fd_nameiat_simple_user(struct lwp *, int, const char *,
143     namei_simple_flags_t, struct vnode **);
144 
145 /*
146  * This table is used to maintain compatibility with 4.3BSD
147  * and NetBSD 0.9 mount syscalls - and possibly other systems.
148  * Note, the order is important!
149  *
150  * Do not modify this table. It should only contain filesystems
151  * supported by NetBSD 0.9 and 4.3BSD.
152  */
153 const char * const mountcompatnames[] = {
154 	NULL,		/* 0 = MOUNT_NONE */
155 	MOUNT_FFS,	/* 1 = MOUNT_UFS */
156 	MOUNT_NFS,	/* 2 */
157 	MOUNT_MFS,	/* 3 */
158 	MOUNT_MSDOS,	/* 4 */
159 	MOUNT_CD9660,	/* 5 = MOUNT_ISOFS */
160 	MOUNT_FDESC,	/* 6 */
161 	MOUNT_KERNFS,	/* 7 */
162 	NULL,		/* 8 = MOUNT_DEVFS */
163 	MOUNT_AFS,	/* 9 */
164 };
165 
166 const u_int nmountcompatnames = __arraycount(mountcompatnames);
167 
168 static int
169 fd_nameiat(struct lwp *l, int fdat, struct nameidata *ndp)
170 {
171 	file_t *dfp;
172 	int error;
173 
174 	if (fdat != AT_FDCWD) {
175 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
176 			goto out;
177 
178 		NDAT(ndp, dfp->f_vnode);
179 	}
180 
181 	error = namei(ndp);
182 
183 	if (fdat != AT_FDCWD)
184 		fd_putfile(fdat);
185 out:
186 	return error;
187 }
188 
189 static int
190 fd_nameiat_simple_user(struct lwp *l, int fdat, const char *path,
191     namei_simple_flags_t sflags, struct vnode **vp_ret)
192 {
193 	file_t *dfp;
194 	struct vnode *dvp;
195 	int error;
196 
197 	if (fdat != AT_FDCWD) {
198 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
199 			goto out;
200 
201 		dvp = dfp->f_vnode;
202 	} else {
203 		dvp = NULL;
204 	}
205 
206 	error = nameiat_simple_user(dvp, path, sflags, vp_ret);
207 
208 	if (fdat != AT_FDCWD)
209 		fd_putfile(fdat);
210 out:
211 	return error;
212 }
213 
214 static int
215 open_setfp(struct lwp *l, file_t *fp, struct vnode *vp, int indx, int flags)
216 {
217 	int error;
218 
219 	fp->f_flag = flags & FMASK;
220 	fp->f_type = DTYPE_VNODE;
221 	fp->f_ops = &vnops;
222 	fp->f_vnode = vp;
223 
224 	if (flags & (O_EXLOCK | O_SHLOCK)) {
225 		struct flock lf;
226 		int type;
227 
228 		lf.l_whence = SEEK_SET;
229 		lf.l_start = 0;
230 		lf.l_len = 0;
231 		if (flags & O_EXLOCK)
232 			lf.l_type = F_WRLCK;
233 		else
234 			lf.l_type = F_RDLCK;
235 		type = F_FLOCK;
236 		if ((flags & FNONBLOCK) == 0)
237 			type |= F_WAIT;
238 		VOP_UNLOCK(vp);
239 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
240 		if (error) {
241 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
242 			fd_abort(l->l_proc, fp, indx);
243 			return error;
244 		}
245 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
246 		atomic_or_uint(&fp->f_flag, FHASLOCK);
247 	}
248 	if (flags & O_CLOEXEC)
249 		fd_set_exclose(l, indx, true);
250 	return 0;
251 }
252 
253 static int
254 mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
255     void *data, size_t *data_len)
256 {
257 	struct mount *mp;
258 	int error = 0, saved_flags;
259 
260 	mp = vp->v_mount;
261 	saved_flags = mp->mnt_flag;
262 
263 	/* We can operate only on VV_ROOT nodes. */
264 	if ((vp->v_vflag & VV_ROOT) == 0) {
265 		error = EINVAL;
266 		goto out;
267 	}
268 
269 	/*
270 	 * We only allow the filesystem to be reloaded if it
271 	 * is currently mounted read-only.  Additionally, we
272 	 * prevent read-write to read-only downgrades.
273 	 */
274 	if ((flags & (MNT_RELOAD | MNT_RDONLY)) != 0 &&
275 	    (mp->mnt_flag & MNT_RDONLY) == 0 &&
276 	    (mp->mnt_iflag & IMNT_CAN_RWTORO) == 0) {
277 		error = EOPNOTSUPP;	/* Needs translation */
278 		goto out;
279 	}
280 
281 	/*
282 	 * Enabling MNT_UNION requires a covered mountpoint and
283 	 * must not happen on the root mount.
284 	 */
285 	if ((flags & MNT_UNION) != 0 && mp->mnt_vnodecovered == NULLVP) {
286 		error = EOPNOTSUPP;
287 		goto out;
288 	}
289 
290 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
291 	    KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
292 	if (error)
293 		goto out;
294 
295 	error = vfs_suspend(mp, 0);
296 	if (error)
297 		goto out;
298 
299 	mutex_enter(mp->mnt_updating);
300 
301 	mp->mnt_flag &= ~MNT_OP_FLAGS;
302 	mp->mnt_flag |= flags & MNT_OP_FLAGS;
303 
304 	/*
305 	 * Set the mount level flags.
306 	 */
307 	if ((flags & MNT_RDONLY) != (mp->mnt_flag & MNT_RDONLY)) {
308 		if ((flags & MNT_RDONLY))
309 			mp->mnt_iflag |= IMNT_WANTRDONLY;
310 		else
311 			mp->mnt_iflag |= IMNT_WANTRDWR;
312 	}
313 	mp->mnt_flag &= ~MNT_BASIC_FLAGS;
314 	mp->mnt_flag |= flags & MNT_BASIC_FLAGS;
315 	if ((mp->mnt_iflag & IMNT_WANTRDONLY))
316 		mp->mnt_flag &= ~MNT_RDONLY;
317 
318 	error = VFS_MOUNT(mp, path, data, data_len);
319 
320 	if (error && data != NULL) {
321 		int error2;
322 
323 		/*
324 		 * Update failed; let's try and see if it was an
325 		 * export request.  For compat with 3.0 and earlier.
326 		 */
327 		error2 = vfs_hooks_reexport(mp, path, data);
328 
329 		/*
330 		 * Only update error code if the export request was
331 		 * understood but some problem occurred while
332 		 * processing it.
333 		 */
334 		if (error2 != EJUSTRETURN)
335 			error = error2;
336 	}
337 
338 	if (error == 0 && (mp->mnt_iflag & IMNT_WANTRDONLY))
339 		mp->mnt_flag |= MNT_RDONLY;
340 	if (error)
341 		mp->mnt_flag = saved_flags;
342 	mp->mnt_flag &= ~MNT_OP_FLAGS;
343 	mp->mnt_iflag &= ~(IMNT_WANTRDONLY | IMNT_WANTRDWR);
344 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
345 		if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0)
346 			vfs_syncer_add_to_worklist(mp);
347 	} else {
348 		if ((mp->mnt_iflag & IMNT_ONWORKLIST) != 0)
349 			vfs_syncer_remove_from_worklist(mp);
350 	}
351 	mutex_exit(mp->mnt_updating);
352 	vfs_resume(mp);
353 
354 	if ((error == 0) && !(saved_flags & MNT_EXTATTR) &&
355 	    (flags & MNT_EXTATTR)) {
356 		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_START,
357 				   NULL, 0, NULL) != 0) {
358 			printf("%s: failed to start extattr, error = %d",
359 			       mp->mnt_stat.f_mntonname, error);
360 			mp->mnt_flag &= ~MNT_EXTATTR;
361 		}
362 	}
363 
364 	if ((error == 0) && (saved_flags & MNT_EXTATTR) &&
365 	    !(flags & MNT_EXTATTR)) {
366 		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_STOP,
367 				   NULL, 0, NULL) != 0) {
368 			printf("%s: failed to stop extattr, error = %d",
369 			       mp->mnt_stat.f_mntonname, error);
370 			mp->mnt_flag |= MNT_RDONLY;
371 		}
372 	}
373  out:
374 	return (error);
375 }
376 
377 static int
378 mount_get_vfsops(const char *fstype, enum uio_seg type_seg,
379     struct vfsops **vfsops)
380 {
381 	char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
382 	int error;
383 
384 	if (type_seg == UIO_USERSPACE) {
385 		/* Copy file-system type from userspace.  */
386 		error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL);
387 	} else {
388 		error = copystr(fstype, fstypename, sizeof(fstypename), NULL);
389 		KASSERT(error == 0);
390 	}
391 
392 	if (error) {
393 		/*
394 		 * Historically, filesystem types were identified by numbers.
395 		 * If we get an integer for the filesystem type instead of a
396 		 * string, we check to see if it matches one of the historic
397 		 * filesystem types.
398 		 */
399 		u_long fsindex = (u_long)fstype;
400 		if (fsindex >= nmountcompatnames ||
401 		    mountcompatnames[fsindex] == NULL)
402 			return ENODEV;
403 		strlcpy(fstypename, mountcompatnames[fsindex],
404 		    sizeof(fstypename));
405 	}
406 
407 	/* Accept `ufs' as an alias for `ffs', for compatibility. */
408 	if (strcmp(fstypename, "ufs") == 0)
409 		fstypename[0] = 'f';
410 
411 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
412 		return 0;
413 
414 	/* If we can autoload a vfs module, try again */
415 	(void)module_autoload(fstypename, MODULE_CLASS_VFS);
416 
417 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
418 		return 0;
419 
420 	return ENODEV;
421 }
422 
423 static int
424 mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
425     void *data, size_t *data_len)
426 {
427 	struct mount *mp;
428 	int error;
429 
430 	/* If MNT_GETARGS is specified, it should be the only flag. */
431 	if (flags & ~MNT_GETARGS)
432 		return EINVAL;
433 
434 	mp = vp->v_mount;
435 
436 	/* XXX: probably some notion of "can see" here if we want isolation. */
437 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
438 	    KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
439 	if (error)
440 		return error;
441 
442 	if ((vp->v_vflag & VV_ROOT) == 0)
443 		return EINVAL;
444 
445 	if (vfs_busy(mp))
446 		return EPERM;
447 
448 	mutex_enter(mp->mnt_updating);
449 	mp->mnt_flag &= ~MNT_OP_FLAGS;
450 	mp->mnt_flag |= MNT_GETARGS;
451 	error = VFS_MOUNT(mp, path, data, data_len);
452 	mp->mnt_flag &= ~MNT_OP_FLAGS;
453 	mutex_exit(mp->mnt_updating);
454 
455 	vfs_unbusy(mp);
456 	return (error);
457 }
458 
459 int
460 sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval)
461 {
462 	/* {
463 		syscallarg(const char *) type;
464 		syscallarg(const char *) path;
465 		syscallarg(int) flags;
466 		syscallarg(void *) data;
467 		syscallarg(size_t) data_len;
468 	} */
469 
470 	return do_sys_mount(l, SCARG(uap, type), UIO_USERSPACE, SCARG(uap, path),
471 	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE,
472 	    SCARG(uap, data_len), retval);
473 }
474 
475 int
476 do_sys_mount(struct lwp *l, const char *type, enum uio_seg type_seg,
477     const char *path, int flags, void *data, enum uio_seg data_seg,
478     size_t data_len, register_t *retval)
479 {
480 	struct vfsops *vfsops = NULL;	/* XXX gcc4.8 */
481 	struct vnode *vp;
482 	void *data_buf = data;
483 	bool vfsopsrele = false;
484 	size_t alloc_sz = 0;
485 	int error;
486 
487 	/*
488 	 * Get vnode to be covered
489 	 */
490 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
491 	if (error != 0) {
492 		vp = NULL;
493 		goto done;
494 	}
495 
496 	if (flags & (MNT_GETARGS | MNT_UPDATE)) {
497 		vfsops = vp->v_mount->mnt_op;
498 	} else {
499 		/* 'type' is userspace */
500 		error = mount_get_vfsops(type, type_seg, &vfsops);
501 		if (error != 0)
502 			goto done;
503 		vfsopsrele = true;
504 	}
505 
506 	/*
507 	 * We allow data to be NULL, even for userspace. Some fs's don't need
508 	 * it. The others will handle NULL.
509 	 */
510 	if (data != NULL && data_seg == UIO_USERSPACE) {
511 		if (data_len == 0) {
512 			/* No length supplied, use default for filesystem */
513 			data_len = vfsops->vfs_min_mount_data;
514 
515 			/*
516 			 * Hopefully a longer buffer won't make copyin() fail.
517 			 * For compatibility with 3.0 and earlier.
518 			 */
519 			if (flags & MNT_UPDATE
520 			    && data_len < sizeof (struct mnt_export_args30))
521 				data_len = sizeof (struct mnt_export_args30);
522 		}
523 		if ((data_len == 0) || (data_len > VFS_MAX_MOUNT_DATA)) {
524 			error = EINVAL;
525 			goto done;
526 		}
527 		alloc_sz = data_len;
528 		data_buf = kmem_alloc(alloc_sz, KM_SLEEP);
529 
530 		/* NFS needs the buffer even for mnt_getargs .... */
531 		error = copyin(data, data_buf, data_len);
532 		if (error != 0)
533 			goto done;
534 	}
535 
536 	if (flags & MNT_GETARGS) {
537 		if (data_len == 0) {
538 			error = EINVAL;
539 			goto done;
540 		}
541 		error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
542 		if (error != 0)
543 			goto done;
544 		if (data_seg == UIO_USERSPACE)
545 			error = copyout(data_buf, data, data_len);
546 		*retval = data_len;
547 	} else if (flags & MNT_UPDATE) {
548 		error = mount_update(l, vp, path, flags, data_buf, &data_len);
549 	} else {
550 		/* Locking is handled internally in mount_domount(). */
551 		KASSERT(vfsopsrele == true);
552 		error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
553 		    &data_len);
554 		vfsopsrele = false;
555 	}
556 	if (!error)
557 		KNOTE(&fs_klist, VQ_MOUNT);
558 
559     done:
560 	if (vfsopsrele)
561 		vfs_delref(vfsops);
562     	if (vp != NULL) {
563 	    	vrele(vp);
564 	}
565 	if (data_buf != data)
566 		kmem_free(data_buf, alloc_sz);
567 	return (error);
568 }
569 
570 /*
571  * Unmount a file system.
572  *
573  * Note: unmount takes a path to the vnode mounted on as argument,
574  * not special file (as before).
575  */
576 /* ARGSUSED */
577 int
578 sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval)
579 {
580 	/* {
581 		syscallarg(const char *) path;
582 		syscallarg(int) flags;
583 	} */
584 	struct vnode *vp;
585 	struct mount *mp;
586 	int error;
587 	struct pathbuf *pb;
588 	struct nameidata nd;
589 
590 	error = pathbuf_copyin(SCARG(uap, path), &pb);
591 	if (error) {
592 		return error;
593 	}
594 
595 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb);
596 	if ((error = namei(&nd)) != 0) {
597 		pathbuf_destroy(pb);
598 		return error;
599 	}
600 	vp = nd.ni_vp;
601 	pathbuf_destroy(pb);
602 
603 	mp = vp->v_mount;
604 	vfs_ref(mp);
605 	VOP_UNLOCK(vp);
606 
607 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
608 	    KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
609 	if (error) {
610 		vrele(vp);
611 		vfs_rele(mp);
612 		return (error);
613 	}
614 
615 	/*
616 	 * Don't allow unmounting the root file system.
617 	 */
618 	if (mp->mnt_flag & MNT_ROOTFS) {
619 		vrele(vp);
620 		vfs_rele(mp);
621 		return (EINVAL);
622 	}
623 
624 	/*
625 	 * Must be the root of the filesystem
626 	 */
627 	if ((vp->v_vflag & VV_ROOT) == 0) {
628 		vrele(vp);
629 		vfs_rele(mp);
630 		return (EINVAL);
631 	}
632 
633 	vrele(vp);
634 	error = dounmount(mp, SCARG(uap, flags), l);
635 	vfs_rele(mp);
636 	if (!error)
637 		KNOTE(&fs_klist, VQ_UNMOUNT);
638 	return error;
639 }
640 
641 /*
642  * Sync each mounted filesystem.
643  */
644 #ifdef DEBUG
645 int syncprt = 0;
646 struct ctldebug debug0 = { "syncprt", &syncprt };
647 #endif
648 
649 void
650 do_sys_sync(struct lwp *l)
651 {
652 	mount_iterator_t *iter;
653 	struct mount *mp;
654 	int asyncflag;
655 
656 	mountlist_iterator_init(&iter);
657 	while ((mp = mountlist_iterator_next(iter)) != NULL) {
658 		mutex_enter(mp->mnt_updating);
659 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
660 			asyncflag = mp->mnt_flag & MNT_ASYNC;
661 			mp->mnt_flag &= ~MNT_ASYNC;
662 			VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
663 			if (asyncflag)
664 				 mp->mnt_flag |= MNT_ASYNC;
665 		}
666 		mutex_exit(mp->mnt_updating);
667 	}
668 	mountlist_iterator_destroy(iter);
669 #ifdef DEBUG
670 	if (syncprt)
671 		vfs_bufstats();
672 #endif /* DEBUG */
673 }
674 
675 static bool
676 sync_vnode_filter(void *cookie, vnode_t *vp)
677 {
678 
679 	if (vp->v_numoutput > 0) {
680 		++*(int *)cookie;
681 	}
682 	return false;
683 }
684 
685 int
686 vfs_syncwait(void)
687 {
688 	int nbusy, nbusy_prev, iter;
689 	struct vnode_iterator *vniter;
690 	mount_iterator_t *mpiter;
691 	struct mount *mp;
692 
693 	for (nbusy_prev = 0, iter = 0; iter < 20;) {
694 		nbusy = 0;
695 		mountlist_iterator_init(&mpiter);
696 		while ((mp = mountlist_iterator_next(mpiter)) != NULL) {
697 			vnode_t *vp __diagused;
698 			vfs_vnode_iterator_init(mp, &vniter);
699 			vp = vfs_vnode_iterator_next(vniter,
700 			    sync_vnode_filter, &nbusy);
701 			KASSERT(vp == NULL);
702 			vfs_vnode_iterator_destroy(vniter);
703 		}
704 		mountlist_iterator_destroy(mpiter);
705 
706 		if (nbusy == 0)
707 			break;
708 		if (nbusy_prev == 0)
709 			nbusy_prev = nbusy;
710 		printf("%d ", nbusy);
711 		kpause("syncwait", false, MAX(1, hz / 25 * iter), NULL);
712 		if (nbusy >= nbusy_prev) /* we didn't flush anything */
713 			iter++;
714 		else
715 			nbusy_prev = nbusy;
716 	}
717 
718 	if (nbusy) {
719 #if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
720 		printf("giving up\nPrinting vnodes for busy buffers\n");
721 		mountlist_iterator_init(&mpiter);
722 		while ((mp = mountlist_iterator_next(mpiter)) != NULL) {
723 			vnode_t *vp;
724 			vfs_vnode_iterator_init(mp, &vniter);
725 			vp = vfs_vnode_iterator_next(vniter,
726 			    NULL, NULL);
727 			mutex_enter(vp->v_interlock);
728 			if (vp->v_numoutput > 0)
729 				vprint(NULL, vp);
730 			mutex_exit(vp->v_interlock);
731 			vrele(vp);
732 			vfs_vnode_iterator_destroy(vniter);
733 		}
734 		mountlist_iterator_destroy(mpiter);
735 #endif
736 	}
737 
738 	return nbusy;
739 }
740 
741 /* ARGSUSED */
742 int
743 sys_sync(struct lwp *l, const void *v, register_t *retval)
744 {
745 	do_sys_sync(l);
746 	return (0);
747 }
748 
749 
750 /*
751  * Access or change filesystem quotas.
752  *
753  * (this is really 14 different calls bundled into one)
754  */
755 
756 static int
757 do_sys_quotactl_stat(struct mount *mp, struct quotastat *info_u)
758 {
759 	struct quotastat info_k;
760 	int error;
761 
762 	/* ensure any padding bytes are cleared */
763 	memset(&info_k, 0, sizeof(info_k));
764 
765 	error = vfs_quotactl_stat(mp, &info_k);
766 	if (error) {
767 		return error;
768 	}
769 
770 	return copyout(&info_k, info_u, sizeof(info_k));
771 }
772 
773 static int
774 do_sys_quotactl_idtypestat(struct mount *mp, int idtype,
775     struct quotaidtypestat *info_u)
776 {
777 	struct quotaidtypestat info_k;
778 	int error;
779 
780 	/* ensure any padding bytes are cleared */
781 	memset(&info_k, 0, sizeof(info_k));
782 
783 	error = vfs_quotactl_idtypestat(mp, idtype, &info_k);
784 	if (error) {
785 		return error;
786 	}
787 
788 	return copyout(&info_k, info_u, sizeof(info_k));
789 }
790 
791 static int
792 do_sys_quotactl_objtypestat(struct mount *mp, int objtype,
793     struct quotaobjtypestat *info_u)
794 {
795 	struct quotaobjtypestat info_k;
796 	int error;
797 
798 	/* ensure any padding bytes are cleared */
799 	memset(&info_k, 0, sizeof(info_k));
800 
801 	error = vfs_quotactl_objtypestat(mp, objtype, &info_k);
802 	if (error) {
803 		return error;
804 	}
805 
806 	return copyout(&info_k, info_u, sizeof(info_k));
807 }
808 
809 static int
810 do_sys_quotactl_get(struct mount *mp, const struct quotakey *key_u,
811     struct quotaval *val_u)
812 {
813 	struct quotakey key_k;
814 	struct quotaval val_k;
815 	int error;
816 
817 	/* ensure any padding bytes are cleared */
818 	memset(&val_k, 0, sizeof(val_k));
819 
820 	error = copyin(key_u, &key_k, sizeof(key_k));
821 	if (error) {
822 		return error;
823 	}
824 
825 	error = vfs_quotactl_get(mp, &key_k, &val_k);
826 	if (error) {
827 		return error;
828 	}
829 
830 	return copyout(&val_k, val_u, sizeof(val_k));
831 }
832 
833 static int
834 do_sys_quotactl_put(struct mount *mp, const struct quotakey *key_u,
835     const struct quotaval *val_u)
836 {
837 	struct quotakey key_k;
838 	struct quotaval val_k;
839 	int error;
840 
841 	error = copyin(key_u, &key_k, sizeof(key_k));
842 	if (error) {
843 		return error;
844 	}
845 
846 	error = copyin(val_u, &val_k, sizeof(val_k));
847 	if (error) {
848 		return error;
849 	}
850 
851 	return vfs_quotactl_put(mp, &key_k, &val_k);
852 }
853 
854 static int
855 do_sys_quotactl_del(struct mount *mp, const struct quotakey *key_u)
856 {
857 	struct quotakey key_k;
858 	int error;
859 
860 	error = copyin(key_u, &key_k, sizeof(key_k));
861 	if (error) {
862 		return error;
863 	}
864 
865 	return vfs_quotactl_del(mp, &key_k);
866 }
867 
868 static int
869 do_sys_quotactl_cursoropen(struct mount *mp, struct quotakcursor *cursor_u)
870 {
871 	struct quotakcursor cursor_k;
872 	int error;
873 
874 	/* ensure any padding bytes are cleared */
875 	memset(&cursor_k, 0, sizeof(cursor_k));
876 
877 	error = vfs_quotactl_cursoropen(mp, &cursor_k);
878 	if (error) {
879 		return error;
880 	}
881 
882 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
883 }
884 
885 static int
886 do_sys_quotactl_cursorclose(struct mount *mp, struct quotakcursor *cursor_u)
887 {
888 	struct quotakcursor cursor_k;
889 	int error;
890 
891 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
892 	if (error) {
893 		return error;
894 	}
895 
896 	return vfs_quotactl_cursorclose(mp, &cursor_k);
897 }
898 
899 static int
900 do_sys_quotactl_cursorskipidtype(struct mount *mp,
901     struct quotakcursor *cursor_u, int idtype)
902 {
903 	struct quotakcursor cursor_k;
904 	int error;
905 
906 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
907 	if (error) {
908 		return error;
909 	}
910 
911 	error = vfs_quotactl_cursorskipidtype(mp, &cursor_k, idtype);
912 	if (error) {
913 		return error;
914 	}
915 
916 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
917 }
918 
919 static int
920 do_sys_quotactl_cursorget(struct mount *mp, struct quotakcursor *cursor_u,
921     struct quotakey *keys_u, struct quotaval *vals_u, unsigned maxnum,
922     unsigned *ret_u)
923 {
924 #define CGET_STACK_MAX 8
925 	struct quotakcursor cursor_k;
926 	struct quotakey stackkeys[CGET_STACK_MAX];
927 	struct quotaval stackvals[CGET_STACK_MAX];
928 	struct quotakey *keys_k;
929 	struct quotaval *vals_k;
930 	unsigned ret_k;
931 	int error;
932 
933 	if (maxnum > 128) {
934 		maxnum = 128;
935 	}
936 
937 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
938 	if (error) {
939 		return error;
940 	}
941 
942 	if (maxnum <= CGET_STACK_MAX) {
943 		keys_k = stackkeys;
944 		vals_k = stackvals;
945 		/* ensure any padding bytes are cleared */
946 		memset(keys_k, 0, maxnum * sizeof(keys_k[0]));
947 		memset(vals_k, 0, maxnum * sizeof(vals_k[0]));
948 	} else {
949 		keys_k = kmem_zalloc(maxnum * sizeof(keys_k[0]), KM_SLEEP);
950 		vals_k = kmem_zalloc(maxnum * sizeof(vals_k[0]), KM_SLEEP);
951 	}
952 
953 	error = vfs_quotactl_cursorget(mp, &cursor_k, keys_k, vals_k, maxnum,
954 				       &ret_k);
955 	if (error) {
956 		goto fail;
957 	}
958 
959 	error = copyout(keys_k, keys_u, ret_k * sizeof(keys_k[0]));
960 	if (error) {
961 		goto fail;
962 	}
963 
964 	error = copyout(vals_k, vals_u, ret_k * sizeof(vals_k[0]));
965 	if (error) {
966 		goto fail;
967 	}
968 
969 	error = copyout(&ret_k, ret_u, sizeof(ret_k));
970 	if (error) {
971 		goto fail;
972 	}
973 
974 	/* do last to maximize the chance of being able to recover a failure */
975 	error = copyout(&cursor_k, cursor_u, sizeof(cursor_k));
976 
977 fail:
978 	if (keys_k != stackkeys) {
979 		kmem_free(keys_k, maxnum * sizeof(keys_k[0]));
980 	}
981 	if (vals_k != stackvals) {
982 		kmem_free(vals_k, maxnum * sizeof(vals_k[0]));
983 	}
984 	return error;
985 }
986 
987 static int
988 do_sys_quotactl_cursoratend(struct mount *mp, struct quotakcursor *cursor_u,
989     int *ret_u)
990 {
991 	struct quotakcursor cursor_k;
992 	int ret_k;
993 	int error;
994 
995 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
996 	if (error) {
997 		return error;
998 	}
999 
1000 	error = vfs_quotactl_cursoratend(mp, &cursor_k, &ret_k);
1001 	if (error) {
1002 		return error;
1003 	}
1004 
1005 	error = copyout(&ret_k, ret_u, sizeof(ret_k));
1006 	if (error) {
1007 		return error;
1008 	}
1009 
1010 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1011 }
1012 
1013 static int
1014 do_sys_quotactl_cursorrewind(struct mount *mp, struct quotakcursor *cursor_u)
1015 {
1016 	struct quotakcursor cursor_k;
1017 	int error;
1018 
1019 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1020 	if (error) {
1021 		return error;
1022 	}
1023 
1024 	error = vfs_quotactl_cursorrewind(mp, &cursor_k);
1025 	if (error) {
1026 		return error;
1027 	}
1028 
1029 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1030 }
1031 
1032 static int
1033 do_sys_quotactl_quotaon(struct mount *mp, int idtype, const char *path_u)
1034 {
1035 	char *path_k;
1036 	int error;
1037 
1038 	/* XXX this should probably be a struct pathbuf */
1039 	path_k = PNBUF_GET();
1040 	error = copyin(path_u, path_k, PATH_MAX);
1041 	if (error) {
1042 		PNBUF_PUT(path_k);
1043 		return error;
1044 	}
1045 
1046 	error = vfs_quotactl_quotaon(mp, idtype, path_k);
1047 
1048 	PNBUF_PUT(path_k);
1049 	return error;
1050 }
1051 
/*
 * QUOTACTL_QUOTAOFF: no user data to transfer; pass the request straight
 * to vfs_quotactl_quotaoff().
 */
static int
do_sys_quotactl_quotaoff(struct mount *mp, int idtype)
{
	int rv;

	rv = vfs_quotactl_quotaoff(mp, idtype);
	return rv;
}
1057 
1058 int
1059 do_sys_quotactl(const char *path_u, const struct quotactl_args *args)
1060 {
1061 	struct mount *mp;
1062 	struct vnode *vp;
1063 	int error;
1064 
1065 	error = namei_simple_user(path_u, NSM_FOLLOW_TRYEMULROOT, &vp);
1066 	if (error != 0)
1067 		return (error);
1068 	mp = vp->v_mount;
1069 
1070 	switch (args->qc_op) {
1071 	    case QUOTACTL_STAT:
1072 		error = do_sys_quotactl_stat(mp, args->u.stat.qc_info);
1073 		break;
1074 	    case QUOTACTL_IDTYPESTAT:
1075 		error = do_sys_quotactl_idtypestat(mp,
1076 				args->u.idtypestat.qc_idtype,
1077 				args->u.idtypestat.qc_info);
1078 		break;
1079 	    case QUOTACTL_OBJTYPESTAT:
1080 		error = do_sys_quotactl_objtypestat(mp,
1081 				args->u.objtypestat.qc_objtype,
1082 				args->u.objtypestat.qc_info);
1083 		break;
1084 	    case QUOTACTL_GET:
1085 		error = do_sys_quotactl_get(mp,
1086 				args->u.get.qc_key,
1087 				args->u.get.qc_val);
1088 		break;
1089 	    case QUOTACTL_PUT:
1090 		error = do_sys_quotactl_put(mp,
1091 				args->u.put.qc_key,
1092 				args->u.put.qc_val);
1093 		break;
1094 	    case QUOTACTL_DEL:
1095 		error = do_sys_quotactl_del(mp, args->u.del.qc_key);
1096 		break;
1097 	    case QUOTACTL_CURSOROPEN:
1098 		error = do_sys_quotactl_cursoropen(mp,
1099 				args->u.cursoropen.qc_cursor);
1100 		break;
1101 	    case QUOTACTL_CURSORCLOSE:
1102 		error = do_sys_quotactl_cursorclose(mp,
1103 				args->u.cursorclose.qc_cursor);
1104 		break;
1105 	    case QUOTACTL_CURSORSKIPIDTYPE:
1106 		error = do_sys_quotactl_cursorskipidtype(mp,
1107 				args->u.cursorskipidtype.qc_cursor,
1108 				args->u.cursorskipidtype.qc_idtype);
1109 		break;
1110 	    case QUOTACTL_CURSORGET:
1111 		error = do_sys_quotactl_cursorget(mp,
1112 				args->u.cursorget.qc_cursor,
1113 				args->u.cursorget.qc_keys,
1114 				args->u.cursorget.qc_vals,
1115 				args->u.cursorget.qc_maxnum,
1116 				args->u.cursorget.qc_ret);
1117 		break;
1118 	    case QUOTACTL_CURSORATEND:
1119 		error = do_sys_quotactl_cursoratend(mp,
1120 				args->u.cursoratend.qc_cursor,
1121 				args->u.cursoratend.qc_ret);
1122 		break;
1123 	    case QUOTACTL_CURSORREWIND:
1124 		error = do_sys_quotactl_cursorrewind(mp,
1125 				args->u.cursorrewind.qc_cursor);
1126 		break;
1127 	    case QUOTACTL_QUOTAON:
1128 		error = do_sys_quotactl_quotaon(mp,
1129 				args->u.quotaon.qc_idtype,
1130 				args->u.quotaon.qc_quotafile);
1131 		break;
1132 	    case QUOTACTL_QUOTAOFF:
1133 		error = do_sys_quotactl_quotaoff(mp,
1134 				args->u.quotaoff.qc_idtype);
1135 		break;
1136 	    default:
1137 		error = EINVAL;
1138 		break;
1139 	}
1140 
1141 	vrele(vp);
1142 	return error;
1143 }
1144 
1145 /* ARGSUSED */
1146 int
1147 sys___quotactl(struct lwp *l, const struct sys___quotactl_args *uap,
1148     register_t *retval)
1149 {
1150 	/* {
1151 		syscallarg(const char *) path;
1152 		syscallarg(struct quotactl_args *) args;
1153 	} */
1154 	struct quotactl_args args;
1155 	int error;
1156 
1157 	error = copyin(SCARG(uap, args), &args, sizeof(args));
1158 	if (error) {
1159 		return error;
1160 	}
1161 
1162 	return do_sys_quotactl(SCARG(uap, path), &args);
1163 }
1164 
1165 int
1166 dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
1167     int root)
1168 {
1169 	struct cwdinfo *cwdi = l->l_proc->p_cwdi;
1170 	bool chrooted;
1171 	int error = 0;
1172 
1173 	KASSERT(l == curlwp);
1174 
1175 	/*
1176 	 * This is safe unlocked.  cwdi_rdir never goes non-NULL -> NULL,
1177 	 * since it would imply chroots can be escaped.  Just make sure this
1178 	 * routine is self-consistent.
1179 	 */
1180 	chrooted = (atomic_load_relaxed(&cwdi->cwdi_rdir) != NULL);
1181 
1182 	/*
1183 	 * If MNT_NOWAIT or MNT_LAZY is specified, do not
1184 	 * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
1185 	 * overrides MNT_NOWAIT.
1186 	 */
1187 	if (flags == MNT_NOWAIT	|| flags == MNT_LAZY ||
1188 	    (flags != MNT_WAIT && flags != 0)) {
1189 		memcpy(sp, &mp->mnt_stat, sizeof(*sp));
1190 	} else {
1191 		/* Get the filesystem stats now */
1192 		memset(sp, 0, sizeof(*sp));
1193 		if ((error = VFS_STATVFS(mp, sp)) != 0)
1194 			return error;
1195 		if (!chrooted)
1196 			(void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
1197 	}
1198 
1199 	if (chrooted) {
1200 		size_t len;
1201 		char *bp;
1202 		char c;
1203 		char *path = PNBUF_GET();
1204 
1205 		bp = path + MAXPATHLEN;
1206 		*--bp = '\0';
1207 		rw_enter(&cwdi->cwdi_lock, RW_READER);
1208 		error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
1209 		    MAXPATHLEN / 2, 0, l);
1210 		rw_exit(&cwdi->cwdi_lock);
1211 		if (error) {
1212 			PNBUF_PUT(path);
1213 			return error;
1214 		}
1215 		len = strlen(bp);
1216 		if (len != 1) {
1217 			/*
1218 			 * for mount points that are below our root, we can see
1219 			 * them, so we fix up the pathname and return them. The
1220 			 * rest we cannot see, so we don't allow viewing the
1221 			 * data.
1222 			 */
1223 			if (strncmp(bp, sp->f_mntonname, len) == 0 &&
1224 			    ((c = sp->f_mntonname[len]) == '/' || c == '\0')) {
1225 				(void)strlcpy(sp->f_mntonname,
1226 				    c == '\0' ? "/" : &sp->f_mntonname[len],
1227 				    sizeof(sp->f_mntonname));
1228 			} else {
1229 				if (root)
1230 					(void)strlcpy(sp->f_mntonname, "/",
1231 					    sizeof(sp->f_mntonname));
1232 				else
1233 					error = EPERM;
1234 			}
1235 		}
1236 		PNBUF_PUT(path);
1237 	}
1238 	sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
1239 	return error;
1240 }
1241 
1242 /*
1243  * Get filesystem statistics by path.
1244  */
1245 int
1246 do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
1247 {
1248 	struct mount *mp;
1249 	int error;
1250 	struct vnode *vp;
1251 
1252 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
1253 	if (error != 0)
1254 		return error;
1255 	mp = vp->v_mount;
1256 	error = dostatvfs(mp, sb, l, flags, 1);
1257 	vrele(vp);
1258 	return error;
1259 }
1260 
1261 /* ARGSUSED */
1262 int
1263 sys___statvfs190(struct lwp *l, const struct sys___statvfs190_args *uap, register_t *retval)
1264 {
1265 	/* {
1266 		syscallarg(const char *) path;
1267 		syscallarg(struct statvfs *) buf;
1268 		syscallarg(int) flags;
1269 	} */
1270 	struct statvfs *sb;
1271 	int error;
1272 
1273 	sb = STATVFSBUF_GET();
1274 	error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
1275 	if (error == 0)
1276 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1277 	STATVFSBUF_PUT(sb);
1278 	return error;
1279 }
1280 
1281 /*
1282  * Get filesystem statistics by fd.
1283  */
1284 int
1285 do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
1286 {
1287 	file_t *fp;
1288 	struct mount *mp;
1289 	int error;
1290 
1291 	/* fd_getvnode() will use the descriptor for us */
1292 	if ((error = fd_getvnode(fd, &fp)) != 0)
1293 		return (error);
1294 	mp = fp->f_vnode->v_mount;
1295 	error = dostatvfs(mp, sb, curlwp, flags, 1);
1296 	fd_putfile(fd);
1297 	return error;
1298 }
1299 
1300 /* ARGSUSED */
1301 int
1302 sys___fstatvfs190(struct lwp *l, const struct sys___fstatvfs190_args *uap, register_t *retval)
1303 {
1304 	/* {
1305 		syscallarg(int) fd;
1306 		syscallarg(struct statvfs *) buf;
1307 		syscallarg(int) flags;
1308 	} */
1309 	struct statvfs *sb;
1310 	int error;
1311 
1312 	sb = STATVFSBUF_GET();
1313 	error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
1314 	if (error == 0)
1315 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1316 	STATVFSBUF_PUT(sb);
1317 	return error;
1318 }
1319 
1320 
1321 /*
1322  * Get statistics on all filesystems.
1323  */
1324 int
1325 do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
1326     int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
1327     register_t *retval)
1328 {
1329 	int root = 0;
1330 	mount_iterator_t *iter;
1331 	struct proc *p = l->l_proc;
1332 	struct mount *mp;
1333 	struct statvfs *sb;
1334 	size_t count, maxcount;
1335 	int error = 0;
1336 
1337 	sb = STATVFSBUF_GET();
1338 	maxcount = bufsize / entry_sz;
1339 	count = 0;
1340 	mountlist_iterator_init(&iter);
1341 	while ((mp = mountlist_iterator_next(iter)) != NULL) {
1342 		if (sfsp && count < maxcount) {
1343 			error = dostatvfs(mp, sb, l, flags, 0);
1344 			if (error) {
1345 				error = 0;
1346 				continue;
1347 			}
1348 			error = copyfn(sb, sfsp, entry_sz);
1349 			if (error)
1350 				goto out;
1351 			sfsp = (char *)sfsp + entry_sz;
1352 			root |= strcmp(sb->f_mntonname, "/") == 0;
1353 		}
1354 		count++;
1355 	}
1356 
1357 	if (root == 0 && p->p_cwdi->cwdi_rdir) {
1358 		/*
1359 		 * fake a root entry
1360 		 */
1361 		error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
1362 		    sb, l, flags, 1);
1363 		if (error != 0)
1364 			goto out;
1365 		if (sfsp) {
1366 			error = copyfn(sb, sfsp, entry_sz);
1367 			if (error != 0)
1368 				goto out;
1369 		}
1370 		count++;
1371 	}
1372 	if (sfsp && count > maxcount)
1373 		*retval = maxcount;
1374 	else
1375 		*retval = count;
1376 out:
1377 	mountlist_iterator_destroy(iter);
1378 	STATVFSBUF_PUT(sb);
1379 	return error;
1380 }
1381 
1382 int
1383 sys___getvfsstat90(struct lwp *l, const struct sys___getvfsstat90_args *uap,
1384     register_t *retval)
1385 {
1386 	/* {
1387 		syscallarg(struct statvfs *) buf;
1388 		syscallarg(size_t) bufsize;
1389 		syscallarg(int) flags;
1390 	} */
1391 
1392 	return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
1393 	    SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
1394 }
1395 
1396 /*
1397  * Change current working directory to a given file descriptor.
1398  */
1399 /* ARGSUSED */
1400 int
1401 sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval)
1402 {
1403 	/* {
1404 		syscallarg(int) fd;
1405 	} */
1406 	struct proc *p = l->l_proc;
1407 	struct cwdinfo *cwdi;
1408 	struct vnode *vp, *tdp;
1409 	struct mount *mp;
1410 	file_t *fp;
1411 	int error, fd;
1412 
1413 	/* fd_getvnode() will use the descriptor for us */
1414 	fd = SCARG(uap, fd);
1415 	if ((error = fd_getvnode(fd, &fp)) != 0)
1416 		return (error);
1417 	vp = fp->f_vnode;
1418 
1419 	vref(vp);
1420 	vn_lock(vp, LK_SHARED | LK_RETRY);
1421 	if (vp->v_type != VDIR)
1422 		error = ENOTDIR;
1423 	else
1424 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1425 	if (error) {
1426 		vput(vp);
1427 		goto out;
1428 	}
1429 	while ((mp = vp->v_mountedhere) != NULL) {
1430 		error = vfs_busy(mp);
1431 		vput(vp);
1432 		if (error != 0)
1433 			goto out;
1434 		error = VFS_ROOT(mp, LK_SHARED, &tdp);
1435 		vfs_unbusy(mp);
1436 		if (error)
1437 			goto out;
1438 		vp = tdp;
1439 	}
1440 	VOP_UNLOCK(vp);
1441 
1442 	/*
1443 	 * Disallow changing to a directory not under the process's
1444 	 * current root directory (if there is one).
1445 	 */
1446 	cwdi = p->p_cwdi;
1447 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1448 	if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
1449 		vrele(vp);
1450 		error = EPERM;	/* operation not permitted */
1451 	} else {
1452 		vrele(cwdi->cwdi_cdir);
1453 		cwdi->cwdi_cdir = vp;
1454 	}
1455 	rw_exit(&cwdi->cwdi_lock);
1456 
1457  out:
1458 	fd_putfile(fd);
1459 	return (error);
1460 }
1461 
1462 /*
1463  * Change this process's notion of the root directory to a given file
1464  * descriptor.
1465  */
1466 int
1467 sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval)
1468 {
1469 	struct vnode	*vp;
1470 	file_t	*fp;
1471 	int		 error, fd = SCARG(uap, fd);
1472 
1473 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1474  	    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
1475 		return error;
1476 	/* fd_getvnode() will use the descriptor for us */
1477 	if ((error = fd_getvnode(fd, &fp)) != 0)
1478 		return error;
1479 	vp = fp->f_vnode;
1480 	vn_lock(vp, LK_SHARED | LK_RETRY);
1481 	if (vp->v_type != VDIR)
1482 		error = ENOTDIR;
1483 	else
1484 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1485 	VOP_UNLOCK(vp);
1486 	if (error)
1487 		goto out;
1488 	vref(vp);
1489 	change_root(vp);
1490 
1491  out:
1492 	fd_putfile(fd);
1493 	return (error);
1494 }
1495 
1496 /*
1497  * Change current working directory (``.'').
1498  */
1499 /* ARGSUSED */
1500 int
1501 sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
1502 {
1503 	/* {
1504 		syscallarg(const char *) path;
1505 	} */
1506 	struct proc *p = l->l_proc;
1507 	struct cwdinfo *cwdi;
1508 	int error;
1509 	struct vnode *vp;
1510 
1511 	if ((error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE,
1512 				  &vp, l)) != 0)
1513 		return (error);
1514 	cwdi = p->p_cwdi;
1515 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1516 	vrele(cwdi->cwdi_cdir);
1517 	cwdi->cwdi_cdir = vp;
1518 	rw_exit(&cwdi->cwdi_lock);
1519 	return (0);
1520 }
1521 
1522 /*
1523  * Change notion of root (``/'') directory.
1524  */
1525 /* ARGSUSED */
1526 int
1527 sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval)
1528 {
1529 	/* {
1530 		syscallarg(const char *) path;
1531 	} */
1532 	int error;
1533 	struct vnode *vp;
1534 
1535 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1536 	    KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
1537 		return (error);
1538 
1539 	error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE, &vp, l);
1540 	if (error == 0)
1541 		change_root(vp);
1542 	return error;
1543 }
1544 
1545 /*
1546  * Common routine for chroot and fchroot.
1547  * NB: callers need to properly authorize the change root operation.
1548  */
1549 void
1550 change_root(struct vnode *vp)
1551 {
1552 	kauth_cred_t ncred;
1553 	struct lwp *l = curlwp;
1554 	struct proc *p = l->l_proc;
1555 	struct cwdinfo *cwdi = p->p_cwdi;
1556 
1557 	ncred = kauth_cred_alloc();
1558 
1559 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1560 	if (cwdi->cwdi_rdir != NULL)
1561 		vrele(cwdi->cwdi_rdir);
1562 	cwdi->cwdi_rdir = vp;
1563 
1564 	/*
1565 	 * Prevent escaping from chroot by putting the root under
1566 	 * the working directory.  Silently chdir to / if we aren't
1567 	 * already there.
1568 	 */
1569 	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1570 		/*
1571 		 * XXX would be more failsafe to change directory to a
1572 		 * deadfs node here instead
1573 		 */
1574 		vrele(cwdi->cwdi_cdir);
1575 		vref(vp);
1576 		cwdi->cwdi_cdir = vp;
1577 	}
1578 	rw_exit(&cwdi->cwdi_lock);
1579 
1580 	/* Get a write lock on the process credential. */
1581 	proc_crmod_enter();
1582 
1583 	kauth_cred_clone(p->p_cred, ncred);
1584 	kauth_proc_chroot(ncred, p->p_cwdi);
1585 
1586 	/* Broadcast our credentials to the process and other LWPs. */
1587  	proc_crmod_leave(ncred, p->p_cred, true);
1588 }
1589 
1590 /*
1591  * Common routine for chroot and chdir.
1592  * XXX "where" should be enum uio_seg
1593  */
1594 int
1595 chdir_lookup(const char *path, int where, struct vnode **vpp, struct lwp *l)
1596 {
1597 	struct pathbuf *pb;
1598 	struct nameidata nd;
1599 	int error;
1600 
1601 	error = pathbuf_maybe_copyin(path, where, &pb);
1602 	if (error) {
1603 		return error;
1604 	}
1605 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb);
1606 	if ((error = namei(&nd)) != 0) {
1607 		pathbuf_destroy(pb);
1608 		return error;
1609 	}
1610 	*vpp = nd.ni_vp;
1611 	pathbuf_destroy(pb);
1612 
1613 	if ((*vpp)->v_type != VDIR)
1614 		error = ENOTDIR;
1615 	else
1616 		error = VOP_ACCESS(*vpp, VEXEC, l->l_cred);
1617 
1618 	if (error)
1619 		vput(*vpp);
1620 	else
1621 		VOP_UNLOCK(*vpp);
1622 	return (error);
1623 }
1624 
1625 /*
1626  * Internals of sys_open - path has already been converted into a pathbuf
1627  * (so we can easily reuse this function from other parts of the kernel,
1628  * like posix_spawn post-processing).
1629  */
1630 int
1631 do_open(lwp_t *l, struct vnode *dvp, struct pathbuf *pb, int open_flags,
1632 	int open_mode, int *fd)
1633 {
1634 	struct proc *p = l->l_proc;
1635 	struct cwdinfo *cwdi = p->p_cwdi;
1636 	file_t *fp;
1637 	struct vnode *vp;
1638 	int flags, cmode;
1639 	int indx, error;
1640 	struct nameidata nd;
1641 
1642 	if (open_flags & O_SEARCH) {
1643 		open_flags &= ~(int)O_SEARCH;
1644 	}
1645 
1646 	/*
1647 	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
1648 	 * may be specified.
1649 	 */
1650 	if ((open_flags & O_EXEC) && (open_flags & O_ACCMODE))
1651 		return EINVAL;
1652 
1653 	flags = FFLAGS(open_flags);
1654 	if ((flags & (FREAD | FWRITE)) == 0)
1655 		return EINVAL;
1656 
1657 	if ((error = fd_allocfile(&fp, &indx)) != 0) {
1658 		return error;
1659 	}
1660 
1661 	/* We're going to read cwdi->cwdi_cmask unlocked here. */
1662 	cmode = ((open_mode &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1663 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, pb);
1664 	if (dvp != NULL)
1665 		NDAT(&nd, dvp);
1666 
1667 	l->l_dupfd = -indx - 1;			/* XXX check for fdopen */
1668 	if ((error = vn_open(&nd, flags, cmode)) != 0) {
1669 		fd_abort(p, fp, indx);
1670 		if ((error == EDUPFD || error == EMOVEFD) &&
1671 		    l->l_dupfd >= 0 &&			/* XXX from fdopen */
1672 		    (error =
1673 			fd_dupopen(l->l_dupfd, &indx, flags, error)) == 0) {
1674 			*fd = indx;
1675 			return 0;
1676 		}
1677 		if (error == ERESTART)
1678 			error = EINTR;
1679 		return error;
1680 	}
1681 
1682 	l->l_dupfd = 0;
1683 	vp = nd.ni_vp;
1684 
1685 	if ((error = open_setfp(l, fp, vp, indx, flags)))
1686 		return error;
1687 
1688 	VOP_UNLOCK(vp);
1689 	*fd = indx;
1690 	fd_affix(p, fp, indx);
1691 	return 0;
1692 }
1693 
1694 int
1695 fd_open(const char *path, int open_flags, int open_mode, int *fd)
1696 {
1697 	struct pathbuf *pb;
1698 	int error, oflags;
1699 
1700 	oflags = FFLAGS(open_flags);
1701 	if ((oflags & (FREAD | FWRITE)) == 0)
1702 		return EINVAL;
1703 
1704 	pb = pathbuf_create(path);
1705 	if (pb == NULL)
1706 		return ENOMEM;
1707 
1708 	error = do_open(curlwp, NULL, pb, open_flags, open_mode, fd);
1709 	pathbuf_destroy(pb);
1710 
1711 	return error;
1712 }
1713 
1714 static int
1715 do_sys_openat(lwp_t *l, int fdat, const char *path, int flags,
1716     int mode, int *fd)
1717 {
1718 	file_t *dfp = NULL;
1719 	struct vnode *dvp = NULL;
1720 	struct pathbuf *pb;
1721 	const char *pathstring = NULL;
1722 	int error;
1723 
1724 	if (path == NULL) {
1725 		MODULE_HOOK_CALL(vfs_openat_10_hook, (&pb), enosys(), error);
1726 		if (error == ENOSYS)
1727 			goto no_compat;
1728 		if (error)
1729 			return error;
1730 	} else {
1731 no_compat:
1732 		error = pathbuf_copyin(path, &pb);
1733 		if (error)
1734 			return error;
1735 	}
1736 
1737 	pathstring = pathbuf_stringcopy_get(pb);
1738 
1739 	/*
1740 	 * fdat is ignored if:
1741 	 * 1) if fdat is AT_FDCWD, which means use current directory as base.
1742 	 * 2) if path is absolute, then fdat is useless.
1743 	 */
1744 	if (fdat != AT_FDCWD && pathstring[0] != '/') {
1745 		/* fd_getvnode() will use the descriptor for us */
1746 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
1747 			goto out;
1748 
1749 		dvp = dfp->f_vnode;
1750 	}
1751 
1752 	error = do_open(l, dvp, pb, flags, mode, fd);
1753 
1754 	if (dfp != NULL)
1755 		fd_putfile(fdat);
1756 out:
1757 	pathbuf_stringcopy_put(pb, pathstring);
1758 	pathbuf_destroy(pb);
1759 	return error;
1760 }
1761 
1762 int
1763 sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1764 {
1765 	/* {
1766 		syscallarg(const char *) path;
1767 		syscallarg(int) flags;
1768 		syscallarg(int) mode;
1769 	} */
1770 	int error;
1771 	int fd;
1772 
1773 	error = do_sys_openat(l, AT_FDCWD, SCARG(uap, path),
1774 			      SCARG(uap, flags), SCARG(uap, mode), &fd);
1775 
1776 	if (error == 0)
1777 		*retval = fd;
1778 
1779 	return error;
1780 }
1781 
1782 int
1783 sys_openat(struct lwp *l, const struct sys_openat_args *uap, register_t *retval)
1784 {
1785 	/* {
1786 		syscallarg(int) fd;
1787 		syscallarg(const char *) path;
1788 		syscallarg(int) oflags;
1789 		syscallarg(int) mode;
1790 	} */
1791 	int error;
1792 	int fd;
1793 
1794 	error = do_sys_openat(l, SCARG(uap, fd), SCARG(uap, path),
1795 			      SCARG(uap, oflags), SCARG(uap, mode), &fd);
1796 
1797 	if (error == 0)
1798 		*retval = fd;
1799 
1800 	return error;
1801 }
1802 
1803 static void
1804 vfs__fhfree(fhandle_t *fhp)
1805 {
1806 	size_t fhsize;
1807 
1808 	fhsize = FHANDLE_SIZE(fhp);
1809 	kmem_free(fhp, fhsize);
1810 }
1811 
1812 /*
1813  * vfs_composefh: compose a filehandle.
1814  */
1815 
1816 int
1817 vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1818 {
1819 	struct mount *mp;
1820 	struct fid *fidp;
1821 	int error;
1822 	size_t needfhsize;
1823 	size_t fidsize;
1824 
1825 	mp = vp->v_mount;
1826 	fidp = NULL;
1827 	if (*fh_size < FHANDLE_SIZE_MIN) {
1828 		fidsize = 0;
1829 	} else {
1830 		fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1831 		if (fhp != NULL) {
1832 			memset(fhp, 0, *fh_size);
1833 			fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1834 			fidp = &fhp->fh_fid;
1835 		}
1836 	}
1837 	error = VFS_VPTOFH(vp, fidp, &fidsize);
1838 	needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1839 	if (error == 0 && *fh_size < needfhsize) {
1840 		error = E2BIG;
1841 	}
1842 	*fh_size = needfhsize;
1843 	return error;
1844 }
1845 
1846 int
1847 vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1848 {
1849 	struct mount *mp;
1850 	fhandle_t *fhp;
1851 	size_t fhsize;
1852 	size_t fidsize;
1853 	int error;
1854 
1855 	mp = vp->v_mount;
1856 	fidsize = 0;
1857 	error = VFS_VPTOFH(vp, NULL, &fidsize);
1858 	KASSERT(error != 0);
1859 	if (error != E2BIG) {
1860 		goto out;
1861 	}
1862 	fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1863 	fhp = kmem_zalloc(fhsize, KM_SLEEP);
1864 	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1865 	error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1866 	if (error == 0) {
1867 		KASSERT((FHANDLE_SIZE(fhp) == fhsize &&
1868 		    FHANDLE_FILEID(fhp)->fid_len == fidsize));
1869 		*fhpp = fhp;
1870 	} else {
1871 		kmem_free(fhp, fhsize);
1872 	}
1873 out:
1874 	return error;
1875 }
1876 
1877 void
1878 vfs_composefh_free(fhandle_t *fhp)
1879 {
1880 
1881 	vfs__fhfree(fhp);
1882 }
1883 
1884 /*
1885  * vfs_fhtovp: lookup a vnode by a filehandle.
1886  */
1887 
1888 int
1889 vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
1890 {
1891 	struct mount *mp;
1892 	int error;
1893 
1894 	*vpp = NULL;
1895 	mp = vfs_getvfs(FHANDLE_FSID(fhp));
1896 	if (mp == NULL) {
1897 		error = ESTALE;
1898 		goto out;
1899 	}
1900 	if (mp->mnt_op->vfs_fhtovp == NULL) {
1901 		error = EOPNOTSUPP;
1902 		goto out;
1903 	}
1904 	error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), LK_EXCLUSIVE, vpp);
1905 out:
1906 	return error;
1907 }
1908 
1909 /*
1910  * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
1911  * the needed size.
1912  */
1913 
1914 int
1915 vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
1916 {
1917 	fhandle_t *fhp;
1918 	int error;
1919 
1920 	if (fhsize > FHANDLE_SIZE_MAX) {
1921 		return EINVAL;
1922 	}
1923 	if (fhsize < FHANDLE_SIZE_MIN) {
1924 		return EINVAL;
1925 	}
1926 again:
1927 	fhp = kmem_alloc(fhsize, KM_SLEEP);
1928 	error = copyin(ufhp, fhp, fhsize);
1929 	if (error == 0) {
1930 		/* XXX this check shouldn't be here */
1931 		if (FHANDLE_SIZE(fhp) == fhsize) {
1932 			*fhpp = fhp;
1933 			return 0;
1934 		} else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
1935 			/*
1936 			 * a kludge for nfsv2 padded handles.
1937 			 */
1938 			size_t sz;
1939 
1940 			sz = FHANDLE_SIZE(fhp);
1941 			kmem_free(fhp, fhsize);
1942 			fhsize = sz;
1943 			goto again;
1944 		} else {
1945 			/*
1946 			 * userland told us wrong size.
1947 			 */
1948 		    	error = EINVAL;
1949 		}
1950 	}
1951 	kmem_free(fhp, fhsize);
1952 	return error;
1953 }
1954 
1955 void
1956 vfs_copyinfh_free(fhandle_t *fhp)
1957 {
1958 
1959 	vfs__fhfree(fhp);
1960 }
1961 
1962 /*
1963  * Get file handle system call
1964  */
1965 int
1966 sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval)
1967 {
1968 	/* {
1969 		syscallarg(char *) fname;
1970 		syscallarg(fhandle_t *) fhp;
1971 		syscallarg(size_t *) fh_size;
1972 	} */
1973 	struct vnode *vp;
1974 	fhandle_t *fh;
1975 	int error;
1976 	struct pathbuf *pb;
1977 	struct nameidata nd;
1978 	size_t sz;
1979 	size_t usz;
1980 
1981 	/*
1982 	 * Must be super user
1983 	 */
1984 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1985 	    0, NULL, NULL, NULL);
1986 	if (error)
1987 		return (error);
1988 
1989 	error = pathbuf_copyin(SCARG(uap, fname), &pb);
1990 	if (error) {
1991 		return error;
1992 	}
1993 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
1994 	error = namei(&nd);
1995 	if (error) {
1996 		pathbuf_destroy(pb);
1997 		return error;
1998 	}
1999 	vp = nd.ni_vp;
2000 	pathbuf_destroy(pb);
2001 
2002 	error = vfs_composefh_alloc(vp, &fh);
2003 	vput(vp);
2004 	if (error != 0) {
2005 		return error;
2006 	}
2007 	error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
2008 	if (error != 0) {
2009 		goto out;
2010 	}
2011 	sz = FHANDLE_SIZE(fh);
2012 	error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
2013 	if (error != 0) {
2014 		goto out;
2015 	}
2016 	if (usz >= sz) {
2017 		error = copyout(fh, SCARG(uap, fhp), sz);
2018 	} else {
2019 		error = E2BIG;
2020 	}
2021 out:
2022 	vfs_composefh_free(fh);
2023 	return (error);
2024 }
2025 
2026 /*
2027  * Open a file given a file handle.
2028  *
2029  * Check permissions, allocate an open file structure,
2030  * and call the device open routine if any.
2031  */
2032 
2033 int
2034 dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
2035     register_t *retval)
2036 {
2037 	file_t *fp;
2038 	struct vnode *vp = NULL;
2039 	kauth_cred_t cred = l->l_cred;
2040 	file_t *nfp;
2041 	int indx, error;
2042 	struct vattr va;
2043 	fhandle_t *fh;
2044 	int flags;
2045 	proc_t *p;
2046 
2047 	p = curproc;
2048 
2049 	/*
2050 	 * Must be super user
2051 	 */
2052 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2053 	    0, NULL, NULL, NULL)))
2054 		return (error);
2055 
2056 	if (oflags & O_SEARCH) {
2057 		oflags &= ~(int)O_SEARCH;
2058 	}
2059 
2060 	flags = FFLAGS(oflags);
2061 	if ((flags & (FREAD | FWRITE)) == 0)
2062 		return (EINVAL);
2063 	if ((flags & O_CREAT))
2064 		return (EINVAL);
2065 	if ((error = fd_allocfile(&nfp, &indx)) != 0)
2066 		return (error);
2067 	fp = nfp;
2068 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2069 	if (error != 0) {
2070 		goto bad;
2071 	}
2072 	error = vfs_fhtovp(fh, &vp);
2073 	vfs_copyinfh_free(fh);
2074 	if (error != 0) {
2075 		goto bad;
2076 	}
2077 
2078 	/* Now do an effective vn_open */
2079 
2080 	if (vp->v_type == VSOCK) {
2081 		error = EOPNOTSUPP;
2082 		goto bad;
2083 	}
2084 	error = vn_openchk(vp, cred, flags);
2085 	if (error != 0)
2086 		goto bad;
2087 	if (flags & O_TRUNC) {
2088 		VOP_UNLOCK(vp);			/* XXX */
2089 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
2090 		vattr_null(&va);
2091 		va.va_size = 0;
2092 		error = VOP_SETATTR(vp, &va, cred);
2093 		if (error)
2094 			goto bad;
2095 	}
2096 	if ((error = VOP_OPEN(vp, flags, cred)) != 0)
2097 		goto bad;
2098 	if (flags & FWRITE) {
2099 		mutex_enter(vp->v_interlock);
2100 		vp->v_writecount++;
2101 		mutex_exit(vp->v_interlock);
2102 	}
2103 
2104 	/* done with modified vn_open, now finish what sys_open does. */
2105 	if ((error = open_setfp(l, fp, vp, indx, flags)))
2106 		return error;
2107 
2108 	VOP_UNLOCK(vp);
2109 	*retval = indx;
2110 	fd_affix(p, fp, indx);
2111 	return (0);
2112 
2113 bad:
2114 	fd_abort(p, fp, indx);
2115 	if (vp != NULL)
2116 		vput(vp);
2117 	return (error);
2118 }
2119 
2120 int
2121 sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval)
2122 {
2123 	/* {
2124 		syscallarg(const void *) fhp;
2125 		syscallarg(size_t) fh_size;
2126 		syscallarg(int) flags;
2127 	} */
2128 
2129 	return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
2130 	    SCARG(uap, flags), retval);
2131 }
2132 
2133 int
2134 do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
2135 {
2136 	int error;
2137 	fhandle_t *fh;
2138 	struct vnode *vp;
2139 
2140 	/*
2141 	 * Must be super user
2142 	 */
2143 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2144 	    0, NULL, NULL, NULL)))
2145 		return (error);
2146 
2147 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2148 	if (error != 0)
2149 		return error;
2150 
2151 	error = vfs_fhtovp(fh, &vp);
2152 	vfs_copyinfh_free(fh);
2153 	if (error != 0)
2154 		return error;
2155 
2156 	error = vn_stat(vp, sb);
2157 	vput(vp);
2158 	return error;
2159 }
2160 
2161 
2162 /* ARGSUSED */
2163 int
2164 sys___fhstat50(struct lwp *l, const struct sys___fhstat50_args *uap, register_t *retval)
2165 {
2166 	/* {
2167 		syscallarg(const void *) fhp;
2168 		syscallarg(size_t) fh_size;
2169 		syscallarg(struct stat *) sb;
2170 	} */
2171 	struct stat sb;
2172 	int error;
2173 
2174 	error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
2175 	if (error)
2176 		return error;
2177 	return copyout(&sb, SCARG(uap, sb), sizeof(sb));
2178 }
2179 
2180 int
2181 do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb,
2182     int flags)
2183 {
2184 	fhandle_t *fh;
2185 	struct mount *mp;
2186 	struct vnode *vp;
2187 	int error;
2188 
2189 	/*
2190 	 * Must be super user
2191 	 */
2192 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2193 	    0, NULL, NULL, NULL)))
2194 		return error;
2195 
2196 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2197 	if (error != 0)
2198 		return error;
2199 
2200 	error = vfs_fhtovp(fh, &vp);
2201 	vfs_copyinfh_free(fh);
2202 	if (error != 0)
2203 		return error;
2204 
2205 	mp = vp->v_mount;
2206 	error = dostatvfs(mp, sb, l, flags, 1);
2207 	vput(vp);
2208 	return error;
2209 }
2210 
2211 /* ARGSUSED */
2212 int
2213 sys___fhstatvfs190(struct lwp *l, const struct sys___fhstatvfs190_args *uap, register_t *retval)
2214 {
2215 	/* {
2216 		syscallarg(const void *) fhp;
2217 		syscallarg(size_t) fh_size;
2218 		syscallarg(struct statvfs *) buf;
2219 		syscallarg(int)	flags;
2220 	} */
2221 	struct statvfs *sb = STATVFSBUF_GET();
2222 	int error;
2223 
2224 	error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
2225 	    SCARG(uap, flags));
2226 	if (error == 0)
2227 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
2228 	STATVFSBUF_PUT(sb);
2229 	return error;
2230 }
2231 
2232 int
2233 do_posix_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
2234     dev_t dev)
2235 {
2236 
2237 	/*
2238 	 * The POSIX mknod(2) call is an alias for mkfifo(2) for S_IFIFO
2239 	 * in mode and dev=0.
2240 	 *
2241 	 * In all the other cases it's implementation defined behavior.
2242 	 */
2243 
2244 	if ((mode & S_IFIFO) && dev == 0)
2245 		return do_sys_mkfifoat(l, fdat, pathname, mode);
2246 	else
2247 		return do_sys_mknodat(l, fdat, pathname, mode, dev,
2248 		    UIO_USERSPACE);
2249 }
2250 
2251 /*
2252  * Create a special file.
2253  */
2254 /* ARGSUSED */
2255 int
2256 sys___mknod50(struct lwp *l, const struct sys___mknod50_args *uap,
2257     register_t *retval)
2258 {
2259 	/* {
2260 		syscallarg(const char *) path;
2261 		syscallarg(mode_t) mode;
2262 		syscallarg(dev_t) dev;
2263 	} */
2264 	return do_posix_mknodat(l, AT_FDCWD, SCARG(uap, path),
2265 	    SCARG(uap, mode), SCARG(uap, dev));
2266 }
2267 
2268 int
2269 sys_mknodat(struct lwp *l, const struct sys_mknodat_args *uap,
2270     register_t *retval)
2271 {
2272 	/* {
2273 		syscallarg(int) fd;
2274 		syscallarg(const char *) path;
2275 		syscallarg(mode_t) mode;
2276 		syscallarg(int) pad;
2277 		syscallarg(dev_t) dev;
2278 	} */
2279 
2280 	return do_posix_mknodat(l, SCARG(uap, fd), SCARG(uap, path),
2281 	    SCARG(uap, mode), SCARG(uap, dev));
2282 }
2283 
2284 int
2285 do_sys_mknod(struct lwp *l, const char *pathname, mode_t mode, dev_t dev,
2286     enum uio_seg seg)
2287 {
2288 	return do_sys_mknodat(l, AT_FDCWD, pathname, mode, dev, seg);
2289 }
2290 
2291 int
2292 do_sys_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
2293     dev_t dev, enum uio_seg seg)
2294 {
2295 	struct proc *p = l->l_proc;
2296 	struct vnode *vp;
2297 	struct vattr vattr;
2298 	int error, optype;
2299 	struct pathbuf *pb;
2300 	struct nameidata nd;
2301 	const char *pathstring;
2302 
2303 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
2304 	    0, NULL, NULL, NULL)) != 0)
2305 		return (error);
2306 
2307 	optype = VOP_MKNOD_DESCOFFSET;
2308 
2309 	error = pathbuf_maybe_copyin(pathname, seg, &pb);
2310 	if (error) {
2311 		return error;
2312 	}
2313 	pathstring = pathbuf_stringcopy_get(pb);
2314 	if (pathstring == NULL) {
2315 		pathbuf_destroy(pb);
2316 		return ENOMEM;
2317 	}
2318 
2319 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2320 
2321 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2322 		goto out;
2323 	vp = nd.ni_vp;
2324 
2325 	if (vp != NULL)
2326 		error = EEXIST;
2327 	else {
2328 		vattr_null(&vattr);
2329 		/* We will read cwdi->cwdi_cmask unlocked. */
2330 		vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2331 		vattr.va_rdev = dev;
2332 
2333 		switch (mode & S_IFMT) {
2334 		case S_IFMT:	/* used by badsect to flag bad sectors */
2335 			vattr.va_type = VBAD;
2336 			break;
2337 		case S_IFCHR:
2338 			vattr.va_type = VCHR;
2339 			break;
2340 		case S_IFBLK:
2341 			vattr.va_type = VBLK;
2342 			break;
2343 		case S_IFWHT:
2344 			optype = VOP_WHITEOUT_DESCOFFSET;
2345 			break;
2346 		case S_IFREG:
2347 #if NVERIEXEC > 0
2348 			error = veriexec_openchk(l, nd.ni_vp, pathstring,
2349 			    O_CREAT);
2350 #endif /* NVERIEXEC > 0 */
2351 			vattr.va_type = VREG;
2352 			vattr.va_rdev = VNOVAL;
2353 			optype = VOP_CREATE_DESCOFFSET;
2354 			break;
2355 		default:
2356 			error = EINVAL;
2357 			break;
2358 		}
2359 
2360 		if (error == 0 && optype == VOP_MKNOD_DESCOFFSET &&
2361 		    vattr.va_rdev == VNOVAL)
2362 			error = EINVAL;
2363 	}
2364 
2365 	if (!error) {
2366 		switch (optype) {
2367 		case VOP_WHITEOUT_DESCOFFSET:
2368 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
2369 			if (error)
2370 				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2371 			vput(nd.ni_dvp);
2372 			break;
2373 
2374 		case VOP_MKNOD_DESCOFFSET:
2375 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
2376 						&nd.ni_cnd, &vattr);
2377 			if (error == 0)
2378 				vrele(nd.ni_vp);
2379 			vput(nd.ni_dvp);
2380 			break;
2381 
2382 		case VOP_CREATE_DESCOFFSET:
2383 			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
2384 						&nd.ni_cnd, &vattr);
2385 			if (error == 0)
2386 				vrele(nd.ni_vp);
2387 			vput(nd.ni_dvp);
2388 			break;
2389 		}
2390 	} else {
2391 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2392 		if (nd.ni_dvp == vp)
2393 			vrele(nd.ni_dvp);
2394 		else
2395 			vput(nd.ni_dvp);
2396 		if (vp)
2397 			vrele(vp);
2398 	}
2399 out:
2400 	pathbuf_stringcopy_put(pb, pathstring);
2401 	pathbuf_destroy(pb);
2402 	return (error);
2403 }
2404 
2405 /*
2406  * Create a named pipe.
2407  */
2408 /* ARGSUSED */
2409 int
2410 sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval)
2411 {
2412 	/* {
2413 		syscallarg(const char *) path;
2414 		syscallarg(int) mode;
2415 	} */
2416 	return do_sys_mkfifoat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, mode));
2417 }
2418 
2419 int
2420 sys_mkfifoat(struct lwp *l, const struct sys_mkfifoat_args *uap,
2421     register_t *retval)
2422 {
2423 	/* {
2424 		syscallarg(int) fd;
2425 		syscallarg(const char *) path;
2426 		syscallarg(int) mode;
2427 	} */
2428 
2429 	return do_sys_mkfifoat(l, SCARG(uap, fd), SCARG(uap, path),
2430 	    SCARG(uap, mode));
2431 }
2432 
2433 static int
2434 do_sys_mkfifoat(struct lwp *l, int fdat, const char *path, mode_t mode)
2435 {
2436 	struct proc *p = l->l_proc;
2437 	struct vattr vattr;
2438 	int error;
2439 	struct pathbuf *pb;
2440 	struct nameidata nd;
2441 
2442 	error = pathbuf_copyin(path, &pb);
2443 	if (error) {
2444 		return error;
2445 	}
2446 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2447 
2448 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
2449 		pathbuf_destroy(pb);
2450 		return error;
2451 	}
2452 	if (nd.ni_vp != NULL) {
2453 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2454 		if (nd.ni_dvp == nd.ni_vp)
2455 			vrele(nd.ni_dvp);
2456 		else
2457 			vput(nd.ni_dvp);
2458 		vrele(nd.ni_vp);
2459 		pathbuf_destroy(pb);
2460 		return (EEXIST);
2461 	}
2462 	vattr_null(&vattr);
2463 	vattr.va_type = VFIFO;
2464 	/* We will read cwdi->cwdi_cmask unlocked. */
2465 	vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2466 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
2467 	if (error == 0)
2468 		vrele(nd.ni_vp);
2469 	vput(nd.ni_dvp);
2470 	pathbuf_destroy(pb);
2471 	return (error);
2472 }
2473 
2474 /*
2475  * Make a hard file link.
2476  */
2477 /* ARGSUSED */
2478 int
2479 do_sys_linkat(struct lwp *l, int fdpath, const char *path, int fdlink,
2480     const char *link, int follow, register_t *retval)
2481 {
2482 	struct vnode *vp;
2483 	struct pathbuf *linkpb;
2484 	struct nameidata nd;
2485 	namei_simple_flags_t ns_flags;
2486 	int error;
2487 
2488 	if (follow & AT_SYMLINK_FOLLOW)
2489 		ns_flags = NSM_FOLLOW_TRYEMULROOT;
2490 	else
2491 		ns_flags = NSM_NOFOLLOW_TRYEMULROOT;
2492 
2493 	error = fd_nameiat_simple_user(l, fdpath, path, ns_flags, &vp);
2494 	if (error != 0)
2495 		return (error);
2496 	error = pathbuf_copyin(link, &linkpb);
2497 	if (error) {
2498 		goto out1;
2499 	}
2500 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2501 	if ((error = fd_nameiat(l, fdlink, &nd)) != 0)
2502 		goto out2;
2503 	if (nd.ni_vp) {
2504 		error = EEXIST;
2505 		goto abortop;
2506 	}
2507 	/* Prevent hard links on directories. */
2508 	if (vp->v_type == VDIR) {
2509 		error = EPERM;
2510 		goto abortop;
2511 	}
2512 	/* Prevent cross-mount operation. */
2513 	if (nd.ni_dvp->v_mount != vp->v_mount) {
2514 		error = EXDEV;
2515 		goto abortop;
2516 	}
2517 	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
2518 	VOP_UNLOCK(nd.ni_dvp);
2519 	vrele(nd.ni_dvp);
2520 out2:
2521 	pathbuf_destroy(linkpb);
2522 out1:
2523 	vrele(vp);
2524 	return (error);
2525 abortop:
2526 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2527 	if (nd.ni_dvp == nd.ni_vp)
2528 		vrele(nd.ni_dvp);
2529 	else
2530 		vput(nd.ni_dvp);
2531 	if (nd.ni_vp != NULL)
2532 		vrele(nd.ni_vp);
2533 	goto out2;
2534 }
2535 
2536 int
2537 sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
2538 {
2539 	/* {
2540 		syscallarg(const char *) path;
2541 		syscallarg(const char *) link;
2542 	} */
2543 	const char *path = SCARG(uap, path);
2544 	const char *link = SCARG(uap, link);
2545 
2546 	return do_sys_linkat(l, AT_FDCWD, path, AT_FDCWD, link,
2547 	    AT_SYMLINK_FOLLOW, retval);
2548 }
2549 
2550 int
2551 sys_linkat(struct lwp *l, const struct sys_linkat_args *uap,
2552     register_t *retval)
2553 {
2554 	/* {
2555 		syscallarg(int) fd1;
2556 		syscallarg(const char *) name1;
2557 		syscallarg(int) fd2;
2558 		syscallarg(const char *) name2;
2559 		syscallarg(int) flags;
2560 	} */
2561 	int fd1 = SCARG(uap, fd1);
2562 	const char *name1 = SCARG(uap, name1);
2563 	int fd2 = SCARG(uap, fd2);
2564 	const char *name2 = SCARG(uap, name2);
2565 	int follow;
2566 
2567 	follow = SCARG(uap, flags) & AT_SYMLINK_FOLLOW;
2568 
2569 	return do_sys_linkat(l, fd1, name1, fd2, name2, follow, retval);
2570 }
2571 
2572 
2573 int
2574 do_sys_symlink(const char *patharg, const char *link, enum uio_seg seg)
2575 {
2576 	return do_sys_symlinkat(NULL, patharg, AT_FDCWD, link, seg);
2577 }
2578 
2579 static int
2580 do_sys_symlinkat(struct lwp *l, const char *patharg, int fdat,
2581     const char *link, enum uio_seg seg)
2582 {
2583 	struct proc *p = curproc;
2584 	struct vattr vattr;
2585 	char *path;
2586 	int error;
2587 	size_t len;
2588 	struct pathbuf *linkpb;
2589 	struct nameidata nd;
2590 
2591 	KASSERT(l != NULL || fdat == AT_FDCWD);
2592 
2593 	path = PNBUF_GET();
2594 	if (seg == UIO_USERSPACE) {
2595 		if ((error = copyinstr(patharg, path, MAXPATHLEN, &len)) != 0)
2596 			goto out1;
2597 		if ((error = pathbuf_copyin(link, &linkpb)) != 0)
2598 			goto out1;
2599 	} else {
2600 		len = strlen(patharg) + 1;
2601 		KASSERT(len <= MAXPATHLEN);
2602 		memcpy(path, patharg, len);
2603 		linkpb = pathbuf_create(link);
2604 		if (linkpb == NULL) {
2605 			error = ENOMEM;
2606 			goto out1;
2607 		}
2608 	}
2609 	ktrkuser("symlink-target", path, len - 1);
2610 
2611 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2612 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2613 		goto out2;
2614 	if (nd.ni_vp) {
2615 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2616 		if (nd.ni_dvp == nd.ni_vp)
2617 			vrele(nd.ni_dvp);
2618 		else
2619 			vput(nd.ni_dvp);
2620 		vrele(nd.ni_vp);
2621 		error = EEXIST;
2622 		goto out2;
2623 	}
2624 	vattr_null(&vattr);
2625 	vattr.va_type = VLNK;
2626 	/* We will read cwdi->cwdi_cmask unlocked. */
2627 	vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
2628 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
2629 	if (error == 0)
2630 		vrele(nd.ni_vp);
2631 	vput(nd.ni_dvp);
2632 out2:
2633 	pathbuf_destroy(linkpb);
2634 out1:
2635 	PNBUF_PUT(path);
2636 	return (error);
2637 }
2638 
2639 /*
2640  * Make a symbolic link.
2641  */
2642 /* ARGSUSED */
2643 int
2644 sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
2645 {
2646 	/* {
2647 		syscallarg(const char *) path;
2648 		syscallarg(const char *) link;
2649 	} */
2650 
2651 	return do_sys_symlinkat(l, SCARG(uap, path), AT_FDCWD, SCARG(uap, link),
2652 	    UIO_USERSPACE);
2653 }
2654 
2655 int
2656 sys_symlinkat(struct lwp *l, const struct sys_symlinkat_args *uap,
2657     register_t *retval)
2658 {
2659 	/* {
2660 		syscallarg(const char *) path1;
2661 		syscallarg(int) fd;
2662 		syscallarg(const char *) path2;
2663 	} */
2664 
2665 	return do_sys_symlinkat(l, SCARG(uap, path1), SCARG(uap, fd),
2666 	    SCARG(uap, path2), UIO_USERSPACE);
2667 }
2668 
2669 /*
2670  * Delete a whiteout from the filesystem.
2671  */
2672 /* ARGSUSED */
2673 int
2674 sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval)
2675 {
2676 	/* {
2677 		syscallarg(const char *) path;
2678 	} */
2679 	int error;
2680 	struct pathbuf *pb;
2681 	struct nameidata nd;
2682 
2683 	error = pathbuf_copyin(SCARG(uap, path), &pb);
2684 	if (error) {
2685 		return error;
2686 	}
2687 
2688 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT, pb);
2689 	error = namei(&nd);
2690 	if (error) {
2691 		pathbuf_destroy(pb);
2692 		return (error);
2693 	}
2694 
2695 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2696 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2697 		if (nd.ni_dvp == nd.ni_vp)
2698 			vrele(nd.ni_dvp);
2699 		else
2700 			vput(nd.ni_dvp);
2701 		if (nd.ni_vp)
2702 			vrele(nd.ni_vp);
2703 		pathbuf_destroy(pb);
2704 		return (EEXIST);
2705 	}
2706 	if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
2707 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2708 	vput(nd.ni_dvp);
2709 	pathbuf_destroy(pb);
2710 	return (error);
2711 }
2712 
2713 /*
2714  * Delete a name from the filesystem.
2715  */
2716 /* ARGSUSED */
2717 int
2718 sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval)
2719 {
2720 	/* {
2721 		syscallarg(const char *) path;
2722 	} */
2723 
2724 	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path), 0, UIO_USERSPACE);
2725 }
2726 
2727 int
2728 sys_unlinkat(struct lwp *l, const struct sys_unlinkat_args *uap,
2729     register_t *retval)
2730 {
2731 	/* {
2732 		syscallarg(int) fd;
2733 		syscallarg(const char *) path;
2734 		syscallarg(int) flag;
2735 	} */
2736 
2737 	return do_sys_unlinkat(l, SCARG(uap, fd), SCARG(uap, path),
2738 	    SCARG(uap, flag), UIO_USERSPACE);
2739 }
2740 
2741 int
2742 do_sys_unlink(const char *arg, enum uio_seg seg)
2743 {
2744 	return do_sys_unlinkat(NULL, AT_FDCWD, arg, 0, seg);
2745 }
2746 
2747 static int
2748 do_sys_unlinkat(struct lwp *l, int fdat, const char *arg, int flags,
2749     enum uio_seg seg)
2750 {
2751 	struct vnode *vp;
2752 	int error;
2753 	struct pathbuf *pb;
2754 	struct nameidata nd;
2755 	const char *pathstring;
2756 
2757 	KASSERT(l != NULL || fdat == AT_FDCWD);
2758 
2759 	error = pathbuf_maybe_copyin(arg, seg, &pb);
2760 	if (error) {
2761 		return error;
2762 	}
2763 	pathstring = pathbuf_stringcopy_get(pb);
2764 	if (pathstring == NULL) {
2765 		pathbuf_destroy(pb);
2766 		return ENOMEM;
2767 	}
2768 
2769 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, pb);
2770 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2771 		goto out;
2772 	vp = nd.ni_vp;
2773 
2774 	/*
2775 	 * The root of a mounted filesystem cannot be deleted.
2776 	 */
2777 	if ((vp->v_vflag & VV_ROOT) != 0) {
2778 		error = EBUSY;
2779 		goto abort;
2780 	}
2781 
2782 	if ((vp->v_type == VDIR) && (vp->v_mountedhere != NULL)) {
2783 		error = EBUSY;
2784 		goto abort;
2785 	}
2786 
2787 	/*
2788 	 * No rmdir "." please.
2789 	 */
2790 	if (nd.ni_dvp == vp) {
2791 		error = EINVAL;
2792 		goto abort;
2793 	}
2794 
2795 	/*
2796 	 * AT_REMOVEDIR is required to remove a directory
2797 	 */
2798 	if (vp->v_type == VDIR) {
2799 		if (!(flags & AT_REMOVEDIR)) {
2800 			error = EPERM;
2801 			goto abort;
2802 		} else {
2803 			error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2804 			vput(nd.ni_dvp);
2805 			goto out;
2806 		}
2807 	}
2808 
2809 	/*
2810 	 * Starting here we only deal with non directories.
2811 	 */
2812 	if (flags & AT_REMOVEDIR) {
2813 		error = ENOTDIR;
2814 		goto abort;
2815 	}
2816 
2817 #if NVERIEXEC > 0
2818 	/* Handle remove requests for veriexec entries. */
2819 	if ((error = veriexec_removechk(curlwp, nd.ni_vp, pathstring)) != 0) {
2820 		goto abort;
2821 	}
2822 #endif /* NVERIEXEC > 0 */
2823 
2824 #ifdef FILEASSOC
2825 	(void)fileassoc_file_delete(vp);
2826 #endif /* FILEASSOC */
2827 	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2828 	vput(nd.ni_dvp);
2829 	goto out;
2830 
2831 abort:
2832 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2833 	if (nd.ni_dvp == vp)
2834 		vrele(nd.ni_dvp);
2835 	else
2836 		vput(nd.ni_dvp);
2837 	vput(vp);
2838 
2839 out:
2840 	pathbuf_stringcopy_put(pb, pathstring);
2841 	pathbuf_destroy(pb);
2842 	return (error);
2843 }
2844 
2845 /*
2846  * Reposition read/write file offset.
2847  */
2848 int
2849 sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
2850 {
2851 	/* {
2852 		syscallarg(int) fd;
2853 		syscallarg(int) pad;
2854 		syscallarg(off_t) offset;
2855 		syscallarg(int) whence;
2856 	} */
2857 	kauth_cred_t cred = l->l_cred;
2858 	file_t *fp;
2859 	struct vnode *vp;
2860 	struct vattr vattr;
2861 	off_t newoff;
2862 	int error, fd;
2863 
2864 	fd = SCARG(uap, fd);
2865 
2866 	if ((fp = fd_getfile(fd)) == NULL)
2867 		return (EBADF);
2868 
2869 	vp = fp->f_vnode;
2870 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2871 		error = ESPIPE;
2872 		goto out;
2873 	}
2874 
2875 	vn_lock(vp, LK_SHARED | LK_RETRY);
2876 
2877 	switch (SCARG(uap, whence)) {
2878 	case SEEK_CUR:
2879 		newoff = fp->f_offset + SCARG(uap, offset);
2880 		break;
2881 	case SEEK_END:
2882 		error = VOP_GETATTR(vp, &vattr, cred);
2883 		if (error) {
2884 			VOP_UNLOCK(vp);
2885 			goto out;
2886 		}
2887 		newoff = SCARG(uap, offset) + vattr.va_size;
2888 		break;
2889 	case SEEK_SET:
2890 		newoff = SCARG(uap, offset);
2891 		break;
2892 	default:
2893 		error = EINVAL;
2894 		VOP_UNLOCK(vp);
2895 		goto out;
2896 	}
2897 	VOP_UNLOCK(vp);
2898 	if ((error = VOP_SEEK(vp, fp->f_offset, newoff, cred)) == 0) {
2899 		*(off_t *)retval = fp->f_offset = newoff;
2900 	}
2901  out:
2902  	fd_putfile(fd);
2903 	return (error);
2904 }
2905 
2906 /*
2907  * Positional read system call.
2908  */
2909 int
2910 sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
2911 {
2912 	/* {
2913 		syscallarg(int) fd;
2914 		syscallarg(void *) buf;
2915 		syscallarg(size_t) nbyte;
2916 		syscallarg(off_t) offset;
2917 	} */
2918 	file_t *fp;
2919 	struct vnode *vp;
2920 	off_t offset;
2921 	int error, fd = SCARG(uap, fd);
2922 
2923 	if ((fp = fd_getfile(fd)) == NULL)
2924 		return (EBADF);
2925 
2926 	if ((fp->f_flag & FREAD) == 0) {
2927 		fd_putfile(fd);
2928 		return (EBADF);
2929 	}
2930 
2931 	vp = fp->f_vnode;
2932 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2933 		error = ESPIPE;
2934 		goto out;
2935 	}
2936 
2937 	offset = SCARG(uap, offset);
2938 
2939 	/*
2940 	 * XXX This works because no file systems actually
2941 	 * XXX take any action on the seek operation.
2942 	 */
2943 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2944 		goto out;
2945 
2946 	/* dofileread() will unuse the descriptor for us */
2947 	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2948 	    &offset, 0, retval));
2949 
2950  out:
2951 	fd_putfile(fd);
2952 	return (error);
2953 }
2954 
2955 /*
2956  * Positional scatter read system call.
2957  */
2958 int
2959 sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval)
2960 {
2961 	/* {
2962 		syscallarg(int) fd;
2963 		syscallarg(const struct iovec *) iovp;
2964 		syscallarg(int) iovcnt;
2965 		syscallarg(off_t) offset;
2966 	} */
2967 	off_t offset = SCARG(uap, offset);
2968 
2969 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
2970 	    SCARG(uap, iovcnt), &offset, 0, retval);
2971 }
2972 
2973 /*
2974  * Positional write system call.
2975  */
2976 int
2977 sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval)
2978 {
2979 	/* {
2980 		syscallarg(int) fd;
2981 		syscallarg(const void *) buf;
2982 		syscallarg(size_t) nbyte;
2983 		syscallarg(off_t) offset;
2984 	} */
2985 	file_t *fp;
2986 	struct vnode *vp;
2987 	off_t offset;
2988 	int error, fd = SCARG(uap, fd);
2989 
2990 	if ((fp = fd_getfile(fd)) == NULL)
2991 		return (EBADF);
2992 
2993 	if ((fp->f_flag & FWRITE) == 0) {
2994 		fd_putfile(fd);
2995 		return (EBADF);
2996 	}
2997 
2998 	vp = fp->f_vnode;
2999 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
3000 		error = ESPIPE;
3001 		goto out;
3002 	}
3003 
3004 	offset = SCARG(uap, offset);
3005 
3006 	/*
3007 	 * XXX This works because no file systems actually
3008 	 * XXX take any action on the seek operation.
3009 	 */
3010 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
3011 		goto out;
3012 
3013 	/* dofilewrite() will unuse the descriptor for us */
3014 	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
3015 	    &offset, 0, retval));
3016 
3017  out:
3018 	fd_putfile(fd);
3019 	return (error);
3020 }
3021 
3022 /*
3023  * Positional gather write system call.
3024  */
3025 int
3026 sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval)
3027 {
3028 	/* {
3029 		syscallarg(int) fd;
3030 		syscallarg(const struct iovec *) iovp;
3031 		syscallarg(int) iovcnt;
3032 		syscallarg(off_t) offset;
3033 	} */
3034 	off_t offset = SCARG(uap, offset);
3035 
3036 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
3037 	    SCARG(uap, iovcnt), &offset, 0, retval);
3038 }
3039 
3040 /*
3041  * Check access permissions.
3042  */
3043 int
3044 sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval)
3045 {
3046 	/* {
3047 		syscallarg(const char *) path;
3048 		syscallarg(int) flags;
3049 	} */
3050 
3051 	return do_sys_accessat(l, AT_FDCWD, SCARG(uap, path),
3052 	     SCARG(uap, flags), 0);
3053 }
3054 
3055 int
3056 do_sys_accessat(struct lwp *l, int fdat, const char *path,
3057     int mode, int flags)
3058 {
3059 	kauth_cred_t cred;
3060 	struct vnode *vp;
3061 	int error, nd_flag, vmode;
3062 	struct pathbuf *pb;
3063 	struct nameidata nd;
3064 
3065 	CTASSERT(F_OK == 0);
3066 	if ((mode & ~(R_OK | W_OK | X_OK)) != 0) {
3067 		/* nonsense mode */
3068 		return EINVAL;
3069 	}
3070 
3071 	nd_flag = FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT;
3072 	if (flags & AT_SYMLINK_NOFOLLOW)
3073 		nd_flag &= ~FOLLOW;
3074 
3075 	error = pathbuf_copyin(path, &pb);
3076 	if (error)
3077 		return error;
3078 
3079 	NDINIT(&nd, LOOKUP, nd_flag, pb);
3080 
3081 	/* Override default credentials */
3082 	cred = kauth_cred_dup(l->l_cred);
3083 	if (!(flags & AT_EACCESS)) {
3084 		kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
3085 		kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
3086 	}
3087 	nd.ni_cnd.cn_cred = cred;
3088 
3089 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
3090 		pathbuf_destroy(pb);
3091 		goto out;
3092 	}
3093 	vp = nd.ni_vp;
3094 	pathbuf_destroy(pb);
3095 
3096 	/* Flags == 0 means only check for existence. */
3097 	if (mode) {
3098 		vmode = 0;
3099 		if (mode & R_OK)
3100 			vmode |= VREAD;
3101 		if (mode & W_OK)
3102 			vmode |= VWRITE;
3103 		if (mode & X_OK)
3104 			vmode |= VEXEC;
3105 
3106 		error = VOP_ACCESS(vp, vmode, cred);
3107 		if (!error && (vmode & VWRITE))
3108 			error = vn_writechk(vp);
3109 	}
3110 	vput(vp);
3111 out:
3112 	kauth_cred_free(cred);
3113 	return (error);
3114 }
3115 
3116 int
3117 sys_faccessat(struct lwp *l, const struct sys_faccessat_args *uap,
3118     register_t *retval)
3119 {
3120 	/* {
3121 		syscallarg(int) fd;
3122 		syscallarg(const char *) path;
3123 		syscallarg(int) amode;
3124 		syscallarg(int) flag;
3125 	} */
3126 
3127 	return do_sys_accessat(l, SCARG(uap, fd), SCARG(uap, path),
3128 	     SCARG(uap, amode), SCARG(uap, flag));
3129 }
3130 
3131 /*
3132  * Common code for all sys_stat functions, including compat versions.
3133  */
3134 int
3135 do_sys_stat(const char *userpath, unsigned int nd_flag,
3136     struct stat *sb)
3137 {
3138 	return do_sys_statat(NULL, AT_FDCWD, userpath, nd_flag, sb);
3139 }
3140 
3141 int
3142 do_sys_statat(struct lwp *l, int fdat, const char *userpath,
3143     unsigned int nd_flag, struct stat *sb)
3144 {
3145 	int error;
3146 	struct pathbuf *pb;
3147 	struct nameidata nd;
3148 
3149 	KASSERT(l != NULL || fdat == AT_FDCWD);
3150 
3151 	error = pathbuf_copyin(userpath, &pb);
3152 	if (error) {
3153 		return error;
3154 	}
3155 
3156 	NDINIT(&nd, LOOKUP, nd_flag | LOCKLEAF | TRYEMULROOT, pb);
3157 
3158 	error = fd_nameiat(l, fdat, &nd);
3159 	if (error != 0) {
3160 		pathbuf_destroy(pb);
3161 		return error;
3162 	}
3163 	error = vn_stat(nd.ni_vp, sb);
3164 	vput(nd.ni_vp);
3165 	pathbuf_destroy(pb);
3166 	return error;
3167 }
3168 
3169 /*
3170  * Get file status; this version follows links.
3171  */
3172 /* ARGSUSED */
3173 int
3174 sys___stat50(struct lwp *l, const struct sys___stat50_args *uap, register_t *retval)
3175 {
3176 	/* {
3177 		syscallarg(const char *) path;
3178 		syscallarg(struct stat *) ub;
3179 	} */
3180 	struct stat sb;
3181 	int error;
3182 
3183 	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), FOLLOW, &sb);
3184 	if (error)
3185 		return error;
3186 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3187 }
3188 
3189 /*
3190  * Get file status; this version does not follow links.
3191  */
3192 /* ARGSUSED */
3193 int
3194 sys___lstat50(struct lwp *l, const struct sys___lstat50_args *uap, register_t *retval)
3195 {
3196 	/* {
3197 		syscallarg(const char *) path;
3198 		syscallarg(struct stat *) ub;
3199 	} */
3200 	struct stat sb;
3201 	int error;
3202 
3203 	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), NOFOLLOW, &sb);
3204 	if (error)
3205 		return error;
3206 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3207 }
3208 
3209 int
3210 sys_fstatat(struct lwp *l, const struct sys_fstatat_args *uap,
3211     register_t *retval)
3212 {
3213 	/* {
3214 		syscallarg(int) fd;
3215 		syscallarg(const char *) path;
3216 		syscallarg(struct stat *) buf;
3217 		syscallarg(int) flag;
3218 	} */
3219 	unsigned int nd_flag;
3220 	struct stat sb;
3221 	int error;
3222 
3223 	if (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW)
3224 		nd_flag = NOFOLLOW;
3225 	else
3226 		nd_flag = FOLLOW;
3227 
3228 	error = do_sys_statat(l, SCARG(uap, fd), SCARG(uap, path), nd_flag,
3229 	    &sb);
3230 	if (error)
3231 		return error;
3232 	return copyout(&sb, SCARG(uap, buf), sizeof(sb));
3233 }
3234 
3235 static int
3236 kern_pathconf(register_t *retval, const char *path, int name, int flag)
3237 {
3238 	int error;
3239 	struct pathbuf *pb;
3240 	struct nameidata nd;
3241 
3242 	error = pathbuf_copyin(path, &pb);
3243 	if (error) {
3244 		return error;
3245 	}
3246 	NDINIT(&nd, LOOKUP, flag | LOCKLEAF | TRYEMULROOT, pb);
3247 	if ((error = namei(&nd)) != 0) {
3248 		pathbuf_destroy(pb);
3249 		return error;
3250 	}
3251 	error = VOP_PATHCONF(nd.ni_vp, name, retval);
3252 	vput(nd.ni_vp);
3253 	pathbuf_destroy(pb);
3254 	return error;
3255 }
3256 
3257 /*
3258  * Get configurable pathname variables.
3259  */
3260 /* ARGSUSED */
3261 int
3262 sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap,
3263     register_t *retval)
3264 {
3265 	/* {
3266 		syscallarg(const char *) path;
3267 		syscallarg(int) name;
3268 	} */
3269 	return kern_pathconf(retval, SCARG(uap, path), SCARG(uap, name),
3270 	    FOLLOW);
3271 }
3272 
3273 /* ARGSUSED */
3274 int
3275 sys_lpathconf(struct lwp *l, const struct sys_lpathconf_args *uap,
3276     register_t *retval)
3277 {
3278 	/* {
3279 		syscallarg(const char *) path;
3280 		syscallarg(int) name;
3281 	} */
3282 	return kern_pathconf(retval, SCARG(uap, path), SCARG(uap, name),
3283 	    NOFOLLOW);
3284 }
3285 
3286 /*
3287  * Return target name of a symbolic link.
3288  */
3289 /* ARGSUSED */
3290 int
3291 sys_readlink(struct lwp *l, const struct sys_readlink_args *uap,
3292     register_t *retval)
3293 {
3294 	/* {
3295 		syscallarg(const char *) path;
3296 		syscallarg(char *) buf;
3297 		syscallarg(size_t) count;
3298 	} */
3299 	return do_sys_readlinkat(l, AT_FDCWD, SCARG(uap, path),
3300 	    SCARG(uap, buf), SCARG(uap, count), retval);
3301 }
3302 
3303 static int
3304 do_sys_readlinkat(struct lwp *l, int fdat, const char *path, char *buf,
3305     size_t count, register_t *retval)
3306 {
3307 	struct vnode *vp;
3308 	struct iovec aiov;
3309 	struct uio auio;
3310 	int error;
3311 	struct pathbuf *pb;
3312 	struct nameidata nd;
3313 
3314 	error = pathbuf_copyin(path, &pb);
3315 	if (error) {
3316 		return error;
3317 	}
3318 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb);
3319 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
3320 		pathbuf_destroy(pb);
3321 		return error;
3322 	}
3323 	vp = nd.ni_vp;
3324 	pathbuf_destroy(pb);
3325 	if (vp->v_type != VLNK)
3326 		error = EINVAL;
3327 	else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
3328 	    (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
3329 		aiov.iov_base = buf;
3330 		aiov.iov_len = count;
3331 		auio.uio_iov = &aiov;
3332 		auio.uio_iovcnt = 1;
3333 		auio.uio_offset = 0;
3334 		auio.uio_rw = UIO_READ;
3335 		KASSERT(l == curlwp);
3336 		auio.uio_vmspace = l->l_proc->p_vmspace;
3337 		auio.uio_resid = count;
3338 		if ((error = VOP_READLINK(vp, &auio, l->l_cred)) == 0)
3339 			*retval = count - auio.uio_resid;
3340 	}
3341 	vput(vp);
3342 	return (error);
3343 }
3344 
3345 int
3346 sys_readlinkat(struct lwp *l, const struct sys_readlinkat_args *uap,
3347     register_t *retval)
3348 {
3349 	/* {
3350 		syscallarg(int) fd;
3351 		syscallarg(const char *) path;
3352 		syscallarg(char *) buf;
3353 		syscallarg(size_t) bufsize;
3354 	} */
3355 
3356 	return do_sys_readlinkat(l, SCARG(uap, fd), SCARG(uap, path),
3357 	    SCARG(uap, buf), SCARG(uap, bufsize), retval);
3358 }
3359 
3360 /*
3361  * Change flags of a file given a path name.
3362  */
3363 /* ARGSUSED */
3364 int
3365 sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval)
3366 {
3367 	/* {
3368 		syscallarg(const char *) path;
3369 		syscallarg(u_long) flags;
3370 	} */
3371 	struct vnode *vp;
3372 	int error;
3373 
3374 	error = namei_simple_user(SCARG(uap, path),
3375 				NSM_FOLLOW_TRYEMULROOT, &vp);
3376 	if (error != 0)
3377 		return (error);
3378 	error = change_flags(vp, SCARG(uap, flags), l);
3379 	vput(vp);
3380 	return (error);
3381 }
3382 
3383 /*
3384  * Change flags of a file given a file descriptor.
3385  */
3386 /* ARGSUSED */
3387 int
3388 sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval)
3389 {
3390 	/* {
3391 		syscallarg(int) fd;
3392 		syscallarg(u_long) flags;
3393 	} */
3394 	struct vnode *vp;
3395 	file_t *fp;
3396 	int error;
3397 
3398 	/* fd_getvnode() will use the descriptor for us */
3399 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3400 		return (error);
3401 	vp = fp->f_vnode;
3402 	error = change_flags(vp, SCARG(uap, flags), l);
3403 	VOP_UNLOCK(vp);
3404 	fd_putfile(SCARG(uap, fd));
3405 	return (error);
3406 }
3407 
3408 /*
3409  * Change flags of a file given a path name; this version does
3410  * not follow links.
3411  */
3412 int
3413 sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval)
3414 {
3415 	/* {
3416 		syscallarg(const char *) path;
3417 		syscallarg(u_long) flags;
3418 	} */
3419 	struct vnode *vp;
3420 	int error;
3421 
3422 	error = namei_simple_user(SCARG(uap, path),
3423 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3424 	if (error != 0)
3425 		return (error);
3426 	error = change_flags(vp, SCARG(uap, flags), l);
3427 	vput(vp);
3428 	return (error);
3429 }
3430 
3431 /*
3432  * Common routine to change flags of a file.
3433  */
3434 int
3435 change_flags(struct vnode *vp, u_long flags, struct lwp *l)
3436 {
3437 	struct vattr vattr;
3438 	int error;
3439 
3440 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3441 
3442 	vattr_null(&vattr);
3443 	vattr.va_flags = flags;
3444 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3445 
3446 	return (error);
3447 }
3448 
3449 /*
3450  * Change mode of a file given path name; this version follows links.
3451  */
3452 /* ARGSUSED */
3453 int
3454 sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
3455 {
3456 	/* {
3457 		syscallarg(const char *) path;
3458 		syscallarg(int) mode;
3459 	} */
3460 	return do_sys_chmodat(l, AT_FDCWD, SCARG(uap, path),
3461 			      SCARG(uap, mode), 0);
3462 }
3463 
3464 int
3465 do_sys_chmodat(struct lwp *l, int fdat, const char *path, int mode, int flags)
3466 {
3467 	int error;
3468 	struct vnode *vp;
3469 	namei_simple_flags_t ns_flag;
3470 
3471 	if (flags & AT_SYMLINK_NOFOLLOW)
3472 		ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3473 	else
3474 		ns_flag = NSM_FOLLOW_TRYEMULROOT;
3475 
3476 	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3477 	if (error != 0)
3478 		return error;
3479 
3480 	error = change_mode(vp, mode, l);
3481 
3482 	vrele(vp);
3483 
3484 	return (error);
3485 }
3486 
3487 /*
3488  * Change mode of a file given a file descriptor.
3489  */
3490 /* ARGSUSED */
3491 int
3492 sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval)
3493 {
3494 	/* {
3495 		syscallarg(int) fd;
3496 		syscallarg(int) mode;
3497 	} */
3498 	file_t *fp;
3499 	int error;
3500 
3501 	/* fd_getvnode() will use the descriptor for us */
3502 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3503 		return (error);
3504 	error = change_mode(fp->f_vnode, SCARG(uap, mode), l);
3505 	fd_putfile(SCARG(uap, fd));
3506 	return (error);
3507 }
3508 
3509 int
3510 sys_fchmodat(struct lwp *l, const struct sys_fchmodat_args *uap,
3511     register_t *retval)
3512 {
3513 	/* {
3514 		syscallarg(int) fd;
3515 		syscallarg(const char *) path;
3516 		syscallarg(int) mode;
3517 		syscallarg(int) flag;
3518 	} */
3519 
3520 	return do_sys_chmodat(l, SCARG(uap, fd), SCARG(uap, path),
3521 			      SCARG(uap, mode), SCARG(uap, flag));
3522 }
3523 
3524 /*
3525  * Change mode of a file given path name; this version does not follow links.
3526  */
3527 /* ARGSUSED */
3528 int
3529 sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval)
3530 {
3531 	/* {
3532 		syscallarg(const char *) path;
3533 		syscallarg(int) mode;
3534 	} */
3535 	int error;
3536 	struct vnode *vp;
3537 
3538 	error = namei_simple_user(SCARG(uap, path),
3539 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3540 	if (error != 0)
3541 		return (error);
3542 
3543 	error = change_mode(vp, SCARG(uap, mode), l);
3544 
3545 	vrele(vp);
3546 	return (error);
3547 }
3548 
3549 /*
3550  * Common routine to set mode given a vnode.
3551  */
3552 static int
3553 change_mode(struct vnode *vp, int mode, struct lwp *l)
3554 {
3555 	struct vattr vattr;
3556 	int error;
3557 
3558 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3559 	vattr_null(&vattr);
3560 	vattr.va_mode = mode & ALLPERMS;
3561 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3562 	VOP_UNLOCK(vp);
3563 	return (error);
3564 }
3565 
3566 /*
3567  * Set ownership given a path name; this version follows links.
3568  */
3569 /* ARGSUSED */
3570 int
3571 sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
3572 {
3573 	/* {
3574 		syscallarg(const char *) path;
3575 		syscallarg(uid_t) uid;
3576 		syscallarg(gid_t) gid;
3577 	} */
3578 	return do_sys_chownat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap,uid),
3579 			      SCARG(uap, gid), 0);
3580 }
3581 
3582 int
3583 do_sys_chownat(struct lwp *l, int fdat, const char *path, uid_t uid,
3584    gid_t gid, int flags)
3585 {
3586 	int error;
3587 	struct vnode *vp;
3588 	namei_simple_flags_t ns_flag;
3589 
3590 	if (flags & AT_SYMLINK_NOFOLLOW)
3591 		ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3592 	else
3593 		ns_flag = NSM_FOLLOW_TRYEMULROOT;
3594 
3595 	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3596 	if (error != 0)
3597 		return error;
3598 
3599 	error = change_owner(vp, uid, gid, l, 0);
3600 
3601 	vrele(vp);
3602 
3603 	return (error);
3604 }
3605 
3606 /*
3607  * Set ownership given a path name; this version follows links.
3608  * Provides POSIX semantics.
3609  */
3610 /* ARGSUSED */
3611 int
3612 sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval)
3613 {
3614 	/* {
3615 		syscallarg(const char *) path;
3616 		syscallarg(uid_t) uid;
3617 		syscallarg(gid_t) gid;
3618 	} */
3619 	int error;
3620 	struct vnode *vp;
3621 
3622 	error = namei_simple_user(SCARG(uap, path),
3623 				NSM_FOLLOW_TRYEMULROOT, &vp);
3624 	if (error != 0)
3625 		return (error);
3626 
3627 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3628 
3629 	vrele(vp);
3630 	return (error);
3631 }
3632 
3633 /*
3634  * Set ownership given a file descriptor.
3635  */
3636 /* ARGSUSED */
3637 int
3638 sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval)
3639 {
3640 	/* {
3641 		syscallarg(int) fd;
3642 		syscallarg(uid_t) uid;
3643 		syscallarg(gid_t) gid;
3644 	} */
3645 	int error;
3646 	file_t *fp;
3647 
3648 	/* fd_getvnode() will use the descriptor for us */
3649 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3650 		return (error);
3651 	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3652 	    l, 0);
3653 	fd_putfile(SCARG(uap, fd));
3654 	return (error);
3655 }
3656 
3657 int
3658 sys_fchownat(struct lwp *l, const struct sys_fchownat_args *uap,
3659     register_t *retval)
3660 {
3661 	/* {
3662 		syscallarg(int) fd;
3663 		syscallarg(const char *) path;
3664 		syscallarg(uid_t) owner;
3665 		syscallarg(gid_t) group;
3666 		syscallarg(int) flag;
3667 	} */
3668 
3669 	return do_sys_chownat(l, SCARG(uap, fd), SCARG(uap, path),
3670 			      SCARG(uap, owner), SCARG(uap, group),
3671 			      SCARG(uap, flag));
3672 }
3673 
3674 /*
3675  * Set ownership given a file descriptor, providing POSIX/XPG semantics.
3676  */
3677 /* ARGSUSED */
3678 int
3679 sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval)
3680 {
3681 	/* {
3682 		syscallarg(int) fd;
3683 		syscallarg(uid_t) uid;
3684 		syscallarg(gid_t) gid;
3685 	} */
3686 	int error;
3687 	file_t *fp;
3688 
3689 	/* fd_getvnode() will use the descriptor for us */
3690 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3691 		return (error);
3692 	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3693 	    l, 1);
3694 	fd_putfile(SCARG(uap, fd));
3695 	return (error);
3696 }
3697 
3698 /*
3699  * Set ownership given a path name; this version does not follow links.
3700  */
3701 /* ARGSUSED */
3702 int
3703 sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval)
3704 {
3705 	/* {
3706 		syscallarg(const char *) path;
3707 		syscallarg(uid_t) uid;
3708 		syscallarg(gid_t) gid;
3709 	} */
3710 	int error;
3711 	struct vnode *vp;
3712 
3713 	error = namei_simple_user(SCARG(uap, path),
3714 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3715 	if (error != 0)
3716 		return (error);
3717 
3718 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
3719 
3720 	vrele(vp);
3721 	return (error);
3722 }
3723 
3724 /*
3725  * Set ownership given a path name; this version does not follow links.
3726  * Provides POSIX/XPG semantics.
3727  */
3728 /* ARGSUSED */
3729 int
3730 sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval)
3731 {
3732 	/* {
3733 		syscallarg(const char *) path;
3734 		syscallarg(uid_t) uid;
3735 		syscallarg(gid_t) gid;
3736 	} */
3737 	int error;
3738 	struct vnode *vp;
3739 
3740 	error = namei_simple_user(SCARG(uap, path),
3741 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3742 	if (error != 0)
3743 		return (error);
3744 
3745 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3746 
3747 	vrele(vp);
3748 	return (error);
3749 }
3750 
3751 /*
3752  * Common routine to set ownership given a vnode.
3753  */
3754 static int
3755 change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
3756     int posix_semantics)
3757 {
3758 	struct vattr vattr;
3759 	mode_t newmode;
3760 	int error;
3761 
3762 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3763 	if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
3764 		goto out;
3765 
3766 #define CHANGED(x) ((int)(x) != -1)
3767 	newmode = vattr.va_mode;
3768 	if (posix_semantics) {
3769 		/*
3770 		 * POSIX/XPG semantics: if the caller is not the super-user,
3771 		 * clear set-user-id and set-group-id bits.  Both POSIX and
3772 		 * the XPG consider the behaviour for calls by the super-user
3773 		 * implementation-defined; we leave the set-user-id and set-
3774 		 * group-id settings intact in that case.
3775 		 */
3776 		if (vattr.va_mode & S_ISUID) {
3777 			if (kauth_authorize_vnode(l->l_cred,
3778 			    KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0)
3779 				newmode &= ~S_ISUID;
3780 		}
3781 		if (vattr.va_mode & S_ISGID) {
3782 			if (kauth_authorize_vnode(l->l_cred,
3783 			    KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0)
3784 				newmode &= ~S_ISGID;
3785 		}
3786 	} else {
3787 		/*
3788 		 * NetBSD semantics: when changing owner and/or group,
3789 		 * clear the respective bit(s).
3790 		 */
3791 		if (CHANGED(uid))
3792 			newmode &= ~S_ISUID;
3793 		if (CHANGED(gid))
3794 			newmode &= ~S_ISGID;
3795 	}
3796 	/* Update va_mode iff altered. */
3797 	if (vattr.va_mode == newmode)
3798 		newmode = VNOVAL;
3799 
3800 	vattr_null(&vattr);
3801 	vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
3802 	vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
3803 	vattr.va_mode = newmode;
3804 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3805 #undef CHANGED
3806 
3807 out:
3808 	VOP_UNLOCK(vp);
3809 	return (error);
3810 }
3811 
3812 /*
3813  * Set the access and modification times given a path name; this
3814  * version follows links.
3815  */
3816 /* ARGSUSED */
3817 int
3818 sys___utimes50(struct lwp *l, const struct sys___utimes50_args *uap,
3819     register_t *retval)
3820 {
3821 	/* {
3822 		syscallarg(const char *) path;
3823 		syscallarg(const struct timeval *) tptr;
3824 	} */
3825 
3826 	return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
3827 	    SCARG(uap, tptr), UIO_USERSPACE);
3828 }
3829 
3830 /*
3831  * Set the access and modification times given a file descriptor.
3832  */
3833 /* ARGSUSED */
3834 int
3835 sys___futimes50(struct lwp *l, const struct sys___futimes50_args *uap,
3836     register_t *retval)
3837 {
3838 	/* {
3839 		syscallarg(int) fd;
3840 		syscallarg(const struct timeval *) tptr;
3841 	} */
3842 	int error;
3843 	file_t *fp;
3844 
3845 	/* fd_getvnode() will use the descriptor for us */
3846 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3847 		return (error);
3848 	error = do_sys_utimes(l, fp->f_vnode, NULL, 0, SCARG(uap, tptr),
3849 	    UIO_USERSPACE);
3850 	fd_putfile(SCARG(uap, fd));
3851 	return (error);
3852 }
3853 
3854 int
3855 sys_futimens(struct lwp *l, const struct sys_futimens_args *uap,
3856     register_t *retval)
3857 {
3858 	/* {
3859 		syscallarg(int) fd;
3860 		syscallarg(const struct timespec *) tptr;
3861 	} */
3862 	int error;
3863 	file_t *fp;
3864 
3865 	/* fd_getvnode() will use the descriptor for us */
3866 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3867 		return (error);
3868 	error = do_sys_utimensat(l, AT_FDCWD, fp->f_vnode, NULL, 0,
3869 	    SCARG(uap, tptr), UIO_USERSPACE);
3870 	fd_putfile(SCARG(uap, fd));
3871 	return (error);
3872 }
3873 
3874 /*
3875  * Set the access and modification times given a path name; this
3876  * version does not follow links.
3877  */
3878 int
3879 sys___lutimes50(struct lwp *l, const struct sys___lutimes50_args *uap,
3880     register_t *retval)
3881 {
3882 	/* {
3883 		syscallarg(const char *) path;
3884 		syscallarg(const struct timeval *) tptr;
3885 	} */
3886 
3887 	return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
3888 	    SCARG(uap, tptr), UIO_USERSPACE);
3889 }
3890 
3891 int
3892 sys_utimensat(struct lwp *l, const struct sys_utimensat_args *uap,
3893     register_t *retval)
3894 {
3895 	/* {
3896 		syscallarg(int) fd;
3897 		syscallarg(const char *) path;
3898 		syscallarg(const struct timespec *) tptr;
3899 		syscallarg(int) flag;
3900 	} */
3901 	int follow;
3902 	const struct timespec *tptr;
3903 	int error;
3904 
3905 	tptr = SCARG(uap, tptr);
3906 	follow = (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3907 
3908 	error = do_sys_utimensat(l, SCARG(uap, fd), NULL,
3909 	    SCARG(uap, path), follow, tptr, UIO_USERSPACE);
3910 
3911 	return error;
3912 }
3913 
3914 /*
3915  * Common routine to set access and modification times given a vnode.
3916  */
3917 int
3918 do_sys_utimens(struct lwp *l, struct vnode *vp, const char *path, int flag,
3919     const struct timespec *tptr, enum uio_seg seg)
3920 {
3921 	return do_sys_utimensat(l, AT_FDCWD, vp, path, flag, tptr, seg);
3922 }
3923 
3924 int
3925 do_sys_utimensat(struct lwp *l, int fdat, struct vnode *vp,
3926     const char *path, int flag, const struct timespec *tptr, enum uio_seg seg)
3927 {
3928 	struct vattr vattr;
3929 	int error, dorele = 0;
3930 	namei_simple_flags_t sflags;
3931 	bool vanull, setbirthtime;
3932 	struct timespec ts[2];
3933 
3934 	KASSERT(l != NULL || fdat == AT_FDCWD);
3935 
3936 	/*
3937 	 * I have checked all callers and they pass either FOLLOW,
3938 	 * NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW
3939 	 * is 0. More to the point, they don't pass anything else.
3940 	 * Let's keep it that way at least until the namei interfaces
3941 	 * are fully sanitized.
3942 	 */
3943 	KASSERT(flag == NOFOLLOW || flag == FOLLOW);
3944 	sflags = (flag == FOLLOW) ?
3945 		NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT;
3946 
3947 	if (tptr == NULL) {
3948 		vanull = true;
3949 		nanotime(&ts[0]);
3950 		ts[1] = ts[0];
3951 	} else {
3952 		vanull = false;
3953 		if (seg != UIO_SYSSPACE) {
3954 			error = copyin(tptr, ts, sizeof (ts));
3955 			if (error != 0)
3956 				return error;
3957 		} else {
3958 			ts[0] = tptr[0];
3959 			ts[1] = tptr[1];
3960 		}
3961 	}
3962 
3963 	if (ts[0].tv_nsec == UTIME_NOW) {
3964 		nanotime(&ts[0]);
3965 		if (ts[1].tv_nsec == UTIME_NOW) {
3966 			vanull = true;
3967 			ts[1] = ts[0];
3968 		}
3969 	} else if (ts[1].tv_nsec == UTIME_NOW)
3970 		nanotime(&ts[1]);
3971 
3972 	if (vp == NULL) {
3973 		/* note: SEG describes TPTR, not PATH; PATH is always user */
3974 		error = fd_nameiat_simple_user(l, fdat, path, sflags, &vp);
3975 		if (error != 0)
3976 			return error;
3977 		dorele = 1;
3978 	}
3979 
3980 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3981 	setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 &&
3982 	    timespeccmp(&ts[1], &vattr.va_birthtime, <));
3983 	vattr_null(&vattr);
3984 
3985 	if (ts[0].tv_nsec != UTIME_OMIT)
3986 		vattr.va_atime = ts[0];
3987 
3988 	if (ts[1].tv_nsec != UTIME_OMIT) {
3989 		vattr.va_mtime = ts[1];
3990 		if (setbirthtime)
3991 			vattr.va_birthtime = ts[1];
3992 	}
3993 
3994 	if (vanull)
3995 		vattr.va_vaflags |= VA_UTIMES_NULL;
3996 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3997 	VOP_UNLOCK(vp);
3998 
3999 	if (dorele != 0)
4000 		vrele(vp);
4001 
4002 	return error;
4003 }
4004 
4005 int
4006 do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
4007     const struct timeval *tptr, enum uio_seg seg)
4008 {
4009 	struct timespec ts[2];
4010 	struct timespec *tsptr = NULL;
4011 	int error;
4012 
4013 	if (tptr != NULL) {
4014 		struct timeval tv[2];
4015 
4016 		if (seg != UIO_SYSSPACE) {
4017 			error = copyin(tptr, tv, sizeof(tv));
4018 			if (error != 0)
4019 				return error;
4020 			tptr = tv;
4021 		}
4022 
4023 		if ((tptr[0].tv_usec == UTIME_NOW) ||
4024 		    (tptr[0].tv_usec == UTIME_OMIT))
4025 			ts[0].tv_nsec = tptr[0].tv_usec;
4026 		else {
4027 			if (tptr[0].tv_usec < 0 || tptr[0].tv_usec >= 1000000)
4028 				return EINVAL;
4029 
4030 			TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]);
4031 		}
4032 
4033 		if ((tptr[1].tv_usec == UTIME_NOW) ||
4034 		    (tptr[1].tv_usec == UTIME_OMIT))
4035 			ts[1].tv_nsec = tptr[1].tv_usec;
4036 		else {
4037 			if (tptr[1].tv_usec < 0 || tptr[1].tv_usec >= 1000000)
4038 				return EINVAL;
4039 
4040 			TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]);
4041 		}
4042 
4043 		tsptr = &ts[0];
4044 	}
4045 
4046 	return do_sys_utimens(l, vp, path, flag, tsptr, UIO_SYSSPACE);
4047 }
4048 
4049 /*
4050  * Truncate a file given its path name.
4051  */
4052 /* ARGSUSED */
4053 int
4054 sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval)
4055 {
4056 	/* {
4057 		syscallarg(const char *) path;
4058 		syscallarg(int) pad;
4059 		syscallarg(off_t) length;
4060 	} */
4061 	struct vnode *vp;
4062 	struct vattr vattr;
4063 	int error;
4064 
4065 	if (SCARG(uap, length) < 0)
4066 		return EINVAL;
4067 
4068 	error = namei_simple_user(SCARG(uap, path),
4069 				NSM_FOLLOW_TRYEMULROOT, &vp);
4070 	if (error != 0)
4071 		return (error);
4072 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4073 	if (vp->v_type == VDIR)
4074 		error = EISDIR;
4075 	else if ((error = vn_writechk(vp)) == 0 &&
4076 	    (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
4077 		vattr_null(&vattr);
4078 		vattr.va_size = SCARG(uap, length);
4079 		error = VOP_SETATTR(vp, &vattr, l->l_cred);
4080 	}
4081 	vput(vp);
4082 	return (error);
4083 }
4084 
4085 /*
4086  * Truncate a file given a file descriptor.
4087  */
4088 /* ARGSUSED */
4089 int
4090 sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval)
4091 {
4092 	/* {
4093 		syscallarg(int) fd;
4094 		syscallarg(int) pad;
4095 		syscallarg(off_t) length;
4096 	} */
4097 	struct vattr vattr;
4098 	struct vnode *vp;
4099 	file_t *fp;
4100 	int error;
4101 
4102 	if (SCARG(uap, length) < 0)
4103 		return EINVAL;
4104 
4105 	/* fd_getvnode() will use the descriptor for us */
4106 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4107 		return (error);
4108 	if ((fp->f_flag & FWRITE) == 0) {
4109 		error = EINVAL;
4110 		goto out;
4111 	}
4112 	vp = fp->f_vnode;
4113 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4114 	if (vp->v_type == VDIR)
4115 		error = EISDIR;
4116 	else if ((error = vn_writechk(vp)) == 0) {
4117 		vattr_null(&vattr);
4118 		vattr.va_size = SCARG(uap, length);
4119 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
4120 	}
4121 	VOP_UNLOCK(vp);
4122  out:
4123 	fd_putfile(SCARG(uap, fd));
4124 	return (error);
4125 }
4126 
4127 /*
4128  * Sync an open file.
4129  */
4130 /* ARGSUSED */
4131 int
4132 sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
4133 {
4134 	/* {
4135 		syscallarg(int) fd;
4136 	} */
4137 	struct vnode *vp;
4138 	file_t *fp;
4139 	int error;
4140 
4141 	/* fd_getvnode() will use the descriptor for us */
4142 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4143 		return (error);
4144 	vp = fp->f_vnode;
4145 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4146 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
4147 	VOP_UNLOCK(vp);
4148 	fd_putfile(SCARG(uap, fd));
4149 	return (error);
4150 }
4151 
4152 /*
4153  * Sync a range of file data.  API modeled after that found in AIX.
4154  *
4155  * FDATASYNC indicates that we need only save enough metadata to be able
4156  * to re-read the written data.
4157  */
4158 /* ARGSUSED */
4159 int
4160 sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval)
4161 {
4162 	/* {
4163 		syscallarg(int) fd;
4164 		syscallarg(int) flags;
4165 		syscallarg(off_t) start;
4166 		syscallarg(off_t) length;
4167 	} */
4168 	struct vnode *vp;
4169 	file_t *fp;
4170 	int flags, nflags;
4171 	off_t s, e, len;
4172 	int error;
4173 
4174 	/* fd_getvnode() will use the descriptor for us */
4175 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4176 		return (error);
4177 
4178 	if ((fp->f_flag & FWRITE) == 0) {
4179 		error = EBADF;
4180 		goto out;
4181 	}
4182 
4183 	flags = SCARG(uap, flags);
4184 	if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
4185 	    ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
4186 		error = EINVAL;
4187 		goto out;
4188 	}
4189 	/* Now set up the flags for value(s) to pass to VOP_FSYNC() */
4190 	if (flags & FDATASYNC)
4191 		nflags = FSYNC_DATAONLY | FSYNC_WAIT;
4192 	else
4193 		nflags = FSYNC_WAIT;
4194 	if (flags & FDISKSYNC)
4195 		nflags |= FSYNC_CACHE;
4196 
4197 	len = SCARG(uap, length);
4198 	/* If length == 0, we do the whole file, and s = e = 0 will do that */
4199 	if (len) {
4200 		s = SCARG(uap, start);
4201 		e = s + len;
4202 		if (e < s) {
4203 			error = EINVAL;
4204 			goto out;
4205 		}
4206 	} else {
4207 		e = 0;
4208 		s = 0;
4209 	}
4210 
4211 	vp = fp->f_vnode;
4212 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4213 	error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
4214 	VOP_UNLOCK(vp);
4215 out:
4216 	fd_putfile(SCARG(uap, fd));
4217 	return (error);
4218 }
4219 
4220 /*
4221  * Sync the data of an open file.
4222  */
4223 /* ARGSUSED */
4224 int
4225 sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval)
4226 {
4227 	/* {
4228 		syscallarg(int) fd;
4229 	} */
4230 	struct vnode *vp;
4231 	file_t *fp;
4232 	int error;
4233 
4234 	/* fd_getvnode() will use the descriptor for us */
4235 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4236 		return (error);
4237 	vp = fp->f_vnode;
4238 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4239 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
4240 	VOP_UNLOCK(vp);
4241 	fd_putfile(SCARG(uap, fd));
4242 	return (error);
4243 }
4244 
4245 /*
4246  * Rename files, (standard) BSD semantics frontend.
4247  */
4248 /* ARGSUSED */
4249 int
4250 sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval)
4251 {
4252 	/* {
4253 		syscallarg(const char *) from;
4254 		syscallarg(const char *) to;
4255 	} */
4256 
4257 	return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4258 	    SCARG(uap, to), UIO_USERSPACE, 0));
4259 }
4260 
4261 int
4262 sys_renameat(struct lwp *l, const struct sys_renameat_args *uap,
4263     register_t *retval)
4264 {
4265 	/* {
4266 		syscallarg(int) fromfd;
4267 		syscallarg(const char *) from;
4268 		syscallarg(int) tofd;
4269 		syscallarg(const char *) to;
4270 	} */
4271 
4272 	return (do_sys_renameat(l, SCARG(uap, fromfd), SCARG(uap, from),
4273 	    SCARG(uap, tofd), SCARG(uap, to), UIO_USERSPACE, 0));
4274 }
4275 
4276 /*
4277  * Rename files, POSIX semantics frontend.
4278  */
4279 /* ARGSUSED */
4280 int
4281 sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval)
4282 {
4283 	/* {
4284 		syscallarg(const char *) from;
4285 		syscallarg(const char *) to;
4286 	} */
4287 
4288 	return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4289 	    SCARG(uap, to), UIO_USERSPACE, 1));
4290 }
4291 
4292 /*
4293  * Rename files.  Source and destination must either both be directories,
4294  * or both not be directories.  If target is a directory, it must be empty.
4295  * If `from' and `to' refer to the same object, the value of the `retain'
4296  * argument is used to determine whether `from' will be
4297  *
4298  * (retain == 0)	deleted unless `from' and `to' refer to the same
4299  *			object in the file system's name space (BSD).
4300  * (retain == 1)	always retained (POSIX).
4301  *
4302  * XXX Synchronize with nfsrv_rename in nfs_serv.c.
4303  */
4304 int
4305 do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
4306 {
4307 	return do_sys_renameat(NULL, AT_FDCWD, from, AT_FDCWD, to, seg, retain);
4308 }
4309 
4310 static int
4311 do_sys_renameat(struct lwp *l, int fromfd, const char *from, int tofd,
4312     const char *to, enum uio_seg seg, int retain)
4313 {
4314 	struct pathbuf *fpb, *tpb;
4315 	struct nameidata fnd, tnd;
4316 	struct vnode *fdvp, *fvp;
4317 	struct vnode *tdvp, *tvp;
4318 	struct mount *mp, *tmp;
4319 	int error;
4320 
4321 	KASSERT(l != NULL || (fromfd == AT_FDCWD && tofd == AT_FDCWD));
4322 
4323 	error = pathbuf_maybe_copyin(from, seg, &fpb);
4324 	if (error)
4325 		goto out0;
4326 	KASSERT(fpb != NULL);
4327 
4328 	error = pathbuf_maybe_copyin(to, seg, &tpb);
4329 	if (error)
4330 		goto out1;
4331 	KASSERT(tpb != NULL);
4332 
4333 	/*
4334 	 * Lookup from.
4335 	 *
4336 	 * XXX LOCKPARENT is wrong because we don't actually want it
4337 	 * locked yet, but (a) namei is insane, and (b) VOP_RENAME is
4338 	 * insane, so for the time being we need to leave it like this.
4339 	 */
4340 	NDINIT(&fnd, DELETE, (LOCKPARENT | TRYEMULROOT), fpb);
4341 	if ((error = fd_nameiat(l, fromfd, &fnd)) != 0)
4342 		goto out2;
4343 
4344 	/*
4345 	 * Pull out the important results of the lookup, fdvp and fvp.
4346 	 * Of course, fvp is bogus because we're about to unlock fdvp.
4347 	 */
4348 	fdvp = fnd.ni_dvp;
4349 	fvp = fnd.ni_vp;
4350 	mp = fdvp->v_mount;
4351 	KASSERT(fdvp != NULL);
4352 	KASSERT(fvp != NULL);
4353 	KASSERT((fdvp == fvp) || (VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE));
4354 	/*
4355 	 * Bracket the operation with fstrans_start()/fstrans_done().
4356 	 *
4357 	 * Inside the bracket this file system cannot be unmounted so
4358 	 * a vnode on this file system cannot change its v_mount.
4359 	 * A vnode on another file system may still change to dead mount.
4360 	 */
4361 	fstrans_start(mp);
4362 
4363 	/*
4364 	 * Make sure neither fdvp nor fvp is locked.
4365 	 */
4366 	if (fdvp != fvp)
4367 		VOP_UNLOCK(fdvp);
4368 	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4369 	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4370 
4371 	/*
4372 	 * Reject renaming `.' and `..'.  Can't do this until after
4373 	 * namei because we need namei's parsing to find the final
4374 	 * component name.  (namei should just leave us with the final
4375 	 * component name and not look it up itself, but anyway...)
4376 	 *
4377 	 * This was here before because we used to relookup from
4378 	 * instead of to and relookup requires the caller to check
4379 	 * this, but now file systems may depend on this check, so we
4380 	 * must retain it until the file systems are all rototilled.
4381 	 */
4382 	if (((fnd.ni_cnd.cn_namelen == 1) &&
4383 		(fnd.ni_cnd.cn_nameptr[0] == '.')) ||
4384 	    ((fnd.ni_cnd.cn_namelen == 2) &&
4385 		(fnd.ni_cnd.cn_nameptr[0] == '.') &&
4386 		(fnd.ni_cnd.cn_nameptr[1] == '.'))) {
4387 		error = EINVAL;	/* XXX EISDIR?  */
4388 		goto abort0;
4389 	}
4390 
4391 	/*
4392 	 * Lookup to.
4393 	 *
4394 	 * XXX LOCKPARENT is wrong, but...insanity, &c.  Also, using
4395 	 * fvp here to decide whether to add CREATEDIR is a load of
4396 	 * bollocks because fvp might be the wrong node by now, since
4397 	 * fdvp is unlocked.
4398 	 *
4399 	 * XXX Why not pass CREATEDIR always?
4400 	 */
4401 	NDINIT(&tnd, RENAME,
4402 	    (LOCKPARENT | NOCACHE | TRYEMULROOT |
4403 		((fvp->v_type == VDIR)? CREATEDIR : 0)),
4404 	    tpb);
4405 	if ((error = fd_nameiat(l, tofd, &tnd)) != 0)
4406 		goto abort0;
4407 
4408 	/*
4409 	 * Pull out the important results of the lookup, tdvp and tvp.
4410 	 * Of course, tvp is bogus because we're about to unlock tdvp.
4411 	 */
4412 	tdvp = tnd.ni_dvp;
4413 	tvp = tnd.ni_vp;
4414 	KASSERT(tdvp != NULL);
4415 	KASSERT((tdvp == tvp) || (VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE));
4416 
4417 	if (fvp->v_type == VDIR)
4418 		tnd.ni_cnd.cn_flags |= WILLBEDIR;
4419 	/*
4420 	 * Make sure neither tdvp nor tvp is locked.
4421 	 */
4422 	if (tdvp != tvp)
4423 		VOP_UNLOCK(tdvp);
4424 	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
4425 	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
4426 
4427 	/*
4428 	 * Reject renaming onto `.' or `..'.  relookup is unhappy with
4429 	 * these, which is why we must do this here.  Once upon a time
4430 	 * we relooked up from instead of to, and consequently didn't
4431 	 * need this check, but now that we relookup to instead of
4432 	 * from, we need this; and we shall need it forever forward
4433 	 * until the VOP_RENAME protocol changes, because file systems
4434 	 * will no doubt begin to depend on this check.
4435 	 */
4436 	if ((tnd.ni_cnd.cn_namelen == 1) && (tnd.ni_cnd.cn_nameptr[0] == '.')) {
4437 		error = EISDIR;
4438 		goto abort1;
4439 	}
4440 	if ((tnd.ni_cnd.cn_namelen == 2) &&
4441 	    (tnd.ni_cnd.cn_nameptr[0] == '.') &&
4442 	    (tnd.ni_cnd.cn_nameptr[1] == '.')) {
4443 		error = EINVAL;
4444 		goto abort1;
4445 	}
4446 
4447 	/*
4448 	 * Make sure the mount points match.  Although we don't hold
4449 	 * any vnode locks, the v_mount on fdvp file system are stable.
4450 	 *
4451 	 * Unmounting another file system at an inopportune moment may
4452 	 * cause tdvp to disappear and change its v_mount to dead.
4453 	 *
4454 	 * So in either case different v_mount means cross-device rename.
4455 	 */
4456 	KASSERT(mp != NULL);
4457 	tmp = tdvp->v_mount;
4458 
4459 	if (mp != tmp) {
4460 		error = EXDEV;
4461 		goto abort1;
4462 	}
4463 
4464 	/*
4465 	 * Take the vfs rename lock to avoid cross-directory screw cases.
4466 	 * Nothing is locked currently, so taking this lock is safe.
4467 	 */
4468 	error = VFS_RENAMELOCK_ENTER(mp);
4469 	if (error)
4470 		goto abort1;
4471 
4472 	/*
4473 	 * Now fdvp, fvp, tdvp, and (if nonnull) tvp are referenced,
4474 	 * and nothing is locked except for the vfs rename lock.
4475 	 *
4476 	 * The next step is a little rain dance to conform to the
4477 	 * insane lock protocol, even though it does nothing to ward
4478 	 * off race conditions.
4479 	 *
4480 	 * We need tdvp and tvp to be locked.  However, because we have
4481 	 * unlocked tdvp in order to hold no locks while we take the
4482 	 * vfs rename lock, tvp may be wrong here, and we can't safely
4483 	 * lock it even if the sensible file systems will just unlock
4484 	 * it straight away.  Consequently, we must lock tdvp and then
4485 	 * relookup tvp to get it locked.
4486 	 *
4487 	 * Finally, because the VOP_RENAME protocol is brain-damaged
4488 	 * and various file systems insanely depend on the semantics of
4489 	 * this brain damage, the lookup of to must be the last lookup
4490 	 * before VOP_RENAME.
4491 	 */
4492 	vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
4493 	error = relookup(tdvp, &tnd.ni_vp, &tnd.ni_cnd, 0);
4494 	if (error)
4495 		goto abort2;
4496 
4497 	/*
4498 	 * Drop the old tvp and pick up the new one -- which might be
4499 	 * the same, but that doesn't matter to us.  After this, tdvp
4500 	 * and tvp should both be locked.
4501 	 */
4502 	if (tvp != NULL)
4503 		vrele(tvp);
4504 	tvp = tnd.ni_vp;
4505 	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4506 	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
4507 
4508 	/*
4509 	 * The old do_sys_rename had various consistency checks here
4510 	 * involving fvp and tvp.  fvp is bogus already here, and tvp
4511 	 * will become bogus soon in any sensible file system, so the
4512 	 * only purpose in putting these checks here is to give lip
4513 	 * service to these screw cases and to acknowledge that they
4514 	 * exist, not actually to handle them, but here you go
4515 	 * anyway...
4516 	 */
4517 
4518 	/*
4519 	 * Acknowledge that directories and non-directories aren't
4520 	 * suposed to mix.
4521 	 */
4522 	if (tvp != NULL) {
4523 		if ((fvp->v_type == VDIR) && (tvp->v_type != VDIR)) {
4524 			error = ENOTDIR;
4525 			goto abort3;
4526 		} else if ((fvp->v_type != VDIR) && (tvp->v_type == VDIR)) {
4527 			error = EISDIR;
4528 			goto abort3;
4529 		}
4530 	}
4531 
4532 	/*
4533 	 * Acknowledge some random screw case, among the dozens that
4534 	 * might arise.
4535 	 */
4536 	if (fvp == tdvp) {
4537 		error = EINVAL;
4538 		goto abort3;
4539 	}
4540 
4541 	/*
4542 	 * Acknowledge that POSIX has a wacky screw case.
4543 	 *
4544 	 * XXX Eventually the retain flag needs to be passed on to
4545 	 * VOP_RENAME.
4546 	 */
4547 	if (fvp == tvp) {
4548 		if (retain) {
4549 			error = 0;
4550 			goto abort3;
4551 		} else if ((fdvp == tdvp) &&
4552 		    (fnd.ni_cnd.cn_namelen == tnd.ni_cnd.cn_namelen) &&
4553 		    (0 == memcmp(fnd.ni_cnd.cn_nameptr, tnd.ni_cnd.cn_nameptr,
4554 			fnd.ni_cnd.cn_namelen))) {
4555 			error = 0;
4556 			goto abort3;
4557 		}
4558 	}
4559 
4560 	/*
4561 	 * Make sure veriexec can screw us up.  (But a race can screw
4562 	 * up veriexec, of course -- remember, fvp and (soon) tvp are
4563 	 * bogus.)
4564 	 */
4565 #if NVERIEXEC > 0
4566 	{
4567 		char *f1, *f2;
4568 		size_t f1_len;
4569 		size_t f2_len;
4570 
4571 		f1_len = fnd.ni_cnd.cn_namelen + 1;
4572 		f1 = kmem_alloc(f1_len, KM_SLEEP);
4573 		strlcpy(f1, fnd.ni_cnd.cn_nameptr, f1_len);
4574 
4575 		f2_len = tnd.ni_cnd.cn_namelen + 1;
4576 		f2 = kmem_alloc(f2_len, KM_SLEEP);
4577 		strlcpy(f2, tnd.ni_cnd.cn_nameptr, f2_len);
4578 
4579 		error = veriexec_renamechk(curlwp, fvp, f1, tvp, f2);
4580 
4581 		kmem_free(f1, f1_len);
4582 		kmem_free(f2, f2_len);
4583 
4584 		if (error)
4585 			goto abort3;
4586 	}
4587 #endif /* NVERIEXEC > 0 */
4588 
4589 	/*
4590 	 * All ready.  Incant the rename vop.
4591 	 */
4592 	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4593 	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4594 	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4595 	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
4596 	error = VOP_RENAME(fdvp, fvp, &fnd.ni_cnd, tdvp, tvp, &tnd.ni_cnd);
4597 
4598 	/*
4599 	 * VOP_RENAME releases fdvp, fvp, tdvp, and tvp, and unlocks
4600 	 * tdvp and tvp.  But we can't assert any of that.
4601 	 */
4602 	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4603 	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4604 	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
4605 	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
4606 
4607 	/*
4608 	 * So all we have left to do is to drop the rename lock and
4609 	 * destroy the pathbufs.
4610 	 */
4611 	VFS_RENAMELOCK_EXIT(mp);
4612 	fstrans_done(mp);
4613 	goto out2;
4614 
4615 abort3:	if ((tvp != NULL) && (tvp != tdvp))
4616 		VOP_UNLOCK(tvp);
4617 abort2:	VOP_UNLOCK(tdvp);
4618 	VFS_RENAMELOCK_EXIT(mp);
4619 abort1:	VOP_ABORTOP(tdvp, &tnd.ni_cnd);
4620 	vrele(tdvp);
4621 	if (tvp != NULL)
4622 		vrele(tvp);
4623 abort0:	VOP_ABORTOP(fdvp, &fnd.ni_cnd);
4624 	vrele(fdvp);
4625 	vrele(fvp);
4626 	fstrans_done(mp);
4627 out2:	pathbuf_destroy(tpb);
4628 out1:	pathbuf_destroy(fpb);
4629 out0:	return error;
4630 }
4631 
4632 /*
4633  * Make a directory file.
4634  */
4635 /* ARGSUSED */
4636 int
4637 sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
4638 {
4639 	/* {
4640 		syscallarg(const char *) path;
4641 		syscallarg(int) mode;
4642 	} */
4643 
4644 	return do_sys_mkdirat(l, AT_FDCWD, SCARG(uap, path),
4645 	    SCARG(uap, mode), UIO_USERSPACE);
4646 }
4647 
4648 int
4649 sys_mkdirat(struct lwp *l, const struct sys_mkdirat_args *uap,
4650     register_t *retval)
4651 {
4652 	/* {
4653 		syscallarg(int) fd;
4654 		syscallarg(const char *) path;
4655 		syscallarg(int) mode;
4656 	} */
4657 
4658 	return do_sys_mkdirat(l, SCARG(uap, fd), SCARG(uap, path),
4659 	    SCARG(uap, mode), UIO_USERSPACE);
4660 }
4661 
4662 
4663 int
4664 do_sys_mkdir(const char *path, mode_t mode, enum uio_seg seg)
4665 {
4666 	return do_sys_mkdirat(NULL, AT_FDCWD, path, mode, seg);
4667 }
4668 
4669 static int
4670 do_sys_mkdirat(struct lwp *l, int fdat, const char *path, mode_t mode,
4671     enum uio_seg seg)
4672 {
4673 	struct proc *p = curlwp->l_proc;
4674 	struct vnode *vp;
4675 	struct vattr vattr;
4676 	int error;
4677 	struct pathbuf *pb;
4678 	struct nameidata nd;
4679 
4680 	KASSERT(l != NULL || fdat == AT_FDCWD);
4681 
4682 	/* XXX bollocks, should pass in a pathbuf */
4683 	error = pathbuf_maybe_copyin(path, seg, &pb);
4684 	if (error) {
4685 		return error;
4686 	}
4687 
4688 	NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT, pb);
4689 
4690 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
4691 		pathbuf_destroy(pb);
4692 		return (error);
4693 	}
4694 	vp = nd.ni_vp;
4695 	if (vp != NULL) {
4696 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
4697 		if (nd.ni_dvp == vp)
4698 			vrele(nd.ni_dvp);
4699 		else
4700 			vput(nd.ni_dvp);
4701 		vrele(vp);
4702 		pathbuf_destroy(pb);
4703 		return (EEXIST);
4704 	}
4705 	vattr_null(&vattr);
4706 	vattr.va_type = VDIR;
4707 	/* We will read cwdi->cwdi_cmask unlocked. */
4708 	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
4709 	nd.ni_cnd.cn_flags |= WILLBEDIR;
4710 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
4711 	if (!error)
4712 		vrele(nd.ni_vp);
4713 	vput(nd.ni_dvp);
4714 	pathbuf_destroy(pb);
4715 	return (error);
4716 }
4717 
4718 /*
4719  * Remove a directory file.
4720  */
4721 /* ARGSUSED */
4722 int
4723 sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
4724 {
4725 	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path),
4726 	    AT_REMOVEDIR, UIO_USERSPACE);
4727 }
4728 
4729 /*
4730  * Read a block of directory entries in a file system independent format.
4731  */
4732 int
4733 sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval)
4734 {
4735 	/* {
4736 		syscallarg(int) fd;
4737 		syscallarg(char *) buf;
4738 		syscallarg(size_t) count;
4739 	} */
4740 	file_t *fp;
4741 	int error, done;
4742 
4743 	/* fd_getvnode() will use the descriptor for us */
4744 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4745 		return (error);
4746 	if ((fp->f_flag & FREAD) == 0) {
4747 		error = EBADF;
4748 		goto out;
4749 	}
4750 	error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
4751 			SCARG(uap, count), &done, l, 0, 0);
4752 	ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
4753 	*retval = done;
4754  out:
4755 	fd_putfile(SCARG(uap, fd));
4756 	return (error);
4757 }
4758 
4759 /*
4760  * Set the mode mask for creation of filesystem nodes.
4761  */
4762 int
4763 sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
4764 {
4765 	/* {
4766 		syscallarg(mode_t) newmask;
4767 	} */
4768 
4769 	/*
4770 	 * cwdi->cwdi_cmask will be read unlocked elsewhere, and no kind of
4771 	 * serialization with those reads is required.  It's important to
4772 	 * return a coherent answer for the caller of umask() though, and
4773 	 * the atomic operation accomplishes that.
4774 	 */
4775 	*retval = atomic_swap_uint(&curproc->p_cwdi->cwdi_cmask,
4776 	    SCARG(uap, newmask) & ALLPERMS);
4777 
4778 	return (0);
4779 }
4780 
4781 int
4782 dorevoke(struct vnode *vp, kauth_cred_t cred)
4783 {
4784 	struct vattr vattr;
4785 	int error, fs_decision;
4786 
4787 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4788 	error = VOP_GETATTR(vp, &vattr, cred);
4789 	VOP_UNLOCK(vp);
4790 	if (error != 0)
4791 		return error;
4792 	fs_decision = (kauth_cred_geteuid(cred) == vattr.va_uid) ? 0 : EPERM;
4793 	error = kauth_authorize_vnode(cred, KAUTH_VNODE_REVOKE, vp, NULL,
4794 	    fs_decision);
4795 	if (!error)
4796 		VOP_REVOKE(vp, REVOKEALL);
4797 	return (error);
4798 }
4799 
4800 /*
4801  * Void all references to file by ripping underlying filesystem
4802  * away from vnode.
4803  */
4804 /* ARGSUSED */
4805 int
4806 sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval)
4807 {
4808 	/* {
4809 		syscallarg(const char *) path;
4810 	} */
4811 	struct vnode *vp;
4812 	int error;
4813 
4814 	error = namei_simple_user(SCARG(uap, path),
4815 				NSM_FOLLOW_TRYEMULROOT, &vp);
4816 	if (error != 0)
4817 		return (error);
4818 	error = dorevoke(vp, l->l_cred);
4819 	vrele(vp);
4820 	return (error);
4821 }
4822 
4823 /*
4824  * Allocate backing store for a file, filling a hole without having to
4825  * explicitly write anything out.
4826  */
4827 /* ARGSUSED */
4828 int
4829 sys_posix_fallocate(struct lwp *l, const struct sys_posix_fallocate_args *uap,
4830 		register_t *retval)
4831 {
4832 	/* {
4833 		syscallarg(int) fd;
4834 		syscallarg(off_t) pos;
4835 		syscallarg(off_t) len;
4836 	} */
4837 	int fd;
4838 	off_t pos, len;
4839 	struct file *fp;
4840 	struct vnode *vp;
4841 	int error;
4842 
4843 	fd = SCARG(uap, fd);
4844 	pos = SCARG(uap, pos);
4845 	len = SCARG(uap, len);
4846 
4847 	if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
4848 		*retval = EINVAL;
4849 		return 0;
4850 	}
4851 
4852 	error = fd_getvnode(fd, &fp);
4853 	if (error) {
4854 		*retval = error;
4855 		return 0;
4856 	}
4857 	if ((fp->f_flag & FWRITE) == 0) {
4858 		error = EBADF;
4859 		goto fail;
4860 	}
4861 	vp = fp->f_vnode;
4862 
4863 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4864 	if (vp->v_type == VDIR) {
4865 		error = EISDIR;
4866 	} else {
4867 		error = VOP_FALLOCATE(vp, pos, len);
4868 	}
4869 	VOP_UNLOCK(vp);
4870 
4871 fail:
4872 	fd_putfile(fd);
4873 	*retval = error;
4874 	return 0;
4875 }
4876 
4877 /*
4878  * Deallocate backing store for a file, creating a hole. Also used for
4879  * invoking TRIM on disks.
4880  */
4881 /* ARGSUSED */
4882 int
4883 sys_fdiscard(struct lwp *l, const struct sys_fdiscard_args *uap,
4884 		register_t *retval)
4885 {
4886 	/* {
4887 		syscallarg(int) fd;
4888 		syscallarg(off_t) pos;
4889 		syscallarg(off_t) len;
4890 	} */
4891 	int fd;
4892 	off_t pos, len;
4893 	struct file *fp;
4894 	struct vnode *vp;
4895 	int error;
4896 
4897 	fd = SCARG(uap, fd);
4898 	pos = SCARG(uap, pos);
4899 	len = SCARG(uap, len);
4900 
4901 	if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
4902 		return EINVAL;
4903 	}
4904 
4905 	error = fd_getvnode(fd, &fp);
4906 	if (error) {
4907 		return error;
4908 	}
4909 	if ((fp->f_flag & FWRITE) == 0) {
4910 		error = EBADF;
4911 		goto fail;
4912 	}
4913 	vp = fp->f_vnode;
4914 
4915 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4916 	if (vp->v_type == VDIR) {
4917 		error = EISDIR;
4918 	} else {
4919 		error = VOP_FDISCARD(vp, pos, len);
4920 	}
4921 	VOP_UNLOCK(vp);
4922 
4923 fail:
4924 	fd_putfile(fd);
4925 	return error;
4926 }
4927