xref: /netbsd-src/sys/kern/vfs_syscalls.c (revision e6c7e151de239c49d2e38720a061ed9d1fa99309)
1 /*	$NetBSD: vfs_syscalls.c,v 1.544 2020/03/25 18:08:34 gdt Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1989, 1993
34  *	The Regents of the University of California.  All rights reserved.
35  * (c) UNIX System Laboratories, Inc.
36  * All or some portions of this file are derived from material licensed
37  * to the University of California by American Telephone and Telegraph
38  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39  * the permission of UNIX System Laboratories, Inc.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.42 (Berkeley) 7/31/95
66  */
67 
68 /*
69  * Virtual File System System Calls
70  */
71 
72 #include <sys/cdefs.h>
73 __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.544 2020/03/25 18:08:34 gdt Exp $");
74 
75 #ifdef _KERNEL_OPT
76 #include "opt_fileassoc.h"
77 #include "veriexec.h"
78 #endif
79 
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/namei.h>
83 #include <sys/filedesc.h>
84 #include <sys/kernel.h>
85 #include <sys/file.h>
86 #include <sys/fcntl.h>
87 #include <sys/stat.h>
88 #include <sys/vnode.h>
89 #include <sys/mount.h>
90 #include <sys/fstrans.h>
91 #include <sys/proc.h>
92 #include <sys/uio.h>
93 #include <sys/kmem.h>
94 #include <sys/dirent.h>
95 #include <sys/sysctl.h>
96 #include <sys/syscallargs.h>
97 #include <sys/vfs_syscalls.h>
98 #include <sys/quota.h>
99 #include <sys/quotactl.h>
100 #include <sys/ktrace.h>
101 #ifdef FILEASSOC
102 #include <sys/fileassoc.h>
103 #endif /* FILEASSOC */
104 #include <sys/extattr.h>
105 #include <sys/verified_exec.h>
106 #include <sys/kauth.h>
107 #include <sys/atomic.h>
108 #include <sys/module.h>
109 #include <sys/buf.h>
110 #include <sys/event.h>
111 #include <sys/compat_stub.h>
112 
113 #include <miscfs/genfs/genfs.h>
114 #include <miscfs/specfs/specdev.h>
115 
116 #include <nfs/rpcv2.h>
117 #include <nfs/nfsproto.h>
118 #include <nfs/nfs.h>
119 #include <nfs/nfs_var.h>
120 
121 /* XXX this shouldn't be here */
122 #ifndef OFF_T_MAX
123 #define OFF_T_MAX __type_max(off_t)
124 #endif
125 
126 static int change_flags(struct vnode *, u_long, struct lwp *);
127 static int change_mode(struct vnode *, int, struct lwp *);
128 static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
129 static int do_sys_openat(lwp_t *, int, const char *, int, int, int *);
130 static int do_sys_mkdirat(struct lwp *l, int, const char *, mode_t,
131     enum uio_seg);
132 static int do_sys_mkfifoat(struct lwp *, int, const char *, mode_t);
133 static int do_sys_symlinkat(struct lwp *, const char *, int, const char *,
134     enum uio_seg);
135 static int do_sys_renameat(struct lwp *l, int, const char *, int, const char *,
136     enum uio_seg, int);
137 static int do_sys_readlinkat(struct lwp *, int, const char *, char *,
138     size_t, register_t *);
139 static int do_sys_unlinkat(struct lwp *, int, const char *, int, enum uio_seg);
140 
141 static int fd_nameiat(struct lwp *, int, struct nameidata *);
142 static int fd_nameiat_simple_user(struct lwp *, int, const char *,
143     namei_simple_flags_t, struct vnode **);
144 
145 /*
146  * This table is used to maintain compatibility with 4.3BSD
147  * and NetBSD 0.9 mount syscalls - and possibly other systems.
148  * Note, the order is important!
149  *
150  * Do not modify this table. It should only contain filesystems
151  * supported by NetBSD 0.9 and 4.3BSD.
152  */
153 const char * const mountcompatnames[] = {
154 	NULL,		/* 0 = MOUNT_NONE */
155 	MOUNT_FFS,	/* 1 = MOUNT_UFS */
156 	MOUNT_NFS,	/* 2 */
157 	MOUNT_MFS,	/* 3 */
158 	MOUNT_MSDOS,	/* 4 */
159 	MOUNT_CD9660,	/* 5 = MOUNT_ISOFS */
160 	MOUNT_FDESC,	/* 6 */
161 	MOUNT_KERNFS,	/* 7 */
162 	NULL,		/* 8 = MOUNT_DEVFS */
163 	MOUNT_AFS,	/* 9 */
164 };
165 
166 const u_int nmountcompatnames = __arraycount(mountcompatnames);
167 
168 static int
169 fd_nameiat(struct lwp *l, int fdat, struct nameidata *ndp)
170 {
171 	file_t *dfp;
172 	int error;
173 
174 	if (fdat != AT_FDCWD) {
175 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
176 			goto out;
177 
178 		NDAT(ndp, dfp->f_vnode);
179 	}
180 
181 	error = namei(ndp);
182 
183 	if (fdat != AT_FDCWD)
184 		fd_putfile(fdat);
185 out:
186 	return error;
187 }
188 
189 static int
190 fd_nameiat_simple_user(struct lwp *l, int fdat, const char *path,
191     namei_simple_flags_t sflags, struct vnode **vp_ret)
192 {
193 	file_t *dfp;
194 	struct vnode *dvp;
195 	int error;
196 
197 	if (fdat != AT_FDCWD) {
198 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
199 			goto out;
200 
201 		dvp = dfp->f_vnode;
202 	} else {
203 		dvp = NULL;
204 	}
205 
206 	error = nameiat_simple_user(dvp, path, sflags, vp_ret);
207 
208 	if (fdat != AT_FDCWD)
209 		fd_putfile(fdat);
210 out:
211 	return error;
212 }
213 
214 static int
215 open_setfp(struct lwp *l, file_t *fp, struct vnode *vp, int indx, int flags)
216 {
217 	int error;
218 
219 	fp->f_flag = flags & FMASK;
220 	fp->f_type = DTYPE_VNODE;
221 	fp->f_ops = &vnops;
222 	fp->f_vnode = vp;
223 
224 	if (flags & (O_EXLOCK | O_SHLOCK)) {
225 		struct flock lf;
226 		int type;
227 
228 		lf.l_whence = SEEK_SET;
229 		lf.l_start = 0;
230 		lf.l_len = 0;
231 		if (flags & O_EXLOCK)
232 			lf.l_type = F_WRLCK;
233 		else
234 			lf.l_type = F_RDLCK;
235 		type = F_FLOCK;
236 		if ((flags & FNONBLOCK) == 0)
237 			type |= F_WAIT;
238 		VOP_UNLOCK(vp);
239 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
240 		if (error) {
241 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
242 			fd_abort(l->l_proc, fp, indx);
243 			return error;
244 		}
245 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
246 		atomic_or_uint(&fp->f_flag, FHASLOCK);
247 	}
248 	if (flags & O_CLOEXEC)
249 		fd_set_exclose(l, indx, true);
250 	return 0;
251 }
252 
253 static int
254 mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
255     void *data, size_t *data_len)
256 {
257 	struct mount *mp;
258 	int error = 0, saved_flags;
259 
260 	mp = vp->v_mount;
261 	saved_flags = mp->mnt_flag;
262 
263 	/* We can operate only on VV_ROOT nodes. */
264 	if ((vp->v_vflag & VV_ROOT) == 0) {
265 		error = EINVAL;
266 		goto out;
267 	}
268 
269 	/*
270 	 * We only allow the filesystem to be reloaded if it
271 	 * is currently mounted read-only.  Additionally, we
272 	 * prevent read-write to read-only downgrades.
273 	 */
274 	if ((flags & (MNT_RELOAD | MNT_RDONLY)) != 0 &&
275 	    (mp->mnt_flag & MNT_RDONLY) == 0 &&
276 	    (mp->mnt_iflag & IMNT_CAN_RWTORO) == 0) {
277 		error = EOPNOTSUPP;	/* Needs translation */
278 		goto out;
279 	}
280 
281 	/*
282 	 * Enabling MNT_UNION requires a covered mountpoint and
283 	 * must not happen on the root mount.
284 	 */
285 	if ((flags & MNT_UNION) != 0 && mp->mnt_vnodecovered == NULLVP) {
286 		error = EOPNOTSUPP;
287 		goto out;
288 	}
289 
290 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
291 	    KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
292 	if (error)
293 		goto out;
294 
295 	error = vfs_suspend(mp, 0);
296 	if (error)
297 		goto out;
298 
299 	mutex_enter(mp->mnt_updating);
300 
301 	mp->mnt_flag &= ~MNT_OP_FLAGS;
302 	mp->mnt_flag |= flags & MNT_OP_FLAGS;
303 
304 	/*
305 	 * Set the mount level flags.
306 	 */
307 	if ((flags & MNT_RDONLY) != (mp->mnt_flag & MNT_RDONLY)) {
308 		if ((flags & MNT_RDONLY))
309 			mp->mnt_iflag |= IMNT_WANTRDONLY;
310 		else
311 			mp->mnt_iflag |= IMNT_WANTRDWR;
312 	}
313 	mp->mnt_flag &= ~MNT_BASIC_FLAGS;
314 	mp->mnt_flag |= flags & MNT_BASIC_FLAGS;
315 	if ((mp->mnt_iflag & IMNT_WANTRDONLY))
316 		mp->mnt_flag &= ~MNT_RDONLY;
317 
318 	error = VFS_MOUNT(mp, path, data, data_len);
319 
320 	if (error && data != NULL) {
321 		int error2;
322 
323 		/*
324 		 * Update failed; let's try and see if it was an
325 		 * export request.  For compat with 3.0 and earlier.
326 		 */
327 		error2 = vfs_hooks_reexport(mp, path, data);
328 
329 		/*
330 		 * Only update error code if the export request was
331 		 * understood but some problem occurred while
332 		 * processing it.
333 		 */
334 		if (error2 != EJUSTRETURN)
335 			error = error2;
336 	}
337 
338 	if (error == 0 && (mp->mnt_iflag & IMNT_WANTRDONLY))
339 		mp->mnt_flag |= MNT_RDONLY;
340 	if (error)
341 		mp->mnt_flag = saved_flags;
342 	mp->mnt_flag &= ~MNT_OP_FLAGS;
343 	mp->mnt_iflag &= ~(IMNT_WANTRDONLY | IMNT_WANTRDWR);
344 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
345 		if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0)
346 			vfs_syncer_add_to_worklist(mp);
347 	} else {
348 		if ((mp->mnt_iflag & IMNT_ONWORKLIST) != 0)
349 			vfs_syncer_remove_from_worklist(mp);
350 	}
351 	mutex_exit(mp->mnt_updating);
352 	vfs_resume(mp);
353 
354 	if ((error == 0) && !(saved_flags & MNT_EXTATTR) &&
355 	    (flags & MNT_EXTATTR)) {
356 		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_START,
357 				   NULL, 0, NULL) != 0) {
358 			printf("%s: failed to start extattr, error = %d",
359 			       mp->mnt_stat.f_mntonname, error);
360 			mp->mnt_flag &= ~MNT_EXTATTR;
361 		}
362 	}
363 
364 	if ((error == 0) && (saved_flags & MNT_EXTATTR) &&
365 	    !(flags & MNT_EXTATTR)) {
366 		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_STOP,
367 				   NULL, 0, NULL) != 0) {
368 			printf("%s: failed to stop extattr, error = %d",
369 			       mp->mnt_stat.f_mntonname, error);
370 			mp->mnt_flag |= MNT_RDONLY;
371 		}
372 	}
373  out:
374 	return (error);
375 }
376 
377 static int
378 mount_get_vfsops(const char *fstype, enum uio_seg type_seg,
379     struct vfsops **vfsops)
380 {
381 	char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
382 	int error;
383 
384 	if (type_seg == UIO_USERSPACE) {
385 		/* Copy file-system type from userspace.  */
386 		error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL);
387 	} else {
388 		error = copystr(fstype, fstypename, sizeof(fstypename), NULL);
389 		KASSERT(error == 0);
390 	}
391 
392 	if (error) {
393 		/*
394 		 * Historically, filesystem types were identified by numbers.
395 		 * If we get an integer for the filesystem type instead of a
396 		 * string, we check to see if it matches one of the historic
397 		 * filesystem types.
398 		 */
399 		u_long fsindex = (u_long)fstype;
400 		if (fsindex >= nmountcompatnames ||
401 		    mountcompatnames[fsindex] == NULL)
402 			return ENODEV;
403 		strlcpy(fstypename, mountcompatnames[fsindex],
404 		    sizeof(fstypename));
405 	}
406 
407 	/* Accept `ufs' as an alias for `ffs', for compatibility. */
408 	if (strcmp(fstypename, "ufs") == 0)
409 		fstypename[0] = 'f';
410 
411 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
412 		return 0;
413 
414 	/* If we can autoload a vfs module, try again */
415 	(void)module_autoload(fstypename, MODULE_CLASS_VFS);
416 
417 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
418 		return 0;
419 
420 	return ENODEV;
421 }
422 
423 static int
424 mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
425     void *data, size_t *data_len)
426 {
427 	struct mount *mp;
428 	int error;
429 
430 	/* If MNT_GETARGS is specified, it should be the only flag. */
431 	if (flags & ~MNT_GETARGS)
432 		return EINVAL;
433 
434 	mp = vp->v_mount;
435 
436 	/* XXX: probably some notion of "can see" here if we want isolation. */
437 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
438 	    KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
439 	if (error)
440 		return error;
441 
442 	if ((vp->v_vflag & VV_ROOT) == 0)
443 		return EINVAL;
444 
445 	if (vfs_busy(mp))
446 		return EPERM;
447 
448 	mutex_enter(mp->mnt_updating);
449 	mp->mnt_flag &= ~MNT_OP_FLAGS;
450 	mp->mnt_flag |= MNT_GETARGS;
451 	error = VFS_MOUNT(mp, path, data, data_len);
452 	mp->mnt_flag &= ~MNT_OP_FLAGS;
453 	mutex_exit(mp->mnt_updating);
454 
455 	vfs_unbusy(mp);
456 	return (error);
457 }
458 
459 int
460 sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval)
461 {
462 	/* {
463 		syscallarg(const char *) type;
464 		syscallarg(const char *) path;
465 		syscallarg(int) flags;
466 		syscallarg(void *) data;
467 		syscallarg(size_t) data_len;
468 	} */
469 
470 	return do_sys_mount(l, SCARG(uap, type), UIO_USERSPACE, SCARG(uap, path),
471 	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE,
472 	    SCARG(uap, data_len), retval);
473 }
474 
475 int
476 do_sys_mount(struct lwp *l, const char *type, enum uio_seg type_seg,
477     const char *path, int flags, void *data, enum uio_seg data_seg,
478     size_t data_len, register_t *retval)
479 {
480 	struct vfsops *vfsops = NULL;	/* XXX gcc4.8 */
481 	struct vnode *vp;
482 	void *data_buf = data;
483 	bool vfsopsrele = false;
484 	size_t alloc_sz = 0;
485 	int error;
486 
487 	/*
488 	 * Get vnode to be covered
489 	 */
490 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
491 	if (error != 0) {
492 		vp = NULL;
493 		goto done;
494 	}
495 
496 	if (flags & (MNT_GETARGS | MNT_UPDATE)) {
497 		vfsops = vp->v_mount->mnt_op;
498 	} else {
499 		/* 'type' is userspace */
500 		error = mount_get_vfsops(type, type_seg, &vfsops);
501 		if (error != 0)
502 			goto done;
503 		vfsopsrele = true;
504 	}
505 
506 	/*
507 	 * We allow data to be NULL, even for userspace. Some fs's don't need
508 	 * it. The others will handle NULL.
509 	 */
510 	if (data != NULL && data_seg == UIO_USERSPACE) {
511 		if (data_len == 0) {
512 			/* No length supplied, use default for filesystem */
513 			data_len = vfsops->vfs_min_mount_data;
514 
515 			/*
516 			 * Hopefully a longer buffer won't make copyin() fail.
517 			 * For compatibility with 3.0 and earlier.
518 			 */
519 			if (flags & MNT_UPDATE
520 			    && data_len < sizeof (struct mnt_export_args30))
521 				data_len = sizeof (struct mnt_export_args30);
522 		}
523 		if ((data_len == 0) || (data_len > VFS_MAX_MOUNT_DATA)) {
524 			error = EINVAL;
525 			goto done;
526 		}
527 		alloc_sz = data_len;
528 		data_buf = kmem_alloc(alloc_sz, KM_SLEEP);
529 
530 		/* NFS needs the buffer even for mnt_getargs .... */
531 		error = copyin(data, data_buf, data_len);
532 		if (error != 0)
533 			goto done;
534 	}
535 
536 	if (flags & MNT_GETARGS) {
537 		if (data_len == 0) {
538 			error = EINVAL;
539 			goto done;
540 		}
541 		error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
542 		if (error != 0)
543 			goto done;
544 		if (data_seg == UIO_USERSPACE)
545 			error = copyout(data_buf, data, data_len);
546 		*retval = data_len;
547 	} else if (flags & MNT_UPDATE) {
548 		error = mount_update(l, vp, path, flags, data_buf, &data_len);
549 	} else {
550 		/* Locking is handled internally in mount_domount(). */
551 		KASSERT(vfsopsrele == true);
552 		error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
553 		    &data_len);
554 		vfsopsrele = false;
555 	}
556 	if (!error)
557 		KNOTE(&fs_klist, VQ_MOUNT);
558 
559     done:
560 	if (vfsopsrele)
561 		vfs_delref(vfsops);
562     	if (vp != NULL) {
563 	    	vrele(vp);
564 	}
565 	if (data_buf != data)
566 		kmem_free(data_buf, alloc_sz);
567 	return (error);
568 }
569 
570 /*
571  * Unmount a file system.
572  *
573  * Note: unmount takes a path to the vnode mounted on as argument,
574  * not special file (as before).
575  */
576 /* ARGSUSED */
577 int
578 sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval)
579 {
580 	/* {
581 		syscallarg(const char *) path;
582 		syscallarg(int) flags;
583 	} */
584 	struct vnode *vp;
585 	struct mount *mp;
586 	int error;
587 	struct pathbuf *pb;
588 	struct nameidata nd;
589 
590 	error = pathbuf_copyin(SCARG(uap, path), &pb);
591 	if (error) {
592 		return error;
593 	}
594 
595 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb);
596 	if ((error = namei(&nd)) != 0) {
597 		pathbuf_destroy(pb);
598 		return error;
599 	}
600 	vp = nd.ni_vp;
601 	pathbuf_destroy(pb);
602 
603 	mp = vp->v_mount;
604 	vfs_ref(mp);
605 	VOP_UNLOCK(vp);
606 
607 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
608 	    KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
609 	if (error) {
610 		vrele(vp);
611 		vfs_rele(mp);
612 		return (error);
613 	}
614 
615 	/*
616 	 * Don't allow unmounting the root file system.
617 	 */
618 	if (mp->mnt_flag & MNT_ROOTFS) {
619 		vrele(vp);
620 		vfs_rele(mp);
621 		return (EINVAL);
622 	}
623 
624 	/*
625 	 * Must be the root of the filesystem
626 	 */
627 	if ((vp->v_vflag & VV_ROOT) == 0) {
628 		vrele(vp);
629 		vfs_rele(mp);
630 		return (EINVAL);
631 	}
632 
633 	vrele(vp);
634 	error = dounmount(mp, SCARG(uap, flags), l);
635 	vfs_rele(mp);
636 	if (!error)
637 		KNOTE(&fs_klist, VQ_UNMOUNT);
638 	return error;
639 }
640 
641 /*
642  * Sync each mounted filesystem.
643  */
644 #ifdef DEBUG
645 int syncprt = 0;
646 struct ctldebug debug0 = { "syncprt", &syncprt };
647 #endif
648 
649 void
650 do_sys_sync(struct lwp *l)
651 {
652 	mount_iterator_t *iter;
653 	struct mount *mp;
654 	int asyncflag;
655 
656 	mountlist_iterator_init(&iter);
657 	while ((mp = mountlist_iterator_next(iter)) != NULL) {
658 		mutex_enter(mp->mnt_updating);
659 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
660 			asyncflag = mp->mnt_flag & MNT_ASYNC;
661 			mp->mnt_flag &= ~MNT_ASYNC;
662 			VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
663 			if (asyncflag)
664 				 mp->mnt_flag |= MNT_ASYNC;
665 		}
666 		mutex_exit(mp->mnt_updating);
667 	}
668 	mountlist_iterator_destroy(iter);
669 #ifdef DEBUG
670 	if (syncprt)
671 		vfs_bufstats();
672 #endif /* DEBUG */
673 }
674 
675 /* ARGSUSED */
676 int
677 sys_sync(struct lwp *l, const void *v, register_t *retval)
678 {
679 	do_sys_sync(l);
680 	return (0);
681 }
682 
683 
684 /*
685  * Access or change filesystem quotas.
686  *
687  * (this is really 14 different calls bundled into one)
688  */
689 
690 static int
691 do_sys_quotactl_stat(struct mount *mp, struct quotastat *info_u)
692 {
693 	struct quotastat info_k;
694 	int error;
695 
696 	/* ensure any padding bytes are cleared */
697 	memset(&info_k, 0, sizeof(info_k));
698 
699 	error = vfs_quotactl_stat(mp, &info_k);
700 	if (error) {
701 		return error;
702 	}
703 
704 	return copyout(&info_k, info_u, sizeof(info_k));
705 }
706 
707 static int
708 do_sys_quotactl_idtypestat(struct mount *mp, int idtype,
709     struct quotaidtypestat *info_u)
710 {
711 	struct quotaidtypestat info_k;
712 	int error;
713 
714 	/* ensure any padding bytes are cleared */
715 	memset(&info_k, 0, sizeof(info_k));
716 
717 	error = vfs_quotactl_idtypestat(mp, idtype, &info_k);
718 	if (error) {
719 		return error;
720 	}
721 
722 	return copyout(&info_k, info_u, sizeof(info_k));
723 }
724 
725 static int
726 do_sys_quotactl_objtypestat(struct mount *mp, int objtype,
727     struct quotaobjtypestat *info_u)
728 {
729 	struct quotaobjtypestat info_k;
730 	int error;
731 
732 	/* ensure any padding bytes are cleared */
733 	memset(&info_k, 0, sizeof(info_k));
734 
735 	error = vfs_quotactl_objtypestat(mp, objtype, &info_k);
736 	if (error) {
737 		return error;
738 	}
739 
740 	return copyout(&info_k, info_u, sizeof(info_k));
741 }
742 
743 static int
744 do_sys_quotactl_get(struct mount *mp, const struct quotakey *key_u,
745     struct quotaval *val_u)
746 {
747 	struct quotakey key_k;
748 	struct quotaval val_k;
749 	int error;
750 
751 	/* ensure any padding bytes are cleared */
752 	memset(&val_k, 0, sizeof(val_k));
753 
754 	error = copyin(key_u, &key_k, sizeof(key_k));
755 	if (error) {
756 		return error;
757 	}
758 
759 	error = vfs_quotactl_get(mp, &key_k, &val_k);
760 	if (error) {
761 		return error;
762 	}
763 
764 	return copyout(&val_k, val_u, sizeof(val_k));
765 }
766 
767 static int
768 do_sys_quotactl_put(struct mount *mp, const struct quotakey *key_u,
769     const struct quotaval *val_u)
770 {
771 	struct quotakey key_k;
772 	struct quotaval val_k;
773 	int error;
774 
775 	error = copyin(key_u, &key_k, sizeof(key_k));
776 	if (error) {
777 		return error;
778 	}
779 
780 	error = copyin(val_u, &val_k, sizeof(val_k));
781 	if (error) {
782 		return error;
783 	}
784 
785 	return vfs_quotactl_put(mp, &key_k, &val_k);
786 }
787 
788 static int
789 do_sys_quotactl_del(struct mount *mp, const struct quotakey *key_u)
790 {
791 	struct quotakey key_k;
792 	int error;
793 
794 	error = copyin(key_u, &key_k, sizeof(key_k));
795 	if (error) {
796 		return error;
797 	}
798 
799 	return vfs_quotactl_del(mp, &key_k);
800 }
801 
802 static int
803 do_sys_quotactl_cursoropen(struct mount *mp, struct quotakcursor *cursor_u)
804 {
805 	struct quotakcursor cursor_k;
806 	int error;
807 
808 	/* ensure any padding bytes are cleared */
809 	memset(&cursor_k, 0, sizeof(cursor_k));
810 
811 	error = vfs_quotactl_cursoropen(mp, &cursor_k);
812 	if (error) {
813 		return error;
814 	}
815 
816 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
817 }
818 
819 static int
820 do_sys_quotactl_cursorclose(struct mount *mp, struct quotakcursor *cursor_u)
821 {
822 	struct quotakcursor cursor_k;
823 	int error;
824 
825 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
826 	if (error) {
827 		return error;
828 	}
829 
830 	return vfs_quotactl_cursorclose(mp, &cursor_k);
831 }
832 
833 static int
834 do_sys_quotactl_cursorskipidtype(struct mount *mp,
835     struct quotakcursor *cursor_u, int idtype)
836 {
837 	struct quotakcursor cursor_k;
838 	int error;
839 
840 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
841 	if (error) {
842 		return error;
843 	}
844 
845 	error = vfs_quotactl_cursorskipidtype(mp, &cursor_k, idtype);
846 	if (error) {
847 		return error;
848 	}
849 
850 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
851 }
852 
853 static int
854 do_sys_quotactl_cursorget(struct mount *mp, struct quotakcursor *cursor_u,
855     struct quotakey *keys_u, struct quotaval *vals_u, unsigned maxnum,
856     unsigned *ret_u)
857 {
858 #define CGET_STACK_MAX 8
859 	struct quotakcursor cursor_k;
860 	struct quotakey stackkeys[CGET_STACK_MAX];
861 	struct quotaval stackvals[CGET_STACK_MAX];
862 	struct quotakey *keys_k;
863 	struct quotaval *vals_k;
864 	unsigned ret_k;
865 	int error;
866 
867 	if (maxnum > 128) {
868 		maxnum = 128;
869 	}
870 
871 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
872 	if (error) {
873 		return error;
874 	}
875 
876 	if (maxnum <= CGET_STACK_MAX) {
877 		keys_k = stackkeys;
878 		vals_k = stackvals;
879 		/* ensure any padding bytes are cleared */
880 		memset(keys_k, 0, maxnum * sizeof(keys_k[0]));
881 		memset(vals_k, 0, maxnum * sizeof(vals_k[0]));
882 	} else {
883 		keys_k = kmem_zalloc(maxnum * sizeof(keys_k[0]), KM_SLEEP);
884 		vals_k = kmem_zalloc(maxnum * sizeof(vals_k[0]), KM_SLEEP);
885 	}
886 
887 	error = vfs_quotactl_cursorget(mp, &cursor_k, keys_k, vals_k, maxnum,
888 				       &ret_k);
889 	if (error) {
890 		goto fail;
891 	}
892 
893 	error = copyout(keys_k, keys_u, ret_k * sizeof(keys_k[0]));
894 	if (error) {
895 		goto fail;
896 	}
897 
898 	error = copyout(vals_k, vals_u, ret_k * sizeof(vals_k[0]));
899 	if (error) {
900 		goto fail;
901 	}
902 
903 	error = copyout(&ret_k, ret_u, sizeof(ret_k));
904 	if (error) {
905 		goto fail;
906 	}
907 
908 	/* do last to maximize the chance of being able to recover a failure */
909 	error = copyout(&cursor_k, cursor_u, sizeof(cursor_k));
910 
911 fail:
912 	if (keys_k != stackkeys) {
913 		kmem_free(keys_k, maxnum * sizeof(keys_k[0]));
914 	}
915 	if (vals_k != stackvals) {
916 		kmem_free(vals_k, maxnum * sizeof(vals_k[0]));
917 	}
918 	return error;
919 }
920 
921 static int
922 do_sys_quotactl_cursoratend(struct mount *mp, struct quotakcursor *cursor_u,
923     int *ret_u)
924 {
925 	struct quotakcursor cursor_k;
926 	int ret_k;
927 	int error;
928 
929 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
930 	if (error) {
931 		return error;
932 	}
933 
934 	error = vfs_quotactl_cursoratend(mp, &cursor_k, &ret_k);
935 	if (error) {
936 		return error;
937 	}
938 
939 	error = copyout(&ret_k, ret_u, sizeof(ret_k));
940 	if (error) {
941 		return error;
942 	}
943 
944 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
945 }
946 
947 static int
948 do_sys_quotactl_cursorrewind(struct mount *mp, struct quotakcursor *cursor_u)
949 {
950 	struct quotakcursor cursor_k;
951 	int error;
952 
953 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
954 	if (error) {
955 		return error;
956 	}
957 
958 	error = vfs_quotactl_cursorrewind(mp, &cursor_k);
959 	if (error) {
960 		return error;
961 	}
962 
963 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
964 }
965 
966 static int
967 do_sys_quotactl_quotaon(struct mount *mp, int idtype, const char *path_u)
968 {
969 	char *path_k;
970 	int error;
971 
972 	/* XXX this should probably be a struct pathbuf */
973 	path_k = PNBUF_GET();
974 	error = copyin(path_u, path_k, PATH_MAX);
975 	if (error) {
976 		PNBUF_PUT(path_k);
977 		return error;
978 	}
979 
980 	error = vfs_quotactl_quotaon(mp, idtype, path_k);
981 
982 	PNBUF_PUT(path_k);
983 	return error;
984 }
985 
/*
 * QUOTACTL_QUOTAOFF: thin wrapper around vfs_quotactl_quotaoff().
 */
static int
do_sys_quotactl_quotaoff(struct mount *mp, int idtype)
{
	int rv;

	rv = vfs_quotactl_quotaoff(mp, idtype);
	return rv;
}
991 
992 int
993 do_sys_quotactl(const char *path_u, const struct quotactl_args *args)
994 {
995 	struct mount *mp;
996 	struct vnode *vp;
997 	int error;
998 
999 	error = namei_simple_user(path_u, NSM_FOLLOW_TRYEMULROOT, &vp);
1000 	if (error != 0)
1001 		return (error);
1002 	mp = vp->v_mount;
1003 
1004 	switch (args->qc_op) {
1005 	    case QUOTACTL_STAT:
1006 		error = do_sys_quotactl_stat(mp, args->u.stat.qc_info);
1007 		break;
1008 	    case QUOTACTL_IDTYPESTAT:
1009 		error = do_sys_quotactl_idtypestat(mp,
1010 				args->u.idtypestat.qc_idtype,
1011 				args->u.idtypestat.qc_info);
1012 		break;
1013 	    case QUOTACTL_OBJTYPESTAT:
1014 		error = do_sys_quotactl_objtypestat(mp,
1015 				args->u.objtypestat.qc_objtype,
1016 				args->u.objtypestat.qc_info);
1017 		break;
1018 	    case QUOTACTL_GET:
1019 		error = do_sys_quotactl_get(mp,
1020 				args->u.get.qc_key,
1021 				args->u.get.qc_val);
1022 		break;
1023 	    case QUOTACTL_PUT:
1024 		error = do_sys_quotactl_put(mp,
1025 				args->u.put.qc_key,
1026 				args->u.put.qc_val);
1027 		break;
1028 	    case QUOTACTL_DEL:
1029 		error = do_sys_quotactl_del(mp, args->u.del.qc_key);
1030 		break;
1031 	    case QUOTACTL_CURSOROPEN:
1032 		error = do_sys_quotactl_cursoropen(mp,
1033 				args->u.cursoropen.qc_cursor);
1034 		break;
1035 	    case QUOTACTL_CURSORCLOSE:
1036 		error = do_sys_quotactl_cursorclose(mp,
1037 				args->u.cursorclose.qc_cursor);
1038 		break;
1039 	    case QUOTACTL_CURSORSKIPIDTYPE:
1040 		error = do_sys_quotactl_cursorskipidtype(mp,
1041 				args->u.cursorskipidtype.qc_cursor,
1042 				args->u.cursorskipidtype.qc_idtype);
1043 		break;
1044 	    case QUOTACTL_CURSORGET:
1045 		error = do_sys_quotactl_cursorget(mp,
1046 				args->u.cursorget.qc_cursor,
1047 				args->u.cursorget.qc_keys,
1048 				args->u.cursorget.qc_vals,
1049 				args->u.cursorget.qc_maxnum,
1050 				args->u.cursorget.qc_ret);
1051 		break;
1052 	    case QUOTACTL_CURSORATEND:
1053 		error = do_sys_quotactl_cursoratend(mp,
1054 				args->u.cursoratend.qc_cursor,
1055 				args->u.cursoratend.qc_ret);
1056 		break;
1057 	    case QUOTACTL_CURSORREWIND:
1058 		error = do_sys_quotactl_cursorrewind(mp,
1059 				args->u.cursorrewind.qc_cursor);
1060 		break;
1061 	    case QUOTACTL_QUOTAON:
1062 		error = do_sys_quotactl_quotaon(mp,
1063 				args->u.quotaon.qc_idtype,
1064 				args->u.quotaon.qc_quotafile);
1065 		break;
1066 	    case QUOTACTL_QUOTAOFF:
1067 		error = do_sys_quotactl_quotaoff(mp,
1068 				args->u.quotaoff.qc_idtype);
1069 		break;
1070 	    default:
1071 		error = EINVAL;
1072 		break;
1073 	}
1074 
1075 	vrele(vp);
1076 	return error;
1077 }
1078 
1079 /* ARGSUSED */
1080 int
1081 sys___quotactl(struct lwp *l, const struct sys___quotactl_args *uap,
1082     register_t *retval)
1083 {
1084 	/* {
1085 		syscallarg(const char *) path;
1086 		syscallarg(struct quotactl_args *) args;
1087 	} */
1088 	struct quotactl_args args;
1089 	int error;
1090 
1091 	error = copyin(SCARG(uap, args), &args, sizeof(args));
1092 	if (error) {
1093 		return error;
1094 	}
1095 
1096 	return do_sys_quotactl(SCARG(uap, path), &args);
1097 }
1098 
1099 int
1100 dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
1101     int root)
1102 {
1103 	struct vnode *rvp;
1104 	int error = 0;
1105 
1106 	/*
1107 	 * If MNT_NOWAIT or MNT_LAZY is specified, do not
1108 	 * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
1109 	 * overrides MNT_NOWAIT.
1110 	 */
1111 	KASSERT(l == curlwp);
1112 	rvp = cwdrdir();
1113 	if (flags == MNT_NOWAIT	|| flags == MNT_LAZY ||
1114 	    (flags != MNT_WAIT && flags != 0)) {
1115 		memcpy(sp, &mp->mnt_stat, sizeof(*sp));
1116 	} else {
1117 		/* Get the filesystem stats now */
1118 		memset(sp, 0, sizeof(*sp));
1119 		if ((error = VFS_STATVFS(mp, sp)) != 0) {
1120 			if (rvp)
1121 				vrele(rvp);
1122 			return error;
1123 		}
1124 		if (rvp == NULL)
1125 			(void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
1126 	}
1127 
1128 	if (rvp != NULL) {
1129 		size_t len;
1130 		char *bp;
1131 		char c;
1132 		char *path = PNBUF_GET();
1133 
1134 		bp = path + MAXPATHLEN;
1135 		*--bp = '\0';
1136 		error = getcwd_common(rvp, rootvnode, &bp, path,
1137 		    MAXPATHLEN / 2, 0, l);
1138 		if (error) {
1139 			PNBUF_PUT(path);
1140 			vrele(rvp);
1141 			return error;
1142 		}
1143 		len = strlen(bp);
1144 		if (len != 1) {
1145 			/*
1146 			 * for mount points that are below our root, we can see
1147 			 * them, so we fix up the pathname and return them. The
1148 			 * rest we cannot see, so we don't allow viewing the
1149 			 * data.
1150 			 */
1151 			if (strncmp(bp, sp->f_mntonname, len) == 0 &&
1152 			    ((c = sp->f_mntonname[len]) == '/' || c == '\0')) {
1153 				(void)strlcpy(sp->f_mntonname,
1154 				    c == '\0' ? "/" : &sp->f_mntonname[len],
1155 				    sizeof(sp->f_mntonname));
1156 			} else {
1157 				if (root)
1158 					(void)strlcpy(sp->f_mntonname, "/",
1159 					    sizeof(sp->f_mntonname));
1160 				else
1161 					error = EPERM;
1162 			}
1163 		}
1164 		PNBUF_PUT(path);
1165 		vrele(rvp);
1166 	}
1167 	sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
1168 	return error;
1169 }
1170 
1171 /*
1172  * Get filesystem statistics by path.
1173  */
1174 int
1175 do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
1176 {
1177 	struct mount *mp;
1178 	int error;
1179 	struct vnode *vp;
1180 
1181 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
1182 	if (error != 0)
1183 		return error;
1184 	mp = vp->v_mount;
1185 	error = dostatvfs(mp, sb, l, flags, 1);
1186 	vrele(vp);
1187 	return error;
1188 }
1189 
1190 /* ARGSUSED */
1191 int
1192 sys___statvfs190(struct lwp *l, const struct sys___statvfs190_args *uap, register_t *retval)
1193 {
1194 	/* {
1195 		syscallarg(const char *) path;
1196 		syscallarg(struct statvfs *) buf;
1197 		syscallarg(int) flags;
1198 	} */
1199 	struct statvfs *sb;
1200 	int error;
1201 
1202 	sb = STATVFSBUF_GET();
1203 	error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
1204 	if (error == 0)
1205 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1206 	STATVFSBUF_PUT(sb);
1207 	return error;
1208 }
1209 
1210 /*
1211  * Get filesystem statistics by fd.
1212  */
1213 int
1214 do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
1215 {
1216 	file_t *fp;
1217 	struct mount *mp;
1218 	int error;
1219 
1220 	/* fd_getvnode() will use the descriptor for us */
1221 	if ((error = fd_getvnode(fd, &fp)) != 0)
1222 		return (error);
1223 	mp = fp->f_vnode->v_mount;
1224 	error = dostatvfs(mp, sb, curlwp, flags, 1);
1225 	fd_putfile(fd);
1226 	return error;
1227 }
1228 
1229 /* ARGSUSED */
1230 int
1231 sys___fstatvfs190(struct lwp *l, const struct sys___fstatvfs190_args *uap, register_t *retval)
1232 {
1233 	/* {
1234 		syscallarg(int) fd;
1235 		syscallarg(struct statvfs *) buf;
1236 		syscallarg(int) flags;
1237 	} */
1238 	struct statvfs *sb;
1239 	int error;
1240 
1241 	sb = STATVFSBUF_GET();
1242 	error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
1243 	if (error == 0)
1244 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1245 	STATVFSBUF_PUT(sb);
1246 	return error;
1247 }
1248 
1249 
1250 /*
1251  * Get statistics on all filesystems.
1252  */
1253 int
1254 do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
1255     int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
1256     register_t *retval)
1257 {
1258 	int root = 0;
1259 	mount_iterator_t *iter;
1260 	struct proc *p = l->l_proc;
1261 	struct mount *mp;
1262 	struct statvfs *sb;
1263 	size_t count, maxcount;
1264 	int error = 0;
1265 
1266 	sb = STATVFSBUF_GET();
1267 	maxcount = bufsize / entry_sz;
1268 	count = 0;
1269 	mountlist_iterator_init(&iter);
1270 	while ((mp = mountlist_iterator_next(iter)) != NULL) {
1271 		if (sfsp && count < maxcount) {
1272 			error = dostatvfs(mp, sb, l, flags, 0);
1273 			if (error) {
1274 				error = 0;
1275 				continue;
1276 			}
1277 			error = copyfn(sb, sfsp, entry_sz);
1278 			if (error)
1279 				goto out;
1280 			sfsp = (char *)sfsp + entry_sz;
1281 			root |= strcmp(sb->f_mntonname, "/") == 0;
1282 		}
1283 		count++;
1284 	}
1285 
1286 	if (root == 0 && p->p_cwdi->cwdi_rdir) {
1287 		/*
1288 		 * fake a root entry
1289 		 */
1290 		error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
1291 		    sb, l, flags, 1);
1292 		if (error != 0)
1293 			goto out;
1294 		if (sfsp) {
1295 			error = copyfn(sb, sfsp, entry_sz);
1296 			if (error != 0)
1297 				goto out;
1298 		}
1299 		count++;
1300 	}
1301 	if (sfsp && count > maxcount)
1302 		*retval = maxcount;
1303 	else
1304 		*retval = count;
1305 out:
1306 	mountlist_iterator_destroy(iter);
1307 	STATVFSBUF_PUT(sb);
1308 	return error;
1309 }
1310 
1311 int
1312 sys___getvfsstat90(struct lwp *l, const struct sys___getvfsstat90_args *uap,
1313     register_t *retval)
1314 {
1315 	/* {
1316 		syscallarg(struct statvfs *) buf;
1317 		syscallarg(size_t) bufsize;
1318 		syscallarg(int) flags;
1319 	} */
1320 
1321 	return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
1322 	    SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
1323 }
1324 
1325 /*
1326  * Change current working directory to a given file descriptor.
1327  */
1328 /* ARGSUSED */
1329 int
1330 sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval)
1331 {
1332 	/* {
1333 		syscallarg(int) fd;
1334 	} */
1335 	struct cwdinfo *cwdi;
1336 	struct vnode *vp, *tdp;
1337 	struct mount *mp;
1338 	file_t *fp;
1339 	int error, fd;
1340 
1341 	/* fd_getvnode() will use the descriptor for us */
1342 	fd = SCARG(uap, fd);
1343 	if ((error = fd_getvnode(fd, &fp)) != 0)
1344 		return (error);
1345 	vp = fp->f_vnode;
1346 
1347 	vref(vp);
1348 	vn_lock(vp, LK_SHARED | LK_RETRY);
1349 	if (vp->v_type != VDIR)
1350 		error = ENOTDIR;
1351 	else
1352 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1353 	if (error) {
1354 		vput(vp);
1355 		goto out;
1356 	}
1357 	while ((mp = vp->v_mountedhere) != NULL) {
1358 		error = vfs_busy(mp);
1359 		vput(vp);
1360 		if (error != 0)
1361 			goto out;
1362 		error = VFS_ROOT(mp, LK_SHARED, &tdp);
1363 		vfs_unbusy(mp);
1364 		if (error)
1365 			goto out;
1366 		vp = tdp;
1367 	}
1368 	VOP_UNLOCK(vp);
1369 
1370 	/*
1371 	 * Disallow changing to a directory not under the process's
1372 	 * current root directory (if there is one).
1373 	 */
1374 	cwdi = cwdenter(RW_WRITER);
1375 	if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
1376 		vrele(vp);
1377 		error = EPERM;	/* operation not permitted */
1378 	} else {
1379 		vrele(cwdi->cwdi_cdir);
1380 		cwdi->cwdi_cdir = vp;
1381 	}
1382 	cwdexit(cwdi);
1383 
1384  out:
1385 	fd_putfile(fd);
1386 	return (error);
1387 }
1388 
1389 /*
1390  * Change this process's notion of the root directory to a given file
1391  * descriptor.
1392  */
1393 int
1394 sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval)
1395 {
1396 	struct vnode	*vp;
1397 	file_t	*fp;
1398 	int		 error, fd = SCARG(uap, fd);
1399 
1400 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1401  	    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
1402 		return error;
1403 	/* fd_getvnode() will use the descriptor for us */
1404 	if ((error = fd_getvnode(fd, &fp)) != 0)
1405 		return error;
1406 	vp = fp->f_vnode;
1407 	vn_lock(vp, LK_SHARED | LK_RETRY);
1408 	if (vp->v_type != VDIR)
1409 		error = ENOTDIR;
1410 	else
1411 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1412 	VOP_UNLOCK(vp);
1413 	if (error)
1414 		goto out;
1415 	vref(vp);
1416 	change_root(vp);
1417 
1418  out:
1419 	fd_putfile(fd);
1420 	return (error);
1421 }
1422 
1423 /*
1424  * Change current working directory (``.'').
1425  */
1426 /* ARGSUSED */
1427 int
1428 sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
1429 {
1430 	/* {
1431 		syscallarg(const char *) path;
1432 	} */
1433 	struct cwdinfo *cwdi;
1434 	int error;
1435 	struct vnode *vp, *ovp;
1436 
1437 	error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE, &vp, l);
1438 	if (error != 0)
1439 		return (error);
1440 
1441 	cwdi = cwdenter(RW_WRITER);
1442 	ovp = cwdi->cwdi_cdir;
1443 	cwdi->cwdi_cdir = vp;
1444 	cwdexit(cwdi);
1445 	vrele(ovp);
1446 	return (0);
1447 }
1448 
1449 /*
1450  * Change notion of root (``/'') directory.
1451  */
1452 /* ARGSUSED */
1453 int
1454 sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval)
1455 {
1456 	/* {
1457 		syscallarg(const char *) path;
1458 	} */
1459 	int error;
1460 	struct vnode *vp;
1461 
1462 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1463 	    KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
1464 		return (error);
1465 
1466 	error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE, &vp, l);
1467 	if (error == 0)
1468 		change_root(vp);
1469 	return error;
1470 }
1471 
1472 /*
1473  * Common routine for chroot and fchroot.
1474  * NB: callers need to properly authorize the change root operation.
1475  */
1476 void
1477 change_root(struct vnode *vp)
1478 {
1479 	struct cwdinfo *cwdi;
1480 	kauth_cred_t ncred;
1481 	struct lwp *l = curlwp;
1482 	struct proc *p = l->l_proc;
1483 
1484 	ncred = kauth_cred_alloc();
1485 
1486 	cwdi = cwdenter(RW_WRITER);
1487 	if (cwdi->cwdi_rdir != NULL)
1488 		vrele(cwdi->cwdi_rdir);
1489 	cwdi->cwdi_rdir = vp;
1490 
1491 	/*
1492 	 * Prevent escaping from chroot by putting the root under
1493 	 * the working directory.  Silently chdir to / if we aren't
1494 	 * already there.
1495 	 */
1496 	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1497 		/*
1498 		 * XXX would be more failsafe to change directory to a
1499 		 * deadfs node here instead
1500 		 */
1501 		vrele(cwdi->cwdi_cdir);
1502 		vref(vp);
1503 		cwdi->cwdi_cdir = vp;
1504 	}
1505 	cwdexit(cwdi);
1506 
1507 	/* Get a write lock on the process credential. */
1508 	proc_crmod_enter();
1509 
1510 	kauth_cred_clone(p->p_cred, ncred);
1511 	kauth_proc_chroot(ncred, p->p_cwdi);
1512 
1513 	/* Broadcast our credentials to the process and other LWPs. */
1514  	proc_crmod_leave(ncred, p->p_cred, true);
1515 }
1516 
1517 /*
1518  * Common routine for chroot and chdir.
1519  * XXX "where" should be enum uio_seg
1520  */
1521 int
1522 chdir_lookup(const char *path, int where, struct vnode **vpp, struct lwp *l)
1523 {
1524 	struct pathbuf *pb;
1525 	struct nameidata nd;
1526 	int error;
1527 
1528 	error = pathbuf_maybe_copyin(path, where, &pb);
1529 	if (error) {
1530 		return error;
1531 	}
1532 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
1533 	if ((error = namei(&nd)) != 0) {
1534 		pathbuf_destroy(pb);
1535 		return error;
1536 	}
1537 	*vpp = nd.ni_vp;
1538 	pathbuf_destroy(pb);
1539 
1540 	if ((*vpp)->v_type != VDIR)
1541 		error = ENOTDIR;
1542 	else
1543 		error = VOP_ACCESS(*vpp, VEXEC, l->l_cred);
1544 
1545 	if (error)
1546 		vput(*vpp);
1547 	else
1548 		VOP_UNLOCK(*vpp);
1549 	return (error);
1550 }
1551 
1552 /*
1553  * Internals of sys_open - path has already been converted into a pathbuf
1554  * (so we can easily reuse this function from other parts of the kernel,
1555  * like posix_spawn post-processing).
1556  */
1557 int
1558 do_open(lwp_t *l, struct vnode *dvp, struct pathbuf *pb, int open_flags,
1559 	int open_mode, int *fd)
1560 {
1561 	struct proc *p = l->l_proc;
1562 	struct cwdinfo *cwdi = p->p_cwdi;
1563 	file_t *fp;
1564 	struct vnode *vp;
1565 	int flags, cmode;
1566 	int indx, error;
1567 	struct nameidata nd;
1568 
1569 	if (open_flags & O_SEARCH) {
1570 		open_flags &= ~(int)O_SEARCH;
1571 	}
1572 
1573 	/*
1574 	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
1575 	 * may be specified.
1576 	 */
1577 	if ((open_flags & O_EXEC) && (open_flags & O_ACCMODE))
1578 		return EINVAL;
1579 
1580 	flags = FFLAGS(open_flags);
1581 	if ((flags & (FREAD | FWRITE)) == 0)
1582 		return EINVAL;
1583 
1584 	if ((error = fd_allocfile(&fp, &indx)) != 0) {
1585 		return error;
1586 	}
1587 
1588 	/* We're going to read cwdi->cwdi_cmask unlocked here. */
1589 	cmode = ((open_mode &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1590 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, pb);
1591 	if (dvp != NULL)
1592 		NDAT(&nd, dvp);
1593 
1594 	l->l_dupfd = -indx - 1;			/* XXX check for fdopen */
1595 	if ((error = vn_open(&nd, flags, cmode)) != 0) {
1596 		fd_abort(p, fp, indx);
1597 		if ((error == EDUPFD || error == EMOVEFD) &&
1598 		    l->l_dupfd >= 0 &&			/* XXX from fdopen */
1599 		    (error =
1600 			fd_dupopen(l->l_dupfd, &indx, flags, error)) == 0) {
1601 			*fd = indx;
1602 			return 0;
1603 		}
1604 		if (error == ERESTART)
1605 			error = EINTR;
1606 		return error;
1607 	}
1608 
1609 	l->l_dupfd = 0;
1610 	vp = nd.ni_vp;
1611 
1612 	if ((error = open_setfp(l, fp, vp, indx, flags)))
1613 		return error;
1614 
1615 	VOP_UNLOCK(vp);
1616 	*fd = indx;
1617 	fd_affix(p, fp, indx);
1618 	return 0;
1619 }
1620 
1621 int
1622 fd_open(const char *path, int open_flags, int open_mode, int *fd)
1623 {
1624 	struct pathbuf *pb;
1625 	int error, oflags;
1626 
1627 	oflags = FFLAGS(open_flags);
1628 	if ((oflags & (FREAD | FWRITE)) == 0)
1629 		return EINVAL;
1630 
1631 	pb = pathbuf_create(path);
1632 	if (pb == NULL)
1633 		return ENOMEM;
1634 
1635 	error = do_open(curlwp, NULL, pb, open_flags, open_mode, fd);
1636 	pathbuf_destroy(pb);
1637 
1638 	return error;
1639 }
1640 
1641 static int
1642 do_sys_openat(lwp_t *l, int fdat, const char *path, int flags,
1643     int mode, int *fd)
1644 {
1645 	file_t *dfp = NULL;
1646 	struct vnode *dvp = NULL;
1647 	struct pathbuf *pb;
1648 	const char *pathstring = NULL;
1649 	int error;
1650 
1651 	if (path == NULL) {
1652 		MODULE_HOOK_CALL(vfs_openat_10_hook, (&pb), enosys(), error);
1653 		if (error == ENOSYS)
1654 			goto no_compat;
1655 		if (error)
1656 			return error;
1657 	} else {
1658 no_compat:
1659 		error = pathbuf_copyin(path, &pb);
1660 		if (error)
1661 			return error;
1662 	}
1663 
1664 	pathstring = pathbuf_stringcopy_get(pb);
1665 
1666 	/*
1667 	 * fdat is ignored if:
1668 	 * 1) if fdat is AT_FDCWD, which means use current directory as base.
1669 	 * 2) if path is absolute, then fdat is useless.
1670 	 */
1671 	if (fdat != AT_FDCWD && pathstring[0] != '/') {
1672 		/* fd_getvnode() will use the descriptor for us */
1673 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
1674 			goto out;
1675 
1676 		dvp = dfp->f_vnode;
1677 	}
1678 
1679 	error = do_open(l, dvp, pb, flags, mode, fd);
1680 
1681 	if (dfp != NULL)
1682 		fd_putfile(fdat);
1683 out:
1684 	pathbuf_stringcopy_put(pb, pathstring);
1685 	pathbuf_destroy(pb);
1686 	return error;
1687 }
1688 
1689 int
1690 sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1691 {
1692 	/* {
1693 		syscallarg(const char *) path;
1694 		syscallarg(int) flags;
1695 		syscallarg(int) mode;
1696 	} */
1697 	int error;
1698 	int fd;
1699 
1700 	error = do_sys_openat(l, AT_FDCWD, SCARG(uap, path),
1701 			      SCARG(uap, flags), SCARG(uap, mode), &fd);
1702 
1703 	if (error == 0)
1704 		*retval = fd;
1705 
1706 	return error;
1707 }
1708 
1709 int
1710 sys_openat(struct lwp *l, const struct sys_openat_args *uap, register_t *retval)
1711 {
1712 	/* {
1713 		syscallarg(int) fd;
1714 		syscallarg(const char *) path;
1715 		syscallarg(int) oflags;
1716 		syscallarg(int) mode;
1717 	} */
1718 	int error;
1719 	int fd;
1720 
1721 	error = do_sys_openat(l, SCARG(uap, fd), SCARG(uap, path),
1722 			      SCARG(uap, oflags), SCARG(uap, mode), &fd);
1723 
1724 	if (error == 0)
1725 		*retval = fd;
1726 
1727 	return error;
1728 }
1729 
1730 static void
1731 vfs__fhfree(fhandle_t *fhp)
1732 {
1733 	size_t fhsize;
1734 
1735 	fhsize = FHANDLE_SIZE(fhp);
1736 	kmem_free(fhp, fhsize);
1737 }
1738 
1739 /*
1740  * vfs_composefh: compose a filehandle.
1741  */
1742 
1743 int
1744 vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1745 {
1746 	struct mount *mp;
1747 	struct fid *fidp;
1748 	int error;
1749 	size_t needfhsize;
1750 	size_t fidsize;
1751 
1752 	mp = vp->v_mount;
1753 	fidp = NULL;
1754 	if (*fh_size < FHANDLE_SIZE_MIN) {
1755 		fidsize = 0;
1756 	} else {
1757 		fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1758 		if (fhp != NULL) {
1759 			memset(fhp, 0, *fh_size);
1760 			fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1761 			fidp = &fhp->fh_fid;
1762 		}
1763 	}
1764 	error = VFS_VPTOFH(vp, fidp, &fidsize);
1765 	needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1766 	if (error == 0 && *fh_size < needfhsize) {
1767 		error = E2BIG;
1768 	}
1769 	*fh_size = needfhsize;
1770 	return error;
1771 }
1772 
1773 int
1774 vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1775 {
1776 	struct mount *mp;
1777 	fhandle_t *fhp;
1778 	size_t fhsize;
1779 	size_t fidsize;
1780 	int error;
1781 
1782 	mp = vp->v_mount;
1783 	fidsize = 0;
1784 	error = VFS_VPTOFH(vp, NULL, &fidsize);
1785 	KASSERT(error != 0);
1786 	if (error != E2BIG) {
1787 		goto out;
1788 	}
1789 	fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1790 	fhp = kmem_zalloc(fhsize, KM_SLEEP);
1791 	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1792 	error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1793 	if (error == 0) {
1794 		KASSERT((FHANDLE_SIZE(fhp) == fhsize &&
1795 		    FHANDLE_FILEID(fhp)->fid_len == fidsize));
1796 		*fhpp = fhp;
1797 	} else {
1798 		kmem_free(fhp, fhsize);
1799 	}
1800 out:
1801 	return error;
1802 }
1803 
1804 void
1805 vfs_composefh_free(fhandle_t *fhp)
1806 {
1807 
1808 	vfs__fhfree(fhp);
1809 }
1810 
1811 /*
1812  * vfs_fhtovp: lookup a vnode by a filehandle.
1813  */
1814 
1815 int
1816 vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
1817 {
1818 	struct mount *mp;
1819 	int error;
1820 
1821 	*vpp = NULL;
1822 	mp = vfs_getvfs(FHANDLE_FSID(fhp));
1823 	if (mp == NULL) {
1824 		error = ESTALE;
1825 		goto out;
1826 	}
1827 	if (mp->mnt_op->vfs_fhtovp == NULL) {
1828 		error = EOPNOTSUPP;
1829 		goto out;
1830 	}
1831 	error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), LK_EXCLUSIVE, vpp);
1832 out:
1833 	return error;
1834 }
1835 
1836 /*
1837  * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
1838  * the needed size.
1839  */
1840 
1841 int
1842 vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
1843 {
1844 	fhandle_t *fhp;
1845 	int error;
1846 
1847 	if (fhsize > FHANDLE_SIZE_MAX) {
1848 		return EINVAL;
1849 	}
1850 	if (fhsize < FHANDLE_SIZE_MIN) {
1851 		return EINVAL;
1852 	}
1853 again:
1854 	fhp = kmem_alloc(fhsize, KM_SLEEP);
1855 	error = copyin(ufhp, fhp, fhsize);
1856 	if (error == 0) {
1857 		/* XXX this check shouldn't be here */
1858 		if (FHANDLE_SIZE(fhp) == fhsize) {
1859 			*fhpp = fhp;
1860 			return 0;
1861 		} else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
1862 			/*
1863 			 * a kludge for nfsv2 padded handles.
1864 			 */
1865 			size_t sz;
1866 
1867 			sz = FHANDLE_SIZE(fhp);
1868 			kmem_free(fhp, fhsize);
1869 			fhsize = sz;
1870 			goto again;
1871 		} else {
1872 			/*
1873 			 * userland told us wrong size.
1874 			 */
1875 		    	error = EINVAL;
1876 		}
1877 	}
1878 	kmem_free(fhp, fhsize);
1879 	return error;
1880 }
1881 
1882 void
1883 vfs_copyinfh_free(fhandle_t *fhp)
1884 {
1885 
1886 	vfs__fhfree(fhp);
1887 }
1888 
1889 /*
1890  * Get file handle system call
1891  */
1892 int
1893 sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval)
1894 {
1895 	/* {
1896 		syscallarg(char *) fname;
1897 		syscallarg(fhandle_t *) fhp;
1898 		syscallarg(size_t *) fh_size;
1899 	} */
1900 	struct vnode *vp;
1901 	fhandle_t *fh;
1902 	int error;
1903 	struct pathbuf *pb;
1904 	struct nameidata nd;
1905 	size_t sz;
1906 	size_t usz;
1907 
1908 	/*
1909 	 * Must be super user
1910 	 */
1911 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1912 	    0, NULL, NULL, NULL);
1913 	if (error)
1914 		return (error);
1915 
1916 	error = pathbuf_copyin(SCARG(uap, fname), &pb);
1917 	if (error) {
1918 		return error;
1919 	}
1920 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
1921 	error = namei(&nd);
1922 	if (error) {
1923 		pathbuf_destroy(pb);
1924 		return error;
1925 	}
1926 	vp = nd.ni_vp;
1927 	pathbuf_destroy(pb);
1928 
1929 	error = vfs_composefh_alloc(vp, &fh);
1930 	vput(vp);
1931 	if (error != 0) {
1932 		return error;
1933 	}
1934 	error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
1935 	if (error != 0) {
1936 		goto out;
1937 	}
1938 	sz = FHANDLE_SIZE(fh);
1939 	error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
1940 	if (error != 0) {
1941 		goto out;
1942 	}
1943 	if (usz >= sz) {
1944 		error = copyout(fh, SCARG(uap, fhp), sz);
1945 	} else {
1946 		error = E2BIG;
1947 	}
1948 out:
1949 	vfs_composefh_free(fh);
1950 	return (error);
1951 }
1952 
1953 /*
1954  * Open a file given a file handle.
1955  *
1956  * Check permissions, allocate an open file structure,
1957  * and call the device open routine if any.
1958  */
1959 
1960 int
1961 dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
1962     register_t *retval)
1963 {
1964 	file_t *fp;
1965 	struct vnode *vp = NULL;
1966 	kauth_cred_t cred = l->l_cred;
1967 	file_t *nfp;
1968 	int indx, error;
1969 	struct vattr va;
1970 	fhandle_t *fh;
1971 	int flags;
1972 	proc_t *p;
1973 
1974 	p = curproc;
1975 
1976 	/*
1977 	 * Must be super user
1978 	 */
1979 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1980 	    0, NULL, NULL, NULL)))
1981 		return (error);
1982 
1983 	if (oflags & O_SEARCH) {
1984 		oflags &= ~(int)O_SEARCH;
1985 	}
1986 
1987 	flags = FFLAGS(oflags);
1988 	if ((flags & (FREAD | FWRITE)) == 0)
1989 		return (EINVAL);
1990 	if ((flags & O_CREAT))
1991 		return (EINVAL);
1992 	if ((error = fd_allocfile(&nfp, &indx)) != 0)
1993 		return (error);
1994 	fp = nfp;
1995 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1996 	if (error != 0) {
1997 		goto bad;
1998 	}
1999 	error = vfs_fhtovp(fh, &vp);
2000 	vfs_copyinfh_free(fh);
2001 	if (error != 0) {
2002 		goto bad;
2003 	}
2004 
2005 	/* Now do an effective vn_open */
2006 
2007 	if (vp->v_type == VSOCK) {
2008 		error = EOPNOTSUPP;
2009 		goto bad;
2010 	}
2011 	error = vn_openchk(vp, cred, flags);
2012 	if (error != 0)
2013 		goto bad;
2014 	if (flags & O_TRUNC) {
2015 		VOP_UNLOCK(vp);			/* XXX */
2016 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
2017 		vattr_null(&va);
2018 		va.va_size = 0;
2019 		error = VOP_SETATTR(vp, &va, cred);
2020 		if (error)
2021 			goto bad;
2022 	}
2023 	if ((error = VOP_OPEN(vp, flags, cred)) != 0)
2024 		goto bad;
2025 	if (flags & FWRITE) {
2026 		mutex_enter(vp->v_interlock);
2027 		vp->v_writecount++;
2028 		mutex_exit(vp->v_interlock);
2029 	}
2030 
2031 	/* done with modified vn_open, now finish what sys_open does. */
2032 	if ((error = open_setfp(l, fp, vp, indx, flags)))
2033 		return error;
2034 
2035 	VOP_UNLOCK(vp);
2036 	*retval = indx;
2037 	fd_affix(p, fp, indx);
2038 	return (0);
2039 
2040 bad:
2041 	fd_abort(p, fp, indx);
2042 	if (vp != NULL)
2043 		vput(vp);
2044 	return (error);
2045 }
2046 
2047 int
2048 sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval)
2049 {
2050 	/* {
2051 		syscallarg(const void *) fhp;
2052 		syscallarg(size_t) fh_size;
2053 		syscallarg(int) flags;
2054 	} */
2055 
2056 	return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
2057 	    SCARG(uap, flags), retval);
2058 }
2059 
2060 int
2061 do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
2062 {
2063 	int error;
2064 	fhandle_t *fh;
2065 	struct vnode *vp;
2066 
2067 	/*
2068 	 * Must be super user
2069 	 */
2070 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2071 	    0, NULL, NULL, NULL)))
2072 		return (error);
2073 
2074 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2075 	if (error != 0)
2076 		return error;
2077 
2078 	error = vfs_fhtovp(fh, &vp);
2079 	vfs_copyinfh_free(fh);
2080 	if (error != 0)
2081 		return error;
2082 
2083 	error = vn_stat(vp, sb);
2084 	vput(vp);
2085 	return error;
2086 }
2087 
2088 
2089 /* ARGSUSED */
2090 int
2091 sys___fhstat50(struct lwp *l, const struct sys___fhstat50_args *uap, register_t *retval)
2092 {
2093 	/* {
2094 		syscallarg(const void *) fhp;
2095 		syscallarg(size_t) fh_size;
2096 		syscallarg(struct stat *) sb;
2097 	} */
2098 	struct stat sb;
2099 	int error;
2100 
2101 	error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
2102 	if (error)
2103 		return error;
2104 	return copyout(&sb, SCARG(uap, sb), sizeof(sb));
2105 }
2106 
2107 int
2108 do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb,
2109     int flags)
2110 {
2111 	fhandle_t *fh;
2112 	struct mount *mp;
2113 	struct vnode *vp;
2114 	int error;
2115 
2116 	/*
2117 	 * Must be super user
2118 	 */
2119 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2120 	    0, NULL, NULL, NULL)))
2121 		return error;
2122 
2123 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2124 	if (error != 0)
2125 		return error;
2126 
2127 	error = vfs_fhtovp(fh, &vp);
2128 	vfs_copyinfh_free(fh);
2129 	if (error != 0)
2130 		return error;
2131 
2132 	mp = vp->v_mount;
2133 	error = dostatvfs(mp, sb, l, flags, 1);
2134 	vput(vp);
2135 	return error;
2136 }
2137 
2138 /* ARGSUSED */
2139 int
2140 sys___fhstatvfs190(struct lwp *l, const struct sys___fhstatvfs190_args *uap, register_t *retval)
2141 {
2142 	/* {
2143 		syscallarg(const void *) fhp;
2144 		syscallarg(size_t) fh_size;
2145 		syscallarg(struct statvfs *) buf;
2146 		syscallarg(int)	flags;
2147 	} */
2148 	struct statvfs *sb = STATVFSBUF_GET();
2149 	int error;
2150 
2151 	error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
2152 	    SCARG(uap, flags));
2153 	if (error == 0)
2154 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
2155 	STATVFSBUF_PUT(sb);
2156 	return error;
2157 }
2158 
2159 int
2160 do_posix_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
2161     dev_t dev)
2162 {
2163 
2164 	/*
2165 	 * The POSIX mknod(2) call is an alias for mkfifo(2) for S_IFIFO
2166 	 * in mode and dev=0.
2167 	 *
2168 	 * In all the other cases it's implementation defined behavior.
2169 	 */
2170 
2171 	if ((mode & S_IFIFO) && dev == 0)
2172 		return do_sys_mkfifoat(l, fdat, pathname, mode);
2173 	else
2174 		return do_sys_mknodat(l, fdat, pathname, mode, dev,
2175 		    UIO_USERSPACE);
2176 }
2177 
2178 /*
2179  * Create a special file.
2180  */
2181 /* ARGSUSED */
2182 int
2183 sys___mknod50(struct lwp *l, const struct sys___mknod50_args *uap,
2184     register_t *retval)
2185 {
2186 	/* {
2187 		syscallarg(const char *) path;
2188 		syscallarg(mode_t) mode;
2189 		syscallarg(dev_t) dev;
2190 	} */
2191 	return do_posix_mknodat(l, AT_FDCWD, SCARG(uap, path),
2192 	    SCARG(uap, mode), SCARG(uap, dev));
2193 }
2194 
2195 int
2196 sys_mknodat(struct lwp *l, const struct sys_mknodat_args *uap,
2197     register_t *retval)
2198 {
2199 	/* {
2200 		syscallarg(int) fd;
2201 		syscallarg(const char *) path;
2202 		syscallarg(mode_t) mode;
2203 		syscallarg(int) pad;
2204 		syscallarg(dev_t) dev;
2205 	} */
2206 
2207 	return do_posix_mknodat(l, SCARG(uap, fd), SCARG(uap, path),
2208 	    SCARG(uap, mode), SCARG(uap, dev));
2209 }
2210 
2211 int
2212 do_sys_mknod(struct lwp *l, const char *pathname, mode_t mode, dev_t dev,
2213     enum uio_seg seg)
2214 {
2215 	return do_sys_mknodat(l, AT_FDCWD, pathname, mode, dev, seg);
2216 }
2217 
2218 int
2219 do_sys_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
2220     dev_t dev, enum uio_seg seg)
2221 {
2222 	struct proc *p = l->l_proc;
2223 	struct vnode *vp;
2224 	struct vattr vattr;
2225 	int error, optype;
2226 	struct pathbuf *pb;
2227 	struct nameidata nd;
2228 	const char *pathstring;
2229 
2230 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
2231 	    0, NULL, NULL, NULL)) != 0)
2232 		return (error);
2233 
2234 	optype = VOP_MKNOD_DESCOFFSET;
2235 
2236 	error = pathbuf_maybe_copyin(pathname, seg, &pb);
2237 	if (error) {
2238 		return error;
2239 	}
2240 	pathstring = pathbuf_stringcopy_get(pb);
2241 	if (pathstring == NULL) {
2242 		pathbuf_destroy(pb);
2243 		return ENOMEM;
2244 	}
2245 
2246 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2247 
2248 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2249 		goto out;
2250 	vp = nd.ni_vp;
2251 
2252 	if (vp != NULL)
2253 		error = EEXIST;
2254 	else {
2255 		vattr_null(&vattr);
2256 		/* We will read cwdi->cwdi_cmask unlocked. */
2257 		vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2258 		vattr.va_rdev = dev;
2259 
2260 		switch (mode & S_IFMT) {
2261 		case S_IFMT:	/* used by badsect to flag bad sectors */
2262 			vattr.va_type = VBAD;
2263 			break;
2264 		case S_IFCHR:
2265 			vattr.va_type = VCHR;
2266 			break;
2267 		case S_IFBLK:
2268 			vattr.va_type = VBLK;
2269 			break;
2270 		case S_IFWHT:
2271 			optype = VOP_WHITEOUT_DESCOFFSET;
2272 			break;
2273 		case S_IFREG:
2274 #if NVERIEXEC > 0
2275 			error = veriexec_openchk(l, nd.ni_vp, pathstring,
2276 			    O_CREAT);
2277 #endif /* NVERIEXEC > 0 */
2278 			vattr.va_type = VREG;
2279 			vattr.va_rdev = VNOVAL;
2280 			optype = VOP_CREATE_DESCOFFSET;
2281 			break;
2282 		default:
2283 			error = EINVAL;
2284 			break;
2285 		}
2286 
2287 		if (error == 0 && optype == VOP_MKNOD_DESCOFFSET &&
2288 		    vattr.va_rdev == VNOVAL)
2289 			error = EINVAL;
2290 	}
2291 
2292 	if (!error) {
2293 		switch (optype) {
2294 		case VOP_WHITEOUT_DESCOFFSET:
2295 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
2296 			if (error)
2297 				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2298 			vput(nd.ni_dvp);
2299 			break;
2300 
2301 		case VOP_MKNOD_DESCOFFSET:
2302 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
2303 						&nd.ni_cnd, &vattr);
2304 			if (error == 0)
2305 				vrele(nd.ni_vp);
2306 			vput(nd.ni_dvp);
2307 			break;
2308 
2309 		case VOP_CREATE_DESCOFFSET:
2310 			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
2311 						&nd.ni_cnd, &vattr);
2312 			if (error == 0)
2313 				vrele(nd.ni_vp);
2314 			vput(nd.ni_dvp);
2315 			break;
2316 		}
2317 	} else {
2318 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2319 		if (nd.ni_dvp == vp)
2320 			vrele(nd.ni_dvp);
2321 		else
2322 			vput(nd.ni_dvp);
2323 		if (vp)
2324 			vrele(vp);
2325 	}
2326 out:
2327 	pathbuf_stringcopy_put(pb, pathstring);
2328 	pathbuf_destroy(pb);
2329 	return (error);
2330 }
2331 
2332 /*
2333  * Create a named pipe.
2334  */
2335 /* ARGSUSED */
2336 int
2337 sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval)
2338 {
2339 	/* {
2340 		syscallarg(const char *) path;
2341 		syscallarg(int) mode;
2342 	} */
2343 	return do_sys_mkfifoat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, mode));
2344 }
2345 
2346 int
2347 sys_mkfifoat(struct lwp *l, const struct sys_mkfifoat_args *uap,
2348     register_t *retval)
2349 {
2350 	/* {
2351 		syscallarg(int) fd;
2352 		syscallarg(const char *) path;
2353 		syscallarg(int) mode;
2354 	} */
2355 
2356 	return do_sys_mkfifoat(l, SCARG(uap, fd), SCARG(uap, path),
2357 	    SCARG(uap, mode));
2358 }
2359 
2360 static int
2361 do_sys_mkfifoat(struct lwp *l, int fdat, const char *path, mode_t mode)
2362 {
2363 	struct proc *p = l->l_proc;
2364 	struct vattr vattr;
2365 	int error;
2366 	struct pathbuf *pb;
2367 	struct nameidata nd;
2368 
2369 	error = pathbuf_copyin(path, &pb);
2370 	if (error) {
2371 		return error;
2372 	}
2373 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2374 
2375 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
2376 		pathbuf_destroy(pb);
2377 		return error;
2378 	}
2379 	if (nd.ni_vp != NULL) {
2380 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2381 		if (nd.ni_dvp == nd.ni_vp)
2382 			vrele(nd.ni_dvp);
2383 		else
2384 			vput(nd.ni_dvp);
2385 		vrele(nd.ni_vp);
2386 		pathbuf_destroy(pb);
2387 		return (EEXIST);
2388 	}
2389 	vattr_null(&vattr);
2390 	vattr.va_type = VFIFO;
2391 	/* We will read cwdi->cwdi_cmask unlocked. */
2392 	vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2393 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
2394 	if (error == 0)
2395 		vrele(nd.ni_vp);
2396 	vput(nd.ni_dvp);
2397 	pathbuf_destroy(pb);
2398 	return (error);
2399 }
2400 
2401 /*
2402  * Make a hard file link.
2403  */
2404 /* ARGSUSED */
2405 int
2406 do_sys_linkat(struct lwp *l, int fdpath, const char *path, int fdlink,
2407     const char *link, int follow, register_t *retval)
2408 {
2409 	struct vnode *vp;
2410 	struct pathbuf *linkpb;
2411 	struct nameidata nd;
2412 	namei_simple_flags_t ns_flags;
2413 	int error;
2414 
2415 	if (follow & AT_SYMLINK_FOLLOW)
2416 		ns_flags = NSM_FOLLOW_TRYEMULROOT;
2417 	else
2418 		ns_flags = NSM_NOFOLLOW_TRYEMULROOT;
2419 
2420 	error = fd_nameiat_simple_user(l, fdpath, path, ns_flags, &vp);
2421 	if (error != 0)
2422 		return (error);
2423 	error = pathbuf_copyin(link, &linkpb);
2424 	if (error) {
2425 		goto out1;
2426 	}
2427 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2428 	if ((error = fd_nameiat(l, fdlink, &nd)) != 0)
2429 		goto out2;
2430 	if (nd.ni_vp) {
2431 		error = EEXIST;
2432 		goto abortop;
2433 	}
2434 	/* Prevent hard links on directories. */
2435 	if (vp->v_type == VDIR) {
2436 		error = EPERM;
2437 		goto abortop;
2438 	}
2439 	/* Prevent cross-mount operation. */
2440 	if (nd.ni_dvp->v_mount != vp->v_mount) {
2441 		error = EXDEV;
2442 		goto abortop;
2443 	}
2444 	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
2445 	VOP_UNLOCK(nd.ni_dvp);
2446 	vrele(nd.ni_dvp);
2447 out2:
2448 	pathbuf_destroy(linkpb);
2449 out1:
2450 	vrele(vp);
2451 	return (error);
2452 abortop:
2453 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2454 	if (nd.ni_dvp == nd.ni_vp)
2455 		vrele(nd.ni_dvp);
2456 	else
2457 		vput(nd.ni_dvp);
2458 	if (nd.ni_vp != NULL)
2459 		vrele(nd.ni_vp);
2460 	goto out2;
2461 }
2462 
2463 int
2464 sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
2465 {
2466 	/* {
2467 		syscallarg(const char *) path;
2468 		syscallarg(const char *) link;
2469 	} */
2470 	const char *path = SCARG(uap, path);
2471 	const char *link = SCARG(uap, link);
2472 
2473 	return do_sys_linkat(l, AT_FDCWD, path, AT_FDCWD, link,
2474 	    AT_SYMLINK_FOLLOW, retval);
2475 }
2476 
2477 int
2478 sys_linkat(struct lwp *l, const struct sys_linkat_args *uap,
2479     register_t *retval)
2480 {
2481 	/* {
2482 		syscallarg(int) fd1;
2483 		syscallarg(const char *) name1;
2484 		syscallarg(int) fd2;
2485 		syscallarg(const char *) name2;
2486 		syscallarg(int) flags;
2487 	} */
2488 	int fd1 = SCARG(uap, fd1);
2489 	const char *name1 = SCARG(uap, name1);
2490 	int fd2 = SCARG(uap, fd2);
2491 	const char *name2 = SCARG(uap, name2);
2492 	int follow;
2493 
2494 	follow = SCARG(uap, flags) & AT_SYMLINK_FOLLOW;
2495 
2496 	return do_sys_linkat(l, fd1, name1, fd2, name2, follow, retval);
2497 }
2498 
2499 
2500 int
2501 do_sys_symlink(const char *patharg, const char *link, enum uio_seg seg)
2502 {
2503 	return do_sys_symlinkat(NULL, patharg, AT_FDCWD, link, seg);
2504 }
2505 
2506 static int
2507 do_sys_symlinkat(struct lwp *l, const char *patharg, int fdat,
2508     const char *link, enum uio_seg seg)
2509 {
2510 	struct proc *p = curproc;
2511 	struct vattr vattr;
2512 	char *path;
2513 	int error;
2514 	size_t len;
2515 	struct pathbuf *linkpb;
2516 	struct nameidata nd;
2517 
2518 	KASSERT(l != NULL || fdat == AT_FDCWD);
2519 
2520 	path = PNBUF_GET();
2521 	if (seg == UIO_USERSPACE) {
2522 		if ((error = copyinstr(patharg, path, MAXPATHLEN, &len)) != 0)
2523 			goto out1;
2524 		if ((error = pathbuf_copyin(link, &linkpb)) != 0)
2525 			goto out1;
2526 	} else {
2527 		len = strlen(patharg) + 1;
2528 		KASSERT(len <= MAXPATHLEN);
2529 		memcpy(path, patharg, len);
2530 		linkpb = pathbuf_create(link);
2531 		if (linkpb == NULL) {
2532 			error = ENOMEM;
2533 			goto out1;
2534 		}
2535 	}
2536 	ktrkuser("symlink-target", path, len - 1);
2537 
2538 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2539 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2540 		goto out2;
2541 	if (nd.ni_vp) {
2542 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2543 		if (nd.ni_dvp == nd.ni_vp)
2544 			vrele(nd.ni_dvp);
2545 		else
2546 			vput(nd.ni_dvp);
2547 		vrele(nd.ni_vp);
2548 		error = EEXIST;
2549 		goto out2;
2550 	}
2551 	vattr_null(&vattr);
2552 	vattr.va_type = VLNK;
2553 	/* We will read cwdi->cwdi_cmask unlocked. */
2554 	vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
2555 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
2556 	if (error == 0)
2557 		vrele(nd.ni_vp);
2558 	vput(nd.ni_dvp);
2559 out2:
2560 	pathbuf_destroy(linkpb);
2561 out1:
2562 	PNBUF_PUT(path);
2563 	return (error);
2564 }
2565 
2566 /*
2567  * Make a symbolic link.
2568  */
2569 /* ARGSUSED */
2570 int
2571 sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
2572 {
2573 	/* {
2574 		syscallarg(const char *) path;
2575 		syscallarg(const char *) link;
2576 	} */
2577 
2578 	return do_sys_symlinkat(l, SCARG(uap, path), AT_FDCWD, SCARG(uap, link),
2579 	    UIO_USERSPACE);
2580 }
2581 
2582 int
2583 sys_symlinkat(struct lwp *l, const struct sys_symlinkat_args *uap,
2584     register_t *retval)
2585 {
2586 	/* {
2587 		syscallarg(const char *) path1;
2588 		syscallarg(int) fd;
2589 		syscallarg(const char *) path2;
2590 	} */
2591 
2592 	return do_sys_symlinkat(l, SCARG(uap, path1), SCARG(uap, fd),
2593 	    SCARG(uap, path2), UIO_USERSPACE);
2594 }
2595 
2596 /*
2597  * Delete a whiteout from the filesystem.
2598  */
2599 /* ARGSUSED */
2600 int
2601 sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval)
2602 {
2603 	/* {
2604 		syscallarg(const char *) path;
2605 	} */
2606 	int error;
2607 	struct pathbuf *pb;
2608 	struct nameidata nd;
2609 
2610 	error = pathbuf_copyin(SCARG(uap, path), &pb);
2611 	if (error) {
2612 		return error;
2613 	}
2614 
2615 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT, pb);
2616 	error = namei(&nd);
2617 	if (error) {
2618 		pathbuf_destroy(pb);
2619 		return (error);
2620 	}
2621 
2622 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2623 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2624 		if (nd.ni_dvp == nd.ni_vp)
2625 			vrele(nd.ni_dvp);
2626 		else
2627 			vput(nd.ni_dvp);
2628 		if (nd.ni_vp)
2629 			vrele(nd.ni_vp);
2630 		pathbuf_destroy(pb);
2631 		return (EEXIST);
2632 	}
2633 	if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
2634 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2635 	vput(nd.ni_dvp);
2636 	pathbuf_destroy(pb);
2637 	return (error);
2638 }
2639 
2640 /*
2641  * Delete a name from the filesystem.
2642  */
2643 /* ARGSUSED */
2644 int
2645 sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval)
2646 {
2647 	/* {
2648 		syscallarg(const char *) path;
2649 	} */
2650 
2651 	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path), 0, UIO_USERSPACE);
2652 }
2653 
2654 int
2655 sys_unlinkat(struct lwp *l, const struct sys_unlinkat_args *uap,
2656     register_t *retval)
2657 {
2658 	/* {
2659 		syscallarg(int) fd;
2660 		syscallarg(const char *) path;
2661 		syscallarg(int) flag;
2662 	} */
2663 
2664 	return do_sys_unlinkat(l, SCARG(uap, fd), SCARG(uap, path),
2665 	    SCARG(uap, flag), UIO_USERSPACE);
2666 }
2667 
2668 int
2669 do_sys_unlink(const char *arg, enum uio_seg seg)
2670 {
2671 	return do_sys_unlinkat(NULL, AT_FDCWD, arg, 0, seg);
2672 }
2673 
2674 static int
2675 do_sys_unlinkat(struct lwp *l, int fdat, const char *arg, int flags,
2676     enum uio_seg seg)
2677 {
2678 	struct vnode *vp;
2679 	int error;
2680 	struct pathbuf *pb;
2681 	struct nameidata nd;
2682 	const char *pathstring;
2683 
2684 	KASSERT(l != NULL || fdat == AT_FDCWD);
2685 
2686 	error = pathbuf_maybe_copyin(arg, seg, &pb);
2687 	if (error) {
2688 		return error;
2689 	}
2690 	pathstring = pathbuf_stringcopy_get(pb);
2691 	if (pathstring == NULL) {
2692 		pathbuf_destroy(pb);
2693 		return ENOMEM;
2694 	}
2695 
2696 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, pb);
2697 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2698 		goto out;
2699 	vp = nd.ni_vp;
2700 
2701 	/*
2702 	 * The root of a mounted filesystem cannot be deleted.
2703 	 */
2704 	if ((vp->v_vflag & VV_ROOT) != 0) {
2705 		error = EBUSY;
2706 		goto abort;
2707 	}
2708 
2709 	if ((vp->v_type == VDIR) && (vp->v_mountedhere != NULL)) {
2710 		error = EBUSY;
2711 		goto abort;
2712 	}
2713 
2714 	/*
2715 	 * No rmdir "." please.
2716 	 */
2717 	if (nd.ni_dvp == vp) {
2718 		error = EINVAL;
2719 		goto abort;
2720 	}
2721 
2722 	/*
2723 	 * AT_REMOVEDIR is required to remove a directory
2724 	 */
2725 	if (vp->v_type == VDIR) {
2726 		if (!(flags & AT_REMOVEDIR)) {
2727 			error = EPERM;
2728 			goto abort;
2729 		} else {
2730 			error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2731 			vput(nd.ni_dvp);
2732 			goto out;
2733 		}
2734 	}
2735 
2736 	/*
2737 	 * Starting here we only deal with non directories.
2738 	 */
2739 	if (flags & AT_REMOVEDIR) {
2740 		error = ENOTDIR;
2741 		goto abort;
2742 	}
2743 
2744 #if NVERIEXEC > 0
2745 	/* Handle remove requests for veriexec entries. */
2746 	if ((error = veriexec_removechk(curlwp, nd.ni_vp, pathstring)) != 0) {
2747 		goto abort;
2748 	}
2749 #endif /* NVERIEXEC > 0 */
2750 
2751 #ifdef FILEASSOC
2752 	(void)fileassoc_file_delete(vp);
2753 #endif /* FILEASSOC */
2754 	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2755 	vput(nd.ni_dvp);
2756 	goto out;
2757 
2758 abort:
2759 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2760 	if (nd.ni_dvp == vp)
2761 		vrele(nd.ni_dvp);
2762 	else
2763 		vput(nd.ni_dvp);
2764 	vput(vp);
2765 
2766 out:
2767 	pathbuf_stringcopy_put(pb, pathstring);
2768 	pathbuf_destroy(pb);
2769 	return (error);
2770 }
2771 
2772 /*
2773  * Reposition read/write file offset.
2774  */
2775 int
2776 sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
2777 {
2778 	/* {
2779 		syscallarg(int) fd;
2780 		syscallarg(int) pad;
2781 		syscallarg(off_t) offset;
2782 		syscallarg(int) whence;
2783 	} */
2784 	kauth_cred_t cred = l->l_cred;
2785 	file_t *fp;
2786 	struct vnode *vp;
2787 	struct vattr vattr;
2788 	off_t newoff;
2789 	int error, fd;
2790 
2791 	fd = SCARG(uap, fd);
2792 
2793 	if ((fp = fd_getfile(fd)) == NULL)
2794 		return (EBADF);
2795 
2796 	vp = fp->f_vnode;
2797 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2798 		error = ESPIPE;
2799 		goto out;
2800 	}
2801 
2802 	vn_lock(vp, LK_SHARED | LK_RETRY);
2803 
2804 	switch (SCARG(uap, whence)) {
2805 	case SEEK_CUR:
2806 		newoff = fp->f_offset + SCARG(uap, offset);
2807 		break;
2808 	case SEEK_END:
2809 		error = VOP_GETATTR(vp, &vattr, cred);
2810 		if (error) {
2811 			VOP_UNLOCK(vp);
2812 			goto out;
2813 		}
2814 		newoff = SCARG(uap, offset) + vattr.va_size;
2815 		break;
2816 	case SEEK_SET:
2817 		newoff = SCARG(uap, offset);
2818 		break;
2819 	default:
2820 		error = EINVAL;
2821 		VOP_UNLOCK(vp);
2822 		goto out;
2823 	}
2824 	VOP_UNLOCK(vp);
2825 	if ((error = VOP_SEEK(vp, fp->f_offset, newoff, cred)) == 0) {
2826 		*(off_t *)retval = fp->f_offset = newoff;
2827 	}
2828  out:
2829  	fd_putfile(fd);
2830 	return (error);
2831 }
2832 
2833 /*
2834  * Positional read system call.
2835  */
2836 int
2837 sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
2838 {
2839 	/* {
2840 		syscallarg(int) fd;
2841 		syscallarg(void *) buf;
2842 		syscallarg(size_t) nbyte;
2843 		syscallarg(off_t) offset;
2844 	} */
2845 	file_t *fp;
2846 	struct vnode *vp;
2847 	off_t offset;
2848 	int error, fd = SCARG(uap, fd);
2849 
2850 	if ((fp = fd_getfile(fd)) == NULL)
2851 		return (EBADF);
2852 
2853 	if ((fp->f_flag & FREAD) == 0) {
2854 		fd_putfile(fd);
2855 		return (EBADF);
2856 	}
2857 
2858 	vp = fp->f_vnode;
2859 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2860 		error = ESPIPE;
2861 		goto out;
2862 	}
2863 
2864 	offset = SCARG(uap, offset);
2865 
2866 	/*
2867 	 * XXX This works because no file systems actually
2868 	 * XXX take any action on the seek operation.
2869 	 */
2870 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2871 		goto out;
2872 
2873 	/* dofileread() will unuse the descriptor for us */
2874 	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2875 	    &offset, 0, retval));
2876 
2877  out:
2878 	fd_putfile(fd);
2879 	return (error);
2880 }
2881 
2882 /*
2883  * Positional scatter read system call.
2884  */
2885 int
2886 sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval)
2887 {
2888 	/* {
2889 		syscallarg(int) fd;
2890 		syscallarg(const struct iovec *) iovp;
2891 		syscallarg(int) iovcnt;
2892 		syscallarg(off_t) offset;
2893 	} */
2894 	off_t offset = SCARG(uap, offset);
2895 
2896 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
2897 	    SCARG(uap, iovcnt), &offset, 0, retval);
2898 }
2899 
2900 /*
2901  * Positional write system call.
2902  */
2903 int
2904 sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval)
2905 {
2906 	/* {
2907 		syscallarg(int) fd;
2908 		syscallarg(const void *) buf;
2909 		syscallarg(size_t) nbyte;
2910 		syscallarg(off_t) offset;
2911 	} */
2912 	file_t *fp;
2913 	struct vnode *vp;
2914 	off_t offset;
2915 	int error, fd = SCARG(uap, fd);
2916 
2917 	if ((fp = fd_getfile(fd)) == NULL)
2918 		return (EBADF);
2919 
2920 	if ((fp->f_flag & FWRITE) == 0) {
2921 		fd_putfile(fd);
2922 		return (EBADF);
2923 	}
2924 
2925 	vp = fp->f_vnode;
2926 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2927 		error = ESPIPE;
2928 		goto out;
2929 	}
2930 
2931 	offset = SCARG(uap, offset);
2932 
2933 	/*
2934 	 * XXX This works because no file systems actually
2935 	 * XXX take any action on the seek operation.
2936 	 */
2937 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2938 		goto out;
2939 
2940 	/* dofilewrite() will unuse the descriptor for us */
2941 	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2942 	    &offset, 0, retval));
2943 
2944  out:
2945 	fd_putfile(fd);
2946 	return (error);
2947 }
2948 
2949 /*
2950  * Positional gather write system call.
2951  */
2952 int
2953 sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval)
2954 {
2955 	/* {
2956 		syscallarg(int) fd;
2957 		syscallarg(const struct iovec *) iovp;
2958 		syscallarg(int) iovcnt;
2959 		syscallarg(off_t) offset;
2960 	} */
2961 	off_t offset = SCARG(uap, offset);
2962 
2963 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
2964 	    SCARG(uap, iovcnt), &offset, 0, retval);
2965 }
2966 
2967 /*
2968  * Check access permissions.
2969  */
2970 int
2971 sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval)
2972 {
2973 	/* {
2974 		syscallarg(const char *) path;
2975 		syscallarg(int) flags;
2976 	} */
2977 
2978 	return do_sys_accessat(l, AT_FDCWD, SCARG(uap, path),
2979 	     SCARG(uap, flags), 0);
2980 }
2981 
2982 int
2983 do_sys_accessat(struct lwp *l, int fdat, const char *path,
2984     int mode, int flags)
2985 {
2986 	kauth_cred_t cred;
2987 	struct vnode *vp;
2988 	int error, nd_flag, vmode;
2989 	struct pathbuf *pb;
2990 	struct nameidata nd;
2991 
2992 	CTASSERT(F_OK == 0);
2993 	if ((mode & ~(R_OK | W_OK | X_OK)) != 0) {
2994 		/* nonsense mode */
2995 		return EINVAL;
2996 	}
2997 
2998 	nd_flag = FOLLOW | LOCKLEAF | TRYEMULROOT;
2999 	if (flags & AT_SYMLINK_NOFOLLOW)
3000 		nd_flag &= ~FOLLOW;
3001 
3002 	error = pathbuf_copyin(path, &pb);
3003 	if (error)
3004 		return error;
3005 
3006 	NDINIT(&nd, LOOKUP, nd_flag, pb);
3007 
3008 	/* Override default credentials */
3009 	cred = kauth_cred_dup(l->l_cred);
3010 	if (!(flags & AT_EACCESS)) {
3011 		kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
3012 		kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
3013 	}
3014 	nd.ni_cnd.cn_cred = cred;
3015 
3016 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
3017 		pathbuf_destroy(pb);
3018 		goto out;
3019 	}
3020 	vp = nd.ni_vp;
3021 	pathbuf_destroy(pb);
3022 
3023 	/* Flags == 0 means only check for existence. */
3024 	if (mode) {
3025 		vmode = 0;
3026 		if (mode & R_OK)
3027 			vmode |= VREAD;
3028 		if (mode & W_OK)
3029 			vmode |= VWRITE;
3030 		if (mode & X_OK)
3031 			vmode |= VEXEC;
3032 
3033 		error = VOP_ACCESS(vp, vmode, cred);
3034 		if (!error && (vmode & VWRITE))
3035 			error = vn_writechk(vp);
3036 	}
3037 	vput(vp);
3038 out:
3039 	kauth_cred_free(cred);
3040 	return (error);
3041 }
3042 
3043 int
3044 sys_faccessat(struct lwp *l, const struct sys_faccessat_args *uap,
3045     register_t *retval)
3046 {
3047 	/* {
3048 		syscallarg(int) fd;
3049 		syscallarg(const char *) path;
3050 		syscallarg(int) amode;
3051 		syscallarg(int) flag;
3052 	} */
3053 
3054 	return do_sys_accessat(l, SCARG(uap, fd), SCARG(uap, path),
3055 	     SCARG(uap, amode), SCARG(uap, flag));
3056 }
3057 
3058 /*
3059  * Common code for all sys_stat functions, including compat versions.
3060  */
3061 int
3062 do_sys_stat(const char *userpath, unsigned int nd_flag,
3063     struct stat *sb)
3064 {
3065 	return do_sys_statat(NULL, AT_FDCWD, userpath, nd_flag, sb);
3066 }
3067 
3068 int
3069 do_sys_statat(struct lwp *l, int fdat, const char *userpath,
3070     unsigned int nd_flag, struct stat *sb)
3071 {
3072 	int error;
3073 	struct pathbuf *pb;
3074 	struct nameidata nd;
3075 
3076 	KASSERT(l != NULL || fdat == AT_FDCWD);
3077 
3078 	error = pathbuf_copyin(userpath, &pb);
3079 	if (error) {
3080 		return error;
3081 	}
3082 
3083 	NDINIT(&nd, LOOKUP, nd_flag | LOCKLEAF | TRYEMULROOT, pb);
3084 
3085 	error = fd_nameiat(l, fdat, &nd);
3086 	if (error != 0) {
3087 		pathbuf_destroy(pb);
3088 		return error;
3089 	}
3090 	error = vn_stat(nd.ni_vp, sb);
3091 	vput(nd.ni_vp);
3092 	pathbuf_destroy(pb);
3093 	return error;
3094 }
3095 
3096 /*
3097  * Get file status; this version follows links.
3098  */
3099 /* ARGSUSED */
3100 int
3101 sys___stat50(struct lwp *l, const struct sys___stat50_args *uap, register_t *retval)
3102 {
3103 	/* {
3104 		syscallarg(const char *) path;
3105 		syscallarg(struct stat *) ub;
3106 	} */
3107 	struct stat sb;
3108 	int error;
3109 
3110 	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), FOLLOW, &sb);
3111 	if (error)
3112 		return error;
3113 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3114 }
3115 
3116 /*
3117  * Get file status; this version does not follow links.
3118  */
3119 /* ARGSUSED */
3120 int
3121 sys___lstat50(struct lwp *l, const struct sys___lstat50_args *uap, register_t *retval)
3122 {
3123 	/* {
3124 		syscallarg(const char *) path;
3125 		syscallarg(struct stat *) ub;
3126 	} */
3127 	struct stat sb;
3128 	int error;
3129 
3130 	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), NOFOLLOW, &sb);
3131 	if (error)
3132 		return error;
3133 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3134 }
3135 
3136 int
3137 sys_fstatat(struct lwp *l, const struct sys_fstatat_args *uap,
3138     register_t *retval)
3139 {
3140 	/* {
3141 		syscallarg(int) fd;
3142 		syscallarg(const char *) path;
3143 		syscallarg(struct stat *) buf;
3144 		syscallarg(int) flag;
3145 	} */
3146 	unsigned int nd_flag;
3147 	struct stat sb;
3148 	int error;
3149 
3150 	if (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW)
3151 		nd_flag = NOFOLLOW;
3152 	else
3153 		nd_flag = FOLLOW;
3154 
3155 	error = do_sys_statat(l, SCARG(uap, fd), SCARG(uap, path), nd_flag,
3156 	    &sb);
3157 	if (error)
3158 		return error;
3159 	return copyout(&sb, SCARG(uap, buf), sizeof(sb));
3160 }
3161 
3162 /*
3163  * Get configurable pathname variables.
3164  */
3165 /* ARGSUSED */
3166 int
3167 sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap, register_t *retval)
3168 {
3169 	/* {
3170 		syscallarg(const char *) path;
3171 		syscallarg(int) name;
3172 	} */
3173 	int error;
3174 	struct pathbuf *pb;
3175 	struct nameidata nd;
3176 
3177 	error = pathbuf_copyin(SCARG(uap, path), &pb);
3178 	if (error) {
3179 		return error;
3180 	}
3181 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
3182 	if ((error = namei(&nd)) != 0) {
3183 		pathbuf_destroy(pb);
3184 		return (error);
3185 	}
3186 	error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), retval);
3187 	vput(nd.ni_vp);
3188 	pathbuf_destroy(pb);
3189 	return (error);
3190 }
3191 
3192 /*
3193  * Return target name of a symbolic link.
3194  */
3195 /* ARGSUSED */
3196 int
3197 sys_readlink(struct lwp *l, const struct sys_readlink_args *uap,
3198     register_t *retval)
3199 {
3200 	/* {
3201 		syscallarg(const char *) path;
3202 		syscallarg(char *) buf;
3203 		syscallarg(size_t) count;
3204 	} */
3205 	return do_sys_readlinkat(l, AT_FDCWD, SCARG(uap, path),
3206 	    SCARG(uap, buf), SCARG(uap, count), retval);
3207 }
3208 
3209 static int
3210 do_sys_readlinkat(struct lwp *l, int fdat, const char *path, char *buf,
3211     size_t count, register_t *retval)
3212 {
3213 	struct vnode *vp;
3214 	struct iovec aiov;
3215 	struct uio auio;
3216 	int error;
3217 	struct pathbuf *pb;
3218 	struct nameidata nd;
3219 
3220 	error = pathbuf_copyin(path, &pb);
3221 	if (error) {
3222 		return error;
3223 	}
3224 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb);
3225 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
3226 		pathbuf_destroy(pb);
3227 		return error;
3228 	}
3229 	vp = nd.ni_vp;
3230 	pathbuf_destroy(pb);
3231 	if (vp->v_type != VLNK)
3232 		error = EINVAL;
3233 	else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
3234 	    (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
3235 		aiov.iov_base = buf;
3236 		aiov.iov_len = count;
3237 		auio.uio_iov = &aiov;
3238 		auio.uio_iovcnt = 1;
3239 		auio.uio_offset = 0;
3240 		auio.uio_rw = UIO_READ;
3241 		KASSERT(l == curlwp);
3242 		auio.uio_vmspace = l->l_proc->p_vmspace;
3243 		auio.uio_resid = count;
3244 		if ((error = VOP_READLINK(vp, &auio, l->l_cred)) == 0)
3245 			*retval = count - auio.uio_resid;
3246 	}
3247 	vput(vp);
3248 	return (error);
3249 }
3250 
3251 int
3252 sys_readlinkat(struct lwp *l, const struct sys_readlinkat_args *uap,
3253     register_t *retval)
3254 {
3255 	/* {
3256 		syscallarg(int) fd;
3257 		syscallarg(const char *) path;
3258 		syscallarg(char *) buf;
3259 		syscallarg(size_t) bufsize;
3260 	} */
3261 
3262 	return do_sys_readlinkat(l, SCARG(uap, fd), SCARG(uap, path),
3263 	    SCARG(uap, buf), SCARG(uap, bufsize), retval);
3264 }
3265 
3266 /*
3267  * Change flags of a file given a path name.
3268  */
3269 /* ARGSUSED */
3270 int
3271 sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval)
3272 {
3273 	/* {
3274 		syscallarg(const char *) path;
3275 		syscallarg(u_long) flags;
3276 	} */
3277 	struct vnode *vp;
3278 	int error;
3279 
3280 	error = namei_simple_user(SCARG(uap, path),
3281 				NSM_FOLLOW_TRYEMULROOT, &vp);
3282 	if (error != 0)
3283 		return (error);
3284 	error = change_flags(vp, SCARG(uap, flags), l);
3285 	vput(vp);
3286 	return (error);
3287 }
3288 
3289 /*
3290  * Change flags of a file given a file descriptor.
3291  */
3292 /* ARGSUSED */
3293 int
3294 sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval)
3295 {
3296 	/* {
3297 		syscallarg(int) fd;
3298 		syscallarg(u_long) flags;
3299 	} */
3300 	struct vnode *vp;
3301 	file_t *fp;
3302 	int error;
3303 
3304 	/* fd_getvnode() will use the descriptor for us */
3305 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3306 		return (error);
3307 	vp = fp->f_vnode;
3308 	error = change_flags(vp, SCARG(uap, flags), l);
3309 	VOP_UNLOCK(vp);
3310 	fd_putfile(SCARG(uap, fd));
3311 	return (error);
3312 }
3313 
3314 /*
3315  * Change flags of a file given a path name; this version does
3316  * not follow links.
3317  */
3318 int
3319 sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval)
3320 {
3321 	/* {
3322 		syscallarg(const char *) path;
3323 		syscallarg(u_long) flags;
3324 	} */
3325 	struct vnode *vp;
3326 	int error;
3327 
3328 	error = namei_simple_user(SCARG(uap, path),
3329 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3330 	if (error != 0)
3331 		return (error);
3332 	error = change_flags(vp, SCARG(uap, flags), l);
3333 	vput(vp);
3334 	return (error);
3335 }
3336 
3337 /*
3338  * Common routine to change flags of a file.
3339  */
3340 int
3341 change_flags(struct vnode *vp, u_long flags, struct lwp *l)
3342 {
3343 	struct vattr vattr;
3344 	int error;
3345 
3346 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3347 
3348 	vattr_null(&vattr);
3349 	vattr.va_flags = flags;
3350 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3351 
3352 	return (error);
3353 }
3354 
3355 /*
3356  * Change mode of a file given path name; this version follows links.
3357  */
3358 /* ARGSUSED */
3359 int
3360 sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
3361 {
3362 	/* {
3363 		syscallarg(const char *) path;
3364 		syscallarg(int) mode;
3365 	} */
3366 	return do_sys_chmodat(l, AT_FDCWD, SCARG(uap, path),
3367 			      SCARG(uap, mode), 0);
3368 }
3369 
3370 int
3371 do_sys_chmodat(struct lwp *l, int fdat, const char *path, int mode, int flags)
3372 {
3373 	int error;
3374 	struct vnode *vp;
3375 	namei_simple_flags_t ns_flag;
3376 
3377 	if (flags & AT_SYMLINK_NOFOLLOW)
3378 		ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3379 	else
3380 		ns_flag = NSM_FOLLOW_TRYEMULROOT;
3381 
3382 	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3383 	if (error != 0)
3384 		return error;
3385 
3386 	error = change_mode(vp, mode, l);
3387 
3388 	vrele(vp);
3389 
3390 	return (error);
3391 }
3392 
3393 /*
3394  * Change mode of a file given a file descriptor.
3395  */
3396 /* ARGSUSED */
3397 int
3398 sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval)
3399 {
3400 	/* {
3401 		syscallarg(int) fd;
3402 		syscallarg(int) mode;
3403 	} */
3404 	file_t *fp;
3405 	int error;
3406 
3407 	/* fd_getvnode() will use the descriptor for us */
3408 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3409 		return (error);
3410 	error = change_mode(fp->f_vnode, SCARG(uap, mode), l);
3411 	fd_putfile(SCARG(uap, fd));
3412 	return (error);
3413 }
3414 
3415 int
3416 sys_fchmodat(struct lwp *l, const struct sys_fchmodat_args *uap,
3417     register_t *retval)
3418 {
3419 	/* {
3420 		syscallarg(int) fd;
3421 		syscallarg(const char *) path;
3422 		syscallarg(int) mode;
3423 		syscallarg(int) flag;
3424 	} */
3425 
3426 	return do_sys_chmodat(l, SCARG(uap, fd), SCARG(uap, path),
3427 			      SCARG(uap, mode), SCARG(uap, flag));
3428 }
3429 
3430 /*
3431  * Change mode of a file given path name; this version does not follow links.
3432  */
3433 /* ARGSUSED */
3434 int
3435 sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval)
3436 {
3437 	/* {
3438 		syscallarg(const char *) path;
3439 		syscallarg(int) mode;
3440 	} */
3441 	int error;
3442 	struct vnode *vp;
3443 
3444 	error = namei_simple_user(SCARG(uap, path),
3445 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3446 	if (error != 0)
3447 		return (error);
3448 
3449 	error = change_mode(vp, SCARG(uap, mode), l);
3450 
3451 	vrele(vp);
3452 	return (error);
3453 }
3454 
3455 /*
3456  * Common routine to set mode given a vnode.
3457  */
3458 static int
3459 change_mode(struct vnode *vp, int mode, struct lwp *l)
3460 {
3461 	struct vattr vattr;
3462 	int error;
3463 
3464 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3465 	vattr_null(&vattr);
3466 	vattr.va_mode = mode & ALLPERMS;
3467 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3468 	VOP_UNLOCK(vp);
3469 	return (error);
3470 }
3471 
3472 /*
3473  * Set ownership given a path name; this version follows links.
3474  */
3475 /* ARGSUSED */
3476 int
3477 sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
3478 {
3479 	/* {
3480 		syscallarg(const char *) path;
3481 		syscallarg(uid_t) uid;
3482 		syscallarg(gid_t) gid;
3483 	} */
3484 	return do_sys_chownat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap,uid),
3485 			      SCARG(uap, gid), 0);
3486 }
3487 
3488 int
3489 do_sys_chownat(struct lwp *l, int fdat, const char *path, uid_t uid,
3490    gid_t gid, int flags)
3491 {
3492 	int error;
3493 	struct vnode *vp;
3494 	namei_simple_flags_t ns_flag;
3495 
3496 	if (flags & AT_SYMLINK_NOFOLLOW)
3497 		ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3498 	else
3499 		ns_flag = NSM_FOLLOW_TRYEMULROOT;
3500 
3501 	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3502 	if (error != 0)
3503 		return error;
3504 
3505 	error = change_owner(vp, uid, gid, l, 0);
3506 
3507 	vrele(vp);
3508 
3509 	return (error);
3510 }
3511 
3512 /*
3513  * Set ownership given a path name; this version follows links.
3514  * Provides POSIX semantics.
3515  */
3516 /* ARGSUSED */
3517 int
3518 sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval)
3519 {
3520 	/* {
3521 		syscallarg(const char *) path;
3522 		syscallarg(uid_t) uid;
3523 		syscallarg(gid_t) gid;
3524 	} */
3525 	int error;
3526 	struct vnode *vp;
3527 
3528 	error = namei_simple_user(SCARG(uap, path),
3529 				NSM_FOLLOW_TRYEMULROOT, &vp);
3530 	if (error != 0)
3531 		return (error);
3532 
3533 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3534 
3535 	vrele(vp);
3536 	return (error);
3537 }
3538 
3539 /*
3540  * Set ownership given a file descriptor.
3541  */
3542 /* ARGSUSED */
3543 int
3544 sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval)
3545 {
3546 	/* {
3547 		syscallarg(int) fd;
3548 		syscallarg(uid_t) uid;
3549 		syscallarg(gid_t) gid;
3550 	} */
3551 	int error;
3552 	file_t *fp;
3553 
3554 	/* fd_getvnode() will use the descriptor for us */
3555 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3556 		return (error);
3557 	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3558 	    l, 0);
3559 	fd_putfile(SCARG(uap, fd));
3560 	return (error);
3561 }
3562 
3563 int
3564 sys_fchownat(struct lwp *l, const struct sys_fchownat_args *uap,
3565     register_t *retval)
3566 {
3567 	/* {
3568 		syscallarg(int) fd;
3569 		syscallarg(const char *) path;
3570 		syscallarg(uid_t) owner;
3571 		syscallarg(gid_t) group;
3572 		syscallarg(int) flag;
3573 	} */
3574 
3575 	return do_sys_chownat(l, SCARG(uap, fd), SCARG(uap, path),
3576 			      SCARG(uap, owner), SCARG(uap, group),
3577 			      SCARG(uap, flag));
3578 }
3579 
3580 /*
3581  * Set ownership given a file descriptor, providing POSIX/XPG semantics.
3582  */
3583 /* ARGSUSED */
3584 int
3585 sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval)
3586 {
3587 	/* {
3588 		syscallarg(int) fd;
3589 		syscallarg(uid_t) uid;
3590 		syscallarg(gid_t) gid;
3591 	} */
3592 	int error;
3593 	file_t *fp;
3594 
3595 	/* fd_getvnode() will use the descriptor for us */
3596 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3597 		return (error);
3598 	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3599 	    l, 1);
3600 	fd_putfile(SCARG(uap, fd));
3601 	return (error);
3602 }
3603 
3604 /*
3605  * Set ownership given a path name; this version does not follow links.
3606  */
3607 /* ARGSUSED */
3608 int
3609 sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval)
3610 {
3611 	/* {
3612 		syscallarg(const char *) path;
3613 		syscallarg(uid_t) uid;
3614 		syscallarg(gid_t) gid;
3615 	} */
3616 	int error;
3617 	struct vnode *vp;
3618 
3619 	error = namei_simple_user(SCARG(uap, path),
3620 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3621 	if (error != 0)
3622 		return (error);
3623 
3624 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
3625 
3626 	vrele(vp);
3627 	return (error);
3628 }
3629 
3630 /*
3631  * Set ownership given a path name; this version does not follow links.
3632  * Provides POSIX/XPG semantics.
3633  */
3634 /* ARGSUSED */
3635 int
3636 sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval)
3637 {
3638 	/* {
3639 		syscallarg(const char *) path;
3640 		syscallarg(uid_t) uid;
3641 		syscallarg(gid_t) gid;
3642 	} */
3643 	int error;
3644 	struct vnode *vp;
3645 
3646 	error = namei_simple_user(SCARG(uap, path),
3647 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3648 	if (error != 0)
3649 		return (error);
3650 
3651 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3652 
3653 	vrele(vp);
3654 	return (error);
3655 }
3656 
3657 /*
3658  * Common routine to set ownership given a vnode.
3659  */
3660 static int
3661 change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
3662     int posix_semantics)
3663 {
3664 	struct vattr vattr;
3665 	mode_t newmode;
3666 	int error;
3667 
3668 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3669 	if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
3670 		goto out;
3671 
3672 #define CHANGED(x) ((int)(x) != -1)
3673 	newmode = vattr.va_mode;
3674 	if (posix_semantics) {
3675 		/*
3676 		 * POSIX/XPG semantics: if the caller is not the super-user,
3677 		 * clear set-user-id and set-group-id bits.  Both POSIX and
3678 		 * the XPG consider the behaviour for calls by the super-user
3679 		 * implementation-defined; we leave the set-user-id and set-
3680 		 * group-id settings intact in that case.
3681 		 */
3682 		if (vattr.va_mode & S_ISUID) {
3683 			if (kauth_authorize_vnode(l->l_cred,
3684 			    KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0)
3685 				newmode &= ~S_ISUID;
3686 		}
3687 		if (vattr.va_mode & S_ISGID) {
3688 			if (kauth_authorize_vnode(l->l_cred,
3689 			    KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0)
3690 				newmode &= ~S_ISGID;
3691 		}
3692 	} else {
3693 		/*
3694 		 * NetBSD semantics: when changing owner and/or group,
3695 		 * clear the respective bit(s).
3696 		 */
3697 		if (CHANGED(uid))
3698 			newmode &= ~S_ISUID;
3699 		if (CHANGED(gid))
3700 			newmode &= ~S_ISGID;
3701 	}
3702 	/* Update va_mode iff altered. */
3703 	if (vattr.va_mode == newmode)
3704 		newmode = VNOVAL;
3705 
3706 	vattr_null(&vattr);
3707 	vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
3708 	vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
3709 	vattr.va_mode = newmode;
3710 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3711 #undef CHANGED
3712 
3713 out:
3714 	VOP_UNLOCK(vp);
3715 	return (error);
3716 }
3717 
3718 /*
3719  * Set the access and modification times given a path name; this
3720  * version follows links.
3721  */
3722 /* ARGSUSED */
3723 int
3724 sys___utimes50(struct lwp *l, const struct sys___utimes50_args *uap,
3725     register_t *retval)
3726 {
3727 	/* {
3728 		syscallarg(const char *) path;
3729 		syscallarg(const struct timeval *) tptr;
3730 	} */
3731 
3732 	return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
3733 	    SCARG(uap, tptr), UIO_USERSPACE);
3734 }
3735 
3736 /*
3737  * Set the access and modification times given a file descriptor.
3738  */
3739 /* ARGSUSED */
3740 int
3741 sys___futimes50(struct lwp *l, const struct sys___futimes50_args *uap,
3742     register_t *retval)
3743 {
3744 	/* {
3745 		syscallarg(int) fd;
3746 		syscallarg(const struct timeval *) tptr;
3747 	} */
3748 	int error;
3749 	file_t *fp;
3750 
3751 	/* fd_getvnode() will use the descriptor for us */
3752 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3753 		return (error);
3754 	error = do_sys_utimes(l, fp->f_vnode, NULL, 0, SCARG(uap, tptr),
3755 	    UIO_USERSPACE);
3756 	fd_putfile(SCARG(uap, fd));
3757 	return (error);
3758 }
3759 
3760 int
3761 sys_futimens(struct lwp *l, const struct sys_futimens_args *uap,
3762     register_t *retval)
3763 {
3764 	/* {
3765 		syscallarg(int) fd;
3766 		syscallarg(const struct timespec *) tptr;
3767 	} */
3768 	int error;
3769 	file_t *fp;
3770 
3771 	/* fd_getvnode() will use the descriptor for us */
3772 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3773 		return (error);
3774 	error = do_sys_utimensat(l, AT_FDCWD, fp->f_vnode, NULL, 0,
3775 	    SCARG(uap, tptr), UIO_USERSPACE);
3776 	fd_putfile(SCARG(uap, fd));
3777 	return (error);
3778 }
3779 
3780 /*
3781  * Set the access and modification times given a path name; this
3782  * version does not follow links.
3783  */
3784 int
3785 sys___lutimes50(struct lwp *l, const struct sys___lutimes50_args *uap,
3786     register_t *retval)
3787 {
3788 	/* {
3789 		syscallarg(const char *) path;
3790 		syscallarg(const struct timeval *) tptr;
3791 	} */
3792 
3793 	return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
3794 	    SCARG(uap, tptr), UIO_USERSPACE);
3795 }
3796 
3797 int
3798 sys_utimensat(struct lwp *l, const struct sys_utimensat_args *uap,
3799     register_t *retval)
3800 {
3801 	/* {
3802 		syscallarg(int) fd;
3803 		syscallarg(const char *) path;
3804 		syscallarg(const struct timespec *) tptr;
3805 		syscallarg(int) flag;
3806 	} */
3807 	int follow;
3808 	const struct timespec *tptr;
3809 	int error;
3810 
3811 	tptr = SCARG(uap, tptr);
3812 	follow = (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3813 
3814 	error = do_sys_utimensat(l, SCARG(uap, fd), NULL,
3815 	    SCARG(uap, path), follow, tptr, UIO_USERSPACE);
3816 
3817 	return error;
3818 }
3819 
3820 /*
3821  * Common routine to set access and modification times given a vnode.
3822  */
3823 int
3824 do_sys_utimens(struct lwp *l, struct vnode *vp, const char *path, int flag,
3825     const struct timespec *tptr, enum uio_seg seg)
3826 {
3827 	return do_sys_utimensat(l, AT_FDCWD, vp, path, flag, tptr, seg);
3828 }
3829 
3830 int
3831 do_sys_utimensat(struct lwp *l, int fdat, struct vnode *vp,
3832     const char *path, int flag, const struct timespec *tptr, enum uio_seg seg)
3833 {
3834 	struct vattr vattr;
3835 	int error, dorele = 0;
3836 	namei_simple_flags_t sflags;
3837 	bool vanull, setbirthtime;
3838 	struct timespec ts[2];
3839 
3840 	KASSERT(l != NULL || fdat == AT_FDCWD);
3841 
3842 	/*
3843 	 * I have checked all callers and they pass either FOLLOW,
3844 	 * NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW
3845 	 * is 0. More to the point, they don't pass anything else.
3846 	 * Let's keep it that way at least until the namei interfaces
3847 	 * are fully sanitized.
3848 	 */
3849 	KASSERT(flag == NOFOLLOW || flag == FOLLOW);
3850 	sflags = (flag == FOLLOW) ?
3851 		NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT;
3852 
3853 	if (tptr == NULL) {
3854 		vanull = true;
3855 		nanotime(&ts[0]);
3856 		ts[1] = ts[0];
3857 	} else {
3858 		vanull = false;
3859 		if (seg != UIO_SYSSPACE) {
3860 			error = copyin(tptr, ts, sizeof (ts));
3861 			if (error != 0)
3862 				return error;
3863 		} else {
3864 			ts[0] = tptr[0];
3865 			ts[1] = tptr[1];
3866 		}
3867 	}
3868 
3869 	if (ts[0].tv_nsec == UTIME_NOW) {
3870 		nanotime(&ts[0]);
3871 		if (ts[1].tv_nsec == UTIME_NOW) {
3872 			vanull = true;
3873 			ts[1] = ts[0];
3874 		}
3875 	} else if (ts[1].tv_nsec == UTIME_NOW)
3876 		nanotime(&ts[1]);
3877 
3878 	if (vp == NULL) {
3879 		/* note: SEG describes TPTR, not PATH; PATH is always user */
3880 		error = fd_nameiat_simple_user(l, fdat, path, sflags, &vp);
3881 		if (error != 0)
3882 			return error;
3883 		dorele = 1;
3884 	}
3885 
3886 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3887 	setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 &&
3888 	    timespeccmp(&ts[1], &vattr.va_birthtime, <));
3889 	vattr_null(&vattr);
3890 
3891 	if (ts[0].tv_nsec != UTIME_OMIT)
3892 		vattr.va_atime = ts[0];
3893 
3894 	if (ts[1].tv_nsec != UTIME_OMIT) {
3895 		vattr.va_mtime = ts[1];
3896 		if (setbirthtime)
3897 			vattr.va_birthtime = ts[1];
3898 	}
3899 
3900 	if (vanull)
3901 		vattr.va_vaflags |= VA_UTIMES_NULL;
3902 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3903 	VOP_UNLOCK(vp);
3904 
3905 	if (dorele != 0)
3906 		vrele(vp);
3907 
3908 	return error;
3909 }
3910 
3911 int
3912 do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
3913     const struct timeval *tptr, enum uio_seg seg)
3914 {
3915 	struct timespec ts[2];
3916 	struct timespec *tsptr = NULL;
3917 	int error;
3918 
3919 	if (tptr != NULL) {
3920 		struct timeval tv[2];
3921 
3922 		if (seg != UIO_SYSSPACE) {
3923 			error = copyin(tptr, tv, sizeof(tv));
3924 			if (error != 0)
3925 				return error;
3926 			tptr = tv;
3927 		}
3928 
3929 		if ((tptr[0].tv_usec == UTIME_NOW) ||
3930 		    (tptr[0].tv_usec == UTIME_OMIT))
3931 			ts[0].tv_nsec = tptr[0].tv_usec;
3932 		else {
3933 			if (tptr[0].tv_usec < 0 || tptr[0].tv_usec >= 1000000)
3934 				return EINVAL;
3935 
3936 			TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]);
3937 		}
3938 
3939 		if ((tptr[1].tv_usec == UTIME_NOW) ||
3940 		    (tptr[1].tv_usec == UTIME_OMIT))
3941 			ts[1].tv_nsec = tptr[1].tv_usec;
3942 		else {
3943 			if (tptr[1].tv_usec < 0 || tptr[1].tv_usec >= 1000000)
3944 				return EINVAL;
3945 
3946 			TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]);
3947 		}
3948 
3949 		tsptr = &ts[0];
3950 	}
3951 
3952 	return do_sys_utimens(l, vp, path, flag, tsptr, UIO_SYSSPACE);
3953 }
3954 
3955 /*
3956  * Truncate a file given its path name.
3957  */
3958 /* ARGSUSED */
3959 int
3960 sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval)
3961 {
3962 	/* {
3963 		syscallarg(const char *) path;
3964 		syscallarg(int) pad;
3965 		syscallarg(off_t) length;
3966 	} */
3967 	struct vnode *vp;
3968 	struct vattr vattr;
3969 	int error;
3970 
3971 	if (SCARG(uap, length) < 0)
3972 		return EINVAL;
3973 
3974 	error = namei_simple_user(SCARG(uap, path),
3975 				NSM_FOLLOW_TRYEMULROOT, &vp);
3976 	if (error != 0)
3977 		return (error);
3978 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3979 	if (vp->v_type == VDIR)
3980 		error = EISDIR;
3981 	else if ((error = vn_writechk(vp)) == 0 &&
3982 	    (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
3983 		vattr_null(&vattr);
3984 		vattr.va_size = SCARG(uap, length);
3985 		error = VOP_SETATTR(vp, &vattr, l->l_cred);
3986 	}
3987 	vput(vp);
3988 	return (error);
3989 }
3990 
3991 /*
3992  * Truncate a file given a file descriptor.
3993  */
3994 /* ARGSUSED */
3995 int
3996 sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval)
3997 {
3998 	/* {
3999 		syscallarg(int) fd;
4000 		syscallarg(int) pad;
4001 		syscallarg(off_t) length;
4002 	} */
4003 	struct vattr vattr;
4004 	struct vnode *vp;
4005 	file_t *fp;
4006 	int error;
4007 
4008 	if (SCARG(uap, length) < 0)
4009 		return EINVAL;
4010 
4011 	/* fd_getvnode() will use the descriptor for us */
4012 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4013 		return (error);
4014 	if ((fp->f_flag & FWRITE) == 0) {
4015 		error = EINVAL;
4016 		goto out;
4017 	}
4018 	vp = fp->f_vnode;
4019 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4020 	if (vp->v_type == VDIR)
4021 		error = EISDIR;
4022 	else if ((error = vn_writechk(vp)) == 0) {
4023 		vattr_null(&vattr);
4024 		vattr.va_size = SCARG(uap, length);
4025 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
4026 	}
4027 	VOP_UNLOCK(vp);
4028  out:
4029 	fd_putfile(SCARG(uap, fd));
4030 	return (error);
4031 }
4032 
4033 /*
4034  * Sync an open file.
4035  */
4036 /* ARGSUSED */
4037 int
4038 sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
4039 {
4040 	/* {
4041 		syscallarg(int) fd;
4042 	} */
4043 	struct vnode *vp;
4044 	file_t *fp;
4045 	int error;
4046 
4047 	/* fd_getvnode() will use the descriptor for us */
4048 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4049 		return (error);
4050 	vp = fp->f_vnode;
4051 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4052 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
4053 	VOP_UNLOCK(vp);
4054 	fd_putfile(SCARG(uap, fd));
4055 	return (error);
4056 }
4057 
4058 /*
4059  * Sync a range of file data.  API modeled after that found in AIX.
4060  *
4061  * FDATASYNC indicates that we need only save enough metadata to be able
4062  * to re-read the written data.
4063  */
4064 /* ARGSUSED */
4065 int
4066 sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval)
4067 {
4068 	/* {
4069 		syscallarg(int) fd;
4070 		syscallarg(int) flags;
4071 		syscallarg(off_t) start;
4072 		syscallarg(off_t) length;
4073 	} */
4074 	struct vnode *vp;
4075 	file_t *fp;
4076 	int flags, nflags;
4077 	off_t s, e, len;
4078 	int error;
4079 
4080 	/* fd_getvnode() will use the descriptor for us */
4081 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4082 		return (error);
4083 
4084 	if ((fp->f_flag & FWRITE) == 0) {
4085 		error = EBADF;
4086 		goto out;
4087 	}
4088 
4089 	flags = SCARG(uap, flags);
4090 	if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
4091 	    ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
4092 		error = EINVAL;
4093 		goto out;
4094 	}
4095 	/* Now set up the flags for value(s) to pass to VOP_FSYNC() */
4096 	if (flags & FDATASYNC)
4097 		nflags = FSYNC_DATAONLY | FSYNC_WAIT;
4098 	else
4099 		nflags = FSYNC_WAIT;
4100 	if (flags & FDISKSYNC)
4101 		nflags |= FSYNC_CACHE;
4102 
4103 	len = SCARG(uap, length);
4104 	/* If length == 0, we do the whole file, and s = e = 0 will do that */
4105 	if (len) {
4106 		s = SCARG(uap, start);
4107 		e = s + len;
4108 		if (e < s) {
4109 			error = EINVAL;
4110 			goto out;
4111 		}
4112 	} else {
4113 		e = 0;
4114 		s = 0;
4115 	}
4116 
4117 	vp = fp->f_vnode;
4118 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4119 	error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
4120 	VOP_UNLOCK(vp);
4121 out:
4122 	fd_putfile(SCARG(uap, fd));
4123 	return (error);
4124 }
4125 
4126 /*
4127  * Sync the data of an open file.
4128  */
4129 /* ARGSUSED */
4130 int
4131 sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval)
4132 {
4133 	/* {
4134 		syscallarg(int) fd;
4135 	} */
4136 	struct vnode *vp;
4137 	file_t *fp;
4138 	int error;
4139 
4140 	/* fd_getvnode() will use the descriptor for us */
4141 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4142 		return (error);
4143 	vp = fp->f_vnode;
4144 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4145 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
4146 	VOP_UNLOCK(vp);
4147 	fd_putfile(SCARG(uap, fd));
4148 	return (error);
4149 }
4150 
4151 /*
4152  * Rename files, (standard) BSD semantics frontend.
4153  */
4154 /* ARGSUSED */
4155 int
4156 sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval)
4157 {
4158 	/* {
4159 		syscallarg(const char *) from;
4160 		syscallarg(const char *) to;
4161 	} */
4162 
4163 	return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4164 	    SCARG(uap, to), UIO_USERSPACE, 0));
4165 }
4166 
4167 int
4168 sys_renameat(struct lwp *l, const struct sys_renameat_args *uap,
4169     register_t *retval)
4170 {
4171 	/* {
4172 		syscallarg(int) fromfd;
4173 		syscallarg(const char *) from;
4174 		syscallarg(int) tofd;
4175 		syscallarg(const char *) to;
4176 	} */
4177 
4178 	return (do_sys_renameat(l, SCARG(uap, fromfd), SCARG(uap, from),
4179 	    SCARG(uap, tofd), SCARG(uap, to), UIO_USERSPACE, 0));
4180 }
4181 
4182 /*
4183  * Rename files, POSIX semantics frontend.
4184  */
4185 /* ARGSUSED */
4186 int
4187 sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval)
4188 {
4189 	/* {
4190 		syscallarg(const char *) from;
4191 		syscallarg(const char *) to;
4192 	} */
4193 
4194 	return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4195 	    SCARG(uap, to), UIO_USERSPACE, 1));
4196 }
4197 
4198 /*
4199  * Rename files.  Source and destination must either both be directories,
4200  * or both not be directories.  If target is a directory, it must be empty.
4201  * If `from' and `to' refer to the same object, the value of the `retain'
4202  * argument is used to determine whether `from' will be
4203  *
4204  * (retain == 0)	deleted unless `from' and `to' refer to the same
4205  *			object in the file system's name space (BSD).
4206  * (retain == 1)	always retained (POSIX).
4207  *
4208  * XXX Synchronize with nfsrv_rename in nfs_serv.c.
4209  */
4210 int
4211 do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
4212 {
4213 	return do_sys_renameat(NULL, AT_FDCWD, from, AT_FDCWD, to, seg, retain);
4214 }
4215 
4216 static int
4217 do_sys_renameat(struct lwp *l, int fromfd, const char *from, int tofd,
4218     const char *to, enum uio_seg seg, int retain)
4219 {
4220 	struct pathbuf *fpb, *tpb;
4221 	struct nameidata fnd, tnd;
4222 	struct vnode *fdvp, *fvp;
4223 	struct vnode *tdvp, *tvp;
4224 	struct mount *mp, *tmp;
4225 	int error;
4226 
4227 	KASSERT(l != NULL || (fromfd == AT_FDCWD && tofd == AT_FDCWD));
4228 
4229 	error = pathbuf_maybe_copyin(from, seg, &fpb);
4230 	if (error)
4231 		goto out0;
4232 	KASSERT(fpb != NULL);
4233 
4234 	error = pathbuf_maybe_copyin(to, seg, &tpb);
4235 	if (error)
4236 		goto out1;
4237 	KASSERT(tpb != NULL);
4238 
4239 	/*
4240 	 * Lookup from.
4241 	 *
4242 	 * XXX LOCKPARENT is wrong because we don't actually want it
4243 	 * locked yet, but (a) namei is insane, and (b) VOP_RENAME is
4244 	 * insane, so for the time being we need to leave it like this.
4245 	 */
4246 	NDINIT(&fnd, DELETE, (LOCKPARENT | TRYEMULROOT), fpb);
4247 	if ((error = fd_nameiat(l, fromfd, &fnd)) != 0)
4248 		goto out2;
4249 
4250 	/*
4251 	 * Pull out the important results of the lookup, fdvp and fvp.
4252 	 * Of course, fvp is bogus because we're about to unlock fdvp.
4253 	 */
4254 	fdvp = fnd.ni_dvp;
4255 	fvp = fnd.ni_vp;
4256 	mp = fdvp->v_mount;
4257 	KASSERT(fdvp != NULL);
4258 	KASSERT(fvp != NULL);
4259 	KASSERT((fdvp == fvp) || (VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE));
4260 	/*
4261 	 * Bracket the operation with fstrans_start()/fstrans_done().
4262 	 *
4263 	 * Inside the bracket this file system cannot be unmounted so
4264 	 * a vnode on this file system cannot change its v_mount.
4265 	 * A vnode on another file system may still change to dead mount.
4266 	 */
4267 	fstrans_start(mp);
4268 
4269 	/*
4270 	 * Make sure neither fdvp nor fvp is locked.
4271 	 */
4272 	if (fdvp != fvp)
4273 		VOP_UNLOCK(fdvp);
4274 	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4275 	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4276 
4277 	/*
4278 	 * Reject renaming `.' and `..'.  Can't do this until after
4279 	 * namei because we need namei's parsing to find the final
4280 	 * component name.  (namei should just leave us with the final
4281 	 * component name and not look it up itself, but anyway...)
4282 	 *
4283 	 * This was here before because we used to relookup from
4284 	 * instead of to and relookup requires the caller to check
4285 	 * this, but now file systems may depend on this check, so we
4286 	 * must retain it until the file systems are all rototilled.
4287 	 */
4288 	if (((fnd.ni_cnd.cn_namelen == 1) &&
4289 		(fnd.ni_cnd.cn_nameptr[0] == '.')) ||
4290 	    ((fnd.ni_cnd.cn_namelen == 2) &&
4291 		(fnd.ni_cnd.cn_nameptr[0] == '.') &&
4292 		(fnd.ni_cnd.cn_nameptr[1] == '.'))) {
4293 		error = EINVAL;	/* XXX EISDIR?  */
4294 		goto abort0;
4295 	}
4296 
4297 	/*
4298 	 * Lookup to.
4299 	 *
4300 	 * XXX LOCKPARENT is wrong, but...insanity, &c.  Also, using
4301 	 * fvp here to decide whether to add CREATEDIR is a load of
4302 	 * bollocks because fvp might be the wrong node by now, since
4303 	 * fdvp is unlocked.
4304 	 *
4305 	 * XXX Why not pass CREATEDIR always?
4306 	 */
4307 	NDINIT(&tnd, RENAME,
4308 	    (LOCKPARENT | NOCACHE | TRYEMULROOT |
4309 		((fvp->v_type == VDIR)? CREATEDIR : 0)),
4310 	    tpb);
4311 	if ((error = fd_nameiat(l, tofd, &tnd)) != 0)
4312 		goto abort0;
4313 
4314 	/*
4315 	 * Pull out the important results of the lookup, tdvp and tvp.
4316 	 * Of course, tvp is bogus because we're about to unlock tdvp.
4317 	 */
4318 	tdvp = tnd.ni_dvp;
4319 	tvp = tnd.ni_vp;
4320 	KASSERT(tdvp != NULL);
4321 	KASSERT((tdvp == tvp) || (VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE));
4322 
4323 	/*
4324 	 * Make sure neither tdvp nor tvp is locked.
4325 	 */
4326 	if (tdvp != tvp)
4327 		VOP_UNLOCK(tdvp);
4328 	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
4329 	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
4330 
4331 	/*
4332 	 * Reject renaming onto `.' or `..'.  relookup is unhappy with
4333 	 * these, which is why we must do this here.  Once upon a time
4334 	 * we relooked up from instead of to, and consequently didn't
4335 	 * need this check, but now that we relookup to instead of
4336 	 * from, we need this; and we shall need it forever forward
4337 	 * until the VOP_RENAME protocol changes, because file systems
4338 	 * will no doubt begin to depend on this check.
4339 	 */
4340 	if ((tnd.ni_cnd.cn_namelen == 1) && (tnd.ni_cnd.cn_nameptr[0] == '.')) {
4341 		error = EISDIR;
4342 		goto abort1;
4343 	}
4344 	if ((tnd.ni_cnd.cn_namelen == 2) &&
4345 	    (tnd.ni_cnd.cn_nameptr[0] == '.') &&
4346 	    (tnd.ni_cnd.cn_nameptr[1] == '.')) {
4347 		error = EINVAL;
4348 		goto abort1;
4349 	}
4350 
4351 	/*
4352 	 * Make sure the mount points match.  Although we don't hold
4353 	 * any vnode locks, the v_mount on fdvp file system are stable.
4354 	 *
4355 	 * Unmounting another file system at an inopportune moment may
4356 	 * cause tdvp to disappear and change its v_mount to dead.
4357 	 *
4358 	 * So in either case different v_mount means cross-device rename.
4359 	 */
4360 	KASSERT(mp != NULL);
4361 	tmp = tdvp->v_mount;
4362 
4363 	if (mp != tmp) {
4364 		error = EXDEV;
4365 		goto abort1;
4366 	}
4367 
4368 	/*
4369 	 * Take the vfs rename lock to avoid cross-directory screw cases.
4370 	 * Nothing is locked currently, so taking this lock is safe.
4371 	 */
4372 	error = VFS_RENAMELOCK_ENTER(mp);
4373 	if (error)
4374 		goto abort1;
4375 
4376 	/*
4377 	 * Now fdvp, fvp, tdvp, and (if nonnull) tvp are referenced,
4378 	 * and nothing is locked except for the vfs rename lock.
4379 	 *
4380 	 * The next step is a little rain dance to conform to the
4381 	 * insane lock protocol, even though it does nothing to ward
4382 	 * off race conditions.
4383 	 *
4384 	 * We need tdvp and tvp to be locked.  However, because we have
4385 	 * unlocked tdvp in order to hold no locks while we take the
4386 	 * vfs rename lock, tvp may be wrong here, and we can't safely
4387 	 * lock it even if the sensible file systems will just unlock
4388 	 * it straight away.  Consequently, we must lock tdvp and then
4389 	 * relookup tvp to get it locked.
4390 	 *
4391 	 * Finally, because the VOP_RENAME protocol is brain-damaged
4392 	 * and various file systems insanely depend on the semantics of
4393 	 * this brain damage, the lookup of to must be the last lookup
4394 	 * before VOP_RENAME.
4395 	 */
4396 	vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
4397 	error = relookup(tdvp, &tnd.ni_vp, &tnd.ni_cnd, 0);
4398 	if (error)
4399 		goto abort2;
4400 
4401 	/*
4402 	 * Drop the old tvp and pick up the new one -- which might be
4403 	 * the same, but that doesn't matter to us.  After this, tdvp
4404 	 * and tvp should both be locked.
4405 	 */
4406 	if (tvp != NULL)
4407 		vrele(tvp);
4408 	tvp = tnd.ni_vp;
4409 	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4410 	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
4411 
4412 	/*
4413 	 * The old do_sys_rename had various consistency checks here
4414 	 * involving fvp and tvp.  fvp is bogus already here, and tvp
4415 	 * will become bogus soon in any sensible file system, so the
4416 	 * only purpose in putting these checks here is to give lip
4417 	 * service to these screw cases and to acknowledge that they
4418 	 * exist, not actually to handle them, but here you go
4419 	 * anyway...
4420 	 */
4421 
4422 	/*
4423 	 * Acknowledge that directories and non-directories aren't
4424 	 * suposed to mix.
4425 	 */
4426 	if (tvp != NULL) {
4427 		if ((fvp->v_type == VDIR) && (tvp->v_type != VDIR)) {
4428 			error = ENOTDIR;
4429 			goto abort3;
4430 		} else if ((fvp->v_type != VDIR) && (tvp->v_type == VDIR)) {
4431 			error = EISDIR;
4432 			goto abort3;
4433 		}
4434 	}
4435 
4436 	/*
4437 	 * Acknowledge some random screw case, among the dozens that
4438 	 * might arise.
4439 	 */
4440 	if (fvp == tdvp) {
4441 		error = EINVAL;
4442 		goto abort3;
4443 	}
4444 
4445 	/*
4446 	 * Acknowledge that POSIX has a wacky screw case.
4447 	 *
4448 	 * XXX Eventually the retain flag needs to be passed on to
4449 	 * VOP_RENAME.
4450 	 */
4451 	if (fvp == tvp) {
4452 		if (retain) {
4453 			error = 0;
4454 			goto abort3;
4455 		} else if ((fdvp == tdvp) &&
4456 		    (fnd.ni_cnd.cn_namelen == tnd.ni_cnd.cn_namelen) &&
4457 		    (0 == memcmp(fnd.ni_cnd.cn_nameptr, tnd.ni_cnd.cn_nameptr,
4458 			fnd.ni_cnd.cn_namelen))) {
4459 			error = 0;
4460 			goto abort3;
4461 		}
4462 	}
4463 
4464 	/*
4465 	 * Make sure veriexec can screw us up.  (But a race can screw
4466 	 * up veriexec, of course -- remember, fvp and (soon) tvp are
4467 	 * bogus.)
4468 	 */
4469 #if NVERIEXEC > 0
4470 	{
4471 		char *f1, *f2;
4472 		size_t f1_len;
4473 		size_t f2_len;
4474 
4475 		f1_len = fnd.ni_cnd.cn_namelen + 1;
4476 		f1 = kmem_alloc(f1_len, KM_SLEEP);
4477 		strlcpy(f1, fnd.ni_cnd.cn_nameptr, f1_len);
4478 
4479 		f2_len = tnd.ni_cnd.cn_namelen + 1;
4480 		f2 = kmem_alloc(f2_len, KM_SLEEP);
4481 		strlcpy(f2, tnd.ni_cnd.cn_nameptr, f2_len);
4482 
4483 		error = veriexec_renamechk(curlwp, fvp, f1, tvp, f2);
4484 
4485 		kmem_free(f1, f1_len);
4486 		kmem_free(f2, f2_len);
4487 
4488 		if (error)
4489 			goto abort3;
4490 	}
4491 #endif /* NVERIEXEC > 0 */
4492 
4493 	/*
4494 	 * All ready.  Incant the rename vop.
4495 	 */
4496 	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4497 	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4498 	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4499 	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
4500 	error = VOP_RENAME(fdvp, fvp, &fnd.ni_cnd, tdvp, tvp, &tnd.ni_cnd);
4501 
4502 	/*
4503 	 * VOP_RENAME releases fdvp, fvp, tdvp, and tvp, and unlocks
4504 	 * tdvp and tvp.  But we can't assert any of that.
4505 	 */
4506 	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4507 	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4508 	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
4509 	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
4510 
4511 	/*
4512 	 * So all we have left to do is to drop the rename lock and
4513 	 * destroy the pathbufs.
4514 	 */
4515 	VFS_RENAMELOCK_EXIT(mp);
4516 	fstrans_done(mp);
4517 	goto out2;
4518 
4519 abort3:	if ((tvp != NULL) && (tvp != tdvp))
4520 		VOP_UNLOCK(tvp);
4521 abort2:	VOP_UNLOCK(tdvp);
4522 	VFS_RENAMELOCK_EXIT(mp);
4523 abort1:	VOP_ABORTOP(tdvp, &tnd.ni_cnd);
4524 	vrele(tdvp);
4525 	if (tvp != NULL)
4526 		vrele(tvp);
4527 abort0:	VOP_ABORTOP(fdvp, &fnd.ni_cnd);
4528 	vrele(fdvp);
4529 	vrele(fvp);
4530 	fstrans_done(mp);
4531 out2:	pathbuf_destroy(tpb);
4532 out1:	pathbuf_destroy(fpb);
4533 out0:	return error;
4534 }
4535 
4536 /*
4537  * Make a directory file.
4538  */
4539 /* ARGSUSED */
4540 int
4541 sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
4542 {
4543 	/* {
4544 		syscallarg(const char *) path;
4545 		syscallarg(int) mode;
4546 	} */
4547 
4548 	return do_sys_mkdirat(l, AT_FDCWD, SCARG(uap, path),
4549 	    SCARG(uap, mode), UIO_USERSPACE);
4550 }
4551 
4552 int
4553 sys_mkdirat(struct lwp *l, const struct sys_mkdirat_args *uap,
4554     register_t *retval)
4555 {
4556 	/* {
4557 		syscallarg(int) fd;
4558 		syscallarg(const char *) path;
4559 		syscallarg(int) mode;
4560 	} */
4561 
4562 	return do_sys_mkdirat(l, SCARG(uap, fd), SCARG(uap, path),
4563 	    SCARG(uap, mode), UIO_USERSPACE);
4564 }
4565 
4566 
4567 int
4568 do_sys_mkdir(const char *path, mode_t mode, enum uio_seg seg)
4569 {
4570 	return do_sys_mkdirat(NULL, AT_FDCWD, path, mode, seg);
4571 }
4572 
4573 static int
4574 do_sys_mkdirat(struct lwp *l, int fdat, const char *path, mode_t mode,
4575     enum uio_seg seg)
4576 {
4577 	struct proc *p = curlwp->l_proc;
4578 	struct vnode *vp;
4579 	struct vattr vattr;
4580 	int error;
4581 	struct pathbuf *pb;
4582 	struct nameidata nd;
4583 
4584 	KASSERT(l != NULL || fdat == AT_FDCWD);
4585 
4586 	/* XXX bollocks, should pass in a pathbuf */
4587 	error = pathbuf_maybe_copyin(path, seg, &pb);
4588 	if (error) {
4589 		return error;
4590 	}
4591 
4592 	NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT, pb);
4593 
4594 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
4595 		pathbuf_destroy(pb);
4596 		return (error);
4597 	}
4598 	vp = nd.ni_vp;
4599 	if (vp != NULL) {
4600 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
4601 		if (nd.ni_dvp == vp)
4602 			vrele(nd.ni_dvp);
4603 		else
4604 			vput(nd.ni_dvp);
4605 		vrele(vp);
4606 		pathbuf_destroy(pb);
4607 		return (EEXIST);
4608 	}
4609 	vattr_null(&vattr);
4610 	vattr.va_type = VDIR;
4611 	/* We will read cwdi->cwdi_cmask unlocked. */
4612 	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
4613 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
4614 	if (!error)
4615 		vrele(nd.ni_vp);
4616 	vput(nd.ni_dvp);
4617 	pathbuf_destroy(pb);
4618 	return (error);
4619 }
4620 
4621 /*
4622  * Remove a directory file.
4623  */
4624 /* ARGSUSED */
4625 int
4626 sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
4627 {
4628 	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path),
4629 	    AT_REMOVEDIR, UIO_USERSPACE);
4630 }
4631 
4632 /*
4633  * Read a block of directory entries in a file system independent format.
4634  */
4635 int
4636 sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval)
4637 {
4638 	/* {
4639 		syscallarg(int) fd;
4640 		syscallarg(char *) buf;
4641 		syscallarg(size_t) count;
4642 	} */
4643 	file_t *fp;
4644 	int error, done;
4645 
4646 	/* fd_getvnode() will use the descriptor for us */
4647 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4648 		return (error);
4649 	if ((fp->f_flag & FREAD) == 0) {
4650 		error = EBADF;
4651 		goto out;
4652 	}
4653 	error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
4654 			SCARG(uap, count), &done, l, 0, 0);
4655 	ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
4656 	*retval = done;
4657  out:
4658 	fd_putfile(SCARG(uap, fd));
4659 	return (error);
4660 }
4661 
4662 /*
4663  * Set the mode mask for creation of filesystem nodes.
4664  */
4665 int
4666 sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
4667 {
4668 	/* {
4669 		syscallarg(mode_t) newmask;
4670 	} */
4671 
4672 	/*
4673 	 * cwdi->cwdi_cmask will be read unlocked elsewhere, and no kind of
4674 	 * serialization with those reads is required.  It's important to
4675 	 * return a coherent answer for the caller of umask() though, and
4676 	 * the atomic operation accomplishes that.
4677 	 */
4678 	*retval = atomic_swap_uint(&curproc->p_cwdi->cwdi_cmask,
4679 	    SCARG(uap, newmask) & ALLPERMS);
4680 
4681 	return (0);
4682 }
4683 
4684 int
4685 dorevoke(struct vnode *vp, kauth_cred_t cred)
4686 {
4687 	struct vattr vattr;
4688 	int error, fs_decision;
4689 
4690 	vn_lock(vp, LK_SHARED | LK_RETRY);
4691 	error = VOP_GETATTR(vp, &vattr, cred);
4692 	VOP_UNLOCK(vp);
4693 	if (error != 0)
4694 		return error;
4695 	fs_decision = (kauth_cred_geteuid(cred) == vattr.va_uid) ? 0 : EPERM;
4696 	error = kauth_authorize_vnode(cred, KAUTH_VNODE_REVOKE, vp, NULL,
4697 	    fs_decision);
4698 	if (!error)
4699 		VOP_REVOKE(vp, REVOKEALL);
4700 	return (error);
4701 }
4702 
4703 /*
4704  * Void all references to file by ripping underlying filesystem
4705  * away from vnode.
4706  */
4707 /* ARGSUSED */
4708 int
4709 sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval)
4710 {
4711 	/* {
4712 		syscallarg(const char *) path;
4713 	} */
4714 	struct vnode *vp;
4715 	int error;
4716 
4717 	error = namei_simple_user(SCARG(uap, path),
4718 				NSM_FOLLOW_TRYEMULROOT, &vp);
4719 	if (error != 0)
4720 		return (error);
4721 	error = dorevoke(vp, l->l_cred);
4722 	vrele(vp);
4723 	return (error);
4724 }
4725 
4726 /*
4727  * Allocate backing store for a file, filling a hole without having to
4728  * explicitly write anything out.
4729  */
4730 /* ARGSUSED */
4731 int
4732 sys_posix_fallocate(struct lwp *l, const struct sys_posix_fallocate_args *uap,
4733 		register_t *retval)
4734 {
4735 	/* {
4736 		syscallarg(int) fd;
4737 		syscallarg(off_t) pos;
4738 		syscallarg(off_t) len;
4739 	} */
4740 	int fd;
4741 	off_t pos, len;
4742 	struct file *fp;
4743 	struct vnode *vp;
4744 	int error;
4745 
4746 	fd = SCARG(uap, fd);
4747 	pos = SCARG(uap, pos);
4748 	len = SCARG(uap, len);
4749 
4750 	if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
4751 		*retval = EINVAL;
4752 		return 0;
4753 	}
4754 
4755 	error = fd_getvnode(fd, &fp);
4756 	if (error) {
4757 		*retval = error;
4758 		return 0;
4759 	}
4760 	if ((fp->f_flag & FWRITE) == 0) {
4761 		error = EBADF;
4762 		goto fail;
4763 	}
4764 	vp = fp->f_vnode;
4765 
4766 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4767 	if (vp->v_type == VDIR) {
4768 		error = EISDIR;
4769 	} else {
4770 		error = VOP_FALLOCATE(vp, pos, len);
4771 	}
4772 	VOP_UNLOCK(vp);
4773 
4774 fail:
4775 	fd_putfile(fd);
4776 	*retval = error;
4777 	return 0;
4778 }
4779 
4780 /*
4781  * Deallocate backing store for a file, creating a hole. Also used for
4782  * invoking TRIM on disks.
4783  */
4784 /* ARGSUSED */
4785 int
4786 sys_fdiscard(struct lwp *l, const struct sys_fdiscard_args *uap,
4787 		register_t *retval)
4788 {
4789 	/* {
4790 		syscallarg(int) fd;
4791 		syscallarg(off_t) pos;
4792 		syscallarg(off_t) len;
4793 	} */
4794 	int fd;
4795 	off_t pos, len;
4796 	struct file *fp;
4797 	struct vnode *vp;
4798 	int error;
4799 
4800 	fd = SCARG(uap, fd);
4801 	pos = SCARG(uap, pos);
4802 	len = SCARG(uap, len);
4803 
4804 	if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
4805 		return EINVAL;
4806 	}
4807 
4808 	error = fd_getvnode(fd, &fp);
4809 	if (error) {
4810 		return error;
4811 	}
4812 	if ((fp->f_flag & FWRITE) == 0) {
4813 		error = EBADF;
4814 		goto fail;
4815 	}
4816 	vp = fp->f_vnode;
4817 
4818 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4819 	if (vp->v_type == VDIR) {
4820 		error = EISDIR;
4821 	} else {
4822 		error = VOP_FDISCARD(vp, pos, len);
4823 	}
4824 	VOP_UNLOCK(vp);
4825 
4826 fail:
4827 	fd_putfile(fd);
4828 	return error;
4829 }
4830