xref: /netbsd-src/sys/kern/vfs_syscalls.c (revision c42dbd0ed2e61fe6eda8590caa852ccf34719964)
1 /*	$NetBSD: vfs_syscalls.c,v 1.562 2024/06/29 13:31:07 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2009, 2019, 2020, 2023 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1989, 1993
34  *	The Regents of the University of California.  All rights reserved.
35  * (c) UNIX System Laboratories, Inc.
36  * All or some portions of this file are derived from material licensed
37  * to the University of California by American Telephone and Telegraph
38  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39  * the permission of UNIX System Laboratories, Inc.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.42 (Berkeley) 7/31/95
66  */
67 
68 /*
69  * Virtual File System System Calls
70  */
71 
72 #include <sys/cdefs.h>
73 __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.562 2024/06/29 13:31:07 christos Exp $");
74 
75 #ifdef _KERNEL_OPT
76 #include "opt_fileassoc.h"
77 #include "veriexec.h"
78 #endif
79 
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/namei.h>
83 #include <sys/filedesc.h>
84 #include <sys/kernel.h>
85 #include <sys/file.h>
86 #include <sys/fcntl.h>
87 #include <sys/stat.h>
88 #include <sys/vnode.h>
89 #include <sys/mount.h>
90 #include <sys/fstrans.h>
91 #include <sys/proc.h>
92 #include <sys/uio.h>
93 #include <sys/kmem.h>
94 #include <sys/dirent.h>
95 #include <sys/sysctl.h>
96 #include <sys/syscallargs.h>
97 #include <sys/vfs_syscalls.h>
98 #include <sys/quota.h>
99 #include <sys/quotactl.h>
100 #include <sys/ktrace.h>
101 #ifdef FILEASSOC
102 #include <sys/fileassoc.h>
103 #endif /* FILEASSOC */
104 #include <sys/extattr.h>
105 #include <sys/verified_exec.h>
106 #include <sys/kauth.h>
107 #include <sys/atomic.h>
108 #include <sys/module.h>
109 #include <sys/buf.h>
110 #include <sys/event.h>
111 #include <sys/compat_stub.h>
112 
113 #include <miscfs/genfs/genfs.h>
114 #include <miscfs/specfs/specdev.h>
115 
116 #include <nfs/rpcv2.h>
117 #include <nfs/nfsproto.h>
118 #include <nfs/nfs.h>
119 #include <nfs/nfs_var.h>
120 
121 /* XXX this shouldn't be here */
122 #ifndef OFF_T_MAX
123 #define OFF_T_MAX __type_max(off_t)
124 #endif
125 
126 static int change_flags(struct vnode *, u_long, struct lwp *);
127 static int change_mode(struct vnode *, int, struct lwp *);
128 static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
129 static int do_sys_openat(lwp_t *, int, const char *, int, int, int *);
130 static int do_sys_mkdirat(struct lwp *l, int, const char *, mode_t,
131     enum uio_seg);
132 static int do_sys_mkfifoat(struct lwp *, int, const char *, mode_t);
133 static int do_sys_symlinkat(struct lwp *, const char *, int, const char *,
134     enum uio_seg);
135 static int do_sys_renameat(struct lwp *l, int, const char *, int, const char *,
136     enum uio_seg, int);
137 static int do_sys_readlinkat(struct lwp *, int, const char *, char *,
138     size_t, register_t *);
139 static int do_sys_unlinkat(struct lwp *, int, const char *, int, enum uio_seg);
140 
141 static int fd_nameiat(struct lwp *, int, struct nameidata *);
142 static int fd_nameiat_simple_user(struct lwp *, int, const char *,
143     namei_simple_flags_t, struct vnode **);
144 
145 /*
146  * This table is used to maintain compatibility with 4.3BSD
147  * and NetBSD 0.9 mount syscalls - and possibly other systems.
148  * Note, the order is important!
149  *
150  * Do not modify this table. It should only contain filesystems
151  * supported by NetBSD 0.9 and 4.3BSD.
152  */
153 const char * const mountcompatnames[] = {
154 	NULL,		/* 0 = MOUNT_NONE */
155 	MOUNT_FFS,	/* 1 = MOUNT_UFS */
156 	MOUNT_NFS,	/* 2 */
157 	MOUNT_MFS,	/* 3 */
158 	MOUNT_MSDOS,	/* 4 */
159 	MOUNT_CD9660,	/* 5 = MOUNT_ISOFS */
160 	MOUNT_FDESC,	/* 6 */
161 	MOUNT_KERNFS,	/* 7 */
162 	NULL,		/* 8 = MOUNT_DEVFS */
163 	MOUNT_AFS,	/* 9 */
164 };
165 
166 const u_int nmountcompatnames = __arraycount(mountcompatnames);
167 
168 /*
169  * Filter event method for EVFILT_FS.
170  */
171 static struct klist fs_klist;
172 static kmutex_t fs_klist_lock;
173 
174 CTASSERT((NOTE_SUBMIT & VQ_MOUNT) == 0);
175 CTASSERT((NOTE_SUBMIT & VQ_UNMOUNT) == 0);
176 
177 void
178 vfs_evfilt_fs_init(void)
179 {
180 	klist_init(&fs_klist);
181 	mutex_init(&fs_klist_lock, MUTEX_DEFAULT, IPL_NONE);
182 }
183 
184 static int
185 filt_fsattach(struct knote *kn)
186 {
187 	mutex_enter(&fs_klist_lock);
188 	kn->kn_flags |= EV_CLEAR;
189 	klist_insert(&fs_klist, kn);
190 	mutex_exit(&fs_klist_lock);
191 
192 	return 0;
193 }
194 
195 static void
196 filt_fsdetach(struct knote *kn)
197 {
198 	mutex_enter(&fs_klist_lock);
199 	klist_remove(&fs_klist, kn);
200 	mutex_exit(&fs_klist_lock);
201 }
202 
203 static int
204 filt_fs(struct knote *kn, long hint)
205 {
206 	int rv;
207 
208 	if (hint & NOTE_SUBMIT) {
209 		KASSERT(mutex_owned(&fs_klist_lock));
210 		kn->kn_fflags |= hint & ~NOTE_SUBMIT;
211 	} else {
212 		mutex_enter(&fs_klist_lock);
213 	}
214 
215 	rv = (kn->kn_fflags != 0);
216 
217 	if ((hint & NOTE_SUBMIT) == 0) {
218 		mutex_exit(&fs_klist_lock);
219 	}
220 
221 	return rv;
222 }
223 
224 /* referenced in kern_event.c */
225 const struct filterops fs_filtops = {
226 	.f_flags = FILTEROP_MPSAFE,
227 	.f_attach = filt_fsattach,
228 	.f_detach = filt_fsdetach,
229 	.f_event = filt_fs,
230 };
231 
232 static int
233 fd_nameiat(struct lwp *l, int fdat, struct nameidata *ndp)
234 {
235 	file_t *dfp;
236 	int error;
237 	const char *path = pathbuf_stringcopy_get(ndp->ni_pathbuf);
238 
239 	if (fdat != AT_FDCWD && path[0] != '/') {
240 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
241 			goto out;
242 
243 		NDAT(ndp, dfp->f_vnode);
244 	}
245 
246 	error = namei(ndp);
247 
248 	if (fdat != AT_FDCWD)
249 		fd_putfile(fdat);
250 out:
251 	pathbuf_stringcopy_put(ndp->ni_pathbuf, path);
252 	return error;
253 }
254 
255 static int
256 fd_nameiat_simple_user(struct lwp *l, int fdat, const char *path,
257     namei_simple_flags_t sflags, struct vnode **vp_ret)
258 {
259 	file_t *dfp;
260 	struct vnode *dvp;
261 	int error;
262 
263 	if (fdat != AT_FDCWD && path[0] != '/') {
264 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
265 			goto out;
266 
267 		dvp = dfp->f_vnode;
268 	} else {
269 		dvp = NULL;
270 	}
271 
272 	error = nameiat_simple_user(dvp, path, sflags, vp_ret);
273 
274 	if (fdat != AT_FDCWD)
275 		fd_putfile(fdat);
276 out:
277 	return error;
278 }
279 
280 static int
281 open_setfp(struct lwp *l, file_t *fp, struct vnode *vp, int indx, int flags)
282 {
283 	int error;
284 
285 	fp->f_flag = flags & FMASK;
286 	fp->f_type = DTYPE_VNODE;
287 	fp->f_ops = &vnops;
288 	fp->f_vnode = vp;
289 
290 	if (flags & (O_EXLOCK | O_SHLOCK)) {
291 		struct flock lf;
292 		int type;
293 
294 		lf.l_whence = SEEK_SET;
295 		lf.l_start = 0;
296 		lf.l_len = 0;
297 		if (flags & O_EXLOCK)
298 			lf.l_type = F_WRLCK;
299 		else
300 			lf.l_type = F_RDLCK;
301 		type = F_FLOCK;
302 		if ((flags & FNONBLOCK) == 0)
303 			type |= F_WAIT;
304 		VOP_UNLOCK(vp);
305 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
306 		if (error) {
307 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
308 			fd_abort(l->l_proc, fp, indx);
309 			return error;
310 		}
311 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
312 		atomic_or_uint(&fp->f_flag, FHASLOCK);
313 	}
314 	if (flags & O_CLOEXEC)
315 		fd_set_exclose(l, indx, true);
316 	return 0;
317 }
318 
319 static int
320 mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
321     void *data, size_t *data_len)
322 {
323 	struct mount *mp;
324 	int error = 0, saved_flags;
325 
326 	mp = vp->v_mount;
327 	saved_flags = mp->mnt_flag;
328 
329 	/* We can operate only on VV_ROOT nodes. */
330 	if ((vp->v_vflag & VV_ROOT) == 0) {
331 		error = EINVAL;
332 		goto out;
333 	}
334 
335 	/*
336 	 * We only allow the filesystem to be reloaded if it
337 	 * is currently mounted read-only.  Additionally, we
338 	 * prevent read-write to read-only downgrades.
339 	 */
340 	if ((flags & (MNT_RELOAD | MNT_RDONLY)) != 0 &&
341 	    (mp->mnt_flag & MNT_RDONLY) == 0 &&
342 	    (mp->mnt_iflag & IMNT_CAN_RWTORO) == 0) {
343 		error = EOPNOTSUPP;	/* Needs translation */
344 		goto out;
345 	}
346 
347 	/*
348 	 * Enabling MNT_UNION requires a covered mountpoint and
349 	 * must not happen on the root mount.
350 	 */
351 	if ((flags & MNT_UNION) != 0 && mp->mnt_vnodecovered == NULLVP) {
352 		error = EOPNOTSUPP;
353 		goto out;
354 	}
355 
356 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
357 	    KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
358 	if (error)
359 		goto out;
360 
361 	error = vfs_suspend(mp, 0);
362 	if (error)
363 		goto out;
364 
365 	mutex_enter(mp->mnt_updating);
366 
367 	mp->mnt_flag &= ~MNT_OP_FLAGS;
368 	mp->mnt_flag |= flags & MNT_OP_FLAGS;
369 
370 	/*
371 	 * Set the mount level flags.
372 	 */
373 	if ((flags & MNT_RDONLY) != (mp->mnt_flag & MNT_RDONLY)) {
374 		if ((flags & MNT_RDONLY))
375 			mp->mnt_iflag |= IMNT_WANTRDONLY;
376 		else
377 			mp->mnt_iflag |= IMNT_WANTRDWR;
378 	}
379 	mp->mnt_flag &= ~MNT_BASIC_FLAGS;
380 	mp->mnt_flag |= flags & MNT_BASIC_FLAGS;
381 	if ((mp->mnt_iflag & IMNT_WANTRDONLY))
382 		mp->mnt_flag &= ~MNT_RDONLY;
383 
384 	error = VFS_MOUNT(mp, path, data, data_len);
385 
386 	if (error && data != NULL) {
387 		int error2;
388 
389 		/*
390 		 * Update failed; let's try and see if it was an
391 		 * export request.  For compat with 3.0 and earlier.
392 		 */
393 		error2 = vfs_hooks_reexport(mp, path, data);
394 
395 		/*
396 		 * Only update error code if the export request was
397 		 * understood but some problem occurred while
398 		 * processing it.
399 		 */
400 		if (error2 != EJUSTRETURN)
401 			error = error2;
402 	}
403 
404 	if (error == 0 && (mp->mnt_iflag & IMNT_WANTRDONLY))
405 		mp->mnt_flag |= MNT_RDONLY;
406 	if (error)
407 		mp->mnt_flag = saved_flags;
408 	mp->mnt_flag &= ~MNT_OP_FLAGS;
409 	mp->mnt_iflag &= ~(IMNT_WANTRDONLY | IMNT_WANTRDWR);
410 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
411 		if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0)
412 			vfs_syncer_add_to_worklist(mp);
413 	} else {
414 		if ((mp->mnt_iflag & IMNT_ONWORKLIST) != 0)
415 			vfs_syncer_remove_from_worklist(mp);
416 	}
417 	mutex_exit(mp->mnt_updating);
418 	vfs_resume(mp);
419 
420 	if ((error == 0) && !(saved_flags & MNT_EXTATTR) &&
421 	    (flags & MNT_EXTATTR)) {
422 		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_START,
423 				   NULL, 0, NULL) != 0) {
424 			printf("%s: failed to start extattr, error = %d",
425 			       mp->mnt_stat.f_mntonname, error);
426 			mp->mnt_flag &= ~MNT_EXTATTR;
427 		}
428 	}
429 
430 	if ((error == 0) && (saved_flags & MNT_EXTATTR) &&
431 	    !(flags & MNT_EXTATTR)) {
432 		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_STOP,
433 				   NULL, 0, NULL) != 0) {
434 			printf("%s: failed to stop extattr, error = %d",
435 			       mp->mnt_stat.f_mntonname, error);
436 			mp->mnt_flag |= MNT_RDONLY;
437 		}
438 	}
439  out:
440 	return (error);
441 }
442 
443 static int
444 mount_get_vfsops(const char *fstype, enum uio_seg type_seg,
445     struct vfsops **vfsops)
446 {
447 	char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
448 	int error;
449 
450 	if (type_seg == UIO_USERSPACE) {
451 		/* Copy file-system type from userspace.  */
452 		error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL);
453 	} else {
454 		error = copystr(fstype, fstypename, sizeof(fstypename), NULL);
455 		KASSERT(error == 0);
456 	}
457 
458 	if (error) {
459 		/*
460 		 * Historically, filesystem types were identified by numbers.
461 		 * If we get an integer for the filesystem type instead of a
462 		 * string, we check to see if it matches one of the historic
463 		 * filesystem types.
464 		 */
465 		u_long fsindex = (u_long)fstype;
466 		if (fsindex >= nmountcompatnames ||
467 		    mountcompatnames[fsindex] == NULL)
468 			return ENODEV;
469 		strlcpy(fstypename, mountcompatnames[fsindex],
470 		    sizeof(fstypename));
471 	}
472 
473 	/* Accept `ufs' as an alias for `ffs', for compatibility. */
474 	if (strcmp(fstypename, "ufs") == 0)
475 		fstypename[0] = 'f';
476 
477 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
478 		return 0;
479 
480 	/* If we can autoload a vfs module, try again */
481 	(void)module_autoload(fstypename, MODULE_CLASS_VFS);
482 
483 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
484 		return 0;
485 
486 	return ENODEV;
487 }
488 
489 static int
490 mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
491     void *data, size_t *data_len)
492 {
493 	struct mount *mp;
494 	int error;
495 
496 	/* If MNT_GETARGS is specified, it should be the only flag. */
497 	if (flags & ~MNT_GETARGS)
498 		return EINVAL;
499 
500 	mp = vp->v_mount;
501 
502 	/* XXX: probably some notion of "can see" here if we want isolation. */
503 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
504 	    KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
505 	if (error)
506 		return error;
507 
508 	if ((vp->v_vflag & VV_ROOT) == 0)
509 		return EINVAL;
510 
511 	if (vfs_busy(mp))
512 		return EPERM;
513 
514 	mutex_enter(mp->mnt_updating);
515 	mp->mnt_flag &= ~MNT_OP_FLAGS;
516 	mp->mnt_flag |= MNT_GETARGS;
517 	error = VFS_MOUNT(mp, path, data, data_len);
518 	mp->mnt_flag &= ~MNT_OP_FLAGS;
519 	mutex_exit(mp->mnt_updating);
520 
521 	vfs_unbusy(mp);
522 	return (error);
523 }
524 
525 int
526 sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval)
527 {
528 	/* {
529 		syscallarg(const char *) type;
530 		syscallarg(const char *) path;
531 		syscallarg(int) flags;
532 		syscallarg(void *) data;
533 		syscallarg(size_t) data_len;
534 	} */
535 
536 	return do_sys_mount(l, SCARG(uap, type), UIO_USERSPACE, SCARG(uap, path),
537 	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE,
538 	    SCARG(uap, data_len), retval);
539 }
540 
541 int
542 do_sys_mount(struct lwp *l, const char *type, enum uio_seg type_seg,
543     const char *path, int flags, void *data, enum uio_seg data_seg,
544     size_t data_len, register_t *retval)
545 {
546 	struct vfsops *vfsops = NULL;	/* XXX gcc4.8 */
547 	struct vnode *vp;
548 	void *data_buf = data;
549 	bool vfsopsrele = false;
550 	size_t alloc_sz = 0;
551 	int error;
552 
553 	/*
554 	 * Get vnode to be covered
555 	 */
556 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
557 	if (error != 0) {
558 		vp = NULL;
559 		goto done;
560 	}
561 
562 	if (flags & (MNT_GETARGS | MNT_UPDATE)) {
563 		vfsops = vp->v_mount->mnt_op;
564 	} else {
565 		/* 'type' is userspace */
566 		error = mount_get_vfsops(type, type_seg, &vfsops);
567 		if (error != 0)
568 			goto done;
569 		vfsopsrele = true;
570 	}
571 
572 	/*
573 	 * We allow data to be NULL, even for userspace. Some fs's don't need
574 	 * it. The others will handle NULL.
575 	 */
576 	if (data != NULL && data_seg == UIO_USERSPACE) {
577 		if (data_len == 0) {
578 			/* No length supplied, use default for filesystem */
579 			data_len = vfsops->vfs_min_mount_data;
580 
581 			/*
582 			 * Hopefully a longer buffer won't make copyin() fail.
583 			 * For compatibility with 3.0 and earlier.
584 			 */
585 			if (flags & MNT_UPDATE
586 			    && data_len < sizeof (struct mnt_export_args30))
587 				data_len = sizeof (struct mnt_export_args30);
588 		}
589 		if ((data_len == 0) || (data_len > VFS_MAX_MOUNT_DATA)) {
590 			error = EINVAL;
591 			goto done;
592 		}
593 		alloc_sz = data_len;
594 		data_buf = kmem_alloc(alloc_sz, KM_SLEEP);
595 
596 		/* NFS needs the buffer even for mnt_getargs .... */
597 		error = copyin(data, data_buf, data_len);
598 		if (error != 0)
599 			goto done;
600 	}
601 
602 	if (flags & MNT_GETARGS) {
603 		if (data_len == 0) {
604 			error = EINVAL;
605 			goto done;
606 		}
607 		error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
608 		if (error != 0)
609 			goto done;
610 		if (data_seg == UIO_USERSPACE)
611 			error = copyout(data_buf, data, data_len);
612 		*retval = data_len;
613 	} else if (flags & MNT_UPDATE) {
614 		error = mount_update(l, vp, path, flags, data_buf, &data_len);
615 	} else {
616 		/* Locking is handled internally in mount_domount(). */
617 		KASSERT(vfsopsrele == true);
618 		error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
619 		    &data_len);
620 		vfsopsrele = false;
621 	}
622 	if (!error) {
623 		mutex_enter(&fs_klist_lock);
624 		KNOTE(&fs_klist, NOTE_SUBMIT | VQ_MOUNT);
625 		mutex_exit(&fs_klist_lock);
626 	}
627 
628     done:
629 	if (vfsopsrele)
630 		vfs_delref(vfsops);
631     	if (vp != NULL) {
632 	    	vrele(vp);
633 	}
634 	if (data_buf != data)
635 		kmem_free(data_buf, alloc_sz);
636 	return (error);
637 }
638 
639 /*
640  * Unmount a file system.
641  *
642  * Note: unmount takes a path to the vnode mounted on as argument,
643  * not special file (as before).
644  */
645 /* ARGSUSED */
646 int
647 sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval)
648 {
649 	/* {
650 		syscallarg(const char *) path;
651 		syscallarg(int) flags;
652 	} */
653 	struct vnode *vp;
654 	struct mount *mp;
655 	int error;
656 	struct pathbuf *pb;
657 	struct nameidata nd;
658 
659 	error = pathbuf_copyin(SCARG(uap, path), &pb);
660 	if (error) {
661 		return error;
662 	}
663 
664 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb);
665 	if ((error = namei(&nd)) != 0) {
666 		pathbuf_destroy(pb);
667 		return error;
668 	}
669 	vp = nd.ni_vp;
670 	pathbuf_destroy(pb);
671 
672 	mp = vp->v_mount;
673 	vfs_ref(mp);
674 	VOP_UNLOCK(vp);
675 
676 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
677 	    KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
678 	if (error) {
679 		vrele(vp);
680 		vfs_rele(mp);
681 		return (error);
682 	}
683 
684 	/*
685 	 * Don't allow unmounting the root file system.
686 	 */
687 	if (mp->mnt_flag & MNT_ROOTFS) {
688 		vrele(vp);
689 		vfs_rele(mp);
690 		return (EINVAL);
691 	}
692 
693 	/*
694 	 * Must be the root of the filesystem
695 	 */
696 	if ((vp->v_vflag & VV_ROOT) == 0) {
697 		vrele(vp);
698 		vfs_rele(mp);
699 		return (EINVAL);
700 	}
701 
702 	vrele(vp);
703 	error = dounmount(mp, SCARG(uap, flags), l);
704 	vfs_rele(mp);
705 	if (!error) {
706 		mutex_enter(&fs_klist_lock);
707 		KNOTE(&fs_klist, NOTE_SUBMIT | VQ_UNMOUNT);
708 		mutex_exit(&fs_klist_lock);
709 	}
710 	return error;
711 }
712 
713 /*
714  * Sync each mounted filesystem.
715  */
716 #ifdef DEBUG
717 int syncprt = 0;
718 struct ctldebug debug0 = { "syncprt", &syncprt };
719 #endif
720 
721 void
722 do_sys_sync(struct lwp *l)
723 {
724 	mount_iterator_t *iter;
725 	struct mount *mp;
726 	int asyncflag;
727 
728 	mountlist_iterator_init(&iter);
729 	while ((mp = mountlist_iterator_next(iter)) != NULL) {
730 		mutex_enter(mp->mnt_updating);
731 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
732 			asyncflag = mp->mnt_flag & MNT_ASYNC;
733 			mp->mnt_flag &= ~MNT_ASYNC;
734 			VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
735 			if (asyncflag)
736 				 mp->mnt_flag |= MNT_ASYNC;
737 		}
738 		mutex_exit(mp->mnt_updating);
739 	}
740 	mountlist_iterator_destroy(iter);
741 #ifdef DEBUG
742 	if (syncprt)
743 		vfs_bufstats();
744 #endif /* DEBUG */
745 }
746 
747 static bool
748 sync_vnode_filter(void *cookie, vnode_t *vp)
749 {
750 
751 	if (vp->v_numoutput > 0) {
752 		++*(int *)cookie;
753 	}
754 	return false;
755 }
756 
757 int
758 vfs_syncwait(void)
759 {
760 	int nbusy, nbusy_prev, iter;
761 	struct vnode_iterator *vniter;
762 	mount_iterator_t *mpiter;
763 	struct mount *mp;
764 
765 	for (nbusy_prev = 0, iter = 0; iter < 20;) {
766 		nbusy = 0;
767 		mountlist_iterator_init(&mpiter);
768 		while ((mp = mountlist_iterator_next(mpiter)) != NULL) {
769 			vnode_t *vp __diagused;
770 			vfs_vnode_iterator_init(mp, &vniter);
771 			vp = vfs_vnode_iterator_next(vniter,
772 			    sync_vnode_filter, &nbusy);
773 			KASSERT(vp == NULL);
774 			vfs_vnode_iterator_destroy(vniter);
775 		}
776 		mountlist_iterator_destroy(mpiter);
777 
778 		if (nbusy == 0)
779 			break;
780 		if (nbusy_prev == 0)
781 			nbusy_prev = nbusy;
782 		printf("%d ", nbusy);
783 		kpause("syncwait", false, MAX(1, hz / 25 * iter), NULL);
784 		if (nbusy >= nbusy_prev) /* we didn't flush anything */
785 			iter++;
786 		else
787 			nbusy_prev = nbusy;
788 	}
789 
790 	if (nbusy) {
791 #if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
792 		printf("giving up\nPrinting vnodes for busy buffers\n");
793 		mountlist_iterator_init(&mpiter);
794 		while ((mp = mountlist_iterator_next(mpiter)) != NULL) {
795 			vnode_t *vp;
796 			vfs_vnode_iterator_init(mp, &vniter);
797 			vp = vfs_vnode_iterator_next(vniter,
798 			    NULL, NULL);
799 			mutex_enter(vp->v_interlock);
800 			if (vp->v_numoutput > 0)
801 				vprint(NULL, vp);
802 			mutex_exit(vp->v_interlock);
803 			vrele(vp);
804 			vfs_vnode_iterator_destroy(vniter);
805 		}
806 		mountlist_iterator_destroy(mpiter);
807 #endif
808 	}
809 
810 	return nbusy;
811 }
812 
813 /* ARGSUSED */
814 int
815 sys_sync(struct lwp *l, const void *v, register_t *retval)
816 {
817 	do_sys_sync(l);
818 	return (0);
819 }
820 
821 
822 /*
823  * Access or change filesystem quotas.
824  *
825  * (this is really 14 different calls bundled into one)
826  */
827 
828 static int
829 do_sys_quotactl_stat(struct mount *mp, struct quotastat *info_u)
830 {
831 	struct quotastat info_k;
832 	int error;
833 
834 	/* ensure any padding bytes are cleared */
835 	memset(&info_k, 0, sizeof(info_k));
836 
837 	error = vfs_quotactl_stat(mp, &info_k);
838 	if (error) {
839 		return error;
840 	}
841 
842 	return copyout(&info_k, info_u, sizeof(info_k));
843 }
844 
845 static int
846 do_sys_quotactl_idtypestat(struct mount *mp, int idtype,
847     struct quotaidtypestat *info_u)
848 {
849 	struct quotaidtypestat info_k;
850 	int error;
851 
852 	/* ensure any padding bytes are cleared */
853 	memset(&info_k, 0, sizeof(info_k));
854 
855 	error = vfs_quotactl_idtypestat(mp, idtype, &info_k);
856 	if (error) {
857 		return error;
858 	}
859 
860 	return copyout(&info_k, info_u, sizeof(info_k));
861 }
862 
863 static int
864 do_sys_quotactl_objtypestat(struct mount *mp, int objtype,
865     struct quotaobjtypestat *info_u)
866 {
867 	struct quotaobjtypestat info_k;
868 	int error;
869 
870 	/* ensure any padding bytes are cleared */
871 	memset(&info_k, 0, sizeof(info_k));
872 
873 	error = vfs_quotactl_objtypestat(mp, objtype, &info_k);
874 	if (error) {
875 		return error;
876 	}
877 
878 	return copyout(&info_k, info_u, sizeof(info_k));
879 }
880 
881 static int
882 do_sys_quotactl_get(struct mount *mp, const struct quotakey *key_u,
883     struct quotaval *val_u)
884 {
885 	struct quotakey key_k;
886 	struct quotaval val_k;
887 	int error;
888 
889 	/* ensure any padding bytes are cleared */
890 	memset(&val_k, 0, sizeof(val_k));
891 
892 	error = copyin(key_u, &key_k, sizeof(key_k));
893 	if (error) {
894 		return error;
895 	}
896 
897 	error = vfs_quotactl_get(mp, &key_k, &val_k);
898 	if (error) {
899 		return error;
900 	}
901 
902 	return copyout(&val_k, val_u, sizeof(val_k));
903 }
904 
905 static int
906 do_sys_quotactl_put(struct mount *mp, const struct quotakey *key_u,
907     const struct quotaval *val_u)
908 {
909 	struct quotakey key_k;
910 	struct quotaval val_k;
911 	int error;
912 
913 	error = copyin(key_u, &key_k, sizeof(key_k));
914 	if (error) {
915 		return error;
916 	}
917 
918 	error = copyin(val_u, &val_k, sizeof(val_k));
919 	if (error) {
920 		return error;
921 	}
922 
923 	return vfs_quotactl_put(mp, &key_k, &val_k);
924 }
925 
926 static int
927 do_sys_quotactl_del(struct mount *mp, const struct quotakey *key_u)
928 {
929 	struct quotakey key_k;
930 	int error;
931 
932 	error = copyin(key_u, &key_k, sizeof(key_k));
933 	if (error) {
934 		return error;
935 	}
936 
937 	return vfs_quotactl_del(mp, &key_k);
938 }
939 
940 static int
941 do_sys_quotactl_cursoropen(struct mount *mp, struct quotakcursor *cursor_u)
942 {
943 	struct quotakcursor cursor_k;
944 	int error;
945 
946 	/* ensure any padding bytes are cleared */
947 	memset(&cursor_k, 0, sizeof(cursor_k));
948 
949 	error = vfs_quotactl_cursoropen(mp, &cursor_k);
950 	if (error) {
951 		return error;
952 	}
953 
954 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
955 }
956 
957 static int
958 do_sys_quotactl_cursorclose(struct mount *mp, struct quotakcursor *cursor_u)
959 {
960 	struct quotakcursor cursor_k;
961 	int error;
962 
963 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
964 	if (error) {
965 		return error;
966 	}
967 
968 	return vfs_quotactl_cursorclose(mp, &cursor_k);
969 }
970 
971 static int
972 do_sys_quotactl_cursorskipidtype(struct mount *mp,
973     struct quotakcursor *cursor_u, int idtype)
974 {
975 	struct quotakcursor cursor_k;
976 	int error;
977 
978 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
979 	if (error) {
980 		return error;
981 	}
982 
983 	error = vfs_quotactl_cursorskipidtype(mp, &cursor_k, idtype);
984 	if (error) {
985 		return error;
986 	}
987 
988 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
989 }
990 
991 static int
992 do_sys_quotactl_cursorget(struct mount *mp, struct quotakcursor *cursor_u,
993     struct quotakey *keys_u, struct quotaval *vals_u, unsigned maxnum,
994     unsigned *ret_u)
995 {
996 #define CGET_STACK_MAX 8
997 	struct quotakcursor cursor_k;
998 	struct quotakey stackkeys[CGET_STACK_MAX];
999 	struct quotaval stackvals[CGET_STACK_MAX];
1000 	struct quotakey *keys_k;
1001 	struct quotaval *vals_k;
1002 	unsigned ret_k;
1003 	int error;
1004 
1005 	if (maxnum > 128) {
1006 		maxnum = 128;
1007 	}
1008 
1009 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1010 	if (error) {
1011 		return error;
1012 	}
1013 
1014 	if (maxnum <= CGET_STACK_MAX) {
1015 		keys_k = stackkeys;
1016 		vals_k = stackvals;
1017 		/* ensure any padding bytes are cleared */
1018 		memset(keys_k, 0, maxnum * sizeof(keys_k[0]));
1019 		memset(vals_k, 0, maxnum * sizeof(vals_k[0]));
1020 	} else {
1021 		keys_k = kmem_zalloc(maxnum * sizeof(keys_k[0]), KM_SLEEP);
1022 		vals_k = kmem_zalloc(maxnum * sizeof(vals_k[0]), KM_SLEEP);
1023 	}
1024 
1025 	error = vfs_quotactl_cursorget(mp, &cursor_k, keys_k, vals_k, maxnum,
1026 				       &ret_k);
1027 	if (error) {
1028 		goto fail;
1029 	}
1030 
1031 	error = copyout(keys_k, keys_u, ret_k * sizeof(keys_k[0]));
1032 	if (error) {
1033 		goto fail;
1034 	}
1035 
1036 	error = copyout(vals_k, vals_u, ret_k * sizeof(vals_k[0]));
1037 	if (error) {
1038 		goto fail;
1039 	}
1040 
1041 	error = copyout(&ret_k, ret_u, sizeof(ret_k));
1042 	if (error) {
1043 		goto fail;
1044 	}
1045 
1046 	/* do last to maximize the chance of being able to recover a failure */
1047 	error = copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1048 
1049 fail:
1050 	if (keys_k != stackkeys) {
1051 		kmem_free(keys_k, maxnum * sizeof(keys_k[0]));
1052 	}
1053 	if (vals_k != stackvals) {
1054 		kmem_free(vals_k, maxnum * sizeof(vals_k[0]));
1055 	}
1056 	return error;
1057 }
1058 
1059 static int
1060 do_sys_quotactl_cursoratend(struct mount *mp, struct quotakcursor *cursor_u,
1061     int *ret_u)
1062 {
1063 	struct quotakcursor cursor_k;
1064 	int ret_k;
1065 	int error;
1066 
1067 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1068 	if (error) {
1069 		return error;
1070 	}
1071 
1072 	error = vfs_quotactl_cursoratend(mp, &cursor_k, &ret_k);
1073 	if (error) {
1074 		return error;
1075 	}
1076 
1077 	error = copyout(&ret_k, ret_u, sizeof(ret_k));
1078 	if (error) {
1079 		return error;
1080 	}
1081 
1082 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1083 }
1084 
1085 static int
1086 do_sys_quotactl_cursorrewind(struct mount *mp, struct quotakcursor *cursor_u)
1087 {
1088 	struct quotakcursor cursor_k;
1089 	int error;
1090 
1091 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1092 	if (error) {
1093 		return error;
1094 	}
1095 
1096 	error = vfs_quotactl_cursorrewind(mp, &cursor_k);
1097 	if (error) {
1098 		return error;
1099 	}
1100 
1101 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1102 }
1103 
1104 static int
1105 do_sys_quotactl_quotaon(struct mount *mp, int idtype, const char *path_u)
1106 {
1107 	char *path_k;
1108 	int error;
1109 
1110 	/* XXX this should probably be a struct pathbuf */
1111 	path_k = PNBUF_GET();
1112 	error = copyin(path_u, path_k, PATH_MAX);
1113 	if (error) {
1114 		PNBUF_PUT(path_k);
1115 		return error;
1116 	}
1117 
1118 	error = vfs_quotactl_quotaon(mp, idtype, path_k);
1119 
1120 	PNBUF_PUT(path_k);
1121 	return error;
1122 }
1123 
/*
 * QUOTACTL_QUOTAOFF: turn quotas off for id type idtype on mp.  Nothing
 * needs to be copied to or from user space.
 */
static int
do_sys_quotactl_quotaoff(struct mount *mp, int idtype)
{
	int error;

	error = vfs_quotactl_quotaoff(mp, idtype);

	return error;
}
1129 
1130 int
1131 do_sys_quotactl(const char *path_u, const struct quotactl_args *args)
1132 {
1133 	struct mount *mp;
1134 	struct vnode *vp;
1135 	int error;
1136 
1137 	error = namei_simple_user(path_u, NSM_FOLLOW_TRYEMULROOT, &vp);
1138 	if (error != 0)
1139 		return (error);
1140 	mp = vp->v_mount;
1141 
1142 	switch (args->qc_op) {
1143 	    case QUOTACTL_STAT:
1144 		error = do_sys_quotactl_stat(mp, args->u.stat.qc_info);
1145 		break;
1146 	    case QUOTACTL_IDTYPESTAT:
1147 		error = do_sys_quotactl_idtypestat(mp,
1148 				args->u.idtypestat.qc_idtype,
1149 				args->u.idtypestat.qc_info);
1150 		break;
1151 	    case QUOTACTL_OBJTYPESTAT:
1152 		error = do_sys_quotactl_objtypestat(mp,
1153 				args->u.objtypestat.qc_objtype,
1154 				args->u.objtypestat.qc_info);
1155 		break;
1156 	    case QUOTACTL_GET:
1157 		error = do_sys_quotactl_get(mp,
1158 				args->u.get.qc_key,
1159 				args->u.get.qc_val);
1160 		break;
1161 	    case QUOTACTL_PUT:
1162 		error = do_sys_quotactl_put(mp,
1163 				args->u.put.qc_key,
1164 				args->u.put.qc_val);
1165 		break;
1166 	    case QUOTACTL_DEL:
1167 		error = do_sys_quotactl_del(mp, args->u.del.qc_key);
1168 		break;
1169 	    case QUOTACTL_CURSOROPEN:
1170 		error = do_sys_quotactl_cursoropen(mp,
1171 				args->u.cursoropen.qc_cursor);
1172 		break;
1173 	    case QUOTACTL_CURSORCLOSE:
1174 		error = do_sys_quotactl_cursorclose(mp,
1175 				args->u.cursorclose.qc_cursor);
1176 		break;
1177 	    case QUOTACTL_CURSORSKIPIDTYPE:
1178 		error = do_sys_quotactl_cursorskipidtype(mp,
1179 				args->u.cursorskipidtype.qc_cursor,
1180 				args->u.cursorskipidtype.qc_idtype);
1181 		break;
1182 	    case QUOTACTL_CURSORGET:
1183 		error = do_sys_quotactl_cursorget(mp,
1184 				args->u.cursorget.qc_cursor,
1185 				args->u.cursorget.qc_keys,
1186 				args->u.cursorget.qc_vals,
1187 				args->u.cursorget.qc_maxnum,
1188 				args->u.cursorget.qc_ret);
1189 		break;
1190 	    case QUOTACTL_CURSORATEND:
1191 		error = do_sys_quotactl_cursoratend(mp,
1192 				args->u.cursoratend.qc_cursor,
1193 				args->u.cursoratend.qc_ret);
1194 		break;
1195 	    case QUOTACTL_CURSORREWIND:
1196 		error = do_sys_quotactl_cursorrewind(mp,
1197 				args->u.cursorrewind.qc_cursor);
1198 		break;
1199 	    case QUOTACTL_QUOTAON:
1200 		error = do_sys_quotactl_quotaon(mp,
1201 				args->u.quotaon.qc_idtype,
1202 				args->u.quotaon.qc_quotafile);
1203 		break;
1204 	    case QUOTACTL_QUOTAOFF:
1205 		error = do_sys_quotactl_quotaoff(mp,
1206 				args->u.quotaoff.qc_idtype);
1207 		break;
1208 	    default:
1209 		error = EINVAL;
1210 		break;
1211 	}
1212 
1213 	vrele(vp);
1214 	return error;
1215 }
1216 
1217 /* ARGSUSED */
1218 int
1219 sys___quotactl(struct lwp *l, const struct sys___quotactl_args *uap,
1220     register_t *retval)
1221 {
1222 	/* {
1223 		syscallarg(const char *) path;
1224 		syscallarg(struct quotactl_args *) args;
1225 	} */
1226 	struct quotactl_args args;
1227 	int error;
1228 
1229 	error = copyin(SCARG(uap, args), &args, sizeof(args));
1230 	if (error) {
1231 		return error;
1232 	}
1233 
1234 	return do_sys_quotactl(SCARG(uap, path), &args);
1235 }
1236 
1237 int
1238 dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
1239     int root)
1240 {
1241 	struct cwdinfo *cwdi = l->l_proc->p_cwdi;
1242 	bool chrooted;
1243 	int error = 0;
1244 
1245 	KASSERT(l == curlwp);
1246 
1247 	/*
1248 	 * This is safe unlocked.  cwdi_rdir never goes non-NULL -> NULL,
1249 	 * since it would imply chroots can be escaped.  Just make sure this
1250 	 * routine is self-consistent.
1251 	 */
1252 	chrooted = (atomic_load_relaxed(&cwdi->cwdi_rdir) != NULL);
1253 
1254 	/*
1255 	 * If MNT_NOWAIT or MNT_LAZY is specified, do not
1256 	 * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
1257 	 * overrides MNT_NOWAIT.
1258 	 */
1259 	if (flags == MNT_NOWAIT	|| flags == MNT_LAZY ||
1260 	    (flags != MNT_WAIT && flags != 0)) {
1261 		memcpy(sp, &mp->mnt_stat, sizeof(*sp));
1262 	} else {
1263 		/* Get the filesystem stats now */
1264 		memset(sp, 0, sizeof(*sp));
1265 		if ((error = VFS_STATVFS(mp, sp)) != 0)
1266 			return error;
1267 		if (!chrooted)
1268 			(void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
1269 	}
1270 
1271 	if (chrooted) {
1272 		size_t len;
1273 		char *bp;
1274 		char c;
1275 		char *path = PNBUF_GET();
1276 
1277 		bp = path + MAXPATHLEN;
1278 		*--bp = '\0';
1279 		rw_enter(&cwdi->cwdi_lock, RW_READER);
1280 		error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
1281 		    MAXPATHLEN / 2, 0, l);
1282 		rw_exit(&cwdi->cwdi_lock);
1283 		if (error) {
1284 			PNBUF_PUT(path);
1285 			return error;
1286 		}
1287 		len = strlen(bp);
1288 		if (len != 1) {
1289 			/*
1290 			 * for mount points that are below our root, we can see
1291 			 * them, so we fix up the pathname and return them. The
1292 			 * rest we cannot see, so we don't allow viewing the
1293 			 * data.
1294 			 */
1295 			if (strncmp(bp, sp->f_mntonname, len) == 0 &&
1296 			    ((c = sp->f_mntonname[len]) == '/' || c == '\0')) {
1297 				(void)strlcpy(sp->f_mntonname,
1298 				    c == '\0' ? "/" : &sp->f_mntonname[len],
1299 				    sizeof(sp->f_mntonname));
1300 			} else {
1301 				if (root)
1302 					(void)strlcpy(sp->f_mntonname, "/",
1303 					    sizeof(sp->f_mntonname));
1304 				else
1305 					error = EPERM;
1306 			}
1307 		}
1308 		PNBUF_PUT(path);
1309 	}
1310 	sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
1311 	return error;
1312 }
1313 
1314 /*
1315  * Get filesystem statistics by path.
1316  */
1317 int
1318 do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
1319 {
1320 	struct mount *mp;
1321 	int error;
1322 	struct vnode *vp;
1323 
1324 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
1325 	if (error != 0)
1326 		return error;
1327 	mp = vp->v_mount;
1328 	error = dostatvfs(mp, sb, l, flags, 1);
1329 	vrele(vp);
1330 	return error;
1331 }
1332 
1333 /* ARGSUSED */
1334 int
1335 sys___statvfs190(struct lwp *l, const struct sys___statvfs190_args *uap, register_t *retval)
1336 {
1337 	/* {
1338 		syscallarg(const char *) path;
1339 		syscallarg(struct statvfs *) buf;
1340 		syscallarg(int) flags;
1341 	} */
1342 	struct statvfs *sb;
1343 	int error;
1344 
1345 	sb = STATVFSBUF_GET();
1346 	error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
1347 	if (error == 0)
1348 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1349 	STATVFSBUF_PUT(sb);
1350 	return error;
1351 }
1352 
1353 /*
1354  * Get filesystem statistics by fd.
1355  */
1356 int
1357 do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
1358 {
1359 	file_t *fp;
1360 	struct mount *mp;
1361 	int error;
1362 
1363 	/* fd_getvnode() will use the descriptor for us */
1364 	if ((error = fd_getvnode(fd, &fp)) != 0)
1365 		return (error);
1366 	mp = fp->f_vnode->v_mount;
1367 	error = dostatvfs(mp, sb, curlwp, flags, 1);
1368 	fd_putfile(fd);
1369 	return error;
1370 }
1371 
1372 /* ARGSUSED */
1373 int
1374 sys___fstatvfs190(struct lwp *l, const struct sys___fstatvfs190_args *uap, register_t *retval)
1375 {
1376 	/* {
1377 		syscallarg(int) fd;
1378 		syscallarg(struct statvfs *) buf;
1379 		syscallarg(int) flags;
1380 	} */
1381 	struct statvfs *sb;
1382 	int error;
1383 
1384 	sb = STATVFSBUF_GET();
1385 	error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
1386 	if (error == 0)
1387 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1388 	STATVFSBUF_PUT(sb);
1389 	return error;
1390 }
1391 
1392 
1393 /*
1394  * Get statistics on all filesystems.
1395  */
1396 int
1397 do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
1398     int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
1399     register_t *retval)
1400 {
1401 	int root = 0;
1402 	mount_iterator_t *iter;
1403 	struct proc *p = l->l_proc;
1404 	struct mount *mp;
1405 	struct statvfs *sb;
1406 	size_t count, maxcount;
1407 	int error = 0;
1408 
1409 	sb = STATVFSBUF_GET();
1410 	maxcount = bufsize / entry_sz;
1411 	count = 0;
1412 	mountlist_iterator_init(&iter);
1413 	while ((mp = mountlist_iterator_next(iter)) != NULL) {
1414 		if (sfsp && count < maxcount) {
1415 			error = dostatvfs(mp, sb, l, flags, 0);
1416 			if (error) {
1417 				error = 0;
1418 				continue;
1419 			}
1420 			error = copyfn(sb, sfsp, entry_sz);
1421 			if (error)
1422 				goto out;
1423 			sfsp = (char *)sfsp + entry_sz;
1424 			root |= strcmp(sb->f_mntonname, "/") == 0;
1425 		}
1426 		count++;
1427 	}
1428 
1429 	if (root == 0 && p->p_cwdi->cwdi_rdir) {
1430 		/*
1431 		 * fake a root entry
1432 		 */
1433 		error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
1434 		    sb, l, flags, 1);
1435 		if (error != 0)
1436 			goto out;
1437 		if (sfsp) {
1438 			error = copyfn(sb, sfsp, entry_sz);
1439 			if (error != 0)
1440 				goto out;
1441 		}
1442 		count++;
1443 	}
1444 	if (sfsp && count > maxcount)
1445 		*retval = maxcount;
1446 	else
1447 		*retval = count;
1448 out:
1449 	mountlist_iterator_destroy(iter);
1450 	STATVFSBUF_PUT(sb);
1451 	return error;
1452 }
1453 
1454 int
1455 sys___getvfsstat90(struct lwp *l, const struct sys___getvfsstat90_args *uap,
1456     register_t *retval)
1457 {
1458 	/* {
1459 		syscallarg(struct statvfs *) buf;
1460 		syscallarg(size_t) bufsize;
1461 		syscallarg(int) flags;
1462 	} */
1463 
1464 	return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
1465 	    SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
1466 }
1467 
1468 /*
1469  * Change current working directory to a given file descriptor.
1470  */
1471 int
1472 do_sys_fchdir(struct lwp *l, int fd, register_t *retval)
1473 {
1474 	struct proc *p = l->l_proc;
1475 	struct cwdinfo *cwdi;
1476 	struct vnode *vp, *tdp;
1477 	struct mount *mp;
1478 	file_t *fp;
1479 	int error;
1480 
1481 	/* fd_getvnode() will use the descriptor for us */
1482 	if ((error = fd_getvnode(fd, &fp)) != 0)
1483 		return error;
1484 	vp = fp->f_vnode;
1485 
1486 	vref(vp);
1487 	vn_lock(vp, LK_SHARED | LK_RETRY);
1488 	if (vp->v_type != VDIR)
1489 		error = ENOTDIR;
1490 	else
1491 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1492 	if (error) {
1493 		vput(vp);
1494 		goto out;
1495 	}
1496 	while ((mp = vp->v_mountedhere) != NULL) {
1497 		error = vfs_busy(mp);
1498 		vput(vp);
1499 		if (error != 0)
1500 			goto out;
1501 		error = VFS_ROOT(mp, LK_SHARED, &tdp);
1502 		vfs_unbusy(mp);
1503 		if (error)
1504 			goto out;
1505 		vp = tdp;
1506 	}
1507 	VOP_UNLOCK(vp);
1508 
1509 	/*
1510 	 * Disallow changing to a directory not under the process's
1511 	 * current root directory (if there is one).
1512 	 */
1513 	cwdi = p->p_cwdi;
1514 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1515 	if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
1516 		vrele(vp);
1517 		error = EPERM;	/* operation not permitted */
1518 	} else {
1519 		vrele(cwdi->cwdi_cdir);
1520 		cwdi->cwdi_cdir = vp;
1521 	}
1522 	rw_exit(&cwdi->cwdi_lock);
1523 
1524 out:
1525 	fd_putfile(fd);
1526 	return error;
1527 }
1528 
1529 /*
1530  * Change current working directory to a given file descriptor.
1531  */
1532 /* ARGSUSED */
1533 int
1534 sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval)
1535 {
1536 	/* {
1537 		syscallarg(int) fd;
1538 	} */
1539 	return do_sys_fchdir(l, SCARG(uap, fd), retval);
1540 }
1541 
1542 /*
1543  * Change this process's notion of the root directory to a given file
1544  * descriptor.
1545  */
1546 int
1547 sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval)
1548 {
1549 	struct vnode	*vp;
1550 	file_t	*fp;
1551 	int		 error, fd = SCARG(uap, fd);
1552 
1553 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1554  	    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
1555 		return error;
1556 	/* fd_getvnode() will use the descriptor for us */
1557 	if ((error = fd_getvnode(fd, &fp)) != 0)
1558 		return error;
1559 	vp = fp->f_vnode;
1560 	vn_lock(vp, LK_SHARED | LK_RETRY);
1561 	if (vp->v_type != VDIR)
1562 		error = ENOTDIR;
1563 	else
1564 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1565 	VOP_UNLOCK(vp);
1566 	if (error)
1567 		goto out;
1568 	vref(vp);
1569 	change_root(vp);
1570 
1571  out:
1572 	fd_putfile(fd);
1573 	return (error);
1574 }
1575 
1576 /*
1577  * Change current working directory (``.'').
1578  */
1579 int
1580 do_sys_chdir(struct lwp *l, const char *path, enum uio_seg seg,
1581     register_t *retval)
1582 {
1583 	struct proc *p = l->l_proc;
1584 	struct cwdinfo * cwdi;
1585 	int error;
1586 	struct vnode *vp;
1587 
1588 	if ((error = chdir_lookup(path, seg, &vp, l)) != 0)
1589 		return error;
1590 	cwdi = p->p_cwdi;
1591 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1592 	vrele(cwdi->cwdi_cdir);
1593 	cwdi->cwdi_cdir = vp;
1594 	rw_exit(&cwdi->cwdi_lock);
1595 	return 0;
1596 }
1597 
1598 /*
1599  * Change current working directory (``.'').
1600  */
1601 /* ARGSUSED */
1602 int
1603 sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
1604 {
1605 	/* {
1606 		syscallarg(const char *) path;
1607 	} */
1608 	return do_sys_chdir(l, SCARG(uap, path), UIO_USERSPACE, retval);
1609 }
1610 
1611 /*
1612  * Change notion of root (``/'') directory.
1613  */
1614 /* ARGSUSED */
1615 int
1616 sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval)
1617 {
1618 	/* {
1619 		syscallarg(const char *) path;
1620 	} */
1621 	int error;
1622 	struct vnode *vp;
1623 
1624 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1625 	    KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
1626 		return (error);
1627 
1628 	error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE, &vp, l);
1629 	if (error == 0)
1630 		change_root(vp);
1631 	return error;
1632 }
1633 
1634 /*
1635  * Common routine for chroot and fchroot.
1636  * NB: callers need to properly authorize the change root operation.
1637  */
1638 void
1639 change_root(struct vnode *vp)
1640 {
1641 	kauth_cred_t ncred;
1642 	struct lwp *l = curlwp;
1643 	struct proc *p = l->l_proc;
1644 	struct cwdinfo *cwdi = p->p_cwdi;
1645 
1646 	ncred = kauth_cred_alloc();
1647 
1648 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1649 	if (cwdi->cwdi_rdir != NULL)
1650 		vrele(cwdi->cwdi_rdir);
1651 	cwdi->cwdi_rdir = vp;
1652 
1653 	/*
1654 	 * Prevent escaping from chroot by putting the root under
1655 	 * the working directory.  Silently chdir to / if we aren't
1656 	 * already there.
1657 	 */
1658 	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1659 		/*
1660 		 * XXX would be more failsafe to change directory to a
1661 		 * deadfs node here instead
1662 		 */
1663 		vrele(cwdi->cwdi_cdir);
1664 		vref(vp);
1665 		cwdi->cwdi_cdir = vp;
1666 	}
1667 	rw_exit(&cwdi->cwdi_lock);
1668 
1669 	/* Get a write lock on the process credential. */
1670 	proc_crmod_enter();
1671 
1672 	kauth_cred_clone(p->p_cred, ncred);
1673 	kauth_proc_chroot(ncred, p->p_cwdi);
1674 
1675 	/* Broadcast our credentials to the process and other LWPs. */
1676  	proc_crmod_leave(ncred, p->p_cred, true);
1677 }
1678 
1679 /*
1680  * Common routine for chroot and chdir.
1681  * XXX "where" should be enum uio_seg
1682  */
1683 int
1684 chdir_lookup(const char *path, int where, struct vnode **vpp, struct lwp *l)
1685 {
1686 	struct pathbuf *pb;
1687 	struct nameidata nd;
1688 	int error;
1689 
1690 	error = pathbuf_maybe_copyin(path, where, &pb);
1691 	if (error) {
1692 		return error;
1693 	}
1694 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb);
1695 	if ((error = namei(&nd)) != 0) {
1696 		pathbuf_destroy(pb);
1697 		return error;
1698 	}
1699 	*vpp = nd.ni_vp;
1700 	pathbuf_destroy(pb);
1701 
1702 	if ((*vpp)->v_type != VDIR)
1703 		error = ENOTDIR;
1704 	else
1705 		error = VOP_ACCESS(*vpp, VEXEC, l->l_cred);
1706 
1707 	if (error)
1708 		vput(*vpp);
1709 	else
1710 		VOP_UNLOCK(*vpp);
1711 	return (error);
1712 }
1713 
1714 /*
1715  * Internals of sys_open - path has already been converted into a pathbuf
1716  * (so we can easily reuse this function from other parts of the kernel,
1717  * like posix_spawn post-processing).
1718  */
1719 int
1720 do_open(lwp_t *l, struct vnode *dvp, struct pathbuf *pb, int open_flags,
1721 	int open_mode, int *fd)
1722 {
1723 	struct proc *p = l->l_proc;
1724 	struct cwdinfo *cwdi = p->p_cwdi;
1725 	file_t *fp;
1726 	struct vnode *vp;
1727 	int dupfd;
1728 	bool dupfd_move;
1729 	int flags, cmode;
1730 	int indx, error;
1731 
1732 	if (open_flags & O_SEARCH) {
1733 		open_flags &= ~(int)O_SEARCH;
1734 	}
1735 
1736 	/*
1737 	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
1738 	 * may be specified.
1739 	 */
1740 	if ((open_flags & O_EXEC) && (open_flags & O_ACCMODE))
1741 		return EINVAL;
1742 
1743 	flags = FFLAGS(open_flags);
1744 	if ((flags & (FREAD | FWRITE)) == 0)
1745 		return EINVAL;
1746 
1747 	if ((error = fd_allocfile(&fp, &indx)) != 0) {
1748 		return error;
1749 	}
1750 
1751 	/* We're going to read cwdi->cwdi_cmask unlocked here. */
1752 	cmode = ((open_mode &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1753 
1754 	error = vn_open(dvp, pb, TRYEMULROOT, flags, cmode,
1755 	    &vp, &dupfd_move, &dupfd);
1756 	if (error != 0) {
1757 		fd_abort(p, fp, indx);
1758 		return error;
1759 	}
1760 
1761 	if (vp == NULL) {
1762 		fd_abort(p, fp, indx);
1763 		error = fd_dupopen(dupfd, dupfd_move, flags, &indx);
1764 		if (error)
1765 			return error;
1766 		*fd = indx;
1767 	} else {
1768 		error = open_setfp(l, fp, vp, indx, flags);
1769 		if (error)
1770 			return error;
1771 		VOP_UNLOCK(vp);
1772 		*fd = indx;
1773 		fd_affix(p, fp, indx);
1774 	}
1775 
1776 	return 0;
1777 }
1778 
1779 int
1780 fd_open(const char *path, int open_flags, int open_mode, int *fd)
1781 {
1782 	struct pathbuf *pb;
1783 	int error, oflags;
1784 
1785 	oflags = FFLAGS(open_flags);
1786 	if ((oflags & (FREAD | FWRITE)) == 0)
1787 		return EINVAL;
1788 
1789 	pb = pathbuf_create(path);
1790 	if (pb == NULL)
1791 		return ENOMEM;
1792 
1793 	error = do_open(curlwp, NULL, pb, open_flags, open_mode, fd);
1794 	pathbuf_destroy(pb);
1795 
1796 	return error;
1797 }
1798 
1799 static int
1800 do_sys_openat(lwp_t *l, int fdat, const char *path, int flags,
1801     int mode, int *fd)
1802 {
1803 	file_t *dfp = NULL;
1804 	struct vnode *dvp = NULL;
1805 	struct pathbuf *pb;
1806 	const char *pathstring = NULL;
1807 	int error;
1808 
1809 	if (path == NULL) {
1810 		MODULE_HOOK_CALL(vfs_openat_10_hook, (&pb), enosys(), error);
1811 		if (error == ENOSYS)
1812 			goto no_compat;
1813 		if (error)
1814 			return error;
1815 	} else {
1816 no_compat:
1817 		error = pathbuf_copyin(path, &pb);
1818 		if (error)
1819 			return error;
1820 	}
1821 
1822 	pathstring = pathbuf_stringcopy_get(pb);
1823 
1824 	/*
1825 	 * fdat is ignored if:
1826 	 * 1) if fdat is AT_FDCWD, which means use current directory as base.
1827 	 * 2) if path is absolute, then fdat is useless.
1828 	 */
1829 	if (fdat != AT_FDCWD && pathstring[0] != '/') {
1830 		/* fd_getvnode() will use the descriptor for us */
1831 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
1832 			goto out;
1833 
1834 		dvp = dfp->f_vnode;
1835 	}
1836 
1837 	error = do_open(l, dvp, pb, flags, mode, fd);
1838 
1839 	if (dfp != NULL)
1840 		fd_putfile(fdat);
1841 out:
1842 	pathbuf_stringcopy_put(pb, pathstring);
1843 	pathbuf_destroy(pb);
1844 	return error;
1845 }
1846 
1847 int
1848 sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1849 {
1850 	/* {
1851 		syscallarg(const char *) path;
1852 		syscallarg(int) flags;
1853 		syscallarg(int) mode;
1854 	} */
1855 	int error;
1856 	int fd;
1857 
1858 	error = do_sys_openat(l, AT_FDCWD, SCARG(uap, path),
1859 			      SCARG(uap, flags), SCARG(uap, mode), &fd);
1860 
1861 	if (error == 0)
1862 		*retval = fd;
1863 
1864 	return error;
1865 }
1866 
1867 int
1868 sys_openat(struct lwp *l, const struct sys_openat_args *uap, register_t *retval)
1869 {
1870 	/* {
1871 		syscallarg(int) fd;
1872 		syscallarg(const char *) path;
1873 		syscallarg(int) oflags;
1874 		syscallarg(int) mode;
1875 	} */
1876 	int error;
1877 	int fd;
1878 
1879 	error = do_sys_openat(l, SCARG(uap, fd), SCARG(uap, path),
1880 			      SCARG(uap, oflags), SCARG(uap, mode), &fd);
1881 
1882 	if (error == 0)
1883 		*retval = fd;
1884 
1885 	return error;
1886 }
1887 
1888 static void
1889 vfs__fhfree(fhandle_t *fhp)
1890 {
1891 	size_t fhsize;
1892 
1893 	fhsize = FHANDLE_SIZE(fhp);
1894 	kmem_free(fhp, fhsize);
1895 }
1896 
1897 /*
1898  * vfs_composefh: compose a filehandle.
1899  */
1900 
1901 int
1902 vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1903 {
1904 	struct mount *mp;
1905 	struct fid *fidp;
1906 	int error;
1907 	size_t needfhsize;
1908 	size_t fidsize;
1909 
1910 	mp = vp->v_mount;
1911 	fidp = NULL;
1912 	if (*fh_size < FHANDLE_SIZE_MIN) {
1913 		fidsize = 0;
1914 	} else {
1915 		fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1916 		if (fhp != NULL) {
1917 			memset(fhp, 0, *fh_size);
1918 			fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1919 			fidp = &fhp->fh_fid;
1920 		}
1921 	}
1922 	error = VFS_VPTOFH(vp, fidp, &fidsize);
1923 	needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1924 	if (error == 0 && *fh_size < needfhsize) {
1925 		error = E2BIG;
1926 	}
1927 	*fh_size = needfhsize;
1928 	return error;
1929 }
1930 
1931 int
1932 vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1933 {
1934 	struct mount *mp;
1935 	fhandle_t *fhp;
1936 	size_t fhsize;
1937 	size_t fidsize;
1938 	int error;
1939 
1940 	mp = vp->v_mount;
1941 	fidsize = 0;
1942 	error = VFS_VPTOFH(vp, NULL, &fidsize);
1943 	KASSERT(error != 0);
1944 	if (error != E2BIG) {
1945 		goto out;
1946 	}
1947 	fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1948 	fhp = kmem_zalloc(fhsize, KM_SLEEP);
1949 	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1950 	error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1951 	if (error == 0) {
1952 		KASSERT(FHANDLE_SIZE(fhp) == fhsize);
1953 		KASSERT(FHANDLE_FILEID(fhp)->fid_len == fidsize);
1954 		*fhpp = fhp;
1955 	} else {
1956 		kmem_free(fhp, fhsize);
1957 	}
1958 out:
1959 	return error;
1960 }
1961 
1962 void
1963 vfs_composefh_free(fhandle_t *fhp)
1964 {
1965 
1966 	vfs__fhfree(fhp);
1967 }
1968 
1969 /*
1970  * vfs_fhtovp: lookup a vnode by a filehandle.
1971  */
1972 
1973 int
1974 vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
1975 {
1976 	struct mount *mp;
1977 	int error;
1978 
1979 	*vpp = NULL;
1980 	mp = vfs_getvfs(FHANDLE_FSID(fhp));
1981 	if (mp == NULL) {
1982 		error = ESTALE;
1983 		goto out;
1984 	}
1985 	if (mp->mnt_op->vfs_fhtovp == NULL) {
1986 		error = EOPNOTSUPP;
1987 		goto out;
1988 	}
1989 	error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), LK_EXCLUSIVE, vpp);
1990 out:
1991 	return error;
1992 }
1993 
1994 /*
1995  * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
1996  * the needed size.
1997  */
1998 
1999 int
2000 vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
2001 {
2002 	fhandle_t *fhp;
2003 	int error;
2004 
2005 	if (fhsize > FHANDLE_SIZE_MAX) {
2006 		return EINVAL;
2007 	}
2008 	if (fhsize < FHANDLE_SIZE_MIN) {
2009 		return EINVAL;
2010 	}
2011 again:
2012 	fhp = kmem_alloc(fhsize, KM_SLEEP);
2013 	error = copyin(ufhp, fhp, fhsize);
2014 	if (error == 0) {
2015 		/* XXX this check shouldn't be here */
2016 		if (FHANDLE_SIZE(fhp) == fhsize) {
2017 			*fhpp = fhp;
2018 			return 0;
2019 		} else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
2020 			/*
2021 			 * a kludge for nfsv2 padded handles.
2022 			 */
2023 			size_t sz;
2024 
2025 			sz = FHANDLE_SIZE(fhp);
2026 			kmem_free(fhp, fhsize);
2027 			fhsize = sz;
2028 			goto again;
2029 		} else {
2030 			/*
2031 			 * userland told us wrong size.
2032 			 */
2033 		    	error = EINVAL;
2034 		}
2035 	}
2036 	kmem_free(fhp, fhsize);
2037 	return error;
2038 }
2039 
2040 void
2041 vfs_copyinfh_free(fhandle_t *fhp)
2042 {
2043 
2044 	vfs__fhfree(fhp);
2045 }
2046 
2047 /*
2048  * Get file handle system call
2049  */
2050 int
2051 sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval)
2052 {
2053 	/* {
2054 		syscallarg(char *) fname;
2055 		syscallarg(fhandle_t *) fhp;
2056 		syscallarg(size_t *) fh_size;
2057 	} */
2058 	struct vnode *vp;
2059 	fhandle_t *fh;
2060 	int error;
2061 	struct pathbuf *pb;
2062 	struct nameidata nd;
2063 	size_t sz;
2064 	size_t usz;
2065 
2066 	/*
2067 	 * Must be super user
2068 	 */
2069 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2070 	    0, NULL, NULL, NULL);
2071 	if (error)
2072 		return (error);
2073 
2074 	error = pathbuf_copyin(SCARG(uap, fname), &pb);
2075 	if (error) {
2076 		return error;
2077 	}
2078 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
2079 	error = namei(&nd);
2080 	if (error) {
2081 		pathbuf_destroy(pb);
2082 		return error;
2083 	}
2084 	vp = nd.ni_vp;
2085 	pathbuf_destroy(pb);
2086 
2087 	error = vfs_composefh_alloc(vp, &fh);
2088 	vput(vp);
2089 	if (error != 0) {
2090 		return error;
2091 	}
2092 	error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
2093 	if (error != 0) {
2094 		goto out;
2095 	}
2096 	sz = FHANDLE_SIZE(fh);
2097 	error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
2098 	if (error != 0) {
2099 		goto out;
2100 	}
2101 	if (usz >= sz) {
2102 		error = copyout(fh, SCARG(uap, fhp), sz);
2103 	} else {
2104 		error = E2BIG;
2105 	}
2106 out:
2107 	vfs_composefh_free(fh);
2108 	return (error);
2109 }
2110 
2111 /*
2112  * Open a file given a file handle.
2113  *
2114  * Check permissions, allocate an open file structure,
2115  * and call the device open routine if any.
2116  */
2117 
2118 int
2119 dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
2120     register_t *retval)
2121 {
2122 	file_t *fp;
2123 	struct vnode *vp = NULL;
2124 	kauth_cred_t cred = l->l_cred;
2125 	file_t *nfp;
2126 	int indx, error;
2127 	struct vattr va;
2128 	fhandle_t *fh;
2129 	int flags;
2130 	proc_t *p;
2131 
2132 	p = curproc;
2133 
2134 	/*
2135 	 * Must be super user
2136 	 */
2137 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2138 	    0, NULL, NULL, NULL)))
2139 		return (error);
2140 
2141 	if (oflags & O_SEARCH) {
2142 		oflags &= ~(int)O_SEARCH;
2143 	}
2144 
2145 	flags = FFLAGS(oflags);
2146 	if ((flags & (FREAD | FWRITE)) == 0)
2147 		return (EINVAL);
2148 	if ((flags & O_CREAT))
2149 		return (EINVAL);
2150 	if ((error = fd_allocfile(&nfp, &indx)) != 0)
2151 		return (error);
2152 	fp = nfp;
2153 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2154 	if (error != 0) {
2155 		goto bad;
2156 	}
2157 	error = vfs_fhtovp(fh, &vp);
2158 	vfs_copyinfh_free(fh);
2159 	if (error != 0) {
2160 		goto bad;
2161 	}
2162 
2163 	/* Now do an effective vn_open */
2164 
2165 	if (vp->v_type == VSOCK) {
2166 		error = EOPNOTSUPP;
2167 		goto bad;
2168 	}
2169 	error = vn_openchk(vp, cred, flags);
2170 	if (error != 0)
2171 		goto bad;
2172 	if (flags & O_TRUNC) {
2173 		VOP_UNLOCK(vp);			/* XXX */
2174 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
2175 		vattr_null(&va);
2176 		va.va_size = 0;
2177 		error = VOP_SETATTR(vp, &va, cred);
2178 		if (error)
2179 			goto bad;
2180 	}
2181 	if ((error = VOP_OPEN(vp, flags, cred)) != 0)
2182 		goto bad;
2183 	if (flags & FWRITE) {
2184 		mutex_enter(vp->v_interlock);
2185 		vp->v_writecount++;
2186 		mutex_exit(vp->v_interlock);
2187 	}
2188 
2189 	/* done with modified vn_open, now finish what sys_open does. */
2190 	if ((error = open_setfp(l, fp, vp, indx, flags)))
2191 		return error;
2192 
2193 	VOP_UNLOCK(vp);
2194 	*retval = indx;
2195 	fd_affix(p, fp, indx);
2196 	return (0);
2197 
2198 bad:
2199 	fd_abort(p, fp, indx);
2200 	if (vp != NULL)
2201 		vput(vp);
2202 	if (error == EDUPFD || error == EMOVEFD) {
2203 		/* XXX should probably close curlwp->l_dupfd */
2204 		error = EOPNOTSUPP;
2205 	}
2206 	return (error);
2207 }
2208 
2209 int
2210 sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval)
2211 {
2212 	/* {
2213 		syscallarg(const void *) fhp;
2214 		syscallarg(size_t) fh_size;
2215 		syscallarg(int) flags;
2216 	} */
2217 
2218 	return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
2219 	    SCARG(uap, flags), retval);
2220 }
2221 
2222 int
2223 do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
2224 {
2225 	int error;
2226 	fhandle_t *fh;
2227 	struct vnode *vp;
2228 
2229 	/*
2230 	 * Must be super user
2231 	 */
2232 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2233 	    0, NULL, NULL, NULL)))
2234 		return (error);
2235 
2236 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2237 	if (error != 0)
2238 		return error;
2239 
2240 	error = vfs_fhtovp(fh, &vp);
2241 	vfs_copyinfh_free(fh);
2242 	if (error != 0)
2243 		return error;
2244 
2245 	error = vn_stat(vp, sb);
2246 	vput(vp);
2247 	return error;
2248 }
2249 
2250 
2251 /* ARGSUSED */
2252 int
2253 sys___fhstat50(struct lwp *l, const struct sys___fhstat50_args *uap, register_t *retval)
2254 {
2255 	/* {
2256 		syscallarg(const void *) fhp;
2257 		syscallarg(size_t) fh_size;
2258 		syscallarg(struct stat *) sb;
2259 	} */
2260 	struct stat sb;
2261 	int error;
2262 
2263 	error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
2264 	if (error)
2265 		return error;
2266 	return copyout(&sb, SCARG(uap, sb), sizeof(sb));
2267 }
2268 
2269 int
2270 do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb,
2271     int flags)
2272 {
2273 	fhandle_t *fh;
2274 	struct mount *mp;
2275 	struct vnode *vp;
2276 	int error;
2277 
2278 	/*
2279 	 * Must be super user
2280 	 */
2281 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2282 	    0, NULL, NULL, NULL)))
2283 		return error;
2284 
2285 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2286 	if (error != 0)
2287 		return error;
2288 
2289 	error = vfs_fhtovp(fh, &vp);
2290 	vfs_copyinfh_free(fh);
2291 	if (error != 0)
2292 		return error;
2293 
2294 	mp = vp->v_mount;
2295 	error = dostatvfs(mp, sb, l, flags, 1);
2296 	vput(vp);
2297 	return error;
2298 }
2299 
2300 /* ARGSUSED */
2301 int
2302 sys___fhstatvfs190(struct lwp *l, const struct sys___fhstatvfs190_args *uap, register_t *retval)
2303 {
2304 	/* {
2305 		syscallarg(const void *) fhp;
2306 		syscallarg(size_t) fh_size;
2307 		syscallarg(struct statvfs *) buf;
2308 		syscallarg(int)	flags;
2309 	} */
2310 	struct statvfs *sb = STATVFSBUF_GET();
2311 	int error;
2312 
2313 	error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
2314 	    SCARG(uap, flags));
2315 	if (error == 0)
2316 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
2317 	STATVFSBUF_PUT(sb);
2318 	return error;
2319 }
2320 
2321 int
2322 do_posix_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
2323     dev_t dev)
2324 {
2325 
2326 	/*
2327 	 * The POSIX mknod(2) call is an alias for mkfifo(2) for S_IFIFO
2328 	 * in mode and dev=0.
2329 	 *
2330 	 * In all the other cases it's implementation defined behavior.
2331 	 */
2332 
2333 	if ((mode & S_IFIFO) && dev == 0)
2334 		return do_sys_mkfifoat(l, fdat, pathname, mode);
2335 	else
2336 		return do_sys_mknodat(l, fdat, pathname, mode, dev,
2337 		    UIO_USERSPACE);
2338 }
2339 
2340 /*
2341  * Create a special file.
2342  */
2343 /* ARGSUSED */
2344 int
2345 sys___mknod50(struct lwp *l, const struct sys___mknod50_args *uap,
2346     register_t *retval)
2347 {
2348 	/* {
2349 		syscallarg(const char *) path;
2350 		syscallarg(mode_t) mode;
2351 		syscallarg(dev_t) dev;
2352 	} */
2353 	return do_posix_mknodat(l, AT_FDCWD, SCARG(uap, path),
2354 	    SCARG(uap, mode), SCARG(uap, dev));
2355 }
2356 
2357 int
2358 sys_mknodat(struct lwp *l, const struct sys_mknodat_args *uap,
2359     register_t *retval)
2360 {
2361 	/* {
2362 		syscallarg(int) fd;
2363 		syscallarg(const char *) path;
2364 		syscallarg(mode_t) mode;
2365 		syscallarg(int) pad;
2366 		syscallarg(dev_t) dev;
2367 	} */
2368 
2369 	return do_posix_mknodat(l, SCARG(uap, fd), SCARG(uap, path),
2370 	    SCARG(uap, mode), SCARG(uap, dev));
2371 }
2372 
2373 int
2374 do_sys_mknod(struct lwp *l, const char *pathname, mode_t mode, dev_t dev,
2375     enum uio_seg seg)
2376 {
2377 	return do_sys_mknodat(l, AT_FDCWD, pathname, mode, dev, seg);
2378 }
2379 
2380 int
2381 do_sys_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
2382     dev_t dev, enum uio_seg seg)
2383 {
2384 	struct proc *p = l->l_proc;
2385 	struct vnode *vp;
2386 	struct vattr vattr;
2387 	int error, optype;
2388 	struct pathbuf *pb;
2389 	struct nameidata nd;
2390 	const char *pathstring;
2391 
2392 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
2393 	    0, NULL, NULL, NULL)) != 0)
2394 		return (error);
2395 
2396 	optype = VOP_MKNOD_DESCOFFSET;
2397 
2398 	error = pathbuf_maybe_copyin(pathname, seg, &pb);
2399 	if (error) {
2400 		return error;
2401 	}
2402 	pathstring = pathbuf_stringcopy_get(pb);
2403 	if (pathstring == NULL) {
2404 		pathbuf_destroy(pb);
2405 		return ENOMEM;
2406 	}
2407 
2408 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2409 
2410 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2411 		goto out;
2412 	vp = nd.ni_vp;
2413 
2414 	if (vp != NULL)
2415 		error = EEXIST;
2416 	else {
2417 		vattr_null(&vattr);
2418 		/* We will read cwdi->cwdi_cmask unlocked. */
2419 		vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2420 		vattr.va_rdev = dev;
2421 
2422 		switch (mode & S_IFMT) {
2423 		case S_IFMT:	/* used by badsect to flag bad sectors */
2424 			vattr.va_type = VBAD;
2425 			break;
2426 		case S_IFCHR:
2427 			vattr.va_type = VCHR;
2428 			break;
2429 		case S_IFBLK:
2430 			vattr.va_type = VBLK;
2431 			break;
2432 		case S_IFWHT:
2433 			optype = VOP_WHITEOUT_DESCOFFSET;
2434 			break;
2435 		case S_IFREG:
2436 #if NVERIEXEC > 0
2437 			error = veriexec_openchk(l, nd.ni_vp, pathstring,
2438 			    O_CREAT);
2439 #endif /* NVERIEXEC > 0 */
2440 			vattr.va_type = VREG;
2441 			vattr.va_rdev = VNOVAL;
2442 			optype = VOP_CREATE_DESCOFFSET;
2443 			break;
2444 		default:
2445 			error = EINVAL;
2446 			break;
2447 		}
2448 
2449 		if (error == 0 && optype == VOP_MKNOD_DESCOFFSET &&
2450 		    vattr.va_rdev == VNOVAL)
2451 			error = EINVAL;
2452 	}
2453 
2454 	if (!error) {
2455 		switch (optype) {
2456 		case VOP_WHITEOUT_DESCOFFSET:
2457 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
2458 			if (error)
2459 				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2460 			vput(nd.ni_dvp);
2461 			break;
2462 
2463 		case VOP_MKNOD_DESCOFFSET:
2464 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
2465 						&nd.ni_cnd, &vattr);
2466 			if (error == 0)
2467 				vrele(nd.ni_vp);
2468 			vput(nd.ni_dvp);
2469 			break;
2470 
2471 		case VOP_CREATE_DESCOFFSET:
2472 			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
2473 						&nd.ni_cnd, &vattr);
2474 			if (error == 0)
2475 				vrele(nd.ni_vp);
2476 			vput(nd.ni_dvp);
2477 			break;
2478 		}
2479 	} else {
2480 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2481 		if (nd.ni_dvp == vp)
2482 			vrele(nd.ni_dvp);
2483 		else
2484 			vput(nd.ni_dvp);
2485 		if (vp)
2486 			vrele(vp);
2487 	}
2488 out:
2489 	pathbuf_stringcopy_put(pb, pathstring);
2490 	pathbuf_destroy(pb);
2491 	return (error);
2492 }
2493 
2494 /*
2495  * Create a named pipe.
2496  */
2497 /* ARGSUSED */
2498 int
2499 sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval)
2500 {
2501 	/* {
2502 		syscallarg(const char *) path;
2503 		syscallarg(int) mode;
2504 	} */
2505 	return do_sys_mkfifoat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, mode));
2506 }
2507 
2508 int
2509 sys_mkfifoat(struct lwp *l, const struct sys_mkfifoat_args *uap,
2510     register_t *retval)
2511 {
2512 	/* {
2513 		syscallarg(int) fd;
2514 		syscallarg(const char *) path;
2515 		syscallarg(int) mode;
2516 	} */
2517 
2518 	return do_sys_mkfifoat(l, SCARG(uap, fd), SCARG(uap, path),
2519 	    SCARG(uap, mode));
2520 }
2521 
2522 static int
2523 do_sys_mkfifoat(struct lwp *l, int fdat, const char *path, mode_t mode)
2524 {
2525 	struct proc *p = l->l_proc;
2526 	struct vattr vattr;
2527 	int error;
2528 	struct pathbuf *pb;
2529 	struct nameidata nd;
2530 
2531 	error = pathbuf_copyin(path, &pb);
2532 	if (error) {
2533 		return error;
2534 	}
2535 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2536 
2537 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
2538 		pathbuf_destroy(pb);
2539 		return error;
2540 	}
2541 	if (nd.ni_vp != NULL) {
2542 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2543 		if (nd.ni_dvp == nd.ni_vp)
2544 			vrele(nd.ni_dvp);
2545 		else
2546 			vput(nd.ni_dvp);
2547 		vrele(nd.ni_vp);
2548 		pathbuf_destroy(pb);
2549 		return (EEXIST);
2550 	}
2551 	vattr_null(&vattr);
2552 	vattr.va_type = VFIFO;
2553 	/* We will read cwdi->cwdi_cmask unlocked. */
2554 	vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2555 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
2556 	if (error == 0)
2557 		vrele(nd.ni_vp);
2558 	vput(nd.ni_dvp);
2559 	pathbuf_destroy(pb);
2560 	return (error);
2561 }
2562 
2563 /*
2564  * Make a hard file link.
2565  */
2566 /* ARGSUSED */
2567 int
2568 do_sys_linkat(struct lwp *l, int fdpath, const char *path, int fdlink,
2569     const char *link, int follow, register_t *retval)
2570 {
2571 	struct vnode *vp;
2572 	struct pathbuf *linkpb;
2573 	struct nameidata nd;
2574 	namei_simple_flags_t ns_flags;
2575 	int error;
2576 
2577 	if (follow & AT_SYMLINK_FOLLOW)
2578 		ns_flags = NSM_FOLLOW_TRYEMULROOT;
2579 	else
2580 		ns_flags = NSM_NOFOLLOW_TRYEMULROOT;
2581 
2582 	error = fd_nameiat_simple_user(l, fdpath, path, ns_flags, &vp);
2583 	if (error != 0)
2584 		return (error);
2585 	error = pathbuf_copyin(link, &linkpb);
2586 	if (error) {
2587 		goto out1;
2588 	}
2589 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2590 	if ((error = fd_nameiat(l, fdlink, &nd)) != 0)
2591 		goto out2;
2592 	if (nd.ni_vp) {
2593 		error = EEXIST;
2594 		goto abortop;
2595 	}
2596 	/* Prevent hard links on directories. */
2597 	if (vp->v_type == VDIR) {
2598 		error = EPERM;
2599 		goto abortop;
2600 	}
2601 	/* Prevent cross-mount operation. */
2602 	if (nd.ni_dvp->v_mount != vp->v_mount) {
2603 		error = EXDEV;
2604 		goto abortop;
2605 	}
2606 	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
2607 	VOP_UNLOCK(nd.ni_dvp);
2608 	vrele(nd.ni_dvp);
2609 out2:
2610 	pathbuf_destroy(linkpb);
2611 out1:
2612 	vrele(vp);
2613 	return (error);
2614 abortop:
2615 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2616 	if (nd.ni_dvp == nd.ni_vp)
2617 		vrele(nd.ni_dvp);
2618 	else
2619 		vput(nd.ni_dvp);
2620 	if (nd.ni_vp != NULL)
2621 		vrele(nd.ni_vp);
2622 	goto out2;
2623 }
2624 
2625 int
2626 sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
2627 {
2628 	/* {
2629 		syscallarg(const char *) path;
2630 		syscallarg(const char *) link;
2631 	} */
2632 	const char *path = SCARG(uap, path);
2633 	const char *link = SCARG(uap, link);
2634 
2635 	return do_sys_linkat(l, AT_FDCWD, path, AT_FDCWD, link,
2636 	    AT_SYMLINK_FOLLOW, retval);
2637 }
2638 
2639 int
2640 sys_linkat(struct lwp *l, const struct sys_linkat_args *uap,
2641     register_t *retval)
2642 {
2643 	/* {
2644 		syscallarg(int) fd1;
2645 		syscallarg(const char *) name1;
2646 		syscallarg(int) fd2;
2647 		syscallarg(const char *) name2;
2648 		syscallarg(int) flags;
2649 	} */
2650 	int fd1 = SCARG(uap, fd1);
2651 	const char *name1 = SCARG(uap, name1);
2652 	int fd2 = SCARG(uap, fd2);
2653 	const char *name2 = SCARG(uap, name2);
2654 	int follow;
2655 
2656 	follow = SCARG(uap, flags) & AT_SYMLINK_FOLLOW;
2657 
2658 	return do_sys_linkat(l, fd1, name1, fd2, name2, follow, retval);
2659 }
2660 
2661 
2662 int
2663 do_sys_symlink(const char *patharg, const char *link, enum uio_seg seg)
2664 {
2665 	return do_sys_symlinkat(NULL, patharg, AT_FDCWD, link, seg);
2666 }
2667 
2668 static int
2669 do_sys_symlinkat(struct lwp *l, const char *patharg, int fdat,
2670     const char *link, enum uio_seg seg)
2671 {
2672 	struct proc *p = curproc;
2673 	struct vattr vattr;
2674 	char *path;
2675 	int error;
2676 	size_t len;
2677 	struct pathbuf *linkpb;
2678 	struct nameidata nd;
2679 
2680 	KASSERT(l != NULL || fdat == AT_FDCWD);
2681 
2682 	path = PNBUF_GET();
2683 	if (seg == UIO_USERSPACE) {
2684 		if ((error = copyinstr(patharg, path, MAXPATHLEN, &len)) != 0)
2685 			goto out1;
2686 		if ((error = pathbuf_copyin(link, &linkpb)) != 0)
2687 			goto out1;
2688 	} else {
2689 		len = strlen(patharg) + 1;
2690 		KASSERT(len <= MAXPATHLEN);
2691 		memcpy(path, patharg, len);
2692 		linkpb = pathbuf_create(link);
2693 		if (linkpb == NULL) {
2694 			error = ENOMEM;
2695 			goto out1;
2696 		}
2697 	}
2698 	ktrkuser("symlink-target", path, len - 1);
2699 
2700 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2701 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2702 		goto out2;
2703 	if (nd.ni_vp) {
2704 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2705 		if (nd.ni_dvp == nd.ni_vp)
2706 			vrele(nd.ni_dvp);
2707 		else
2708 			vput(nd.ni_dvp);
2709 		vrele(nd.ni_vp);
2710 		error = EEXIST;
2711 		goto out2;
2712 	}
2713 	vattr_null(&vattr);
2714 	vattr.va_type = VLNK;
2715 	/* We will read cwdi->cwdi_cmask unlocked. */
2716 	vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
2717 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
2718 	if (error == 0)
2719 		vrele(nd.ni_vp);
2720 	vput(nd.ni_dvp);
2721 out2:
2722 	pathbuf_destroy(linkpb);
2723 out1:
2724 	PNBUF_PUT(path);
2725 	return (error);
2726 }
2727 
2728 /*
2729  * Make a symbolic link.
2730  */
2731 /* ARGSUSED */
2732 int
2733 sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
2734 {
2735 	/* {
2736 		syscallarg(const char *) path;
2737 		syscallarg(const char *) link;
2738 	} */
2739 
2740 	return do_sys_symlinkat(l, SCARG(uap, path), AT_FDCWD, SCARG(uap, link),
2741 	    UIO_USERSPACE);
2742 }
2743 
2744 int
2745 sys_symlinkat(struct lwp *l, const struct sys_symlinkat_args *uap,
2746     register_t *retval)
2747 {
2748 	/* {
2749 		syscallarg(const char *) path1;
2750 		syscallarg(int) fd;
2751 		syscallarg(const char *) path2;
2752 	} */
2753 
2754 	return do_sys_symlinkat(l, SCARG(uap, path1), SCARG(uap, fd),
2755 	    SCARG(uap, path2), UIO_USERSPACE);
2756 }
2757 
2758 /*
2759  * Delete a whiteout from the filesystem.
2760  */
2761 /* ARGSUSED */
2762 int
2763 sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval)
2764 {
2765 	/* {
2766 		syscallarg(const char *) path;
2767 	} */
2768 	int error;
2769 	struct pathbuf *pb;
2770 	struct nameidata nd;
2771 
2772 	error = pathbuf_copyin(SCARG(uap, path), &pb);
2773 	if (error) {
2774 		return error;
2775 	}
2776 
2777 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT, pb);
2778 	error = namei(&nd);
2779 	if (error) {
2780 		pathbuf_destroy(pb);
2781 		return (error);
2782 	}
2783 
2784 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2785 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2786 		if (nd.ni_dvp == nd.ni_vp)
2787 			vrele(nd.ni_dvp);
2788 		else
2789 			vput(nd.ni_dvp);
2790 		if (nd.ni_vp)
2791 			vrele(nd.ni_vp);
2792 		pathbuf_destroy(pb);
2793 		return (EEXIST);
2794 	}
2795 	if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
2796 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2797 	vput(nd.ni_dvp);
2798 	pathbuf_destroy(pb);
2799 	return (error);
2800 }
2801 
2802 /*
2803  * Delete a name from the filesystem.
2804  */
2805 /* ARGSUSED */
2806 int
2807 sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval)
2808 {
2809 	/* {
2810 		syscallarg(const char *) path;
2811 	} */
2812 
2813 	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path), 0, UIO_USERSPACE);
2814 }
2815 
2816 int
2817 sys_unlinkat(struct lwp *l, const struct sys_unlinkat_args *uap,
2818     register_t *retval)
2819 {
2820 	/* {
2821 		syscallarg(int) fd;
2822 		syscallarg(const char *) path;
2823 		syscallarg(int) flag;
2824 	} */
2825 
2826 	return do_sys_unlinkat(l, SCARG(uap, fd), SCARG(uap, path),
2827 	    SCARG(uap, flag), UIO_USERSPACE);
2828 }
2829 
2830 int
2831 do_sys_unlink(const char *arg, enum uio_seg seg)
2832 {
2833 	return do_sys_unlinkat(NULL, AT_FDCWD, arg, 0, seg);
2834 }
2835 
2836 static int
2837 do_sys_unlinkat(struct lwp *l, int fdat, const char *arg, int flags,
2838     enum uio_seg seg)
2839 {
2840 	struct vnode *vp;
2841 	int error;
2842 	struct pathbuf *pb;
2843 	struct nameidata nd;
2844 	const char *pathstring;
2845 
2846 	KASSERT(l != NULL || fdat == AT_FDCWD);
2847 
2848 	error = pathbuf_maybe_copyin(arg, seg, &pb);
2849 	if (error) {
2850 		return error;
2851 	}
2852 	pathstring = pathbuf_stringcopy_get(pb);
2853 	if (pathstring == NULL) {
2854 		pathbuf_destroy(pb);
2855 		return ENOMEM;
2856 	}
2857 
2858 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, pb);
2859 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2860 		goto out;
2861 	vp = nd.ni_vp;
2862 
2863 	/*
2864 	 * The root of a mounted filesystem cannot be deleted.
2865 	 */
2866 	if ((vp->v_vflag & VV_ROOT) != 0) {
2867 		error = EBUSY;
2868 		goto abort;
2869 	}
2870 
2871 	if ((vp->v_type == VDIR) && (vp->v_mountedhere != NULL)) {
2872 		error = EBUSY;
2873 		goto abort;
2874 	}
2875 
2876 	/*
2877 	 * No rmdir "." please.
2878 	 */
2879 	if (nd.ni_dvp == vp) {
2880 		error = EINVAL;
2881 		goto abort;
2882 	}
2883 
2884 	/*
2885 	 * AT_REMOVEDIR is required to remove a directory
2886 	 */
2887 	if (vp->v_type == VDIR) {
2888 		if (!(flags & AT_REMOVEDIR)) {
2889 			error = EPERM;
2890 			goto abort;
2891 		} else {
2892 			error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2893 			vput(nd.ni_dvp);
2894 			goto out;
2895 		}
2896 	}
2897 
2898 	/*
2899 	 * Starting here we only deal with non directories.
2900 	 */
2901 	if (flags & AT_REMOVEDIR) {
2902 		error = ENOTDIR;
2903 		goto abort;
2904 	}
2905 
2906 #if NVERIEXEC > 0
2907 	/* Handle remove requests for veriexec entries. */
2908 	if ((error = veriexec_removechk(curlwp, nd.ni_vp, pathstring)) != 0) {
2909 		goto abort;
2910 	}
2911 #endif /* NVERIEXEC > 0 */
2912 
2913 #ifdef FILEASSOC
2914 	(void)fileassoc_file_delete(vp);
2915 #endif /* FILEASSOC */
2916 	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2917 	vput(nd.ni_dvp);
2918 	goto out;
2919 
2920 abort:
2921 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2922 	if (nd.ni_dvp == vp)
2923 		vrele(nd.ni_dvp);
2924 	else
2925 		vput(nd.ni_dvp);
2926 	vput(vp);
2927 
2928 out:
2929 	pathbuf_stringcopy_put(pb, pathstring);
2930 	pathbuf_destroy(pb);
2931 	return (error);
2932 }
2933 
2934 /*
2935  * Reposition read/write file offset.
2936  */
2937 int
2938 sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
2939 {
2940 	/* {
2941 		syscallarg(int) fd;
2942 		syscallarg(int) pad;
2943 		syscallarg(off_t) offset;
2944 		syscallarg(int) whence;
2945 	} */
2946 	file_t *fp;
2947 	int error, fd;
2948 
2949 	switch (SCARG(uap, whence)) {
2950 	case SEEK_CUR:
2951 	case SEEK_END:
2952 	case SEEK_SET:
2953 		break;
2954 	default:
2955 		return EINVAL;
2956 	}
2957 
2958 	fd = SCARG(uap, fd);
2959 
2960 	if ((fp = fd_getfile(fd)) == NULL)
2961 		return (EBADF);
2962 
2963 	if (fp->f_ops->fo_seek == NULL) {
2964 		error = ESPIPE;
2965 		goto out;
2966 	}
2967 
2968 	error = (*fp->f_ops->fo_seek)(fp, SCARG(uap, offset),
2969 	    SCARG(uap, whence), (off_t *)retval, FOF_UPDATE_OFFSET);
2970  out:
2971  	fd_putfile(fd);
2972 	return (error);
2973 }
2974 
2975 /*
2976  * Positional read system call.
2977  */
2978 int
2979 sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
2980 {
2981 	/* {
2982 		syscallarg(int) fd;
2983 		syscallarg(void *) buf;
2984 		syscallarg(size_t) nbyte;
2985 		syscallarg(off_t) offset;
2986 	} */
2987 	file_t *fp;
2988 	off_t offset;
2989 	int error, fd = SCARG(uap, fd);
2990 
2991 	if ((fp = fd_getfile(fd)) == NULL)
2992 		return (EBADF);
2993 
2994 	if ((fp->f_flag & FREAD) == 0) {
2995 		fd_putfile(fd);
2996 		return (EBADF);
2997 	}
2998 
2999 	if (fp->f_ops->fo_seek == NULL) {
3000 		error = ESPIPE;
3001 		goto out;
3002 	}
3003 
3004 	offset = SCARG(uap, offset);
3005 	error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0);
3006 	if (error)
3007 		goto out;
3008 
3009 	/* dofileread() will unuse the descriptor for us */
3010 	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
3011 	    &offset, 0, retval));
3012 
3013  out:
3014 	fd_putfile(fd);
3015 	return (error);
3016 }
3017 
3018 /*
3019  * Positional scatter read system call.
3020  */
3021 int
3022 sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval)
3023 {
3024 	/* {
3025 		syscallarg(int) fd;
3026 		syscallarg(const struct iovec *) iovp;
3027 		syscallarg(int) iovcnt;
3028 		syscallarg(off_t) offset;
3029 	} */
3030 	off_t offset = SCARG(uap, offset);
3031 
3032 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
3033 	    SCARG(uap, iovcnt), &offset, 0, retval);
3034 }
3035 
3036 /*
3037  * Positional write system call.
3038  */
3039 int
3040 sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval)
3041 {
3042 	/* {
3043 		syscallarg(int) fd;
3044 		syscallarg(const void *) buf;
3045 		syscallarg(size_t) nbyte;
3046 		syscallarg(off_t) offset;
3047 	} */
3048 	file_t *fp;
3049 	off_t offset;
3050 	int error, fd = SCARG(uap, fd);
3051 
3052 	if ((fp = fd_getfile(fd)) == NULL)
3053 		return (EBADF);
3054 
3055 	if ((fp->f_flag & FWRITE) == 0) {
3056 		fd_putfile(fd);
3057 		return (EBADF);
3058 	}
3059 
3060 	if (fp->f_ops->fo_seek == NULL) {
3061 		error = ESPIPE;
3062 		goto out;
3063 	}
3064 
3065 	offset = SCARG(uap, offset);
3066 	error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0);
3067 	if (error)
3068 		goto out;
3069 
3070 	/* dofilewrite() will unuse the descriptor for us */
3071 	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
3072 	    &offset, 0, retval));
3073 
3074  out:
3075 	fd_putfile(fd);
3076 	return (error);
3077 }
3078 
3079 /*
3080  * Positional gather write system call.
3081  */
3082 int
3083 sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval)
3084 {
3085 	/* {
3086 		syscallarg(int) fd;
3087 		syscallarg(const struct iovec *) iovp;
3088 		syscallarg(int) iovcnt;
3089 		syscallarg(off_t) offset;
3090 	} */
3091 	off_t offset = SCARG(uap, offset);
3092 
3093 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
3094 	    SCARG(uap, iovcnt), &offset, 0, retval);
3095 }
3096 
3097 /*
3098  * Check access permissions.
3099  */
3100 int
3101 sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval)
3102 {
3103 	/* {
3104 		syscallarg(const char *) path;
3105 		syscallarg(int) flags;
3106 	} */
3107 
3108 	return do_sys_accessat(l, AT_FDCWD, SCARG(uap, path),
3109 	     SCARG(uap, flags), 0);
3110 }
3111 
3112 int
3113 do_sys_accessat(struct lwp *l, int fdat, const char *path,
3114     int mode, int flags)
3115 {
3116 	kauth_cred_t cred;
3117 	struct vnode *vp;
3118 	int error, nd_flag, vmode;
3119 	struct pathbuf *pb;
3120 	struct nameidata nd;
3121 
3122 	CTASSERT(F_OK == 0);
3123 	if ((mode & ~(R_OK | W_OK | X_OK)) != 0) {
3124 		/* nonsense mode */
3125 		return EINVAL;
3126 	}
3127 
3128 	nd_flag = FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT;
3129 	if (flags & AT_SYMLINK_NOFOLLOW)
3130 		nd_flag &= ~FOLLOW;
3131 
3132 	error = pathbuf_copyin(path, &pb);
3133 	if (error)
3134 		return error;
3135 
3136 	NDINIT(&nd, LOOKUP, nd_flag, pb);
3137 
3138 	/* Override default credentials */
3139 	if (!(flags & AT_EACCESS)) {
3140 		cred = kauth_cred_dup(l->l_cred);
3141 		kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
3142 		kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
3143 	} else
3144 		cred = l->l_cred;
3145 	nd.ni_cnd.cn_cred = cred;
3146 
3147 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
3148 		pathbuf_destroy(pb);
3149 		goto out;
3150 	}
3151 	vp = nd.ni_vp;
3152 	pathbuf_destroy(pb);
3153 
3154 	/* Flags == 0 means only check for existence. */
3155 	if (mode) {
3156 		vmode = 0;
3157 		if (mode & R_OK)
3158 			vmode |= VREAD;
3159 		if (mode & W_OK)
3160 			vmode |= VWRITE;
3161 		if (mode & X_OK)
3162 			vmode |= VEXEC;
3163 
3164 		error = VOP_ACCESS(vp, vmode, cred);
3165 		if (!error && (vmode & VWRITE))
3166 			error = vn_writechk(vp);
3167 	}
3168 	vput(vp);
3169 out:
3170 	if (!(flags & AT_EACCESS))
3171 		kauth_cred_free(cred);
3172 	return (error);
3173 }
3174 
3175 int
3176 sys_faccessat(struct lwp *l, const struct sys_faccessat_args *uap,
3177     register_t *retval)
3178 {
3179 	/* {
3180 		syscallarg(int) fd;
3181 		syscallarg(const char *) path;
3182 		syscallarg(int) amode;
3183 		syscallarg(int) flag;
3184 	} */
3185 
3186 	return do_sys_accessat(l, SCARG(uap, fd), SCARG(uap, path),
3187 	     SCARG(uap, amode), SCARG(uap, flag));
3188 }
3189 
3190 /*
3191  * Common code for all sys_stat functions, including compat versions.
3192  */
3193 int
3194 do_sys_stat(const char *userpath, unsigned int nd_flag,
3195     struct stat *sb)
3196 {
3197 	return do_sys_statat(NULL, AT_FDCWD, userpath, nd_flag, sb);
3198 }
3199 
3200 int
3201 do_sys_statat(struct lwp *l, int fdat, const char *userpath,
3202     unsigned int nd_flag, struct stat *sb)
3203 {
3204 	int error;
3205 	struct pathbuf *pb;
3206 	struct nameidata nd;
3207 
3208 	KASSERT(l != NULL || fdat == AT_FDCWD);
3209 
3210 	error = pathbuf_copyin(userpath, &pb);
3211 	if (error) {
3212 		return error;
3213 	}
3214 
3215 	NDINIT(&nd, LOOKUP, nd_flag | LOCKLEAF | TRYEMULROOT, pb);
3216 
3217 	error = fd_nameiat(l, fdat, &nd);
3218 	if (error != 0) {
3219 		pathbuf_destroy(pb);
3220 		return error;
3221 	}
3222 	error = vn_stat(nd.ni_vp, sb);
3223 	vput(nd.ni_vp);
3224 	pathbuf_destroy(pb);
3225 	return error;
3226 }
3227 
3228 /*
3229  * Get file status; this version follows links.
3230  */
3231 /* ARGSUSED */
3232 int
3233 sys___stat50(struct lwp *l, const struct sys___stat50_args *uap, register_t *retval)
3234 {
3235 	/* {
3236 		syscallarg(const char *) path;
3237 		syscallarg(struct stat *) ub;
3238 	} */
3239 	struct stat sb;
3240 	int error;
3241 
3242 	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), FOLLOW, &sb);
3243 	if (error)
3244 		return error;
3245 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3246 }
3247 
3248 /*
3249  * Get file status; this version does not follow links.
3250  */
3251 /* ARGSUSED */
3252 int
3253 sys___lstat50(struct lwp *l, const struct sys___lstat50_args *uap, register_t *retval)
3254 {
3255 	/* {
3256 		syscallarg(const char *) path;
3257 		syscallarg(struct stat *) ub;
3258 	} */
3259 	struct stat sb;
3260 	int error;
3261 
3262 	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), NOFOLLOW, &sb);
3263 	if (error)
3264 		return error;
3265 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3266 }
3267 
3268 int
3269 sys_fstatat(struct lwp *l, const struct sys_fstatat_args *uap,
3270     register_t *retval)
3271 {
3272 	/* {
3273 		syscallarg(int) fd;
3274 		syscallarg(const char *) path;
3275 		syscallarg(struct stat *) buf;
3276 		syscallarg(int) flag;
3277 	} */
3278 	unsigned int nd_flag;
3279 	struct stat sb;
3280 	int error;
3281 
3282 	if (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW)
3283 		nd_flag = NOFOLLOW;
3284 	else
3285 		nd_flag = FOLLOW;
3286 
3287 	error = do_sys_statat(l, SCARG(uap, fd), SCARG(uap, path), nd_flag,
3288 	    &sb);
3289 	if (error)
3290 		return error;
3291 	return copyout(&sb, SCARG(uap, buf), sizeof(sb));
3292 }
3293 
3294 static int
3295 kern_pathconf(register_t *retval, const char *path, int name, int flag)
3296 {
3297 	int error;
3298 	struct pathbuf *pb;
3299 	struct nameidata nd;
3300 
3301 	error = pathbuf_copyin(path, &pb);
3302 	if (error) {
3303 		return error;
3304 	}
3305 	NDINIT(&nd, LOOKUP, flag | LOCKLEAF | TRYEMULROOT, pb);
3306 	if ((error = namei(&nd)) != 0) {
3307 		pathbuf_destroy(pb);
3308 		return error;
3309 	}
3310 	error = VOP_PATHCONF(nd.ni_vp, name, retval);
3311 	vput(nd.ni_vp);
3312 	pathbuf_destroy(pb);
3313 	return error;
3314 }
3315 
3316 /*
3317  * Get configurable pathname variables.
3318  */
3319 /* ARGSUSED */
3320 int
3321 sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap,
3322     register_t *retval)
3323 {
3324 	/* {
3325 		syscallarg(const char *) path;
3326 		syscallarg(int) name;
3327 	} */
3328 	return kern_pathconf(retval, SCARG(uap, path), SCARG(uap, name),
3329 	    FOLLOW);
3330 }
3331 
3332 /* ARGSUSED */
3333 int
3334 sys_lpathconf(struct lwp *l, const struct sys_lpathconf_args *uap,
3335     register_t *retval)
3336 {
3337 	/* {
3338 		syscallarg(const char *) path;
3339 		syscallarg(int) name;
3340 	} */
3341 	return kern_pathconf(retval, SCARG(uap, path), SCARG(uap, name),
3342 	    NOFOLLOW);
3343 }
3344 
3345 /*
3346  * Return target name of a symbolic link.
3347  */
3348 /* ARGSUSED */
3349 int
3350 sys_readlink(struct lwp *l, const struct sys_readlink_args *uap,
3351     register_t *retval)
3352 {
3353 	/* {
3354 		syscallarg(const char *) path;
3355 		syscallarg(char *) buf;
3356 		syscallarg(size_t) count;
3357 	} */
3358 	return do_sys_readlinkat(l, AT_FDCWD, SCARG(uap, path),
3359 	    SCARG(uap, buf), SCARG(uap, count), retval);
3360 }
3361 
3362 static int
3363 do_sys_readlinkat(struct lwp *l, int fdat, const char *path, char *buf,
3364     size_t count, register_t *retval)
3365 {
3366 	struct vnode *vp;
3367 	struct iovec aiov;
3368 	struct uio auio;
3369 	int error;
3370 	struct pathbuf *pb;
3371 	struct nameidata nd;
3372 
3373 	error = pathbuf_copyin(path, &pb);
3374 	if (error) {
3375 		return error;
3376 	}
3377 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb);
3378 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
3379 		pathbuf_destroy(pb);
3380 		return error;
3381 	}
3382 	vp = nd.ni_vp;
3383 	pathbuf_destroy(pb);
3384 	if (vp->v_type != VLNK)
3385 		error = EINVAL;
3386 	else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
3387 	    (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
3388 		aiov.iov_base = buf;
3389 		aiov.iov_len = count;
3390 		auio.uio_iov = &aiov;
3391 		auio.uio_iovcnt = 1;
3392 		auio.uio_offset = 0;
3393 		auio.uio_rw = UIO_READ;
3394 		KASSERT(l == curlwp);
3395 		auio.uio_vmspace = l->l_proc->p_vmspace;
3396 		auio.uio_resid = count;
3397 		if ((error = VOP_READLINK(vp, &auio, l->l_cred)) == 0)
3398 			*retval = count - auio.uio_resid;
3399 	}
3400 	vput(vp);
3401 	return (error);
3402 }
3403 
3404 int
3405 sys_readlinkat(struct lwp *l, const struct sys_readlinkat_args *uap,
3406     register_t *retval)
3407 {
3408 	/* {
3409 		syscallarg(int) fd;
3410 		syscallarg(const char *) path;
3411 		syscallarg(char *) buf;
3412 		syscallarg(size_t) bufsize;
3413 	} */
3414 
3415 	return do_sys_readlinkat(l, SCARG(uap, fd), SCARG(uap, path),
3416 	    SCARG(uap, buf), SCARG(uap, bufsize), retval);
3417 }
3418 
3419 /*
3420  * Change flags of a file given a path name.
3421  */
3422 /* ARGSUSED */
3423 int
3424 sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval)
3425 {
3426 	/* {
3427 		syscallarg(const char *) path;
3428 		syscallarg(u_long) flags;
3429 	} */
3430 	struct vnode *vp;
3431 	int error;
3432 
3433 	error = namei_simple_user(SCARG(uap, path),
3434 				NSM_FOLLOW_TRYEMULROOT, &vp);
3435 	if (error != 0)
3436 		return (error);
3437 	error = change_flags(vp, SCARG(uap, flags), l);
3438 	vput(vp);
3439 	return (error);
3440 }
3441 
3442 /*
3443  * Change flags of a file given a file descriptor.
3444  */
3445 /* ARGSUSED */
3446 int
3447 sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval)
3448 {
3449 	/* {
3450 		syscallarg(int) fd;
3451 		syscallarg(u_long) flags;
3452 	} */
3453 	struct vnode *vp;
3454 	file_t *fp;
3455 	int error;
3456 
3457 	/* fd_getvnode() will use the descriptor for us */
3458 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3459 		return (error);
3460 	vp = fp->f_vnode;
3461 	error = change_flags(vp, SCARG(uap, flags), l);
3462 	VOP_UNLOCK(vp);
3463 	fd_putfile(SCARG(uap, fd));
3464 	return (error);
3465 }
3466 
3467 /*
3468  * Change flags of a file given a path name; this version does
3469  * not follow links.
3470  */
3471 int
3472 sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval)
3473 {
3474 	/* {
3475 		syscallarg(const char *) path;
3476 		syscallarg(u_long) flags;
3477 	} */
3478 	struct vnode *vp;
3479 	int error;
3480 
3481 	error = namei_simple_user(SCARG(uap, path),
3482 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3483 	if (error != 0)
3484 		return (error);
3485 	error = change_flags(vp, SCARG(uap, flags), l);
3486 	vput(vp);
3487 	return (error);
3488 }
3489 
3490 /*
3491  * Common routine to change flags of a file.
3492  */
3493 int
3494 change_flags(struct vnode *vp, u_long flags, struct lwp *l)
3495 {
3496 	struct vattr vattr;
3497 	int error;
3498 
3499 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3500 
3501 	vattr_null(&vattr);
3502 	vattr.va_flags = flags;
3503 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3504 
3505 	return (error);
3506 }
3507 
3508 /*
3509  * Change mode of a file given path name; this version follows links.
3510  */
3511 /* ARGSUSED */
3512 int
3513 sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
3514 {
3515 	/* {
3516 		syscallarg(const char *) path;
3517 		syscallarg(int) mode;
3518 	} */
3519 	return do_sys_chmodat(l, AT_FDCWD, SCARG(uap, path),
3520 			      SCARG(uap, mode), 0);
3521 }
3522 
3523 int
3524 do_sys_chmodat(struct lwp *l, int fdat, const char *path, int mode, int flags)
3525 {
3526 	int error;
3527 	struct vnode *vp;
3528 	namei_simple_flags_t ns_flag;
3529 
3530 	if (flags & AT_SYMLINK_NOFOLLOW)
3531 		ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3532 	else
3533 		ns_flag = NSM_FOLLOW_TRYEMULROOT;
3534 
3535 	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3536 	if (error != 0)
3537 		return error;
3538 
3539 	error = change_mode(vp, mode, l);
3540 
3541 	vrele(vp);
3542 
3543 	return (error);
3544 }
3545 
3546 /*
3547  * Change mode of a file given a file descriptor.
3548  */
3549 /* ARGSUSED */
3550 int
3551 sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval)
3552 {
3553 	/* {
3554 		syscallarg(int) fd;
3555 		syscallarg(int) mode;
3556 	} */
3557 	file_t *fp;
3558 	int error;
3559 
3560 	/* fd_getvnode() will use the descriptor for us */
3561 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3562 		return (error);
3563 	error = change_mode(fp->f_vnode, SCARG(uap, mode), l);
3564 	fd_putfile(SCARG(uap, fd));
3565 	return (error);
3566 }
3567 
3568 int
3569 sys_fchmodat(struct lwp *l, const struct sys_fchmodat_args *uap,
3570     register_t *retval)
3571 {
3572 	/* {
3573 		syscallarg(int) fd;
3574 		syscallarg(const char *) path;
3575 		syscallarg(int) mode;
3576 		syscallarg(int) flag;
3577 	} */
3578 
3579 	return do_sys_chmodat(l, SCARG(uap, fd), SCARG(uap, path),
3580 			      SCARG(uap, mode), SCARG(uap, flag));
3581 }
3582 
3583 /*
3584  * Change mode of a file given path name; this version does not follow links.
3585  */
3586 /* ARGSUSED */
3587 int
3588 sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval)
3589 {
3590 	/* {
3591 		syscallarg(const char *) path;
3592 		syscallarg(int) mode;
3593 	} */
3594 	int error;
3595 	struct vnode *vp;
3596 
3597 	error = namei_simple_user(SCARG(uap, path),
3598 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3599 	if (error != 0)
3600 		return (error);
3601 
3602 	error = change_mode(vp, SCARG(uap, mode), l);
3603 
3604 	vrele(vp);
3605 	return (error);
3606 }
3607 
3608 /*
3609  * Common routine to set mode given a vnode.
3610  */
3611 static int
3612 change_mode(struct vnode *vp, int mode, struct lwp *l)
3613 {
3614 	struct vattr vattr;
3615 	int error;
3616 
3617 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3618 	vattr_null(&vattr);
3619 	vattr.va_mode = mode & ALLPERMS;
3620 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3621 	VOP_UNLOCK(vp);
3622 	return (error);
3623 }
3624 
3625 /*
3626  * Set ownership given a path name; this version follows links.
3627  */
3628 /* ARGSUSED */
3629 int
3630 sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
3631 {
3632 	/* {
3633 		syscallarg(const char *) path;
3634 		syscallarg(uid_t) uid;
3635 		syscallarg(gid_t) gid;
3636 	} */
3637 	return do_sys_chownat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap,uid),
3638 			      SCARG(uap, gid), 0);
3639 }
3640 
3641 int
3642 do_sys_chownat(struct lwp *l, int fdat, const char *path, uid_t uid,
3643    gid_t gid, int flags)
3644 {
3645 	int error;
3646 	struct vnode *vp;
3647 	namei_simple_flags_t ns_flag;
3648 
3649 	if (flags & AT_SYMLINK_NOFOLLOW)
3650 		ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3651 	else
3652 		ns_flag = NSM_FOLLOW_TRYEMULROOT;
3653 
3654 	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3655 	if (error != 0)
3656 		return error;
3657 
3658 	error = change_owner(vp, uid, gid, l, 0);
3659 
3660 	vrele(vp);
3661 
3662 	return (error);
3663 }
3664 
3665 /*
3666  * Set ownership given a path name; this version follows links.
3667  * Provides POSIX semantics.
3668  */
3669 /* ARGSUSED */
3670 int
3671 sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval)
3672 {
3673 	/* {
3674 		syscallarg(const char *) path;
3675 		syscallarg(uid_t) uid;
3676 		syscallarg(gid_t) gid;
3677 	} */
3678 	int error;
3679 	struct vnode *vp;
3680 
3681 	error = namei_simple_user(SCARG(uap, path),
3682 				NSM_FOLLOW_TRYEMULROOT, &vp);
3683 	if (error != 0)
3684 		return (error);
3685 
3686 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3687 
3688 	vrele(vp);
3689 	return (error);
3690 }
3691 
3692 /*
3693  * Set ownership given a file descriptor.
3694  */
3695 /* ARGSUSED */
3696 int
3697 sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval)
3698 {
3699 	/* {
3700 		syscallarg(int) fd;
3701 		syscallarg(uid_t) uid;
3702 		syscallarg(gid_t) gid;
3703 	} */
3704 	int error;
3705 	file_t *fp;
3706 
3707 	/* fd_getvnode() will use the descriptor for us */
3708 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3709 		return (error);
3710 	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3711 	    l, 0);
3712 	fd_putfile(SCARG(uap, fd));
3713 	return (error);
3714 }
3715 
3716 int
3717 sys_fchownat(struct lwp *l, const struct sys_fchownat_args *uap,
3718     register_t *retval)
3719 {
3720 	/* {
3721 		syscallarg(int) fd;
3722 		syscallarg(const char *) path;
3723 		syscallarg(uid_t) owner;
3724 		syscallarg(gid_t) group;
3725 		syscallarg(int) flag;
3726 	} */
3727 
3728 	return do_sys_chownat(l, SCARG(uap, fd), SCARG(uap, path),
3729 			      SCARG(uap, owner), SCARG(uap, group),
3730 			      SCARG(uap, flag));
3731 }
3732 
3733 /*
3734  * Set ownership given a file descriptor, providing POSIX/XPG semantics.
3735  */
3736 /* ARGSUSED */
3737 int
3738 sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval)
3739 {
3740 	/* {
3741 		syscallarg(int) fd;
3742 		syscallarg(uid_t) uid;
3743 		syscallarg(gid_t) gid;
3744 	} */
3745 	int error;
3746 	file_t *fp;
3747 
3748 	/* fd_getvnode() will use the descriptor for us */
3749 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3750 		return (error);
3751 	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3752 	    l, 1);
3753 	fd_putfile(SCARG(uap, fd));
3754 	return (error);
3755 }
3756 
3757 /*
3758  * Set ownership given a path name; this version does not follow links.
3759  */
3760 /* ARGSUSED */
3761 int
3762 sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval)
3763 {
3764 	/* {
3765 		syscallarg(const char *) path;
3766 		syscallarg(uid_t) uid;
3767 		syscallarg(gid_t) gid;
3768 	} */
3769 	int error;
3770 	struct vnode *vp;
3771 
3772 	error = namei_simple_user(SCARG(uap, path),
3773 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3774 	if (error != 0)
3775 		return (error);
3776 
3777 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
3778 
3779 	vrele(vp);
3780 	return (error);
3781 }
3782 
3783 /*
3784  * Set ownership given a path name; this version does not follow links.
3785  * Provides POSIX/XPG semantics.
3786  */
3787 /* ARGSUSED */
3788 int
3789 sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval)
3790 {
3791 	/* {
3792 		syscallarg(const char *) path;
3793 		syscallarg(uid_t) uid;
3794 		syscallarg(gid_t) gid;
3795 	} */
3796 	int error;
3797 	struct vnode *vp;
3798 
3799 	error = namei_simple_user(SCARG(uap, path),
3800 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3801 	if (error != 0)
3802 		return (error);
3803 
3804 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3805 
3806 	vrele(vp);
3807 	return (error);
3808 }
3809 
3810 /*
3811  * Common routine to set ownership given a vnode.
3812  */
3813 static int
3814 change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
3815     int posix_semantics)
3816 {
3817 	struct vattr vattr;
3818 	mode_t newmode;
3819 	int error;
3820 
3821 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3822 	if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
3823 		goto out;
3824 
3825 #define CHANGED(x) ((int)(x) != -1)
3826 	newmode = vattr.va_mode;
3827 	if (posix_semantics) {
3828 		/*
3829 		 * POSIX/XPG semantics: if the caller is not the super-user,
3830 		 * clear set-user-id and set-group-id bits.  Both POSIX and
3831 		 * the XPG consider the behaviour for calls by the super-user
3832 		 * implementation-defined; we leave the set-user-id and set-
3833 		 * group-id settings intact in that case.
3834 		 */
3835 		if (vattr.va_mode & S_ISUID) {
3836 			if (kauth_authorize_vnode(l->l_cred,
3837 			    KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0)
3838 				newmode &= ~S_ISUID;
3839 		}
3840 		if (vattr.va_mode & S_ISGID) {
3841 			if (kauth_authorize_vnode(l->l_cred,
3842 			    KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0)
3843 				newmode &= ~S_ISGID;
3844 		}
3845 	} else {
3846 		/*
3847 		 * NetBSD semantics: when changing owner and/or group,
3848 		 * clear the respective bit(s).
3849 		 */
3850 		if (CHANGED(uid))
3851 			newmode &= ~S_ISUID;
3852 		if (CHANGED(gid))
3853 			newmode &= ~S_ISGID;
3854 	}
3855 	/* Update va_mode iff altered. */
3856 	if (vattr.va_mode == newmode)
3857 		newmode = VNOVAL;
3858 
3859 	vattr_null(&vattr);
3860 	vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
3861 	vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
3862 	vattr.va_mode = newmode;
3863 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3864 #undef CHANGED
3865 
3866 out:
3867 	VOP_UNLOCK(vp);
3868 	return (error);
3869 }
3870 
3871 /*
3872  * Set the access and modification times given a path name; this
3873  * version follows links.
3874  */
3875 /* ARGSUSED */
3876 int
3877 sys___utimes50(struct lwp *l, const struct sys___utimes50_args *uap,
3878     register_t *retval)
3879 {
3880 	/* {
3881 		syscallarg(const char *) path;
3882 		syscallarg(const struct timeval *) tptr;
3883 	} */
3884 
3885 	return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
3886 	    SCARG(uap, tptr), UIO_USERSPACE);
3887 }
3888 
3889 /*
3890  * Set the access and modification times given a file descriptor.
3891  */
3892 /* ARGSUSED */
3893 int
3894 sys___futimes50(struct lwp *l, const struct sys___futimes50_args *uap,
3895     register_t *retval)
3896 {
3897 	/* {
3898 		syscallarg(int) fd;
3899 		syscallarg(const struct timeval *) tptr;
3900 	} */
3901 	int error;
3902 	file_t *fp;
3903 
3904 	/* fd_getvnode() will use the descriptor for us */
3905 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3906 		return (error);
3907 	error = do_sys_utimes(l, fp->f_vnode, NULL, 0, SCARG(uap, tptr),
3908 	    UIO_USERSPACE);
3909 	fd_putfile(SCARG(uap, fd));
3910 	return (error);
3911 }
3912 
3913 int
3914 sys_futimens(struct lwp *l, const struct sys_futimens_args *uap,
3915     register_t *retval)
3916 {
3917 	/* {
3918 		syscallarg(int) fd;
3919 		syscallarg(const struct timespec *) tptr;
3920 	} */
3921 	int error;
3922 	file_t *fp;
3923 
3924 	/* fd_getvnode() will use the descriptor for us */
3925 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3926 		return (error);
3927 	error = do_sys_utimensat(l, AT_FDCWD, fp->f_vnode, NULL, 0,
3928 	    SCARG(uap, tptr), UIO_USERSPACE);
3929 	fd_putfile(SCARG(uap, fd));
3930 	return (error);
3931 }
3932 
3933 /*
3934  * Set the access and modification times given a path name; this
3935  * version does not follow links.
3936  */
3937 int
3938 sys___lutimes50(struct lwp *l, const struct sys___lutimes50_args *uap,
3939     register_t *retval)
3940 {
3941 	/* {
3942 		syscallarg(const char *) path;
3943 		syscallarg(const struct timeval *) tptr;
3944 	} */
3945 
3946 	return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
3947 	    SCARG(uap, tptr), UIO_USERSPACE);
3948 }
3949 
3950 int
3951 sys_utimensat(struct lwp *l, const struct sys_utimensat_args *uap,
3952     register_t *retval)
3953 {
3954 	/* {
3955 		syscallarg(int) fd;
3956 		syscallarg(const char *) path;
3957 		syscallarg(const struct timespec *) tptr;
3958 		syscallarg(int) flag;
3959 	} */
3960 	int follow;
3961 	const struct timespec *tptr;
3962 	int error;
3963 
3964 	tptr = SCARG(uap, tptr);
3965 	follow = (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3966 
3967 	error = do_sys_utimensat(l, SCARG(uap, fd), NULL,
3968 	    SCARG(uap, path), follow, tptr, UIO_USERSPACE);
3969 
3970 	return error;
3971 }
3972 
3973 /*
3974  * Common routine to set access and modification times given a vnode.
3975  */
3976 int
3977 do_sys_utimens(struct lwp *l, struct vnode *vp, const char *path, int flag,
3978     const struct timespec *tptr, enum uio_seg seg)
3979 {
3980 	return do_sys_utimensat(l, AT_FDCWD, vp, path, flag, tptr, seg);
3981 }
3982 
3983 int
3984 do_sys_utimensat(struct lwp *l, int fdat, struct vnode *vp,
3985     const char *path, int flag, const struct timespec *tptr, enum uio_seg seg)
3986 {
3987 	struct vattr vattr;
3988 	int error, dorele = 0;
3989 	namei_simple_flags_t sflags;
3990 	bool vanull, setbirthtime;
3991 	struct timespec ts[2];
3992 
3993 	KASSERT(l != NULL || fdat == AT_FDCWD);
3994 
3995 	/*
3996 	 * I have checked all callers and they pass either FOLLOW,
3997 	 * NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW
3998 	 * is 0. More to the point, they don't pass anything else.
3999 	 * Let's keep it that way at least until the namei interfaces
4000 	 * are fully sanitized.
4001 	 */
4002 	KASSERT(flag == NOFOLLOW || flag == FOLLOW);
4003 	sflags = (flag == FOLLOW) ?
4004 		NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT;
4005 
4006 	if (tptr == NULL) {
4007 		vanull = true;
4008 		nanotime(&ts[0]);
4009 		ts[1] = ts[0];
4010 	} else {
4011 		vanull = false;
4012 		if (seg != UIO_SYSSPACE) {
4013 			error = copyin(tptr, ts, sizeof (ts));
4014 			if (error != 0)
4015 				return error;
4016 		} else {
4017 			ts[0] = tptr[0];
4018 			ts[1] = tptr[1];
4019 		}
4020 	}
4021 
4022 	if (ts[0].tv_nsec == UTIME_NOW) {
4023 		nanotime(&ts[0]);
4024 		if (ts[1].tv_nsec == UTIME_NOW) {
4025 			vanull = true;
4026 			ts[1] = ts[0];
4027 		}
4028 	} else if (ts[1].tv_nsec == UTIME_NOW)
4029 		nanotime(&ts[1]);
4030 
4031 	if (vp == NULL) {
4032 		/* note: SEG describes TPTR, not PATH; PATH is always user */
4033 		error = fd_nameiat_simple_user(l, fdat, path, sflags, &vp);
4034 		if (error != 0)
4035 			return error;
4036 		dorele = 1;
4037 	}
4038 
4039 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4040 	setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 &&
4041 	    timespeccmp(&ts[1], &vattr.va_birthtime, <));
4042 	vattr_null(&vattr);
4043 
4044 	if (ts[0].tv_nsec != UTIME_OMIT)
4045 		vattr.va_atime = ts[0];
4046 
4047 	if (ts[1].tv_nsec != UTIME_OMIT) {
4048 		vattr.va_mtime = ts[1];
4049 		if (setbirthtime)
4050 			vattr.va_birthtime = ts[1];
4051 	}
4052 
4053 	if (vanull)
4054 		vattr.va_vaflags |= VA_UTIMES_NULL;
4055 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
4056 	VOP_UNLOCK(vp);
4057 
4058 	if (dorele != 0)
4059 		vrele(vp);
4060 
4061 	return error;
4062 }
4063 
4064 int
4065 do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
4066     const struct timeval *tptr, enum uio_seg seg)
4067 {
4068 	struct timespec ts[2];
4069 	struct timespec *tsptr = NULL;
4070 	int error;
4071 
4072 	if (tptr != NULL) {
4073 		struct timeval tv[2];
4074 
4075 		if (seg != UIO_SYSSPACE) {
4076 			error = copyin(tptr, tv, sizeof(tv));
4077 			if (error != 0)
4078 				return error;
4079 			tptr = tv;
4080 		}
4081 
4082 		if ((tptr[0].tv_usec == UTIME_NOW) ||
4083 		    (tptr[0].tv_usec == UTIME_OMIT))
4084 			ts[0].tv_nsec = tptr[0].tv_usec;
4085 		else {
4086 			if (tptr[0].tv_usec < 0 || tptr[0].tv_usec >= 1000000)
4087 				return EINVAL;
4088 
4089 			TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]);
4090 		}
4091 
4092 		if ((tptr[1].tv_usec == UTIME_NOW) ||
4093 		    (tptr[1].tv_usec == UTIME_OMIT))
4094 			ts[1].tv_nsec = tptr[1].tv_usec;
4095 		else {
4096 			if (tptr[1].tv_usec < 0 || tptr[1].tv_usec >= 1000000)
4097 				return EINVAL;
4098 
4099 			TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]);
4100 		}
4101 
4102 		tsptr = &ts[0];
4103 	}
4104 
4105 	return do_sys_utimens(l, vp, path, flag, tsptr, UIO_SYSSPACE);
4106 }
4107 
4108 /*
4109  * Truncate a file given its path name.
4110  */
4111 /* ARGSUSED */
4112 int
4113 sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval)
4114 {
4115 	/* {
4116 		syscallarg(const char *) path;
4117 		syscallarg(int) pad;
4118 		syscallarg(off_t) length;
4119 	} */
4120 	struct vnode *vp;
4121 	struct vattr vattr;
4122 	int error;
4123 
4124 	if (SCARG(uap, length) < 0)
4125 		return EINVAL;
4126 
4127 	error = namei_simple_user(SCARG(uap, path),
4128 				NSM_FOLLOW_TRYEMULROOT, &vp);
4129 	if (error != 0)
4130 		return (error);
4131 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4132 	if (vp->v_type == VDIR)
4133 		error = EISDIR;
4134 	else if ((error = vn_writechk(vp)) == 0 &&
4135 	    (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
4136 		vattr_null(&vattr);
4137 		vattr.va_size = SCARG(uap, length);
4138 		error = VOP_SETATTR(vp, &vattr, l->l_cred);
4139 	}
4140 	vput(vp);
4141 	return (error);
4142 }
4143 
4144 /*
4145  * Truncate a file given a file descriptor.
4146  */
4147 /* ARGSUSED */
4148 int
4149 sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval)
4150 {
4151 	/* {
4152 		syscallarg(int) fd;
4153 		syscallarg(int) pad;
4154 		syscallarg(off_t) length;
4155 	} */
4156 	file_t *fp;
4157 	int error, fd = SCARG(uap, fd);
4158 
4159 	fp = fd_getfile(fd);
4160 	if (fp == NULL)
4161 		return EBADF;
4162 	if (fp->f_ops->fo_truncate == NULL)
4163 		error = EOPNOTSUPP;
4164 	else
4165 		error = (*fp->f_ops->fo_truncate)(fp, SCARG(uap, length));
4166 
4167 	fd_putfile(fd);
4168 	return error;
4169 }
4170 
4171 /*
4172  * Sync an open file.
4173  */
4174 /* ARGSUSED */
4175 int
4176 sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
4177 {
4178 	/* {
4179 		syscallarg(int) fd;
4180 	} */
4181 	struct vnode *vp;
4182 	file_t *fp;
4183 	int error;
4184 
4185 	/* fd_getvnode() will use the descriptor for us */
4186 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4187 		return (error);
4188 	vp = fp->f_vnode;
4189 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4190 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
4191 	VOP_UNLOCK(vp);
4192 	fd_putfile(SCARG(uap, fd));
4193 	return (error);
4194 }
4195 
4196 /*
4197  * Sync a range of file data.  API modeled after that found in AIX.
4198  *
4199  * FDATASYNC indicates that we need only save enough metadata to be able
4200  * to re-read the written data.
4201  */
4202 /* ARGSUSED */
4203 int
4204 sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval)
4205 {
4206 	/* {
4207 		syscallarg(int) fd;
4208 		syscallarg(int) flags;
4209 		syscallarg(off_t) start;
4210 		syscallarg(off_t) length;
4211 	} */
4212 	struct vnode *vp;
4213 	file_t *fp;
4214 	int flags, nflags;
4215 	off_t s, e, len;
4216 	int error;
4217 
4218 	/* fd_getvnode() will use the descriptor for us */
4219 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4220 		return (error);
4221 
4222 	if ((fp->f_flag & FWRITE) == 0) {
4223 		error = EBADF;
4224 		goto out;
4225 	}
4226 
4227 	flags = SCARG(uap, flags);
4228 	if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
4229 	    ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
4230 		error = EINVAL;
4231 		goto out;
4232 	}
4233 	/* Now set up the flags for value(s) to pass to VOP_FSYNC() */
4234 	if (flags & FDATASYNC)
4235 		nflags = FSYNC_DATAONLY | FSYNC_WAIT;
4236 	else
4237 		nflags = FSYNC_WAIT;
4238 	if (flags & FDISKSYNC)
4239 		nflags |= FSYNC_CACHE;
4240 
4241 	len = SCARG(uap, length);
4242 	/* If length == 0, we do the whole file, and s = e = 0 will do that */
4243 	if (len) {
4244 		s = SCARG(uap, start);
4245 		if (s < 0 || len < 0 || len > OFF_T_MAX - s) {
4246 			error = EINVAL;
4247 			goto out;
4248 		}
4249 		e = s + len;
4250 		KASSERT(s <= e);
4251 	} else {
4252 		e = 0;
4253 		s = 0;
4254 	}
4255 
4256 	vp = fp->f_vnode;
4257 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4258 	error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
4259 	VOP_UNLOCK(vp);
4260 out:
4261 	fd_putfile(SCARG(uap, fd));
4262 	return (error);
4263 }
4264 
4265 /*
4266  * Sync the data of an open file.
4267  */
4268 /* ARGSUSED */
4269 int
4270 sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval)
4271 {
4272 	/* {
4273 		syscallarg(int) fd;
4274 	} */
4275 	struct vnode *vp;
4276 	file_t *fp;
4277 	int error;
4278 
4279 	/* fd_getvnode() will use the descriptor for us */
4280 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4281 		return (error);
4282 	vp = fp->f_vnode;
4283 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4284 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
4285 	VOP_UNLOCK(vp);
4286 	fd_putfile(SCARG(uap, fd));
4287 	return (error);
4288 }
4289 
4290 /*
4291  * Rename files, (standard) BSD semantics frontend.
4292  */
4293 /* ARGSUSED */
4294 int
4295 sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval)
4296 {
4297 	/* {
4298 		syscallarg(const char *) from;
4299 		syscallarg(const char *) to;
4300 	} */
4301 
4302 	return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4303 	    SCARG(uap, to), UIO_USERSPACE, 0));
4304 }
4305 
4306 int
4307 sys_renameat(struct lwp *l, const struct sys_renameat_args *uap,
4308     register_t *retval)
4309 {
4310 	/* {
4311 		syscallarg(int) fromfd;
4312 		syscallarg(const char *) from;
4313 		syscallarg(int) tofd;
4314 		syscallarg(const char *) to;
4315 	} */
4316 
4317 	return (do_sys_renameat(l, SCARG(uap, fromfd), SCARG(uap, from),
4318 	    SCARG(uap, tofd), SCARG(uap, to), UIO_USERSPACE, 0));
4319 }
4320 
4321 /*
4322  * Rename files, POSIX semantics frontend.
4323  */
4324 /* ARGSUSED */
4325 int
4326 sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval)
4327 {
4328 	/* {
4329 		syscallarg(const char *) from;
4330 		syscallarg(const char *) to;
4331 	} */
4332 
4333 	return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4334 	    SCARG(uap, to), UIO_USERSPACE, 1));
4335 }
4336 
4337 /*
4338  * Rename files.  Source and destination must either both be directories,
4339  * or both not be directories.  If target is a directory, it must be empty.
4340  * If `from' and `to' refer to the same object, the value of the `retain'
4341  * argument is used to determine whether `from' will be
4342  *
4343  * (retain == 0)	deleted unless `from' and `to' refer to the same
4344  *			object in the file system's name space (BSD).
4345  * (retain == 1)	always retained (POSIX).
4346  *
4347  * XXX Synchronize with nfsrv_rename in nfs_serv.c.
4348  */
4349 int
4350 do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
4351 {
4352 	return do_sys_renameat(NULL, AT_FDCWD, from, AT_FDCWD, to, seg, retain);
4353 }
4354 
4355 static int
4356 do_sys_renameat(struct lwp *l, int fromfd, const char *from, int tofd,
4357     const char *to, enum uio_seg seg, int retain)
4358 {
4359 	struct pathbuf *fpb, *tpb;
4360 	struct nameidata fnd, tnd;
4361 	struct vnode *fdvp, *fvp;
4362 	struct vnode *tdvp, *tvp;
4363 	struct mount *mp, *tmp;
4364 	int error;
4365 
4366 	KASSERT(l != NULL || fromfd == AT_FDCWD);
4367 	KASSERT(l != NULL || tofd == AT_FDCWD);
4368 
4369 	error = pathbuf_maybe_copyin(from, seg, &fpb);
4370 	if (error)
4371 		goto out0;
4372 	KASSERT(fpb != NULL);
4373 
4374 	error = pathbuf_maybe_copyin(to, seg, &tpb);
4375 	if (error)
4376 		goto out1;
4377 	KASSERT(tpb != NULL);
4378 
4379 	/*
4380 	 * Lookup from.
4381 	 *
4382 	 * XXX LOCKPARENT is wrong because we don't actually want it
4383 	 * locked yet, but (a) namei is insane, and (b) VOP_RENAME is
4384 	 * insane, so for the time being we need to leave it like this.
4385 	 */
4386 	NDINIT(&fnd, DELETE, (LOCKPARENT | TRYEMULROOT), fpb);
4387 	if ((error = fd_nameiat(l, fromfd, &fnd)) != 0)
4388 		goto out2;
4389 
4390 	/*
4391 	 * Pull out the important results of the lookup, fdvp and fvp.
4392 	 * Of course, fvp is bogus because we're about to unlock fdvp.
4393 	 */
4394 	fdvp = fnd.ni_dvp;
4395 	fvp = fnd.ni_vp;
4396 	mp = fdvp->v_mount;
4397 	KASSERT(fdvp != NULL);
4398 	KASSERT(fvp != NULL);
4399 	KASSERT((fdvp == fvp) || (VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE));
4400 	/*
4401 	 * Bracket the operation with fstrans_start()/fstrans_done().
4402 	 *
4403 	 * Inside the bracket this file system cannot be unmounted so
4404 	 * a vnode on this file system cannot change its v_mount.
4405 	 * A vnode on another file system may still change to dead mount.
4406 	 */
4407 	fstrans_start(mp);
4408 
4409 	/*
4410 	 * Make sure neither fdvp nor fvp is locked.
4411 	 */
4412 	if (fdvp != fvp)
4413 		VOP_UNLOCK(fdvp);
4414 	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4415 	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4416 
4417 	/*
4418 	 * Reject renaming `.' and `..'.  Can't do this until after
4419 	 * namei because we need namei's parsing to find the final
4420 	 * component name.  (namei should just leave us with the final
4421 	 * component name and not look it up itself, but anyway...)
4422 	 *
4423 	 * This was here before because we used to relookup from
4424 	 * instead of to and relookup requires the caller to check
4425 	 * this, but now file systems may depend on this check, so we
4426 	 * must retain it until the file systems are all rototilled.
4427 	 */
4428 	if (((fnd.ni_cnd.cn_namelen == 1) &&
4429 		(fnd.ni_cnd.cn_nameptr[0] == '.')) ||
4430 	    ((fnd.ni_cnd.cn_namelen == 2) &&
4431 		(fnd.ni_cnd.cn_nameptr[0] == '.') &&
4432 		(fnd.ni_cnd.cn_nameptr[1] == '.'))) {
4433 		error = EINVAL;	/* XXX EISDIR?  */
4434 		goto abort0;
4435 	}
4436 
4437 	/*
4438 	 * Lookup to.
4439 	 *
4440 	 * XXX LOCKPARENT is wrong, but...insanity, &c.  Also, using
4441 	 * fvp here to decide whether to add CREATEDIR is a load of
4442 	 * bollocks because fvp might be the wrong node by now, since
4443 	 * fdvp is unlocked.
4444 	 *
4445 	 * XXX Why not pass CREATEDIR always?
4446 	 */
4447 	NDINIT(&tnd, RENAME,
4448 	    (LOCKPARENT | NOCACHE | TRYEMULROOT |
4449 		((fvp->v_type == VDIR)? CREATEDIR : 0)),
4450 	    tpb);
4451 	if ((error = fd_nameiat(l, tofd, &tnd)) != 0)
4452 		goto abort0;
4453 
4454 	/*
4455 	 * Pull out the important results of the lookup, tdvp and tvp.
4456 	 * Of course, tvp is bogus because we're about to unlock tdvp.
4457 	 */
4458 	tdvp = tnd.ni_dvp;
4459 	tvp = tnd.ni_vp;
4460 	KASSERT(tdvp != NULL);
4461 	KASSERT((tdvp == tvp) || (VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE));
4462 
4463 	if (fvp->v_type == VDIR)
4464 		tnd.ni_cnd.cn_flags |= WILLBEDIR;
4465 	/*
4466 	 * Make sure neither tdvp nor tvp is locked.
4467 	 */
4468 	if (tdvp != tvp)
4469 		VOP_UNLOCK(tdvp);
4470 	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
4471 	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
4472 
4473 	/*
4474 	 * Reject renaming onto `.' or `..'.  relookup is unhappy with
4475 	 * these, which is why we must do this here.  Once upon a time
4476 	 * we relooked up from instead of to, and consequently didn't
4477 	 * need this check, but now that we relookup to instead of
4478 	 * from, we need this; and we shall need it forever forward
4479 	 * until the VOP_RENAME protocol changes, because file systems
4480 	 * will no doubt begin to depend on this check.
4481 	 */
4482 	if ((tnd.ni_cnd.cn_namelen == 1) && (tnd.ni_cnd.cn_nameptr[0] == '.')) {
4483 		error = EISDIR;
4484 		goto abort1;
4485 	}
4486 	if ((tnd.ni_cnd.cn_namelen == 2) &&
4487 	    (tnd.ni_cnd.cn_nameptr[0] == '.') &&
4488 	    (tnd.ni_cnd.cn_nameptr[1] == '.')) {
4489 		error = EINVAL;
4490 		goto abort1;
4491 	}
4492 
4493 	/*
4494 	 * Make sure the mount points match.  Although we don't hold
4495 	 * any vnode locks, the v_mount on fdvp file system are stable.
4496 	 *
4497 	 * Unmounting another file system at an inopportune moment may
4498 	 * cause tdvp to disappear and change its v_mount to dead.
4499 	 *
4500 	 * So in either case different v_mount means cross-device rename.
4501 	 */
4502 	KASSERT(mp != NULL);
4503 	tmp = tdvp->v_mount;
4504 
4505 	if (mp != tmp) {
4506 		error = EXDEV;
4507 		goto abort1;
4508 	}
4509 
4510 	/*
4511 	 * Take the vfs rename lock to avoid cross-directory screw cases.
4512 	 * Nothing is locked currently, so taking this lock is safe.
4513 	 */
4514 	error = VFS_RENAMELOCK_ENTER(mp);
4515 	if (error)
4516 		goto abort1;
4517 
4518 	/*
4519 	 * Now fdvp, fvp, tdvp, and (if nonnull) tvp are referenced,
4520 	 * and nothing is locked except for the vfs rename lock.
4521 	 *
4522 	 * The next step is a little rain dance to conform to the
4523 	 * insane lock protocol, even though it does nothing to ward
4524 	 * off race conditions.
4525 	 *
4526 	 * We need tdvp and tvp to be locked.  However, because we have
4527 	 * unlocked tdvp in order to hold no locks while we take the
4528 	 * vfs rename lock, tvp may be wrong here, and we can't safely
4529 	 * lock it even if the sensible file systems will just unlock
4530 	 * it straight away.  Consequently, we must lock tdvp and then
4531 	 * relookup tvp to get it locked.
4532 	 *
4533 	 * Finally, because the VOP_RENAME protocol is brain-damaged
4534 	 * and various file systems insanely depend on the semantics of
4535 	 * this brain damage, the lookup of to must be the last lookup
4536 	 * before VOP_RENAME.
4537 	 */
4538 	vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
4539 	error = relookup(tdvp, &tnd.ni_vp, &tnd.ni_cnd, 0);
4540 	if (error)
4541 		goto abort2;
4542 
4543 	/*
4544 	 * Drop the old tvp and pick up the new one -- which might be
4545 	 * the same, but that doesn't matter to us.  After this, tdvp
4546 	 * and tvp should both be locked.
4547 	 */
4548 	if (tvp != NULL)
4549 		vrele(tvp);
4550 	tvp = tnd.ni_vp;
4551 	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4552 	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
4553 
4554 	/*
4555 	 * The old do_sys_rename had various consistency checks here
4556 	 * involving fvp and tvp.  fvp is bogus already here, and tvp
4557 	 * will become bogus soon in any sensible file system, so the
4558 	 * only purpose in putting these checks here is to give lip
4559 	 * service to these screw cases and to acknowledge that they
4560 	 * exist, not actually to handle them, but here you go
4561 	 * anyway...
4562 	 */
4563 
4564 	/*
4565 	 * Acknowledge that directories and non-directories aren't
4566 	 * supposed to mix.
4567 	 */
4568 	if (tvp != NULL) {
4569 		if ((fvp->v_type == VDIR) && (tvp->v_type != VDIR)) {
4570 			error = ENOTDIR;
4571 			goto abort3;
4572 		} else if ((fvp->v_type != VDIR) && (tvp->v_type == VDIR)) {
4573 			error = EISDIR;
4574 			goto abort3;
4575 		}
4576 	}
4577 
4578 	/*
4579 	 * Acknowledge some random screw case, among the dozens that
4580 	 * might arise.
4581 	 */
4582 	if (fvp == tdvp) {
4583 		error = EINVAL;
4584 		goto abort3;
4585 	}
4586 
4587 	/*
4588 	 * Acknowledge that POSIX has a wacky screw case.
4589 	 *
4590 	 * XXX Eventually the retain flag needs to be passed on to
4591 	 * VOP_RENAME.
4592 	 */
4593 	if (fvp == tvp) {
4594 		if (retain) {
4595 			error = 0;
4596 			goto abort3;
4597 		} else if ((fdvp == tdvp) &&
4598 		    (fnd.ni_cnd.cn_namelen == tnd.ni_cnd.cn_namelen) &&
4599 		    (0 == memcmp(fnd.ni_cnd.cn_nameptr, tnd.ni_cnd.cn_nameptr,
4600 			fnd.ni_cnd.cn_namelen))) {
4601 			error = 0;
4602 			goto abort3;
4603 		}
4604 	}
4605 
4606 	/*
4607 	 * Make sure veriexec can screw us up.  (But a race can screw
4608 	 * up veriexec, of course -- remember, fvp and (soon) tvp are
4609 	 * bogus.)
4610 	 */
4611 #if NVERIEXEC > 0
4612 	{
4613 		char *f1, *f2;
4614 		size_t f1_len;
4615 		size_t f2_len;
4616 
4617 		f1_len = fnd.ni_cnd.cn_namelen + 1;
4618 		f1 = kmem_alloc(f1_len, KM_SLEEP);
4619 		strlcpy(f1, fnd.ni_cnd.cn_nameptr, f1_len);
4620 
4621 		f2_len = tnd.ni_cnd.cn_namelen + 1;
4622 		f2 = kmem_alloc(f2_len, KM_SLEEP);
4623 		strlcpy(f2, tnd.ni_cnd.cn_nameptr, f2_len);
4624 
4625 		error = veriexec_renamechk(curlwp, fvp, f1, tvp, f2);
4626 
4627 		kmem_free(f1, f1_len);
4628 		kmem_free(f2, f2_len);
4629 
4630 		if (error)
4631 			goto abort3;
4632 	}
4633 #endif /* NVERIEXEC > 0 */
4634 
4635 	/*
4636 	 * All ready.  Incant the rename vop.
4637 	 */
4638 	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4639 	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4640 	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4641 	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
4642 	error = VOP_RENAME(fdvp, fvp, &fnd.ni_cnd, tdvp, tvp, &tnd.ni_cnd);
4643 
4644 	/*
4645 	 * VOP_RENAME releases fdvp, fvp, tdvp, and tvp, and unlocks
4646 	 * tdvp and tvp.  But we can't assert any of that.
4647 	 */
4648 	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4649 	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4650 	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
4651 	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
4652 
4653 	/*
4654 	 * So all we have left to do is to drop the rename lock and
4655 	 * destroy the pathbufs.
4656 	 */
4657 	VFS_RENAMELOCK_EXIT(mp);
4658 	fstrans_done(mp);
4659 	goto out2;
4660 
4661 abort3:	if ((tvp != NULL) && (tvp != tdvp))
4662 		VOP_UNLOCK(tvp);
4663 abort2:	VOP_UNLOCK(tdvp);
4664 	VFS_RENAMELOCK_EXIT(mp);
4665 abort1:	VOP_ABORTOP(tdvp, &tnd.ni_cnd);
4666 	vrele(tdvp);
4667 	if (tvp != NULL)
4668 		vrele(tvp);
4669 abort0:	VOP_ABORTOP(fdvp, &fnd.ni_cnd);
4670 	vrele(fdvp);
4671 	vrele(fvp);
4672 	fstrans_done(mp);
4673 out2:	pathbuf_destroy(tpb);
4674 out1:	pathbuf_destroy(fpb);
4675 out0:	return error;
4676 }
4677 
4678 /*
4679  * Make a directory file.
4680  */
4681 /* ARGSUSED */
4682 int
4683 sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
4684 {
4685 	/* {
4686 		syscallarg(const char *) path;
4687 		syscallarg(int) mode;
4688 	} */
4689 
4690 	return do_sys_mkdirat(l, AT_FDCWD, SCARG(uap, path),
4691 	    SCARG(uap, mode), UIO_USERSPACE);
4692 }
4693 
4694 int
4695 sys_mkdirat(struct lwp *l, const struct sys_mkdirat_args *uap,
4696     register_t *retval)
4697 {
4698 	/* {
4699 		syscallarg(int) fd;
4700 		syscallarg(const char *) path;
4701 		syscallarg(int) mode;
4702 	} */
4703 
4704 	return do_sys_mkdirat(l, SCARG(uap, fd), SCARG(uap, path),
4705 	    SCARG(uap, mode), UIO_USERSPACE);
4706 }
4707 
4708 
4709 int
4710 do_sys_mkdir(const char *path, mode_t mode, enum uio_seg seg)
4711 {
4712 	return do_sys_mkdirat(NULL, AT_FDCWD, path, mode, seg);
4713 }
4714 
4715 static int
4716 do_sys_mkdirat(struct lwp *l, int fdat, const char *path, mode_t mode,
4717     enum uio_seg seg)
4718 {
4719 	struct proc *p = curlwp->l_proc;
4720 	struct vnode *vp;
4721 	struct vattr vattr;
4722 	int error;
4723 	struct pathbuf *pb;
4724 	struct nameidata nd;
4725 
4726 	KASSERT(l != NULL || fdat == AT_FDCWD);
4727 
4728 	/* XXX bollocks, should pass in a pathbuf */
4729 	error = pathbuf_maybe_copyin(path, seg, &pb);
4730 	if (error) {
4731 		return error;
4732 	}
4733 
4734 	NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT, pb);
4735 
4736 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
4737 		pathbuf_destroy(pb);
4738 		return (error);
4739 	}
4740 	vp = nd.ni_vp;
4741 	if (vp != NULL) {
4742 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
4743 		if (nd.ni_dvp == vp)
4744 			vrele(nd.ni_dvp);
4745 		else
4746 			vput(nd.ni_dvp);
4747 		vrele(vp);
4748 		pathbuf_destroy(pb);
4749 		return (EEXIST);
4750 	}
4751 	vattr_null(&vattr);
4752 	vattr.va_type = VDIR;
4753 	/* We will read cwdi->cwdi_cmask unlocked. */
4754 	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
4755 	nd.ni_cnd.cn_flags |= WILLBEDIR;
4756 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
4757 	if (!error)
4758 		vrele(nd.ni_vp);
4759 	vput(nd.ni_dvp);
4760 	pathbuf_destroy(pb);
4761 	return (error);
4762 }
4763 
4764 /*
4765  * Remove a directory file.
4766  */
4767 /* ARGSUSED */
4768 int
4769 sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
4770 {
4771 	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path),
4772 	    AT_REMOVEDIR, UIO_USERSPACE);
4773 }
4774 
4775 /*
4776  * Read a block of directory entries in a file system independent format.
4777  */
4778 int
4779 sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval)
4780 {
4781 	/* {
4782 		syscallarg(int) fd;
4783 		syscallarg(char *) buf;
4784 		syscallarg(size_t) count;
4785 	} */
4786 	file_t *fp;
4787 	int error, done;
4788 
4789 	/* fd_getvnode() will use the descriptor for us */
4790 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4791 		return (error);
4792 	if ((fp->f_flag & FREAD) == 0) {
4793 		error = EBADF;
4794 		goto out;
4795 	}
4796 	error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
4797 			SCARG(uap, count), &done, l, 0, 0);
4798 	ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
4799 	*retval = done;
4800  out:
4801 	fd_putfile(SCARG(uap, fd));
4802 	return (error);
4803 }
4804 
4805 /*
4806  * Set the mode mask for creation of filesystem nodes.
4807  */
4808 int
4809 sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
4810 {
4811 	/* {
4812 		syscallarg(mode_t) newmask;
4813 	} */
4814 
4815 	/*
4816 	 * cwdi->cwdi_cmask will be read unlocked elsewhere, and no kind of
4817 	 * serialization with those reads is required.  It's important to
4818 	 * return a coherent answer for the caller of umask() though, and
4819 	 * the atomic operation accomplishes that.
4820 	 */
4821 	*retval = atomic_swap_uint(&curproc->p_cwdi->cwdi_cmask,
4822 	    SCARG(uap, newmask) & ALLPERMS);
4823 
4824 	return (0);
4825 }
4826 
4827 int
4828 dorevoke(struct vnode *vp, kauth_cred_t cred)
4829 {
4830 	struct vattr vattr;
4831 	int error, fs_decision;
4832 
4833 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4834 	error = VOP_GETATTR(vp, &vattr, cred);
4835 	VOP_UNLOCK(vp);
4836 	if (error != 0)
4837 		return error;
4838 	fs_decision = (kauth_cred_geteuid(cred) == vattr.va_uid) ? 0 : EPERM;
4839 	error = kauth_authorize_vnode(cred, KAUTH_VNODE_REVOKE, vp, NULL,
4840 	    fs_decision);
4841 	if (!error)
4842 		VOP_REVOKE(vp, REVOKEALL);
4843 	return (error);
4844 }
4845 
4846 /*
4847  * Void all references to file by ripping underlying filesystem
4848  * away from vnode.
4849  */
4850 /* ARGSUSED */
4851 int
4852 sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval)
4853 {
4854 	/* {
4855 		syscallarg(const char *) path;
4856 	} */
4857 	struct vnode *vp;
4858 	int error;
4859 
4860 	error = namei_simple_user(SCARG(uap, path),
4861 				NSM_FOLLOW_TRYEMULROOT, &vp);
4862 	if (error != 0)
4863 		return (error);
4864 	error = dorevoke(vp, l->l_cred);
4865 	vrele(vp);
4866 	return (error);
4867 }
4868 
4869 /*
4870  * Allocate backing store for a file, filling a hole without having to
4871  * explicitly write anything out.
4872  */
4873 /* ARGSUSED */
4874 int
4875 sys_posix_fallocate(struct lwp *l, const struct sys_posix_fallocate_args *uap,
4876 		register_t *retval)
4877 {
4878 	/* {
4879 		syscallarg(int) fd;
4880 		syscallarg(off_t) pos;
4881 		syscallarg(off_t) len;
4882 	} */
4883 	int fd;
4884 	off_t pos, len;
4885 	struct file *fp;
4886 	struct vnode *vp;
4887 	int error;
4888 
4889 	fd = SCARG(uap, fd);
4890 	pos = SCARG(uap, pos);
4891 	len = SCARG(uap, len);
4892 
4893 	if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
4894 		*retval = EINVAL;
4895 		return 0;
4896 	}
4897 
4898 	error = fd_getvnode(fd, &fp);
4899 	if (error) {
4900 		*retval = error;
4901 		return 0;
4902 	}
4903 	if ((fp->f_flag & FWRITE) == 0) {
4904 		error = EBADF;
4905 		goto fail;
4906 	}
4907 	vp = fp->f_vnode;
4908 
4909 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4910 	if (vp->v_type == VDIR) {
4911 		error = EISDIR;
4912 	} else {
4913 		error = VOP_FALLOCATE(vp, pos, len);
4914 	}
4915 	VOP_UNLOCK(vp);
4916 
4917 fail:
4918 	fd_putfile(fd);
4919 	*retval = error;
4920 	return 0;
4921 }
4922 
4923 /*
4924  * Deallocate backing store for a file, creating a hole. Also used for
4925  * invoking TRIM on disks.
4926  */
4927 /* ARGSUSED */
4928 int
4929 sys_fdiscard(struct lwp *l, const struct sys_fdiscard_args *uap,
4930 		register_t *retval)
4931 {
4932 	/* {
4933 		syscallarg(int) fd;
4934 		syscallarg(off_t) pos;
4935 		syscallarg(off_t) len;
4936 	} */
4937 	int fd;
4938 	off_t pos, len;
4939 	struct file *fp;
4940 	struct vnode *vp;
4941 	int error;
4942 
4943 	fd = SCARG(uap, fd);
4944 	pos = SCARG(uap, pos);
4945 	len = SCARG(uap, len);
4946 
4947 	if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
4948 		return EINVAL;
4949 	}
4950 
4951 	error = fd_getvnode(fd, &fp);
4952 	if (error) {
4953 		return error;
4954 	}
4955 	if ((fp->f_flag & FWRITE) == 0) {
4956 		error = EBADF;
4957 		goto fail;
4958 	}
4959 	vp = fp->f_vnode;
4960 
4961 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4962 	if (vp->v_type == VDIR) {
4963 		error = EISDIR;
4964 	} else {
4965 		error = VOP_FDISCARD(vp, pos, len);
4966 	}
4967 	VOP_UNLOCK(vp);
4968 
4969 fail:
4970 	fd_putfile(fd);
4971 	return error;
4972 }
4973