xref: /netbsd-src/sys/kern/vfs_syscalls.c (revision aef5eb5f59cdfe8314f1b5f78ac04eb144e44010)
1 /*	$NetBSD: vfs_syscalls.c,v 1.555 2022/02/12 15:51:29 thorpej Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1989, 1993
34  *	The Regents of the University of California.  All rights reserved.
35  * (c) UNIX System Laboratories, Inc.
36  * All or some portions of this file are derived from material licensed
37  * to the University of California by American Telephone and Telegraph
38  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39  * the permission of UNIX System Laboratories, Inc.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.42 (Berkeley) 7/31/95
66  */
67 
68 /*
69  * Virtual File System System Calls
70  */
71 
72 #include <sys/cdefs.h>
73 __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.555 2022/02/12 15:51:29 thorpej Exp $");
74 
75 #ifdef _KERNEL_OPT
76 #include "opt_fileassoc.h"
77 #include "veriexec.h"
78 #endif
79 
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/namei.h>
83 #include <sys/filedesc.h>
84 #include <sys/kernel.h>
85 #include <sys/file.h>
86 #include <sys/fcntl.h>
87 #include <sys/stat.h>
88 #include <sys/vnode.h>
89 #include <sys/mount.h>
90 #include <sys/fstrans.h>
91 #include <sys/proc.h>
92 #include <sys/uio.h>
93 #include <sys/kmem.h>
94 #include <sys/dirent.h>
95 #include <sys/sysctl.h>
96 #include <sys/syscallargs.h>
97 #include <sys/vfs_syscalls.h>
98 #include <sys/quota.h>
99 #include <sys/quotactl.h>
100 #include <sys/ktrace.h>
101 #ifdef FILEASSOC
102 #include <sys/fileassoc.h>
103 #endif /* FILEASSOC */
104 #include <sys/extattr.h>
105 #include <sys/verified_exec.h>
106 #include <sys/kauth.h>
107 #include <sys/atomic.h>
108 #include <sys/module.h>
109 #include <sys/buf.h>
110 #include <sys/event.h>
111 #include <sys/compat_stub.h>
112 
113 #include <miscfs/genfs/genfs.h>
114 #include <miscfs/specfs/specdev.h>
115 
116 #include <nfs/rpcv2.h>
117 #include <nfs/nfsproto.h>
118 #include <nfs/nfs.h>
119 #include <nfs/nfs_var.h>
120 
121 /* XXX this shouldn't be here */
122 #ifndef OFF_T_MAX
123 #define OFF_T_MAX __type_max(off_t)
124 #endif
125 
126 static int change_flags(struct vnode *, u_long, struct lwp *);
127 static int change_mode(struct vnode *, int, struct lwp *);
128 static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
129 static int do_sys_openat(lwp_t *, int, const char *, int, int, int *);
130 static int do_sys_mkdirat(struct lwp *l, int, const char *, mode_t,
131     enum uio_seg);
132 static int do_sys_mkfifoat(struct lwp *, int, const char *, mode_t);
133 static int do_sys_symlinkat(struct lwp *, const char *, int, const char *,
134     enum uio_seg);
135 static int do_sys_renameat(struct lwp *l, int, const char *, int, const char *,
136     enum uio_seg, int);
137 static int do_sys_readlinkat(struct lwp *, int, const char *, char *,
138     size_t, register_t *);
139 static int do_sys_unlinkat(struct lwp *, int, const char *, int, enum uio_seg);
140 
141 static int fd_nameiat(struct lwp *, int, struct nameidata *);
142 static int fd_nameiat_simple_user(struct lwp *, int, const char *,
143     namei_simple_flags_t, struct vnode **);
144 
145 /*
146  * This table is used to maintain compatibility with 4.3BSD
147  * and NetBSD 0.9 mount syscalls - and possibly other systems.
148  * Note, the order is important!
149  *
150  * Do not modify this table. It should only contain filesystems
151  * supported by NetBSD 0.9 and 4.3BSD.
152  */
153 const char * const mountcompatnames[] = {
154 	NULL,		/* 0 = MOUNT_NONE */
155 	MOUNT_FFS,	/* 1 = MOUNT_UFS */
156 	MOUNT_NFS,	/* 2 */
157 	MOUNT_MFS,	/* 3 */
158 	MOUNT_MSDOS,	/* 4 */
159 	MOUNT_CD9660,	/* 5 = MOUNT_ISOFS */
160 	MOUNT_FDESC,	/* 6 */
161 	MOUNT_KERNFS,	/* 7 */
162 	NULL,		/* 8 = MOUNT_DEVFS */
163 	MOUNT_AFS,	/* 9 */
164 };
165 
166 const u_int nmountcompatnames = __arraycount(mountcompatnames);
167 
168 /*
169  * Filter event method for EVFILT_FS.
170  */
171 static struct klist fs_klist;
172 static kmutex_t fs_klist_lock;
173 
174 CTASSERT((NOTE_SUBMIT & VQ_MOUNT) == 0);
175 CTASSERT((NOTE_SUBMIT & VQ_UNMOUNT) == 0);
176 
177 void
178 vfs_evfilt_fs_init(void)
179 {
180 	klist_init(&fs_klist);
181 	mutex_init(&fs_klist_lock, MUTEX_DEFAULT, IPL_NONE);
182 }
183 
184 static int
185 filt_fsattach(struct knote *kn)
186 {
187 	mutex_enter(&fs_klist_lock);
188 	kn->kn_flags |= EV_CLEAR;
189 	klist_insert(&fs_klist, kn);
190 	mutex_exit(&fs_klist_lock);
191 
192 	return 0;
193 }
194 
195 static void
196 filt_fsdetach(struct knote *kn)
197 {
198 	mutex_enter(&fs_klist_lock);
199 	klist_remove(&fs_klist, kn);
200 	mutex_exit(&fs_klist_lock);
201 }
202 
203 static int
204 filt_fs(struct knote *kn, long hint)
205 {
206 	int rv;
207 
208 	if (hint & NOTE_SUBMIT) {
209 		KASSERT(mutex_owned(&fs_klist_lock));
210 		kn->kn_fflags |= hint & ~NOTE_SUBMIT;
211 	} else {
212 		mutex_enter(&fs_klist_lock);
213 	}
214 
215 	rv = (kn->kn_fflags != 0);
216 
217 	if ((hint & NOTE_SUBMIT) == 0) {
218 		mutex_exit(&fs_klist_lock);
219 	}
220 
221 	return rv;
222 }
223 
224 /* referenced in kern_event.c */
225 const struct filterops fs_filtops = {
226 	.f_flags = FILTEROP_MPSAFE,
227 	.f_attach = filt_fsattach,
228 	.f_detach = filt_fsdetach,
229 	.f_event = filt_fs,
230 };
231 
232 static int
233 fd_nameiat(struct lwp *l, int fdat, struct nameidata *ndp)
234 {
235 	file_t *dfp;
236 	int error;
237 
238 	if (fdat != AT_FDCWD) {
239 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
240 			goto out;
241 
242 		NDAT(ndp, dfp->f_vnode);
243 	}
244 
245 	error = namei(ndp);
246 
247 	if (fdat != AT_FDCWD)
248 		fd_putfile(fdat);
249 out:
250 	return error;
251 }
252 
253 static int
254 fd_nameiat_simple_user(struct lwp *l, int fdat, const char *path,
255     namei_simple_flags_t sflags, struct vnode **vp_ret)
256 {
257 	file_t *dfp;
258 	struct vnode *dvp;
259 	int error;
260 
261 	if (fdat != AT_FDCWD) {
262 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
263 			goto out;
264 
265 		dvp = dfp->f_vnode;
266 	} else {
267 		dvp = NULL;
268 	}
269 
270 	error = nameiat_simple_user(dvp, path, sflags, vp_ret);
271 
272 	if (fdat != AT_FDCWD)
273 		fd_putfile(fdat);
274 out:
275 	return error;
276 }
277 
278 static int
279 open_setfp(struct lwp *l, file_t *fp, struct vnode *vp, int indx, int flags)
280 {
281 	int error;
282 
283 	fp->f_flag = flags & FMASK;
284 	fp->f_type = DTYPE_VNODE;
285 	fp->f_ops = &vnops;
286 	fp->f_vnode = vp;
287 
288 	if (flags & (O_EXLOCK | O_SHLOCK)) {
289 		struct flock lf;
290 		int type;
291 
292 		lf.l_whence = SEEK_SET;
293 		lf.l_start = 0;
294 		lf.l_len = 0;
295 		if (flags & O_EXLOCK)
296 			lf.l_type = F_WRLCK;
297 		else
298 			lf.l_type = F_RDLCK;
299 		type = F_FLOCK;
300 		if ((flags & FNONBLOCK) == 0)
301 			type |= F_WAIT;
302 		VOP_UNLOCK(vp);
303 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
304 		if (error) {
305 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
306 			fd_abort(l->l_proc, fp, indx);
307 			return error;
308 		}
309 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
310 		atomic_or_uint(&fp->f_flag, FHASLOCK);
311 	}
312 	if (flags & O_CLOEXEC)
313 		fd_set_exclose(l, indx, true);
314 	return 0;
315 }
316 
317 static int
318 mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
319     void *data, size_t *data_len)
320 {
321 	struct mount *mp;
322 	int error = 0, saved_flags;
323 
324 	mp = vp->v_mount;
325 	saved_flags = mp->mnt_flag;
326 
327 	/* We can operate only on VV_ROOT nodes. */
328 	if ((vp->v_vflag & VV_ROOT) == 0) {
329 		error = EINVAL;
330 		goto out;
331 	}
332 
333 	/*
334 	 * We only allow the filesystem to be reloaded if it
335 	 * is currently mounted read-only.  Additionally, we
336 	 * prevent read-write to read-only downgrades.
337 	 */
338 	if ((flags & (MNT_RELOAD | MNT_RDONLY)) != 0 &&
339 	    (mp->mnt_flag & MNT_RDONLY) == 0 &&
340 	    (mp->mnt_iflag & IMNT_CAN_RWTORO) == 0) {
341 		error = EOPNOTSUPP;	/* Needs translation */
342 		goto out;
343 	}
344 
345 	/*
346 	 * Enabling MNT_UNION requires a covered mountpoint and
347 	 * must not happen on the root mount.
348 	 */
349 	if ((flags & MNT_UNION) != 0 && mp->mnt_vnodecovered == NULLVP) {
350 		error = EOPNOTSUPP;
351 		goto out;
352 	}
353 
354 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
355 	    KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
356 	if (error)
357 		goto out;
358 
359 	error = vfs_suspend(mp, 0);
360 	if (error)
361 		goto out;
362 
363 	mutex_enter(mp->mnt_updating);
364 
365 	mp->mnt_flag &= ~MNT_OP_FLAGS;
366 	mp->mnt_flag |= flags & MNT_OP_FLAGS;
367 
368 	/*
369 	 * Set the mount level flags.
370 	 */
371 	if ((flags & MNT_RDONLY) != (mp->mnt_flag & MNT_RDONLY)) {
372 		if ((flags & MNT_RDONLY))
373 			mp->mnt_iflag |= IMNT_WANTRDONLY;
374 		else
375 			mp->mnt_iflag |= IMNT_WANTRDWR;
376 	}
377 	mp->mnt_flag &= ~MNT_BASIC_FLAGS;
378 	mp->mnt_flag |= flags & MNT_BASIC_FLAGS;
379 	if ((mp->mnt_iflag & IMNT_WANTRDONLY))
380 		mp->mnt_flag &= ~MNT_RDONLY;
381 
382 	error = VFS_MOUNT(mp, path, data, data_len);
383 
384 	if (error && data != NULL) {
385 		int error2;
386 
387 		/*
388 		 * Update failed; let's try and see if it was an
389 		 * export request.  For compat with 3.0 and earlier.
390 		 */
391 		error2 = vfs_hooks_reexport(mp, path, data);
392 
393 		/*
394 		 * Only update error code if the export request was
395 		 * understood but some problem occurred while
396 		 * processing it.
397 		 */
398 		if (error2 != EJUSTRETURN)
399 			error = error2;
400 	}
401 
402 	if (error == 0 && (mp->mnt_iflag & IMNT_WANTRDONLY))
403 		mp->mnt_flag |= MNT_RDONLY;
404 	if (error)
405 		mp->mnt_flag = saved_flags;
406 	mp->mnt_flag &= ~MNT_OP_FLAGS;
407 	mp->mnt_iflag &= ~(IMNT_WANTRDONLY | IMNT_WANTRDWR);
408 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
409 		if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0)
410 			vfs_syncer_add_to_worklist(mp);
411 	} else {
412 		if ((mp->mnt_iflag & IMNT_ONWORKLIST) != 0)
413 			vfs_syncer_remove_from_worklist(mp);
414 	}
415 	mutex_exit(mp->mnt_updating);
416 	vfs_resume(mp);
417 
418 	if ((error == 0) && !(saved_flags & MNT_EXTATTR) &&
419 	    (flags & MNT_EXTATTR)) {
420 		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_START,
421 				   NULL, 0, NULL) != 0) {
422 			printf("%s: failed to start extattr, error = %d",
423 			       mp->mnt_stat.f_mntonname, error);
424 			mp->mnt_flag &= ~MNT_EXTATTR;
425 		}
426 	}
427 
428 	if ((error == 0) && (saved_flags & MNT_EXTATTR) &&
429 	    !(flags & MNT_EXTATTR)) {
430 		if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_STOP,
431 				   NULL, 0, NULL) != 0) {
432 			printf("%s: failed to stop extattr, error = %d",
433 			       mp->mnt_stat.f_mntonname, error);
434 			mp->mnt_flag |= MNT_RDONLY;
435 		}
436 	}
437  out:
438 	return (error);
439 }
440 
441 static int
442 mount_get_vfsops(const char *fstype, enum uio_seg type_seg,
443     struct vfsops **vfsops)
444 {
445 	char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
446 	int error;
447 
448 	if (type_seg == UIO_USERSPACE) {
449 		/* Copy file-system type from userspace.  */
450 		error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL);
451 	} else {
452 		error = copystr(fstype, fstypename, sizeof(fstypename), NULL);
453 		KASSERT(error == 0);
454 	}
455 
456 	if (error) {
457 		/*
458 		 * Historically, filesystem types were identified by numbers.
459 		 * If we get an integer for the filesystem type instead of a
460 		 * string, we check to see if it matches one of the historic
461 		 * filesystem types.
462 		 */
463 		u_long fsindex = (u_long)fstype;
464 		if (fsindex >= nmountcompatnames ||
465 		    mountcompatnames[fsindex] == NULL)
466 			return ENODEV;
467 		strlcpy(fstypename, mountcompatnames[fsindex],
468 		    sizeof(fstypename));
469 	}
470 
471 	/* Accept `ufs' as an alias for `ffs', for compatibility. */
472 	if (strcmp(fstypename, "ufs") == 0)
473 		fstypename[0] = 'f';
474 
475 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
476 		return 0;
477 
478 	/* If we can autoload a vfs module, try again */
479 	(void)module_autoload(fstypename, MODULE_CLASS_VFS);
480 
481 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
482 		return 0;
483 
484 	return ENODEV;
485 }
486 
487 static int
488 mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
489     void *data, size_t *data_len)
490 {
491 	struct mount *mp;
492 	int error;
493 
494 	/* If MNT_GETARGS is specified, it should be the only flag. */
495 	if (flags & ~MNT_GETARGS)
496 		return EINVAL;
497 
498 	mp = vp->v_mount;
499 
500 	/* XXX: probably some notion of "can see" here if we want isolation. */
501 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
502 	    KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
503 	if (error)
504 		return error;
505 
506 	if ((vp->v_vflag & VV_ROOT) == 0)
507 		return EINVAL;
508 
509 	if (vfs_busy(mp))
510 		return EPERM;
511 
512 	mutex_enter(mp->mnt_updating);
513 	mp->mnt_flag &= ~MNT_OP_FLAGS;
514 	mp->mnt_flag |= MNT_GETARGS;
515 	error = VFS_MOUNT(mp, path, data, data_len);
516 	mp->mnt_flag &= ~MNT_OP_FLAGS;
517 	mutex_exit(mp->mnt_updating);
518 
519 	vfs_unbusy(mp);
520 	return (error);
521 }
522 
523 int
524 sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval)
525 {
526 	/* {
527 		syscallarg(const char *) type;
528 		syscallarg(const char *) path;
529 		syscallarg(int) flags;
530 		syscallarg(void *) data;
531 		syscallarg(size_t) data_len;
532 	} */
533 
534 	return do_sys_mount(l, SCARG(uap, type), UIO_USERSPACE, SCARG(uap, path),
535 	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE,
536 	    SCARG(uap, data_len), retval);
537 }
538 
539 int
540 do_sys_mount(struct lwp *l, const char *type, enum uio_seg type_seg,
541     const char *path, int flags, void *data, enum uio_seg data_seg,
542     size_t data_len, register_t *retval)
543 {
544 	struct vfsops *vfsops = NULL;	/* XXX gcc4.8 */
545 	struct vnode *vp;
546 	void *data_buf = data;
547 	bool vfsopsrele = false;
548 	size_t alloc_sz = 0;
549 	int error;
550 
551 	/*
552 	 * Get vnode to be covered
553 	 */
554 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
555 	if (error != 0) {
556 		vp = NULL;
557 		goto done;
558 	}
559 
560 	if (flags & (MNT_GETARGS | MNT_UPDATE)) {
561 		vfsops = vp->v_mount->mnt_op;
562 	} else {
563 		/* 'type' is userspace */
564 		error = mount_get_vfsops(type, type_seg, &vfsops);
565 		if (error != 0)
566 			goto done;
567 		vfsopsrele = true;
568 	}
569 
570 	/*
571 	 * We allow data to be NULL, even for userspace. Some fs's don't need
572 	 * it. The others will handle NULL.
573 	 */
574 	if (data != NULL && data_seg == UIO_USERSPACE) {
575 		if (data_len == 0) {
576 			/* No length supplied, use default for filesystem */
577 			data_len = vfsops->vfs_min_mount_data;
578 
579 			/*
580 			 * Hopefully a longer buffer won't make copyin() fail.
581 			 * For compatibility with 3.0 and earlier.
582 			 */
583 			if (flags & MNT_UPDATE
584 			    && data_len < sizeof (struct mnt_export_args30))
585 				data_len = sizeof (struct mnt_export_args30);
586 		}
587 		if ((data_len == 0) || (data_len > VFS_MAX_MOUNT_DATA)) {
588 			error = EINVAL;
589 			goto done;
590 		}
591 		alloc_sz = data_len;
592 		data_buf = kmem_alloc(alloc_sz, KM_SLEEP);
593 
594 		/* NFS needs the buffer even for mnt_getargs .... */
595 		error = copyin(data, data_buf, data_len);
596 		if (error != 0)
597 			goto done;
598 	}
599 
600 	if (flags & MNT_GETARGS) {
601 		if (data_len == 0) {
602 			error = EINVAL;
603 			goto done;
604 		}
605 		error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
606 		if (error != 0)
607 			goto done;
608 		if (data_seg == UIO_USERSPACE)
609 			error = copyout(data_buf, data, data_len);
610 		*retval = data_len;
611 	} else if (flags & MNT_UPDATE) {
612 		error = mount_update(l, vp, path, flags, data_buf, &data_len);
613 	} else {
614 		/* Locking is handled internally in mount_domount(). */
615 		KASSERT(vfsopsrele == true);
616 		error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
617 		    &data_len);
618 		vfsopsrele = false;
619 	}
620 	if (!error) {
621 		mutex_enter(&fs_klist_lock);
622 		KNOTE(&fs_klist, NOTE_SUBMIT | VQ_MOUNT);
623 		mutex_exit(&fs_klist_lock);
624 	}
625 
626     done:
627 	if (vfsopsrele)
628 		vfs_delref(vfsops);
629     	if (vp != NULL) {
630 	    	vrele(vp);
631 	}
632 	if (data_buf != data)
633 		kmem_free(data_buf, alloc_sz);
634 	return (error);
635 }
636 
637 /*
638  * Unmount a file system.
639  *
640  * Note: unmount takes a path to the vnode mounted on as argument,
641  * not special file (as before).
642  */
643 /* ARGSUSED */
644 int
645 sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval)
646 {
647 	/* {
648 		syscallarg(const char *) path;
649 		syscallarg(int) flags;
650 	} */
651 	struct vnode *vp;
652 	struct mount *mp;
653 	int error;
654 	struct pathbuf *pb;
655 	struct nameidata nd;
656 
657 	error = pathbuf_copyin(SCARG(uap, path), &pb);
658 	if (error) {
659 		return error;
660 	}
661 
662 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb);
663 	if ((error = namei(&nd)) != 0) {
664 		pathbuf_destroy(pb);
665 		return error;
666 	}
667 	vp = nd.ni_vp;
668 	pathbuf_destroy(pb);
669 
670 	mp = vp->v_mount;
671 	vfs_ref(mp);
672 	VOP_UNLOCK(vp);
673 
674 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
675 	    KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
676 	if (error) {
677 		vrele(vp);
678 		vfs_rele(mp);
679 		return (error);
680 	}
681 
682 	/*
683 	 * Don't allow unmounting the root file system.
684 	 */
685 	if (mp->mnt_flag & MNT_ROOTFS) {
686 		vrele(vp);
687 		vfs_rele(mp);
688 		return (EINVAL);
689 	}
690 
691 	/*
692 	 * Must be the root of the filesystem
693 	 */
694 	if ((vp->v_vflag & VV_ROOT) == 0) {
695 		vrele(vp);
696 		vfs_rele(mp);
697 		return (EINVAL);
698 	}
699 
700 	vrele(vp);
701 	error = dounmount(mp, SCARG(uap, flags), l);
702 	vfs_rele(mp);
703 	if (!error) {
704 		mutex_enter(&fs_klist_lock);
705 		KNOTE(&fs_klist, NOTE_SUBMIT | VQ_UNMOUNT);
706 		mutex_exit(&fs_klist_lock);
707 	}
708 	return error;
709 }
710 
711 /*
712  * Sync each mounted filesystem.
713  */
#ifdef DEBUG
/* When non-zero, do_sys_sync() calls vfs_bufstats() after syncing. */
int syncprt = 0;
struct ctldebug debug0 = { "syncprt", &syncprt };
#endif /* DEBUG */
718 
719 void
720 do_sys_sync(struct lwp *l)
721 {
722 	mount_iterator_t *iter;
723 	struct mount *mp;
724 	int asyncflag;
725 
726 	mountlist_iterator_init(&iter);
727 	while ((mp = mountlist_iterator_next(iter)) != NULL) {
728 		mutex_enter(mp->mnt_updating);
729 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
730 			asyncflag = mp->mnt_flag & MNT_ASYNC;
731 			mp->mnt_flag &= ~MNT_ASYNC;
732 			VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
733 			if (asyncflag)
734 				 mp->mnt_flag |= MNT_ASYNC;
735 		}
736 		mutex_exit(mp->mnt_updating);
737 	}
738 	mountlist_iterator_destroy(iter);
739 #ifdef DEBUG
740 	if (syncprt)
741 		vfs_bufstats();
742 #endif /* DEBUG */
743 }
744 
745 static bool
746 sync_vnode_filter(void *cookie, vnode_t *vp)
747 {
748 
749 	if (vp->v_numoutput > 0) {
750 		++*(int *)cookie;
751 	}
752 	return false;
753 }
754 
755 int
756 vfs_syncwait(void)
757 {
758 	int nbusy, nbusy_prev, iter;
759 	struct vnode_iterator *vniter;
760 	mount_iterator_t *mpiter;
761 	struct mount *mp;
762 
763 	for (nbusy_prev = 0, iter = 0; iter < 20;) {
764 		nbusy = 0;
765 		mountlist_iterator_init(&mpiter);
766 		while ((mp = mountlist_iterator_next(mpiter)) != NULL) {
767 			vnode_t *vp __diagused;
768 			vfs_vnode_iterator_init(mp, &vniter);
769 			vp = vfs_vnode_iterator_next(vniter,
770 			    sync_vnode_filter, &nbusy);
771 			KASSERT(vp == NULL);
772 			vfs_vnode_iterator_destroy(vniter);
773 		}
774 		mountlist_iterator_destroy(mpiter);
775 
776 		if (nbusy == 0)
777 			break;
778 		if (nbusy_prev == 0)
779 			nbusy_prev = nbusy;
780 		printf("%d ", nbusy);
781 		kpause("syncwait", false, MAX(1, hz / 25 * iter), NULL);
782 		if (nbusy >= nbusy_prev) /* we didn't flush anything */
783 			iter++;
784 		else
785 			nbusy_prev = nbusy;
786 	}
787 
788 	if (nbusy) {
789 #if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
790 		printf("giving up\nPrinting vnodes for busy buffers\n");
791 		mountlist_iterator_init(&mpiter);
792 		while ((mp = mountlist_iterator_next(mpiter)) != NULL) {
793 			vnode_t *vp;
794 			vfs_vnode_iterator_init(mp, &vniter);
795 			vp = vfs_vnode_iterator_next(vniter,
796 			    NULL, NULL);
797 			mutex_enter(vp->v_interlock);
798 			if (vp->v_numoutput > 0)
799 				vprint(NULL, vp);
800 			mutex_exit(vp->v_interlock);
801 			vrele(vp);
802 			vfs_vnode_iterator_destroy(vniter);
803 		}
804 		mountlist_iterator_destroy(mpiter);
805 #endif
806 	}
807 
808 	return nbusy;
809 }
810 
811 /* ARGSUSED */
812 int
813 sys_sync(struct lwp *l, const void *v, register_t *retval)
814 {
815 	do_sys_sync(l);
816 	return (0);
817 }
818 
819 
820 /*
821  * Access or change filesystem quotas.
822  *
823  * (this is really 14 different calls bundled into one)
824  */
825 
826 static int
827 do_sys_quotactl_stat(struct mount *mp, struct quotastat *info_u)
828 {
829 	struct quotastat info_k;
830 	int error;
831 
832 	/* ensure any padding bytes are cleared */
833 	memset(&info_k, 0, sizeof(info_k));
834 
835 	error = vfs_quotactl_stat(mp, &info_k);
836 	if (error) {
837 		return error;
838 	}
839 
840 	return copyout(&info_k, info_u, sizeof(info_k));
841 }
842 
843 static int
844 do_sys_quotactl_idtypestat(struct mount *mp, int idtype,
845     struct quotaidtypestat *info_u)
846 {
847 	struct quotaidtypestat info_k;
848 	int error;
849 
850 	/* ensure any padding bytes are cleared */
851 	memset(&info_k, 0, sizeof(info_k));
852 
853 	error = vfs_quotactl_idtypestat(mp, idtype, &info_k);
854 	if (error) {
855 		return error;
856 	}
857 
858 	return copyout(&info_k, info_u, sizeof(info_k));
859 }
860 
861 static int
862 do_sys_quotactl_objtypestat(struct mount *mp, int objtype,
863     struct quotaobjtypestat *info_u)
864 {
865 	struct quotaobjtypestat info_k;
866 	int error;
867 
868 	/* ensure any padding bytes are cleared */
869 	memset(&info_k, 0, sizeof(info_k));
870 
871 	error = vfs_quotactl_objtypestat(mp, objtype, &info_k);
872 	if (error) {
873 		return error;
874 	}
875 
876 	return copyout(&info_k, info_u, sizeof(info_k));
877 }
878 
879 static int
880 do_sys_quotactl_get(struct mount *mp, const struct quotakey *key_u,
881     struct quotaval *val_u)
882 {
883 	struct quotakey key_k;
884 	struct quotaval val_k;
885 	int error;
886 
887 	/* ensure any padding bytes are cleared */
888 	memset(&val_k, 0, sizeof(val_k));
889 
890 	error = copyin(key_u, &key_k, sizeof(key_k));
891 	if (error) {
892 		return error;
893 	}
894 
895 	error = vfs_quotactl_get(mp, &key_k, &val_k);
896 	if (error) {
897 		return error;
898 	}
899 
900 	return copyout(&val_k, val_u, sizeof(val_k));
901 }
902 
903 static int
904 do_sys_quotactl_put(struct mount *mp, const struct quotakey *key_u,
905     const struct quotaval *val_u)
906 {
907 	struct quotakey key_k;
908 	struct quotaval val_k;
909 	int error;
910 
911 	error = copyin(key_u, &key_k, sizeof(key_k));
912 	if (error) {
913 		return error;
914 	}
915 
916 	error = copyin(val_u, &val_k, sizeof(val_k));
917 	if (error) {
918 		return error;
919 	}
920 
921 	return vfs_quotactl_put(mp, &key_k, &val_k);
922 }
923 
924 static int
925 do_sys_quotactl_del(struct mount *mp, const struct quotakey *key_u)
926 {
927 	struct quotakey key_k;
928 	int error;
929 
930 	error = copyin(key_u, &key_k, sizeof(key_k));
931 	if (error) {
932 		return error;
933 	}
934 
935 	return vfs_quotactl_del(mp, &key_k);
936 }
937 
938 static int
939 do_sys_quotactl_cursoropen(struct mount *mp, struct quotakcursor *cursor_u)
940 {
941 	struct quotakcursor cursor_k;
942 	int error;
943 
944 	/* ensure any padding bytes are cleared */
945 	memset(&cursor_k, 0, sizeof(cursor_k));
946 
947 	error = vfs_quotactl_cursoropen(mp, &cursor_k);
948 	if (error) {
949 		return error;
950 	}
951 
952 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
953 }
954 
955 static int
956 do_sys_quotactl_cursorclose(struct mount *mp, struct quotakcursor *cursor_u)
957 {
958 	struct quotakcursor cursor_k;
959 	int error;
960 
961 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
962 	if (error) {
963 		return error;
964 	}
965 
966 	return vfs_quotactl_cursorclose(mp, &cursor_k);
967 }
968 
969 static int
970 do_sys_quotactl_cursorskipidtype(struct mount *mp,
971     struct quotakcursor *cursor_u, int idtype)
972 {
973 	struct quotakcursor cursor_k;
974 	int error;
975 
976 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
977 	if (error) {
978 		return error;
979 	}
980 
981 	error = vfs_quotactl_cursorskipidtype(mp, &cursor_k, idtype);
982 	if (error) {
983 		return error;
984 	}
985 
986 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
987 }
988 
989 static int
990 do_sys_quotactl_cursorget(struct mount *mp, struct quotakcursor *cursor_u,
991     struct quotakey *keys_u, struct quotaval *vals_u, unsigned maxnum,
992     unsigned *ret_u)
993 {
994 #define CGET_STACK_MAX 8
995 	struct quotakcursor cursor_k;
996 	struct quotakey stackkeys[CGET_STACK_MAX];
997 	struct quotaval stackvals[CGET_STACK_MAX];
998 	struct quotakey *keys_k;
999 	struct quotaval *vals_k;
1000 	unsigned ret_k;
1001 	int error;
1002 
1003 	if (maxnum > 128) {
1004 		maxnum = 128;
1005 	}
1006 
1007 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1008 	if (error) {
1009 		return error;
1010 	}
1011 
1012 	if (maxnum <= CGET_STACK_MAX) {
1013 		keys_k = stackkeys;
1014 		vals_k = stackvals;
1015 		/* ensure any padding bytes are cleared */
1016 		memset(keys_k, 0, maxnum * sizeof(keys_k[0]));
1017 		memset(vals_k, 0, maxnum * sizeof(vals_k[0]));
1018 	} else {
1019 		keys_k = kmem_zalloc(maxnum * sizeof(keys_k[0]), KM_SLEEP);
1020 		vals_k = kmem_zalloc(maxnum * sizeof(vals_k[0]), KM_SLEEP);
1021 	}
1022 
1023 	error = vfs_quotactl_cursorget(mp, &cursor_k, keys_k, vals_k, maxnum,
1024 				       &ret_k);
1025 	if (error) {
1026 		goto fail;
1027 	}
1028 
1029 	error = copyout(keys_k, keys_u, ret_k * sizeof(keys_k[0]));
1030 	if (error) {
1031 		goto fail;
1032 	}
1033 
1034 	error = copyout(vals_k, vals_u, ret_k * sizeof(vals_k[0]));
1035 	if (error) {
1036 		goto fail;
1037 	}
1038 
1039 	error = copyout(&ret_k, ret_u, sizeof(ret_k));
1040 	if (error) {
1041 		goto fail;
1042 	}
1043 
1044 	/* do last to maximize the chance of being able to recover a failure */
1045 	error = copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1046 
1047 fail:
1048 	if (keys_k != stackkeys) {
1049 		kmem_free(keys_k, maxnum * sizeof(keys_k[0]));
1050 	}
1051 	if (vals_k != stackvals) {
1052 		kmem_free(vals_k, maxnum * sizeof(vals_k[0]));
1053 	}
1054 	return error;
1055 }
1056 
1057 static int
1058 do_sys_quotactl_cursoratend(struct mount *mp, struct quotakcursor *cursor_u,
1059     int *ret_u)
1060 {
1061 	struct quotakcursor cursor_k;
1062 	int ret_k;
1063 	int error;
1064 
1065 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1066 	if (error) {
1067 		return error;
1068 	}
1069 
1070 	error = vfs_quotactl_cursoratend(mp, &cursor_k, &ret_k);
1071 	if (error) {
1072 		return error;
1073 	}
1074 
1075 	error = copyout(&ret_k, ret_u, sizeof(ret_k));
1076 	if (error) {
1077 		return error;
1078 	}
1079 
1080 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1081 }
1082 
1083 static int
1084 do_sys_quotactl_cursorrewind(struct mount *mp, struct quotakcursor *cursor_u)
1085 {
1086 	struct quotakcursor cursor_k;
1087 	int error;
1088 
1089 	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1090 	if (error) {
1091 		return error;
1092 	}
1093 
1094 	error = vfs_quotactl_cursorrewind(mp, &cursor_k);
1095 	if (error) {
1096 		return error;
1097 	}
1098 
1099 	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1100 }
1101 
1102 static int
1103 do_sys_quotactl_quotaon(struct mount *mp, int idtype, const char *path_u)
1104 {
1105 	char *path_k;
1106 	int error;
1107 
1108 	/* XXX this should probably be a struct pathbuf */
1109 	path_k = PNBUF_GET();
1110 	error = copyin(path_u, path_k, PATH_MAX);
1111 	if (error) {
1112 		PNBUF_PUT(path_k);
1113 		return error;
1114 	}
1115 
1116 	error = vfs_quotactl_quotaon(mp, idtype, path_k);
1117 
1118 	PNBUF_PUT(path_k);
1119 	return error;
1120 }
1121 
/*
 * QUOTACTL_QUOTAOFF: turn quotas off for id type idtype on mp.
 */
static int
do_sys_quotactl_quotaoff(struct mount *mp, int idtype)
{
	int error;

	error = vfs_quotactl_quotaoff(mp, idtype);
	return error;
}
1127 
1128 int
1129 do_sys_quotactl(const char *path_u, const struct quotactl_args *args)
1130 {
1131 	struct mount *mp;
1132 	struct vnode *vp;
1133 	int error;
1134 
1135 	error = namei_simple_user(path_u, NSM_FOLLOW_TRYEMULROOT, &vp);
1136 	if (error != 0)
1137 		return (error);
1138 	mp = vp->v_mount;
1139 
1140 	switch (args->qc_op) {
1141 	    case QUOTACTL_STAT:
1142 		error = do_sys_quotactl_stat(mp, args->u.stat.qc_info);
1143 		break;
1144 	    case QUOTACTL_IDTYPESTAT:
1145 		error = do_sys_quotactl_idtypestat(mp,
1146 				args->u.idtypestat.qc_idtype,
1147 				args->u.idtypestat.qc_info);
1148 		break;
1149 	    case QUOTACTL_OBJTYPESTAT:
1150 		error = do_sys_quotactl_objtypestat(mp,
1151 				args->u.objtypestat.qc_objtype,
1152 				args->u.objtypestat.qc_info);
1153 		break;
1154 	    case QUOTACTL_GET:
1155 		error = do_sys_quotactl_get(mp,
1156 				args->u.get.qc_key,
1157 				args->u.get.qc_val);
1158 		break;
1159 	    case QUOTACTL_PUT:
1160 		error = do_sys_quotactl_put(mp,
1161 				args->u.put.qc_key,
1162 				args->u.put.qc_val);
1163 		break;
1164 	    case QUOTACTL_DEL:
1165 		error = do_sys_quotactl_del(mp, args->u.del.qc_key);
1166 		break;
1167 	    case QUOTACTL_CURSOROPEN:
1168 		error = do_sys_quotactl_cursoropen(mp,
1169 				args->u.cursoropen.qc_cursor);
1170 		break;
1171 	    case QUOTACTL_CURSORCLOSE:
1172 		error = do_sys_quotactl_cursorclose(mp,
1173 				args->u.cursorclose.qc_cursor);
1174 		break;
1175 	    case QUOTACTL_CURSORSKIPIDTYPE:
1176 		error = do_sys_quotactl_cursorskipidtype(mp,
1177 				args->u.cursorskipidtype.qc_cursor,
1178 				args->u.cursorskipidtype.qc_idtype);
1179 		break;
1180 	    case QUOTACTL_CURSORGET:
1181 		error = do_sys_quotactl_cursorget(mp,
1182 				args->u.cursorget.qc_cursor,
1183 				args->u.cursorget.qc_keys,
1184 				args->u.cursorget.qc_vals,
1185 				args->u.cursorget.qc_maxnum,
1186 				args->u.cursorget.qc_ret);
1187 		break;
1188 	    case QUOTACTL_CURSORATEND:
1189 		error = do_sys_quotactl_cursoratend(mp,
1190 				args->u.cursoratend.qc_cursor,
1191 				args->u.cursoratend.qc_ret);
1192 		break;
1193 	    case QUOTACTL_CURSORREWIND:
1194 		error = do_sys_quotactl_cursorrewind(mp,
1195 				args->u.cursorrewind.qc_cursor);
1196 		break;
1197 	    case QUOTACTL_QUOTAON:
1198 		error = do_sys_quotactl_quotaon(mp,
1199 				args->u.quotaon.qc_idtype,
1200 				args->u.quotaon.qc_quotafile);
1201 		break;
1202 	    case QUOTACTL_QUOTAOFF:
1203 		error = do_sys_quotactl_quotaoff(mp,
1204 				args->u.quotaoff.qc_idtype);
1205 		break;
1206 	    default:
1207 		error = EINVAL;
1208 		break;
1209 	}
1210 
1211 	vrele(vp);
1212 	return error;
1213 }
1214 
1215 /* ARGSUSED */
1216 int
1217 sys___quotactl(struct lwp *l, const struct sys___quotactl_args *uap,
1218     register_t *retval)
1219 {
1220 	/* {
1221 		syscallarg(const char *) path;
1222 		syscallarg(struct quotactl_args *) args;
1223 	} */
1224 	struct quotactl_args args;
1225 	int error;
1226 
1227 	error = copyin(SCARG(uap, args), &args, sizeof(args));
1228 	if (error) {
1229 		return error;
1230 	}
1231 
1232 	return do_sys_quotactl(SCARG(uap, path), &args);
1233 }
1234 
1235 int
1236 dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
1237     int root)
1238 {
1239 	struct cwdinfo *cwdi = l->l_proc->p_cwdi;
1240 	bool chrooted;
1241 	int error = 0;
1242 
1243 	KASSERT(l == curlwp);
1244 
1245 	/*
1246 	 * This is safe unlocked.  cwdi_rdir never goes non-NULL -> NULL,
1247 	 * since it would imply chroots can be escaped.  Just make sure this
1248 	 * routine is self-consistent.
1249 	 */
1250 	chrooted = (atomic_load_relaxed(&cwdi->cwdi_rdir) != NULL);
1251 
1252 	/*
1253 	 * If MNT_NOWAIT or MNT_LAZY is specified, do not
1254 	 * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
1255 	 * overrides MNT_NOWAIT.
1256 	 */
1257 	if (flags == MNT_NOWAIT	|| flags == MNT_LAZY ||
1258 	    (flags != MNT_WAIT && flags != 0)) {
1259 		memcpy(sp, &mp->mnt_stat, sizeof(*sp));
1260 	} else {
1261 		/* Get the filesystem stats now */
1262 		memset(sp, 0, sizeof(*sp));
1263 		if ((error = VFS_STATVFS(mp, sp)) != 0)
1264 			return error;
1265 		if (!chrooted)
1266 			(void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
1267 	}
1268 
1269 	if (chrooted) {
1270 		size_t len;
1271 		char *bp;
1272 		char c;
1273 		char *path = PNBUF_GET();
1274 
1275 		bp = path + MAXPATHLEN;
1276 		*--bp = '\0';
1277 		rw_enter(&cwdi->cwdi_lock, RW_READER);
1278 		error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
1279 		    MAXPATHLEN / 2, 0, l);
1280 		rw_exit(&cwdi->cwdi_lock);
1281 		if (error) {
1282 			PNBUF_PUT(path);
1283 			return error;
1284 		}
1285 		len = strlen(bp);
1286 		if (len != 1) {
1287 			/*
1288 			 * for mount points that are below our root, we can see
1289 			 * them, so we fix up the pathname and return them. The
1290 			 * rest we cannot see, so we don't allow viewing the
1291 			 * data.
1292 			 */
1293 			if (strncmp(bp, sp->f_mntonname, len) == 0 &&
1294 			    ((c = sp->f_mntonname[len]) == '/' || c == '\0')) {
1295 				(void)strlcpy(sp->f_mntonname,
1296 				    c == '\0' ? "/" : &sp->f_mntonname[len],
1297 				    sizeof(sp->f_mntonname));
1298 			} else {
1299 				if (root)
1300 					(void)strlcpy(sp->f_mntonname, "/",
1301 					    sizeof(sp->f_mntonname));
1302 				else
1303 					error = EPERM;
1304 			}
1305 		}
1306 		PNBUF_PUT(path);
1307 	}
1308 	sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
1309 	return error;
1310 }
1311 
1312 /*
1313  * Get filesystem statistics by path.
1314  */
1315 int
1316 do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
1317 {
1318 	struct mount *mp;
1319 	int error;
1320 	struct vnode *vp;
1321 
1322 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
1323 	if (error != 0)
1324 		return error;
1325 	mp = vp->v_mount;
1326 	error = dostatvfs(mp, sb, l, flags, 1);
1327 	vrele(vp);
1328 	return error;
1329 }
1330 
1331 /* ARGSUSED */
1332 int
1333 sys___statvfs190(struct lwp *l, const struct sys___statvfs190_args *uap, register_t *retval)
1334 {
1335 	/* {
1336 		syscallarg(const char *) path;
1337 		syscallarg(struct statvfs *) buf;
1338 		syscallarg(int) flags;
1339 	} */
1340 	struct statvfs *sb;
1341 	int error;
1342 
1343 	sb = STATVFSBUF_GET();
1344 	error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
1345 	if (error == 0)
1346 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1347 	STATVFSBUF_PUT(sb);
1348 	return error;
1349 }
1350 
1351 /*
1352  * Get filesystem statistics by fd.
1353  */
1354 int
1355 do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
1356 {
1357 	file_t *fp;
1358 	struct mount *mp;
1359 	int error;
1360 
1361 	/* fd_getvnode() will use the descriptor for us */
1362 	if ((error = fd_getvnode(fd, &fp)) != 0)
1363 		return (error);
1364 	mp = fp->f_vnode->v_mount;
1365 	error = dostatvfs(mp, sb, curlwp, flags, 1);
1366 	fd_putfile(fd);
1367 	return error;
1368 }
1369 
1370 /* ARGSUSED */
1371 int
1372 sys___fstatvfs190(struct lwp *l, const struct sys___fstatvfs190_args *uap, register_t *retval)
1373 {
1374 	/* {
1375 		syscallarg(int) fd;
1376 		syscallarg(struct statvfs *) buf;
1377 		syscallarg(int) flags;
1378 	} */
1379 	struct statvfs *sb;
1380 	int error;
1381 
1382 	sb = STATVFSBUF_GET();
1383 	error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
1384 	if (error == 0)
1385 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1386 	STATVFSBUF_PUT(sb);
1387 	return error;
1388 }
1389 
1390 
1391 /*
1392  * Get statistics on all filesystems.
1393  */
1394 int
1395 do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
1396     int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
1397     register_t *retval)
1398 {
1399 	int root = 0;
1400 	mount_iterator_t *iter;
1401 	struct proc *p = l->l_proc;
1402 	struct mount *mp;
1403 	struct statvfs *sb;
1404 	size_t count, maxcount;
1405 	int error = 0;
1406 
1407 	sb = STATVFSBUF_GET();
1408 	maxcount = bufsize / entry_sz;
1409 	count = 0;
1410 	mountlist_iterator_init(&iter);
1411 	while ((mp = mountlist_iterator_next(iter)) != NULL) {
1412 		if (sfsp && count < maxcount) {
1413 			error = dostatvfs(mp, sb, l, flags, 0);
1414 			if (error) {
1415 				error = 0;
1416 				continue;
1417 			}
1418 			error = copyfn(sb, sfsp, entry_sz);
1419 			if (error)
1420 				goto out;
1421 			sfsp = (char *)sfsp + entry_sz;
1422 			root |= strcmp(sb->f_mntonname, "/") == 0;
1423 		}
1424 		count++;
1425 	}
1426 
1427 	if (root == 0 && p->p_cwdi->cwdi_rdir) {
1428 		/*
1429 		 * fake a root entry
1430 		 */
1431 		error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
1432 		    sb, l, flags, 1);
1433 		if (error != 0)
1434 			goto out;
1435 		if (sfsp) {
1436 			error = copyfn(sb, sfsp, entry_sz);
1437 			if (error != 0)
1438 				goto out;
1439 		}
1440 		count++;
1441 	}
1442 	if (sfsp && count > maxcount)
1443 		*retval = maxcount;
1444 	else
1445 		*retval = count;
1446 out:
1447 	mountlist_iterator_destroy(iter);
1448 	STATVFSBUF_PUT(sb);
1449 	return error;
1450 }
1451 
1452 int
1453 sys___getvfsstat90(struct lwp *l, const struct sys___getvfsstat90_args *uap,
1454     register_t *retval)
1455 {
1456 	/* {
1457 		syscallarg(struct statvfs *) buf;
1458 		syscallarg(size_t) bufsize;
1459 		syscallarg(int) flags;
1460 	} */
1461 
1462 	return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
1463 	    SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
1464 }
1465 
1466 /*
1467  * Change current working directory to a given file descriptor.
1468  */
1469 int
1470 do_sys_fchdir(struct lwp *l, int fd, register_t *retval)
1471 {
1472 	struct proc *p = l->l_proc;
1473 	struct cwdinfo *cwdi;
1474 	struct vnode *vp, *tdp;
1475 	struct mount *mp;
1476 	file_t *fp;
1477 	int error;
1478 
1479 	/* fd_getvnode() will use the descriptor for us */
1480 	if ((error = fd_getvnode(fd, &fp)) != 0)
1481 		return error;
1482 	vp = fp->f_vnode;
1483 
1484 	vref(vp);
1485 	vn_lock(vp, LK_SHARED | LK_RETRY);
1486 	if (vp->v_type != VDIR)
1487 		error = ENOTDIR;
1488 	else
1489 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1490 	if (error) {
1491 		vput(vp);
1492 		goto out;
1493 	}
1494 	while ((mp = vp->v_mountedhere) != NULL) {
1495 		error = vfs_busy(mp);
1496 		vput(vp);
1497 		if (error != 0)
1498 			goto out;
1499 		error = VFS_ROOT(mp, LK_SHARED, &tdp);
1500 		vfs_unbusy(mp);
1501 		if (error)
1502 			goto out;
1503 		vp = tdp;
1504 	}
1505 	VOP_UNLOCK(vp);
1506 
1507 	/*
1508 	 * Disallow changing to a directory not under the process's
1509 	 * current root directory (if there is one).
1510 	 */
1511 	cwdi = p->p_cwdi;
1512 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1513 	if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
1514 		vrele(vp);
1515 		error = EPERM;	/* operation not permitted */
1516 	} else {
1517 		vrele(cwdi->cwdi_cdir);
1518 		cwdi->cwdi_cdir = vp;
1519 	}
1520 	rw_exit(&cwdi->cwdi_lock);
1521 
1522 out:
1523 	fd_putfile(fd);
1524 	return error;
1525 }
1526 
1527 /*
1528  * Change current working directory to a given file descriptor.
1529  */
1530 /* ARGSUSED */
1531 int
1532 sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval)
1533 {
1534 	/* {
1535 		syscallarg(int) fd;
1536 	} */
1537 	return do_sys_fchdir(l, SCARG(uap, fd), retval);
1538 }
1539 
1540 /*
1541  * Change this process's notion of the root directory to a given file
1542  * descriptor.
1543  */
1544 int
1545 sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval)
1546 {
1547 	struct vnode	*vp;
1548 	file_t	*fp;
1549 	int		 error, fd = SCARG(uap, fd);
1550 
1551 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1552  	    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
1553 		return error;
1554 	/* fd_getvnode() will use the descriptor for us */
1555 	if ((error = fd_getvnode(fd, &fp)) != 0)
1556 		return error;
1557 	vp = fp->f_vnode;
1558 	vn_lock(vp, LK_SHARED | LK_RETRY);
1559 	if (vp->v_type != VDIR)
1560 		error = ENOTDIR;
1561 	else
1562 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1563 	VOP_UNLOCK(vp);
1564 	if (error)
1565 		goto out;
1566 	vref(vp);
1567 	change_root(vp);
1568 
1569  out:
1570 	fd_putfile(fd);
1571 	return (error);
1572 }
1573 
1574 /*
1575  * Change current working directory (``.'').
1576  */
1577 int
1578 do_sys_chdir(struct lwp *l, const char *path, enum uio_seg seg,
1579     register_t *retval)
1580 {
1581 	struct proc *p = l->l_proc;
1582 	struct cwdinfo * cwdi;
1583 	int error;
1584 	struct vnode *vp;
1585 
1586 	if ((error = chdir_lookup(path, seg, &vp, l)) != 0)
1587 		return error;
1588 	cwdi = p->p_cwdi;
1589 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1590 	vrele(cwdi->cwdi_cdir);
1591 	cwdi->cwdi_cdir = vp;
1592 	rw_exit(&cwdi->cwdi_lock);
1593 	return 0;
1594 }
1595 
1596 /*
1597  * Change current working directory (``.'').
1598  */
1599 /* ARGSUSED */
1600 int
1601 sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
1602 {
1603 	/* {
1604 		syscallarg(const char *) path;
1605 	} */
1606 	return do_sys_chdir(l, SCARG(uap, path), UIO_USERSPACE, retval);
1607 }
1608 
1609 /*
1610  * Change notion of root (``/'') directory.
1611  */
1612 /* ARGSUSED */
1613 int
1614 sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval)
1615 {
1616 	/* {
1617 		syscallarg(const char *) path;
1618 	} */
1619 	int error;
1620 	struct vnode *vp;
1621 
1622 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1623 	    KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
1624 		return (error);
1625 
1626 	error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE, &vp, l);
1627 	if (error == 0)
1628 		change_root(vp);
1629 	return error;
1630 }
1631 
1632 /*
1633  * Common routine for chroot and fchroot.
1634  * NB: callers need to properly authorize the change root operation.
1635  */
1636 void
1637 change_root(struct vnode *vp)
1638 {
1639 	kauth_cred_t ncred;
1640 	struct lwp *l = curlwp;
1641 	struct proc *p = l->l_proc;
1642 	struct cwdinfo *cwdi = p->p_cwdi;
1643 
1644 	ncred = kauth_cred_alloc();
1645 
1646 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1647 	if (cwdi->cwdi_rdir != NULL)
1648 		vrele(cwdi->cwdi_rdir);
1649 	cwdi->cwdi_rdir = vp;
1650 
1651 	/*
1652 	 * Prevent escaping from chroot by putting the root under
1653 	 * the working directory.  Silently chdir to / if we aren't
1654 	 * already there.
1655 	 */
1656 	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1657 		/*
1658 		 * XXX would be more failsafe to change directory to a
1659 		 * deadfs node here instead
1660 		 */
1661 		vrele(cwdi->cwdi_cdir);
1662 		vref(vp);
1663 		cwdi->cwdi_cdir = vp;
1664 	}
1665 	rw_exit(&cwdi->cwdi_lock);
1666 
1667 	/* Get a write lock on the process credential. */
1668 	proc_crmod_enter();
1669 
1670 	kauth_cred_clone(p->p_cred, ncred);
1671 	kauth_proc_chroot(ncred, p->p_cwdi);
1672 
1673 	/* Broadcast our credentials to the process and other LWPs. */
1674  	proc_crmod_leave(ncred, p->p_cred, true);
1675 }
1676 
1677 /*
1678  * Common routine for chroot and chdir.
1679  * XXX "where" should be enum uio_seg
1680  */
1681 int
1682 chdir_lookup(const char *path, int where, struct vnode **vpp, struct lwp *l)
1683 {
1684 	struct pathbuf *pb;
1685 	struct nameidata nd;
1686 	int error;
1687 
1688 	error = pathbuf_maybe_copyin(path, where, &pb);
1689 	if (error) {
1690 		return error;
1691 	}
1692 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb);
1693 	if ((error = namei(&nd)) != 0) {
1694 		pathbuf_destroy(pb);
1695 		return error;
1696 	}
1697 	*vpp = nd.ni_vp;
1698 	pathbuf_destroy(pb);
1699 
1700 	if ((*vpp)->v_type != VDIR)
1701 		error = ENOTDIR;
1702 	else
1703 		error = VOP_ACCESS(*vpp, VEXEC, l->l_cred);
1704 
1705 	if (error)
1706 		vput(*vpp);
1707 	else
1708 		VOP_UNLOCK(*vpp);
1709 	return (error);
1710 }
1711 
1712 /*
1713  * Internals of sys_open - path has already been converted into a pathbuf
1714  * (so we can easily reuse this function from other parts of the kernel,
1715  * like posix_spawn post-processing).
1716  */
1717 int
1718 do_open(lwp_t *l, struct vnode *dvp, struct pathbuf *pb, int open_flags,
1719 	int open_mode, int *fd)
1720 {
1721 	struct proc *p = l->l_proc;
1722 	struct cwdinfo *cwdi = p->p_cwdi;
1723 	file_t *fp;
1724 	struct vnode *vp;
1725 	int dupfd;
1726 	bool dupfd_move;
1727 	int flags, cmode;
1728 	int indx, error;
1729 
1730 	if (open_flags & O_SEARCH) {
1731 		open_flags &= ~(int)O_SEARCH;
1732 	}
1733 
1734 	/*
1735 	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
1736 	 * may be specified.
1737 	 */
1738 	if ((open_flags & O_EXEC) && (open_flags & O_ACCMODE))
1739 		return EINVAL;
1740 
1741 	flags = FFLAGS(open_flags);
1742 	if ((flags & (FREAD | FWRITE)) == 0)
1743 		return EINVAL;
1744 
1745 	if ((error = fd_allocfile(&fp, &indx)) != 0) {
1746 		return error;
1747 	}
1748 
1749 	/* We're going to read cwdi->cwdi_cmask unlocked here. */
1750 	cmode = ((open_mode &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1751 
1752 	error = vn_open(dvp, pb, TRYEMULROOT, flags, cmode,
1753 	    &vp, &dupfd_move, &dupfd);
1754 	if (error != 0) {
1755 		fd_abort(p, fp, indx);
1756 		if (error == ERESTART)
1757 			error = EINTR;
1758 		return error;
1759 	}
1760 
1761 	if (vp == NULL) {
1762 		fd_abort(p, fp, indx);
1763 		error = fd_dupopen(dupfd, dupfd_move, flags, &indx);
1764 		if (error)
1765 			return error;
1766 		*fd = indx;
1767 	} else {
1768 		error = open_setfp(l, fp, vp, indx, flags);
1769 		if (error)
1770 			return error;
1771 		VOP_UNLOCK(vp);
1772 		*fd = indx;
1773 		fd_affix(p, fp, indx);
1774 	}
1775 
1776 	return 0;
1777 }
1778 
1779 int
1780 fd_open(const char *path, int open_flags, int open_mode, int *fd)
1781 {
1782 	struct pathbuf *pb;
1783 	int error, oflags;
1784 
1785 	oflags = FFLAGS(open_flags);
1786 	if ((oflags & (FREAD | FWRITE)) == 0)
1787 		return EINVAL;
1788 
1789 	pb = pathbuf_create(path);
1790 	if (pb == NULL)
1791 		return ENOMEM;
1792 
1793 	error = do_open(curlwp, NULL, pb, open_flags, open_mode, fd);
1794 	pathbuf_destroy(pb);
1795 
1796 	return error;
1797 }
1798 
1799 static int
1800 do_sys_openat(lwp_t *l, int fdat, const char *path, int flags,
1801     int mode, int *fd)
1802 {
1803 	file_t *dfp = NULL;
1804 	struct vnode *dvp = NULL;
1805 	struct pathbuf *pb;
1806 	const char *pathstring = NULL;
1807 	int error;
1808 
1809 	if (path == NULL) {
1810 		MODULE_HOOK_CALL(vfs_openat_10_hook, (&pb), enosys(), error);
1811 		if (error == ENOSYS)
1812 			goto no_compat;
1813 		if (error)
1814 			return error;
1815 	} else {
1816 no_compat:
1817 		error = pathbuf_copyin(path, &pb);
1818 		if (error)
1819 			return error;
1820 	}
1821 
1822 	pathstring = pathbuf_stringcopy_get(pb);
1823 
1824 	/*
1825 	 * fdat is ignored if:
1826 	 * 1) if fdat is AT_FDCWD, which means use current directory as base.
1827 	 * 2) if path is absolute, then fdat is useless.
1828 	 */
1829 	if (fdat != AT_FDCWD && pathstring[0] != '/') {
1830 		/* fd_getvnode() will use the descriptor for us */
1831 		if ((error = fd_getvnode(fdat, &dfp)) != 0)
1832 			goto out;
1833 
1834 		dvp = dfp->f_vnode;
1835 	}
1836 
1837 	error = do_open(l, dvp, pb, flags, mode, fd);
1838 
1839 	if (dfp != NULL)
1840 		fd_putfile(fdat);
1841 out:
1842 	pathbuf_stringcopy_put(pb, pathstring);
1843 	pathbuf_destroy(pb);
1844 	return error;
1845 }
1846 
1847 int
1848 sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1849 {
1850 	/* {
1851 		syscallarg(const char *) path;
1852 		syscallarg(int) flags;
1853 		syscallarg(int) mode;
1854 	} */
1855 	int error;
1856 	int fd;
1857 
1858 	error = do_sys_openat(l, AT_FDCWD, SCARG(uap, path),
1859 			      SCARG(uap, flags), SCARG(uap, mode), &fd);
1860 
1861 	if (error == 0)
1862 		*retval = fd;
1863 
1864 	return error;
1865 }
1866 
1867 int
1868 sys_openat(struct lwp *l, const struct sys_openat_args *uap, register_t *retval)
1869 {
1870 	/* {
1871 		syscallarg(int) fd;
1872 		syscallarg(const char *) path;
1873 		syscallarg(int) oflags;
1874 		syscallarg(int) mode;
1875 	} */
1876 	int error;
1877 	int fd;
1878 
1879 	error = do_sys_openat(l, SCARG(uap, fd), SCARG(uap, path),
1880 			      SCARG(uap, oflags), SCARG(uap, mode), &fd);
1881 
1882 	if (error == 0)
1883 		*retval = fd;
1884 
1885 	return error;
1886 }
1887 
1888 static void
1889 vfs__fhfree(fhandle_t *fhp)
1890 {
1891 	size_t fhsize;
1892 
1893 	fhsize = FHANDLE_SIZE(fhp);
1894 	kmem_free(fhp, fhsize);
1895 }
1896 
1897 /*
1898  * vfs_composefh: compose a filehandle.
1899  */
1900 
1901 int
1902 vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1903 {
1904 	struct mount *mp;
1905 	struct fid *fidp;
1906 	int error;
1907 	size_t needfhsize;
1908 	size_t fidsize;
1909 
1910 	mp = vp->v_mount;
1911 	fidp = NULL;
1912 	if (*fh_size < FHANDLE_SIZE_MIN) {
1913 		fidsize = 0;
1914 	} else {
1915 		fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1916 		if (fhp != NULL) {
1917 			memset(fhp, 0, *fh_size);
1918 			fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1919 			fidp = &fhp->fh_fid;
1920 		}
1921 	}
1922 	error = VFS_VPTOFH(vp, fidp, &fidsize);
1923 	needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1924 	if (error == 0 && *fh_size < needfhsize) {
1925 		error = E2BIG;
1926 	}
1927 	*fh_size = needfhsize;
1928 	return error;
1929 }
1930 
1931 int
1932 vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1933 {
1934 	struct mount *mp;
1935 	fhandle_t *fhp;
1936 	size_t fhsize;
1937 	size_t fidsize;
1938 	int error;
1939 
1940 	mp = vp->v_mount;
1941 	fidsize = 0;
1942 	error = VFS_VPTOFH(vp, NULL, &fidsize);
1943 	KASSERT(error != 0);
1944 	if (error != E2BIG) {
1945 		goto out;
1946 	}
1947 	fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1948 	fhp = kmem_zalloc(fhsize, KM_SLEEP);
1949 	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1950 	error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1951 	if (error == 0) {
1952 		KASSERT((FHANDLE_SIZE(fhp) == fhsize &&
1953 		    FHANDLE_FILEID(fhp)->fid_len == fidsize));
1954 		*fhpp = fhp;
1955 	} else {
1956 		kmem_free(fhp, fhsize);
1957 	}
1958 out:
1959 	return error;
1960 }
1961 
1962 void
1963 vfs_composefh_free(fhandle_t *fhp)
1964 {
1965 
1966 	vfs__fhfree(fhp);
1967 }
1968 
1969 /*
1970  * vfs_fhtovp: lookup a vnode by a filehandle.
1971  */
1972 
1973 int
1974 vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
1975 {
1976 	struct mount *mp;
1977 	int error;
1978 
1979 	*vpp = NULL;
1980 	mp = vfs_getvfs(FHANDLE_FSID(fhp));
1981 	if (mp == NULL) {
1982 		error = ESTALE;
1983 		goto out;
1984 	}
1985 	if (mp->mnt_op->vfs_fhtovp == NULL) {
1986 		error = EOPNOTSUPP;
1987 		goto out;
1988 	}
1989 	error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), LK_EXCLUSIVE, vpp);
1990 out:
1991 	return error;
1992 }
1993 
1994 /*
1995  * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
1996  * the needed size.
1997  */
1998 
1999 int
2000 vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
2001 {
2002 	fhandle_t *fhp;
2003 	int error;
2004 
2005 	if (fhsize > FHANDLE_SIZE_MAX) {
2006 		return EINVAL;
2007 	}
2008 	if (fhsize < FHANDLE_SIZE_MIN) {
2009 		return EINVAL;
2010 	}
2011 again:
2012 	fhp = kmem_alloc(fhsize, KM_SLEEP);
2013 	error = copyin(ufhp, fhp, fhsize);
2014 	if (error == 0) {
2015 		/* XXX this check shouldn't be here */
2016 		if (FHANDLE_SIZE(fhp) == fhsize) {
2017 			*fhpp = fhp;
2018 			return 0;
2019 		} else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
2020 			/*
2021 			 * a kludge for nfsv2 padded handles.
2022 			 */
2023 			size_t sz;
2024 
2025 			sz = FHANDLE_SIZE(fhp);
2026 			kmem_free(fhp, fhsize);
2027 			fhsize = sz;
2028 			goto again;
2029 		} else {
2030 			/*
2031 			 * userland told us wrong size.
2032 			 */
2033 		    	error = EINVAL;
2034 		}
2035 	}
2036 	kmem_free(fhp, fhsize);
2037 	return error;
2038 }
2039 
2040 void
2041 vfs_copyinfh_free(fhandle_t *fhp)
2042 {
2043 
2044 	vfs__fhfree(fhp);
2045 }
2046 
2047 /*
2048  * Get file handle system call
2049  */
2050 int
2051 sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval)
2052 {
2053 	/* {
2054 		syscallarg(char *) fname;
2055 		syscallarg(fhandle_t *) fhp;
2056 		syscallarg(size_t *) fh_size;
2057 	} */
2058 	struct vnode *vp;
2059 	fhandle_t *fh;
2060 	int error;
2061 	struct pathbuf *pb;
2062 	struct nameidata nd;
2063 	size_t sz;
2064 	size_t usz;
2065 
2066 	/*
2067 	 * Must be super user
2068 	 */
2069 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2070 	    0, NULL, NULL, NULL);
2071 	if (error)
2072 		return (error);
2073 
2074 	error = pathbuf_copyin(SCARG(uap, fname), &pb);
2075 	if (error) {
2076 		return error;
2077 	}
2078 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
2079 	error = namei(&nd);
2080 	if (error) {
2081 		pathbuf_destroy(pb);
2082 		return error;
2083 	}
2084 	vp = nd.ni_vp;
2085 	pathbuf_destroy(pb);
2086 
2087 	error = vfs_composefh_alloc(vp, &fh);
2088 	vput(vp);
2089 	if (error != 0) {
2090 		return error;
2091 	}
2092 	error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
2093 	if (error != 0) {
2094 		goto out;
2095 	}
2096 	sz = FHANDLE_SIZE(fh);
2097 	error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
2098 	if (error != 0) {
2099 		goto out;
2100 	}
2101 	if (usz >= sz) {
2102 		error = copyout(fh, SCARG(uap, fhp), sz);
2103 	} else {
2104 		error = E2BIG;
2105 	}
2106 out:
2107 	vfs_composefh_free(fh);
2108 	return (error);
2109 }
2110 
2111 /*
2112  * Open a file given a file handle.
2113  *
2114  * Check permissions, allocate an open file structure,
2115  * and call the device open routine if any.
2116  */
2117 
2118 int
2119 dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
2120     register_t *retval)
2121 {
2122 	file_t *fp;
2123 	struct vnode *vp = NULL;
2124 	kauth_cred_t cred = l->l_cred;
2125 	file_t *nfp;
2126 	int indx, error;
2127 	struct vattr va;
2128 	fhandle_t *fh;
2129 	int flags;
2130 	proc_t *p;
2131 
2132 	p = curproc;
2133 
2134 	/*
2135 	 * Must be super user
2136 	 */
2137 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2138 	    0, NULL, NULL, NULL)))
2139 		return (error);
2140 
2141 	if (oflags & O_SEARCH) {
2142 		oflags &= ~(int)O_SEARCH;
2143 	}
2144 
2145 	flags = FFLAGS(oflags);
2146 	if ((flags & (FREAD | FWRITE)) == 0)
2147 		return (EINVAL);
2148 	if ((flags & O_CREAT))
2149 		return (EINVAL);
2150 	if ((error = fd_allocfile(&nfp, &indx)) != 0)
2151 		return (error);
2152 	fp = nfp;
2153 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2154 	if (error != 0) {
2155 		goto bad;
2156 	}
2157 	error = vfs_fhtovp(fh, &vp);
2158 	vfs_copyinfh_free(fh);
2159 	if (error != 0) {
2160 		goto bad;
2161 	}
2162 
2163 	/* Now do an effective vn_open */
2164 
2165 	if (vp->v_type == VSOCK) {
2166 		error = EOPNOTSUPP;
2167 		goto bad;
2168 	}
2169 	error = vn_openchk(vp, cred, flags);
2170 	if (error != 0)
2171 		goto bad;
2172 	if (flags & O_TRUNC) {
2173 		VOP_UNLOCK(vp);			/* XXX */
2174 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
2175 		vattr_null(&va);
2176 		va.va_size = 0;
2177 		error = VOP_SETATTR(vp, &va, cred);
2178 		if (error)
2179 			goto bad;
2180 	}
2181 	if ((error = VOP_OPEN(vp, flags, cred)) != 0)
2182 		goto bad;
2183 	if (flags & FWRITE) {
2184 		mutex_enter(vp->v_interlock);
2185 		vp->v_writecount++;
2186 		mutex_exit(vp->v_interlock);
2187 	}
2188 
2189 	/* done with modified vn_open, now finish what sys_open does. */
2190 	if ((error = open_setfp(l, fp, vp, indx, flags)))
2191 		return error;
2192 
2193 	VOP_UNLOCK(vp);
2194 	*retval = indx;
2195 	fd_affix(p, fp, indx);
2196 	return (0);
2197 
2198 bad:
2199 	fd_abort(p, fp, indx);
2200 	if (vp != NULL)
2201 		vput(vp);
2202 	if (error == EDUPFD || error == EMOVEFD) {
2203 		/* XXX should probably close curlwp->l_dupfd */
2204 		error = EOPNOTSUPP;
2205 	}
2206 	return (error);
2207 }
2208 
2209 int
2210 sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval)
2211 {
2212 	/* {
2213 		syscallarg(const void *) fhp;
2214 		syscallarg(size_t) fh_size;
2215 		syscallarg(int) flags;
2216 	} */
2217 
2218 	return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
2219 	    SCARG(uap, flags), retval);
2220 }
2221 
2222 int
2223 do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
2224 {
2225 	int error;
2226 	fhandle_t *fh;
2227 	struct vnode *vp;
2228 
2229 	/*
2230 	 * Must be super user
2231 	 */
2232 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2233 	    0, NULL, NULL, NULL)))
2234 		return (error);
2235 
2236 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2237 	if (error != 0)
2238 		return error;
2239 
2240 	error = vfs_fhtovp(fh, &vp);
2241 	vfs_copyinfh_free(fh);
2242 	if (error != 0)
2243 		return error;
2244 
2245 	error = vn_stat(vp, sb);
2246 	vput(vp);
2247 	return error;
2248 }
2249 
2250 
2251 /* ARGSUSED */
2252 int
2253 sys___fhstat50(struct lwp *l, const struct sys___fhstat50_args *uap, register_t *retval)
2254 {
2255 	/* {
2256 		syscallarg(const void *) fhp;
2257 		syscallarg(size_t) fh_size;
2258 		syscallarg(struct stat *) sb;
2259 	} */
2260 	struct stat sb;
2261 	int error;
2262 
2263 	error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
2264 	if (error)
2265 		return error;
2266 	return copyout(&sb, SCARG(uap, sb), sizeof(sb));
2267 }
2268 
2269 int
2270 do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb,
2271     int flags)
2272 {
2273 	fhandle_t *fh;
2274 	struct mount *mp;
2275 	struct vnode *vp;
2276 	int error;
2277 
2278 	/*
2279 	 * Must be super user
2280 	 */
2281 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2282 	    0, NULL, NULL, NULL)))
2283 		return error;
2284 
2285 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2286 	if (error != 0)
2287 		return error;
2288 
2289 	error = vfs_fhtovp(fh, &vp);
2290 	vfs_copyinfh_free(fh);
2291 	if (error != 0)
2292 		return error;
2293 
2294 	mp = vp->v_mount;
2295 	error = dostatvfs(mp, sb, l, flags, 1);
2296 	vput(vp);
2297 	return error;
2298 }
2299 
2300 /* ARGSUSED */
2301 int
2302 sys___fhstatvfs190(struct lwp *l, const struct sys___fhstatvfs190_args *uap, register_t *retval)
2303 {
2304 	/* {
2305 		syscallarg(const void *) fhp;
2306 		syscallarg(size_t) fh_size;
2307 		syscallarg(struct statvfs *) buf;
2308 		syscallarg(int)	flags;
2309 	} */
2310 	struct statvfs *sb = STATVFSBUF_GET();
2311 	int error;
2312 
2313 	error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
2314 	    SCARG(uap, flags));
2315 	if (error == 0)
2316 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
2317 	STATVFSBUF_PUT(sb);
2318 	return error;
2319 }
2320 
2321 int
2322 do_posix_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
2323     dev_t dev)
2324 {
2325 
2326 	/*
2327 	 * The POSIX mknod(2) call is an alias for mkfifo(2) for S_IFIFO
2328 	 * in mode and dev=0.
2329 	 *
2330 	 * In all the other cases it's implementation defined behavior.
2331 	 */
2332 
2333 	if ((mode & S_IFIFO) && dev == 0)
2334 		return do_sys_mkfifoat(l, fdat, pathname, mode);
2335 	else
2336 		return do_sys_mknodat(l, fdat, pathname, mode, dev,
2337 		    UIO_USERSPACE);
2338 }
2339 
2340 /*
2341  * Create a special file.
2342  */
2343 /* ARGSUSED */
2344 int
2345 sys___mknod50(struct lwp *l, const struct sys___mknod50_args *uap,
2346     register_t *retval)
2347 {
2348 	/* {
2349 		syscallarg(const char *) path;
2350 		syscallarg(mode_t) mode;
2351 		syscallarg(dev_t) dev;
2352 	} */
2353 	return do_posix_mknodat(l, AT_FDCWD, SCARG(uap, path),
2354 	    SCARG(uap, mode), SCARG(uap, dev));
2355 }
2356 
2357 int
2358 sys_mknodat(struct lwp *l, const struct sys_mknodat_args *uap,
2359     register_t *retval)
2360 {
2361 	/* {
2362 		syscallarg(int) fd;
2363 		syscallarg(const char *) path;
2364 		syscallarg(mode_t) mode;
2365 		syscallarg(int) pad;
2366 		syscallarg(dev_t) dev;
2367 	} */
2368 
2369 	return do_posix_mknodat(l, SCARG(uap, fd), SCARG(uap, path),
2370 	    SCARG(uap, mode), SCARG(uap, dev));
2371 }
2372 
2373 int
2374 do_sys_mknod(struct lwp *l, const char *pathname, mode_t mode, dev_t dev,
2375     enum uio_seg seg)
2376 {
2377 	return do_sys_mknodat(l, AT_FDCWD, pathname, mode, dev, seg);
2378 }
2379 
2380 int
2381 do_sys_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
2382     dev_t dev, enum uio_seg seg)
2383 {
2384 	struct proc *p = l->l_proc;
2385 	struct vnode *vp;
2386 	struct vattr vattr;
2387 	int error, optype;
2388 	struct pathbuf *pb;
2389 	struct nameidata nd;
2390 	const char *pathstring;
2391 
2392 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
2393 	    0, NULL, NULL, NULL)) != 0)
2394 		return (error);
2395 
2396 	optype = VOP_MKNOD_DESCOFFSET;
2397 
2398 	error = pathbuf_maybe_copyin(pathname, seg, &pb);
2399 	if (error) {
2400 		return error;
2401 	}
2402 	pathstring = pathbuf_stringcopy_get(pb);
2403 	if (pathstring == NULL) {
2404 		pathbuf_destroy(pb);
2405 		return ENOMEM;
2406 	}
2407 
2408 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2409 
2410 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2411 		goto out;
2412 	vp = nd.ni_vp;
2413 
2414 	if (vp != NULL)
2415 		error = EEXIST;
2416 	else {
2417 		vattr_null(&vattr);
2418 		/* We will read cwdi->cwdi_cmask unlocked. */
2419 		vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2420 		vattr.va_rdev = dev;
2421 
2422 		switch (mode & S_IFMT) {
2423 		case S_IFMT:	/* used by badsect to flag bad sectors */
2424 			vattr.va_type = VBAD;
2425 			break;
2426 		case S_IFCHR:
2427 			vattr.va_type = VCHR;
2428 			break;
2429 		case S_IFBLK:
2430 			vattr.va_type = VBLK;
2431 			break;
2432 		case S_IFWHT:
2433 			optype = VOP_WHITEOUT_DESCOFFSET;
2434 			break;
2435 		case S_IFREG:
2436 #if NVERIEXEC > 0
2437 			error = veriexec_openchk(l, nd.ni_vp, pathstring,
2438 			    O_CREAT);
2439 #endif /* NVERIEXEC > 0 */
2440 			vattr.va_type = VREG;
2441 			vattr.va_rdev = VNOVAL;
2442 			optype = VOP_CREATE_DESCOFFSET;
2443 			break;
2444 		default:
2445 			error = EINVAL;
2446 			break;
2447 		}
2448 
2449 		if (error == 0 && optype == VOP_MKNOD_DESCOFFSET &&
2450 		    vattr.va_rdev == VNOVAL)
2451 			error = EINVAL;
2452 	}
2453 
2454 	if (!error) {
2455 		switch (optype) {
2456 		case VOP_WHITEOUT_DESCOFFSET:
2457 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
2458 			if (error)
2459 				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2460 			vput(nd.ni_dvp);
2461 			break;
2462 
2463 		case VOP_MKNOD_DESCOFFSET:
2464 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
2465 						&nd.ni_cnd, &vattr);
2466 			if (error == 0)
2467 				vrele(nd.ni_vp);
2468 			vput(nd.ni_dvp);
2469 			break;
2470 
2471 		case VOP_CREATE_DESCOFFSET:
2472 			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
2473 						&nd.ni_cnd, &vattr);
2474 			if (error == 0)
2475 				vrele(nd.ni_vp);
2476 			vput(nd.ni_dvp);
2477 			break;
2478 		}
2479 	} else {
2480 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2481 		if (nd.ni_dvp == vp)
2482 			vrele(nd.ni_dvp);
2483 		else
2484 			vput(nd.ni_dvp);
2485 		if (vp)
2486 			vrele(vp);
2487 	}
2488 out:
2489 	pathbuf_stringcopy_put(pb, pathstring);
2490 	pathbuf_destroy(pb);
2491 	return (error);
2492 }
2493 
2494 /*
2495  * Create a named pipe.
2496  */
2497 /* ARGSUSED */
2498 int
2499 sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval)
2500 {
2501 	/* {
2502 		syscallarg(const char *) path;
2503 		syscallarg(int) mode;
2504 	} */
2505 	return do_sys_mkfifoat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, mode));
2506 }
2507 
2508 int
2509 sys_mkfifoat(struct lwp *l, const struct sys_mkfifoat_args *uap,
2510     register_t *retval)
2511 {
2512 	/* {
2513 		syscallarg(int) fd;
2514 		syscallarg(const char *) path;
2515 		syscallarg(int) mode;
2516 	} */
2517 
2518 	return do_sys_mkfifoat(l, SCARG(uap, fd), SCARG(uap, path),
2519 	    SCARG(uap, mode));
2520 }
2521 
2522 static int
2523 do_sys_mkfifoat(struct lwp *l, int fdat, const char *path, mode_t mode)
2524 {
2525 	struct proc *p = l->l_proc;
2526 	struct vattr vattr;
2527 	int error;
2528 	struct pathbuf *pb;
2529 	struct nameidata nd;
2530 
2531 	error = pathbuf_copyin(path, &pb);
2532 	if (error) {
2533 		return error;
2534 	}
2535 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2536 
2537 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
2538 		pathbuf_destroy(pb);
2539 		return error;
2540 	}
2541 	if (nd.ni_vp != NULL) {
2542 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2543 		if (nd.ni_dvp == nd.ni_vp)
2544 			vrele(nd.ni_dvp);
2545 		else
2546 			vput(nd.ni_dvp);
2547 		vrele(nd.ni_vp);
2548 		pathbuf_destroy(pb);
2549 		return (EEXIST);
2550 	}
2551 	vattr_null(&vattr);
2552 	vattr.va_type = VFIFO;
2553 	/* We will read cwdi->cwdi_cmask unlocked. */
2554 	vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2555 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
2556 	if (error == 0)
2557 		vrele(nd.ni_vp);
2558 	vput(nd.ni_dvp);
2559 	pathbuf_destroy(pb);
2560 	return (error);
2561 }
2562 
2563 /*
2564  * Make a hard file link.
2565  */
2566 /* ARGSUSED */
2567 int
2568 do_sys_linkat(struct lwp *l, int fdpath, const char *path, int fdlink,
2569     const char *link, int follow, register_t *retval)
2570 {
2571 	struct vnode *vp;
2572 	struct pathbuf *linkpb;
2573 	struct nameidata nd;
2574 	namei_simple_flags_t ns_flags;
2575 	int error;
2576 
2577 	if (follow & AT_SYMLINK_FOLLOW)
2578 		ns_flags = NSM_FOLLOW_TRYEMULROOT;
2579 	else
2580 		ns_flags = NSM_NOFOLLOW_TRYEMULROOT;
2581 
2582 	error = fd_nameiat_simple_user(l, fdpath, path, ns_flags, &vp);
2583 	if (error != 0)
2584 		return (error);
2585 	error = pathbuf_copyin(link, &linkpb);
2586 	if (error) {
2587 		goto out1;
2588 	}
2589 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2590 	if ((error = fd_nameiat(l, fdlink, &nd)) != 0)
2591 		goto out2;
2592 	if (nd.ni_vp) {
2593 		error = EEXIST;
2594 		goto abortop;
2595 	}
2596 	/* Prevent hard links on directories. */
2597 	if (vp->v_type == VDIR) {
2598 		error = EPERM;
2599 		goto abortop;
2600 	}
2601 	/* Prevent cross-mount operation. */
2602 	if (nd.ni_dvp->v_mount != vp->v_mount) {
2603 		error = EXDEV;
2604 		goto abortop;
2605 	}
2606 	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
2607 	VOP_UNLOCK(nd.ni_dvp);
2608 	vrele(nd.ni_dvp);
2609 out2:
2610 	pathbuf_destroy(linkpb);
2611 out1:
2612 	vrele(vp);
2613 	return (error);
2614 abortop:
2615 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2616 	if (nd.ni_dvp == nd.ni_vp)
2617 		vrele(nd.ni_dvp);
2618 	else
2619 		vput(nd.ni_dvp);
2620 	if (nd.ni_vp != NULL)
2621 		vrele(nd.ni_vp);
2622 	goto out2;
2623 }
2624 
2625 int
2626 sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
2627 {
2628 	/* {
2629 		syscallarg(const char *) path;
2630 		syscallarg(const char *) link;
2631 	} */
2632 	const char *path = SCARG(uap, path);
2633 	const char *link = SCARG(uap, link);
2634 
2635 	return do_sys_linkat(l, AT_FDCWD, path, AT_FDCWD, link,
2636 	    AT_SYMLINK_FOLLOW, retval);
2637 }
2638 
2639 int
2640 sys_linkat(struct lwp *l, const struct sys_linkat_args *uap,
2641     register_t *retval)
2642 {
2643 	/* {
2644 		syscallarg(int) fd1;
2645 		syscallarg(const char *) name1;
2646 		syscallarg(int) fd2;
2647 		syscallarg(const char *) name2;
2648 		syscallarg(int) flags;
2649 	} */
2650 	int fd1 = SCARG(uap, fd1);
2651 	const char *name1 = SCARG(uap, name1);
2652 	int fd2 = SCARG(uap, fd2);
2653 	const char *name2 = SCARG(uap, name2);
2654 	int follow;
2655 
2656 	follow = SCARG(uap, flags) & AT_SYMLINK_FOLLOW;
2657 
2658 	return do_sys_linkat(l, fd1, name1, fd2, name2, follow, retval);
2659 }
2660 
2661 
2662 int
2663 do_sys_symlink(const char *patharg, const char *link, enum uio_seg seg)
2664 {
2665 	return do_sys_symlinkat(NULL, patharg, AT_FDCWD, link, seg);
2666 }
2667 
2668 static int
2669 do_sys_symlinkat(struct lwp *l, const char *patharg, int fdat,
2670     const char *link, enum uio_seg seg)
2671 {
2672 	struct proc *p = curproc;
2673 	struct vattr vattr;
2674 	char *path;
2675 	int error;
2676 	size_t len;
2677 	struct pathbuf *linkpb;
2678 	struct nameidata nd;
2679 
2680 	KASSERT(l != NULL || fdat == AT_FDCWD);
2681 
2682 	path = PNBUF_GET();
2683 	if (seg == UIO_USERSPACE) {
2684 		if ((error = copyinstr(patharg, path, MAXPATHLEN, &len)) != 0)
2685 			goto out1;
2686 		if ((error = pathbuf_copyin(link, &linkpb)) != 0)
2687 			goto out1;
2688 	} else {
2689 		len = strlen(patharg) + 1;
2690 		KASSERT(len <= MAXPATHLEN);
2691 		memcpy(path, patharg, len);
2692 		linkpb = pathbuf_create(link);
2693 		if (linkpb == NULL) {
2694 			error = ENOMEM;
2695 			goto out1;
2696 		}
2697 	}
2698 	ktrkuser("symlink-target", path, len - 1);
2699 
2700 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2701 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2702 		goto out2;
2703 	if (nd.ni_vp) {
2704 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2705 		if (nd.ni_dvp == nd.ni_vp)
2706 			vrele(nd.ni_dvp);
2707 		else
2708 			vput(nd.ni_dvp);
2709 		vrele(nd.ni_vp);
2710 		error = EEXIST;
2711 		goto out2;
2712 	}
2713 	vattr_null(&vattr);
2714 	vattr.va_type = VLNK;
2715 	/* We will read cwdi->cwdi_cmask unlocked. */
2716 	vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
2717 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
2718 	if (error == 0)
2719 		vrele(nd.ni_vp);
2720 	vput(nd.ni_dvp);
2721 out2:
2722 	pathbuf_destroy(linkpb);
2723 out1:
2724 	PNBUF_PUT(path);
2725 	return (error);
2726 }
2727 
2728 /*
2729  * Make a symbolic link.
2730  */
2731 /* ARGSUSED */
2732 int
2733 sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
2734 {
2735 	/* {
2736 		syscallarg(const char *) path;
2737 		syscallarg(const char *) link;
2738 	} */
2739 
2740 	return do_sys_symlinkat(l, SCARG(uap, path), AT_FDCWD, SCARG(uap, link),
2741 	    UIO_USERSPACE);
2742 }
2743 
2744 int
2745 sys_symlinkat(struct lwp *l, const struct sys_symlinkat_args *uap,
2746     register_t *retval)
2747 {
2748 	/* {
2749 		syscallarg(const char *) path1;
2750 		syscallarg(int) fd;
2751 		syscallarg(const char *) path2;
2752 	} */
2753 
2754 	return do_sys_symlinkat(l, SCARG(uap, path1), SCARG(uap, fd),
2755 	    SCARG(uap, path2), UIO_USERSPACE);
2756 }
2757 
2758 /*
2759  * Delete a whiteout from the filesystem.
2760  */
2761 /* ARGSUSED */
2762 int
2763 sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval)
2764 {
2765 	/* {
2766 		syscallarg(const char *) path;
2767 	} */
2768 	int error;
2769 	struct pathbuf *pb;
2770 	struct nameidata nd;
2771 
2772 	error = pathbuf_copyin(SCARG(uap, path), &pb);
2773 	if (error) {
2774 		return error;
2775 	}
2776 
2777 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT, pb);
2778 	error = namei(&nd);
2779 	if (error) {
2780 		pathbuf_destroy(pb);
2781 		return (error);
2782 	}
2783 
2784 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2785 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2786 		if (nd.ni_dvp == nd.ni_vp)
2787 			vrele(nd.ni_dvp);
2788 		else
2789 			vput(nd.ni_dvp);
2790 		if (nd.ni_vp)
2791 			vrele(nd.ni_vp);
2792 		pathbuf_destroy(pb);
2793 		return (EEXIST);
2794 	}
2795 	if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
2796 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2797 	vput(nd.ni_dvp);
2798 	pathbuf_destroy(pb);
2799 	return (error);
2800 }
2801 
2802 /*
2803  * Delete a name from the filesystem.
2804  */
2805 /* ARGSUSED */
2806 int
2807 sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval)
2808 {
2809 	/* {
2810 		syscallarg(const char *) path;
2811 	} */
2812 
2813 	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path), 0, UIO_USERSPACE);
2814 }
2815 
2816 int
2817 sys_unlinkat(struct lwp *l, const struct sys_unlinkat_args *uap,
2818     register_t *retval)
2819 {
2820 	/* {
2821 		syscallarg(int) fd;
2822 		syscallarg(const char *) path;
2823 		syscallarg(int) flag;
2824 	} */
2825 
2826 	return do_sys_unlinkat(l, SCARG(uap, fd), SCARG(uap, path),
2827 	    SCARG(uap, flag), UIO_USERSPACE);
2828 }
2829 
2830 int
2831 do_sys_unlink(const char *arg, enum uio_seg seg)
2832 {
2833 	return do_sys_unlinkat(NULL, AT_FDCWD, arg, 0, seg);
2834 }
2835 
2836 static int
2837 do_sys_unlinkat(struct lwp *l, int fdat, const char *arg, int flags,
2838     enum uio_seg seg)
2839 {
2840 	struct vnode *vp;
2841 	int error;
2842 	struct pathbuf *pb;
2843 	struct nameidata nd;
2844 	const char *pathstring;
2845 
2846 	KASSERT(l != NULL || fdat == AT_FDCWD);
2847 
2848 	error = pathbuf_maybe_copyin(arg, seg, &pb);
2849 	if (error) {
2850 		return error;
2851 	}
2852 	pathstring = pathbuf_stringcopy_get(pb);
2853 	if (pathstring == NULL) {
2854 		pathbuf_destroy(pb);
2855 		return ENOMEM;
2856 	}
2857 
2858 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, pb);
2859 	if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2860 		goto out;
2861 	vp = nd.ni_vp;
2862 
2863 	/*
2864 	 * The root of a mounted filesystem cannot be deleted.
2865 	 */
2866 	if ((vp->v_vflag & VV_ROOT) != 0) {
2867 		error = EBUSY;
2868 		goto abort;
2869 	}
2870 
2871 	if ((vp->v_type == VDIR) && (vp->v_mountedhere != NULL)) {
2872 		error = EBUSY;
2873 		goto abort;
2874 	}
2875 
2876 	/*
2877 	 * No rmdir "." please.
2878 	 */
2879 	if (nd.ni_dvp == vp) {
2880 		error = EINVAL;
2881 		goto abort;
2882 	}
2883 
2884 	/*
2885 	 * AT_REMOVEDIR is required to remove a directory
2886 	 */
2887 	if (vp->v_type == VDIR) {
2888 		if (!(flags & AT_REMOVEDIR)) {
2889 			error = EPERM;
2890 			goto abort;
2891 		} else {
2892 			error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2893 			vput(nd.ni_dvp);
2894 			goto out;
2895 		}
2896 	}
2897 
2898 	/*
2899 	 * Starting here we only deal with non directories.
2900 	 */
2901 	if (flags & AT_REMOVEDIR) {
2902 		error = ENOTDIR;
2903 		goto abort;
2904 	}
2905 
2906 #if NVERIEXEC > 0
2907 	/* Handle remove requests for veriexec entries. */
2908 	if ((error = veriexec_removechk(curlwp, nd.ni_vp, pathstring)) != 0) {
2909 		goto abort;
2910 	}
2911 #endif /* NVERIEXEC > 0 */
2912 
2913 #ifdef FILEASSOC
2914 	(void)fileassoc_file_delete(vp);
2915 #endif /* FILEASSOC */
2916 	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2917 	vput(nd.ni_dvp);
2918 	goto out;
2919 
2920 abort:
2921 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2922 	if (nd.ni_dvp == vp)
2923 		vrele(nd.ni_dvp);
2924 	else
2925 		vput(nd.ni_dvp);
2926 	vput(vp);
2927 
2928 out:
2929 	pathbuf_stringcopy_put(pb, pathstring);
2930 	pathbuf_destroy(pb);
2931 	return (error);
2932 }
2933 
2934 /*
2935  * Reposition read/write file offset.
2936  */
2937 int
2938 sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
2939 {
2940 	/* {
2941 		syscallarg(int) fd;
2942 		syscallarg(int) pad;
2943 		syscallarg(off_t) offset;
2944 		syscallarg(int) whence;
2945 	} */
2946 	file_t *fp;
2947 	int error, fd;
2948 
2949 	switch (SCARG(uap, whence)) {
2950 	case SEEK_CUR:
2951 	case SEEK_END:
2952 	case SEEK_SET:
2953 		break;
2954 	default:
2955 		return EINVAL;
2956 	}
2957 
2958 	fd = SCARG(uap, fd);
2959 
2960 	if ((fp = fd_getfile(fd)) == NULL)
2961 		return (EBADF);
2962 
2963 	if (fp->f_ops->fo_seek == NULL) {
2964 		error = ESPIPE;
2965 		goto out;
2966 	}
2967 
2968 	error = (*fp->f_ops->fo_seek)(fp, SCARG(uap, offset),
2969 	    SCARG(uap, whence), (off_t *)retval, FOF_UPDATE_OFFSET);
2970  out:
2971  	fd_putfile(fd);
2972 	return (error);
2973 }
2974 
2975 /*
2976  * Positional read system call.
2977  */
2978 int
2979 sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
2980 {
2981 	/* {
2982 		syscallarg(int) fd;
2983 		syscallarg(void *) buf;
2984 		syscallarg(size_t) nbyte;
2985 		syscallarg(off_t) offset;
2986 	} */
2987 	file_t *fp;
2988 	off_t offset;
2989 	int error, fd = SCARG(uap, fd);
2990 
2991 	if ((fp = fd_getfile(fd)) == NULL)
2992 		return (EBADF);
2993 
2994 	if ((fp->f_flag & FREAD) == 0) {
2995 		fd_putfile(fd);
2996 		return (EBADF);
2997 	}
2998 
2999 	if (fp->f_ops->fo_seek == NULL) {
3000 		error = ESPIPE;
3001 		goto out;
3002 	}
3003 
3004 	offset = SCARG(uap, offset);
3005 	error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0);
3006 	if (error)
3007 		goto out;
3008 
3009 	/* dofileread() will unuse the descriptor for us */
3010 	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
3011 	    &offset, 0, retval));
3012 
3013  out:
3014 	fd_putfile(fd);
3015 	return (error);
3016 }
3017 
3018 /*
3019  * Positional scatter read system call.
3020  */
3021 int
3022 sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval)
3023 {
3024 	/* {
3025 		syscallarg(int) fd;
3026 		syscallarg(const struct iovec *) iovp;
3027 		syscallarg(int) iovcnt;
3028 		syscallarg(off_t) offset;
3029 	} */
3030 	off_t offset = SCARG(uap, offset);
3031 
3032 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
3033 	    SCARG(uap, iovcnt), &offset, 0, retval);
3034 }
3035 
3036 /*
3037  * Positional write system call.
3038  */
3039 int
3040 sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval)
3041 {
3042 	/* {
3043 		syscallarg(int) fd;
3044 		syscallarg(const void *) buf;
3045 		syscallarg(size_t) nbyte;
3046 		syscallarg(off_t) offset;
3047 	} */
3048 	file_t *fp;
3049 	off_t offset;
3050 	int error, fd = SCARG(uap, fd);
3051 
3052 	if ((fp = fd_getfile(fd)) == NULL)
3053 		return (EBADF);
3054 
3055 	if ((fp->f_flag & FWRITE) == 0) {
3056 		fd_putfile(fd);
3057 		return (EBADF);
3058 	}
3059 
3060 	if (fp->f_ops->fo_seek == NULL) {
3061 		error = ESPIPE;
3062 		goto out;
3063 	}
3064 
3065 	offset = SCARG(uap, offset);
3066 	error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0);
3067 	if (error)
3068 		goto out;
3069 
3070 	/* dofilewrite() will unuse the descriptor for us */
3071 	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
3072 	    &offset, 0, retval));
3073 
3074  out:
3075 	fd_putfile(fd);
3076 	return (error);
3077 }
3078 
3079 /*
3080  * Positional gather write system call.
3081  */
3082 int
3083 sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval)
3084 {
3085 	/* {
3086 		syscallarg(int) fd;
3087 		syscallarg(const struct iovec *) iovp;
3088 		syscallarg(int) iovcnt;
3089 		syscallarg(off_t) offset;
3090 	} */
3091 	off_t offset = SCARG(uap, offset);
3092 
3093 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
3094 	    SCARG(uap, iovcnt), &offset, 0, retval);
3095 }
3096 
3097 /*
3098  * Check access permissions.
3099  */
3100 int
3101 sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval)
3102 {
3103 	/* {
3104 		syscallarg(const char *) path;
3105 		syscallarg(int) flags;
3106 	} */
3107 
3108 	return do_sys_accessat(l, AT_FDCWD, SCARG(uap, path),
3109 	     SCARG(uap, flags), 0);
3110 }
3111 
3112 int
3113 do_sys_accessat(struct lwp *l, int fdat, const char *path,
3114     int mode, int flags)
3115 {
3116 	kauth_cred_t cred;
3117 	struct vnode *vp;
3118 	int error, nd_flag, vmode;
3119 	struct pathbuf *pb;
3120 	struct nameidata nd;
3121 
3122 	CTASSERT(F_OK == 0);
3123 	if ((mode & ~(R_OK | W_OK | X_OK)) != 0) {
3124 		/* nonsense mode */
3125 		return EINVAL;
3126 	}
3127 
3128 	nd_flag = FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT;
3129 	if (flags & AT_SYMLINK_NOFOLLOW)
3130 		nd_flag &= ~FOLLOW;
3131 
3132 	error = pathbuf_copyin(path, &pb);
3133 	if (error)
3134 		return error;
3135 
3136 	NDINIT(&nd, LOOKUP, nd_flag, pb);
3137 
3138 	/* Override default credentials */
3139 	cred = kauth_cred_dup(l->l_cred);
3140 	if (!(flags & AT_EACCESS)) {
3141 		kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
3142 		kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
3143 	}
3144 	nd.ni_cnd.cn_cred = cred;
3145 
3146 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
3147 		pathbuf_destroy(pb);
3148 		goto out;
3149 	}
3150 	vp = nd.ni_vp;
3151 	pathbuf_destroy(pb);
3152 
3153 	/* Flags == 0 means only check for existence. */
3154 	if (mode) {
3155 		vmode = 0;
3156 		if (mode & R_OK)
3157 			vmode |= VREAD;
3158 		if (mode & W_OK)
3159 			vmode |= VWRITE;
3160 		if (mode & X_OK)
3161 			vmode |= VEXEC;
3162 
3163 		error = VOP_ACCESS(vp, vmode, cred);
3164 		if (!error && (vmode & VWRITE))
3165 			error = vn_writechk(vp);
3166 	}
3167 	vput(vp);
3168 out:
3169 	kauth_cred_free(cred);
3170 	return (error);
3171 }
3172 
3173 int
3174 sys_faccessat(struct lwp *l, const struct sys_faccessat_args *uap,
3175     register_t *retval)
3176 {
3177 	/* {
3178 		syscallarg(int) fd;
3179 		syscallarg(const char *) path;
3180 		syscallarg(int) amode;
3181 		syscallarg(int) flag;
3182 	} */
3183 
3184 	return do_sys_accessat(l, SCARG(uap, fd), SCARG(uap, path),
3185 	     SCARG(uap, amode), SCARG(uap, flag));
3186 }
3187 
3188 /*
3189  * Common code for all sys_stat functions, including compat versions.
3190  */
3191 int
3192 do_sys_stat(const char *userpath, unsigned int nd_flag,
3193     struct stat *sb)
3194 {
3195 	return do_sys_statat(NULL, AT_FDCWD, userpath, nd_flag, sb);
3196 }
3197 
3198 int
3199 do_sys_statat(struct lwp *l, int fdat, const char *userpath,
3200     unsigned int nd_flag, struct stat *sb)
3201 {
3202 	int error;
3203 	struct pathbuf *pb;
3204 	struct nameidata nd;
3205 
3206 	KASSERT(l != NULL || fdat == AT_FDCWD);
3207 
3208 	error = pathbuf_copyin(userpath, &pb);
3209 	if (error) {
3210 		return error;
3211 	}
3212 
3213 	NDINIT(&nd, LOOKUP, nd_flag | LOCKLEAF | TRYEMULROOT, pb);
3214 
3215 	error = fd_nameiat(l, fdat, &nd);
3216 	if (error != 0) {
3217 		pathbuf_destroy(pb);
3218 		return error;
3219 	}
3220 	error = vn_stat(nd.ni_vp, sb);
3221 	vput(nd.ni_vp);
3222 	pathbuf_destroy(pb);
3223 	return error;
3224 }
3225 
3226 /*
3227  * Get file status; this version follows links.
3228  */
3229 /* ARGSUSED */
3230 int
3231 sys___stat50(struct lwp *l, const struct sys___stat50_args *uap, register_t *retval)
3232 {
3233 	/* {
3234 		syscallarg(const char *) path;
3235 		syscallarg(struct stat *) ub;
3236 	} */
3237 	struct stat sb;
3238 	int error;
3239 
3240 	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), FOLLOW, &sb);
3241 	if (error)
3242 		return error;
3243 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3244 }
3245 
3246 /*
3247  * Get file status; this version does not follow links.
3248  */
3249 /* ARGSUSED */
3250 int
3251 sys___lstat50(struct lwp *l, const struct sys___lstat50_args *uap, register_t *retval)
3252 {
3253 	/* {
3254 		syscallarg(const char *) path;
3255 		syscallarg(struct stat *) ub;
3256 	} */
3257 	struct stat sb;
3258 	int error;
3259 
3260 	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), NOFOLLOW, &sb);
3261 	if (error)
3262 		return error;
3263 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3264 }
3265 
3266 int
3267 sys_fstatat(struct lwp *l, const struct sys_fstatat_args *uap,
3268     register_t *retval)
3269 {
3270 	/* {
3271 		syscallarg(int) fd;
3272 		syscallarg(const char *) path;
3273 		syscallarg(struct stat *) buf;
3274 		syscallarg(int) flag;
3275 	} */
3276 	unsigned int nd_flag;
3277 	struct stat sb;
3278 	int error;
3279 
3280 	if (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW)
3281 		nd_flag = NOFOLLOW;
3282 	else
3283 		nd_flag = FOLLOW;
3284 
3285 	error = do_sys_statat(l, SCARG(uap, fd), SCARG(uap, path), nd_flag,
3286 	    &sb);
3287 	if (error)
3288 		return error;
3289 	return copyout(&sb, SCARG(uap, buf), sizeof(sb));
3290 }
3291 
3292 static int
3293 kern_pathconf(register_t *retval, const char *path, int name, int flag)
3294 {
3295 	int error;
3296 	struct pathbuf *pb;
3297 	struct nameidata nd;
3298 
3299 	error = pathbuf_copyin(path, &pb);
3300 	if (error) {
3301 		return error;
3302 	}
3303 	NDINIT(&nd, LOOKUP, flag | LOCKLEAF | TRYEMULROOT, pb);
3304 	if ((error = namei(&nd)) != 0) {
3305 		pathbuf_destroy(pb);
3306 		return error;
3307 	}
3308 	error = VOP_PATHCONF(nd.ni_vp, name, retval);
3309 	vput(nd.ni_vp);
3310 	pathbuf_destroy(pb);
3311 	return error;
3312 }
3313 
3314 /*
3315  * Get configurable pathname variables.
3316  */
3317 /* ARGSUSED */
3318 int
3319 sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap,
3320     register_t *retval)
3321 {
3322 	/* {
3323 		syscallarg(const char *) path;
3324 		syscallarg(int) name;
3325 	} */
3326 	return kern_pathconf(retval, SCARG(uap, path), SCARG(uap, name),
3327 	    FOLLOW);
3328 }
3329 
3330 /* ARGSUSED */
3331 int
3332 sys_lpathconf(struct lwp *l, const struct sys_lpathconf_args *uap,
3333     register_t *retval)
3334 {
3335 	/* {
3336 		syscallarg(const char *) path;
3337 		syscallarg(int) name;
3338 	} */
3339 	return kern_pathconf(retval, SCARG(uap, path), SCARG(uap, name),
3340 	    NOFOLLOW);
3341 }
3342 
3343 /*
3344  * Return target name of a symbolic link.
3345  */
3346 /* ARGSUSED */
3347 int
3348 sys_readlink(struct lwp *l, const struct sys_readlink_args *uap,
3349     register_t *retval)
3350 {
3351 	/* {
3352 		syscallarg(const char *) path;
3353 		syscallarg(char *) buf;
3354 		syscallarg(size_t) count;
3355 	} */
3356 	return do_sys_readlinkat(l, AT_FDCWD, SCARG(uap, path),
3357 	    SCARG(uap, buf), SCARG(uap, count), retval);
3358 }
3359 
3360 static int
3361 do_sys_readlinkat(struct lwp *l, int fdat, const char *path, char *buf,
3362     size_t count, register_t *retval)
3363 {
3364 	struct vnode *vp;
3365 	struct iovec aiov;
3366 	struct uio auio;
3367 	int error;
3368 	struct pathbuf *pb;
3369 	struct nameidata nd;
3370 
3371 	error = pathbuf_copyin(path, &pb);
3372 	if (error) {
3373 		return error;
3374 	}
3375 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb);
3376 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
3377 		pathbuf_destroy(pb);
3378 		return error;
3379 	}
3380 	vp = nd.ni_vp;
3381 	pathbuf_destroy(pb);
3382 	if (vp->v_type != VLNK)
3383 		error = EINVAL;
3384 	else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
3385 	    (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
3386 		aiov.iov_base = buf;
3387 		aiov.iov_len = count;
3388 		auio.uio_iov = &aiov;
3389 		auio.uio_iovcnt = 1;
3390 		auio.uio_offset = 0;
3391 		auio.uio_rw = UIO_READ;
3392 		KASSERT(l == curlwp);
3393 		auio.uio_vmspace = l->l_proc->p_vmspace;
3394 		auio.uio_resid = count;
3395 		if ((error = VOP_READLINK(vp, &auio, l->l_cred)) == 0)
3396 			*retval = count - auio.uio_resid;
3397 	}
3398 	vput(vp);
3399 	return (error);
3400 }
3401 
3402 int
3403 sys_readlinkat(struct lwp *l, const struct sys_readlinkat_args *uap,
3404     register_t *retval)
3405 {
3406 	/* {
3407 		syscallarg(int) fd;
3408 		syscallarg(const char *) path;
3409 		syscallarg(char *) buf;
3410 		syscallarg(size_t) bufsize;
3411 	} */
3412 
3413 	return do_sys_readlinkat(l, SCARG(uap, fd), SCARG(uap, path),
3414 	    SCARG(uap, buf), SCARG(uap, bufsize), retval);
3415 }
3416 
3417 /*
3418  * Change flags of a file given a path name.
3419  */
3420 /* ARGSUSED */
3421 int
3422 sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval)
3423 {
3424 	/* {
3425 		syscallarg(const char *) path;
3426 		syscallarg(u_long) flags;
3427 	} */
3428 	struct vnode *vp;
3429 	int error;
3430 
3431 	error = namei_simple_user(SCARG(uap, path),
3432 				NSM_FOLLOW_TRYEMULROOT, &vp);
3433 	if (error != 0)
3434 		return (error);
3435 	error = change_flags(vp, SCARG(uap, flags), l);
3436 	vput(vp);
3437 	return (error);
3438 }
3439 
3440 /*
3441  * Change flags of a file given a file descriptor.
3442  */
3443 /* ARGSUSED */
3444 int
3445 sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval)
3446 {
3447 	/* {
3448 		syscallarg(int) fd;
3449 		syscallarg(u_long) flags;
3450 	} */
3451 	struct vnode *vp;
3452 	file_t *fp;
3453 	int error;
3454 
3455 	/* fd_getvnode() will use the descriptor for us */
3456 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3457 		return (error);
3458 	vp = fp->f_vnode;
3459 	error = change_flags(vp, SCARG(uap, flags), l);
3460 	VOP_UNLOCK(vp);
3461 	fd_putfile(SCARG(uap, fd));
3462 	return (error);
3463 }
3464 
3465 /*
3466  * Change flags of a file given a path name; this version does
3467  * not follow links.
3468  */
3469 int
3470 sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval)
3471 {
3472 	/* {
3473 		syscallarg(const char *) path;
3474 		syscallarg(u_long) flags;
3475 	} */
3476 	struct vnode *vp;
3477 	int error;
3478 
3479 	error = namei_simple_user(SCARG(uap, path),
3480 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3481 	if (error != 0)
3482 		return (error);
3483 	error = change_flags(vp, SCARG(uap, flags), l);
3484 	vput(vp);
3485 	return (error);
3486 }
3487 
3488 /*
3489  * Common routine to change flags of a file.
3490  */
3491 int
3492 change_flags(struct vnode *vp, u_long flags, struct lwp *l)
3493 {
3494 	struct vattr vattr;
3495 	int error;
3496 
3497 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3498 
3499 	vattr_null(&vattr);
3500 	vattr.va_flags = flags;
3501 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3502 
3503 	return (error);
3504 }
3505 
3506 /*
3507  * Change mode of a file given path name; this version follows links.
3508  */
3509 /* ARGSUSED */
3510 int
3511 sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
3512 {
3513 	/* {
3514 		syscallarg(const char *) path;
3515 		syscallarg(int) mode;
3516 	} */
3517 	return do_sys_chmodat(l, AT_FDCWD, SCARG(uap, path),
3518 			      SCARG(uap, mode), 0);
3519 }
3520 
3521 int
3522 do_sys_chmodat(struct lwp *l, int fdat, const char *path, int mode, int flags)
3523 {
3524 	int error;
3525 	struct vnode *vp;
3526 	namei_simple_flags_t ns_flag;
3527 
3528 	if (flags & AT_SYMLINK_NOFOLLOW)
3529 		ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3530 	else
3531 		ns_flag = NSM_FOLLOW_TRYEMULROOT;
3532 
3533 	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3534 	if (error != 0)
3535 		return error;
3536 
3537 	error = change_mode(vp, mode, l);
3538 
3539 	vrele(vp);
3540 
3541 	return (error);
3542 }
3543 
3544 /*
3545  * Change mode of a file given a file descriptor.
3546  */
3547 /* ARGSUSED */
3548 int
3549 sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval)
3550 {
3551 	/* {
3552 		syscallarg(int) fd;
3553 		syscallarg(int) mode;
3554 	} */
3555 	file_t *fp;
3556 	int error;
3557 
3558 	/* fd_getvnode() will use the descriptor for us */
3559 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3560 		return (error);
3561 	error = change_mode(fp->f_vnode, SCARG(uap, mode), l);
3562 	fd_putfile(SCARG(uap, fd));
3563 	return (error);
3564 }
3565 
3566 int
3567 sys_fchmodat(struct lwp *l, const struct sys_fchmodat_args *uap,
3568     register_t *retval)
3569 {
3570 	/* {
3571 		syscallarg(int) fd;
3572 		syscallarg(const char *) path;
3573 		syscallarg(int) mode;
3574 		syscallarg(int) flag;
3575 	} */
3576 
3577 	return do_sys_chmodat(l, SCARG(uap, fd), SCARG(uap, path),
3578 			      SCARG(uap, mode), SCARG(uap, flag));
3579 }
3580 
3581 /*
3582  * Change mode of a file given path name; this version does not follow links.
3583  */
3584 /* ARGSUSED */
3585 int
3586 sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval)
3587 {
3588 	/* {
3589 		syscallarg(const char *) path;
3590 		syscallarg(int) mode;
3591 	} */
3592 	int error;
3593 	struct vnode *vp;
3594 
3595 	error = namei_simple_user(SCARG(uap, path),
3596 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3597 	if (error != 0)
3598 		return (error);
3599 
3600 	error = change_mode(vp, SCARG(uap, mode), l);
3601 
3602 	vrele(vp);
3603 	return (error);
3604 }
3605 
3606 /*
3607  * Common routine to set mode given a vnode.
3608  */
3609 static int
3610 change_mode(struct vnode *vp, int mode, struct lwp *l)
3611 {
3612 	struct vattr vattr;
3613 	int error;
3614 
3615 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3616 	vattr_null(&vattr);
3617 	vattr.va_mode = mode & ALLPERMS;
3618 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3619 	VOP_UNLOCK(vp);
3620 	return (error);
3621 }
3622 
3623 /*
3624  * Set ownership given a path name; this version follows links.
3625  */
3626 /* ARGSUSED */
3627 int
3628 sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
3629 {
3630 	/* {
3631 		syscallarg(const char *) path;
3632 		syscallarg(uid_t) uid;
3633 		syscallarg(gid_t) gid;
3634 	} */
3635 	return do_sys_chownat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap,uid),
3636 			      SCARG(uap, gid), 0);
3637 }
3638 
3639 int
3640 do_sys_chownat(struct lwp *l, int fdat, const char *path, uid_t uid,
3641    gid_t gid, int flags)
3642 {
3643 	int error;
3644 	struct vnode *vp;
3645 	namei_simple_flags_t ns_flag;
3646 
3647 	if (flags & AT_SYMLINK_NOFOLLOW)
3648 		ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3649 	else
3650 		ns_flag = NSM_FOLLOW_TRYEMULROOT;
3651 
3652 	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3653 	if (error != 0)
3654 		return error;
3655 
3656 	error = change_owner(vp, uid, gid, l, 0);
3657 
3658 	vrele(vp);
3659 
3660 	return (error);
3661 }
3662 
3663 /*
3664  * Set ownership given a path name; this version follows links.
3665  * Provides POSIX semantics.
3666  */
3667 /* ARGSUSED */
3668 int
3669 sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval)
3670 {
3671 	/* {
3672 		syscallarg(const char *) path;
3673 		syscallarg(uid_t) uid;
3674 		syscallarg(gid_t) gid;
3675 	} */
3676 	int error;
3677 	struct vnode *vp;
3678 
3679 	error = namei_simple_user(SCARG(uap, path),
3680 				NSM_FOLLOW_TRYEMULROOT, &vp);
3681 	if (error != 0)
3682 		return (error);
3683 
3684 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3685 
3686 	vrele(vp);
3687 	return (error);
3688 }
3689 
3690 /*
3691  * Set ownership given a file descriptor.
3692  */
3693 /* ARGSUSED */
3694 int
3695 sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval)
3696 {
3697 	/* {
3698 		syscallarg(int) fd;
3699 		syscallarg(uid_t) uid;
3700 		syscallarg(gid_t) gid;
3701 	} */
3702 	int error;
3703 	file_t *fp;
3704 
3705 	/* fd_getvnode() will use the descriptor for us */
3706 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3707 		return (error);
3708 	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3709 	    l, 0);
3710 	fd_putfile(SCARG(uap, fd));
3711 	return (error);
3712 }
3713 
3714 int
3715 sys_fchownat(struct lwp *l, const struct sys_fchownat_args *uap,
3716     register_t *retval)
3717 {
3718 	/* {
3719 		syscallarg(int) fd;
3720 		syscallarg(const char *) path;
3721 		syscallarg(uid_t) owner;
3722 		syscallarg(gid_t) group;
3723 		syscallarg(int) flag;
3724 	} */
3725 
3726 	return do_sys_chownat(l, SCARG(uap, fd), SCARG(uap, path),
3727 			      SCARG(uap, owner), SCARG(uap, group),
3728 			      SCARG(uap, flag));
3729 }
3730 
3731 /*
3732  * Set ownership given a file descriptor, providing POSIX/XPG semantics.
3733  */
3734 /* ARGSUSED */
3735 int
3736 sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval)
3737 {
3738 	/* {
3739 		syscallarg(int) fd;
3740 		syscallarg(uid_t) uid;
3741 		syscallarg(gid_t) gid;
3742 	} */
3743 	int error;
3744 	file_t *fp;
3745 
3746 	/* fd_getvnode() will use the descriptor for us */
3747 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3748 		return (error);
3749 	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3750 	    l, 1);
3751 	fd_putfile(SCARG(uap, fd));
3752 	return (error);
3753 }
3754 
3755 /*
3756  * Set ownership given a path name; this version does not follow links.
3757  */
3758 /* ARGSUSED */
3759 int
3760 sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval)
3761 {
3762 	/* {
3763 		syscallarg(const char *) path;
3764 		syscallarg(uid_t) uid;
3765 		syscallarg(gid_t) gid;
3766 	} */
3767 	int error;
3768 	struct vnode *vp;
3769 
3770 	error = namei_simple_user(SCARG(uap, path),
3771 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3772 	if (error != 0)
3773 		return (error);
3774 
3775 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
3776 
3777 	vrele(vp);
3778 	return (error);
3779 }
3780 
3781 /*
3782  * Set ownership given a path name; this version does not follow links.
3783  * Provides POSIX/XPG semantics.
3784  */
3785 /* ARGSUSED */
3786 int
3787 sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval)
3788 {
3789 	/* {
3790 		syscallarg(const char *) path;
3791 		syscallarg(uid_t) uid;
3792 		syscallarg(gid_t) gid;
3793 	} */
3794 	int error;
3795 	struct vnode *vp;
3796 
3797 	error = namei_simple_user(SCARG(uap, path),
3798 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
3799 	if (error != 0)
3800 		return (error);
3801 
3802 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3803 
3804 	vrele(vp);
3805 	return (error);
3806 }
3807 
3808 /*
3809  * Common routine to set ownership given a vnode.
3810  */
3811 static int
3812 change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
3813     int posix_semantics)
3814 {
3815 	struct vattr vattr;
3816 	mode_t newmode;
3817 	int error;
3818 
3819 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3820 	if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
3821 		goto out;
3822 
3823 #define CHANGED(x) ((int)(x) != -1)
3824 	newmode = vattr.va_mode;
3825 	if (posix_semantics) {
3826 		/*
3827 		 * POSIX/XPG semantics: if the caller is not the super-user,
3828 		 * clear set-user-id and set-group-id bits.  Both POSIX and
3829 		 * the XPG consider the behaviour for calls by the super-user
3830 		 * implementation-defined; we leave the set-user-id and set-
3831 		 * group-id settings intact in that case.
3832 		 */
3833 		if (vattr.va_mode & S_ISUID) {
3834 			if (kauth_authorize_vnode(l->l_cred,
3835 			    KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0)
3836 				newmode &= ~S_ISUID;
3837 		}
3838 		if (vattr.va_mode & S_ISGID) {
3839 			if (kauth_authorize_vnode(l->l_cred,
3840 			    KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0)
3841 				newmode &= ~S_ISGID;
3842 		}
3843 	} else {
3844 		/*
3845 		 * NetBSD semantics: when changing owner and/or group,
3846 		 * clear the respective bit(s).
3847 		 */
3848 		if (CHANGED(uid))
3849 			newmode &= ~S_ISUID;
3850 		if (CHANGED(gid))
3851 			newmode &= ~S_ISGID;
3852 	}
3853 	/* Update va_mode iff altered. */
3854 	if (vattr.va_mode == newmode)
3855 		newmode = VNOVAL;
3856 
3857 	vattr_null(&vattr);
3858 	vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
3859 	vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
3860 	vattr.va_mode = newmode;
3861 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3862 #undef CHANGED
3863 
3864 out:
3865 	VOP_UNLOCK(vp);
3866 	return (error);
3867 }
3868 
3869 /*
3870  * Set the access and modification times given a path name; this
3871  * version follows links.
3872  */
3873 /* ARGSUSED */
3874 int
3875 sys___utimes50(struct lwp *l, const struct sys___utimes50_args *uap,
3876     register_t *retval)
3877 {
3878 	/* {
3879 		syscallarg(const char *) path;
3880 		syscallarg(const struct timeval *) tptr;
3881 	} */
3882 
3883 	return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
3884 	    SCARG(uap, tptr), UIO_USERSPACE);
3885 }
3886 
3887 /*
3888  * Set the access and modification times given a file descriptor.
3889  */
3890 /* ARGSUSED */
3891 int
3892 sys___futimes50(struct lwp *l, const struct sys___futimes50_args *uap,
3893     register_t *retval)
3894 {
3895 	/* {
3896 		syscallarg(int) fd;
3897 		syscallarg(const struct timeval *) tptr;
3898 	} */
3899 	int error;
3900 	file_t *fp;
3901 
3902 	/* fd_getvnode() will use the descriptor for us */
3903 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3904 		return (error);
3905 	error = do_sys_utimes(l, fp->f_vnode, NULL, 0, SCARG(uap, tptr),
3906 	    UIO_USERSPACE);
3907 	fd_putfile(SCARG(uap, fd));
3908 	return (error);
3909 }
3910 
3911 int
3912 sys_futimens(struct lwp *l, const struct sys_futimens_args *uap,
3913     register_t *retval)
3914 {
3915 	/* {
3916 		syscallarg(int) fd;
3917 		syscallarg(const struct timespec *) tptr;
3918 	} */
3919 	int error;
3920 	file_t *fp;
3921 
3922 	/* fd_getvnode() will use the descriptor for us */
3923 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3924 		return (error);
3925 	error = do_sys_utimensat(l, AT_FDCWD, fp->f_vnode, NULL, 0,
3926 	    SCARG(uap, tptr), UIO_USERSPACE);
3927 	fd_putfile(SCARG(uap, fd));
3928 	return (error);
3929 }
3930 
3931 /*
3932  * Set the access and modification times given a path name; this
3933  * version does not follow links.
3934  */
3935 int
3936 sys___lutimes50(struct lwp *l, const struct sys___lutimes50_args *uap,
3937     register_t *retval)
3938 {
3939 	/* {
3940 		syscallarg(const char *) path;
3941 		syscallarg(const struct timeval *) tptr;
3942 	} */
3943 
3944 	return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
3945 	    SCARG(uap, tptr), UIO_USERSPACE);
3946 }
3947 
3948 int
3949 sys_utimensat(struct lwp *l, const struct sys_utimensat_args *uap,
3950     register_t *retval)
3951 {
3952 	/* {
3953 		syscallarg(int) fd;
3954 		syscallarg(const char *) path;
3955 		syscallarg(const struct timespec *) tptr;
3956 		syscallarg(int) flag;
3957 	} */
3958 	int follow;
3959 	const struct timespec *tptr;
3960 	int error;
3961 
3962 	tptr = SCARG(uap, tptr);
3963 	follow = (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3964 
3965 	error = do_sys_utimensat(l, SCARG(uap, fd), NULL,
3966 	    SCARG(uap, path), follow, tptr, UIO_USERSPACE);
3967 
3968 	return error;
3969 }
3970 
3971 /*
3972  * Common routine to set access and modification times given a vnode.
3973  */
3974 int
3975 do_sys_utimens(struct lwp *l, struct vnode *vp, const char *path, int flag,
3976     const struct timespec *tptr, enum uio_seg seg)
3977 {
3978 	return do_sys_utimensat(l, AT_FDCWD, vp, path, flag, tptr, seg);
3979 }
3980 
3981 int
3982 do_sys_utimensat(struct lwp *l, int fdat, struct vnode *vp,
3983     const char *path, int flag, const struct timespec *tptr, enum uio_seg seg)
3984 {
3985 	struct vattr vattr;
3986 	int error, dorele = 0;
3987 	namei_simple_flags_t sflags;
3988 	bool vanull, setbirthtime;
3989 	struct timespec ts[2];
3990 
3991 	KASSERT(l != NULL || fdat == AT_FDCWD);
3992 
3993 	/*
3994 	 * I have checked all callers and they pass either FOLLOW,
3995 	 * NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW
3996 	 * is 0. More to the point, they don't pass anything else.
3997 	 * Let's keep it that way at least until the namei interfaces
3998 	 * are fully sanitized.
3999 	 */
4000 	KASSERT(flag == NOFOLLOW || flag == FOLLOW);
4001 	sflags = (flag == FOLLOW) ?
4002 		NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT;
4003 
4004 	if (tptr == NULL) {
4005 		vanull = true;
4006 		nanotime(&ts[0]);
4007 		ts[1] = ts[0];
4008 	} else {
4009 		vanull = false;
4010 		if (seg != UIO_SYSSPACE) {
4011 			error = copyin(tptr, ts, sizeof (ts));
4012 			if (error != 0)
4013 				return error;
4014 		} else {
4015 			ts[0] = tptr[0];
4016 			ts[1] = tptr[1];
4017 		}
4018 	}
4019 
4020 	if (ts[0].tv_nsec == UTIME_NOW) {
4021 		nanotime(&ts[0]);
4022 		if (ts[1].tv_nsec == UTIME_NOW) {
4023 			vanull = true;
4024 			ts[1] = ts[0];
4025 		}
4026 	} else if (ts[1].tv_nsec == UTIME_NOW)
4027 		nanotime(&ts[1]);
4028 
4029 	if (vp == NULL) {
4030 		/* note: SEG describes TPTR, not PATH; PATH is always user */
4031 		error = fd_nameiat_simple_user(l, fdat, path, sflags, &vp);
4032 		if (error != 0)
4033 			return error;
4034 		dorele = 1;
4035 	}
4036 
4037 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4038 	setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 &&
4039 	    timespeccmp(&ts[1], &vattr.va_birthtime, <));
4040 	vattr_null(&vattr);
4041 
4042 	if (ts[0].tv_nsec != UTIME_OMIT)
4043 		vattr.va_atime = ts[0];
4044 
4045 	if (ts[1].tv_nsec != UTIME_OMIT) {
4046 		vattr.va_mtime = ts[1];
4047 		if (setbirthtime)
4048 			vattr.va_birthtime = ts[1];
4049 	}
4050 
4051 	if (vanull)
4052 		vattr.va_vaflags |= VA_UTIMES_NULL;
4053 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
4054 	VOP_UNLOCK(vp);
4055 
4056 	if (dorele != 0)
4057 		vrele(vp);
4058 
4059 	return error;
4060 }
4061 
4062 int
4063 do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
4064     const struct timeval *tptr, enum uio_seg seg)
4065 {
4066 	struct timespec ts[2];
4067 	struct timespec *tsptr = NULL;
4068 	int error;
4069 
4070 	if (tptr != NULL) {
4071 		struct timeval tv[2];
4072 
4073 		if (seg != UIO_SYSSPACE) {
4074 			error = copyin(tptr, tv, sizeof(tv));
4075 			if (error != 0)
4076 				return error;
4077 			tptr = tv;
4078 		}
4079 
4080 		if ((tptr[0].tv_usec == UTIME_NOW) ||
4081 		    (tptr[0].tv_usec == UTIME_OMIT))
4082 			ts[0].tv_nsec = tptr[0].tv_usec;
4083 		else {
4084 			if (tptr[0].tv_usec < 0 || tptr[0].tv_usec >= 1000000)
4085 				return EINVAL;
4086 
4087 			TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]);
4088 		}
4089 
4090 		if ((tptr[1].tv_usec == UTIME_NOW) ||
4091 		    (tptr[1].tv_usec == UTIME_OMIT))
4092 			ts[1].tv_nsec = tptr[1].tv_usec;
4093 		else {
4094 			if (tptr[1].tv_usec < 0 || tptr[1].tv_usec >= 1000000)
4095 				return EINVAL;
4096 
4097 			TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]);
4098 		}
4099 
4100 		tsptr = &ts[0];
4101 	}
4102 
4103 	return do_sys_utimens(l, vp, path, flag, tsptr, UIO_SYSSPACE);
4104 }
4105 
4106 /*
4107  * Truncate a file given its path name.
4108  */
4109 /* ARGSUSED */
4110 int
4111 sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval)
4112 {
4113 	/* {
4114 		syscallarg(const char *) path;
4115 		syscallarg(int) pad;
4116 		syscallarg(off_t) length;
4117 	} */
4118 	struct vnode *vp;
4119 	struct vattr vattr;
4120 	int error;
4121 
4122 	if (SCARG(uap, length) < 0)
4123 		return EINVAL;
4124 
4125 	error = namei_simple_user(SCARG(uap, path),
4126 				NSM_FOLLOW_TRYEMULROOT, &vp);
4127 	if (error != 0)
4128 		return (error);
4129 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4130 	if (vp->v_type == VDIR)
4131 		error = EISDIR;
4132 	else if ((error = vn_writechk(vp)) == 0 &&
4133 	    (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
4134 		vattr_null(&vattr);
4135 		vattr.va_size = SCARG(uap, length);
4136 		error = VOP_SETATTR(vp, &vattr, l->l_cred);
4137 	}
4138 	vput(vp);
4139 	return (error);
4140 }
4141 
4142 /*
4143  * Truncate a file given a file descriptor.
4144  */
4145 /* ARGSUSED */
4146 int
4147 sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval)
4148 {
4149 	/* {
4150 		syscallarg(int) fd;
4151 		syscallarg(int) pad;
4152 		syscallarg(off_t) length;
4153 	} */
4154 	struct vattr vattr;
4155 	struct vnode *vp;
4156 	file_t *fp;
4157 	int error;
4158 
4159 	if (SCARG(uap, length) < 0)
4160 		return EINVAL;
4161 
4162 	/* fd_getvnode() will use the descriptor for us */
4163 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4164 		return (error);
4165 	if ((fp->f_flag & FWRITE) == 0) {
4166 		error = EINVAL;
4167 		goto out;
4168 	}
4169 	vp = fp->f_vnode;
4170 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4171 	if (vp->v_type == VDIR)
4172 		error = EISDIR;
4173 	else if ((error = vn_writechk(vp)) == 0) {
4174 		vattr_null(&vattr);
4175 		vattr.va_size = SCARG(uap, length);
4176 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
4177 	}
4178 	VOP_UNLOCK(vp);
4179  out:
4180 	fd_putfile(SCARG(uap, fd));
4181 	return (error);
4182 }
4183 
4184 /*
4185  * Sync an open file.
4186  */
4187 /* ARGSUSED */
4188 int
4189 sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
4190 {
4191 	/* {
4192 		syscallarg(int) fd;
4193 	} */
4194 	struct vnode *vp;
4195 	file_t *fp;
4196 	int error;
4197 
4198 	/* fd_getvnode() will use the descriptor for us */
4199 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4200 		return (error);
4201 	vp = fp->f_vnode;
4202 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4203 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
4204 	VOP_UNLOCK(vp);
4205 	fd_putfile(SCARG(uap, fd));
4206 	return (error);
4207 }
4208 
4209 /*
4210  * Sync a range of file data.  API modeled after that found in AIX.
4211  *
4212  * FDATASYNC indicates that we need only save enough metadata to be able
4213  * to re-read the written data.
4214  */
4215 /* ARGSUSED */
4216 int
4217 sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval)
4218 {
4219 	/* {
4220 		syscallarg(int) fd;
4221 		syscallarg(int) flags;
4222 		syscallarg(off_t) start;
4223 		syscallarg(off_t) length;
4224 	} */
4225 	struct vnode *vp;
4226 	file_t *fp;
4227 	int flags, nflags;
4228 	off_t s, e, len;
4229 	int error;
4230 
4231 	/* fd_getvnode() will use the descriptor for us */
4232 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4233 		return (error);
4234 
4235 	if ((fp->f_flag & FWRITE) == 0) {
4236 		error = EBADF;
4237 		goto out;
4238 	}
4239 
4240 	flags = SCARG(uap, flags);
4241 	if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
4242 	    ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
4243 		error = EINVAL;
4244 		goto out;
4245 	}
4246 	/* Now set up the flags for value(s) to pass to VOP_FSYNC() */
4247 	if (flags & FDATASYNC)
4248 		nflags = FSYNC_DATAONLY | FSYNC_WAIT;
4249 	else
4250 		nflags = FSYNC_WAIT;
4251 	if (flags & FDISKSYNC)
4252 		nflags |= FSYNC_CACHE;
4253 
4254 	len = SCARG(uap, length);
4255 	/* If length == 0, we do the whole file, and s = e = 0 will do that */
4256 	if (len) {
4257 		s = SCARG(uap, start);
4258 		if (s < 0 || len < 0 || len > OFF_T_MAX - s) {
4259 			error = EINVAL;
4260 			goto out;
4261 		}
4262 		e = s + len;
4263 		KASSERT(s <= e);
4264 	} else {
4265 		e = 0;
4266 		s = 0;
4267 	}
4268 
4269 	vp = fp->f_vnode;
4270 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4271 	error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
4272 	VOP_UNLOCK(vp);
4273 out:
4274 	fd_putfile(SCARG(uap, fd));
4275 	return (error);
4276 }
4277 
4278 /*
4279  * Sync the data of an open file.
4280  */
4281 /* ARGSUSED */
4282 int
4283 sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval)
4284 {
4285 	/* {
4286 		syscallarg(int) fd;
4287 	} */
4288 	struct vnode *vp;
4289 	file_t *fp;
4290 	int error;
4291 
4292 	/* fd_getvnode() will use the descriptor for us */
4293 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4294 		return (error);
4295 	vp = fp->f_vnode;
4296 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4297 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
4298 	VOP_UNLOCK(vp);
4299 	fd_putfile(SCARG(uap, fd));
4300 	return (error);
4301 }
4302 
4303 /*
4304  * Rename files, (standard) BSD semantics frontend.
4305  */
4306 /* ARGSUSED */
4307 int
4308 sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval)
4309 {
4310 	/* {
4311 		syscallarg(const char *) from;
4312 		syscallarg(const char *) to;
4313 	} */
4314 
4315 	return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4316 	    SCARG(uap, to), UIO_USERSPACE, 0));
4317 }
4318 
4319 int
4320 sys_renameat(struct lwp *l, const struct sys_renameat_args *uap,
4321     register_t *retval)
4322 {
4323 	/* {
4324 		syscallarg(int) fromfd;
4325 		syscallarg(const char *) from;
4326 		syscallarg(int) tofd;
4327 		syscallarg(const char *) to;
4328 	} */
4329 
4330 	return (do_sys_renameat(l, SCARG(uap, fromfd), SCARG(uap, from),
4331 	    SCARG(uap, tofd), SCARG(uap, to), UIO_USERSPACE, 0));
4332 }
4333 
4334 /*
4335  * Rename files, POSIX semantics frontend.
4336  */
4337 /* ARGSUSED */
4338 int
4339 sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval)
4340 {
4341 	/* {
4342 		syscallarg(const char *) from;
4343 		syscallarg(const char *) to;
4344 	} */
4345 
4346 	return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4347 	    SCARG(uap, to), UIO_USERSPACE, 1));
4348 }
4349 
4350 /*
4351  * Rename files.  Source and destination must either both be directories,
4352  * or both not be directories.  If target is a directory, it must be empty.
4353  * If `from' and `to' refer to the same object, the value of the `retain'
4354  * argument is used to determine whether `from' will be
4355  *
4356  * (retain == 0)	deleted unless `from' and `to' refer to the same
4357  *			object in the file system's name space (BSD).
4358  * (retain == 1)	always retained (POSIX).
4359  *
4360  * XXX Synchronize with nfsrv_rename in nfs_serv.c.
4361  */
4362 int
4363 do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
4364 {
4365 	return do_sys_renameat(NULL, AT_FDCWD, from, AT_FDCWD, to, seg, retain);
4366 }
4367 
4368 static int
4369 do_sys_renameat(struct lwp *l, int fromfd, const char *from, int tofd,
4370     const char *to, enum uio_seg seg, int retain)
4371 {
4372 	struct pathbuf *fpb, *tpb;
4373 	struct nameidata fnd, tnd;
4374 	struct vnode *fdvp, *fvp;
4375 	struct vnode *tdvp, *tvp;
4376 	struct mount *mp, *tmp;
4377 	int error;
4378 
4379 	KASSERT(l != NULL || (fromfd == AT_FDCWD && tofd == AT_FDCWD));
4380 
4381 	error = pathbuf_maybe_copyin(from, seg, &fpb);
4382 	if (error)
4383 		goto out0;
4384 	KASSERT(fpb != NULL);
4385 
4386 	error = pathbuf_maybe_copyin(to, seg, &tpb);
4387 	if (error)
4388 		goto out1;
4389 	KASSERT(tpb != NULL);
4390 
4391 	/*
4392 	 * Lookup from.
4393 	 *
4394 	 * XXX LOCKPARENT is wrong because we don't actually want it
4395 	 * locked yet, but (a) namei is insane, and (b) VOP_RENAME is
4396 	 * insane, so for the time being we need to leave it like this.
4397 	 */
4398 	NDINIT(&fnd, DELETE, (LOCKPARENT | TRYEMULROOT), fpb);
4399 	if ((error = fd_nameiat(l, fromfd, &fnd)) != 0)
4400 		goto out2;
4401 
4402 	/*
4403 	 * Pull out the important results of the lookup, fdvp and fvp.
4404 	 * Of course, fvp is bogus because we're about to unlock fdvp.
4405 	 */
4406 	fdvp = fnd.ni_dvp;
4407 	fvp = fnd.ni_vp;
4408 	mp = fdvp->v_mount;
4409 	KASSERT(fdvp != NULL);
4410 	KASSERT(fvp != NULL);
4411 	KASSERT((fdvp == fvp) || (VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE));
4412 	/*
4413 	 * Bracket the operation with fstrans_start()/fstrans_done().
4414 	 *
4415 	 * Inside the bracket this file system cannot be unmounted so
4416 	 * a vnode on this file system cannot change its v_mount.
4417 	 * A vnode on another file system may still change to dead mount.
4418 	 */
4419 	fstrans_start(mp);
4420 
4421 	/*
4422 	 * Make sure neither fdvp nor fvp is locked.
4423 	 */
4424 	if (fdvp != fvp)
4425 		VOP_UNLOCK(fdvp);
4426 	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4427 	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4428 
4429 	/*
4430 	 * Reject renaming `.' and `..'.  Can't do this until after
4431 	 * namei because we need namei's parsing to find the final
4432 	 * component name.  (namei should just leave us with the final
4433 	 * component name and not look it up itself, but anyway...)
4434 	 *
4435 	 * This was here before because we used to relookup from
4436 	 * instead of to and relookup requires the caller to check
4437 	 * this, but now file systems may depend on this check, so we
4438 	 * must retain it until the file systems are all rototilled.
4439 	 */
4440 	if (((fnd.ni_cnd.cn_namelen == 1) &&
4441 		(fnd.ni_cnd.cn_nameptr[0] == '.')) ||
4442 	    ((fnd.ni_cnd.cn_namelen == 2) &&
4443 		(fnd.ni_cnd.cn_nameptr[0] == '.') &&
4444 		(fnd.ni_cnd.cn_nameptr[1] == '.'))) {
4445 		error = EINVAL;	/* XXX EISDIR?  */
4446 		goto abort0;
4447 	}
4448 
4449 	/*
4450 	 * Lookup to.
4451 	 *
4452 	 * XXX LOCKPARENT is wrong, but...insanity, &c.  Also, using
4453 	 * fvp here to decide whether to add CREATEDIR is a load of
4454 	 * bollocks because fvp might be the wrong node by now, since
4455 	 * fdvp is unlocked.
4456 	 *
4457 	 * XXX Why not pass CREATEDIR always?
4458 	 */
4459 	NDINIT(&tnd, RENAME,
4460 	    (LOCKPARENT | NOCACHE | TRYEMULROOT |
4461 		((fvp->v_type == VDIR)? CREATEDIR : 0)),
4462 	    tpb);
4463 	if ((error = fd_nameiat(l, tofd, &tnd)) != 0)
4464 		goto abort0;
4465 
4466 	/*
4467 	 * Pull out the important results of the lookup, tdvp and tvp.
4468 	 * Of course, tvp is bogus because we're about to unlock tdvp.
4469 	 */
4470 	tdvp = tnd.ni_dvp;
4471 	tvp = tnd.ni_vp;
4472 	KASSERT(tdvp != NULL);
4473 	KASSERT((tdvp == tvp) || (VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE));
4474 
4475 	if (fvp->v_type == VDIR)
4476 		tnd.ni_cnd.cn_flags |= WILLBEDIR;
4477 	/*
4478 	 * Make sure neither tdvp nor tvp is locked.
4479 	 */
4480 	if (tdvp != tvp)
4481 		VOP_UNLOCK(tdvp);
4482 	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
4483 	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
4484 
4485 	/*
4486 	 * Reject renaming onto `.' or `..'.  relookup is unhappy with
4487 	 * these, which is why we must do this here.  Once upon a time
4488 	 * we relooked up from instead of to, and consequently didn't
4489 	 * need this check, but now that we relookup to instead of
4490 	 * from, we need this; and we shall need it forever forward
4491 	 * until the VOP_RENAME protocol changes, because file systems
4492 	 * will no doubt begin to depend on this check.
4493 	 */
4494 	if ((tnd.ni_cnd.cn_namelen == 1) && (tnd.ni_cnd.cn_nameptr[0] == '.')) {
4495 		error = EISDIR;
4496 		goto abort1;
4497 	}
4498 	if ((tnd.ni_cnd.cn_namelen == 2) &&
4499 	    (tnd.ni_cnd.cn_nameptr[0] == '.') &&
4500 	    (tnd.ni_cnd.cn_nameptr[1] == '.')) {
4501 		error = EINVAL;
4502 		goto abort1;
4503 	}
4504 
4505 	/*
4506 	 * Make sure the mount points match.  Although we don't hold
4507 	 * any vnode locks, the v_mount on fdvp file system are stable.
4508 	 *
4509 	 * Unmounting another file system at an inopportune moment may
4510 	 * cause tdvp to disappear and change its v_mount to dead.
4511 	 *
4512 	 * So in either case different v_mount means cross-device rename.
4513 	 */
4514 	KASSERT(mp != NULL);
4515 	tmp = tdvp->v_mount;
4516 
4517 	if (mp != tmp) {
4518 		error = EXDEV;
4519 		goto abort1;
4520 	}
4521 
4522 	/*
4523 	 * Take the vfs rename lock to avoid cross-directory screw cases.
4524 	 * Nothing is locked currently, so taking this lock is safe.
4525 	 */
4526 	error = VFS_RENAMELOCK_ENTER(mp);
4527 	if (error)
4528 		goto abort1;
4529 
4530 	/*
4531 	 * Now fdvp, fvp, tdvp, and (if nonnull) tvp are referenced,
4532 	 * and nothing is locked except for the vfs rename lock.
4533 	 *
4534 	 * The next step is a little rain dance to conform to the
4535 	 * insane lock protocol, even though it does nothing to ward
4536 	 * off race conditions.
4537 	 *
4538 	 * We need tdvp and tvp to be locked.  However, because we have
4539 	 * unlocked tdvp in order to hold no locks while we take the
4540 	 * vfs rename lock, tvp may be wrong here, and we can't safely
4541 	 * lock it even if the sensible file systems will just unlock
4542 	 * it straight away.  Consequently, we must lock tdvp and then
4543 	 * relookup tvp to get it locked.
4544 	 *
4545 	 * Finally, because the VOP_RENAME protocol is brain-damaged
4546 	 * and various file systems insanely depend on the semantics of
4547 	 * this brain damage, the lookup of to must be the last lookup
4548 	 * before VOP_RENAME.
4549 	 */
4550 	vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
4551 	error = relookup(tdvp, &tnd.ni_vp, &tnd.ni_cnd, 0);
4552 	if (error)
4553 		goto abort2;
4554 
4555 	/*
4556 	 * Drop the old tvp and pick up the new one -- which might be
4557 	 * the same, but that doesn't matter to us.  After this, tdvp
4558 	 * and tvp should both be locked.
4559 	 */
4560 	if (tvp != NULL)
4561 		vrele(tvp);
4562 	tvp = tnd.ni_vp;
4563 	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4564 	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
4565 
4566 	/*
4567 	 * The old do_sys_rename had various consistency checks here
4568 	 * involving fvp and tvp.  fvp is bogus already here, and tvp
4569 	 * will become bogus soon in any sensible file system, so the
4570 	 * only purpose in putting these checks here is to give lip
4571 	 * service to these screw cases and to acknowledge that they
4572 	 * exist, not actually to handle them, but here you go
4573 	 * anyway...
4574 	 */
4575 
4576 	/*
4577 	 * Acknowledge that directories and non-directories aren't
4578 	 * suposed to mix.
4579 	 */
4580 	if (tvp != NULL) {
4581 		if ((fvp->v_type == VDIR) && (tvp->v_type != VDIR)) {
4582 			error = ENOTDIR;
4583 			goto abort3;
4584 		} else if ((fvp->v_type != VDIR) && (tvp->v_type == VDIR)) {
4585 			error = EISDIR;
4586 			goto abort3;
4587 		}
4588 	}
4589 
4590 	/*
4591 	 * Acknowledge some random screw case, among the dozens that
4592 	 * might arise.
4593 	 */
4594 	if (fvp == tdvp) {
4595 		error = EINVAL;
4596 		goto abort3;
4597 	}
4598 
4599 	/*
4600 	 * Acknowledge that POSIX has a wacky screw case.
4601 	 *
4602 	 * XXX Eventually the retain flag needs to be passed on to
4603 	 * VOP_RENAME.
4604 	 */
4605 	if (fvp == tvp) {
4606 		if (retain) {
4607 			error = 0;
4608 			goto abort3;
4609 		} else if ((fdvp == tdvp) &&
4610 		    (fnd.ni_cnd.cn_namelen == tnd.ni_cnd.cn_namelen) &&
4611 		    (0 == memcmp(fnd.ni_cnd.cn_nameptr, tnd.ni_cnd.cn_nameptr,
4612 			fnd.ni_cnd.cn_namelen))) {
4613 			error = 0;
4614 			goto abort3;
4615 		}
4616 	}
4617 
4618 	/*
4619 	 * Make sure veriexec can screw us up.  (But a race can screw
4620 	 * up veriexec, of course -- remember, fvp and (soon) tvp are
4621 	 * bogus.)
4622 	 */
4623 #if NVERIEXEC > 0
4624 	{
4625 		char *f1, *f2;
4626 		size_t f1_len;
4627 		size_t f2_len;
4628 
4629 		f1_len = fnd.ni_cnd.cn_namelen + 1;
4630 		f1 = kmem_alloc(f1_len, KM_SLEEP);
4631 		strlcpy(f1, fnd.ni_cnd.cn_nameptr, f1_len);
4632 
4633 		f2_len = tnd.ni_cnd.cn_namelen + 1;
4634 		f2 = kmem_alloc(f2_len, KM_SLEEP);
4635 		strlcpy(f2, tnd.ni_cnd.cn_nameptr, f2_len);
4636 
4637 		error = veriexec_renamechk(curlwp, fvp, f1, tvp, f2);
4638 
4639 		kmem_free(f1, f1_len);
4640 		kmem_free(f2, f2_len);
4641 
4642 		if (error)
4643 			goto abort3;
4644 	}
4645 #endif /* NVERIEXEC > 0 */
4646 
4647 	/*
4648 	 * All ready.  Incant the rename vop.
4649 	 */
4650 	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4651 	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4652 	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4653 	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
4654 	error = VOP_RENAME(fdvp, fvp, &fnd.ni_cnd, tdvp, tvp, &tnd.ni_cnd);
4655 
4656 	/*
4657 	 * VOP_RENAME releases fdvp, fvp, tdvp, and tvp, and unlocks
4658 	 * tdvp and tvp.  But we can't assert any of that.
4659 	 */
4660 	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4661 	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4662 	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
4663 	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
4664 
4665 	/*
4666 	 * So all we have left to do is to drop the rename lock and
4667 	 * destroy the pathbufs.
4668 	 */
4669 	VFS_RENAMELOCK_EXIT(mp);
4670 	fstrans_done(mp);
4671 	goto out2;
4672 
4673 abort3:	if ((tvp != NULL) && (tvp != tdvp))
4674 		VOP_UNLOCK(tvp);
4675 abort2:	VOP_UNLOCK(tdvp);
4676 	VFS_RENAMELOCK_EXIT(mp);
4677 abort1:	VOP_ABORTOP(tdvp, &tnd.ni_cnd);
4678 	vrele(tdvp);
4679 	if (tvp != NULL)
4680 		vrele(tvp);
4681 abort0:	VOP_ABORTOP(fdvp, &fnd.ni_cnd);
4682 	vrele(fdvp);
4683 	vrele(fvp);
4684 	fstrans_done(mp);
4685 out2:	pathbuf_destroy(tpb);
4686 out1:	pathbuf_destroy(fpb);
4687 out0:	return error;
4688 }
4689 
4690 /*
4691  * Make a directory file.
4692  */
4693 /* ARGSUSED */
4694 int
4695 sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
4696 {
4697 	/* {
4698 		syscallarg(const char *) path;
4699 		syscallarg(int) mode;
4700 	} */
4701 
4702 	return do_sys_mkdirat(l, AT_FDCWD, SCARG(uap, path),
4703 	    SCARG(uap, mode), UIO_USERSPACE);
4704 }
4705 
4706 int
4707 sys_mkdirat(struct lwp *l, const struct sys_mkdirat_args *uap,
4708     register_t *retval)
4709 {
4710 	/* {
4711 		syscallarg(int) fd;
4712 		syscallarg(const char *) path;
4713 		syscallarg(int) mode;
4714 	} */
4715 
4716 	return do_sys_mkdirat(l, SCARG(uap, fd), SCARG(uap, path),
4717 	    SCARG(uap, mode), UIO_USERSPACE);
4718 }
4719 
4720 
4721 int
4722 do_sys_mkdir(const char *path, mode_t mode, enum uio_seg seg)
4723 {
4724 	return do_sys_mkdirat(NULL, AT_FDCWD, path, mode, seg);
4725 }
4726 
4727 static int
4728 do_sys_mkdirat(struct lwp *l, int fdat, const char *path, mode_t mode,
4729     enum uio_seg seg)
4730 {
4731 	struct proc *p = curlwp->l_proc;
4732 	struct vnode *vp;
4733 	struct vattr vattr;
4734 	int error;
4735 	struct pathbuf *pb;
4736 	struct nameidata nd;
4737 
4738 	KASSERT(l != NULL || fdat == AT_FDCWD);
4739 
4740 	/* XXX bollocks, should pass in a pathbuf */
4741 	error = pathbuf_maybe_copyin(path, seg, &pb);
4742 	if (error) {
4743 		return error;
4744 	}
4745 
4746 	NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT, pb);
4747 
4748 	if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
4749 		pathbuf_destroy(pb);
4750 		return (error);
4751 	}
4752 	vp = nd.ni_vp;
4753 	if (vp != NULL) {
4754 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
4755 		if (nd.ni_dvp == vp)
4756 			vrele(nd.ni_dvp);
4757 		else
4758 			vput(nd.ni_dvp);
4759 		vrele(vp);
4760 		pathbuf_destroy(pb);
4761 		return (EEXIST);
4762 	}
4763 	vattr_null(&vattr);
4764 	vattr.va_type = VDIR;
4765 	/* We will read cwdi->cwdi_cmask unlocked. */
4766 	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
4767 	nd.ni_cnd.cn_flags |= WILLBEDIR;
4768 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
4769 	if (!error)
4770 		vrele(nd.ni_vp);
4771 	vput(nd.ni_dvp);
4772 	pathbuf_destroy(pb);
4773 	return (error);
4774 }
4775 
4776 /*
4777  * Remove a directory file.
4778  */
4779 /* ARGSUSED */
4780 int
4781 sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
4782 {
4783 	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path),
4784 	    AT_REMOVEDIR, UIO_USERSPACE);
4785 }
4786 
4787 /*
4788  * Read a block of directory entries in a file system independent format.
4789  */
4790 int
4791 sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval)
4792 {
4793 	/* {
4794 		syscallarg(int) fd;
4795 		syscallarg(char *) buf;
4796 		syscallarg(size_t) count;
4797 	} */
4798 	file_t *fp;
4799 	int error, done;
4800 
4801 	/* fd_getvnode() will use the descriptor for us */
4802 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4803 		return (error);
4804 	if ((fp->f_flag & FREAD) == 0) {
4805 		error = EBADF;
4806 		goto out;
4807 	}
4808 	error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
4809 			SCARG(uap, count), &done, l, 0, 0);
4810 	ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
4811 	*retval = done;
4812  out:
4813 	fd_putfile(SCARG(uap, fd));
4814 	return (error);
4815 }
4816 
4817 /*
4818  * Set the mode mask for creation of filesystem nodes.
4819  */
4820 int
4821 sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
4822 {
4823 	/* {
4824 		syscallarg(mode_t) newmask;
4825 	} */
4826 
4827 	/*
4828 	 * cwdi->cwdi_cmask will be read unlocked elsewhere, and no kind of
4829 	 * serialization with those reads is required.  It's important to
4830 	 * return a coherent answer for the caller of umask() though, and
4831 	 * the atomic operation accomplishes that.
4832 	 */
4833 	*retval = atomic_swap_uint(&curproc->p_cwdi->cwdi_cmask,
4834 	    SCARG(uap, newmask) & ALLPERMS);
4835 
4836 	return (0);
4837 }
4838 
4839 int
4840 dorevoke(struct vnode *vp, kauth_cred_t cred)
4841 {
4842 	struct vattr vattr;
4843 	int error, fs_decision;
4844 
4845 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4846 	error = VOP_GETATTR(vp, &vattr, cred);
4847 	VOP_UNLOCK(vp);
4848 	if (error != 0)
4849 		return error;
4850 	fs_decision = (kauth_cred_geteuid(cred) == vattr.va_uid) ? 0 : EPERM;
4851 	error = kauth_authorize_vnode(cred, KAUTH_VNODE_REVOKE, vp, NULL,
4852 	    fs_decision);
4853 	if (!error)
4854 		VOP_REVOKE(vp, REVOKEALL);
4855 	return (error);
4856 }
4857 
4858 /*
4859  * Void all references to file by ripping underlying filesystem
4860  * away from vnode.
4861  */
4862 /* ARGSUSED */
4863 int
4864 sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval)
4865 {
4866 	/* {
4867 		syscallarg(const char *) path;
4868 	} */
4869 	struct vnode *vp;
4870 	int error;
4871 
4872 	error = namei_simple_user(SCARG(uap, path),
4873 				NSM_FOLLOW_TRYEMULROOT, &vp);
4874 	if (error != 0)
4875 		return (error);
4876 	error = dorevoke(vp, l->l_cred);
4877 	vrele(vp);
4878 	return (error);
4879 }
4880 
4881 /*
4882  * Allocate backing store for a file, filling a hole without having to
4883  * explicitly write anything out.
4884  */
4885 /* ARGSUSED */
4886 int
4887 sys_posix_fallocate(struct lwp *l, const struct sys_posix_fallocate_args *uap,
4888 		register_t *retval)
4889 {
4890 	/* {
4891 		syscallarg(int) fd;
4892 		syscallarg(off_t) pos;
4893 		syscallarg(off_t) len;
4894 	} */
4895 	int fd;
4896 	off_t pos, len;
4897 	struct file *fp;
4898 	struct vnode *vp;
4899 	int error;
4900 
4901 	fd = SCARG(uap, fd);
4902 	pos = SCARG(uap, pos);
4903 	len = SCARG(uap, len);
4904 
4905 	if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
4906 		*retval = EINVAL;
4907 		return 0;
4908 	}
4909 
4910 	error = fd_getvnode(fd, &fp);
4911 	if (error) {
4912 		*retval = error;
4913 		return 0;
4914 	}
4915 	if ((fp->f_flag & FWRITE) == 0) {
4916 		error = EBADF;
4917 		goto fail;
4918 	}
4919 	vp = fp->f_vnode;
4920 
4921 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4922 	if (vp->v_type == VDIR) {
4923 		error = EISDIR;
4924 	} else {
4925 		error = VOP_FALLOCATE(vp, pos, len);
4926 	}
4927 	VOP_UNLOCK(vp);
4928 
4929 fail:
4930 	fd_putfile(fd);
4931 	*retval = error;
4932 	return 0;
4933 }
4934 
4935 /*
4936  * Deallocate backing store for a file, creating a hole. Also used for
4937  * invoking TRIM on disks.
4938  */
4939 /* ARGSUSED */
4940 int
4941 sys_fdiscard(struct lwp *l, const struct sys_fdiscard_args *uap,
4942 		register_t *retval)
4943 {
4944 	/* {
4945 		syscallarg(int) fd;
4946 		syscallarg(off_t) pos;
4947 		syscallarg(off_t) len;
4948 	} */
4949 	int fd;
4950 	off_t pos, len;
4951 	struct file *fp;
4952 	struct vnode *vp;
4953 	int error;
4954 
4955 	fd = SCARG(uap, fd);
4956 	pos = SCARG(uap, pos);
4957 	len = SCARG(uap, len);
4958 
4959 	if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
4960 		return EINVAL;
4961 	}
4962 
4963 	error = fd_getvnode(fd, &fp);
4964 	if (error) {
4965 		return error;
4966 	}
4967 	if ((fp->f_flag & FWRITE) == 0) {
4968 		error = EBADF;
4969 		goto fail;
4970 	}
4971 	vp = fp->f_vnode;
4972 
4973 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4974 	if (vp->v_type == VDIR) {
4975 		error = EISDIR;
4976 	} else {
4977 		error = VOP_FDISCARD(vp, pos, len);
4978 	}
4979 	VOP_UNLOCK(vp);
4980 
4981 fail:
4982 	fd_putfile(fd);
4983 	return error;
4984 }
4985