xref: /netbsd-src/sys/kern/vfs_syscalls.c (revision 46f5119e40af2e51998f686b2fdcc76b5488f7f3)
1 /*	$NetBSD: vfs_syscalls.c,v 1.423 2011/04/24 21:35:29 rmind Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1989, 1993
34  *	The Regents of the University of California.  All rights reserved.
35  * (c) UNIX System Laboratories, Inc.
36  * All or some portions of this file are derived from material licensed
37  * to the University of California by American Telephone and Telegraph
38  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39  * the permission of UNIX System Laboratories, Inc.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.42 (Berkeley) 7/31/95
66  */
67 
68 /*
69  * Virtual File System System Calls
70  */
71 
72 #include <sys/cdefs.h>
73 __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.423 2011/04/24 21:35:29 rmind Exp $");
74 
75 #ifdef _KERNEL_OPT
76 #include "opt_fileassoc.h"
77 #include "veriexec.h"
78 #endif
79 
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/namei.h>
83 #include <sys/filedesc.h>
84 #include <sys/kernel.h>
85 #include <sys/file.h>
86 #include <sys/stat.h>
87 #include <sys/vnode.h>
88 #include <sys/mount.h>
89 #include <sys/proc.h>
90 #include <sys/uio.h>
91 #include <sys/kmem.h>
92 #include <sys/dirent.h>
93 #include <sys/sysctl.h>
94 #include <sys/syscallargs.h>
95 #include <sys/vfs_syscalls.h>
96 #include <sys/ktrace.h>
97 #ifdef FILEASSOC
98 #include <sys/fileassoc.h>
99 #endif /* FILEASSOC */
100 #include <sys/verified_exec.h>
101 #include <sys/kauth.h>
102 #include <sys/atomic.h>
103 #include <sys/module.h>
104 #include <sys/buf.h>
105 
106 #include <miscfs/genfs/genfs.h>
107 #include <miscfs/syncfs/syncfs.h>
108 #include <miscfs/specfs/specdev.h>
109 
110 #include <nfs/rpcv2.h>
111 #include <nfs/nfsproto.h>
112 #include <nfs/nfs.h>
113 #include <nfs/nfs_var.h>
114 
115 static int change_flags(struct vnode *, u_long, struct lwp *);
116 static int change_mode(struct vnode *, int, struct lwp *l);
117 static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
118 
119 /*
120  * This table is used to maintain compatibility with 4.3BSD
121  * and NetBSD 0.9 mount syscalls - and possibly other systems.
122  * Note, the order is important!
123  *
124  * Do not modify this table. It should only contain filesystems
125  * supported by NetBSD 0.9 and 4.3BSD.
126  */
127 const char * const mountcompatnames[] = {
128 	NULL,		/* 0 = MOUNT_NONE */
129 	MOUNT_FFS,	/* 1 = MOUNT_UFS */
130 	MOUNT_NFS,	/* 2 */
131 	MOUNT_MFS,	/* 3 */
132 	MOUNT_MSDOS,	/* 4 */
133 	MOUNT_CD9660,	/* 5 = MOUNT_ISOFS */
134 	MOUNT_FDESC,	/* 6 */
135 	MOUNT_KERNFS,	/* 7 */
136 	NULL,		/* 8 = MOUNT_DEVFS */
137 	MOUNT_AFS,	/* 9 */
138 };
139 
140 const int nmountcompatnames = __arraycount(mountcompatnames);
141 
142 static int
143 open_setfp(struct lwp *l, file_t *fp, struct vnode *vp, int indx, int flags)
144 {
145 	int error;
146 
147 	fp->f_flag = flags & FMASK;
148 	fp->f_type = DTYPE_VNODE;
149 	fp->f_ops = &vnops;
150 	fp->f_data = vp;
151 
152 	if (flags & (O_EXLOCK | O_SHLOCK)) {
153 		struct flock lf;
154 		int type;
155 
156 		lf.l_whence = SEEK_SET;
157 		lf.l_start = 0;
158 		lf.l_len = 0;
159 		if (flags & O_EXLOCK)
160 			lf.l_type = F_WRLCK;
161 		else
162 			lf.l_type = F_RDLCK;
163 		type = F_FLOCK;
164 		if ((flags & FNONBLOCK) == 0)
165 			type |= F_WAIT;
166 		VOP_UNLOCK(vp);
167 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
168 		if (error) {
169 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
170 			fd_abort(l->l_proc, fp, indx);
171 			return error;
172 		}
173 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
174 		atomic_or_uint(&fp->f_flag, FHASLOCK);
175 	}
176 	if (flags & O_CLOEXEC)
177 		fd_set_exclose(l, indx, true);
178 	return 0;
179 }
180 
181 static int
182 mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
183     void *data, size_t *data_len)
184 {
185 	struct mount *mp;
186 	int error = 0, saved_flags;
187 
188 	mp = vp->v_mount;
189 	saved_flags = mp->mnt_flag;
190 
191 	/* We can operate only on VV_ROOT nodes. */
192 	if ((vp->v_vflag & VV_ROOT) == 0) {
193 		error = EINVAL;
194 		goto out;
195 	}
196 
197 	/*
198 	 * We only allow the filesystem to be reloaded if it
199 	 * is currently mounted read-only.  Additionally, we
200 	 * prevent read-write to read-only downgrades.
201 	 */
202 	if ((flags & (MNT_RELOAD | MNT_RDONLY)) != 0 &&
203 	    (mp->mnt_flag & MNT_RDONLY) == 0 &&
204 	    (mp->mnt_iflag & IMNT_CAN_RWTORO) == 0) {
205 		error = EOPNOTSUPP;	/* Needs translation */
206 		goto out;
207 	}
208 
209 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
210 	    KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
211 	if (error)
212 		goto out;
213 
214 	if (vfs_busy(mp, NULL)) {
215 		error = EPERM;
216 		goto out;
217 	}
218 
219 	mutex_enter(&mp->mnt_updating);
220 
221 	mp->mnt_flag &= ~MNT_OP_FLAGS;
222 	mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
223 
224 	/*
225 	 * Set the mount level flags.
226 	 */
227 	if (flags & MNT_RDONLY)
228 		mp->mnt_flag |= MNT_RDONLY;
229 	else if (mp->mnt_flag & MNT_RDONLY)
230 		mp->mnt_iflag |= IMNT_WANTRDWR;
231 	mp->mnt_flag &=
232 	  ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
233 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
234 	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
235 	    MNT_LOG);
236 	mp->mnt_flag |= flags &
237 	   (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
238 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
239 	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
240 	    MNT_LOG | MNT_IGNORE);
241 
242 	error = VFS_MOUNT(mp, path, data, data_len);
243 
244 	if (error && data != NULL) {
245 		int error2;
246 
247 		/*
248 		 * Update failed; let's try and see if it was an
249 		 * export request.  For compat with 3.0 and earlier.
250 		 */
251 		error2 = vfs_hooks_reexport(mp, path, data);
252 
253 		/*
254 		 * Only update error code if the export request was
255 		 * understood but some problem occurred while
256 		 * processing it.
257 		 */
258 		if (error2 != EJUSTRETURN)
259 			error = error2;
260 	}
261 
262 	if (mp->mnt_iflag & IMNT_WANTRDWR)
263 		mp->mnt_flag &= ~MNT_RDONLY;
264 	if (error)
265 		mp->mnt_flag = saved_flags;
266 	mp->mnt_flag &= ~MNT_OP_FLAGS;
267 	mp->mnt_iflag &= ~IMNT_WANTRDWR;
268 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
269 		if (mp->mnt_syncer == NULL)
270 			error = vfs_allocate_syncvnode(mp);
271 	} else {
272 		if (mp->mnt_syncer != NULL)
273 			vfs_deallocate_syncvnode(mp);
274 	}
275 	mutex_exit(&mp->mnt_updating);
276 	vfs_unbusy(mp, false, NULL);
277 
278  out:
279 	return (error);
280 }
281 
282 static int
283 mount_get_vfsops(const char *fstype, struct vfsops **vfsops)
284 {
285 	char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
286 	int error;
287 
288 	/* Copy file-system type from userspace.  */
289 	error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL);
290 	if (error) {
291 		/*
292 		 * Historically, filesystem types were identified by numbers.
293 		 * If we get an integer for the filesystem type instead of a
294 		 * string, we check to see if it matches one of the historic
295 		 * filesystem types.
296 		 */
297 		u_long fsindex = (u_long)fstype;
298 		if (fsindex >= nmountcompatnames ||
299 		    mountcompatnames[fsindex] == NULL)
300 			return ENODEV;
301 		strlcpy(fstypename, mountcompatnames[fsindex],
302 		    sizeof(fstypename));
303 	}
304 
305 	/* Accept `ufs' as an alias for `ffs', for compatibility. */
306 	if (strcmp(fstypename, "ufs") == 0)
307 		fstypename[0] = 'f';
308 
309 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
310 		return 0;
311 
312 	/* If we can autoload a vfs module, try again */
313 	(void)module_autoload(fstypename, MODULE_CLASS_VFS);
314 
315 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
316 		return 0;
317 
318 	return ENODEV;
319 }
320 
321 static int
322 mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
323     void *data, size_t *data_len)
324 {
325 	struct mount *mp;
326 	int error;
327 
328 	/* If MNT_GETARGS is specified, it should be the only flag. */
329 	if (flags & ~MNT_GETARGS)
330 		return EINVAL;
331 
332 	mp = vp->v_mount;
333 
334 	/* XXX: probably some notion of "can see" here if we want isolation. */
335 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
336 	    KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
337 	if (error)
338 		return error;
339 
340 	if ((vp->v_vflag & VV_ROOT) == 0)
341 		return EINVAL;
342 
343 	if (vfs_busy(mp, NULL))
344 		return EPERM;
345 
346 	mutex_enter(&mp->mnt_updating);
347 	mp->mnt_flag &= ~MNT_OP_FLAGS;
348 	mp->mnt_flag |= MNT_GETARGS;
349 	error = VFS_MOUNT(mp, path, data, data_len);
350 	mp->mnt_flag &= ~MNT_OP_FLAGS;
351 	mutex_exit(&mp->mnt_updating);
352 
353 	vfs_unbusy(mp, false, NULL);
354 	return (error);
355 }
356 
357 int
358 sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval)
359 {
360 	/* {
361 		syscallarg(const char *) type;
362 		syscallarg(const char *) path;
363 		syscallarg(int) flags;
364 		syscallarg(void *) data;
365 		syscallarg(size_t) data_len;
366 	} */
367 
368 	return do_sys_mount(l, NULL, SCARG(uap, type), SCARG(uap, path),
369 	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE,
370 	    SCARG(uap, data_len), retval);
371 }
372 
373 int
374 do_sys_mount(struct lwp *l, struct vfsops *vfsops, const char *type,
375     const char *path, int flags, void *data, enum uio_seg data_seg,
376     size_t data_len, register_t *retval)
377 {
378 	struct vnode *vp;
379 	void *data_buf = data;
380 	bool vfsopsrele = false;
381 	int error;
382 
383 	/* XXX: The calling convention of this routine is totally bizarre */
384 	if (vfsops)
385 		vfsopsrele = true;
386 
387 	/*
388 	 * Get vnode to be covered
389 	 */
390 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
391 	if (error != 0) {
392 		vp = NULL;
393 		goto done;
394 	}
395 
396 	if (vfsops == NULL) {
397 		if (flags & (MNT_GETARGS | MNT_UPDATE)) {
398 			vfsops = vp->v_mount->mnt_op;
399 		} else {
400 			/* 'type' is userspace */
401 			error = mount_get_vfsops(type, &vfsops);
402 			if (error != 0)
403 				goto done;
404 			vfsopsrele = true;
405 		}
406 	}
407 
408 	if (data != NULL && data_seg == UIO_USERSPACE) {
409 		if (data_len == 0) {
410 			/* No length supplied, use default for filesystem */
411 			data_len = vfsops->vfs_min_mount_data;
412 			if (data_len > VFS_MAX_MOUNT_DATA) {
413 				error = EINVAL;
414 				goto done;
415 			}
416 			/*
417 			 * Hopefully a longer buffer won't make copyin() fail.
418 			 * For compatibility with 3.0 and earlier.
419 			 */
420 			if (flags & MNT_UPDATE
421 			    && data_len < sizeof (struct mnt_export_args30))
422 				data_len = sizeof (struct mnt_export_args30);
423 		}
424 		data_buf = kmem_alloc(data_len, KM_SLEEP);
425 
426 		/* NFS needs the buffer even for mnt_getargs .... */
427 		error = copyin(data, data_buf, data_len);
428 		if (error != 0)
429 			goto done;
430 	}
431 
432 	if (flags & MNT_GETARGS) {
433 		if (data_len == 0) {
434 			error = EINVAL;
435 			goto done;
436 		}
437 		error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
438 		if (error != 0)
439 			goto done;
440 		if (data_seg == UIO_USERSPACE)
441 			error = copyout(data_buf, data, data_len);
442 		*retval = data_len;
443 	} else if (flags & MNT_UPDATE) {
444 		error = mount_update(l, vp, path, flags, data_buf, &data_len);
445 	} else {
446 		/* Locking is handled internally in mount_domount(). */
447 		KASSERT(vfsopsrele == true);
448 		error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
449 		    &data_len);
450 		vfsopsrele = false;
451 	}
452 
453     done:
454 	if (vfsopsrele)
455 		vfs_delref(vfsops);
456     	if (vp != NULL) {
457 	    	vrele(vp);
458 	}
459 	if (data_buf != data)
460 		kmem_free(data_buf, data_len);
461 	return (error);
462 }
463 
464 /*
465  * Unmount a file system.
466  *
467  * Note: unmount takes a path to the vnode mounted on as argument,
468  * not special file (as before).
469  */
470 /* ARGSUSED */
471 int
472 sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval)
473 {
474 	/* {
475 		syscallarg(const char *) path;
476 		syscallarg(int) flags;
477 	} */
478 	struct vnode *vp;
479 	struct mount *mp;
480 	int error;
481 	struct pathbuf *pb;
482 	struct nameidata nd;
483 
484 	error = pathbuf_copyin(SCARG(uap, path), &pb);
485 	if (error) {
486 		return error;
487 	}
488 
489 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
490 	if ((error = namei(&nd)) != 0) {
491 		pathbuf_destroy(pb);
492 		return error;
493 	}
494 	vp = nd.ni_vp;
495 	pathbuf_destroy(pb);
496 
497 	mp = vp->v_mount;
498 	atomic_inc_uint(&mp->mnt_refcnt);
499 	VOP_UNLOCK(vp);
500 
501 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
502 	    KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
503 	if (error) {
504 		vrele(vp);
505 		vfs_destroy(mp);
506 		return (error);
507 	}
508 
509 	/*
510 	 * Don't allow unmounting the root file system.
511 	 */
512 	if (mp->mnt_flag & MNT_ROOTFS) {
513 		vrele(vp);
514 		vfs_destroy(mp);
515 		return (EINVAL);
516 	}
517 
518 	/*
519 	 * Must be the root of the filesystem
520 	 */
521 	if ((vp->v_vflag & VV_ROOT) == 0) {
522 		vrele(vp);
523 		vfs_destroy(mp);
524 		return (EINVAL);
525 	}
526 
527 	vrele(vp);
528 	error = dounmount(mp, SCARG(uap, flags), l);
529 	vfs_destroy(mp);
530 	return error;
531 }
532 
533 /*
534  * Sync each mounted filesystem.
535  */
536 #ifdef DEBUG
537 int syncprt = 0;
538 struct ctldebug debug0 = { "syncprt", &syncprt };
539 #endif
540 
541 /* ARGSUSED */
542 int
543 sys_sync(struct lwp *l, const void *v, register_t *retval)
544 {
545 	struct mount *mp, *nmp;
546 	int asyncflag;
547 
548 	if (l == NULL)
549 		l = &lwp0;
550 
551 	mutex_enter(&mountlist_lock);
552 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
553 	     mp = nmp) {
554 		if (vfs_busy(mp, &nmp)) {
555 			continue;
556 		}
557 		mutex_enter(&mp->mnt_updating);
558 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
559 			asyncflag = mp->mnt_flag & MNT_ASYNC;
560 			mp->mnt_flag &= ~MNT_ASYNC;
561 			VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
562 			if (asyncflag)
563 				 mp->mnt_flag |= MNT_ASYNC;
564 		}
565 		mutex_exit(&mp->mnt_updating);
566 		vfs_unbusy(mp, false, &nmp);
567 	}
568 	mutex_exit(&mountlist_lock);
569 #ifdef DEBUG
570 	if (syncprt)
571 		vfs_bufstats();
572 #endif /* DEBUG */
573 	return (0);
574 }
575 
576 /*
577  * Change filesystem quotas.
578  */
579 /* ARGSUSED */
580 int
581 sys___quotactl50(struct lwp *l, const struct sys___quotactl50_args *uap,
582     register_t *retval)
583 {
584 	/* {
585 		syscallarg(const char *) path;
586 		syscallarg(struct plistref *) pref;
587 	} */
588 	struct mount *mp;
589 	int error;
590 	struct vnode *vp;
591 	prop_dictionary_t dict;
592 	struct plistref pref;
593 
594 	error = namei_simple_user(SCARG(uap, path),
595 				NSM_FOLLOW_TRYEMULROOT, &vp);
596 	if (error != 0)
597 		return (error);
598 	mp = vp->v_mount;
599 	error = copyin(SCARG(uap, pref), &pref, sizeof(pref));
600 	if (error)
601 		return error;
602 	error = prop_dictionary_copyin(&pref, &dict);
603 	if (error)
604 		return error;
605 	error = VFS_QUOTACTL(mp, dict);
606 	vrele(vp);
607 	if (!error)
608 		error = prop_dictionary_copyout(&pref, dict);
609 	if (!error)
610 		error = copyout(&pref, SCARG(uap, pref), sizeof(pref));
611 	prop_object_release(dict);
612 	return (error);
613 }
614 
615 int
616 dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
617     int root)
618 {
619 	struct cwdinfo *cwdi = l->l_proc->p_cwdi;
620 	int error = 0;
621 
622 	/*
623 	 * If MNT_NOWAIT or MNT_LAZY is specified, do not
624 	 * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
625 	 * overrides MNT_NOWAIT.
626 	 */
627 	if (flags == MNT_NOWAIT	|| flags == MNT_LAZY ||
628 	    (flags != MNT_WAIT && flags != 0)) {
629 		memcpy(sp, &mp->mnt_stat, sizeof(*sp));
630 		goto done;
631 	}
632 
633 	/* Get the filesystem stats now */
634 	memset(sp, 0, sizeof(*sp));
635 	if ((error = VFS_STATVFS(mp, sp)) != 0) {
636 		return error;
637 	}
638 
639 	if (cwdi->cwdi_rdir == NULL)
640 		(void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
641 done:
642 	if (cwdi->cwdi_rdir != NULL) {
643 		size_t len;
644 		char *bp;
645 		char c;
646 		char *path = PNBUF_GET();
647 
648 		bp = path + MAXPATHLEN;
649 		*--bp = '\0';
650 		rw_enter(&cwdi->cwdi_lock, RW_READER);
651 		error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
652 		    MAXPATHLEN / 2, 0, l);
653 		rw_exit(&cwdi->cwdi_lock);
654 		if (error) {
655 			PNBUF_PUT(path);
656 			return error;
657 		}
658 		len = strlen(bp);
659 		if (len != 1) {
660 			/*
661 			 * for mount points that are below our root, we can see
662 			 * them, so we fix up the pathname and return them. The
663 			 * rest we cannot see, so we don't allow viewing the
664 			 * data.
665 			 */
666 			if (strncmp(bp, sp->f_mntonname, len) == 0 &&
667 			    ((c = sp->f_mntonname[len]) == '/' || c == '\0')) {
668 				(void)strlcpy(sp->f_mntonname,
669 				    c == '\0' ? "/" : &sp->f_mntonname[len],
670 				    sizeof(sp->f_mntonname));
671 			} else {
672 				if (root)
673 					(void)strlcpy(sp->f_mntonname, "/",
674 					    sizeof(sp->f_mntonname));
675 				else
676 					error = EPERM;
677 			}
678 		}
679 		PNBUF_PUT(path);
680 	}
681 	sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
682 	return error;
683 }
684 
685 /*
686  * Get filesystem statistics by path.
687  */
688 int
689 do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
690 {
691 	struct mount *mp;
692 	int error;
693 	struct vnode *vp;
694 
695 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
696 	if (error != 0)
697 		return error;
698 	mp = vp->v_mount;
699 	error = dostatvfs(mp, sb, l, flags, 1);
700 	vrele(vp);
701 	return error;
702 }
703 
704 /* ARGSUSED */
705 int
706 sys_statvfs1(struct lwp *l, const struct sys_statvfs1_args *uap, register_t *retval)
707 {
708 	/* {
709 		syscallarg(const char *) path;
710 		syscallarg(struct statvfs *) buf;
711 		syscallarg(int) flags;
712 	} */
713 	struct statvfs *sb;
714 	int error;
715 
716 	sb = STATVFSBUF_GET();
717 	error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
718 	if (error == 0)
719 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
720 	STATVFSBUF_PUT(sb);
721 	return error;
722 }
723 
724 /*
725  * Get filesystem statistics by fd.
726  */
727 int
728 do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
729 {
730 	file_t *fp;
731 	struct mount *mp;
732 	int error;
733 
734 	/* fd_getvnode() will use the descriptor for us */
735 	if ((error = fd_getvnode(fd, &fp)) != 0)
736 		return (error);
737 	mp = ((struct vnode *)fp->f_data)->v_mount;
738 	error = dostatvfs(mp, sb, curlwp, flags, 1);
739 	fd_putfile(fd);
740 	return error;
741 }
742 
743 /* ARGSUSED */
744 int
745 sys_fstatvfs1(struct lwp *l, const struct sys_fstatvfs1_args *uap, register_t *retval)
746 {
747 	/* {
748 		syscallarg(int) fd;
749 		syscallarg(struct statvfs *) buf;
750 		syscallarg(int) flags;
751 	} */
752 	struct statvfs *sb;
753 	int error;
754 
755 	sb = STATVFSBUF_GET();
756 	error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
757 	if (error == 0)
758 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
759 	STATVFSBUF_PUT(sb);
760 	return error;
761 }
762 
763 
764 /*
765  * Get statistics on all filesystems.
766  */
767 int
768 do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
769     int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
770     register_t *retval)
771 {
772 	int root = 0;
773 	struct proc *p = l->l_proc;
774 	struct mount *mp, *nmp;
775 	struct statvfs *sb;
776 	size_t count, maxcount;
777 	int error = 0;
778 
779 	sb = STATVFSBUF_GET();
780 	maxcount = bufsize / entry_sz;
781 	mutex_enter(&mountlist_lock);
782 	count = 0;
783 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
784 	     mp = nmp) {
785 		if (vfs_busy(mp, &nmp)) {
786 			continue;
787 		}
788 		if (sfsp && count < maxcount) {
789 			error = dostatvfs(mp, sb, l, flags, 0);
790 			if (error) {
791 				vfs_unbusy(mp, false, &nmp);
792 				error = 0;
793 				continue;
794 			}
795 			error = copyfn(sb, sfsp, entry_sz);
796 			if (error) {
797 				vfs_unbusy(mp, false, NULL);
798 				goto out;
799 			}
800 			sfsp = (char *)sfsp + entry_sz;
801 			root |= strcmp(sb->f_mntonname, "/") == 0;
802 		}
803 		count++;
804 		vfs_unbusy(mp, false, &nmp);
805 	}
806 	mutex_exit(&mountlist_lock);
807 
808 	if (root == 0 && p->p_cwdi->cwdi_rdir) {
809 		/*
810 		 * fake a root entry
811 		 */
812 		error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
813 		    sb, l, flags, 1);
814 		if (error != 0)
815 			goto out;
816 		if (sfsp) {
817 			error = copyfn(sb, sfsp, entry_sz);
818 			if (error != 0)
819 				goto out;
820 		}
821 		count++;
822 	}
823 	if (sfsp && count > maxcount)
824 		*retval = maxcount;
825 	else
826 		*retval = count;
827 out:
828 	STATVFSBUF_PUT(sb);
829 	return error;
830 }
831 
832 int
833 sys_getvfsstat(struct lwp *l, const struct sys_getvfsstat_args *uap, register_t *retval)
834 {
835 	/* {
836 		syscallarg(struct statvfs *) buf;
837 		syscallarg(size_t) bufsize;
838 		syscallarg(int) flags;
839 	} */
840 
841 	return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
842 	    SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
843 }
844 
845 /*
846  * Change current working directory to a given file descriptor.
847  */
848 /* ARGSUSED */
849 int
850 sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval)
851 {
852 	/* {
853 		syscallarg(int) fd;
854 	} */
855 	struct proc *p = l->l_proc;
856 	struct cwdinfo *cwdi;
857 	struct vnode *vp, *tdp;
858 	struct mount *mp;
859 	file_t *fp;
860 	int error, fd;
861 
862 	/* fd_getvnode() will use the descriptor for us */
863 	fd = SCARG(uap, fd);
864 	if ((error = fd_getvnode(fd, &fp)) != 0)
865 		return (error);
866 	vp = fp->f_data;
867 
868 	vref(vp);
869 	vn_lock(vp,  LK_EXCLUSIVE | LK_RETRY);
870 	if (vp->v_type != VDIR)
871 		error = ENOTDIR;
872 	else
873 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
874 	if (error) {
875 		vput(vp);
876 		goto out;
877 	}
878 	while ((mp = vp->v_mountedhere) != NULL) {
879 		error = vfs_busy(mp, NULL);
880 		vput(vp);
881 		if (error != 0)
882 			goto out;
883 		error = VFS_ROOT(mp, &tdp);
884 		vfs_unbusy(mp, false, NULL);
885 		if (error)
886 			goto out;
887 		vp = tdp;
888 	}
889 	VOP_UNLOCK(vp);
890 
891 	/*
892 	 * Disallow changing to a directory not under the process's
893 	 * current root directory (if there is one).
894 	 */
895 	cwdi = p->p_cwdi;
896 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
897 	if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
898 		vrele(vp);
899 		error = EPERM;	/* operation not permitted */
900 	} else {
901 		vrele(cwdi->cwdi_cdir);
902 		cwdi->cwdi_cdir = vp;
903 	}
904 	rw_exit(&cwdi->cwdi_lock);
905 
906  out:
907 	fd_putfile(fd);
908 	return (error);
909 }
910 
911 /*
912  * Change this process's notion of the root directory to a given file
913  * descriptor.
914  */
915 int
916 sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval)
917 {
918 	struct proc *p = l->l_proc;
919 	struct vnode	*vp;
920 	file_t	*fp;
921 	int		 error, fd = SCARG(uap, fd);
922 
923 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
924  	    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
925 		return error;
926 	/* fd_getvnode() will use the descriptor for us */
927 	if ((error = fd_getvnode(fd, &fp)) != 0)
928 		return error;
929 	vp = fp->f_data;
930 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
931 	if (vp->v_type != VDIR)
932 		error = ENOTDIR;
933 	else
934 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
935 	VOP_UNLOCK(vp);
936 	if (error)
937 		goto out;
938 	vref(vp);
939 
940 	change_root(p->p_cwdi, vp, l);
941 
942  out:
943 	fd_putfile(fd);
944 	return (error);
945 }
946 
947 /*
948  * Change current working directory (``.'').
949  */
950 /* ARGSUSED */
951 int
952 sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
953 {
954 	/* {
955 		syscallarg(const char *) path;
956 	} */
957 	struct proc *p = l->l_proc;
958 	struct cwdinfo *cwdi;
959 	int error;
960 	struct vnode *vp;
961 
962 	if ((error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE,
963 				  &vp, l)) != 0)
964 		return (error);
965 	cwdi = p->p_cwdi;
966 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
967 	vrele(cwdi->cwdi_cdir);
968 	cwdi->cwdi_cdir = vp;
969 	rw_exit(&cwdi->cwdi_lock);
970 	return (0);
971 }
972 
973 /*
974  * Change notion of root (``/'') directory.
975  */
976 /* ARGSUSED */
977 int
978 sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval)
979 {
980 	/* {
981 		syscallarg(const char *) path;
982 	} */
983 	struct proc *p = l->l_proc;
984 	int error;
985 	struct vnode *vp;
986 
987 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
988 	    KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
989 		return (error);
990 	if ((error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE,
991 				  &vp, l)) != 0)
992 		return (error);
993 
994 	change_root(p->p_cwdi, vp, l);
995 
996 	return (0);
997 }
998 
999 /*
1000  * Common routine for chroot and fchroot.
1001  * NB: callers need to properly authorize the change root operation.
1002  */
1003 void
1004 change_root(struct cwdinfo *cwdi, struct vnode *vp, struct lwp *l)
1005 {
1006 
1007 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1008 	if (cwdi->cwdi_rdir != NULL)
1009 		vrele(cwdi->cwdi_rdir);
1010 	cwdi->cwdi_rdir = vp;
1011 
1012 	/*
1013 	 * Prevent escaping from chroot by putting the root under
1014 	 * the working directory.  Silently chdir to / if we aren't
1015 	 * already there.
1016 	 */
1017 	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1018 		/*
1019 		 * XXX would be more failsafe to change directory to a
1020 		 * deadfs node here instead
1021 		 */
1022 		vrele(cwdi->cwdi_cdir);
1023 		vref(vp);
1024 		cwdi->cwdi_cdir = vp;
1025 	}
1026 	rw_exit(&cwdi->cwdi_lock);
1027 }
1028 
1029 /*
1030  * Common routine for chroot and chdir.
1031  * XXX "where" should be enum uio_seg
1032  */
1033 int
1034 chdir_lookup(const char *path, int where, struct vnode **vpp, struct lwp *l)
1035 {
1036 	struct pathbuf *pb;
1037 	struct nameidata nd;
1038 	int error;
1039 
1040 	error = pathbuf_maybe_copyin(path, where, &pb);
1041 	if (error) {
1042 		return error;
1043 	}
1044 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
1045 	if ((error = namei(&nd)) != 0) {
1046 		pathbuf_destroy(pb);
1047 		return error;
1048 	}
1049 	*vpp = nd.ni_vp;
1050 	pathbuf_destroy(pb);
1051 
1052 	if ((*vpp)->v_type != VDIR)
1053 		error = ENOTDIR;
1054 	else
1055 		error = VOP_ACCESS(*vpp, VEXEC, l->l_cred);
1056 
1057 	if (error)
1058 		vput(*vpp);
1059 	else
1060 		VOP_UNLOCK(*vpp);
1061 	return (error);
1062 }
1063 
1064 /*
1065  * Check permissions, allocate an open file structure,
1066  * and call the device open routine if any.
1067  */
1068 int
1069 sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1070 {
1071 	/* {
1072 		syscallarg(const char *) path;
1073 		syscallarg(int) flags;
1074 		syscallarg(int) mode;
1075 	} */
1076 	struct proc *p = l->l_proc;
1077 	struct cwdinfo *cwdi = p->p_cwdi;
1078 	file_t *fp;
1079 	struct vnode *vp;
1080 	int flags, cmode;
1081 	int indx, error;
1082 	struct pathbuf *pb;
1083 	struct nameidata nd;
1084 
1085 	flags = FFLAGS(SCARG(uap, flags));
1086 	if ((flags & (FREAD | FWRITE)) == 0)
1087 		return (EINVAL);
1088 
1089 	error = pathbuf_copyin(SCARG(uap, path), &pb);
1090 	if (error) {
1091 		return error;
1092 	}
1093 
1094 	if ((error = fd_allocfile(&fp, &indx)) != 0) {
1095 		pathbuf_destroy(pb);
1096 		return error;
1097 	}
1098 	/* We're going to read cwdi->cwdi_cmask unlocked here. */
1099 	cmode = ((SCARG(uap, mode) &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1100 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, pb);
1101 	l->l_dupfd = -indx - 1;			/* XXX check for fdopen */
1102 	if ((error = vn_open(&nd, flags, cmode)) != 0) {
1103 		fd_abort(p, fp, indx);
1104 		if ((error == EDUPFD || error == EMOVEFD) &&
1105 		    l->l_dupfd >= 0 &&			/* XXX from fdopen */
1106 		    (error =
1107 			fd_dupopen(l->l_dupfd, &indx, flags, error)) == 0) {
1108 			*retval = indx;
1109 			pathbuf_destroy(pb);
1110 			return (0);
1111 		}
1112 		if (error == ERESTART)
1113 			error = EINTR;
1114 		pathbuf_destroy(pb);
1115 		return (error);
1116 	}
1117 
1118 	l->l_dupfd = 0;
1119 	vp = nd.ni_vp;
1120 	pathbuf_destroy(pb);
1121 
1122 	if ((error = open_setfp(l, fp, vp, indx, flags)))
1123 		return error;
1124 
1125 	VOP_UNLOCK(vp);
1126 	*retval = indx;
1127 	fd_affix(p, fp, indx);
1128 	return (0);
1129 }
1130 
1131 static void
1132 vfs__fhfree(fhandle_t *fhp)
1133 {
1134 	size_t fhsize;
1135 
1136 	if (fhp == NULL) {
1137 		return;
1138 	}
1139 	fhsize = FHANDLE_SIZE(fhp);
1140 	kmem_free(fhp, fhsize);
1141 }
1142 
1143 /*
1144  * vfs_composefh: compose a filehandle.
1145  */
1146 
1147 int
1148 vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1149 {
1150 	struct mount *mp;
1151 	struct fid *fidp;
1152 	int error;
1153 	size_t needfhsize;
1154 	size_t fidsize;
1155 
1156 	mp = vp->v_mount;
1157 	fidp = NULL;
1158 	if (*fh_size < FHANDLE_SIZE_MIN) {
1159 		fidsize = 0;
1160 	} else {
1161 		fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1162 		if (fhp != NULL) {
1163 			memset(fhp, 0, *fh_size);
1164 			fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1165 			fidp = &fhp->fh_fid;
1166 		}
1167 	}
1168 	error = VFS_VPTOFH(vp, fidp, &fidsize);
1169 	needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1170 	if (error == 0 && *fh_size < needfhsize) {
1171 		error = E2BIG;
1172 	}
1173 	*fh_size = needfhsize;
1174 	return error;
1175 }
1176 
1177 int
1178 vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1179 {
1180 	struct mount *mp;
1181 	fhandle_t *fhp;
1182 	size_t fhsize;
1183 	size_t fidsize;
1184 	int error;
1185 
1186 	*fhpp = NULL;
1187 	mp = vp->v_mount;
1188 	fidsize = 0;
1189 	error = VFS_VPTOFH(vp, NULL, &fidsize);
1190 	KASSERT(error != 0);
1191 	if (error != E2BIG) {
1192 		goto out;
1193 	}
1194 	fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1195 	fhp = kmem_zalloc(fhsize, KM_SLEEP);
1196 	if (fhp == NULL) {
1197 		error = ENOMEM;
1198 		goto out;
1199 	}
1200 	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1201 	error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1202 	if (error == 0) {
1203 		KASSERT((FHANDLE_SIZE(fhp) == fhsize &&
1204 		    FHANDLE_FILEID(fhp)->fid_len == fidsize));
1205 		*fhpp = fhp;
1206 	} else {
1207 		kmem_free(fhp, fhsize);
1208 	}
1209 out:
1210 	return error;
1211 }
1212 
1213 void
1214 vfs_composefh_free(fhandle_t *fhp)
1215 {
1216 
1217 	vfs__fhfree(fhp);
1218 }
1219 
1220 /*
1221  * vfs_fhtovp: lookup a vnode by a filehandle.
1222  */
1223 
1224 int
1225 vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
1226 {
1227 	struct mount *mp;
1228 	int error;
1229 
1230 	*vpp = NULL;
1231 	mp = vfs_getvfs(FHANDLE_FSID(fhp));
1232 	if (mp == NULL) {
1233 		error = ESTALE;
1234 		goto out;
1235 	}
1236 	if (mp->mnt_op->vfs_fhtovp == NULL) {
1237 		error = EOPNOTSUPP;
1238 		goto out;
1239 	}
1240 	error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), vpp);
1241 out:
1242 	return error;
1243 }
1244 
1245 /*
1246  * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
1247  * the needed size.
1248  */
1249 
1250 int
1251 vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
1252 {
1253 	fhandle_t *fhp;
1254 	int error;
1255 
1256 	*fhpp = NULL;
1257 	if (fhsize > FHANDLE_SIZE_MAX) {
1258 		return EINVAL;
1259 	}
1260 	if (fhsize < FHANDLE_SIZE_MIN) {
1261 		return EINVAL;
1262 	}
1263 again:
1264 	fhp = kmem_alloc(fhsize, KM_SLEEP);
1265 	if (fhp == NULL) {
1266 		return ENOMEM;
1267 	}
1268 	error = copyin(ufhp, fhp, fhsize);
1269 	if (error == 0) {
1270 		/* XXX this check shouldn't be here */
1271 		if (FHANDLE_SIZE(fhp) == fhsize) {
1272 			*fhpp = fhp;
1273 			return 0;
1274 		} else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
1275 			/*
1276 			 * a kludge for nfsv2 padded handles.
1277 			 */
1278 			size_t sz;
1279 
1280 			sz = FHANDLE_SIZE(fhp);
1281 			kmem_free(fhp, fhsize);
1282 			fhsize = sz;
1283 			goto again;
1284 		} else {
1285 			/*
1286 			 * userland told us wrong size.
1287 			 */
1288 		    	error = EINVAL;
1289 		}
1290 	}
1291 	kmem_free(fhp, fhsize);
1292 	return error;
1293 }
1294 
1295 void
1296 vfs_copyinfh_free(fhandle_t *fhp)
1297 {
1298 
1299 	vfs__fhfree(fhp);
1300 }
1301 
1302 /*
1303  * Get file handle system call
1304  */
1305 int
1306 sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval)
1307 {
1308 	/* {
1309 		syscallarg(char *) fname;
1310 		syscallarg(fhandle_t *) fhp;
1311 		syscallarg(size_t *) fh_size;
1312 	} */
1313 	struct vnode *vp;
1314 	fhandle_t *fh;
1315 	int error;
1316 	struct pathbuf *pb;
1317 	struct nameidata nd;
1318 	size_t sz;
1319 	size_t usz;
1320 
1321 	/*
1322 	 * Must be super user
1323 	 */
1324 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1325 	    0, NULL, NULL, NULL);
1326 	if (error)
1327 		return (error);
1328 
1329 	error = pathbuf_copyin(SCARG(uap, fname), &pb);
1330 	if (error) {
1331 		return error;
1332 	}
1333 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
1334 	error = namei(&nd);
1335 	if (error) {
1336 		pathbuf_destroy(pb);
1337 		return error;
1338 	}
1339 	vp = nd.ni_vp;
1340 	pathbuf_destroy(pb);
1341 
1342 	error = vfs_composefh_alloc(vp, &fh);
1343 	vput(vp);
1344 	if (error != 0) {
1345 		goto out;
1346 	}
1347 	error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
1348 	if (error != 0) {
1349 		goto out;
1350 	}
1351 	sz = FHANDLE_SIZE(fh);
1352 	error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
1353 	if (error != 0) {
1354 		goto out;
1355 	}
1356 	if (usz >= sz) {
1357 		error = copyout(fh, SCARG(uap, fhp), sz);
1358 	} else {
1359 		error = E2BIG;
1360 	}
1361 out:
1362 	vfs_composefh_free(fh);
1363 	return (error);
1364 }
1365 
1366 /*
1367  * Open a file given a file handle.
1368  *
1369  * Check permissions, allocate an open file structure,
1370  * and call the device open routine if any.
1371  */
1372 
1373 int
1374 dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
1375     register_t *retval)
1376 {
1377 	file_t *fp;
1378 	struct vnode *vp = NULL;
1379 	kauth_cred_t cred = l->l_cred;
1380 	file_t *nfp;
1381 	int indx, error = 0;
1382 	struct vattr va;
1383 	fhandle_t *fh;
1384 	int flags;
1385 	proc_t *p;
1386 
1387 	p = curproc;
1388 
1389 	/*
1390 	 * Must be super user
1391 	 */
1392 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1393 	    0, NULL, NULL, NULL)))
1394 		return (error);
1395 
1396 	flags = FFLAGS(oflags);
1397 	if ((flags & (FREAD | FWRITE)) == 0)
1398 		return (EINVAL);
1399 	if ((flags & O_CREAT))
1400 		return (EINVAL);
1401 	if ((error = fd_allocfile(&nfp, &indx)) != 0)
1402 		return (error);
1403 	fp = nfp;
1404 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1405 	if (error != 0) {
1406 		goto bad;
1407 	}
1408 	error = vfs_fhtovp(fh, &vp);
1409 	if (error != 0) {
1410 		goto bad;
1411 	}
1412 
1413 	/* Now do an effective vn_open */
1414 
1415 	if (vp->v_type == VSOCK) {
1416 		error = EOPNOTSUPP;
1417 		goto bad;
1418 	}
1419 	error = vn_openchk(vp, cred, flags);
1420 	if (error != 0)
1421 		goto bad;
1422 	if (flags & O_TRUNC) {
1423 		VOP_UNLOCK(vp);			/* XXX */
1424 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
1425 		vattr_null(&va);
1426 		va.va_size = 0;
1427 		error = VOP_SETATTR(vp, &va, cred);
1428 		if (error)
1429 			goto bad;
1430 	}
1431 	if ((error = VOP_OPEN(vp, flags, cred)) != 0)
1432 		goto bad;
1433 	if (flags & FWRITE) {
1434 		mutex_enter(&vp->v_interlock);
1435 		vp->v_writecount++;
1436 		mutex_exit(&vp->v_interlock);
1437 	}
1438 
1439 	/* done with modified vn_open, now finish what sys_open does. */
1440 	if ((error = open_setfp(l, fp, vp, indx, flags)))
1441 		return error;
1442 
1443 	VOP_UNLOCK(vp);
1444 	*retval = indx;
1445 	fd_affix(p, fp, indx);
1446 	vfs_copyinfh_free(fh);
1447 	return (0);
1448 
1449 bad:
1450 	fd_abort(p, fp, indx);
1451 	if (vp != NULL)
1452 		vput(vp);
1453 	vfs_copyinfh_free(fh);
1454 	return (error);
1455 }
1456 
1457 int
1458 sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval)
1459 {
1460 	/* {
1461 		syscallarg(const void *) fhp;
1462 		syscallarg(size_t) fh_size;
1463 		syscallarg(int) flags;
1464 	} */
1465 
1466 	return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
1467 	    SCARG(uap, flags), retval);
1468 }
1469 
1470 int
1471 do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
1472 {
1473 	int error;
1474 	fhandle_t *fh;
1475 	struct vnode *vp;
1476 
1477 	/*
1478 	 * Must be super user
1479 	 */
1480 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1481 	    0, NULL, NULL, NULL)))
1482 		return (error);
1483 
1484 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1485 	if (error != 0)
1486 		return error;
1487 
1488 	error = vfs_fhtovp(fh, &vp);
1489 	vfs_copyinfh_free(fh);
1490 	if (error != 0)
1491 		return error;
1492 
1493 	error = vn_stat(vp, sb);
1494 	vput(vp);
1495 	return error;
1496 }
1497 
1498 
1499 /* ARGSUSED */
1500 int
1501 sys___fhstat50(struct lwp *l, const struct sys___fhstat50_args *uap, register_t *retval)
1502 {
1503 	/* {
1504 		syscallarg(const void *) fhp;
1505 		syscallarg(size_t) fh_size;
1506 		syscallarg(struct stat *) sb;
1507 	} */
1508 	struct stat sb;
1509 	int error;
1510 
1511 	error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
1512 	if (error)
1513 		return error;
1514 	return copyout(&sb, SCARG(uap, sb), sizeof(sb));
1515 }
1516 
1517 int
1518 do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb,
1519     int flags)
1520 {
1521 	fhandle_t *fh;
1522 	struct mount *mp;
1523 	struct vnode *vp;
1524 	int error;
1525 
1526 	/*
1527 	 * Must be super user
1528 	 */
1529 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1530 	    0, NULL, NULL, NULL)))
1531 		return error;
1532 
1533 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1534 	if (error != 0)
1535 		return error;
1536 
1537 	error = vfs_fhtovp(fh, &vp);
1538 	vfs_copyinfh_free(fh);
1539 	if (error != 0)
1540 		return error;
1541 
1542 	mp = vp->v_mount;
1543 	error = dostatvfs(mp, sb, l, flags, 1);
1544 	vput(vp);
1545 	return error;
1546 }
1547 
1548 /* ARGSUSED */
1549 int
1550 sys___fhstatvfs140(struct lwp *l, const struct sys___fhstatvfs140_args *uap, register_t *retval)
1551 {
1552 	/* {
1553 		syscallarg(const void *) fhp;
1554 		syscallarg(size_t) fh_size;
1555 		syscallarg(struct statvfs *) buf;
1556 		syscallarg(int)	flags;
1557 	} */
1558 	struct statvfs *sb = STATVFSBUF_GET();
1559 	int error;
1560 
1561 	error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
1562 	    SCARG(uap, flags));
1563 	if (error == 0)
1564 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1565 	STATVFSBUF_PUT(sb);
1566 	return error;
1567 }
1568 
1569 /*
1570  * Create a special file.
1571  */
1572 /* ARGSUSED */
1573 int
1574 sys___mknod50(struct lwp *l, const struct sys___mknod50_args *uap,
1575     register_t *retval)
1576 {
1577 	/* {
1578 		syscallarg(const char *) path;
1579 		syscallarg(mode_t) mode;
1580 		syscallarg(dev_t) dev;
1581 	} */
1582 	return do_sys_mknod(l, SCARG(uap, path), SCARG(uap, mode),
1583 	    SCARG(uap, dev), retval, UIO_USERSPACE);
1584 }
1585 
1586 int
1587 do_sys_mknod(struct lwp *l, const char *pathname, mode_t mode, dev_t dev,
1588     register_t *retval, enum uio_seg seg)
1589 {
1590 	struct proc *p = l->l_proc;
1591 	struct vnode *vp;
1592 	struct vattr vattr;
1593 	int error, optype;
1594 	struct pathbuf *pb;
1595 	struct nameidata nd;
1596 	const char *pathstring;
1597 
1598 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
1599 	    0, NULL, NULL, NULL)) != 0)
1600 		return (error);
1601 
1602 	optype = VOP_MKNOD_DESCOFFSET;
1603 
1604 	error = pathbuf_maybe_copyin(pathname, seg, &pb);
1605 	if (error) {
1606 		return error;
1607 	}
1608 	pathstring = pathbuf_stringcopy_get(pb);
1609 	if (pathstring == NULL) {
1610 		pathbuf_destroy(pb);
1611 		return ENOMEM;
1612 	}
1613 
1614 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
1615 	if ((error = namei(&nd)) != 0)
1616 		goto out;
1617 	vp = nd.ni_vp;
1618 
1619 	if (vp != NULL)
1620 		error = EEXIST;
1621 	else {
1622 		vattr_null(&vattr);
1623 		/* We will read cwdi->cwdi_cmask unlocked. */
1624 		vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
1625 		vattr.va_rdev = dev;
1626 
1627 		switch (mode & S_IFMT) {
1628 		case S_IFMT:	/* used by badsect to flag bad sectors */
1629 			vattr.va_type = VBAD;
1630 			break;
1631 		case S_IFCHR:
1632 			vattr.va_type = VCHR;
1633 			break;
1634 		case S_IFBLK:
1635 			vattr.va_type = VBLK;
1636 			break;
1637 		case S_IFWHT:
1638 			optype = VOP_WHITEOUT_DESCOFFSET;
1639 			break;
1640 		case S_IFREG:
1641 #if NVERIEXEC > 0
1642 			error = veriexec_openchk(l, nd.ni_vp, pathstring,
1643 			    O_CREAT);
1644 #endif /* NVERIEXEC > 0 */
1645 			vattr.va_type = VREG;
1646 			vattr.va_rdev = VNOVAL;
1647 			optype = VOP_CREATE_DESCOFFSET;
1648 			break;
1649 		default:
1650 			error = EINVAL;
1651 			break;
1652 		}
1653 	}
1654 	if (!error) {
1655 		switch (optype) {
1656 		case VOP_WHITEOUT_DESCOFFSET:
1657 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1658 			if (error)
1659 				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1660 			vput(nd.ni_dvp);
1661 			break;
1662 
1663 		case VOP_MKNOD_DESCOFFSET:
1664 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1665 						&nd.ni_cnd, &vattr);
1666 			if (error == 0)
1667 				vput(nd.ni_vp);
1668 			break;
1669 
1670 		case VOP_CREATE_DESCOFFSET:
1671 			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
1672 						&nd.ni_cnd, &vattr);
1673 			if (error == 0)
1674 				vput(nd.ni_vp);
1675 			break;
1676 		}
1677 	} else {
1678 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1679 		if (nd.ni_dvp == vp)
1680 			vrele(nd.ni_dvp);
1681 		else
1682 			vput(nd.ni_dvp);
1683 		if (vp)
1684 			vrele(vp);
1685 	}
1686 out:
1687 	pathbuf_stringcopy_put(pb, pathstring);
1688 	pathbuf_destroy(pb);
1689 	return (error);
1690 }
1691 
1692 /*
1693  * Create a named pipe.
1694  */
1695 /* ARGSUSED */
1696 int
1697 sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval)
1698 {
1699 	/* {
1700 		syscallarg(const char *) path;
1701 		syscallarg(int) mode;
1702 	} */
1703 	struct proc *p = l->l_proc;
1704 	struct vattr vattr;
1705 	int error;
1706 	struct pathbuf *pb;
1707 	struct nameidata nd;
1708 
1709 	error = pathbuf_copyin(SCARG(uap, path), &pb);
1710 	if (error) {
1711 		return error;
1712 	}
1713 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
1714 	if ((error = namei(&nd)) != 0) {
1715 		pathbuf_destroy(pb);
1716 		return error;
1717 	}
1718 	if (nd.ni_vp != NULL) {
1719 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1720 		if (nd.ni_dvp == nd.ni_vp)
1721 			vrele(nd.ni_dvp);
1722 		else
1723 			vput(nd.ni_dvp);
1724 		vrele(nd.ni_vp);
1725 		pathbuf_destroy(pb);
1726 		return (EEXIST);
1727 	}
1728 	vattr_null(&vattr);
1729 	vattr.va_type = VFIFO;
1730 	/* We will read cwdi->cwdi_cmask unlocked. */
1731 	vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
1732 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1733 	if (error == 0)
1734 		vput(nd.ni_vp);
1735 	pathbuf_destroy(pb);
1736 	return (error);
1737 }
1738 
1739 /*
1740  * Make a hard file link.
1741  */
1742 /* ARGSUSED */
1743 int
1744 sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
1745 {
1746 	/* {
1747 		syscallarg(const char *) path;
1748 		syscallarg(const char *) link;
1749 	} */
1750 	struct vnode *vp;
1751 	struct pathbuf *linkpb;
1752 	struct nameidata nd;
1753 	int error;
1754 
1755 	error = namei_simple_user(SCARG(uap, path),
1756 				NSM_FOLLOW_TRYEMULROOT, &vp);
1757 	if (error != 0)
1758 		return (error);
1759 	error = pathbuf_copyin(SCARG(uap, link), &linkpb);
1760 	if (error) {
1761 		goto out1;
1762 	}
1763 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
1764 	if ((error = namei(&nd)) != 0)
1765 		goto out2;
1766 	if (nd.ni_vp) {
1767 		error = EEXIST;
1768 		goto abortop;
1769 	}
1770 	/* Prevent hard links on directories. */
1771 	if (vp->v_type == VDIR) {
1772 		error = EPERM;
1773 		goto abortop;
1774 	}
1775 	/* Prevent cross-mount operation. */
1776 	if (nd.ni_dvp->v_mount != vp->v_mount) {
1777 		error = EXDEV;
1778 		goto abortop;
1779 	}
1780 	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1781 out2:
1782 	pathbuf_destroy(linkpb);
1783 out1:
1784 	vrele(vp);
1785 	return (error);
1786 abortop:
1787 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1788 	if (nd.ni_dvp == nd.ni_vp)
1789 		vrele(nd.ni_dvp);
1790 	else
1791 		vput(nd.ni_dvp);
1792 	if (nd.ni_vp != NULL)
1793 		vrele(nd.ni_vp);
1794 	goto out2;
1795 }
1796 
1797 int
1798 do_sys_symlink(const char *patharg, const char *link, enum uio_seg seg)
1799 {
1800 	struct proc *p = curproc;
1801 	struct vattr vattr;
1802 	char *path;
1803 	int error;
1804 	struct pathbuf *linkpb;
1805 	struct nameidata nd;
1806 
1807 	path = PNBUF_GET();
1808 	if (seg == UIO_USERSPACE) {
1809 		if ((error = copyinstr(patharg, path, MAXPATHLEN, NULL)) != 0)
1810 			goto out1;
1811 		if ((error = pathbuf_copyin(link, &linkpb)) != 0)
1812 			goto out1;
1813 	} else {
1814 		KASSERT(strlen(patharg) < MAXPATHLEN);
1815 		strcpy(path, patharg);
1816 		linkpb = pathbuf_create(link);
1817 		if (linkpb == NULL) {
1818 			error = ENOMEM;
1819 			goto out1;
1820 		}
1821 	}
1822 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
1823 	if ((error = namei(&nd)) != 0)
1824 		goto out2;
1825 	if (nd.ni_vp) {
1826 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1827 		if (nd.ni_dvp == nd.ni_vp)
1828 			vrele(nd.ni_dvp);
1829 		else
1830 			vput(nd.ni_dvp);
1831 		vrele(nd.ni_vp);
1832 		error = EEXIST;
1833 		goto out2;
1834 	}
1835 	vattr_null(&vattr);
1836 	vattr.va_type = VLNK;
1837 	/* We will read cwdi->cwdi_cmask unlocked. */
1838 	vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
1839 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
1840 	if (error == 0)
1841 		vput(nd.ni_vp);
1842 out2:
1843 	pathbuf_destroy(linkpb);
1844 out1:
1845 	PNBUF_PUT(path);
1846 	return (error);
1847 }
1848 
1849 /*
1850  * Make a symbolic link.
1851  */
1852 /* ARGSUSED */
1853 int
1854 sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
1855 {
1856 	/* {
1857 		syscallarg(const char *) path;
1858 		syscallarg(const char *) link;
1859 	} */
1860 
1861 	return do_sys_symlink(SCARG(uap, path), SCARG(uap, link),
1862 	    UIO_USERSPACE);
1863 }
1864 
1865 /*
1866  * Delete a whiteout from the filesystem.
1867  */
1868 /* ARGSUSED */
1869 int
1870 sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval)
1871 {
1872 	/* {
1873 		syscallarg(const char *) path;
1874 	} */
1875 	int error;
1876 	struct pathbuf *pb;
1877 	struct nameidata nd;
1878 
1879 	error = pathbuf_copyin(SCARG(uap, path), &pb);
1880 	if (error) {
1881 		return error;
1882 	}
1883 
1884 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT, pb);
1885 	error = namei(&nd);
1886 	if (error) {
1887 		pathbuf_destroy(pb);
1888 		return (error);
1889 	}
1890 
1891 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
1892 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1893 		if (nd.ni_dvp == nd.ni_vp)
1894 			vrele(nd.ni_dvp);
1895 		else
1896 			vput(nd.ni_dvp);
1897 		if (nd.ni_vp)
1898 			vrele(nd.ni_vp);
1899 		pathbuf_destroy(pb);
1900 		return (EEXIST);
1901 	}
1902 	if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
1903 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1904 	vput(nd.ni_dvp);
1905 	pathbuf_destroy(pb);
1906 	return (error);
1907 }
1908 
1909 /*
1910  * Delete a name from the filesystem.
1911  */
1912 /* ARGSUSED */
1913 int
1914 sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval)
1915 {
1916 	/* {
1917 		syscallarg(const char *) path;
1918 	} */
1919 
1920 	return do_sys_unlink(SCARG(uap, path), UIO_USERSPACE);
1921 }
1922 
1923 int
1924 do_sys_unlink(const char *arg, enum uio_seg seg)
1925 {
1926 	struct vnode *vp;
1927 	int error;
1928 	struct pathbuf *pb;
1929 	struct nameidata nd;
1930 	const char *pathstring;
1931 
1932 	error = pathbuf_maybe_copyin(arg, seg, &pb);
1933 	if (error) {
1934 		return error;
1935 	}
1936 	pathstring = pathbuf_stringcopy_get(pb);
1937 	if (pathstring == NULL) {
1938 		pathbuf_destroy(pb);
1939 		return ENOMEM;
1940 	}
1941 
1942 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, pb);
1943 	if ((error = namei(&nd)) != 0)
1944 		goto out;
1945 	vp = nd.ni_vp;
1946 
1947 	/*
1948 	 * The root of a mounted filesystem cannot be deleted.
1949 	 */
1950 	if (vp->v_vflag & VV_ROOT) {
1951 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1952 		if (nd.ni_dvp == vp)
1953 			vrele(nd.ni_dvp);
1954 		else
1955 			vput(nd.ni_dvp);
1956 		vput(vp);
1957 		error = EBUSY;
1958 		goto out;
1959 	}
1960 
1961 #if NVERIEXEC > 0
1962 	/* Handle remove requests for veriexec entries. */
1963 	if ((error = veriexec_removechk(curlwp, nd.ni_vp, pathstring)) != 0) {
1964 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1965 		if (nd.ni_dvp == vp)
1966 			vrele(nd.ni_dvp);
1967 		else
1968 			vput(nd.ni_dvp);
1969 		vput(vp);
1970 		goto out;
1971 	}
1972 #endif /* NVERIEXEC > 0 */
1973 
1974 #ifdef FILEASSOC
1975 	(void)fileassoc_file_delete(vp);
1976 #endif /* FILEASSOC */
1977 	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
1978 out:
1979 	pathbuf_stringcopy_put(pb, pathstring);
1980 	pathbuf_destroy(pb);
1981 	return (error);
1982 }
1983 
1984 /*
1985  * Reposition read/write file offset.
1986  */
1987 int
1988 sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
1989 {
1990 	/* {
1991 		syscallarg(int) fd;
1992 		syscallarg(int) pad;
1993 		syscallarg(off_t) offset;
1994 		syscallarg(int) whence;
1995 	} */
1996 	kauth_cred_t cred = l->l_cred;
1997 	file_t *fp;
1998 	struct vnode *vp;
1999 	struct vattr vattr;
2000 	off_t newoff;
2001 	int error, fd;
2002 
2003 	fd = SCARG(uap, fd);
2004 
2005 	if ((fp = fd_getfile(fd)) == NULL)
2006 		return (EBADF);
2007 
2008 	vp = fp->f_data;
2009 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2010 		error = ESPIPE;
2011 		goto out;
2012 	}
2013 
2014 	switch (SCARG(uap, whence)) {
2015 	case SEEK_CUR:
2016 		newoff = fp->f_offset + SCARG(uap, offset);
2017 		break;
2018 	case SEEK_END:
2019 		error = VOP_GETATTR(vp, &vattr, cred);
2020 		if (error) {
2021 			goto out;
2022 		}
2023 		newoff = SCARG(uap, offset) + vattr.va_size;
2024 		break;
2025 	case SEEK_SET:
2026 		newoff = SCARG(uap, offset);
2027 		break;
2028 	default:
2029 		error = EINVAL;
2030 		goto out;
2031 	}
2032 	if ((error = VOP_SEEK(vp, fp->f_offset, newoff, cred)) == 0) {
2033 		*(off_t *)retval = fp->f_offset = newoff;
2034 	}
2035  out:
2036  	fd_putfile(fd);
2037 	return (error);
2038 }
2039 
2040 /*
2041  * Positional read system call.
2042  */
2043 int
2044 sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
2045 {
2046 	/* {
2047 		syscallarg(int) fd;
2048 		syscallarg(void *) buf;
2049 		syscallarg(size_t) nbyte;
2050 		syscallarg(off_t) offset;
2051 	} */
2052 	file_t *fp;
2053 	struct vnode *vp;
2054 	off_t offset;
2055 	int error, fd = SCARG(uap, fd);
2056 
2057 	if ((fp = fd_getfile(fd)) == NULL)
2058 		return (EBADF);
2059 
2060 	if ((fp->f_flag & FREAD) == 0) {
2061 		fd_putfile(fd);
2062 		return (EBADF);
2063 	}
2064 
2065 	vp = fp->f_data;
2066 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2067 		error = ESPIPE;
2068 		goto out;
2069 	}
2070 
2071 	offset = SCARG(uap, offset);
2072 
2073 	/*
2074 	 * XXX This works because no file systems actually
2075 	 * XXX take any action on the seek operation.
2076 	 */
2077 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2078 		goto out;
2079 
2080 	/* dofileread() will unuse the descriptor for us */
2081 	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2082 	    &offset, 0, retval));
2083 
2084  out:
2085 	fd_putfile(fd);
2086 	return (error);
2087 }
2088 
2089 /*
2090  * Positional scatter read system call.
2091  */
2092 int
2093 sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval)
2094 {
2095 	/* {
2096 		syscallarg(int) fd;
2097 		syscallarg(const struct iovec *) iovp;
2098 		syscallarg(int) iovcnt;
2099 		syscallarg(off_t) offset;
2100 	} */
2101 	off_t offset = SCARG(uap, offset);
2102 
2103 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
2104 	    SCARG(uap, iovcnt), &offset, 0, retval);
2105 }
2106 
2107 /*
2108  * Positional write system call.
2109  */
2110 int
2111 sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval)
2112 {
2113 	/* {
2114 		syscallarg(int) fd;
2115 		syscallarg(const void *) buf;
2116 		syscallarg(size_t) nbyte;
2117 		syscallarg(off_t) offset;
2118 	} */
2119 	file_t *fp;
2120 	struct vnode *vp;
2121 	off_t offset;
2122 	int error, fd = SCARG(uap, fd);
2123 
2124 	if ((fp = fd_getfile(fd)) == NULL)
2125 		return (EBADF);
2126 
2127 	if ((fp->f_flag & FWRITE) == 0) {
2128 		fd_putfile(fd);
2129 		return (EBADF);
2130 	}
2131 
2132 	vp = fp->f_data;
2133 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2134 		error = ESPIPE;
2135 		goto out;
2136 	}
2137 
2138 	offset = SCARG(uap, offset);
2139 
2140 	/*
2141 	 * XXX This works because no file systems actually
2142 	 * XXX take any action on the seek operation.
2143 	 */
2144 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2145 		goto out;
2146 
2147 	/* dofilewrite() will unuse the descriptor for us */
2148 	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2149 	    &offset, 0, retval));
2150 
2151  out:
2152 	fd_putfile(fd);
2153 	return (error);
2154 }
2155 
2156 /*
2157  * Positional gather write system call.
2158  */
2159 int
2160 sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval)
2161 {
2162 	/* {
2163 		syscallarg(int) fd;
2164 		syscallarg(const struct iovec *) iovp;
2165 		syscallarg(int) iovcnt;
2166 		syscallarg(off_t) offset;
2167 	} */
2168 	off_t offset = SCARG(uap, offset);
2169 
2170 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
2171 	    SCARG(uap, iovcnt), &offset, 0, retval);
2172 }
2173 
2174 /*
2175  * Check access permissions.
2176  */
2177 int
2178 sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval)
2179 {
2180 	/* {
2181 		syscallarg(const char *) path;
2182 		syscallarg(int) flags;
2183 	} */
2184 	kauth_cred_t cred;
2185 	struct vnode *vp;
2186 	int error, flags;
2187 	struct pathbuf *pb;
2188 	struct nameidata nd;
2189 
2190 	CTASSERT(F_OK == 0);
2191 	if ((SCARG(uap, flags) & ~(R_OK | W_OK | X_OK)) != 0) {
2192 		/* nonsense flags */
2193 		return EINVAL;
2194 	}
2195 
2196 	error = pathbuf_copyin(SCARG(uap, path), &pb);
2197 	if (error) {
2198 		return error;
2199 	}
2200 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
2201 
2202 	/* Override default credentials */
2203 	cred = kauth_cred_dup(l->l_cred);
2204 	kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
2205 	kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
2206 	nd.ni_cnd.cn_cred = cred;
2207 
2208 	if ((error = namei(&nd)) != 0) {
2209 		pathbuf_destroy(pb);
2210 		goto out;
2211 	}
2212 	vp = nd.ni_vp;
2213 	pathbuf_destroy(pb);
2214 
2215 	/* Flags == 0 means only check for existence. */
2216 	if (SCARG(uap, flags)) {
2217 		flags = 0;
2218 		if (SCARG(uap, flags) & R_OK)
2219 			flags |= VREAD;
2220 		if (SCARG(uap, flags) & W_OK)
2221 			flags |= VWRITE;
2222 		if (SCARG(uap, flags) & X_OK)
2223 			flags |= VEXEC;
2224 
2225 		error = VOP_ACCESS(vp, flags, cred);
2226 		if (!error && (flags & VWRITE))
2227 			error = vn_writechk(vp);
2228 	}
2229 	vput(vp);
2230 out:
2231 	kauth_cred_free(cred);
2232 	return (error);
2233 }
2234 
2235 /*
2236  * Common code for all sys_stat functions, including compat versions.
2237  */
2238 int
2239 do_sys_stat(const char *userpath, unsigned int nd_flags, struct stat *sb)
2240 {
2241 	int error;
2242 	struct pathbuf *pb;
2243 	struct nameidata nd;
2244 
2245 	error = pathbuf_copyin(userpath, &pb);
2246 	if (error) {
2247 		return error;
2248 	}
2249 	NDINIT(&nd, LOOKUP, nd_flags | LOCKLEAF | TRYEMULROOT, pb);
2250 	error = namei(&nd);
2251 	if (error != 0) {
2252 		pathbuf_destroy(pb);
2253 		return error;
2254 	}
2255 	error = vn_stat(nd.ni_vp, sb);
2256 	vput(nd.ni_vp);
2257 	pathbuf_destroy(pb);
2258 	return error;
2259 }
2260 
2261 /*
2262  * Get file status; this version follows links.
2263  */
2264 /* ARGSUSED */
2265 int
2266 sys___stat50(struct lwp *l, const struct sys___stat50_args *uap, register_t *retval)
2267 {
2268 	/* {
2269 		syscallarg(const char *) path;
2270 		syscallarg(struct stat *) ub;
2271 	} */
2272 	struct stat sb;
2273 	int error;
2274 
2275 	error = do_sys_stat(SCARG(uap, path), FOLLOW, &sb);
2276 	if (error)
2277 		return error;
2278 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
2279 }
2280 
2281 /*
2282  * Get file status; this version does not follow links.
2283  */
2284 /* ARGSUSED */
2285 int
2286 sys___lstat50(struct lwp *l, const struct sys___lstat50_args *uap, register_t *retval)
2287 {
2288 	/* {
2289 		syscallarg(const char *) path;
2290 		syscallarg(struct stat *) ub;
2291 	} */
2292 	struct stat sb;
2293 	int error;
2294 
2295 	error = do_sys_stat(SCARG(uap, path), NOFOLLOW, &sb);
2296 	if (error)
2297 		return error;
2298 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
2299 }
2300 
2301 /*
2302  * Get configurable pathname variables.
2303  */
2304 /* ARGSUSED */
2305 int
2306 sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap, register_t *retval)
2307 {
2308 	/* {
2309 		syscallarg(const char *) path;
2310 		syscallarg(int) name;
2311 	} */
2312 	int error;
2313 	struct pathbuf *pb;
2314 	struct nameidata nd;
2315 
2316 	error = pathbuf_copyin(SCARG(uap, path), &pb);
2317 	if (error) {
2318 		return error;
2319 	}
2320 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
2321 	if ((error = namei(&nd)) != 0) {
2322 		pathbuf_destroy(pb);
2323 		return (error);
2324 	}
2325 	error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), retval);
2326 	vput(nd.ni_vp);
2327 	pathbuf_destroy(pb);
2328 	return (error);
2329 }
2330 
2331 /*
2332  * Return target name of a symbolic link.
2333  */
2334 /* ARGSUSED */
2335 int
2336 sys_readlink(struct lwp *l, const struct sys_readlink_args *uap, register_t *retval)
2337 {
2338 	/* {
2339 		syscallarg(const char *) path;
2340 		syscallarg(char *) buf;
2341 		syscallarg(size_t) count;
2342 	} */
2343 	struct vnode *vp;
2344 	struct iovec aiov;
2345 	struct uio auio;
2346 	int error;
2347 	struct pathbuf *pb;
2348 	struct nameidata nd;
2349 
2350 	error = pathbuf_copyin(SCARG(uap, path), &pb);
2351 	if (error) {
2352 		return error;
2353 	}
2354 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb);
2355 	if ((error = namei(&nd)) != 0) {
2356 		pathbuf_destroy(pb);
2357 		return error;
2358 	}
2359 	vp = nd.ni_vp;
2360 	pathbuf_destroy(pb);
2361 	if (vp->v_type != VLNK)
2362 		error = EINVAL;
2363 	else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
2364 	    (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
2365 		aiov.iov_base = SCARG(uap, buf);
2366 		aiov.iov_len = SCARG(uap, count);
2367 		auio.uio_iov = &aiov;
2368 		auio.uio_iovcnt = 1;
2369 		auio.uio_offset = 0;
2370 		auio.uio_rw = UIO_READ;
2371 		KASSERT(l == curlwp);
2372 		auio.uio_vmspace = l->l_proc->p_vmspace;
2373 		auio.uio_resid = SCARG(uap, count);
2374 		error = VOP_READLINK(vp, &auio, l->l_cred);
2375 	}
2376 	vput(vp);
2377 	*retval = SCARG(uap, count) - auio.uio_resid;
2378 	return (error);
2379 }
2380 
2381 /*
2382  * Change flags of a file given a path name.
2383  */
2384 /* ARGSUSED */
2385 int
2386 sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval)
2387 {
2388 	/* {
2389 		syscallarg(const char *) path;
2390 		syscallarg(u_long) flags;
2391 	} */
2392 	struct vnode *vp;
2393 	int error;
2394 
2395 	error = namei_simple_user(SCARG(uap, path),
2396 				NSM_FOLLOW_TRYEMULROOT, &vp);
2397 	if (error != 0)
2398 		return (error);
2399 	error = change_flags(vp, SCARG(uap, flags), l);
2400 	vput(vp);
2401 	return (error);
2402 }
2403 
2404 /*
2405  * Change flags of a file given a file descriptor.
2406  */
2407 /* ARGSUSED */
2408 int
2409 sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval)
2410 {
2411 	/* {
2412 		syscallarg(int) fd;
2413 		syscallarg(u_long) flags;
2414 	} */
2415 	struct vnode *vp;
2416 	file_t *fp;
2417 	int error;
2418 
2419 	/* fd_getvnode() will use the descriptor for us */
2420 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2421 		return (error);
2422 	vp = fp->f_data;
2423 	error = change_flags(vp, SCARG(uap, flags), l);
2424 	VOP_UNLOCK(vp);
2425 	fd_putfile(SCARG(uap, fd));
2426 	return (error);
2427 }
2428 
2429 /*
2430  * Change flags of a file given a path name; this version does
2431  * not follow links.
2432  */
2433 int
2434 sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval)
2435 {
2436 	/* {
2437 		syscallarg(const char *) path;
2438 		syscallarg(u_long) flags;
2439 	} */
2440 	struct vnode *vp;
2441 	int error;
2442 
2443 	error = namei_simple_user(SCARG(uap, path),
2444 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
2445 	if (error != 0)
2446 		return (error);
2447 	error = change_flags(vp, SCARG(uap, flags), l);
2448 	vput(vp);
2449 	return (error);
2450 }
2451 
2452 /*
2453  * Common routine to change flags of a file.
2454  */
2455 int
2456 change_flags(struct vnode *vp, u_long flags, struct lwp *l)
2457 {
2458 	struct vattr vattr;
2459 	int error;
2460 
2461 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2462 	/*
2463 	 * Non-superusers cannot change the flags on devices, even if they
2464 	 * own them.
2465 	 */
2466 	if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, NULL)) {
2467 		if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
2468 			goto out;
2469 		if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
2470 			error = EINVAL;
2471 			goto out;
2472 		}
2473 	}
2474 	vattr_null(&vattr);
2475 	vattr.va_flags = flags;
2476 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2477 out:
2478 	return (error);
2479 }
2480 
2481 /*
2482  * Change mode of a file given path name; this version follows links.
2483  */
2484 /* ARGSUSED */
2485 int
2486 sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
2487 {
2488 	/* {
2489 		syscallarg(const char *) path;
2490 		syscallarg(int) mode;
2491 	} */
2492 	int error;
2493 	struct vnode *vp;
2494 
2495 	error = namei_simple_user(SCARG(uap, path),
2496 				NSM_FOLLOW_TRYEMULROOT, &vp);
2497 	if (error != 0)
2498 		return (error);
2499 
2500 	error = change_mode(vp, SCARG(uap, mode), l);
2501 
2502 	vrele(vp);
2503 	return (error);
2504 }
2505 
2506 /*
2507  * Change mode of a file given a file descriptor.
2508  */
2509 /* ARGSUSED */
2510 int
2511 sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval)
2512 {
2513 	/* {
2514 		syscallarg(int) fd;
2515 		syscallarg(int) mode;
2516 	} */
2517 	file_t *fp;
2518 	int error;
2519 
2520 	/* fd_getvnode() will use the descriptor for us */
2521 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2522 		return (error);
2523 	error = change_mode(fp->f_data, SCARG(uap, mode), l);
2524 	fd_putfile(SCARG(uap, fd));
2525 	return (error);
2526 }
2527 
2528 /*
2529  * Change mode of a file given path name; this version does not follow links.
2530  */
2531 /* ARGSUSED */
2532 int
2533 sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval)
2534 {
2535 	/* {
2536 		syscallarg(const char *) path;
2537 		syscallarg(int) mode;
2538 	} */
2539 	int error;
2540 	struct vnode *vp;
2541 
2542 	error = namei_simple_user(SCARG(uap, path),
2543 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
2544 	if (error != 0)
2545 		return (error);
2546 
2547 	error = change_mode(vp, SCARG(uap, mode), l);
2548 
2549 	vrele(vp);
2550 	return (error);
2551 }
2552 
2553 /*
2554  * Common routine to set mode given a vnode.
2555  */
2556 static int
2557 change_mode(struct vnode *vp, int mode, struct lwp *l)
2558 {
2559 	struct vattr vattr;
2560 	int error;
2561 
2562 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2563 	vattr_null(&vattr);
2564 	vattr.va_mode = mode & ALLPERMS;
2565 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2566 	VOP_UNLOCK(vp);
2567 	return (error);
2568 }
2569 
2570 /*
2571  * Set ownership given a path name; this version follows links.
2572  */
2573 /* ARGSUSED */
2574 int
2575 sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
2576 {
2577 	/* {
2578 		syscallarg(const char *) path;
2579 		syscallarg(uid_t) uid;
2580 		syscallarg(gid_t) gid;
2581 	} */
2582 	int error;
2583 	struct vnode *vp;
2584 
2585 	error = namei_simple_user(SCARG(uap, path),
2586 				NSM_FOLLOW_TRYEMULROOT, &vp);
2587 	if (error != 0)
2588 		return (error);
2589 
2590 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
2591 
2592 	vrele(vp);
2593 	return (error);
2594 }
2595 
2596 /*
2597  * Set ownership given a path name; this version follows links.
2598  * Provides POSIX semantics.
2599  */
2600 /* ARGSUSED */
2601 int
2602 sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval)
2603 {
2604 	/* {
2605 		syscallarg(const char *) path;
2606 		syscallarg(uid_t) uid;
2607 		syscallarg(gid_t) gid;
2608 	} */
2609 	int error;
2610 	struct vnode *vp;
2611 
2612 	error = namei_simple_user(SCARG(uap, path),
2613 				NSM_FOLLOW_TRYEMULROOT, &vp);
2614 	if (error != 0)
2615 		return (error);
2616 
2617 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
2618 
2619 	vrele(vp);
2620 	return (error);
2621 }
2622 
2623 /*
2624  * Set ownership given a file descriptor.
2625  */
2626 /* ARGSUSED */
2627 int
2628 sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval)
2629 {
2630 	/* {
2631 		syscallarg(int) fd;
2632 		syscallarg(uid_t) uid;
2633 		syscallarg(gid_t) gid;
2634 	} */
2635 	int error;
2636 	file_t *fp;
2637 
2638 	/* fd_getvnode() will use the descriptor for us */
2639 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2640 		return (error);
2641 	error = change_owner(fp->f_data, SCARG(uap, uid), SCARG(uap, gid),
2642 	    l, 0);
2643 	fd_putfile(SCARG(uap, fd));
2644 	return (error);
2645 }
2646 
2647 /*
2648  * Set ownership given a file descriptor, providing POSIX/XPG semantics.
2649  */
2650 /* ARGSUSED */
2651 int
2652 sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval)
2653 {
2654 	/* {
2655 		syscallarg(int) fd;
2656 		syscallarg(uid_t) uid;
2657 		syscallarg(gid_t) gid;
2658 	} */
2659 	int error;
2660 	file_t *fp;
2661 
2662 	/* fd_getvnode() will use the descriptor for us */
2663 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2664 		return (error);
2665 	error = change_owner(fp->f_data, SCARG(uap, uid), SCARG(uap, gid),
2666 	    l, 1);
2667 	fd_putfile(SCARG(uap, fd));
2668 	return (error);
2669 }
2670 
2671 /*
2672  * Set ownership given a path name; this version does not follow links.
2673  */
2674 /* ARGSUSED */
2675 int
2676 sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval)
2677 {
2678 	/* {
2679 		syscallarg(const char *) path;
2680 		syscallarg(uid_t) uid;
2681 		syscallarg(gid_t) gid;
2682 	} */
2683 	int error;
2684 	struct vnode *vp;
2685 
2686 	error = namei_simple_user(SCARG(uap, path),
2687 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
2688 	if (error != 0)
2689 		return (error);
2690 
2691 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
2692 
2693 	vrele(vp);
2694 	return (error);
2695 }
2696 
2697 /*
2698  * Set ownership given a path name; this version does not follow links.
2699  * Provides POSIX/XPG semantics.
2700  */
2701 /* ARGSUSED */
2702 int
2703 sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval)
2704 {
2705 	/* {
2706 		syscallarg(const char *) path;
2707 		syscallarg(uid_t) uid;
2708 		syscallarg(gid_t) gid;
2709 	} */
2710 	int error;
2711 	struct vnode *vp;
2712 
2713 	error = namei_simple_user(SCARG(uap, path),
2714 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
2715 	if (error != 0)
2716 		return (error);
2717 
2718 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
2719 
2720 	vrele(vp);
2721 	return (error);
2722 }
2723 
2724 /*
2725  * Common routine to set ownership given a vnode.
2726  */
2727 static int
2728 change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
2729     int posix_semantics)
2730 {
2731 	struct vattr vattr;
2732 	mode_t newmode;
2733 	int error;
2734 
2735 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2736 	if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
2737 		goto out;
2738 
2739 #define CHANGED(x) ((int)(x) != -1)
2740 	newmode = vattr.va_mode;
2741 	if (posix_semantics) {
2742 		/*
2743 		 * POSIX/XPG semantics: if the caller is not the super-user,
2744 		 * clear set-user-id and set-group-id bits.  Both POSIX and
2745 		 * the XPG consider the behaviour for calls by the super-user
2746 		 * implementation-defined; we leave the set-user-id and set-
2747 		 * group-id settings intact in that case.
2748 		 */
2749 		if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
2750 				      NULL) != 0)
2751 			newmode &= ~(S_ISUID | S_ISGID);
2752 	} else {
2753 		/*
2754 		 * NetBSD semantics: when changing owner and/or group,
2755 		 * clear the respective bit(s).
2756 		 */
2757 		if (CHANGED(uid))
2758 			newmode &= ~S_ISUID;
2759 		if (CHANGED(gid))
2760 			newmode &= ~S_ISGID;
2761 	}
2762 	/* Update va_mode iff altered. */
2763 	if (vattr.va_mode == newmode)
2764 		newmode = VNOVAL;
2765 
2766 	vattr_null(&vattr);
2767 	vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
2768 	vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
2769 	vattr.va_mode = newmode;
2770 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2771 #undef CHANGED
2772 
2773 out:
2774 	VOP_UNLOCK(vp);
2775 	return (error);
2776 }
2777 
2778 /*
2779  * Set the access and modification times given a path name; this
2780  * version follows links.
2781  */
2782 /* ARGSUSED */
2783 int
2784 sys___utimes50(struct lwp *l, const struct sys___utimes50_args *uap,
2785     register_t *retval)
2786 {
2787 	/* {
2788 		syscallarg(const char *) path;
2789 		syscallarg(const struct timeval *) tptr;
2790 	} */
2791 
2792 	return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
2793 	    SCARG(uap, tptr), UIO_USERSPACE);
2794 }
2795 
2796 /*
2797  * Set the access and modification times given a file descriptor.
2798  */
2799 /* ARGSUSED */
2800 int
2801 sys___futimes50(struct lwp *l, const struct sys___futimes50_args *uap,
2802     register_t *retval)
2803 {
2804 	/* {
2805 		syscallarg(int) fd;
2806 		syscallarg(const struct timeval *) tptr;
2807 	} */
2808 	int error;
2809 	file_t *fp;
2810 
2811 	/* fd_getvnode() will use the descriptor for us */
2812 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2813 		return (error);
2814 	error = do_sys_utimes(l, fp->f_data, NULL, 0, SCARG(uap, tptr),
2815 	    UIO_USERSPACE);
2816 	fd_putfile(SCARG(uap, fd));
2817 	return (error);
2818 }
2819 
2820 /*
2821  * Set the access and modification times given a path name; this
2822  * version does not follow links.
2823  */
2824 int
2825 sys___lutimes50(struct lwp *l, const struct sys___lutimes50_args *uap,
2826     register_t *retval)
2827 {
2828 	/* {
2829 		syscallarg(const char *) path;
2830 		syscallarg(const struct timeval *) tptr;
2831 	} */
2832 
2833 	return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
2834 	    SCARG(uap, tptr), UIO_USERSPACE);
2835 }
2836 
2837 /*
2838  * Common routine to set access and modification times given a vnode.
2839  */
2840 int
2841 do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
2842     const struct timeval *tptr, enum uio_seg seg)
2843 {
2844 	struct vattr vattr;
2845 	int error, dorele = 0;
2846 	namei_simple_flags_t sflags;
2847 
2848 	bool vanull, setbirthtime;
2849 	struct timespec ts[2];
2850 
2851 	/*
2852 	 * I have checked all callers and they pass either FOLLOW,
2853 	 * NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW
2854 	 * is 0. More to the point, they don't pass anything else.
2855 	 * Let's keep it that way at least until the namei interfaces
2856 	 * are fully sanitized.
2857 	 */
2858 	KASSERT(flag == NOFOLLOW || flag == FOLLOW);
2859 	sflags = (flag == FOLLOW) ?
2860 		NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT;
2861 
2862 	if (tptr == NULL) {
2863 		vanull = true;
2864 		nanotime(&ts[0]);
2865 		ts[1] = ts[0];
2866 	} else {
2867 		struct timeval tv[2];
2868 
2869 		vanull = false;
2870 		if (seg != UIO_SYSSPACE) {
2871 			error = copyin(tptr, tv, sizeof (tv));
2872 			if (error != 0)
2873 				return error;
2874 			tptr = tv;
2875 		}
2876 		TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]);
2877 		TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]);
2878 	}
2879 
2880 	if (vp == NULL) {
2881 		/* note: SEG describes TPTR, not PATH; PATH is always user */
2882 		error = namei_simple_user(path, sflags, &vp);
2883 		if (error != 0)
2884 			return error;
2885 		dorele = 1;
2886 	}
2887 
2888 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2889 	setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 &&
2890 	    timespeccmp(&ts[1], &vattr.va_birthtime, <));
2891 	vattr_null(&vattr);
2892 	vattr.va_atime = ts[0];
2893 	vattr.va_mtime = ts[1];
2894 	if (setbirthtime)
2895 		vattr.va_birthtime = ts[1];
2896 	if (vanull)
2897 		vattr.va_vaflags |= VA_UTIMES_NULL;
2898 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2899 	VOP_UNLOCK(vp);
2900 
2901 	if (dorele != 0)
2902 		vrele(vp);
2903 
2904 	return error;
2905 }
2906 
2907 /*
2908  * Truncate a file given its path name.
2909  */
2910 /* ARGSUSED */
2911 int
2912 sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval)
2913 {
2914 	/* {
2915 		syscallarg(const char *) path;
2916 		syscallarg(int) pad;
2917 		syscallarg(off_t) length;
2918 	} */
2919 	struct vnode *vp;
2920 	struct vattr vattr;
2921 	int error;
2922 
2923 	error = namei_simple_user(SCARG(uap, path),
2924 				NSM_FOLLOW_TRYEMULROOT, &vp);
2925 	if (error != 0)
2926 		return (error);
2927 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2928 	if (vp->v_type == VDIR)
2929 		error = EISDIR;
2930 	else if ((error = vn_writechk(vp)) == 0 &&
2931 	    (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
2932 		vattr_null(&vattr);
2933 		vattr.va_size = SCARG(uap, length);
2934 		error = VOP_SETATTR(vp, &vattr, l->l_cred);
2935 	}
2936 	vput(vp);
2937 	return (error);
2938 }
2939 
2940 /*
2941  * Truncate a file given a file descriptor.
2942  */
2943 /* ARGSUSED */
2944 int
2945 sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval)
2946 {
2947 	/* {
2948 		syscallarg(int) fd;
2949 		syscallarg(int) pad;
2950 		syscallarg(off_t) length;
2951 	} */
2952 	struct vattr vattr;
2953 	struct vnode *vp;
2954 	file_t *fp;
2955 	int error;
2956 
2957 	/* fd_getvnode() will use the descriptor for us */
2958 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2959 		return (error);
2960 	if ((fp->f_flag & FWRITE) == 0) {
2961 		error = EINVAL;
2962 		goto out;
2963 	}
2964 	vp = fp->f_data;
2965 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2966 	if (vp->v_type == VDIR)
2967 		error = EISDIR;
2968 	else if ((error = vn_writechk(vp)) == 0) {
2969 		vattr_null(&vattr);
2970 		vattr.va_size = SCARG(uap, length);
2971 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
2972 	}
2973 	VOP_UNLOCK(vp);
2974  out:
2975 	fd_putfile(SCARG(uap, fd));
2976 	return (error);
2977 }
2978 
2979 /*
2980  * Sync an open file.
2981  */
2982 /* ARGSUSED */
2983 int
2984 sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
2985 {
2986 	/* {
2987 		syscallarg(int) fd;
2988 	} */
2989 	struct vnode *vp;
2990 	file_t *fp;
2991 	int error;
2992 
2993 	/* fd_getvnode() will use the descriptor for us */
2994 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2995 		return (error);
2996 	vp = fp->f_data;
2997 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2998 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
2999 	VOP_UNLOCK(vp);
3000 	fd_putfile(SCARG(uap, fd));
3001 	return (error);
3002 }
3003 
3004 /*
3005  * Sync a range of file data.  API modeled after that found in AIX.
3006  *
3007  * FDATASYNC indicates that we need only save enough metadata to be able
3008  * to re-read the written data.  Note we duplicate AIX's requirement that
3009  * the file be open for writing.
3010  */
3011 /* ARGSUSED */
3012 int
3013 sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval)
3014 {
3015 	/* {
3016 		syscallarg(int) fd;
3017 		syscallarg(int) flags;
3018 		syscallarg(off_t) start;
3019 		syscallarg(off_t) length;
3020 	} */
3021 	struct vnode *vp;
3022 	file_t *fp;
3023 	int flags, nflags;
3024 	off_t s, e, len;
3025 	int error;
3026 
3027 	/* fd_getvnode() will use the descriptor for us */
3028 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3029 		return (error);
3030 
3031 	if ((fp->f_flag & FWRITE) == 0) {
3032 		error = EBADF;
3033 		goto out;
3034 	}
3035 
3036 	flags = SCARG(uap, flags);
3037 	if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
3038 	    ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
3039 		error = EINVAL;
3040 		goto out;
3041 	}
3042 	/* Now set up the flags for value(s) to pass to VOP_FSYNC() */
3043 	if (flags & FDATASYNC)
3044 		nflags = FSYNC_DATAONLY | FSYNC_WAIT;
3045 	else
3046 		nflags = FSYNC_WAIT;
3047 	if (flags & FDISKSYNC)
3048 		nflags |= FSYNC_CACHE;
3049 
3050 	len = SCARG(uap, length);
3051 	/* If length == 0, we do the whole file, and s = l = 0 will do that */
3052 	if (len) {
3053 		s = SCARG(uap, start);
3054 		e = s + len;
3055 		if (e < s) {
3056 			error = EINVAL;
3057 			goto out;
3058 		}
3059 	} else {
3060 		e = 0;
3061 		s = 0;
3062 	}
3063 
3064 	vp = fp->f_data;
3065 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3066 	error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
3067 	VOP_UNLOCK(vp);
3068 out:
3069 	fd_putfile(SCARG(uap, fd));
3070 	return (error);
3071 }
3072 
3073 /*
3074  * Sync the data of an open file.
3075  */
3076 /* ARGSUSED */
3077 int
3078 sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval)
3079 {
3080 	/* {
3081 		syscallarg(int) fd;
3082 	} */
3083 	struct vnode *vp;
3084 	file_t *fp;
3085 	int error;
3086 
3087 	/* fd_getvnode() will use the descriptor for us */
3088 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3089 		return (error);
3090 	if ((fp->f_flag & FWRITE) == 0) {
3091 		fd_putfile(SCARG(uap, fd));
3092 		return (EBADF);
3093 	}
3094 	vp = fp->f_data;
3095 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3096 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
3097 	VOP_UNLOCK(vp);
3098 	fd_putfile(SCARG(uap, fd));
3099 	return (error);
3100 }
3101 
3102 /*
3103  * Rename files, (standard) BSD semantics frontend.
3104  */
3105 /* ARGSUSED */
3106 int
3107 sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval)
3108 {
3109 	/* {
3110 		syscallarg(const char *) from;
3111 		syscallarg(const char *) to;
3112 	} */
3113 
3114 	return (do_sys_rename(SCARG(uap, from), SCARG(uap, to), UIO_USERSPACE, 0));
3115 }
3116 
3117 /*
3118  * Rename files, POSIX semantics frontend.
3119  */
3120 /* ARGSUSED */
3121 int
3122 sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval)
3123 {
3124 	/* {
3125 		syscallarg(const char *) from;
3126 		syscallarg(const char *) to;
3127 	} */
3128 
3129 	return (do_sys_rename(SCARG(uap, from), SCARG(uap, to), UIO_USERSPACE, 1));
3130 }
3131 
3132 /*
3133  * Rename files.  Source and destination must either both be directories,
3134  * or both not be directories.  If target is a directory, it must be empty.
3135  * If `from' and `to' refer to the same object, the value of the `retain'
3136  * argument is used to determine whether `from' will be
3137  *
3138  * (retain == 0)	deleted unless `from' and `to' refer to the same
3139  *			object in the file system's name space (BSD).
3140  * (retain == 1)	always retained (POSIX).
3141  */
3142 int
3143 do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
3144 {
3145 	struct vnode *tvp, *fvp, *tdvp;
3146 	struct pathbuf *frompb, *topb;
3147 	struct nameidata fromnd, tond;
3148 	struct mount *fs;
3149 	struct lwp *l = curlwp;
3150 	struct proc *p;
3151 	int error;
3152 
3153 	error = pathbuf_maybe_copyin(from, seg, &frompb);
3154 	if (error) {
3155 		return error;
3156 	}
3157 	error = pathbuf_maybe_copyin(to, seg, &topb);
3158 	if (error) {
3159 		pathbuf_destroy(frompb);
3160 		return error;
3161 	}
3162 
3163 	NDINIT(&fromnd, DELETE, LOCKPARENT | TRYEMULROOT | INRENAME,
3164 	    frompb);
3165 	if ((error = namei(&fromnd)) != 0) {
3166 		pathbuf_destroy(frompb);
3167 		pathbuf_destroy(topb);
3168 		return (error);
3169 	}
3170 	if (fromnd.ni_dvp != fromnd.ni_vp)
3171 		VOP_UNLOCK(fromnd.ni_dvp);
3172 	fvp = fromnd.ni_vp;
3173 
3174 	fs = fvp->v_mount;
3175 	error = VFS_RENAMELOCK_ENTER(fs);
3176 	if (error) {
3177 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3178 		vrele(fromnd.ni_dvp);
3179 		vrele(fvp);
3180 		goto out1;
3181 	}
3182 
3183 	/*
3184 	 * close, partially, yet another race - ideally we should only
3185 	 * go as far as getting fromnd.ni_dvp before getting the per-fs
3186 	 * lock, and then continue to get fromnd.ni_vp, but we can't do
3187 	 * that with namei as it stands.
3188 	 *
3189 	 * This still won't prevent rmdir from nuking fromnd.ni_vp
3190 	 * under us. The real fix is to get the locks in the right
3191 	 * order and do the lookups in the right places, but that's a
3192 	 * major rototill.
3193 	 *
3194 	 * Note: this logic (as well as this whole function) is cloned
3195 	 * in nfs_serv.c. Proceed accordingly.
3196 	 */
3197 	vrele(fvp);
3198 	if ((fromnd.ni_cnd.cn_namelen == 1 &&
3199 	     fromnd.ni_cnd.cn_nameptr[0] == '.') ||
3200 	    (fromnd.ni_cnd.cn_namelen == 2 &&
3201 	     fromnd.ni_cnd.cn_nameptr[0] == '.' &&
3202 	     fromnd.ni_cnd.cn_nameptr[1] == '.')) {
3203 		error = EINVAL;
3204 		VFS_RENAMELOCK_EXIT(fs);
3205 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3206 		vrele(fromnd.ni_dvp);
3207 		goto out1;
3208 	}
3209 	vn_lock(fromnd.ni_dvp, LK_EXCLUSIVE | LK_RETRY);
3210 	error = relookup(fromnd.ni_dvp, &fromnd.ni_vp, &fromnd.ni_cnd, 0);
3211 	if (error) {
3212 		VOP_UNLOCK(fromnd.ni_dvp);
3213 		VFS_RENAMELOCK_EXIT(fs);
3214 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3215 		vrele(fromnd.ni_dvp);
3216 		goto out1;
3217 	}
3218 	VOP_UNLOCK(fromnd.ni_vp);
3219 	if (fromnd.ni_dvp != fromnd.ni_vp)
3220 		VOP_UNLOCK(fromnd.ni_dvp);
3221 	fvp = fromnd.ni_vp;
3222 
3223 	NDINIT(&tond, RENAME,
3224 	    LOCKPARENT | LOCKLEAF | NOCACHE | TRYEMULROOT
3225 	      | INRENAME | (fvp->v_type == VDIR ? CREATEDIR : 0),
3226 	    topb);
3227 	if ((error = namei(&tond)) != 0) {
3228 		VFS_RENAMELOCK_EXIT(fs);
3229 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3230 		vrele(fromnd.ni_dvp);
3231 		vrele(fvp);
3232 		goto out1;
3233 	}
3234 	tdvp = tond.ni_dvp;
3235 	tvp = tond.ni_vp;
3236 
3237 	if (tvp != NULL) {
3238 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
3239 			error = ENOTDIR;
3240 			goto out;
3241 		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
3242 			error = EISDIR;
3243 			goto out;
3244 		}
3245 	}
3246 
3247 	if (fvp == tdvp)
3248 		error = EINVAL;
3249 
3250 	/*
3251 	 * Source and destination refer to the same object.
3252 	 */
3253 	if (fvp == tvp) {
3254 		if (retain)
3255 			error = -1;
3256 		else if (fromnd.ni_dvp == tdvp &&
3257 		    fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen &&
3258 		    !memcmp(fromnd.ni_cnd.cn_nameptr,
3259 		          tond.ni_cnd.cn_nameptr,
3260 		          fromnd.ni_cnd.cn_namelen))
3261 		error = -1;
3262 	}
3263 	/*
3264 	 * Prevent cross-mount operation.
3265 	 */
3266 	if (error == 0) {
3267 		if (tond.ni_dvp->v_mount != fromnd.ni_dvp->v_mount) {
3268 			error = EXDEV;
3269 		}
3270 	}
3271 #if NVERIEXEC > 0
3272 	if (!error) {
3273 		char *f1, *f2;
3274 		size_t f1_len;
3275 		size_t f2_len;
3276 
3277 		f1_len = fromnd.ni_cnd.cn_namelen + 1;
3278 		f1 = kmem_alloc(f1_len, KM_SLEEP);
3279 		strlcpy(f1, fromnd.ni_cnd.cn_nameptr, f1_len);
3280 
3281 		f2_len = tond.ni_cnd.cn_namelen + 1;
3282 		f2 = kmem_alloc(f2_len, KM_SLEEP);
3283 		strlcpy(f2, tond.ni_cnd.cn_nameptr, f2_len);
3284 
3285 		error = veriexec_renamechk(l, fvp, f1, tvp, f2);
3286 
3287 		kmem_free(f1, f1_len);
3288 		kmem_free(f2, f2_len);
3289 	}
3290 #endif /* NVERIEXEC > 0 */
3291 
3292 out:
3293 	p = l->l_proc;
3294 	if (!error) {
3295 		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
3296 				   tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
3297 		VFS_RENAMELOCK_EXIT(fs);
3298 	} else {
3299 		VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd);
3300 		if (tdvp == tvp)
3301 			vrele(tdvp);
3302 		else
3303 			vput(tdvp);
3304 		if (tvp)
3305 			vput(tvp);
3306 		VFS_RENAMELOCK_EXIT(fs);
3307 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3308 		vrele(fromnd.ni_dvp);
3309 		vrele(fvp);
3310 	}
3311 out1:
3312 	pathbuf_destroy(frompb);
3313 	pathbuf_destroy(topb);
3314 	return (error == -1 ? 0 : error);
3315 }
3316 
3317 /*
3318  * Make a directory file.
3319  */
3320 /* ARGSUSED */
3321 int
3322 sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
3323 {
3324 	/* {
3325 		syscallarg(const char *) path;
3326 		syscallarg(int) mode;
3327 	} */
3328 
3329 	return do_sys_mkdir(SCARG(uap, path), SCARG(uap, mode), UIO_USERSPACE);
3330 }
3331 
3332 int
3333 do_sys_mkdir(const char *path, mode_t mode, enum uio_seg seg)
3334 {
3335 	struct proc *p = curlwp->l_proc;
3336 	struct vnode *vp;
3337 	struct vattr vattr;
3338 	int error;
3339 	struct pathbuf *pb;
3340 	struct nameidata nd;
3341 
3342 	/* XXX bollocks, should pass in a pathbuf */
3343 	error = pathbuf_maybe_copyin(path, seg, &pb);
3344 	if (error) {
3345 		return error;
3346 	}
3347 
3348 	NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT, pb);
3349 	if ((error = namei(&nd)) != 0) {
3350 		pathbuf_destroy(pb);
3351 		return (error);
3352 	}
3353 	vp = nd.ni_vp;
3354 	if (vp != NULL) {
3355 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
3356 		if (nd.ni_dvp == vp)
3357 			vrele(nd.ni_dvp);
3358 		else
3359 			vput(nd.ni_dvp);
3360 		vrele(vp);
3361 		pathbuf_destroy(pb);
3362 		return (EEXIST);
3363 	}
3364 	vattr_null(&vattr);
3365 	vattr.va_type = VDIR;
3366 	/* We will read cwdi->cwdi_cmask unlocked. */
3367 	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
3368 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3369 	if (!error)
3370 		vput(nd.ni_vp);
3371 	pathbuf_destroy(pb);
3372 	return (error);
3373 }
3374 
3375 /*
3376  * Remove a directory file.
3377  */
3378 /* ARGSUSED */
3379 int
3380 sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
3381 {
3382 	/* {
3383 		syscallarg(const char *) path;
3384 	} */
3385 	struct vnode *vp;
3386 	int error;
3387 	struct pathbuf *pb;
3388 	struct nameidata nd;
3389 
3390 	error = pathbuf_copyin(SCARG(uap, path), &pb);
3391 	if (error) {
3392 		return error;
3393 	}
3394 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, pb);
3395 	if ((error = namei(&nd)) != 0) {
3396 		pathbuf_destroy(pb);
3397 		return error;
3398 	}
3399 	vp = nd.ni_vp;
3400 	if (vp->v_type != VDIR) {
3401 		error = ENOTDIR;
3402 		goto out;
3403 	}
3404 	/*
3405 	 * No rmdir "." please.
3406 	 */
3407 	if (nd.ni_dvp == vp) {
3408 		error = EINVAL;
3409 		goto out;
3410 	}
3411 	/*
3412 	 * The root of a mounted filesystem cannot be deleted.
3413 	 */
3414 	if ((vp->v_vflag & VV_ROOT) != 0 || vp->v_mountedhere != NULL) {
3415 		error = EBUSY;
3416 		goto out;
3417 	}
3418 	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3419 	pathbuf_destroy(pb);
3420 	return (error);
3421 
3422 out:
3423 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
3424 	if (nd.ni_dvp == vp)
3425 		vrele(nd.ni_dvp);
3426 	else
3427 		vput(nd.ni_dvp);
3428 	vput(vp);
3429 	pathbuf_destroy(pb);
3430 	return (error);
3431 }
3432 
3433 /*
3434  * Read a block of directory entries in a file system independent format.
3435  */
3436 int
3437 sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval)
3438 {
3439 	/* {
3440 		syscallarg(int) fd;
3441 		syscallarg(char *) buf;
3442 		syscallarg(size_t) count;
3443 	} */
3444 	file_t *fp;
3445 	int error, done;
3446 
3447 	/* fd_getvnode() will use the descriptor for us */
3448 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3449 		return (error);
3450 	if ((fp->f_flag & FREAD) == 0) {
3451 		error = EBADF;
3452 		goto out;
3453 	}
3454 	error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
3455 			SCARG(uap, count), &done, l, 0, 0);
3456 	ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
3457 	*retval = done;
3458  out:
3459 	fd_putfile(SCARG(uap, fd));
3460 	return (error);
3461 }
3462 
3463 /*
3464  * Set the mode mask for creation of filesystem nodes.
3465  */
3466 int
3467 sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
3468 {
3469 	/* {
3470 		syscallarg(mode_t) newmask;
3471 	} */
3472 	struct proc *p = l->l_proc;
3473 	struct cwdinfo *cwdi;
3474 
3475 	/*
3476 	 * cwdi->cwdi_cmask will be read unlocked elsewhere.  What's
3477 	 * important is that we serialize changes to the mask.  The
3478 	 * rw_exit() will issue a write memory barrier on our behalf,
3479 	 * and force the changes out to other CPUs (as it must use an
3480 	 * atomic operation, draining the local CPU's store buffers).
3481 	 */
3482 	cwdi = p->p_cwdi;
3483 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
3484 	*retval = cwdi->cwdi_cmask;
3485 	cwdi->cwdi_cmask = SCARG(uap, newmask) & ALLPERMS;
3486 	rw_exit(&cwdi->cwdi_lock);
3487 
3488 	return (0);
3489 }
3490 
3491 int
3492 dorevoke(struct vnode *vp, kauth_cred_t cred)
3493 {
3494 	struct vattr vattr;
3495 	int error;
3496 
3497 	if ((error = VOP_GETATTR(vp, &vattr, cred)) != 0)
3498 		return error;
3499 	if (kauth_cred_geteuid(cred) == vattr.va_uid ||
3500 	    (error = kauth_authorize_generic(cred,
3501 	    KAUTH_GENERIC_ISSUSER, NULL)) == 0)
3502 		VOP_REVOKE(vp, REVOKEALL);
3503 	return (error);
3504 }
3505 
3506 /*
3507  * Void all references to file by ripping underlying filesystem
3508  * away from vnode.
3509  */
3510 /* ARGSUSED */
3511 int
3512 sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval)
3513 {
3514 	/* {
3515 		syscallarg(const char *) path;
3516 	} */
3517 	struct vnode *vp;
3518 	int error;
3519 
3520 	error = namei_simple_user(SCARG(uap, path),
3521 				NSM_FOLLOW_TRYEMULROOT, &vp);
3522 	if (error != 0)
3523 		return (error);
3524 	error = dorevoke(vp, l->l_cred);
3525 	vrele(vp);
3526 	return (error);
3527 }
3528