xref: /netbsd-src/sys/kern/vfs_syscalls.c (revision 2980e352a13e8f0b545a366830c411e7a542ada8)
1 /*	$NetBSD: vfs_syscalls.c,v 1.369 2008/06/24 11:21:46 ad Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Copyright (c) 1989, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  * (c) UNIX System Laboratories, Inc.
33  * All or some portions of this file are derived from material licensed
34  * to the University of California by American Telephone and Telegraph
35  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36  * the permission of UNIX System Laboratories, Inc.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. Neither the name of the University nor the names of its contributors
47  *    may be used to endorse or promote products derived from this software
48  *    without specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  *
62  *	@(#)vfs_syscalls.c	8.42 (Berkeley) 7/31/95
63  */
64 
65 #include <sys/cdefs.h>
66 __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.369 2008/06/24 11:21:46 ad Exp $");
67 
68 #include "opt_compat_netbsd.h"
69 #include "opt_compat_43.h"
70 #include "opt_fileassoc.h"
71 #include "fss.h"
72 #include "veriexec.h"
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file.h>
80 #include <sys/stat.h>
81 #include <sys/vnode.h>
82 #include <sys/mount.h>
83 #include <sys/proc.h>
84 #include <sys/uio.h>
85 #include <sys/malloc.h>
86 #include <sys/kmem.h>
87 #include <sys/dirent.h>
88 #include <sys/sysctl.h>
89 #include <sys/syscallargs.h>
90 #include <sys/vfs_syscalls.h>
91 #include <sys/ktrace.h>
92 #ifdef FILEASSOC
93 #include <sys/fileassoc.h>
94 #endif /* FILEASSOC */
95 #include <sys/verified_exec.h>
96 #include <sys/kauth.h>
97 #include <sys/atomic.h>
98 #include <sys/module.h>
99 
100 #include <miscfs/genfs/genfs.h>
101 #include <miscfs/syncfs/syncfs.h>
102 #include <miscfs/specfs/specdev.h>
103 
104 #ifdef COMPAT_30
105 #include "opt_nfsserver.h"
106 #include <nfs/rpcv2.h>
107 #endif
108 #include <nfs/nfsproto.h>
109 #ifdef COMPAT_30
110 #include <nfs/nfs.h>
111 #include <nfs/nfs_var.h>
112 #endif
113 
114 #if NFSS > 0
115 #include <dev/fssvar.h>
116 #endif
117 
118 MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount struct");
119 
120 static int change_dir(struct nameidata *, struct lwp *);
121 static int change_flags(struct vnode *, u_long, struct lwp *);
122 static int change_mode(struct vnode *, int, struct lwp *l);
123 static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
124 
125 void checkdirs(struct vnode *);
126 
127 int dovfsusermount = 0;
128 
129 /*
130  * Virtual File System System Calls
131  */
132 
133 /*
134  * Mount a file system.
135  */
136 
#if defined(COMPAT_09) || defined(COMPAT_43)
/*
 * Historic numeric file system identifiers, as used by the 4.3BSD and
 * NetBSD 0.9 mount syscalls, mapped to file system type names.
 *
 * The array index IS the historic number, so the order is significant.
 * Do not modify this table: it should only contain file systems that
 * were supported by NetBSD 0.9 and 4.3BSD.
 */
const char * const mountcompatnames[] = {
	[0] = NULL,		/* MOUNT_NONE */
	[1] = MOUNT_FFS,	/* MOUNT_UFS */
	[2] = MOUNT_NFS,
	[3] = MOUNT_MFS,
	[4] = MOUNT_MSDOS,
	[5] = MOUNT_CD9660,	/* MOUNT_ISOFS */
	[6] = MOUNT_FDESC,
	[7] = MOUNT_KERNFS,
	[8] = NULL,		/* MOUNT_DEVFS */
	[9] = MOUNT_AFS,
};
/* Number of slots in mountcompatnames[]. */
const int nmountcompatnames =
    sizeof(mountcompatnames) / sizeof(mountcompatnames[0]);
#endif /* COMPAT_09 || COMPAT_43 */
160 
161 static int
162 mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
163     void *data, size_t *data_len)
164 {
165 	struct mount *mp;
166 	int error = 0, saved_flags;
167 
168 	mp = vp->v_mount;
169 	saved_flags = mp->mnt_flag;
170 
171 	/* We can operate only on VV_ROOT nodes. */
172 	if ((vp->v_vflag & VV_ROOT) == 0) {
173 		error = EINVAL;
174 		goto out;
175 	}
176 
177 	/*
178 	 * We only allow the filesystem to be reloaded if it
179 	 * is currently mounted read-only.
180 	 */
181 	if (flags & MNT_RELOAD && !(mp->mnt_flag & MNT_RDONLY)) {
182 		error = EOPNOTSUPP;	/* Needs translation */
183 		goto out;
184 	}
185 
186 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
187 	    KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
188 	if (error)
189 		goto out;
190 
191 	if (vfs_busy(mp, NULL)) {
192 		error = EPERM;
193 		goto out;
194 	}
195 
196 	mutex_enter(&mp->mnt_updating);
197 
198 	mp->mnt_flag &= ~MNT_OP_FLAGS;
199 	mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
200 
201 	/*
202 	 * Set the mount level flags.
203 	 */
204 	if (flags & MNT_RDONLY)
205 		mp->mnt_flag |= MNT_RDONLY;
206 	else if (mp->mnt_flag & MNT_RDONLY)
207 		mp->mnt_iflag |= IMNT_WANTRDWR;
208 	mp->mnt_flag &=
209 	  ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
210 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
211 	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP);
212 	mp->mnt_flag |= flags &
213 	   (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
214 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
215 	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
216 	    MNT_IGNORE);
217 
218 	error = VFS_MOUNT(mp, path, data, data_len);
219 
220 #if defined(COMPAT_30) && defined(NFSSERVER)
221 	if (error && data != NULL) {
222 		int error2;
223 
224 		/* Update failed; let's try and see if it was an
225 		 * export request. */
226 		error2 = nfs_update_exports_30(mp, path, data, l);
227 
228 		/* Only update error code if the export request was
229 		 * understood but some problem occurred while
230 		 * processing it. */
231 		if (error2 != EJUSTRETURN)
232 			error = error2;
233 	}
234 #endif
235 	if (mp->mnt_iflag & IMNT_WANTRDWR)
236 		mp->mnt_flag &= ~MNT_RDONLY;
237 	if (error)
238 		mp->mnt_flag = saved_flags;
239 	mp->mnt_flag &= ~MNT_OP_FLAGS;
240 	mp->mnt_iflag &= ~IMNT_WANTRDWR;
241 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
242 		if (mp->mnt_syncer == NULL)
243 			error = vfs_allocate_syncvnode(mp);
244 	} else {
245 		if (mp->mnt_syncer != NULL)
246 			vfs_deallocate_syncvnode(mp);
247 	}
248 	mutex_exit(&mp->mnt_updating);
249 	vfs_unbusy(mp, false, NULL);
250 
251  out:
252 	return (error);
253 }
254 
255 static int
256 mount_get_vfsops(const char *fstype, struct vfsops **vfsops)
257 {
258 	char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
259 	int error;
260 
261 	/* Copy file-system type from userspace.  */
262 	error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL);
263 	if (error) {
264 #if defined(COMPAT_09) || defined(COMPAT_43)
265 		/*
266 		 * Historically, filesystem types were identified by numbers.
267 		 * If we get an integer for the filesystem type instead of a
268 		 * string, we check to see if it matches one of the historic
269 		 * filesystem types.
270 		 */
271 		u_long fsindex = (u_long)fstype;
272 		if (fsindex >= nmountcompatnames ||
273 		    mountcompatnames[fsindex] == NULL)
274 			return ENODEV;
275 		strlcpy(fstypename, mountcompatnames[fsindex],
276 		    sizeof(fstypename));
277 #else
278 		return error;
279 #endif
280 	}
281 
282 #ifdef	COMPAT_10
283 	/* Accept `ufs' as an alias for `ffs'. */
284 	if (strcmp(fstypename, "ufs") == 0)
285 		fstypename[0] = 'f';
286 #endif
287 
288 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
289 		return 0;
290 
291 	/* If we can autoload a vfs module, try again */
292 	(void)module_load(fstype, 0, NULL, MODULE_CLASS_VFS, true);
293 
294 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
295 		return 0;
296 
297 	return ENODEV;
298 }
299 
300 static int
301 mount_domount(struct lwp *l, struct vnode **vpp, struct vfsops *vfsops,
302     const char *path, int flags, void *data, size_t *data_len, u_int recurse)
303 {
304 	struct mount *mp;
305 	struct vnode *vp = *vpp;
306 	struct vattr va;
307 	int error;
308 
309 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
310 	    KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data);
311 	if (error)
312 		return error;
313 
314 	/* Can't make a non-dir a mount-point (from here anyway). */
315 	if (vp->v_type != VDIR)
316 		return ENOTDIR;
317 
318 	/*
319 	 * If the user is not root, ensure that they own the directory
320 	 * onto which we are attempting to mount.
321 	 */
322 	if ((error = VOP_GETATTR(vp, &va, l->l_cred)) != 0 ||
323 	    (va.va_uid != kauth_cred_geteuid(l->l_cred) &&
324 	    (error = kauth_authorize_generic(l->l_cred,
325 	    KAUTH_GENERIC_ISSUSER, NULL)) != 0)) {
326 		return error;
327 	}
328 
329 	if (flags & MNT_EXPORTED)
330 		return EINVAL;
331 
332 	if ((error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0)) != 0)
333 		return error;
334 
335 	/*
336 	 * Check if a file-system is not already mounted on this vnode.
337 	 */
338 	if (vp->v_mountedhere != NULL)
339 		return EBUSY;
340 
341 	mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
342 	if (mp == NULL)
343 		return ENOMEM;
344 
345 	mp->mnt_op = vfsops;
346 	mp->mnt_refcnt = 1;
347 
348 	TAILQ_INIT(&mp->mnt_vnodelist);
349 	rw_init(&mp->mnt_unmounting);
350  	mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
351 	mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE);
352 	error = vfs_busy(mp, NULL);
353 	KASSERT(error == 0);
354 	mutex_enter(&mp->mnt_updating);
355 
356 	mp->mnt_vnodecovered = vp;
357 	mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred);
358 	mount_initspecific(mp);
359 
360 	/*
361 	 * The underlying file system may refuse the mount for
362 	 * various reasons.  Allow the user to force it to happen.
363 	 *
364 	 * Set the mount level flags.
365 	 */
366 	mp->mnt_flag = flags &
367 	   (MNT_FORCE | MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
368 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
369 	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
370 	    MNT_IGNORE | MNT_RDONLY);
371 
372 	error = VFS_MOUNT(mp, path, data, data_len);
373 	mp->mnt_flag &= ~MNT_OP_FLAGS;
374 
375 	/*
376 	 * Put the new filesystem on the mount list after root.
377 	 */
378 	cache_purge(vp);
379 	if (error != 0) {
380 		vp->v_mountedhere = NULL;
381 		mutex_exit(&mp->mnt_updating);
382 		vfs_unbusy(mp, false, NULL);
383 		vfs_destroy(mp);
384 		return error;
385 	}
386 
387 	mp->mnt_iflag &= ~IMNT_WANTRDWR;
388 	mutex_enter(&mountlist_lock);
389 	vp->v_mountedhere = mp;
390 	CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
391 	mutex_exit(&mountlist_lock);
392     	vn_restorerecurse(vp, recurse);
393 	VOP_UNLOCK(vp, 0);
394 	checkdirs(vp);
395 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
396 		error = vfs_allocate_syncvnode(mp);
397 	/* Hold an additional reference to the mount across VFS_START(). */
398 	mutex_exit(&mp->mnt_updating);
399 	vfs_unbusy(mp, true, NULL);
400 	(void) VFS_STATVFS(mp, &mp->mnt_stat);
401 	error = VFS_START(mp, 0);
402 	if (error) {
403 		vrele(vp);
404 		vfs_destroy(mp);
405 	}
406 	/* Drop reference held for VFS_START(). */
407 	vfs_destroy(mp);
408 	*vpp = NULL;
409 	return error;
410 }
411 
412 static int
413 mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
414     void *data, size_t *data_len)
415 {
416 	struct mount *mp;
417 	int error;
418 
419 	/* If MNT_GETARGS is specified, it should be the only flag. */
420 	if (flags & ~MNT_GETARGS)
421 		return EINVAL;
422 
423 	mp = vp->v_mount;
424 
425 	/* XXX: probably some notion of "can see" here if we want isolation. */
426 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
427 	    KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
428 	if (error)
429 		return error;
430 
431 	if ((vp->v_vflag & VV_ROOT) == 0)
432 		return EINVAL;
433 
434 	if (vfs_busy(mp, NULL))
435 		return EPERM;
436 
437 	mutex_enter(&mp->mnt_updating);
438 	mp->mnt_flag &= ~MNT_OP_FLAGS;
439 	mp->mnt_flag |= MNT_GETARGS;
440 	error = VFS_MOUNT(mp, path, data, data_len);
441 	mp->mnt_flag &= ~MNT_OP_FLAGS;
442 	mutex_exit(&mp->mnt_updating);
443 
444 	vfs_unbusy(mp, false, NULL);
445 	return (error);
446 }
447 
#ifdef COMPAT_40
/*
 * Older mount entry point whose arguments carry no data length.
 * A length of 0 is handed to do_sys_mount(), and the value it reports
 * through its retval argument is discarded.
 */
/* ARGSUSED */
int
compat_40_sys_mount(struct lwp *l, const struct compat_40_sys_mount_args *uap, register_t *retval)
{
	/* {
		syscallarg(const char *) type;
		syscallarg(const char *) path;
		syscallarg(int) flags;
		syscallarg(void *) data;
	} */
	register_t ignored;
	int error;

	error = do_sys_mount(l, NULL, SCARG(uap, type), SCARG(uap, path),
	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE, 0, &ignored);
	return error;
}
#endif
465 
466 int
467 sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval)
468 {
469 	/* {
470 		syscallarg(const char *) type;
471 		syscallarg(const char *) path;
472 		syscallarg(int) flags;
473 		syscallarg(void *) data;
474 		syscallarg(size_t) data_len;
475 	} */
476 
477 	return do_sys_mount(l, NULL, SCARG(uap, type), SCARG(uap, path),
478 	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE,
479 	    SCARG(uap, data_len), retval);
480 }
481 
482 int
483 do_sys_mount(struct lwp *l, struct vfsops *vfsops, const char *type,
484     const char *path, int flags, void *data, enum uio_seg data_seg,
485     size_t data_len, register_t *retval)
486 {
487 	struct vnode *vp;
488 	struct nameidata nd;
489 	void *data_buf = data;
490 	u_int recurse;
491 	int error;
492 
493 	/*
494 	 * Get vnode to be covered
495 	 */
496 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE, path);
497 	if ((error = namei(&nd)) != 0)
498 		return (error);
499 	vp = nd.ni_vp;
500 
501 	/*
502 	 * A lookup in VFS_MOUNT might result in an attempt to
503 	 * lock this vnode again, so make the lock recursive.
504 	 */
505 	if (vfsops == NULL) {
506 		if (flags & (MNT_GETARGS | MNT_UPDATE)) {
507 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
508 			recurse = vn_setrecurse(vp);
509 			vfsops = vp->v_mount->mnt_op;
510 		} else {
511 			/* 'type' is userspace */
512 			error = mount_get_vfsops(type, &vfsops);
513 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
514 			recurse = vn_setrecurse(vp);
515 			if (error != 0)
516 				goto done;
517 		}
518 	} else {
519 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
520 		recurse = vn_setrecurse(vp);
521 	}
522 
523 	if (data != NULL && data_seg == UIO_USERSPACE) {
524 		if (data_len == 0) {
525 			/* No length supplied, use default for filesystem */
526 			data_len = vfsops->vfs_min_mount_data;
527 			if (data_len > VFS_MAX_MOUNT_DATA) {
528 				/* maybe a force loaded old LKM */
529 				error = EINVAL;
530 				goto done;
531 			}
532 #ifdef COMPAT_30
533 			/* Hopefully a longer buffer won't make copyin() fail */
534 			if (flags & MNT_UPDATE
535 			    && data_len < sizeof (struct mnt_export_args30))
536 				data_len = sizeof (struct mnt_export_args30);
537 #endif
538 		}
539 		data_buf = malloc(data_len, M_TEMP, M_WAITOK);
540 
541 		/* NFS needs the buffer even for mnt_getargs .... */
542 		error = copyin(data, data_buf, data_len);
543 		if (error != 0)
544 			goto done;
545 	}
546 
547 	if (flags & MNT_GETARGS) {
548 		if (data_len == 0) {
549 			error = EINVAL;
550 			goto done;
551 		}
552 		error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
553 		if (error != 0)
554 			goto done;
555 		if (data_seg == UIO_USERSPACE)
556 			error = copyout(data_buf, data, data_len);
557 		*retval = data_len;
558 	} else if (flags & MNT_UPDATE) {
559 		error = mount_update(l, vp, path, flags, data_buf, &data_len);
560 	} else {
561 		/* Locking is handled internally in mount_domount(). */
562 		error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
563 		    &data_len, recurse);
564 	}
565 
566     done:
567     	if (vp != NULL) {
568 	    	vn_restorerecurse(vp, recurse);
569 	    	vput(vp);
570 	}
571 	if (data_buf != data)
572 		free(data_buf, M_TEMP);
573 	return (error);
574 }
575 
576 /*
577  * Scan all active processes to see if any of them have a current
578  * or root directory onto which the new filesystem has just been
579  * mounted. If so, replace them with the new mount point.
580  */
581 void
582 checkdirs(struct vnode *olddp)
583 {
584 	struct cwdinfo *cwdi;
585 	struct vnode *newdp, *rele1, *rele2;
586 	struct proc *p;
587 	bool retry;
588 
589 	if (olddp->v_usecount == 1)
590 		return;
591 	if (VFS_ROOT(olddp->v_mountedhere, &newdp))
592 		panic("mount: lost mount");
593 
594 	do {
595 		retry = false;
596 		mutex_enter(proc_lock);
597 		PROCLIST_FOREACH(p, &allproc) {
598 			if ((p->p_flag & PK_MARKER) != 0)
599 				continue;
600 			if ((cwdi = p->p_cwdi) == NULL)
601 				continue;
602 			/*
603 			 * Can't change to the old directory any more,
604 			 * so even if we see a stale value it's not a
605 			 * problem.
606 			 */
607 			if (cwdi->cwdi_cdir != olddp &&
608 			    cwdi->cwdi_rdir != olddp)
609 			    	continue;
610 			retry = true;
611 			rele1 = NULL;
612 			rele2 = NULL;
613 			atomic_inc_uint(&cwdi->cwdi_refcnt);
614 			mutex_exit(proc_lock);
615 			rw_enter(&cwdi->cwdi_lock, RW_WRITER);
616 			if (cwdi->cwdi_cdir == olddp) {
617 				rele1 = cwdi->cwdi_cdir;
618 				VREF(newdp);
619 				cwdi->cwdi_cdir = newdp;
620 			}
621 			if (cwdi->cwdi_rdir == olddp) {
622 				rele2 = cwdi->cwdi_rdir;
623 				VREF(newdp);
624 				cwdi->cwdi_rdir = newdp;
625 			}
626 			rw_exit(&cwdi->cwdi_lock);
627 			cwdfree(cwdi);
628 			if (rele1 != NULL)
629 				vrele(rele1);
630 			if (rele2 != NULL)
631 				vrele(rele2);
632 			mutex_enter(proc_lock);
633 			break;
634 		}
635 		mutex_exit(proc_lock);
636 	} while (retry);
637 
638 	if (rootvnode == olddp) {
639 		vrele(rootvnode);
640 		VREF(newdp);
641 		rootvnode = newdp;
642 	}
643 	vput(newdp);
644 }
645 
646 /*
647  * Unmount a file system.
648  *
649  * Note: unmount takes a path to the vnode mounted on as argument,
650  * not special file (as before).
651  */
652 /* ARGSUSED */
653 int
654 sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval)
655 {
656 	/* {
657 		syscallarg(const char *) path;
658 		syscallarg(int) flags;
659 	} */
660 	struct vnode *vp;
661 	struct mount *mp;
662 	int error;
663 	struct nameidata nd;
664 
665 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
666 	    SCARG(uap, path));
667 	if ((error = namei(&nd)) != 0)
668 		return (error);
669 	vp = nd.ni_vp;
670 	mp = vp->v_mount;
671 	atomic_inc_uint(&mp->mnt_refcnt);
672 	VOP_UNLOCK(vp, 0);
673 
674 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
675 	    KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
676 	if (error) {
677 		vrele(vp);
678 		vfs_destroy(mp);
679 		return (error);
680 	}
681 
682 	/*
683 	 * Don't allow unmounting the root file system.
684 	 */
685 	if (mp->mnt_flag & MNT_ROOTFS) {
686 		vrele(vp);
687 		vfs_destroy(mp);
688 		return (EINVAL);
689 	}
690 
691 	/*
692 	 * Must be the root of the filesystem
693 	 */
694 	if ((vp->v_vflag & VV_ROOT) == 0) {
695 		vrele(vp);
696 		vfs_destroy(mp);
697 		return (EINVAL);
698 	}
699 
700 	vrele(vp);
701 	error = dounmount(mp, SCARG(uap, flags), l);
702 	return error;
703 }
704 
705 /*
706  * Do the actual file system unmount.  File system is assumed to have
707  * been locked by the caller.
708  *
709  * => Caller gain reference to the mount, explicility for unmount.
710  * => Reference will be dropped in all cases.
711  */
712 int
713 dounmount(struct mount *mp, int flags, struct lwp *l)
714 {
715 	struct vnode *coveredvp;
716 	int error;
717 	int async;
718 	int used_syncer;
719 
720 #if NVERIEXEC > 0
721 	error = veriexec_unmountchk(mp);
722 	if (error)
723 		return (error);
724 #endif /* NVERIEXEC > 0 */
725 
726 	/*
727 	 * XXX Freeze syncer.  Must do this before locking the
728 	 * mount point.  See dounmount() for details.
729 	 */
730 	mutex_enter(&syncer_mutex);
731 	rw_enter(&mp->mnt_unmounting, RW_WRITER);
732 	if ((mp->mnt_iflag & IMNT_GONE) != 0) {
733 		rw_exit(&mp->mnt_unmounting);
734 		mutex_exit(&syncer_mutex);
735 		vfs_destroy(mp);
736 		return ENOENT;
737 	}
738 
739 	used_syncer = (mp->mnt_syncer != NULL);
740 
741 	/*
742 	 * XXX Syncer must be frozen when we get here.  This should really
743 	 * be done on a per-mountpoint basis, but especially the softdep
744 	 * code possibly called from the syncer doesn't exactly work on a
745 	 * per-mountpoint basis, so the softdep code would become a maze
746 	 * of vfs_busy() calls.
747 	 *
748 	 * The caller of dounmount() must acquire syncer_mutex because
749 	 * the syncer itself acquires locks in syncer_mutex -> vfs_busy
750 	 * order, and we must preserve that order to avoid deadlock.
751 	 *
752 	 * So, if the file system did not use the syncer, now is
753 	 * the time to release the syncer_mutex.
754 	 */
755 	if (used_syncer == 0)
756 		mutex_exit(&syncer_mutex);
757 
758 	mp->mnt_iflag |= IMNT_UNMOUNT;
759 	async = mp->mnt_flag & MNT_ASYNC;
760 	mp->mnt_flag &= ~MNT_ASYNC;
761 	cache_purgevfs(mp);	/* remove cache entries for this file sys */
762 	if (mp->mnt_syncer != NULL)
763 		vfs_deallocate_syncvnode(mp);
764 	error = 0;
765 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
766 #if NFSS > 0
767 		error = fss_umount_hook(mp, (flags & MNT_FORCE));
768 #endif
769 		if (error == 0)
770 			error = VFS_SYNC(mp, MNT_WAIT, l->l_cred);
771 	}
772 	vfs_scrubvnlist(mp);
773 	if (error == 0 || (flags & MNT_FORCE))
774 		error = VFS_UNMOUNT(mp, flags);
775 	if (error) {
776 		if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
777 			(void) vfs_allocate_syncvnode(mp);
778 		mp->mnt_iflag &= ~IMNT_UNMOUNT;
779 		mp->mnt_flag |= async;
780 		rw_exit(&mp->mnt_unmounting);
781 		if (used_syncer)
782 			mutex_exit(&syncer_mutex);
783 		return (error);
784 	}
785 	vfs_scrubvnlist(mp);
786 	mutex_enter(&mountlist_lock);
787 	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP)
788 		coveredvp->v_mountedhere = NULL;
789 	CIRCLEQ_REMOVE(&mountlist, mp, mnt_list);
790 	mp->mnt_iflag |= IMNT_GONE;
791 	mutex_exit(&mountlist_lock);
792 	if (TAILQ_FIRST(&mp->mnt_vnodelist) != NULL)
793 		panic("unmount: dangling vnode");
794 	if (used_syncer)
795 		mutex_exit(&syncer_mutex);
796 	vfs_hooks_unmount(mp);
797 	rw_exit(&mp->mnt_unmounting);
798 	vfs_destroy(mp);	/* caller provided reference */
799 	vfs_destroy(mp);	/* from mount(), final nail in coffin */
800 	if (coveredvp != NULLVP)
801 		vrele(coveredvp);
802 	return (0);
803 }
804 
805 /*
806  * Sync each mounted filesystem.
807  */
808 #ifdef DEBUG
809 int syncprt = 0;
810 struct ctldebug debug0 = { "syncprt", &syncprt };
811 #endif
812 
813 /* ARGSUSED */
814 int
815 sys_sync(struct lwp *l, const void *v, register_t *retval)
816 {
817 	struct mount *mp, *nmp;
818 	int asyncflag;
819 
820 	if (l == NULL)
821 		l = &lwp0;
822 
823 	mutex_enter(&mountlist_lock);
824 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
825 	     mp = nmp) {
826 		if (vfs_busy(mp, &nmp)) {
827 			continue;
828 		}
829 		mutex_enter(&mp->mnt_updating);
830 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
831 			asyncflag = mp->mnt_flag & MNT_ASYNC;
832 			mp->mnt_flag &= ~MNT_ASYNC;
833 			VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
834 			if (asyncflag)
835 				 mp->mnt_flag |= MNT_ASYNC;
836 		}
837 		mutex_exit(&mp->mnt_updating);
838 		vfs_unbusy(mp, false, &nmp);
839 	}
840 	mutex_exit(&mountlist_lock);
841 #ifdef DEBUG
842 	if (syncprt)
843 		vfs_bufstats();
844 #endif /* DEBUG */
845 	return (0);
846 }
847 
848 /*
849  * Change filesystem quotas.
850  */
851 /* ARGSUSED */
852 int
853 sys_quotactl(struct lwp *l, const struct sys_quotactl_args *uap, register_t *retval)
854 {
855 	/* {
856 		syscallarg(const char *) path;
857 		syscallarg(int) cmd;
858 		syscallarg(int) uid;
859 		syscallarg(void *) arg;
860 	} */
861 	struct mount *mp;
862 	int error;
863 	struct nameidata nd;
864 
865 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
866 	    SCARG(uap, path));
867 	if ((error = namei(&nd)) != 0)
868 		return (error);
869 	mp = nd.ni_vp->v_mount;
870 	error = VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid),
871 	    SCARG(uap, arg));
872 	vrele(nd.ni_vp);
873 	return (error);
874 }
875 
876 int
877 dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
878     int root)
879 {
880 	struct cwdinfo *cwdi = l->l_proc->p_cwdi;
881 	int error = 0;
882 
883 	/*
884 	 * If MNT_NOWAIT or MNT_LAZY is specified, do not
885 	 * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
886 	 * overrides MNT_NOWAIT.
887 	 */
888 	if (flags == MNT_NOWAIT	|| flags == MNT_LAZY ||
889 	    (flags != MNT_WAIT && flags != 0)) {
890 		memcpy(sp, &mp->mnt_stat, sizeof(*sp));
891 		goto done;
892 	}
893 
894 	/* Get the filesystem stats now */
895 	memset(sp, 0, sizeof(*sp));
896 	if ((error = VFS_STATVFS(mp, sp)) != 0) {
897 		return error;
898 	}
899 
900 	if (cwdi->cwdi_rdir == NULL)
901 		(void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
902 done:
903 	if (cwdi->cwdi_rdir != NULL) {
904 		size_t len;
905 		char *bp;
906 		char c;
907 		char *path = PNBUF_GET();
908 
909 		bp = path + MAXPATHLEN;
910 		*--bp = '\0';
911 		rw_enter(&cwdi->cwdi_lock, RW_READER);
912 		error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
913 		    MAXPATHLEN / 2, 0, l);
914 		rw_exit(&cwdi->cwdi_lock);
915 		if (error) {
916 			PNBUF_PUT(path);
917 			return error;
918 		}
919 		len = strlen(bp);
920 		/*
921 		 * for mount points that are below our root, we can see
922 		 * them, so we fix up the pathname and return them. The
923 		 * rest we cannot see, so we don't allow viewing the
924 		 * data.
925 		 */
926 		if (strncmp(bp, sp->f_mntonname, len) == 0 &&
927 		    ((c = sp->f_mntonname[len]) == '/' || c == '\0')) {
928 			(void)strlcpy(sp->f_mntonname, &sp->f_mntonname[len],
929 			    sizeof(sp->f_mntonname));
930 			if (sp->f_mntonname[0] == '\0')
931 				(void)strlcpy(sp->f_mntonname, "/",
932 				    sizeof(sp->f_mntonname));
933 		} else {
934 			if (root)
935 				(void)strlcpy(sp->f_mntonname, "/",
936 				    sizeof(sp->f_mntonname));
937 			else
938 				error = EPERM;
939 		}
940 		PNBUF_PUT(path);
941 	}
942 	sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
943 	return error;
944 }
945 
946 /*
947  * Get filesystem statistics by path.
948  */
949 int
950 do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
951 {
952 	struct mount *mp;
953 	int error;
954 	struct nameidata nd;
955 
956 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE, path);
957 	if ((error = namei(&nd)) != 0)
958 		return error;
959 	mp = nd.ni_vp->v_mount;
960 	error = dostatvfs(mp, sb, l, flags, 1);
961 	vrele(nd.ni_vp);
962 	return error;
963 }
964 
965 /* ARGSUSED */
966 int
967 sys_statvfs1(struct lwp *l, const struct sys_statvfs1_args *uap, register_t *retval)
968 {
969 	/* {
970 		syscallarg(const char *) path;
971 		syscallarg(struct statvfs *) buf;
972 		syscallarg(int) flags;
973 	} */
974 	struct statvfs *sb;
975 	int error;
976 
977 	sb = STATVFSBUF_GET();
978 	error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
979 	if (error == 0)
980 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
981 	STATVFSBUF_PUT(sb);
982 	return error;
983 }
984 
985 /*
986  * Get filesystem statistics by fd.
987  */
988 int
989 do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
990 {
991 	file_t *fp;
992 	struct mount *mp;
993 	int error;
994 
995 	/* fd_getvnode() will use the descriptor for us */
996 	if ((error = fd_getvnode(fd, &fp)) != 0)
997 		return (error);
998 	mp = ((struct vnode *)fp->f_data)->v_mount;
999 	error = dostatvfs(mp, sb, curlwp, flags, 1);
1000 	fd_putfile(fd);
1001 	return error;
1002 }
1003 
1004 /* ARGSUSED */
1005 int
1006 sys_fstatvfs1(struct lwp *l, const struct sys_fstatvfs1_args *uap, register_t *retval)
1007 {
1008 	/* {
1009 		syscallarg(int) fd;
1010 		syscallarg(struct statvfs *) buf;
1011 		syscallarg(int) flags;
1012 	} */
1013 	struct statvfs *sb;
1014 	int error;
1015 
1016 	sb = STATVFSBUF_GET();
1017 	error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
1018 	if (error == 0)
1019 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1020 	STATVFSBUF_PUT(sb);
1021 	return error;
1022 }
1023 
1024 
1025 /*
1026  * Get statistics on all filesystems.
1027  */
1028 int
1029 do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
1030     int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
1031     register_t *retval)
1032 {
1033 	int root = 0;
1034 	struct proc *p = l->l_proc;
1035 	struct mount *mp, *nmp;
1036 	struct statvfs *sb;
1037 	size_t count, maxcount;
1038 	int error = 0;
1039 
1040 	sb = STATVFSBUF_GET();
1041 	maxcount = bufsize / entry_sz;
1042 	mutex_enter(&mountlist_lock);
1043 	count = 0;
1044 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
1045 	     mp = nmp) {
1046 		if (vfs_busy(mp, &nmp)) {
1047 			continue;
1048 		}
1049 		if (sfsp && count < maxcount) {
1050 			error = dostatvfs(mp, sb, l, flags, 0);
1051 			if (error) {
1052 				vfs_unbusy(mp, false, &nmp);
1053 				error = 0;
1054 				continue;
1055 			}
1056 			error = copyfn(sb, sfsp, entry_sz);
1057 			if (error) {
1058 				vfs_unbusy(mp, false, NULL);
1059 				goto out;
1060 			}
1061 			sfsp = (char *)sfsp + entry_sz;
1062 			root |= strcmp(sb->f_mntonname, "/") == 0;
1063 		}
1064 		count++;
1065 		vfs_unbusy(mp, false, &nmp);
1066 	}
1067 	mutex_exit(&mountlist_lock);
1068 
1069 	if (root == 0 && p->p_cwdi->cwdi_rdir) {
1070 		/*
1071 		 * fake a root entry
1072 		 */
1073 		error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
1074 		    sb, l, flags, 1);
1075 		if (error != 0)
1076 			goto out;
1077 		if (sfsp) {
1078 			error = copyfn(sb, sfsp, entry_sz);
1079 			if (error != 0)
1080 				goto out;
1081 		}
1082 		count++;
1083 	}
1084 	if (sfsp && count > maxcount)
1085 		*retval = maxcount;
1086 	else
1087 		*retval = count;
1088 out:
1089 	STATVFSBUF_PUT(sb);
1090 	return error;
1091 }
1092 
1093 int
1094 sys_getvfsstat(struct lwp *l, const struct sys_getvfsstat_args *uap, register_t *retval)
1095 {
1096 	/* {
1097 		syscallarg(struct statvfs *) buf;
1098 		syscallarg(size_t) bufsize;
1099 		syscallarg(int) flags;
1100 	} */
1101 
1102 	return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
1103 	    SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
1104 }
1105 
1106 /*
1107  * Change current working directory to a given file descriptor.
1108  */
1109 /* ARGSUSED */
1110 int
1111 sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval)
1112 {
1113 	/* {
1114 		syscallarg(int) fd;
1115 	} */
1116 	struct proc *p = l->l_proc;
1117 	struct cwdinfo *cwdi;
1118 	struct vnode *vp, *tdp;
1119 	struct mount *mp;
1120 	file_t *fp;
1121 	int error, fd;
1122 
1123 	/* fd_getvnode() will use the descriptor for us */
1124 	fd = SCARG(uap, fd);
1125 	if ((error = fd_getvnode(fd, &fp)) != 0)
1126 		return (error);
1127 	vp = fp->f_data;
1128 
1129 	VREF(vp);
1130 	vn_lock(vp,  LK_EXCLUSIVE | LK_RETRY);
1131 	if (vp->v_type != VDIR)
1132 		error = ENOTDIR;
1133 	else
1134 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1135 	if (error) {
1136 		vput(vp);
1137 		goto out;
1138 	}
1139 	while ((mp = vp->v_mountedhere) != NULL) {
1140 		error = vfs_busy(mp, NULL);
1141 		vput(vp);
1142 		if (error != 0)
1143 			goto out;
1144 		error = VFS_ROOT(mp, &tdp);
1145 		vfs_unbusy(mp, false, NULL);
1146 		if (error)
1147 			goto out;
1148 		vp = tdp;
1149 	}
1150 	VOP_UNLOCK(vp, 0);
1151 
1152 	/*
1153 	 * Disallow changing to a directory not under the process's
1154 	 * current root directory (if there is one).
1155 	 */
1156 	cwdi = p->p_cwdi;
1157 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1158 	if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
1159 		vrele(vp);
1160 		error = EPERM;	/* operation not permitted */
1161 	} else {
1162 		vrele(cwdi->cwdi_cdir);
1163 		cwdi->cwdi_cdir = vp;
1164 	}
1165 	rw_exit(&cwdi->cwdi_lock);
1166 
1167  out:
1168 	fd_putfile(fd);
1169 	return (error);
1170 }
1171 
1172 /*
1173  * Change this process's notion of the root directory to a given file
1174  * descriptor.
1175  */
1176 int
1177 sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval)
1178 {
1179 	struct proc *p = l->l_proc;
1180 	struct cwdinfo *cwdi;
1181 	struct vnode	*vp;
1182 	file_t	*fp;
1183 	int		 error, fd = SCARG(uap, fd);
1184 
1185 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1186  	    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
1187 		return error;
1188 	/* fd_getvnode() will use the descriptor for us */
1189 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
1190 		return error;
1191 	vp = fp->f_data;
1192 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1193 	if (vp->v_type != VDIR)
1194 		error = ENOTDIR;
1195 	else
1196 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1197 	VOP_UNLOCK(vp, 0);
1198 	if (error)
1199 		goto out;
1200 	VREF(vp);
1201 
1202 	/*
1203 	 * Prevent escaping from chroot by putting the root under
1204 	 * the working directory.  Silently chdir to / if we aren't
1205 	 * already there.
1206 	 */
1207 	cwdi = p->p_cwdi;
1208 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1209 	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1210 		/*
1211 		 * XXX would be more failsafe to change directory to a
1212 		 * deadfs node here instead
1213 		 */
1214 		vrele(cwdi->cwdi_cdir);
1215 		VREF(vp);
1216 		cwdi->cwdi_cdir = vp;
1217 	}
1218 
1219 	if (cwdi->cwdi_rdir != NULL)
1220 		vrele(cwdi->cwdi_rdir);
1221 	cwdi->cwdi_rdir = vp;
1222 	rw_exit(&cwdi->cwdi_lock);
1223 
1224  out:
1225 	fd_putfile(fd);
1226 	return (error);
1227 }
1228 
1229 /*
1230  * Change current working directory (``.'').
1231  */
1232 /* ARGSUSED */
1233 int
1234 sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
1235 {
1236 	/* {
1237 		syscallarg(const char *) path;
1238 	} */
1239 	struct proc *p = l->l_proc;
1240 	struct cwdinfo *cwdi;
1241 	int error;
1242 	struct nameidata nd;
1243 
1244 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
1245 	    SCARG(uap, path));
1246 	if ((error = change_dir(&nd, l)) != 0)
1247 		return (error);
1248 	cwdi = p->p_cwdi;
1249 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1250 	vrele(cwdi->cwdi_cdir);
1251 	cwdi->cwdi_cdir = nd.ni_vp;
1252 	rw_exit(&cwdi->cwdi_lock);
1253 	return (0);
1254 }
1255 
1256 /*
1257  * Change notion of root (``/'') directory.
1258  */
1259 /* ARGSUSED */
1260 int
1261 sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval)
1262 {
1263 	/* {
1264 		syscallarg(const char *) path;
1265 	} */
1266 	struct proc *p = l->l_proc;
1267 	struct cwdinfo *cwdi;
1268 	struct vnode *vp;
1269 	int error;
1270 	struct nameidata nd;
1271 
1272 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1273 	    KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
1274 		return (error);
1275 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
1276 	    SCARG(uap, path));
1277 	if ((error = change_dir(&nd, l)) != 0)
1278 		return (error);
1279 
1280 	cwdi = p->p_cwdi;
1281 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1282 	if (cwdi->cwdi_rdir != NULL)
1283 		vrele(cwdi->cwdi_rdir);
1284 	vp = nd.ni_vp;
1285 	cwdi->cwdi_rdir = vp;
1286 
1287 	/*
1288 	 * Prevent escaping from chroot by putting the root under
1289 	 * the working directory.  Silently chdir to / if we aren't
1290 	 * already there.
1291 	 */
1292 	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1293 		/*
1294 		 * XXX would be more failsafe to change directory to a
1295 		 * deadfs node here instead
1296 		 */
1297 		vrele(cwdi->cwdi_cdir);
1298 		VREF(vp);
1299 		cwdi->cwdi_cdir = vp;
1300 	}
1301 	rw_exit(&cwdi->cwdi_lock);
1302 
1303 	return (0);
1304 }
1305 
1306 /*
1307  * Common routine for chroot and chdir.
1308  */
1309 static int
1310 change_dir(struct nameidata *ndp, struct lwp *l)
1311 {
1312 	struct vnode *vp;
1313 	int error;
1314 
1315 	if ((error = namei(ndp)) != 0)
1316 		return (error);
1317 	vp = ndp->ni_vp;
1318 	if (vp->v_type != VDIR)
1319 		error = ENOTDIR;
1320 	else
1321 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1322 
1323 	if (error)
1324 		vput(vp);
1325 	else
1326 		VOP_UNLOCK(vp, 0);
1327 	return (error);
1328 }
1329 
1330 /*
1331  * Check permissions, allocate an open file structure,
1332  * and call the device open routine if any.
1333  */
1334 int
1335 sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1336 {
1337 	/* {
1338 		syscallarg(const char *) path;
1339 		syscallarg(int) flags;
1340 		syscallarg(int) mode;
1341 	} */
1342 	struct proc *p = l->l_proc;
1343 	struct cwdinfo *cwdi = p->p_cwdi;
1344 	file_t *fp;
1345 	struct vnode *vp;
1346 	int flags, cmode;
1347 	int type, indx, error;
1348 	struct flock lf;
1349 	struct nameidata nd;
1350 
1351 	flags = FFLAGS(SCARG(uap, flags));
1352 	if ((flags & (FREAD | FWRITE)) == 0)
1353 		return (EINVAL);
1354 	if ((error = fd_allocfile(&fp, &indx)) != 0)
1355 		return (error);
1356 	/* We're going to read cwdi->cwdi_cmask unlocked here. */
1357 	cmode = ((SCARG(uap, mode) &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1358 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
1359 	    SCARG(uap, path));
1360 	l->l_dupfd = -indx - 1;			/* XXX check for fdopen */
1361 	if ((error = vn_open(&nd, flags, cmode)) != 0) {
1362 		fd_abort(p, fp, indx);
1363 		if ((error == EDUPFD || error == EMOVEFD) &&
1364 		    l->l_dupfd >= 0 &&			/* XXX from fdopen */
1365 		    (error =
1366 			fd_dupopen(l->l_dupfd, &indx, flags, error)) == 0) {
1367 			*retval = indx;
1368 			return (0);
1369 		}
1370 		if (error == ERESTART)
1371 			error = EINTR;
1372 		return (error);
1373 	}
1374 
1375 	l->l_dupfd = 0;
1376 	vp = nd.ni_vp;
1377 	fp->f_flag = flags & FMASK;
1378 	fp->f_type = DTYPE_VNODE;
1379 	fp->f_ops = &vnops;
1380 	fp->f_data = vp;
1381 	if (flags & (O_EXLOCK | O_SHLOCK)) {
1382 		lf.l_whence = SEEK_SET;
1383 		lf.l_start = 0;
1384 		lf.l_len = 0;
1385 		if (flags & O_EXLOCK)
1386 			lf.l_type = F_WRLCK;
1387 		else
1388 			lf.l_type = F_RDLCK;
1389 		type = F_FLOCK;
1390 		if ((flags & FNONBLOCK) == 0)
1391 			type |= F_WAIT;
1392 		VOP_UNLOCK(vp, 0);
1393 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
1394 		if (error) {
1395 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
1396 			fd_abort(p, fp, indx);
1397 			return (error);
1398 		}
1399 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1400 		atomic_or_uint(&fp->f_flag, FHASLOCK);
1401 	}
1402 	VOP_UNLOCK(vp, 0);
1403 	*retval = indx;
1404 	fd_affix(p, fp, indx);
1405 	return (0);
1406 }
1407 
1408 static void
1409 vfs__fhfree(fhandle_t *fhp)
1410 {
1411 	size_t fhsize;
1412 
1413 	if (fhp == NULL) {
1414 		return;
1415 	}
1416 	fhsize = FHANDLE_SIZE(fhp);
1417 	kmem_free(fhp, fhsize);
1418 }
1419 
1420 /*
1421  * vfs_composefh: compose a filehandle.
1422  */
1423 
1424 int
1425 vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1426 {
1427 	struct mount *mp;
1428 	struct fid *fidp;
1429 	int error;
1430 	size_t needfhsize;
1431 	size_t fidsize;
1432 
1433 	mp = vp->v_mount;
1434 	fidp = NULL;
1435 	if (*fh_size < FHANDLE_SIZE_MIN) {
1436 		fidsize = 0;
1437 	} else {
1438 		fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1439 		if (fhp != NULL) {
1440 			memset(fhp, 0, *fh_size);
1441 			fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1442 			fidp = &fhp->fh_fid;
1443 		}
1444 	}
1445 	error = VFS_VPTOFH(vp, fidp, &fidsize);
1446 	needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1447 	if (error == 0 && *fh_size < needfhsize) {
1448 		error = E2BIG;
1449 	}
1450 	*fh_size = needfhsize;
1451 	return error;
1452 }
1453 
1454 int
1455 vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1456 {
1457 	struct mount *mp;
1458 	fhandle_t *fhp;
1459 	size_t fhsize;
1460 	size_t fidsize;
1461 	int error;
1462 
1463 	*fhpp = NULL;
1464 	mp = vp->v_mount;
1465 	fidsize = 0;
1466 	error = VFS_VPTOFH(vp, NULL, &fidsize);
1467 	KASSERT(error != 0);
1468 	if (error != E2BIG) {
1469 		goto out;
1470 	}
1471 	fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1472 	fhp = kmem_zalloc(fhsize, KM_SLEEP);
1473 	if (fhp == NULL) {
1474 		error = ENOMEM;
1475 		goto out;
1476 	}
1477 	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1478 	error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1479 	if (error == 0) {
1480 		KASSERT((FHANDLE_SIZE(fhp) == fhsize &&
1481 		    FHANDLE_FILEID(fhp)->fid_len == fidsize));
1482 		*fhpp = fhp;
1483 	} else {
1484 		kmem_free(fhp, fhsize);
1485 	}
1486 out:
1487 	return error;
1488 }
1489 
1490 void
1491 vfs_composefh_free(fhandle_t *fhp)
1492 {
1493 
1494 	vfs__fhfree(fhp);
1495 }
1496 
1497 /*
1498  * vfs_fhtovp: lookup a vnode by a filehandle.
1499  */
1500 
1501 int
1502 vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
1503 {
1504 	struct mount *mp;
1505 	int error;
1506 
1507 	*vpp = NULL;
1508 	mp = vfs_getvfs(FHANDLE_FSID(fhp));
1509 	if (mp == NULL) {
1510 		error = ESTALE;
1511 		goto out;
1512 	}
1513 	if (mp->mnt_op->vfs_fhtovp == NULL) {
1514 		error = EOPNOTSUPP;
1515 		goto out;
1516 	}
1517 	error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), vpp);
1518 out:
1519 	return error;
1520 }
1521 
1522 /*
1523  * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
1524  * the needed size.
1525  */
1526 
1527 int
1528 vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
1529 {
1530 	fhandle_t *fhp;
1531 	int error;
1532 
1533 	*fhpp = NULL;
1534 	if (fhsize > FHANDLE_SIZE_MAX) {
1535 		return EINVAL;
1536 	}
1537 	if (fhsize < FHANDLE_SIZE_MIN) {
1538 		return EINVAL;
1539 	}
1540 again:
1541 	fhp = kmem_alloc(fhsize, KM_SLEEP);
1542 	if (fhp == NULL) {
1543 		return ENOMEM;
1544 	}
1545 	error = copyin(ufhp, fhp, fhsize);
1546 	if (error == 0) {
1547 		/* XXX this check shouldn't be here */
1548 		if (FHANDLE_SIZE(fhp) == fhsize) {
1549 			*fhpp = fhp;
1550 			return 0;
1551 		} else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
1552 			/*
1553 			 * a kludge for nfsv2 padded handles.
1554 			 */
1555 			size_t sz;
1556 
1557 			sz = FHANDLE_SIZE(fhp);
1558 			kmem_free(fhp, fhsize);
1559 			fhsize = sz;
1560 			goto again;
1561 		} else {
1562 			/*
1563 			 * userland told us wrong size.
1564 			 */
1565 		    	error = EINVAL;
1566 		}
1567 	}
1568 	kmem_free(fhp, fhsize);
1569 	return error;
1570 }
1571 
1572 void
1573 vfs_copyinfh_free(fhandle_t *fhp)
1574 {
1575 
1576 	vfs__fhfree(fhp);
1577 }
1578 
1579 /*
1580  * Get file handle system call
1581  */
1582 int
1583 sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval)
1584 {
1585 	/* {
1586 		syscallarg(char *) fname;
1587 		syscallarg(fhandle_t *) fhp;
1588 		syscallarg(size_t *) fh_size;
1589 	} */
1590 	struct vnode *vp;
1591 	fhandle_t *fh;
1592 	int error;
1593 	struct nameidata nd;
1594 	size_t sz;
1595 	size_t usz;
1596 
1597 	/*
1598 	 * Must be super user
1599 	 */
1600 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1601 	    0, NULL, NULL, NULL);
1602 	if (error)
1603 		return (error);
1604 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
1605 	    SCARG(uap, fname));
1606 	error = namei(&nd);
1607 	if (error)
1608 		return (error);
1609 	vp = nd.ni_vp;
1610 	error = vfs_composefh_alloc(vp, &fh);
1611 	vput(vp);
1612 	if (error != 0) {
1613 		goto out;
1614 	}
1615 	error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
1616 	if (error != 0) {
1617 		goto out;
1618 	}
1619 	sz = FHANDLE_SIZE(fh);
1620 	error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
1621 	if (error != 0) {
1622 		goto out;
1623 	}
1624 	if (usz >= sz) {
1625 		error = copyout(fh, SCARG(uap, fhp), sz);
1626 	} else {
1627 		error = E2BIG;
1628 	}
1629 out:
1630 	vfs_composefh_free(fh);
1631 	return (error);
1632 }
1633 
1634 /*
1635  * Open a file given a file handle.
1636  *
1637  * Check permissions, allocate an open file structure,
1638  * and call the device open routine if any.
1639  */
1640 
1641 int
1642 dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
1643     register_t *retval)
1644 {
1645 	file_t *fp;
1646 	struct vnode *vp = NULL;
1647 	kauth_cred_t cred = l->l_cred;
1648 	file_t *nfp;
1649 	int type, indx, error=0;
1650 	struct flock lf;
1651 	struct vattr va;
1652 	fhandle_t *fh;
1653 	int flags;
1654 	proc_t *p;
1655 
1656 	p = curproc;
1657 
1658 	/*
1659 	 * Must be super user
1660 	 */
1661 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1662 	    0, NULL, NULL, NULL)))
1663 		return (error);
1664 
1665 	flags = FFLAGS(oflags);
1666 	if ((flags & (FREAD | FWRITE)) == 0)
1667 		return (EINVAL);
1668 	if ((flags & O_CREAT))
1669 		return (EINVAL);
1670 	if ((error = fd_allocfile(&nfp, &indx)) != 0)
1671 		return (error);
1672 	fp = nfp;
1673 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1674 	if (error != 0) {
1675 		goto bad;
1676 	}
1677 	error = vfs_fhtovp(fh, &vp);
1678 	if (error != 0) {
1679 		goto bad;
1680 	}
1681 
1682 	/* Now do an effective vn_open */
1683 
1684 	if (vp->v_type == VSOCK) {
1685 		error = EOPNOTSUPP;
1686 		goto bad;
1687 	}
1688 	error = vn_openchk(vp, cred, flags);
1689 	if (error != 0)
1690 		goto bad;
1691 	if (flags & O_TRUNC) {
1692 		VOP_UNLOCK(vp, 0);			/* XXX */
1693 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
1694 		VATTR_NULL(&va);
1695 		va.va_size = 0;
1696 		error = VOP_SETATTR(vp, &va, cred);
1697 		if (error)
1698 			goto bad;
1699 	}
1700 	if ((error = VOP_OPEN(vp, flags, cred)) != 0)
1701 		goto bad;
1702 	if (flags & FWRITE) {
1703 		mutex_enter(&vp->v_interlock);
1704 		vp->v_writecount++;
1705 		mutex_exit(&vp->v_interlock);
1706 	}
1707 
1708 	/* done with modified vn_open, now finish what sys_open does. */
1709 
1710 	fp->f_flag = flags & FMASK;
1711 	fp->f_type = DTYPE_VNODE;
1712 	fp->f_ops = &vnops;
1713 	fp->f_data = vp;
1714 	if (flags & (O_EXLOCK | O_SHLOCK)) {
1715 		lf.l_whence = SEEK_SET;
1716 		lf.l_start = 0;
1717 		lf.l_len = 0;
1718 		if (flags & O_EXLOCK)
1719 			lf.l_type = F_WRLCK;
1720 		else
1721 			lf.l_type = F_RDLCK;
1722 		type = F_FLOCK;
1723 		if ((flags & FNONBLOCK) == 0)
1724 			type |= F_WAIT;
1725 		VOP_UNLOCK(vp, 0);
1726 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
1727 		if (error) {
1728 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
1729 			fd_abort(p, fp, indx);
1730 			return (error);
1731 		}
1732 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1733 		atomic_or_uint(&fp->f_flag, FHASLOCK);
1734 	}
1735 	VOP_UNLOCK(vp, 0);
1736 	*retval = indx;
1737 	fd_affix(p, fp, indx);
1738 	vfs_copyinfh_free(fh);
1739 	return (0);
1740 
1741 bad:
1742 	fd_abort(p, fp, indx);
1743 	if (vp != NULL)
1744 		vput(vp);
1745 	vfs_copyinfh_free(fh);
1746 	return (error);
1747 }
1748 
1749 int
1750 sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval)
1751 {
1752 	/* {
1753 		syscallarg(const void *) fhp;
1754 		syscallarg(size_t) fh_size;
1755 		syscallarg(int) flags;
1756 	} */
1757 
1758 	return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
1759 	    SCARG(uap, flags), retval);
1760 }
1761 
1762 int
1763 do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
1764 {
1765 	int error;
1766 	fhandle_t *fh;
1767 	struct vnode *vp;
1768 
1769 	/*
1770 	 * Must be super user
1771 	 */
1772 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1773 	    0, NULL, NULL, NULL)))
1774 		return (error);
1775 
1776 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1777 	if (error != 0)
1778 		return error;
1779 
1780 	error = vfs_fhtovp(fh, &vp);
1781 	vfs_copyinfh_free(fh);
1782 	if (error != 0)
1783 		return error;
1784 
1785 	error = vn_stat(vp, sb);
1786 	vput(vp);
1787 	return error;
1788 }
1789 
1790 
1791 /* ARGSUSED */
1792 int
1793 sys___fhstat40(struct lwp *l, const struct sys___fhstat40_args *uap, register_t *retval)
1794 {
1795 	/* {
1796 		syscallarg(const void *) fhp;
1797 		syscallarg(size_t) fh_size;
1798 		syscallarg(struct stat *) sb;
1799 	} */
1800 	struct stat sb;
1801 	int error;
1802 
1803 	error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
1804 	if (error)
1805 		return error;
1806 	return copyout(&sb, SCARG(uap, sb), sizeof(sb));
1807 }
1808 
1809 int
1810 do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb,
1811     int flags)
1812 {
1813 	fhandle_t *fh;
1814 	struct mount *mp;
1815 	struct vnode *vp;
1816 	int error;
1817 
1818 	/*
1819 	 * Must be super user
1820 	 */
1821 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1822 	    0, NULL, NULL, NULL)))
1823 		return error;
1824 
1825 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1826 	if (error != 0)
1827 		return error;
1828 
1829 	error = vfs_fhtovp(fh, &vp);
1830 	vfs_copyinfh_free(fh);
1831 	if (error != 0)
1832 		return error;
1833 
1834 	mp = vp->v_mount;
1835 	error = dostatvfs(mp, sb, l, flags, 1);
1836 	vput(vp);
1837 	return error;
1838 }
1839 
1840 /* ARGSUSED */
1841 int
1842 sys___fhstatvfs140(struct lwp *l, const struct sys___fhstatvfs140_args *uap, register_t *retval)
1843 {
1844 	/* {
1845 		syscallarg(const void *) fhp;
1846 		syscallarg(size_t) fh_size;
1847 		syscallarg(struct statvfs *) buf;
1848 		syscallarg(int)	flags;
1849 	} */
1850 	struct statvfs *sb = STATVFSBUF_GET();
1851 	int error;
1852 
1853 	error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
1854 	    SCARG(uap, flags));
1855 	if (error == 0)
1856 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1857 	STATVFSBUF_PUT(sb);
1858 	return error;
1859 }
1860 
1861 /*
1862  * Create a special file.
1863  */
1864 /* ARGSUSED */
1865 int
1866 sys_mknod(struct lwp *l, const struct sys_mknod_args *uap, register_t *retval)
1867 {
1868 	/* {
1869 		syscallarg(const char *) path;
1870 		syscallarg(int) mode;
1871 		syscallarg(int) dev;
1872 	} */
1873 	struct proc *p = l->l_proc;
1874 	struct vnode *vp;
1875 	struct vattr vattr;
1876 	int error, optype;
1877 	struct nameidata nd;
1878 	char *path;
1879 	const char *cpath;
1880 	enum uio_seg seg = UIO_USERSPACE;
1881 
1882 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
1883 	    0, NULL, NULL, NULL)) != 0)
1884 		return (error);
1885 
1886 	optype = VOP_MKNOD_DESCOFFSET;
1887 
1888 	VERIEXEC_PATH_GET(SCARG(uap, path), seg, cpath, path);
1889 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, seg, cpath);
1890 
1891 	if ((error = namei(&nd)) != 0)
1892 		goto out;
1893 	vp = nd.ni_vp;
1894 	if (vp != NULL)
1895 		error = EEXIST;
1896 	else {
1897 		VATTR_NULL(&vattr);
1898 		/* We will read cwdi->cwdi_cmask unlocked. */
1899 		vattr.va_mode =
1900 		    (SCARG(uap, mode) & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
1901 		vattr.va_rdev = SCARG(uap, dev);
1902 
1903 		switch (SCARG(uap, mode) & S_IFMT) {
1904 		case S_IFMT:	/* used by badsect to flag bad sectors */
1905 			vattr.va_type = VBAD;
1906 			break;
1907 		case S_IFCHR:
1908 			vattr.va_type = VCHR;
1909 			break;
1910 		case S_IFBLK:
1911 			vattr.va_type = VBLK;
1912 			break;
1913 		case S_IFWHT:
1914 			optype = VOP_WHITEOUT_DESCOFFSET;
1915 			break;
1916 		case S_IFREG:
1917 #if NVERIEXEC > 0
1918 			error = veriexec_openchk(l, nd.ni_vp, nd.ni_dirp,
1919 			    O_CREAT);
1920 #endif /* NVERIEXEC > 0 */
1921 			vattr.va_type = VREG;
1922 			vattr.va_rdev = VNOVAL;
1923 			optype = VOP_CREATE_DESCOFFSET;
1924 			break;
1925 		default:
1926 			error = EINVAL;
1927 			break;
1928 		}
1929 	}
1930 	if (!error) {
1931 		switch (optype) {
1932 		case VOP_WHITEOUT_DESCOFFSET:
1933 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1934 			if (error)
1935 				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1936 			vput(nd.ni_dvp);
1937 			break;
1938 
1939 		case VOP_MKNOD_DESCOFFSET:
1940 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1941 						&nd.ni_cnd, &vattr);
1942 			if (error == 0)
1943 				vput(nd.ni_vp);
1944 			break;
1945 
1946 		case VOP_CREATE_DESCOFFSET:
1947 			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
1948 						&nd.ni_cnd, &vattr);
1949 			if (error == 0)
1950 				vput(nd.ni_vp);
1951 			break;
1952 		}
1953 	} else {
1954 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1955 		if (nd.ni_dvp == vp)
1956 			vrele(nd.ni_dvp);
1957 		else
1958 			vput(nd.ni_dvp);
1959 		if (vp)
1960 			vrele(vp);
1961 	}
1962 out:
1963 	VERIEXEC_PATH_PUT(path);
1964 	return (error);
1965 }
1966 
1967 /*
1968  * Create a named pipe.
1969  */
1970 /* ARGSUSED */
1971 int
1972 sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval)
1973 {
1974 	/* {
1975 		syscallarg(const char *) path;
1976 		syscallarg(int) mode;
1977 	} */
1978 	struct proc *p = l->l_proc;
1979 	struct vattr vattr;
1980 	int error;
1981 	struct nameidata nd;
1982 
1983 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, UIO_USERSPACE,
1984 	    SCARG(uap, path));
1985 	if ((error = namei(&nd)) != 0)
1986 		return (error);
1987 	if (nd.ni_vp != NULL) {
1988 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1989 		if (nd.ni_dvp == nd.ni_vp)
1990 			vrele(nd.ni_dvp);
1991 		else
1992 			vput(nd.ni_dvp);
1993 		vrele(nd.ni_vp);
1994 		return (EEXIST);
1995 	}
1996 	VATTR_NULL(&vattr);
1997 	vattr.va_type = VFIFO;
1998 	/* We will read cwdi->cwdi_cmask unlocked. */
1999 	vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2000 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
2001 	if (error == 0)
2002 		vput(nd.ni_vp);
2003 	return (error);
2004 }
2005 
2006 /*
2007  * Make a hard file link.
2008  */
2009 /* ARGSUSED */
2010 int
2011 sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
2012 {
2013 	/* {
2014 		syscallarg(const char *) path;
2015 		syscallarg(const char *) link;
2016 	} */
2017 	struct vnode *vp;
2018 	struct nameidata nd;
2019 	int error;
2020 
2021 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
2022 	    SCARG(uap, path));
2023 	if ((error = namei(&nd)) != 0)
2024 		return (error);
2025 	vp = nd.ni_vp;
2026 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, UIO_USERSPACE,
2027 	    SCARG(uap, link));
2028 	if ((error = namei(&nd)) != 0)
2029 		goto out;
2030 	if (nd.ni_vp) {
2031 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2032 		if (nd.ni_dvp == nd.ni_vp)
2033 			vrele(nd.ni_dvp);
2034 		else
2035 			vput(nd.ni_dvp);
2036 		vrele(nd.ni_vp);
2037 		error = EEXIST;
2038 		goto out;
2039 	}
2040 	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
2041 out:
2042 	vrele(vp);
2043 	return (error);
2044 }
2045 
2046 /*
2047  * Make a symbolic link.
2048  */
2049 /* ARGSUSED */
2050 int
2051 sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
2052 {
2053 	/* {
2054 		syscallarg(const char *) path;
2055 		syscallarg(const char *) link;
2056 	} */
2057 	struct proc *p = l->l_proc;
2058 	struct vattr vattr;
2059 	char *path;
2060 	int error;
2061 	struct nameidata nd;
2062 
2063 	path = PNBUF_GET();
2064 	error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL);
2065 	if (error)
2066 		goto out;
2067 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, UIO_USERSPACE,
2068 	    SCARG(uap, link));
2069 	if ((error = namei(&nd)) != 0)
2070 		goto out;
2071 	if (nd.ni_vp) {
2072 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2073 		if (nd.ni_dvp == nd.ni_vp)
2074 			vrele(nd.ni_dvp);
2075 		else
2076 			vput(nd.ni_dvp);
2077 		vrele(nd.ni_vp);
2078 		error = EEXIST;
2079 		goto out;
2080 	}
2081 	VATTR_NULL(&vattr);
2082 	vattr.va_type = VLNK;
2083 	/* We will read cwdi->cwdi_cmask unlocked. */
2084 	vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
2085 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
2086 	if (error == 0)
2087 		vput(nd.ni_vp);
2088 out:
2089 	PNBUF_PUT(path);
2090 	return (error);
2091 }
2092 
2093 /*
2094  * Delete a whiteout from the filesystem.
2095  */
2096 /* ARGSUSED */
2097 int
2098 sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval)
2099 {
2100 	/* {
2101 		syscallarg(const char *) path;
2102 	} */
2103 	int error;
2104 	struct nameidata nd;
2105 
2106 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT,
2107 	    UIO_USERSPACE, SCARG(uap, path));
2108 	error = namei(&nd);
2109 	if (error)
2110 		return (error);
2111 
2112 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2113 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2114 		if (nd.ni_dvp == nd.ni_vp)
2115 			vrele(nd.ni_dvp);
2116 		else
2117 			vput(nd.ni_dvp);
2118 		if (nd.ni_vp)
2119 			vrele(nd.ni_vp);
2120 		return (EEXIST);
2121 	}
2122 	if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
2123 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2124 	vput(nd.ni_dvp);
2125 	return (error);
2126 }
2127 
2128 /*
2129  * Delete a name from the filesystem.
2130  */
2131 /* ARGSUSED */
2132 int
2133 sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval)
2134 {
2135 	/* {
2136 		syscallarg(const char *) path;
2137 	} */
2138 
2139 	return do_sys_unlink(SCARG(uap, path), UIO_USERSPACE);
2140 }
2141 
2142 int
2143 do_sys_unlink(const char *arg, enum uio_seg seg)
2144 {
2145 	struct vnode *vp;
2146 	int error;
2147 	struct nameidata nd;
2148 	kauth_cred_t cred;
2149 	char *path;
2150 	const char *cpath;
2151 
2152 	VERIEXEC_PATH_GET(arg, seg, cpath, path);
2153 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, seg, cpath);
2154 
2155 	if ((error = namei(&nd)) != 0)
2156 		goto out;
2157 	vp = nd.ni_vp;
2158 
2159 	/*
2160 	 * The root of a mounted filesystem cannot be deleted.
2161 	 */
2162 	if (vp->v_vflag & VV_ROOT) {
2163 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2164 		if (nd.ni_dvp == vp)
2165 			vrele(nd.ni_dvp);
2166 		else
2167 			vput(nd.ni_dvp);
2168 		vput(vp);
2169 		error = EBUSY;
2170 		goto out;
2171 	}
2172 
2173 #if NVERIEXEC > 0
2174 	/* Handle remove requests for veriexec entries. */
2175 	if ((error = veriexec_removechk(curlwp, nd.ni_vp, nd.ni_dirp)) != 0) {
2176 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2177 		if (nd.ni_dvp == vp)
2178 			vrele(nd.ni_dvp);
2179 		else
2180 			vput(nd.ni_dvp);
2181 		vput(vp);
2182 		goto out;
2183 	}
2184 #endif /* NVERIEXEC > 0 */
2185 
2186 	cred = kauth_cred_get();
2187 #ifdef FILEASSOC
2188 	(void)fileassoc_file_delete(vp);
2189 #endif /* FILEASSOC */
2190 	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2191 out:
2192 	VERIEXEC_PATH_PUT(path);
2193 	return (error);
2194 }
2195 
2196 /*
2197  * Reposition read/write file offset.
2198  */
2199 int
2200 sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
2201 {
2202 	/* {
2203 		syscallarg(int) fd;
2204 		syscallarg(int) pad;
2205 		syscallarg(off_t) offset;
2206 		syscallarg(int) whence;
2207 	} */
2208 	kauth_cred_t cred = l->l_cred;
2209 	file_t *fp;
2210 	struct vnode *vp;
2211 	struct vattr vattr;
2212 	off_t newoff;
2213 	int error, fd;
2214 
2215 	fd = SCARG(uap, fd);
2216 
2217 	if ((fp = fd_getfile(fd)) == NULL)
2218 		return (EBADF);
2219 
2220 	vp = fp->f_data;
2221 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2222 		error = ESPIPE;
2223 		goto out;
2224 	}
2225 
2226 	switch (SCARG(uap, whence)) {
2227 	case SEEK_CUR:
2228 		newoff = fp->f_offset + SCARG(uap, offset);
2229 		break;
2230 	case SEEK_END:
2231 		error = VOP_GETATTR(vp, &vattr, cred);
2232 		if (error) {
2233 			goto out;
2234 		}
2235 		newoff = SCARG(uap, offset) + vattr.va_size;
2236 		break;
2237 	case SEEK_SET:
2238 		newoff = SCARG(uap, offset);
2239 		break;
2240 	default:
2241 		error = EINVAL;
2242 		goto out;
2243 	}
2244 	if ((error = VOP_SEEK(vp, fp->f_offset, newoff, cred)) == 0) {
2245 		*(off_t *)retval = fp->f_offset = newoff;
2246 	}
2247  out:
2248  	fd_putfile(fd);
2249 	return (error);
2250 }
2251 
2252 /*
2253  * Positional read system call.
2254  */
2255 int
2256 sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
2257 {
2258 	/* {
2259 		syscallarg(int) fd;
2260 		syscallarg(void *) buf;
2261 		syscallarg(size_t) nbyte;
2262 		syscallarg(off_t) offset;
2263 	} */
2264 	file_t *fp;
2265 	struct vnode *vp;
2266 	off_t offset;
2267 	int error, fd = SCARG(uap, fd);
2268 
2269 	if ((fp = fd_getfile(fd)) == NULL)
2270 		return (EBADF);
2271 
2272 	if ((fp->f_flag & FREAD) == 0) {
2273 		fd_putfile(fd);
2274 		return (EBADF);
2275 	}
2276 
2277 	vp = fp->f_data;
2278 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2279 		error = ESPIPE;
2280 		goto out;
2281 	}
2282 
2283 	offset = SCARG(uap, offset);
2284 
2285 	/*
2286 	 * XXX This works because no file systems actually
2287 	 * XXX take any action on the seek operation.
2288 	 */
2289 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2290 		goto out;
2291 
2292 	/* dofileread() will unuse the descriptor for us */
2293 	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2294 	    &offset, 0, retval));
2295 
2296  out:
2297 	fd_putfile(fd);
2298 	return (error);
2299 }
2300 
2301 /*
2302  * Positional scatter read system call.
2303  */
2304 int
2305 sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval)
2306 {
2307 	/* {
2308 		syscallarg(int) fd;
2309 		syscallarg(const struct iovec *) iovp;
2310 		syscallarg(int) iovcnt;
2311 		syscallarg(off_t) offset;
2312 	} */
2313 	off_t offset = SCARG(uap, offset);
2314 
2315 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
2316 	    SCARG(uap, iovcnt), &offset, 0, retval);
2317 }
2318 
2319 /*
2320  * Positional write system call.
2321  */
2322 int
2323 sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval)
2324 {
2325 	/* {
2326 		syscallarg(int) fd;
2327 		syscallarg(const void *) buf;
2328 		syscallarg(size_t) nbyte;
2329 		syscallarg(off_t) offset;
2330 	} */
2331 	file_t *fp;
2332 	struct vnode *vp;
2333 	off_t offset;
2334 	int error, fd = SCARG(uap, fd);
2335 
2336 	if ((fp = fd_getfile(fd)) == NULL)
2337 		return (EBADF);
2338 
2339 	if ((fp->f_flag & FWRITE) == 0) {
2340 		fd_putfile(fd);
2341 		return (EBADF);
2342 	}
2343 
2344 	vp = fp->f_data;
2345 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2346 		error = ESPIPE;
2347 		goto out;
2348 	}
2349 
2350 	offset = SCARG(uap, offset);
2351 
2352 	/*
2353 	 * XXX This works because no file systems actually
2354 	 * XXX take any action on the seek operation.
2355 	 */
2356 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2357 		goto out;
2358 
2359 	/* dofilewrite() will unuse the descriptor for us */
2360 	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2361 	    &offset, 0, retval));
2362 
2363  out:
2364 	fd_putfile(fd);
2365 	return (error);
2366 }
2367 
2368 /*
2369  * Positional gather write system call.
2370  */
2371 int
2372 sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval)
2373 {
2374 	/* {
2375 		syscallarg(int) fd;
2376 		syscallarg(const struct iovec *) iovp;
2377 		syscallarg(int) iovcnt;
2378 		syscallarg(off_t) offset;
2379 	} */
2380 	off_t offset = SCARG(uap, offset);
2381 
2382 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
2383 	    SCARG(uap, iovcnt), &offset, 0, retval);
2384 }
2385 
2386 /*
2387  * Check access permissions.
2388  */
2389 int
2390 sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval)
2391 {
2392 	/* {
2393 		syscallarg(const char *) path;
2394 		syscallarg(int) flags;
2395 	} */
2396 	kauth_cred_t cred;
2397 	struct vnode *vp;
2398 	int error, flags;
2399 	struct nameidata nd;
2400 
2401 	cred = kauth_cred_dup(l->l_cred);
2402 	kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
2403 	kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
2404 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
2405 	    SCARG(uap, path));
2406 	/* Override default credentials */
2407 	nd.ni_cnd.cn_cred = cred;
2408 	if ((error = namei(&nd)) != 0)
2409 		goto out;
2410 	vp = nd.ni_vp;
2411 
2412 	/* Flags == 0 means only check for existence. */
2413 	if (SCARG(uap, flags)) {
2414 		flags = 0;
2415 		if (SCARG(uap, flags) & R_OK)
2416 			flags |= VREAD;
2417 		if (SCARG(uap, flags) & W_OK)
2418 			flags |= VWRITE;
2419 		if (SCARG(uap, flags) & X_OK)
2420 			flags |= VEXEC;
2421 
2422 		error = VOP_ACCESS(vp, flags, cred);
2423 		if (!error && (flags & VWRITE))
2424 			error = vn_writechk(vp);
2425 	}
2426 	vput(vp);
2427 out:
2428 	kauth_cred_free(cred);
2429 	return (error);
2430 }
2431 
2432 /*
2433  * Common code for all sys_stat functions, including compat versions.
2434  */
2435 int
2436 do_sys_stat(const char *path, unsigned int nd_flags, struct stat *sb)
2437 {
2438 	int error;
2439 	struct nameidata nd;
2440 
2441 	NDINIT(&nd, LOOKUP, nd_flags | LOCKLEAF | TRYEMULROOT,
2442 	    UIO_USERSPACE, path);
2443 	error = namei(&nd);
2444 	if (error != 0)
2445 		return error;
2446 	error = vn_stat(nd.ni_vp, sb);
2447 	vput(nd.ni_vp);
2448 	return error;
2449 }
2450 
2451 /*
2452  * Get file status; this version follows links.
2453  */
2454 /* ARGSUSED */
2455 int
2456 sys___stat30(struct lwp *l, const struct sys___stat30_args *uap, register_t *retval)
2457 {
2458 	/* {
2459 		syscallarg(const char *) path;
2460 		syscallarg(struct stat *) ub;
2461 	} */
2462 	struct stat sb;
2463 	int error;
2464 
2465 	error = do_sys_stat(SCARG(uap, path), FOLLOW, &sb);
2466 	if (error)
2467 		return error;
2468 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
2469 }
2470 
2471 /*
2472  * Get file status; this version does not follow links.
2473  */
2474 /* ARGSUSED */
2475 int
2476 sys___lstat30(struct lwp *l, const struct sys___lstat30_args *uap, register_t *retval)
2477 {
2478 	/* {
2479 		syscallarg(const char *) path;
2480 		syscallarg(struct stat *) ub;
2481 	} */
2482 	struct stat sb;
2483 	int error;
2484 
2485 	error = do_sys_stat(SCARG(uap, path), NOFOLLOW, &sb);
2486 	if (error)
2487 		return error;
2488 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
2489 }
2490 
2491 /*
2492  * Get configurable pathname variables.
2493  */
2494 /* ARGSUSED */
2495 int
2496 sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap, register_t *retval)
2497 {
2498 	/* {
2499 		syscallarg(const char *) path;
2500 		syscallarg(int) name;
2501 	} */
2502 	int error;
2503 	struct nameidata nd;
2504 
2505 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
2506 	    SCARG(uap, path));
2507 	if ((error = namei(&nd)) != 0)
2508 		return (error);
2509 	error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), retval);
2510 	vput(nd.ni_vp);
2511 	return (error);
2512 }
2513 
2514 /*
2515  * Return target name of a symbolic link.
2516  */
2517 /* ARGSUSED */
2518 int
2519 sys_readlink(struct lwp *l, const struct sys_readlink_args *uap, register_t *retval)
2520 {
2521 	/* {
2522 		syscallarg(const char *) path;
2523 		syscallarg(char *) buf;
2524 		syscallarg(size_t) count;
2525 	} */
2526 	struct vnode *vp;
2527 	struct iovec aiov;
2528 	struct uio auio;
2529 	int error;
2530 	struct nameidata nd;
2531 
2532 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
2533 	    SCARG(uap, path));
2534 	if ((error = namei(&nd)) != 0)
2535 		return (error);
2536 	vp = nd.ni_vp;
2537 	if (vp->v_type != VLNK)
2538 		error = EINVAL;
2539 	else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
2540 	    (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
2541 		aiov.iov_base = SCARG(uap, buf);
2542 		aiov.iov_len = SCARG(uap, count);
2543 		auio.uio_iov = &aiov;
2544 		auio.uio_iovcnt = 1;
2545 		auio.uio_offset = 0;
2546 		auio.uio_rw = UIO_READ;
2547 		KASSERT(l == curlwp);
2548 		auio.uio_vmspace = l->l_proc->p_vmspace;
2549 		auio.uio_resid = SCARG(uap, count);
2550 		error = VOP_READLINK(vp, &auio, l->l_cred);
2551 	}
2552 	vput(vp);
2553 	*retval = SCARG(uap, count) - auio.uio_resid;
2554 	return (error);
2555 }
2556 
2557 /*
2558  * Change flags of a file given a path name.
2559  */
2560 /* ARGSUSED */
2561 int
2562 sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval)
2563 {
2564 	/* {
2565 		syscallarg(const char *) path;
2566 		syscallarg(u_long) flags;
2567 	} */
2568 	struct vnode *vp;
2569 	int error;
2570 	struct nameidata nd;
2571 
2572 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
2573 	    SCARG(uap, path));
2574 	if ((error = namei(&nd)) != 0)
2575 		return (error);
2576 	vp = nd.ni_vp;
2577 	error = change_flags(vp, SCARG(uap, flags), l);
2578 	vput(vp);
2579 	return (error);
2580 }
2581 
2582 /*
2583  * Change flags of a file given a file descriptor.
2584  */
2585 /* ARGSUSED */
2586 int
2587 sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval)
2588 {
2589 	/* {
2590 		syscallarg(int) fd;
2591 		syscallarg(u_long) flags;
2592 	} */
2593 	struct vnode *vp;
2594 	file_t *fp;
2595 	int error;
2596 
2597 	/* fd_getvnode() will use the descriptor for us */
2598 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2599 		return (error);
2600 	vp = fp->f_data;
2601 	error = change_flags(vp, SCARG(uap, flags), l);
2602 	VOP_UNLOCK(vp, 0);
2603 	fd_putfile(SCARG(uap, fd));
2604 	return (error);
2605 }
2606 
2607 /*
2608  * Change flags of a file given a path name; this version does
2609  * not follow links.
2610  */
2611 int
2612 sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval)
2613 {
2614 	/* {
2615 		syscallarg(const char *) path;
2616 		syscallarg(u_long) flags;
2617 	} */
2618 	struct vnode *vp;
2619 	int error;
2620 	struct nameidata nd;
2621 
2622 	NDINIT(&nd, LOOKUP, NOFOLLOW | TRYEMULROOT, UIO_USERSPACE,
2623 	    SCARG(uap, path));
2624 	if ((error = namei(&nd)) != 0)
2625 		return (error);
2626 	vp = nd.ni_vp;
2627 	error = change_flags(vp, SCARG(uap, flags), l);
2628 	vput(vp);
2629 	return (error);
2630 }
2631 
2632 /*
2633  * Common routine to change flags of a file.
2634  */
2635 int
2636 change_flags(struct vnode *vp, u_long flags, struct lwp *l)
2637 {
2638 	struct vattr vattr;
2639 	int error;
2640 
2641 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2642 	/*
2643 	 * Non-superusers cannot change the flags on devices, even if they
2644 	 * own them.
2645 	 */
2646 	if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, NULL)) {
2647 		if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
2648 			goto out;
2649 		if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
2650 			error = EINVAL;
2651 			goto out;
2652 		}
2653 	}
2654 	VATTR_NULL(&vattr);
2655 	vattr.va_flags = flags;
2656 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2657 out:
2658 	return (error);
2659 }
2660 
2661 /*
2662  * Change mode of a file given path name; this version follows links.
2663  */
2664 /* ARGSUSED */
2665 int
2666 sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
2667 {
2668 	/* {
2669 		syscallarg(const char *) path;
2670 		syscallarg(int) mode;
2671 	} */
2672 	int error;
2673 	struct nameidata nd;
2674 
2675 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
2676 	    SCARG(uap, path));
2677 	if ((error = namei(&nd)) != 0)
2678 		return (error);
2679 
2680 	error = change_mode(nd.ni_vp, SCARG(uap, mode), l);
2681 
2682 	vrele(nd.ni_vp);
2683 	return (error);
2684 }
2685 
2686 /*
2687  * Change mode of a file given a file descriptor.
2688  */
2689 /* ARGSUSED */
2690 int
2691 sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval)
2692 {
2693 	/* {
2694 		syscallarg(int) fd;
2695 		syscallarg(int) mode;
2696 	} */
2697 	file_t *fp;
2698 	int error;
2699 
2700 	/* fd_getvnode() will use the descriptor for us */
2701 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2702 		return (error);
2703 	error = change_mode(fp->f_data, SCARG(uap, mode), l);
2704 	fd_putfile(SCARG(uap, fd));
2705 	return (error);
2706 }
2707 
2708 /*
2709  * Change mode of a file given path name; this version does not follow links.
2710  */
2711 /* ARGSUSED */
2712 int
2713 sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval)
2714 {
2715 	/* {
2716 		syscallarg(const char *) path;
2717 		syscallarg(int) mode;
2718 	} */
2719 	int error;
2720 	struct nameidata nd;
2721 
2722 	NDINIT(&nd, LOOKUP, NOFOLLOW | TRYEMULROOT, UIO_USERSPACE,
2723 	    SCARG(uap, path));
2724 	if ((error = namei(&nd)) != 0)
2725 		return (error);
2726 
2727 	error = change_mode(nd.ni_vp, SCARG(uap, mode), l);
2728 
2729 	vrele(nd.ni_vp);
2730 	return (error);
2731 }
2732 
2733 /*
2734  * Common routine to set mode given a vnode.
2735  */
2736 static int
2737 change_mode(struct vnode *vp, int mode, struct lwp *l)
2738 {
2739 	struct vattr vattr;
2740 	int error;
2741 
2742 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2743 	VATTR_NULL(&vattr);
2744 	vattr.va_mode = mode & ALLPERMS;
2745 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2746 	VOP_UNLOCK(vp, 0);
2747 	return (error);
2748 }
2749 
2750 /*
2751  * Set ownership given a path name; this version follows links.
2752  */
2753 /* ARGSUSED */
2754 int
2755 sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
2756 {
2757 	/* {
2758 		syscallarg(const char *) path;
2759 		syscallarg(uid_t) uid;
2760 		syscallarg(gid_t) gid;
2761 	} */
2762 	int error;
2763 	struct nameidata nd;
2764 
2765 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
2766 	    SCARG(uap, path));
2767 	if ((error = namei(&nd)) != 0)
2768 		return (error);
2769 
2770 	error = change_owner(nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
2771 
2772 	vrele(nd.ni_vp);
2773 	return (error);
2774 }
2775 
2776 /*
2777  * Set ownership given a path name; this version follows links.
2778  * Provides POSIX semantics.
2779  */
2780 /* ARGSUSED */
2781 int
2782 sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval)
2783 {
2784 	/* {
2785 		syscallarg(const char *) path;
2786 		syscallarg(uid_t) uid;
2787 		syscallarg(gid_t) gid;
2788 	} */
2789 	int error;
2790 	struct nameidata nd;
2791 
2792 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
2793 	    SCARG(uap, path));
2794 	if ((error = namei(&nd)) != 0)
2795 		return (error);
2796 
2797 	error = change_owner(nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
2798 
2799 	vrele(nd.ni_vp);
2800 	return (error);
2801 }
2802 
2803 /*
2804  * Set ownership given a file descriptor.
2805  */
2806 /* ARGSUSED */
2807 int
2808 sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval)
2809 {
2810 	/* {
2811 		syscallarg(int) fd;
2812 		syscallarg(uid_t) uid;
2813 		syscallarg(gid_t) gid;
2814 	} */
2815 	int error;
2816 	file_t *fp;
2817 
2818 	/* fd_getvnode() will use the descriptor for us */
2819 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2820 		return (error);
2821 	error = change_owner(fp->f_data, SCARG(uap, uid), SCARG(uap, gid),
2822 	    l, 0);
2823 	fd_putfile(SCARG(uap, fd));
2824 	return (error);
2825 }
2826 
2827 /*
2828  * Set ownership given a file descriptor, providing POSIX/XPG semantics.
2829  */
2830 /* ARGSUSED */
2831 int
2832 sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval)
2833 {
2834 	/* {
2835 		syscallarg(int) fd;
2836 		syscallarg(uid_t) uid;
2837 		syscallarg(gid_t) gid;
2838 	} */
2839 	int error;
2840 	file_t *fp;
2841 
2842 	/* fd_getvnode() will use the descriptor for us */
2843 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2844 		return (error);
2845 	error = change_owner(fp->f_data, SCARG(uap, uid), SCARG(uap, gid),
2846 	    l, 1);
2847 	fd_putfile(SCARG(uap, fd));
2848 	return (error);
2849 }
2850 
2851 /*
2852  * Set ownership given a path name; this version does not follow links.
2853  */
2854 /* ARGSUSED */
2855 int
2856 sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval)
2857 {
2858 	/* {
2859 		syscallarg(const char *) path;
2860 		syscallarg(uid_t) uid;
2861 		syscallarg(gid_t) gid;
2862 	} */
2863 	int error;
2864 	struct nameidata nd;
2865 
2866 	NDINIT(&nd, LOOKUP, NOFOLLOW | TRYEMULROOT, UIO_USERSPACE,
2867 	    SCARG(uap, path));
2868 	if ((error = namei(&nd)) != 0)
2869 		return (error);
2870 
2871 	error = change_owner(nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
2872 
2873 	vrele(nd.ni_vp);
2874 	return (error);
2875 }
2876 
2877 /*
2878  * Set ownership given a path name; this version does not follow links.
2879  * Provides POSIX/XPG semantics.
2880  */
2881 /* ARGSUSED */
2882 int
2883 sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval)
2884 {
2885 	/* {
2886 		syscallarg(const char *) path;
2887 		syscallarg(uid_t) uid;
2888 		syscallarg(gid_t) gid;
2889 	} */
2890 	int error;
2891 	struct nameidata nd;
2892 
2893 	NDINIT(&nd, LOOKUP, NOFOLLOW | TRYEMULROOT, UIO_USERSPACE,
2894 	    SCARG(uap, path));
2895 	if ((error = namei(&nd)) != 0)
2896 		return (error);
2897 
2898 	error = change_owner(nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
2899 
2900 	vrele(nd.ni_vp);
2901 	return (error);
2902 }
2903 
2904 /*
2905  * Common routine to set ownership given a vnode.
2906  */
2907 static int
2908 change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
2909     int posix_semantics)
2910 {
2911 	struct vattr vattr;
2912 	mode_t newmode;
2913 	int error;
2914 
2915 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2916 	if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
2917 		goto out;
2918 
2919 #define CHANGED(x) ((int)(x) != -1)
2920 	newmode = vattr.va_mode;
2921 	if (posix_semantics) {
2922 		/*
2923 		 * POSIX/XPG semantics: if the caller is not the super-user,
2924 		 * clear set-user-id and set-group-id bits.  Both POSIX and
2925 		 * the XPG consider the behaviour for calls by the super-user
2926 		 * implementation-defined; we leave the set-user-id and set-
2927 		 * group-id settings intact in that case.
2928 		 */
2929 		if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
2930 				      NULL) != 0)
2931 			newmode &= ~(S_ISUID | S_ISGID);
2932 	} else {
2933 		/*
2934 		 * NetBSD semantics: when changing owner and/or group,
2935 		 * clear the respective bit(s).
2936 		 */
2937 		if (CHANGED(uid))
2938 			newmode &= ~S_ISUID;
2939 		if (CHANGED(gid))
2940 			newmode &= ~S_ISGID;
2941 	}
2942 	/* Update va_mode iff altered. */
2943 	if (vattr.va_mode == newmode)
2944 		newmode = VNOVAL;
2945 
2946 	VATTR_NULL(&vattr);
2947 	vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
2948 	vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
2949 	vattr.va_mode = newmode;
2950 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2951 #undef CHANGED
2952 
2953 out:
2954 	VOP_UNLOCK(vp, 0);
2955 	return (error);
2956 }
2957 
2958 /*
2959  * Set the access and modification times given a path name; this
2960  * version follows links.
2961  */
2962 /* ARGSUSED */
2963 int
2964 sys_utimes(struct lwp *l, const struct sys_utimes_args *uap, register_t *retval)
2965 {
2966 	/* {
2967 		syscallarg(const char *) path;
2968 		syscallarg(const struct timeval *) tptr;
2969 	} */
2970 
2971 	return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
2972 	    SCARG(uap, tptr), UIO_USERSPACE);
2973 }
2974 
2975 /*
2976  * Set the access and modification times given a file descriptor.
2977  */
2978 /* ARGSUSED */
2979 int
2980 sys_futimes(struct lwp *l, const struct sys_futimes_args *uap, register_t *retval)
2981 {
2982 	/* {
2983 		syscallarg(int) fd;
2984 		syscallarg(const struct timeval *) tptr;
2985 	} */
2986 	int error;
2987 	file_t *fp;
2988 
2989 	/* fd_getvnode() will use the descriptor for us */
2990 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2991 		return (error);
2992 	error = do_sys_utimes(l, fp->f_data, NULL, 0, SCARG(uap, tptr),
2993 	    UIO_USERSPACE);
2994 	fd_putfile(SCARG(uap, fd));
2995 	return (error);
2996 }
2997 
2998 /*
2999  * Set the access and modification times given a path name; this
3000  * version does not follow links.
3001  */
3002 int
3003 sys_lutimes(struct lwp *l, const struct sys_lutimes_args *uap, register_t *retval)
3004 {
3005 	/* {
3006 		syscallarg(const char *) path;
3007 		syscallarg(const struct timeval *) tptr;
3008 	} */
3009 
3010 	return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
3011 	    SCARG(uap, tptr), UIO_USERSPACE);
3012 }
3013 
3014 /*
3015  * Common routine to set access and modification times given a vnode.
3016  */
3017 int
3018 do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
3019     const struct timeval *tptr, enum uio_seg seg)
3020 {
3021 	struct vattr vattr;
3022 	struct nameidata nd;
3023 	int error;
3024 	bool vanull, setbirthtime;
3025 	struct timespec ts[2];
3026 
3027 	if (tptr == NULL) {
3028 		vanull = true;
3029 		nanotime(&ts[0]);
3030 		ts[1] = ts[0];
3031 	} else {
3032 		struct timeval tv[2];
3033 
3034 		vanull = false;
3035 		if (seg != UIO_SYSSPACE) {
3036 			error = copyin(tptr, &tv, sizeof (tv));
3037 			if (error != 0)
3038 				return error;
3039 			tptr = tv;
3040 		}
3041 		TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]);
3042 		TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]);
3043 	}
3044 
3045 	if (vp == NULL) {
3046 		NDINIT(&nd, LOOKUP, flag | TRYEMULROOT, UIO_USERSPACE, path);
3047 		if ((error = namei(&nd)) != 0)
3048 			return error;
3049 		vp = nd.ni_vp;
3050 	} else
3051 		nd.ni_vp = NULL;
3052 
3053 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3054 	setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 &&
3055 	    timespeccmp(&ts[1], &vattr.va_birthtime, <));
3056 	VATTR_NULL(&vattr);
3057 	vattr.va_atime = ts[0];
3058 	vattr.va_mtime = ts[1];
3059 	if (setbirthtime)
3060 		vattr.va_birthtime = ts[1];
3061 	if (vanull)
3062 		vattr.va_flags |= VA_UTIMES_NULL;
3063 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3064 	VOP_UNLOCK(vp, 0);
3065 
3066 	if (nd.ni_vp != NULL)
3067 		vrele(nd.ni_vp);
3068 
3069 	return error;
3070 }
3071 
3072 /*
3073  * Truncate a file given its path name.
3074  */
3075 /* ARGSUSED */
3076 int
3077 sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval)
3078 {
3079 	/* {
3080 		syscallarg(const char *) path;
3081 		syscallarg(int) pad;
3082 		syscallarg(off_t) length;
3083 	} */
3084 	struct vnode *vp;
3085 	struct vattr vattr;
3086 	int error;
3087 	struct nameidata nd;
3088 
3089 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
3090 	    SCARG(uap, path));
3091 	if ((error = namei(&nd)) != 0)
3092 		return (error);
3093 	vp = nd.ni_vp;
3094 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3095 	if (vp->v_type == VDIR)
3096 		error = EISDIR;
3097 	else if ((error = vn_writechk(vp)) == 0 &&
3098 	    (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
3099 		VATTR_NULL(&vattr);
3100 		vattr.va_size = SCARG(uap, length);
3101 		error = VOP_SETATTR(vp, &vattr, l->l_cred);
3102 	}
3103 	vput(vp);
3104 	return (error);
3105 }
3106 
3107 /*
3108  * Truncate a file given a file descriptor.
3109  */
3110 /* ARGSUSED */
3111 int
3112 sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval)
3113 {
3114 	/* {
3115 		syscallarg(int) fd;
3116 		syscallarg(int) pad;
3117 		syscallarg(off_t) length;
3118 	} */
3119 	struct vattr vattr;
3120 	struct vnode *vp;
3121 	file_t *fp;
3122 	int error;
3123 
3124 	/* fd_getvnode() will use the descriptor for us */
3125 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3126 		return (error);
3127 	if ((fp->f_flag & FWRITE) == 0) {
3128 		error = EINVAL;
3129 		goto out;
3130 	}
3131 	vp = fp->f_data;
3132 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3133 	if (vp->v_type == VDIR)
3134 		error = EISDIR;
3135 	else if ((error = vn_writechk(vp)) == 0) {
3136 		VATTR_NULL(&vattr);
3137 		vattr.va_size = SCARG(uap, length);
3138 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
3139 	}
3140 	VOP_UNLOCK(vp, 0);
3141  out:
3142 	fd_putfile(SCARG(uap, fd));
3143 	return (error);
3144 }
3145 
3146 /*
3147  * Sync an open file.
3148  */
3149 /* ARGSUSED */
3150 int
3151 sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
3152 {
3153 	/* {
3154 		syscallarg(int) fd;
3155 	} */
3156 	struct vnode *vp;
3157 	file_t *fp;
3158 	int error;
3159 
3160 	/* fd_getvnode() will use the descriptor for us */
3161 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3162 		return (error);
3163 	vp = fp->f_data;
3164 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3165 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
3166 	if (error == 0 && bioopsp != NULL &&
3167 	    vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))
3168 		(*bioopsp->io_fsync)(vp, 0);
3169 	VOP_UNLOCK(vp, 0);
3170 	fd_putfile(SCARG(uap, fd));
3171 	return (error);
3172 }
3173 
3174 /*
3175  * Sync a range of file data.  API modeled after that found in AIX.
3176  *
3177  * FDATASYNC indicates that we need only save enough metadata to be able
3178  * to re-read the written data.  Note we duplicate AIX's requirement that
3179  * the file be open for writing.
3180  */
3181 /* ARGSUSED */
3182 int
3183 sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval)
3184 {
3185 	/* {
3186 		syscallarg(int) fd;
3187 		syscallarg(int) flags;
3188 		syscallarg(off_t) start;
3189 		syscallarg(off_t) length;
3190 	} */
3191 	struct vnode *vp;
3192 	file_t *fp;
3193 	int flags, nflags;
3194 	off_t s, e, len;
3195 	int error;
3196 
3197 	/* fd_getvnode() will use the descriptor for us */
3198 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3199 		return (error);
3200 
3201 	if ((fp->f_flag & FWRITE) == 0) {
3202 		error = EBADF;
3203 		goto out;
3204 	}
3205 
3206 	flags = SCARG(uap, flags);
3207 	if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
3208 	    ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
3209 		error = EINVAL;
3210 		goto out;
3211 	}
3212 	/* Now set up the flags for value(s) to pass to VOP_FSYNC() */
3213 	if (flags & FDATASYNC)
3214 		nflags = FSYNC_DATAONLY | FSYNC_WAIT;
3215 	else
3216 		nflags = FSYNC_WAIT;
3217 	if (flags & FDISKSYNC)
3218 		nflags |= FSYNC_CACHE;
3219 
3220 	len = SCARG(uap, length);
3221 	/* If length == 0, we do the whole file, and s = l = 0 will do that */
3222 	if (len) {
3223 		s = SCARG(uap, start);
3224 		e = s + len;
3225 		if (e < s) {
3226 			error = EINVAL;
3227 			goto out;
3228 		}
3229 	} else {
3230 		e = 0;
3231 		s = 0;
3232 	}
3233 
3234 	vp = fp->f_data;
3235 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3236 	error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
3237 
3238 	if (error == 0 && bioopsp != NULL &&
3239 	    vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))
3240 		(*bioopsp->io_fsync)(vp, nflags);
3241 
3242 	VOP_UNLOCK(vp, 0);
3243 out:
3244 	fd_putfile(SCARG(uap, fd));
3245 	return (error);
3246 }
3247 
3248 /*
3249  * Sync the data of an open file.
3250  */
3251 /* ARGSUSED */
3252 int
3253 sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval)
3254 {
3255 	/* {
3256 		syscallarg(int) fd;
3257 	} */
3258 	struct vnode *vp;
3259 	file_t *fp;
3260 	int error;
3261 
3262 	/* fd_getvnode() will use the descriptor for us */
3263 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3264 		return (error);
3265 	if ((fp->f_flag & FWRITE) == 0) {
3266 		fd_putfile(SCARG(uap, fd));
3267 		return (EBADF);
3268 	}
3269 	vp = fp->f_data;
3270 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3271 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
3272 	VOP_UNLOCK(vp, 0);
3273 	fd_putfile(SCARG(uap, fd));
3274 	return (error);
3275 }
3276 
3277 /*
3278  * Rename files, (standard) BSD semantics frontend.
3279  */
3280 /* ARGSUSED */
3281 int
3282 sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval)
3283 {
3284 	/* {
3285 		syscallarg(const char *) from;
3286 		syscallarg(const char *) to;
3287 	} */
3288 
3289 	return (do_sys_rename(SCARG(uap, from), SCARG(uap, to), UIO_USERSPACE, 0));
3290 }
3291 
3292 /*
3293  * Rename files, POSIX semantics frontend.
3294  */
3295 /* ARGSUSED */
3296 int
3297 sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval)
3298 {
3299 	/* {
3300 		syscallarg(const char *) from;
3301 		syscallarg(const char *) to;
3302 	} */
3303 
3304 	return (do_sys_rename(SCARG(uap, from), SCARG(uap, to), UIO_USERSPACE, 1));
3305 }
3306 
3307 /*
3308  * Rename files.  Source and destination must either both be directories,
3309  * or both not be directories.  If target is a directory, it must be empty.
3310  * If `from' and `to' refer to the same object, the value of the `retain'
3311  * argument is used to determine whether `from' will be
3312  *
3313  * (retain == 0)	deleted unless `from' and `to' refer to the same
3314  *			object in the file system's name space (BSD).
3315  * (retain == 1)	always retained (POSIX).
3316  */
3317 int
3318 do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
3319 {
3320 	struct vnode *tvp, *fvp, *tdvp;
3321 	struct nameidata fromnd, tond;
3322 	struct mount *fs;
3323 	struct lwp *l = curlwp;
3324 	struct proc *p;
3325 	uint32_t saveflag;
3326 	int error;
3327 
3328 	NDINIT(&fromnd, DELETE, LOCKPARENT | SAVESTART | TRYEMULROOT,
3329 	    seg, from);
3330 	if ((error = namei(&fromnd)) != 0)
3331 		return (error);
3332 	if (fromnd.ni_dvp != fromnd.ni_vp)
3333 		VOP_UNLOCK(fromnd.ni_dvp, 0);
3334 	fvp = fromnd.ni_vp;
3335 
3336 	fs = fvp->v_mount;
3337 	error = VFS_RENAMELOCK_ENTER(fs);
3338 	if (error) {
3339 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3340 		vrele(fromnd.ni_dvp);
3341 		vrele(fvp);
3342 		goto out1;
3343 	}
3344 
3345 	/*
3346 	 * close, partially, yet another race - ideally we should only
3347 	 * go as far as getting fromnd.ni_dvp before getting the per-fs
3348 	 * lock, and then continue to get fromnd.ni_vp, but we can't do
3349 	 * that with namei as it stands.
3350 	 *
3351 	 * This still won't prevent rmdir from nuking fromnd.ni_vp
3352 	 * under us. The real fix is to get the locks in the right
3353 	 * order and do the lookups in the right places, but that's a
3354 	 * major rototill.
3355 	 *
3356 	 * Preserve the SAVESTART in cn_flags, because who knows what
3357 	 * might happen if we don't.
3358 	 *
3359 	 * Note: this logic (as well as this whole function) is cloned
3360 	 * in nfs_serv.c. Proceed accordingly.
3361 	 */
3362 	vrele(fvp);
3363 	if ((fromnd.ni_cnd.cn_namelen == 1 &&
3364 	     fromnd.ni_cnd.cn_nameptr[0] == '.') ||
3365 	    (fromnd.ni_cnd.cn_namelen == 2 &&
3366 	     fromnd.ni_cnd.cn_nameptr[0] == '.' &&
3367 	     fromnd.ni_cnd.cn_nameptr[1] == '.')) {
3368 		error = EINVAL;
3369 		VFS_RENAMELOCK_EXIT(fs);
3370 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3371 		vrele(fromnd.ni_dvp);
3372 		goto out1;
3373 	}
3374 	saveflag = fromnd.ni_cnd.cn_flags & SAVESTART;
3375 	fromnd.ni_cnd.cn_flags &= ~SAVESTART;
3376 	vn_lock(fromnd.ni_dvp, LK_EXCLUSIVE | LK_RETRY);
3377 	error = relookup(fromnd.ni_dvp, &fromnd.ni_vp, &fromnd.ni_cnd);
3378 	fromnd.ni_cnd.cn_flags |= saveflag;
3379 	if (error) {
3380 		VOP_UNLOCK(fromnd.ni_dvp, 0);
3381 		VFS_RENAMELOCK_EXIT(fs);
3382 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3383 		vrele(fromnd.ni_dvp);
3384 		goto out1;
3385 	}
3386 	VOP_UNLOCK(fromnd.ni_vp, 0);
3387 	if (fromnd.ni_dvp != fromnd.ni_vp)
3388 		VOP_UNLOCK(fromnd.ni_dvp, 0);
3389 	fvp = fromnd.ni_vp;
3390 
3391 	NDINIT(&tond, RENAME,
3392 	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | TRYEMULROOT
3393 	      | (fvp->v_type == VDIR ? CREATEDIR : 0),
3394 	    seg, to);
3395 	if ((error = namei(&tond)) != 0) {
3396 		VFS_RENAMELOCK_EXIT(fs);
3397 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3398 		vrele(fromnd.ni_dvp);
3399 		vrele(fvp);
3400 		goto out1;
3401 	}
3402 	tdvp = tond.ni_dvp;
3403 	tvp = tond.ni_vp;
3404 
3405 	if (tvp != NULL) {
3406 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
3407 			error = ENOTDIR;
3408 			goto out;
3409 		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
3410 			error = EISDIR;
3411 			goto out;
3412 		}
3413 	}
3414 
3415 	if (fvp == tdvp)
3416 		error = EINVAL;
3417 
3418 	/*
3419 	 * Source and destination refer to the same object.
3420 	 */
3421 	if (fvp == tvp) {
3422 		if (retain)
3423 			error = -1;
3424 		else if (fromnd.ni_dvp == tdvp &&
3425 		    fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen &&
3426 		    !memcmp(fromnd.ni_cnd.cn_nameptr,
3427 		          tond.ni_cnd.cn_nameptr,
3428 		          fromnd.ni_cnd.cn_namelen))
3429 		error = -1;
3430 	}
3431 
3432 #if NVERIEXEC > 0
3433 	if (!error) {
3434 		char *f1, *f2;
3435 
3436 		f1 = malloc(fromnd.ni_cnd.cn_namelen + 1, M_TEMP, M_WAITOK);
3437 		strlcpy(f1, fromnd.ni_cnd.cn_nameptr, fromnd.ni_cnd.cn_namelen);
3438 
3439 		f2 = malloc(tond.ni_cnd.cn_namelen + 1, M_TEMP, M_WAITOK);
3440 		strlcpy(f2, tond.ni_cnd.cn_nameptr, tond.ni_cnd.cn_namelen);
3441 
3442 		error = veriexec_renamechk(l, fvp, f1, tvp, f2);
3443 
3444 		free(f1, M_TEMP);
3445 		free(f2, M_TEMP);
3446 	}
3447 #endif /* NVERIEXEC > 0 */
3448 
3449 out:
3450 	p = l->l_proc;
3451 	if (!error) {
3452 		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
3453 				   tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
3454 		VFS_RENAMELOCK_EXIT(fs);
3455 	} else {
3456 		VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd);
3457 		if (tdvp == tvp)
3458 			vrele(tdvp);
3459 		else
3460 			vput(tdvp);
3461 		if (tvp)
3462 			vput(tvp);
3463 		VFS_RENAMELOCK_EXIT(fs);
3464 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3465 		vrele(fromnd.ni_dvp);
3466 		vrele(fvp);
3467 	}
3468 	vrele(tond.ni_startdir);
3469 	PNBUF_PUT(tond.ni_cnd.cn_pnbuf);
3470 out1:
3471 	if (fromnd.ni_startdir)
3472 		vrele(fromnd.ni_startdir);
3473 	PNBUF_PUT(fromnd.ni_cnd.cn_pnbuf);
3474 	return (error == -1 ? 0 : error);
3475 }
3476 
3477 /*
3478  * Make a directory file.
3479  */
3480 /* ARGSUSED */
3481 int
3482 sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
3483 {
3484 	/* {
3485 		syscallarg(const char *) path;
3486 		syscallarg(int) mode;
3487 	} */
3488 	struct proc *p = l->l_proc;
3489 	struct vnode *vp;
3490 	struct vattr vattr;
3491 	int error;
3492 	struct nameidata nd;
3493 
3494 	NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT, UIO_USERSPACE,
3495 	    SCARG(uap, path));
3496 	if ((error = namei(&nd)) != 0)
3497 		return (error);
3498 	vp = nd.ni_vp;
3499 	if (vp != NULL) {
3500 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
3501 		if (nd.ni_dvp == vp)
3502 			vrele(nd.ni_dvp);
3503 		else
3504 			vput(nd.ni_dvp);
3505 		vrele(vp);
3506 		return (EEXIST);
3507 	}
3508 	VATTR_NULL(&vattr);
3509 	vattr.va_type = VDIR;
3510 	/* We will read cwdi->cwdi_cmask unlocked. */
3511 	vattr.va_mode =
3512 	    (SCARG(uap, mode) & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
3513 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3514 	if (!error)
3515 		vput(nd.ni_vp);
3516 	return (error);
3517 }
3518 
3519 /*
3520  * Remove a directory file.
3521  */
3522 /* ARGSUSED */
3523 int
3524 sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
3525 {
3526 	/* {
3527 		syscallarg(const char *) path;
3528 	} */
3529 	struct vnode *vp;
3530 	int error;
3531 	struct nameidata nd;
3532 
3533 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
3534 	    SCARG(uap, path));
3535 	if ((error = namei(&nd)) != 0)
3536 		return (error);
3537 	vp = nd.ni_vp;
3538 	if (vp->v_type != VDIR) {
3539 		error = ENOTDIR;
3540 		goto out;
3541 	}
3542 	/*
3543 	 * No rmdir "." please.
3544 	 */
3545 	if (nd.ni_dvp == vp) {
3546 		error = EINVAL;
3547 		goto out;
3548 	}
3549 	/*
3550 	 * The root of a mounted filesystem cannot be deleted.
3551 	 */
3552 	if ((vp->v_vflag & VV_ROOT) != 0 || vp->v_mountedhere != NULL) {
3553 		error = EBUSY;
3554 		goto out;
3555 	}
3556 	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3557 	return (error);
3558 
3559 out:
3560 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
3561 	if (nd.ni_dvp == vp)
3562 		vrele(nd.ni_dvp);
3563 	else
3564 		vput(nd.ni_dvp);
3565 	vput(vp);
3566 	return (error);
3567 }
3568 
3569 /*
3570  * Read a block of directory entries in a file system independent format.
3571  */
3572 int
3573 sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval)
3574 {
3575 	/* {
3576 		syscallarg(int) fd;
3577 		syscallarg(char *) buf;
3578 		syscallarg(size_t) count;
3579 	} */
3580 	file_t *fp;
3581 	int error, done;
3582 
3583 	/* fd_getvnode() will use the descriptor for us */
3584 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3585 		return (error);
3586 	if ((fp->f_flag & FREAD) == 0) {
3587 		error = EBADF;
3588 		goto out;
3589 	}
3590 	error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
3591 			SCARG(uap, count), &done, l, 0, 0);
3592 	ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
3593 	*retval = done;
3594  out:
3595 	fd_putfile(SCARG(uap, fd));
3596 	return (error);
3597 }
3598 
3599 /*
3600  * Set the mode mask for creation of filesystem nodes.
3601  */
3602 int
3603 sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
3604 {
3605 	/* {
3606 		syscallarg(mode_t) newmask;
3607 	} */
3608 	struct proc *p = l->l_proc;
3609 	struct cwdinfo *cwdi;
3610 
3611 	/*
3612 	 * cwdi->cwdi_cmask will be read unlocked elsewhere.  What's
3613 	 * important is that we serialize changes to the mask.  The
3614 	 * rw_exit() will issue a write memory barrier on our behalf,
3615 	 * and force the changes out to other CPUs (as it must use an
3616 	 * atomic operation, draining the local CPU's store buffers).
3617 	 */
3618 	cwdi = p->p_cwdi;
3619 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
3620 	*retval = cwdi->cwdi_cmask;
3621 	cwdi->cwdi_cmask = SCARG(uap, newmask) & ALLPERMS;
3622 	rw_exit(&cwdi->cwdi_lock);
3623 
3624 	return (0);
3625 }
3626 
3627 int
3628 dorevoke(struct vnode *vp, kauth_cred_t cred)
3629 {
3630 	struct vattr vattr;
3631 	int error;
3632 
3633 	if ((error = VOP_GETATTR(vp, &vattr, cred)) != 0)
3634 		return error;
3635 	if (kauth_cred_geteuid(cred) != vattr.va_uid &&
3636 	    (error = kauth_authorize_generic(cred,
3637 	    KAUTH_GENERIC_ISSUSER, NULL)) == 0)
3638 		VOP_REVOKE(vp, REVOKEALL);
3639 	return (error);
3640 }
3641 
3642 /*
3643  * Void all references to file by ripping underlying filesystem
3644  * away from vnode.
3645  */
3646 /* ARGSUSED */
3647 int
3648 sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval)
3649 {
3650 	/* {
3651 		syscallarg(const char *) path;
3652 	} */
3653 	struct vnode *vp;
3654 	int error;
3655 	struct nameidata nd;
3656 
3657 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
3658 	    SCARG(uap, path));
3659 	if ((error = namei(&nd)) != 0)
3660 		return (error);
3661 	vp = nd.ni_vp;
3662 	error = dorevoke(vp, l->l_cred);
3663 	vrele(vp);
3664 	return (error);
3665 }
3666