xref: /netbsd-src/sys/kern/vfs_syscalls.c (revision 10ad5ffa714ce1a679dcc9dd8159648df2d67b5a)
1 /*	$NetBSD: vfs_syscalls.c,v 1.396 2009/07/02 12:53:47 pooka Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1989, 1993
34  *	The Regents of the University of California.  All rights reserved.
35  * (c) UNIX System Laboratories, Inc.
36  * All or some portions of this file are derived from material licensed
37  * to the University of California by American Telephone and Telegraph
38  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39  * the permission of UNIX System Laboratories, Inc.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.42 (Berkeley) 7/31/95
66  */
67 
68 #include <sys/cdefs.h>
69 __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.396 2009/07/02 12:53:47 pooka Exp $");
70 
71 #ifdef _KERNEL_OPT
72 #include "opt_fileassoc.h"
73 #include "veriexec.h"
74 #endif
75 
76 #include <sys/param.h>
77 #include <sys/systm.h>
78 #include <sys/namei.h>
79 #include <sys/filedesc.h>
80 #include <sys/kernel.h>
81 #include <sys/file.h>
82 #include <sys/stat.h>
83 #include <sys/vnode.h>
84 #include <sys/mount.h>
85 #include <sys/proc.h>
86 #include <sys/uio.h>
87 #include <sys/kmem.h>
88 #include <sys/dirent.h>
89 #include <sys/sysctl.h>
90 #include <sys/syscallargs.h>
91 #include <sys/vfs_syscalls.h>
92 #include <sys/ktrace.h>
93 #ifdef FILEASSOC
94 #include <sys/fileassoc.h>
95 #endif /* FILEASSOC */
96 #include <sys/verified_exec.h>
97 #include <sys/kauth.h>
98 #include <sys/atomic.h>
99 #include <sys/module.h>
100 #include <sys/buf.h>
101 
102 #include <miscfs/genfs/genfs.h>
103 #include <miscfs/syncfs/syncfs.h>
104 #include <miscfs/specfs/specdev.h>
105 
106 #include <nfs/rpcv2.h>
107 #include <nfs/nfsproto.h>
108 #include <nfs/nfs.h>
109 #include <nfs/nfs_var.h>
110 
/* Malloc type "mount", described as "vfs mount struct". */
MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount struct");

/* Forward declarations of file-local helpers. */
static int change_dir(struct nameidata *, struct lwp *);
static int change_flags(struct vnode *, u_long, struct lwp *);
static int change_mode(struct vnode *, int, struct lwp *l);
static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);

/* Defined later in this file with external linkage. */
void checkdirs(struct vnode *);
119 
120 /*
121  * Virtual File System System Calls
122  */
123 
124 /*
125  * Mount a file system.
126  */
127 
/*
 * This table is used to maintain compatibility with 4.3BSD
 * and NetBSD 0.9 mount syscalls - and possibly other systems.
 * The array index is the historic numeric file system type, so
 * the order is important!  A NULL entry marks a number that has
 * no file system name to map to.
 *
 * Do not modify this table. It should only contain filesystems
 * supported by NetBSD 0.9 and 4.3BSD.
 */
const char * const mountcompatnames[] = {
	NULL,		/* 0 = MOUNT_NONE */
	MOUNT_FFS,	/* 1 = MOUNT_UFS */
	MOUNT_NFS,	/* 2 */
	MOUNT_MFS,	/* 3 */
	MOUNT_MSDOS,	/* 4 */
	MOUNT_CD9660,	/* 5 = MOUNT_ISOFS */
	MOUNT_FDESC,	/* 6 */
	MOUNT_KERNFS,	/* 7 */
	NULL,		/* 8 = MOUNT_DEVFS */
	MOUNT_AFS,	/* 9 */
};
/* Number of entries in mountcompatnames[]; used to bounds-check an index. */
const int nmountcompatnames = sizeof(mountcompatnames) /
    sizeof(mountcompatnames[0]);
150 
151 static int
152 mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
153     void *data, size_t *data_len)
154 {
155 	struct mount *mp;
156 	int error = 0, saved_flags;
157 
158 	mp = vp->v_mount;
159 	saved_flags = mp->mnt_flag;
160 
161 	/* We can operate only on VV_ROOT nodes. */
162 	if ((vp->v_vflag & VV_ROOT) == 0) {
163 		error = EINVAL;
164 		goto out;
165 	}
166 
167 	/*
168 	 * We only allow the filesystem to be reloaded if it
169 	 * is currently mounted read-only.  Additionally, we
170 	 * prevent read-write to read-only downgrades.
171 	 */
172 	if ((flags & (MNT_RELOAD | MNT_RDONLY)) != 0 &&
173 	    (mp->mnt_flag & MNT_RDONLY) == 0) {
174 		error = EOPNOTSUPP;	/* Needs translation */
175 		goto out;
176 	}
177 
178 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
179 	    KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
180 	if (error)
181 		goto out;
182 
183 	if (vfs_busy(mp, NULL)) {
184 		error = EPERM;
185 		goto out;
186 	}
187 
188 	mutex_enter(&mp->mnt_updating);
189 
190 	mp->mnt_flag &= ~MNT_OP_FLAGS;
191 	mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
192 
193 	/*
194 	 * Set the mount level flags.
195 	 */
196 	if (flags & MNT_RDONLY)
197 		mp->mnt_flag |= MNT_RDONLY;
198 	else if (mp->mnt_flag & MNT_RDONLY)
199 		mp->mnt_iflag |= IMNT_WANTRDWR;
200 	mp->mnt_flag &=
201 	  ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
202 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
203 	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
204 	    MNT_LOG);
205 	mp->mnt_flag |= flags &
206 	   (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
207 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
208 	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
209 	    MNT_LOG | MNT_IGNORE);
210 
211 	error = VFS_MOUNT(mp, path, data, data_len);
212 
213 	if (error && data != NULL) {
214 		int error2;
215 
216 		/*
217 		 * Update failed; let's try and see if it was an
218 		 * export request.  For compat with 3.0 and earlier.
219 		 */
220 		error2 = vfs_hooks_reexport(mp, path, data);
221 
222 		/*
223 		 * Only update error code if the export request was
224 		 * understood but some problem occurred while
225 		 * processing it.
226 		 */
227 		if (error2 != EJUSTRETURN)
228 			error = error2;
229 	}
230 
231 	if (mp->mnt_iflag & IMNT_WANTRDWR)
232 		mp->mnt_flag &= ~MNT_RDONLY;
233 	if (error)
234 		mp->mnt_flag = saved_flags;
235 	mp->mnt_flag &= ~MNT_OP_FLAGS;
236 	mp->mnt_iflag &= ~IMNT_WANTRDWR;
237 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
238 		if (mp->mnt_syncer == NULL)
239 			error = vfs_allocate_syncvnode(mp);
240 	} else {
241 		if (mp->mnt_syncer != NULL)
242 			vfs_deallocate_syncvnode(mp);
243 	}
244 	mutex_exit(&mp->mnt_updating);
245 	vfs_unbusy(mp, false, NULL);
246 
247  out:
248 	return (error);
249 }
250 
251 static int
252 mount_get_vfsops(const char *fstype, struct vfsops **vfsops)
253 {
254 	char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
255 	int error;
256 
257 	/* Copy file-system type from userspace.  */
258 	error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL);
259 	if (error) {
260 		/*
261 		 * Historically, filesystem types were identified by numbers.
262 		 * If we get an integer for the filesystem type instead of a
263 		 * string, we check to see if it matches one of the historic
264 		 * filesystem types.
265 		 */
266 		u_long fsindex = (u_long)fstype;
267 		if (fsindex >= nmountcompatnames ||
268 		    mountcompatnames[fsindex] == NULL)
269 			return ENODEV;
270 		strlcpy(fstypename, mountcompatnames[fsindex],
271 		    sizeof(fstypename));
272 	}
273 
274 	/* Accept `ufs' as an alias for `ffs', for compatibility. */
275 	if (strcmp(fstypename, "ufs") == 0)
276 		fstypename[0] = 'f';
277 
278 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
279 		return 0;
280 
281 	/* If we can autoload a vfs module, try again */
282 	mutex_enter(&module_lock);
283 	(void)module_autoload(fstype, MODULE_CLASS_VFS);
284 	mutex_exit(&module_lock);
285 
286 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
287 		return 0;
288 
289 	return ENODEV;
290 }
291 
/*
 * Mount a new file system described by vfsops on the vnode *vpp.
 *
 * l		calling lwp; its credentials are used for the checks below
 * vpp		in: vnode to be covered; set to NULL once the function has
 *		got past VFS_MOUNT() successfully (early error returns
 *		leave it untouched)
 * vfsops	operations vector of the file system to mount
 * path		mount point path, handed through to VFS_MOUNT()
 * flags	MNT_* flags requested by the caller
 * data		file system specific argument buffer
 * data_len	length of data, handed through to VFS_MOUNT() by pointer
 * recurse	value passed to vn_restorerecurse() before unlocking vp
 *
 * Returns 0 or an errno value.
 */
static int
mount_domount(struct lwp *l, struct vnode **vpp, struct vfsops *vfsops,
    const char *path, int flags, void *data, size_t *data_len, u_int recurse)
{
	struct mount *mp;
	struct vnode *vp = *vpp;
	struct vattr va;
	int error;

	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
	    KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data);
	if (error)
		return error;

	/* Can't make a non-dir a mount-point (from here anyway). */
	if (vp->v_type != VDIR)
		return ENOTDIR;

	/*
	 * If the user is not root, ensure that they own the directory
	 * onto which we are attempting to mount.
	 */
	if ((error = VOP_GETATTR(vp, &va, l->l_cred)) != 0 ||
	    (va.va_uid != kauth_cred_geteuid(l->l_cred) &&
	    (error = kauth_authorize_generic(l->l_cred,
	    KAUTH_GENERIC_ISSUSER, NULL)) != 0)) {
		return error;
	}

	/* MNT_EXPORTED is not accepted for a new mount. */
	if (flags & MNT_EXPORTED)
		return EINVAL;

	if ((error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0)) != 0)
		return error;

	/*
	 * Check if a file-system is not already mounted on this vnode.
	 */
	if (vp->v_mountedhere != NULL)
		return EBUSY;

	if ((mp = vfs_mountalloc(vfsops, vp)) == NULL)
		return ENOMEM;

	mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred);

	/*
	 * The underlying file system may refuse the mount for
	 * various reasons.  Allow the user to force it to happen.
	 *
	 * Set the mount level flags.
	 */
	mp->mnt_flag = flags &
	   (MNT_FORCE | MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
	    MNT_LOG | MNT_IGNORE | MNT_RDONLY);

	mutex_enter(&mp->mnt_updating);
	error = VFS_MOUNT(mp, path, data, data_len);
	mp->mnt_flag &= ~MNT_OP_FLAGS;

	/*
	 * Purge name cache entries for the covered vnode, then either
	 * back out (on failure) or put the new filesystem on the mount
	 * list.
	 */
	cache_purge(vp);
	if (error != 0) {
		/*
		 * NOTE(review): the vfs_unbusy() here implies that
		 * vfs_mountalloc() returns the mount busied -- confirm.
		 */
		vp->v_mountedhere = NULL;
		mutex_exit(&mp->mnt_updating);
		vfs_unbusy(mp, false, NULL);
		vfs_destroy(mp);
		return error;
	}

	mp->mnt_iflag &= ~IMNT_WANTRDWR;
	mutex_enter(&mountlist_lock);
	vp->v_mountedhere = mp;
	CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
	mutex_exit(&mountlist_lock);
    	vn_restorerecurse(vp, recurse);
	VOP_UNLOCK(vp, 0);
	checkdirs(vp);
	/*
	 * NOTE(review): an error from vfs_allocate_syncvnode() is
	 * overwritten by the VFS_START() result below.
	 */
	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
		error = vfs_allocate_syncvnode(mp);
	/* Hold an additional reference to the mount across VFS_START(). */
	mutex_exit(&mp->mnt_updating);
	vfs_unbusy(mp, true, NULL);
	(void) VFS_STATVFS(mp, &mp->mnt_stat);
	error = VFS_START(mp, 0);
	if (error)
		vrele(vp);
	/* Drop reference held for VFS_START(). */
	vfs_destroy(mp);
	/* Tell the caller that the vnode must not be unlocked/released again. */
	*vpp = NULL;
	return error;
}
388 
389 static int
390 mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
391     void *data, size_t *data_len)
392 {
393 	struct mount *mp;
394 	int error;
395 
396 	/* If MNT_GETARGS is specified, it should be the only flag. */
397 	if (flags & ~MNT_GETARGS)
398 		return EINVAL;
399 
400 	mp = vp->v_mount;
401 
402 	/* XXX: probably some notion of "can see" here if we want isolation. */
403 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
404 	    KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
405 	if (error)
406 		return error;
407 
408 	if ((vp->v_vflag & VV_ROOT) == 0)
409 		return EINVAL;
410 
411 	if (vfs_busy(mp, NULL))
412 		return EPERM;
413 
414 	mutex_enter(&mp->mnt_updating);
415 	mp->mnt_flag &= ~MNT_OP_FLAGS;
416 	mp->mnt_flag |= MNT_GETARGS;
417 	error = VFS_MOUNT(mp, path, data, data_len);
418 	mp->mnt_flag &= ~MNT_OP_FLAGS;
419 	mutex_exit(&mp->mnt_updating);
420 
421 	vfs_unbusy(mp, false, NULL);
422 	return (error);
423 }
424 
425 int
426 sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval)
427 {
428 	/* {
429 		syscallarg(const char *) type;
430 		syscallarg(const char *) path;
431 		syscallarg(int) flags;
432 		syscallarg(void *) data;
433 		syscallarg(size_t) data_len;
434 	} */
435 
436 	return do_sys_mount(l, NULL, SCARG(uap, type), SCARG(uap, path),
437 	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE,
438 	    SCARG(uap, data_len), retval);
439 }
440 
441 int
442 do_sys_mount(struct lwp *l, struct vfsops *vfsops, const char *type,
443     const char *path, int flags, void *data, enum uio_seg data_seg,
444     size_t data_len, register_t *retval)
445 {
446 	struct vnode *vp;
447 	void *data_buf = data;
448 	u_int recurse;
449 	int error;
450 
451 	/*
452 	 * Get vnode to be covered
453 	 */
454 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
455 	if (error != 0)
456 		return (error);
457 
458 	/*
459 	 * A lookup in VFS_MOUNT might result in an attempt to
460 	 * lock this vnode again, so make the lock recursive.
461 	 */
462 	if (vfsops == NULL) {
463 		if (flags & (MNT_GETARGS | MNT_UPDATE)) {
464 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
465 			recurse = vn_setrecurse(vp);
466 			vfsops = vp->v_mount->mnt_op;
467 		} else {
468 			/* 'type' is userspace */
469 			error = mount_get_vfsops(type, &vfsops);
470 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
471 			recurse = vn_setrecurse(vp);
472 			if (error != 0)
473 				goto done;
474 		}
475 	} else {
476 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
477 		recurse = vn_setrecurse(vp);
478 	}
479 
480 	if (data != NULL && data_seg == UIO_USERSPACE) {
481 		if (data_len == 0) {
482 			/* No length supplied, use default for filesystem */
483 			data_len = vfsops->vfs_min_mount_data;
484 			if (data_len > VFS_MAX_MOUNT_DATA) {
485 				error = EINVAL;
486 				goto done;
487 			}
488 			/*
489 			 * Hopefully a longer buffer won't make copyin() fail.
490 			 * For compatibility with 3.0 and earlier.
491 			 */
492 			if (flags & MNT_UPDATE
493 			    && data_len < sizeof (struct mnt_export_args30))
494 				data_len = sizeof (struct mnt_export_args30);
495 		}
496 		data_buf = kmem_alloc(data_len, KM_SLEEP);
497 
498 		/* NFS needs the buffer even for mnt_getargs .... */
499 		error = copyin(data, data_buf, data_len);
500 		if (error != 0)
501 			goto done;
502 	}
503 
504 	if (flags & MNT_GETARGS) {
505 		if (data_len == 0) {
506 			error = EINVAL;
507 			goto done;
508 		}
509 		error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
510 		if (error != 0)
511 			goto done;
512 		if (data_seg == UIO_USERSPACE)
513 			error = copyout(data_buf, data, data_len);
514 		*retval = data_len;
515 	} else if (flags & MNT_UPDATE) {
516 		error = mount_update(l, vp, path, flags, data_buf, &data_len);
517 	} else {
518 		/* Locking is handled internally in mount_domount(). */
519 		error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
520 		    &data_len, recurse);
521 	}
522 
523     done:
524     	if (vp != NULL) {
525 	    	vn_restorerecurse(vp, recurse);
526 	    	vput(vp);
527 	}
528 	if (data_buf != data)
529 		kmem_free(data_buf, data_len);
530 	return (error);
531 }
532 
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 *
 * olddp is the covered vnode; the replacement is the vnode returned
 * by VFS_ROOT() for olddp->v_mountedhere.  The global rootvnode is
 * switched over in the same way.  Panics if VFS_ROOT() fails.
 */
void
checkdirs(struct vnode *olddp)
{
	struct cwdinfo *cwdi;
	struct vnode *newdp, *rele1, *rele2;
	struct proc *p;
	bool retry;

	/* Only one use count on the covered vnode: nothing to switch over. */
	if (olddp->v_usecount == 1)
		return;
	if (VFS_ROOT(olddp->v_mountedhere, &newdp))
		panic("mount: lost mount");

	/*
	 * proc_lock is dropped while a cwdinfo is being updated, so after
	 * each update the scan of allproc is restarted from the beginning.
	 * The loop ends once a full pass finds nothing left to change.
	 */
	do {
		retry = false;
		mutex_enter(proc_lock);
		PROCLIST_FOREACH(p, &allproc) {
			if ((p->p_flag & PK_MARKER) != 0)
				continue;
			if ((cwdi = p->p_cwdi) == NULL)
				continue;
			/*
			 * Can't change to the old directory any more,
			 * so even if we see a stale value it's not a
			 * problem.
			 */
			if (cwdi->cwdi_cdir != olddp &&
			    cwdi->cwdi_rdir != olddp)
			    	continue;
			retry = true;
			rele1 = NULL;
			rele2 = NULL;
			/* Take a reference on cwdi before dropping proc_lock. */
			atomic_inc_uint(&cwdi->cwdi_refcnt);
			mutex_exit(proc_lock);
			rw_enter(&cwdi->cwdi_lock, RW_WRITER);
			/* Re-check under the lock and swap in the new vnode. */
			if (cwdi->cwdi_cdir == olddp) {
				rele1 = cwdi->cwdi_cdir;
				VREF(newdp);
				cwdi->cwdi_cdir = newdp;
			}
			if (cwdi->cwdi_rdir == olddp) {
				rele2 = cwdi->cwdi_rdir;
				VREF(newdp);
				cwdi->cwdi_rdir = newdp;
			}
			rw_exit(&cwdi->cwdi_lock);
			cwdfree(cwdi);
			/* Release the old references after cwdi_lock is dropped. */
			if (rele1 != NULL)
				vrele(rele1);
			if (rele2 != NULL)
				vrele(rele2);
			/* Retake proc_lock so the unlock after the loop balances. */
			mutex_enter(proc_lock);
			break;
		}
		mutex_exit(proc_lock);
	} while (retry);

	if (rootvnode == olddp) {
		vrele(rootvnode);
		VREF(newdp);
		rootvnode = newdp;
	}
	/* Drop the vnode obtained from VFS_ROOT() above. */
	vput(newdp);
}
602 
603 /*
604  * Unmount a file system.
605  *
606  * Note: unmount takes a path to the vnode mounted on as argument,
607  * not special file (as before).
608  */
609 /* ARGSUSED */
610 int
611 sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval)
612 {
613 	/* {
614 		syscallarg(const char *) path;
615 		syscallarg(int) flags;
616 	} */
617 	struct vnode *vp;
618 	struct mount *mp;
619 	int error;
620 	struct nameidata nd;
621 
622 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
623 	    SCARG(uap, path));
624 	if ((error = namei(&nd)) != 0)
625 		return (error);
626 	vp = nd.ni_vp;
627 	mp = vp->v_mount;
628 	atomic_inc_uint(&mp->mnt_refcnt);
629 	VOP_UNLOCK(vp, 0);
630 
631 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
632 	    KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
633 	if (error) {
634 		vrele(vp);
635 		vfs_destroy(mp);
636 		return (error);
637 	}
638 
639 	/*
640 	 * Don't allow unmounting the root file system.
641 	 */
642 	if (mp->mnt_flag & MNT_ROOTFS) {
643 		vrele(vp);
644 		vfs_destroy(mp);
645 		return (EINVAL);
646 	}
647 
648 	/*
649 	 * Must be the root of the filesystem
650 	 */
651 	if ((vp->v_vflag & VV_ROOT) == 0) {
652 		vrele(vp);
653 		vfs_destroy(mp);
654 		return (EINVAL);
655 	}
656 
657 	vrele(vp);
658 	error = dounmount(mp, SCARG(uap, flags), l);
659 	vfs_destroy(mp);
660 	return error;
661 }
662 
/*
 * Do the actual file system unmount.
 *
 * mp		mount to remove
 * flags	MNT_* flags; MNT_FORCE continues after a failed VFS_SYNC()
 *		and is passed on to VFS_UNMOUNT()
 * l		calling lwp; its credentials are used for VFS_SYNC()
 *
 * Returns 0 on success, ENOENT if the mount is already marked
 * IMNT_GONE, or the error from veriexec/VFS_SYNC()/VFS_UNMOUNT().
 * On failure the async flag, the syncer vnode and IMNT_UNMOUNT are
 * put back as they were.
 *
 * => Caller holds a reference to the mount, explicitly for dounmount().
 */
int
dounmount(struct mount *mp, int flags, struct lwp *l)
{
	struct vnode *coveredvp;
	int error;
	int async;
	int used_syncer;

#if NVERIEXEC > 0
	error = veriexec_unmountchk(mp);
	if (error)
		return (error);
#endif /* NVERIEXEC > 0 */

	/*
	 * XXX Freeze syncer.  Must do this before locking the
	 * mount point.  See the longer comment below for details.
	 */
	mutex_enter(&syncer_mutex);
	rw_enter(&mp->mnt_unmounting, RW_WRITER);
	if ((mp->mnt_iflag & IMNT_GONE) != 0) {
		/* Somebody else already unmounted this file system. */
		rw_exit(&mp->mnt_unmounting);
		mutex_exit(&syncer_mutex);
		return ENOENT;
	}

	used_syncer = (mp->mnt_syncer != NULL);

	/*
	 * XXX Syncer must be frozen when we get here.  This should really
	 * be done on a per-mountpoint basis, but the syncer doesn't work
	 * like that.
	 *
	 * syncer_mutex is taken above, before the mount is locked,
	 * because the syncer itself acquires locks in
	 * syncer_mutex -> vfs_busy order, and we must preserve that
	 * order to avoid deadlock.
	 *
	 * So, if the file system did not use the syncer, now is
	 * the time to release the syncer_mutex.
	 */
	if (used_syncer == 0)
		mutex_exit(&syncer_mutex);

	mp->mnt_iflag |= IMNT_UNMOUNT;
	/* Remember MNT_ASYNC so it can be restored if the unmount fails. */
	async = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	cache_purgevfs(mp);	/* remove cache entries for this file sys */
	if (mp->mnt_syncer != NULL)
		vfs_deallocate_syncvnode(mp);
	error = 0;
	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
		error = VFS_SYNC(mp, MNT_WAIT, l->l_cred);
	}
	vfs_scrubvnlist(mp);
	/* A failed sync only stops the unmount when it is not forced. */
	if (error == 0 || (flags & MNT_FORCE))
		error = VFS_UNMOUNT(mp, flags);
	if (error) {
		/* Back out: restore syncer vnode, flags and locks. */
		if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
			(void) vfs_allocate_syncvnode(mp);
		mp->mnt_iflag &= ~IMNT_UNMOUNT;
		mp->mnt_flag |= async;
		rw_exit(&mp->mnt_unmounting);
		if (used_syncer)
			mutex_exit(&syncer_mutex);
		return (error);
	}
	vfs_scrubvnlist(mp);
	/* Detach from the covered vnode and the mount list. */
	mutex_enter(&mountlist_lock);
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP)
		coveredvp->v_mountedhere = NULL;
	CIRCLEQ_REMOVE(&mountlist, mp, mnt_list);
	mp->mnt_iflag |= IMNT_GONE;
	mutex_exit(&mountlist_lock);
	if (TAILQ_FIRST(&mp->mnt_vnodelist) != NULL)
		panic("unmount: dangling vnode");
	if (used_syncer)
		mutex_exit(&syncer_mutex);
	vfs_hooks_unmount(mp);
	rw_exit(&mp->mnt_unmounting);
	vfs_destroy(mp);	/* reference from mount() */
	if (coveredvp != NULLVP)
		vrele(coveredvp);
	return (0);
}
753 
754 /*
755  * Sync each mounted filesystem.
756  */
757 #ifdef DEBUG
758 int syncprt = 0;
759 struct ctldebug debug0 = { "syncprt", &syncprt };
760 #endif
761 
762 /* ARGSUSED */
763 int
764 sys_sync(struct lwp *l, const void *v, register_t *retval)
765 {
766 	struct mount *mp, *nmp;
767 	int asyncflag;
768 
769 	if (l == NULL)
770 		l = &lwp0;
771 
772 	mutex_enter(&mountlist_lock);
773 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
774 	     mp = nmp) {
775 		if (vfs_busy(mp, &nmp)) {
776 			continue;
777 		}
778 		mutex_enter(&mp->mnt_updating);
779 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
780 			asyncflag = mp->mnt_flag & MNT_ASYNC;
781 			mp->mnt_flag &= ~MNT_ASYNC;
782 			VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
783 			if (asyncflag)
784 				 mp->mnt_flag |= MNT_ASYNC;
785 		}
786 		mutex_exit(&mp->mnt_updating);
787 		vfs_unbusy(mp, false, &nmp);
788 	}
789 	mutex_exit(&mountlist_lock);
790 #ifdef DEBUG
791 	if (syncprt)
792 		vfs_bufstats();
793 #endif /* DEBUG */
794 	return (0);
795 }
796 
797 /*
798  * Change filesystem quotas.
799  */
800 /* ARGSUSED */
801 int
802 sys_quotactl(struct lwp *l, const struct sys_quotactl_args *uap, register_t *retval)
803 {
804 	/* {
805 		syscallarg(const char *) path;
806 		syscallarg(int) cmd;
807 		syscallarg(int) uid;
808 		syscallarg(void *) arg;
809 	} */
810 	struct mount *mp;
811 	int error;
812 	struct vnode *vp;
813 
814 	error = namei_simple_user(SCARG(uap, path),
815 				NSM_FOLLOW_TRYEMULROOT, &vp);
816 	if (error != 0)
817 		return (error);
818 	mp = vp->v_mount;
819 	error = VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid),
820 	    SCARG(uap, arg));
821 	vrele(vp);
822 	return (error);
823 }
824 
825 int
826 dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
827     int root)
828 {
829 	struct cwdinfo *cwdi = l->l_proc->p_cwdi;
830 	int error = 0;
831 
832 	/*
833 	 * If MNT_NOWAIT or MNT_LAZY is specified, do not
834 	 * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
835 	 * overrides MNT_NOWAIT.
836 	 */
837 	if (flags == MNT_NOWAIT	|| flags == MNT_LAZY ||
838 	    (flags != MNT_WAIT && flags != 0)) {
839 		memcpy(sp, &mp->mnt_stat, sizeof(*sp));
840 		goto done;
841 	}
842 
843 	/* Get the filesystem stats now */
844 	memset(sp, 0, sizeof(*sp));
845 	if ((error = VFS_STATVFS(mp, sp)) != 0) {
846 		return error;
847 	}
848 
849 	if (cwdi->cwdi_rdir == NULL)
850 		(void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
851 done:
852 	if (cwdi->cwdi_rdir != NULL) {
853 		size_t len;
854 		char *bp;
855 		char c;
856 		char *path = PNBUF_GET();
857 
858 		bp = path + MAXPATHLEN;
859 		*--bp = '\0';
860 		rw_enter(&cwdi->cwdi_lock, RW_READER);
861 		error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
862 		    MAXPATHLEN / 2, 0, l);
863 		rw_exit(&cwdi->cwdi_lock);
864 		if (error) {
865 			PNBUF_PUT(path);
866 			return error;
867 		}
868 		len = strlen(bp);
869 		if (len != 1) {
870 			/*
871 			 * for mount points that are below our root, we can see
872 			 * them, so we fix up the pathname and return them. The
873 			 * rest we cannot see, so we don't allow viewing the
874 			 * data.
875 			 */
876 			if (strncmp(bp, sp->f_mntonname, len) == 0 &&
877 			    ((c = sp->f_mntonname[len]) == '/' || c == '\0')) {
878 				(void)strlcpy(sp->f_mntonname,
879 				    c == '\0' ? "/" : &sp->f_mntonname[len],
880 				    sizeof(sp->f_mntonname));
881 			} else {
882 				if (root)
883 					(void)strlcpy(sp->f_mntonname, "/",
884 					    sizeof(sp->f_mntonname));
885 				else
886 					error = EPERM;
887 			}
888 		}
889 		PNBUF_PUT(path);
890 	}
891 	sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
892 	return error;
893 }
894 
895 /*
896  * Get filesystem statistics by path.
897  */
898 int
899 do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
900 {
901 	struct mount *mp;
902 	int error;
903 	struct vnode *vp;
904 
905 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
906 	if (error != 0)
907 		return error;
908 	mp = vp->v_mount;
909 	error = dostatvfs(mp, sb, l, flags, 1);
910 	vrele(vp);
911 	return error;
912 }
913 
914 /* ARGSUSED */
915 int
916 sys_statvfs1(struct lwp *l, const struct sys_statvfs1_args *uap, register_t *retval)
917 {
918 	/* {
919 		syscallarg(const char *) path;
920 		syscallarg(struct statvfs *) buf;
921 		syscallarg(int) flags;
922 	} */
923 	struct statvfs *sb;
924 	int error;
925 
926 	sb = STATVFSBUF_GET();
927 	error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
928 	if (error == 0)
929 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
930 	STATVFSBUF_PUT(sb);
931 	return error;
932 }
933 
934 /*
935  * Get filesystem statistics by fd.
936  */
937 int
938 do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
939 {
940 	file_t *fp;
941 	struct mount *mp;
942 	int error;
943 
944 	/* fd_getvnode() will use the descriptor for us */
945 	if ((error = fd_getvnode(fd, &fp)) != 0)
946 		return (error);
947 	mp = ((struct vnode *)fp->f_data)->v_mount;
948 	error = dostatvfs(mp, sb, curlwp, flags, 1);
949 	fd_putfile(fd);
950 	return error;
951 }
952 
953 /* ARGSUSED */
954 int
955 sys_fstatvfs1(struct lwp *l, const struct sys_fstatvfs1_args *uap, register_t *retval)
956 {
957 	/* {
958 		syscallarg(int) fd;
959 		syscallarg(struct statvfs *) buf;
960 		syscallarg(int) flags;
961 	} */
962 	struct statvfs *sb;
963 	int error;
964 
965 	sb = STATVFSBUF_GET();
966 	error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
967 	if (error == 0)
968 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
969 	STATVFSBUF_PUT(sb);
970 	return error;
971 }
972 
973 
974 /*
975  * Get statistics on all filesystems.
976  */
977 int
978 do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
979     int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
980     register_t *retval)
981 {
982 	int root = 0;
983 	struct proc *p = l->l_proc;
984 	struct mount *mp, *nmp;
985 	struct statvfs *sb;
986 	size_t count, maxcount;
987 	int error = 0;
988 
989 	sb = STATVFSBUF_GET();
990 	maxcount = bufsize / entry_sz;
991 	mutex_enter(&mountlist_lock);
992 	count = 0;
993 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
994 	     mp = nmp) {
995 		if (vfs_busy(mp, &nmp)) {
996 			continue;
997 		}
998 		if (sfsp && count < maxcount) {
999 			error = dostatvfs(mp, sb, l, flags, 0);
1000 			if (error) {
1001 				vfs_unbusy(mp, false, &nmp);
1002 				error = 0;
1003 				continue;
1004 			}
1005 			error = copyfn(sb, sfsp, entry_sz);
1006 			if (error) {
1007 				vfs_unbusy(mp, false, NULL);
1008 				goto out;
1009 			}
1010 			sfsp = (char *)sfsp + entry_sz;
1011 			root |= strcmp(sb->f_mntonname, "/") == 0;
1012 		}
1013 		count++;
1014 		vfs_unbusy(mp, false, &nmp);
1015 	}
1016 	mutex_exit(&mountlist_lock);
1017 
1018 	if (root == 0 && p->p_cwdi->cwdi_rdir) {
1019 		/*
1020 		 * fake a root entry
1021 		 */
1022 		error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
1023 		    sb, l, flags, 1);
1024 		if (error != 0)
1025 			goto out;
1026 		if (sfsp) {
1027 			error = copyfn(sb, sfsp, entry_sz);
1028 			if (error != 0)
1029 				goto out;
1030 		}
1031 		count++;
1032 	}
1033 	if (sfsp && count > maxcount)
1034 		*retval = maxcount;
1035 	else
1036 		*retval = count;
1037 out:
1038 	STATVFSBUF_PUT(sb);
1039 	return error;
1040 }
1041 
1042 int
1043 sys_getvfsstat(struct lwp *l, const struct sys_getvfsstat_args *uap, register_t *retval)
1044 {
1045 	/* {
1046 		syscallarg(struct statvfs *) buf;
1047 		syscallarg(size_t) bufsize;
1048 		syscallarg(int) flags;
1049 	} */
1050 
1051 	return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
1052 	    SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
1053 }
1054 
/*
 * Change current working directory to a given file descriptor.
 *
 * The descriptor must refer to a directory on which the caller has
 * VEXEC access.  If file systems are mounted on it, the root vnode of
 * the topmost mount is used instead.  For a process with a root
 * directory set, the target must pass vn_isunder(), otherwise EPERM
 * is returned.  Returns 0 or an errno value.
 */
/* ARGSUSED */
int
sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) fd;
	} */
	struct proc *p = l->l_proc;
	struct cwdinfo *cwdi;
	struct vnode *vp, *tdp;
	struct mount *mp;
	file_t *fp;
	int error, fd;

	/* fd_getvnode() will use the descriptor for us */
	fd = SCARG(uap, fd);
	if ((error = fd_getvnode(fd, &fp)) != 0)
		return (error);
	vp = fp->f_data;

	/* Take our own reference; it ends up in cwdi_cdir on success. */
	VREF(vp);
	vn_lock(vp,  LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type != VDIR)
		error = ENOTDIR;
	else
		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
	if (error) {
		vput(vp);
		goto out;
	}
	/*
	 * Step across mount points: swap vp for the root vnode of
	 * whatever is mounted on it until nothing is mounted there.
	 */
	while ((mp = vp->v_mountedhere) != NULL) {
		error = vfs_busy(mp, NULL);
		vput(vp);
		if (error != 0)
			goto out;
		error = VFS_ROOT(mp, &tdp);
		vfs_unbusy(mp, false, NULL);
		if (error)
			goto out;
		vp = tdp;
	}
	/* Keep the reference, drop only the lock. */
	VOP_UNLOCK(vp, 0);

	/*
	 * Disallow changing to a directory not under the process's
	 * current root directory (if there is one).
	 */
	cwdi = p->p_cwdi;
	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
	if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
		vrele(vp);
		error = EPERM;	/* operation not permitted */
	} else {
		/* Release the old cwd and install the new one. */
		vrele(cwdi->cwdi_cdir);
		cwdi->cwdi_cdir = vp;
	}
	rw_exit(&cwdi->cwdi_lock);

 out:
	fd_putfile(fd);
	return (error);
}
1120 
1121 /*
1122  * Change this process's notion of the root directory to a given file
1123  * descriptor.
1124  */
1125 int
1126 sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval)
1127 {
1128 	struct proc *p = l->l_proc;
1129 	struct cwdinfo *cwdi;
1130 	struct vnode	*vp;
1131 	file_t	*fp;
1132 	int		 error, fd = SCARG(uap, fd);
1133 
1134 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1135  	    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
1136 		return error;
1137 	/* fd_getvnode() will use the descriptor for us */
1138 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
1139 		return error;
1140 	vp = fp->f_data;
1141 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1142 	if (vp->v_type != VDIR)
1143 		error = ENOTDIR;
1144 	else
1145 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1146 	VOP_UNLOCK(vp, 0);
1147 	if (error)
1148 		goto out;
1149 	VREF(vp);
1150 
1151 	/*
1152 	 * Prevent escaping from chroot by putting the root under
1153 	 * the working directory.  Silently chdir to / if we aren't
1154 	 * already there.
1155 	 */
1156 	cwdi = p->p_cwdi;
1157 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1158 	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1159 		/*
1160 		 * XXX would be more failsafe to change directory to a
1161 		 * deadfs node here instead
1162 		 */
1163 		vrele(cwdi->cwdi_cdir);
1164 		VREF(vp);
1165 		cwdi->cwdi_cdir = vp;
1166 	}
1167 
1168 	if (cwdi->cwdi_rdir != NULL)
1169 		vrele(cwdi->cwdi_rdir);
1170 	cwdi->cwdi_rdir = vp;
1171 	rw_exit(&cwdi->cwdi_lock);
1172 
1173  out:
1174 	fd_putfile(fd);
1175 	return (error);
1176 }
1177 
1178 /*
1179  * Change current working directory (``.'').
1180  */
1181 /* ARGSUSED */
1182 int
1183 sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
1184 {
1185 	/* {
1186 		syscallarg(const char *) path;
1187 	} */
1188 	struct proc *p = l->l_proc;
1189 	struct cwdinfo *cwdi;
1190 	int error;
1191 	struct nameidata nd;
1192 
1193 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
1194 	    SCARG(uap, path));
1195 	if ((error = change_dir(&nd, l)) != 0)
1196 		return (error);
1197 	cwdi = p->p_cwdi;
1198 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1199 	vrele(cwdi->cwdi_cdir);
1200 	cwdi->cwdi_cdir = nd.ni_vp;
1201 	rw_exit(&cwdi->cwdi_lock);
1202 	return (0);
1203 }
1204 
1205 /*
1206  * Change notion of root (``/'') directory.
1207  */
1208 /* ARGSUSED */
1209 int
1210 sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval)
1211 {
1212 	/* {
1213 		syscallarg(const char *) path;
1214 	} */
1215 	struct proc *p = l->l_proc;
1216 	struct cwdinfo *cwdi;
1217 	struct vnode *vp;
1218 	int error;
1219 	struct nameidata nd;
1220 
1221 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1222 	    KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
1223 		return (error);
1224 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
1225 	    SCARG(uap, path));
1226 	if ((error = change_dir(&nd, l)) != 0)
1227 		return (error);
1228 
1229 	cwdi = p->p_cwdi;
1230 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1231 	if (cwdi->cwdi_rdir != NULL)
1232 		vrele(cwdi->cwdi_rdir);
1233 	vp = nd.ni_vp;
1234 	cwdi->cwdi_rdir = vp;
1235 
1236 	/*
1237 	 * Prevent escaping from chroot by putting the root under
1238 	 * the working directory.  Silently chdir to / if we aren't
1239 	 * already there.
1240 	 */
1241 	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1242 		/*
1243 		 * XXX would be more failsafe to change directory to a
1244 		 * deadfs node here instead
1245 		 */
1246 		vrele(cwdi->cwdi_cdir);
1247 		VREF(vp);
1248 		cwdi->cwdi_cdir = vp;
1249 	}
1250 	rw_exit(&cwdi->cwdi_lock);
1251 
1252 	return (0);
1253 }
1254 
1255 /*
1256  * Common routine for chroot and chdir.
1257  */
1258 static int
1259 change_dir(struct nameidata *ndp, struct lwp *l)
1260 {
1261 	struct vnode *vp;
1262 	int error;
1263 
1264 	if ((error = namei(ndp)) != 0)
1265 		return (error);
1266 	vp = ndp->ni_vp;
1267 	if (vp->v_type != VDIR)
1268 		error = ENOTDIR;
1269 	else
1270 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1271 
1272 	if (error)
1273 		vput(vp);
1274 	else
1275 		VOP_UNLOCK(vp, 0);
1276 	return (error);
1277 }
1278 
1279 /*
1280  * Check permissions, allocate an open file structure,
1281  * and call the device open routine if any.
1282  */
1283 int
1284 sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1285 {
1286 	/* {
1287 		syscallarg(const char *) path;
1288 		syscallarg(int) flags;
1289 		syscallarg(int) mode;
1290 	} */
1291 	struct proc *p = l->l_proc;
1292 	struct cwdinfo *cwdi = p->p_cwdi;
1293 	file_t *fp;
1294 	struct vnode *vp;
1295 	int flags, cmode;
1296 	int type, indx, error;
1297 	struct flock lf;
1298 	struct nameidata nd;
1299 
1300 	flags = FFLAGS(SCARG(uap, flags));
1301 	if ((flags & (FREAD | FWRITE)) == 0)
1302 		return (EINVAL);
1303 	if ((error = fd_allocfile(&fp, &indx)) != 0)
1304 		return (error);
1305 	/* We're going to read cwdi->cwdi_cmask unlocked here. */
1306 	cmode = ((SCARG(uap, mode) &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1307 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
1308 	    SCARG(uap, path));
1309 	l->l_dupfd = -indx - 1;			/* XXX check for fdopen */
1310 	if ((error = vn_open(&nd, flags, cmode)) != 0) {
1311 		fd_abort(p, fp, indx);
1312 		if ((error == EDUPFD || error == EMOVEFD) &&
1313 		    l->l_dupfd >= 0 &&			/* XXX from fdopen */
1314 		    (error =
1315 			fd_dupopen(l->l_dupfd, &indx, flags, error)) == 0) {
1316 			*retval = indx;
1317 			return (0);
1318 		}
1319 		if (error == ERESTART)
1320 			error = EINTR;
1321 		return (error);
1322 	}
1323 
1324 	l->l_dupfd = 0;
1325 	vp = nd.ni_vp;
1326 	fp->f_flag = flags & FMASK;
1327 	fp->f_type = DTYPE_VNODE;
1328 	fp->f_ops = &vnops;
1329 	fp->f_data = vp;
1330 	if (flags & (O_EXLOCK | O_SHLOCK)) {
1331 		lf.l_whence = SEEK_SET;
1332 		lf.l_start = 0;
1333 		lf.l_len = 0;
1334 		if (flags & O_EXLOCK)
1335 			lf.l_type = F_WRLCK;
1336 		else
1337 			lf.l_type = F_RDLCK;
1338 		type = F_FLOCK;
1339 		if ((flags & FNONBLOCK) == 0)
1340 			type |= F_WAIT;
1341 		VOP_UNLOCK(vp, 0);
1342 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
1343 		if (error) {
1344 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
1345 			fd_abort(p, fp, indx);
1346 			return (error);
1347 		}
1348 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1349 		atomic_or_uint(&fp->f_flag, FHASLOCK);
1350 	}
1351 	VOP_UNLOCK(vp, 0);
1352 	*retval = indx;
1353 	fd_affix(p, fp, indx);
1354 	return (0);
1355 }
1356 
1357 static void
1358 vfs__fhfree(fhandle_t *fhp)
1359 {
1360 	size_t fhsize;
1361 
1362 	if (fhp == NULL) {
1363 		return;
1364 	}
1365 	fhsize = FHANDLE_SIZE(fhp);
1366 	kmem_free(fhp, fhsize);
1367 }
1368 
1369 /*
1370  * vfs_composefh: compose a filehandle.
1371  */
1372 
1373 int
1374 vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1375 {
1376 	struct mount *mp;
1377 	struct fid *fidp;
1378 	int error;
1379 	size_t needfhsize;
1380 	size_t fidsize;
1381 
1382 	mp = vp->v_mount;
1383 	fidp = NULL;
1384 	if (*fh_size < FHANDLE_SIZE_MIN) {
1385 		fidsize = 0;
1386 	} else {
1387 		fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1388 		if (fhp != NULL) {
1389 			memset(fhp, 0, *fh_size);
1390 			fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1391 			fidp = &fhp->fh_fid;
1392 		}
1393 	}
1394 	error = VFS_VPTOFH(vp, fidp, &fidsize);
1395 	needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1396 	if (error == 0 && *fh_size < needfhsize) {
1397 		error = E2BIG;
1398 	}
1399 	*fh_size = needfhsize;
1400 	return error;
1401 }
1402 
1403 int
1404 vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1405 {
1406 	struct mount *mp;
1407 	fhandle_t *fhp;
1408 	size_t fhsize;
1409 	size_t fidsize;
1410 	int error;
1411 
1412 	*fhpp = NULL;
1413 	mp = vp->v_mount;
1414 	fidsize = 0;
1415 	error = VFS_VPTOFH(vp, NULL, &fidsize);
1416 	KASSERT(error != 0);
1417 	if (error != E2BIG) {
1418 		goto out;
1419 	}
1420 	fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1421 	fhp = kmem_zalloc(fhsize, KM_SLEEP);
1422 	if (fhp == NULL) {
1423 		error = ENOMEM;
1424 		goto out;
1425 	}
1426 	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1427 	error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1428 	if (error == 0) {
1429 		KASSERT((FHANDLE_SIZE(fhp) == fhsize &&
1430 		    FHANDLE_FILEID(fhp)->fid_len == fidsize));
1431 		*fhpp = fhp;
1432 	} else {
1433 		kmem_free(fhp, fhsize);
1434 	}
1435 out:
1436 	return error;
1437 }
1438 
1439 void
1440 vfs_composefh_free(fhandle_t *fhp)
1441 {
1442 
1443 	vfs__fhfree(fhp);
1444 }
1445 
1446 /*
1447  * vfs_fhtovp: lookup a vnode by a filehandle.
1448  */
1449 
1450 int
1451 vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
1452 {
1453 	struct mount *mp;
1454 	int error;
1455 
1456 	*vpp = NULL;
1457 	mp = vfs_getvfs(FHANDLE_FSID(fhp));
1458 	if (mp == NULL) {
1459 		error = ESTALE;
1460 		goto out;
1461 	}
1462 	if (mp->mnt_op->vfs_fhtovp == NULL) {
1463 		error = EOPNOTSUPP;
1464 		goto out;
1465 	}
1466 	error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), vpp);
1467 out:
1468 	return error;
1469 }
1470 
1471 /*
1472  * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
1473  * the needed size.
1474  */
1475 
1476 int
1477 vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
1478 {
1479 	fhandle_t *fhp;
1480 	int error;
1481 
1482 	*fhpp = NULL;
1483 	if (fhsize > FHANDLE_SIZE_MAX) {
1484 		return EINVAL;
1485 	}
1486 	if (fhsize < FHANDLE_SIZE_MIN) {
1487 		return EINVAL;
1488 	}
1489 again:
1490 	fhp = kmem_alloc(fhsize, KM_SLEEP);
1491 	if (fhp == NULL) {
1492 		return ENOMEM;
1493 	}
1494 	error = copyin(ufhp, fhp, fhsize);
1495 	if (error == 0) {
1496 		/* XXX this check shouldn't be here */
1497 		if (FHANDLE_SIZE(fhp) == fhsize) {
1498 			*fhpp = fhp;
1499 			return 0;
1500 		} else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
1501 			/*
1502 			 * a kludge for nfsv2 padded handles.
1503 			 */
1504 			size_t sz;
1505 
1506 			sz = FHANDLE_SIZE(fhp);
1507 			kmem_free(fhp, fhsize);
1508 			fhsize = sz;
1509 			goto again;
1510 		} else {
1511 			/*
1512 			 * userland told us wrong size.
1513 			 */
1514 		    	error = EINVAL;
1515 		}
1516 	}
1517 	kmem_free(fhp, fhsize);
1518 	return error;
1519 }
1520 
1521 void
1522 vfs_copyinfh_free(fhandle_t *fhp)
1523 {
1524 
1525 	vfs__fhfree(fhp);
1526 }
1527 
1528 /*
1529  * Get file handle system call
1530  */
1531 int
1532 sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval)
1533 {
1534 	/* {
1535 		syscallarg(char *) fname;
1536 		syscallarg(fhandle_t *) fhp;
1537 		syscallarg(size_t *) fh_size;
1538 	} */
1539 	struct vnode *vp;
1540 	fhandle_t *fh;
1541 	int error;
1542 	struct nameidata nd;
1543 	size_t sz;
1544 	size_t usz;
1545 
1546 	/*
1547 	 * Must be super user
1548 	 */
1549 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1550 	    0, NULL, NULL, NULL);
1551 	if (error)
1552 		return (error);
1553 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
1554 	    SCARG(uap, fname));
1555 	error = namei(&nd);
1556 	if (error)
1557 		return (error);
1558 	vp = nd.ni_vp;
1559 	error = vfs_composefh_alloc(vp, &fh);
1560 	vput(vp);
1561 	if (error != 0) {
1562 		goto out;
1563 	}
1564 	error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
1565 	if (error != 0) {
1566 		goto out;
1567 	}
1568 	sz = FHANDLE_SIZE(fh);
1569 	error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
1570 	if (error != 0) {
1571 		goto out;
1572 	}
1573 	if (usz >= sz) {
1574 		error = copyout(fh, SCARG(uap, fhp), sz);
1575 	} else {
1576 		error = E2BIG;
1577 	}
1578 out:
1579 	vfs_composefh_free(fh);
1580 	return (error);
1581 }
1582 
1583 /*
1584  * Open a file given a file handle.
1585  *
1586  * Check permissions, allocate an open file structure,
1587  * and call the device open routine if any.
1588  */
1589 
1590 int
1591 dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
1592     register_t *retval)
1593 {
1594 	file_t *fp;
1595 	struct vnode *vp = NULL;
1596 	kauth_cred_t cred = l->l_cred;
1597 	file_t *nfp;
1598 	int type, indx, error=0;
1599 	struct flock lf;
1600 	struct vattr va;
1601 	fhandle_t *fh;
1602 	int flags;
1603 	proc_t *p;
1604 
1605 	p = curproc;
1606 
1607 	/*
1608 	 * Must be super user
1609 	 */
1610 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1611 	    0, NULL, NULL, NULL)))
1612 		return (error);
1613 
1614 	flags = FFLAGS(oflags);
1615 	if ((flags & (FREAD | FWRITE)) == 0)
1616 		return (EINVAL);
1617 	if ((flags & O_CREAT))
1618 		return (EINVAL);
1619 	if ((error = fd_allocfile(&nfp, &indx)) != 0)
1620 		return (error);
1621 	fp = nfp;
1622 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1623 	if (error != 0) {
1624 		goto bad;
1625 	}
1626 	error = vfs_fhtovp(fh, &vp);
1627 	if (error != 0) {
1628 		goto bad;
1629 	}
1630 
1631 	/* Now do an effective vn_open */
1632 
1633 	if (vp->v_type == VSOCK) {
1634 		error = EOPNOTSUPP;
1635 		goto bad;
1636 	}
1637 	error = vn_openchk(vp, cred, flags);
1638 	if (error != 0)
1639 		goto bad;
1640 	if (flags & O_TRUNC) {
1641 		VOP_UNLOCK(vp, 0);			/* XXX */
1642 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
1643 		VATTR_NULL(&va);
1644 		va.va_size = 0;
1645 		error = VOP_SETATTR(vp, &va, cred);
1646 		if (error)
1647 			goto bad;
1648 	}
1649 	if ((error = VOP_OPEN(vp, flags, cred)) != 0)
1650 		goto bad;
1651 	if (flags & FWRITE) {
1652 		mutex_enter(&vp->v_interlock);
1653 		vp->v_writecount++;
1654 		mutex_exit(&vp->v_interlock);
1655 	}
1656 
1657 	/* done with modified vn_open, now finish what sys_open does. */
1658 
1659 	fp->f_flag = flags & FMASK;
1660 	fp->f_type = DTYPE_VNODE;
1661 	fp->f_ops = &vnops;
1662 	fp->f_data = vp;
1663 	if (flags & (O_EXLOCK | O_SHLOCK)) {
1664 		lf.l_whence = SEEK_SET;
1665 		lf.l_start = 0;
1666 		lf.l_len = 0;
1667 		if (flags & O_EXLOCK)
1668 			lf.l_type = F_WRLCK;
1669 		else
1670 			lf.l_type = F_RDLCK;
1671 		type = F_FLOCK;
1672 		if ((flags & FNONBLOCK) == 0)
1673 			type |= F_WAIT;
1674 		VOP_UNLOCK(vp, 0);
1675 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
1676 		if (error) {
1677 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
1678 			fd_abort(p, fp, indx);
1679 			return (error);
1680 		}
1681 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1682 		atomic_or_uint(&fp->f_flag, FHASLOCK);
1683 	}
1684 	VOP_UNLOCK(vp, 0);
1685 	*retval = indx;
1686 	fd_affix(p, fp, indx);
1687 	vfs_copyinfh_free(fh);
1688 	return (0);
1689 
1690 bad:
1691 	fd_abort(p, fp, indx);
1692 	if (vp != NULL)
1693 		vput(vp);
1694 	vfs_copyinfh_free(fh);
1695 	return (error);
1696 }
1697 
1698 int
1699 sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval)
1700 {
1701 	/* {
1702 		syscallarg(const void *) fhp;
1703 		syscallarg(size_t) fh_size;
1704 		syscallarg(int) flags;
1705 	} */
1706 
1707 	return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
1708 	    SCARG(uap, flags), retval);
1709 }
1710 
1711 int
1712 do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
1713 {
1714 	int error;
1715 	fhandle_t *fh;
1716 	struct vnode *vp;
1717 
1718 	/*
1719 	 * Must be super user
1720 	 */
1721 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1722 	    0, NULL, NULL, NULL)))
1723 		return (error);
1724 
1725 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1726 	if (error != 0)
1727 		return error;
1728 
1729 	error = vfs_fhtovp(fh, &vp);
1730 	vfs_copyinfh_free(fh);
1731 	if (error != 0)
1732 		return error;
1733 
1734 	error = vn_stat(vp, sb);
1735 	vput(vp);
1736 	return error;
1737 }
1738 
1739 
1740 /* ARGSUSED */
1741 int
1742 sys___fhstat50(struct lwp *l, const struct sys___fhstat50_args *uap, register_t *retval)
1743 {
1744 	/* {
1745 		syscallarg(const void *) fhp;
1746 		syscallarg(size_t) fh_size;
1747 		syscallarg(struct stat *) sb;
1748 	} */
1749 	struct stat sb;
1750 	int error;
1751 
1752 	error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
1753 	if (error)
1754 		return error;
1755 	return copyout(&sb, SCARG(uap, sb), sizeof(sb));
1756 }
1757 
1758 int
1759 do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb,
1760     int flags)
1761 {
1762 	fhandle_t *fh;
1763 	struct mount *mp;
1764 	struct vnode *vp;
1765 	int error;
1766 
1767 	/*
1768 	 * Must be super user
1769 	 */
1770 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1771 	    0, NULL, NULL, NULL)))
1772 		return error;
1773 
1774 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1775 	if (error != 0)
1776 		return error;
1777 
1778 	error = vfs_fhtovp(fh, &vp);
1779 	vfs_copyinfh_free(fh);
1780 	if (error != 0)
1781 		return error;
1782 
1783 	mp = vp->v_mount;
1784 	error = dostatvfs(mp, sb, l, flags, 1);
1785 	vput(vp);
1786 	return error;
1787 }
1788 
1789 /* ARGSUSED */
1790 int
1791 sys___fhstatvfs140(struct lwp *l, const struct sys___fhstatvfs140_args *uap, register_t *retval)
1792 {
1793 	/* {
1794 		syscallarg(const void *) fhp;
1795 		syscallarg(size_t) fh_size;
1796 		syscallarg(struct statvfs *) buf;
1797 		syscallarg(int)	flags;
1798 	} */
1799 	struct statvfs *sb = STATVFSBUF_GET();
1800 	int error;
1801 
1802 	error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
1803 	    SCARG(uap, flags));
1804 	if (error == 0)
1805 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1806 	STATVFSBUF_PUT(sb);
1807 	return error;
1808 }
1809 
1810 /*
1811  * Create a special file.
1812  */
1813 /* ARGSUSED */
1814 int
1815 sys___mknod50(struct lwp *l, const struct sys___mknod50_args *uap,
1816     register_t *retval)
1817 {
1818 	/* {
1819 		syscallarg(const char *) path;
1820 		syscallarg(mode_t) mode;
1821 		syscallarg(dev_t) dev;
1822 	} */
1823 	return do_sys_mknod(l, SCARG(uap, path), SCARG(uap, mode),
1824 	    SCARG(uap, dev), retval);
1825 }
1826 
1827 int
1828 do_sys_mknod(struct lwp *l, const char *pathname, mode_t mode, dev_t dev,
1829     register_t *retval)
1830 {
1831 	struct proc *p = l->l_proc;
1832 	struct vnode *vp;
1833 	struct vattr vattr;
1834 	int error, optype;
1835 	struct nameidata nd;
1836 	char *path;
1837 	const char *cpath;
1838 	enum uio_seg seg = UIO_USERSPACE;
1839 
1840 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
1841 	    0, NULL, NULL, NULL)) != 0)
1842 		return (error);
1843 
1844 	optype = VOP_MKNOD_DESCOFFSET;
1845 
1846 	VERIEXEC_PATH_GET(pathname, seg, cpath, path);
1847 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, seg, cpath);
1848 
1849 	if ((error = namei(&nd)) != 0)
1850 		goto out;
1851 	vp = nd.ni_vp;
1852 	if (vp != NULL)
1853 		error = EEXIST;
1854 	else {
1855 		VATTR_NULL(&vattr);
1856 		/* We will read cwdi->cwdi_cmask unlocked. */
1857 		vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
1858 		vattr.va_rdev = dev;
1859 
1860 		switch (mode & S_IFMT) {
1861 		case S_IFMT:	/* used by badsect to flag bad sectors */
1862 			vattr.va_type = VBAD;
1863 			break;
1864 		case S_IFCHR:
1865 			vattr.va_type = VCHR;
1866 			break;
1867 		case S_IFBLK:
1868 			vattr.va_type = VBLK;
1869 			break;
1870 		case S_IFWHT:
1871 			optype = VOP_WHITEOUT_DESCOFFSET;
1872 			break;
1873 		case S_IFREG:
1874 #if NVERIEXEC > 0
1875 			error = veriexec_openchk(l, nd.ni_vp, nd.ni_dirp,
1876 			    O_CREAT);
1877 #endif /* NVERIEXEC > 0 */
1878 			vattr.va_type = VREG;
1879 			vattr.va_rdev = VNOVAL;
1880 			optype = VOP_CREATE_DESCOFFSET;
1881 			break;
1882 		default:
1883 			error = EINVAL;
1884 			break;
1885 		}
1886 	}
1887 	if (!error) {
1888 		switch (optype) {
1889 		case VOP_WHITEOUT_DESCOFFSET:
1890 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1891 			if (error)
1892 				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1893 			vput(nd.ni_dvp);
1894 			break;
1895 
1896 		case VOP_MKNOD_DESCOFFSET:
1897 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1898 						&nd.ni_cnd, &vattr);
1899 			if (error == 0)
1900 				vput(nd.ni_vp);
1901 			break;
1902 
1903 		case VOP_CREATE_DESCOFFSET:
1904 			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
1905 						&nd.ni_cnd, &vattr);
1906 			if (error == 0)
1907 				vput(nd.ni_vp);
1908 			break;
1909 		}
1910 	} else {
1911 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1912 		if (nd.ni_dvp == vp)
1913 			vrele(nd.ni_dvp);
1914 		else
1915 			vput(nd.ni_dvp);
1916 		if (vp)
1917 			vrele(vp);
1918 	}
1919 out:
1920 	VERIEXEC_PATH_PUT(path);
1921 	return (error);
1922 }
1923 
1924 /*
1925  * Create a named pipe.
1926  */
1927 /* ARGSUSED */
1928 int
1929 sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval)
1930 {
1931 	/* {
1932 		syscallarg(const char *) path;
1933 		syscallarg(int) mode;
1934 	} */
1935 	struct proc *p = l->l_proc;
1936 	struct vattr vattr;
1937 	int error;
1938 	struct nameidata nd;
1939 
1940 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, UIO_USERSPACE,
1941 	    SCARG(uap, path));
1942 	if ((error = namei(&nd)) != 0)
1943 		return (error);
1944 	if (nd.ni_vp != NULL) {
1945 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1946 		if (nd.ni_dvp == nd.ni_vp)
1947 			vrele(nd.ni_dvp);
1948 		else
1949 			vput(nd.ni_dvp);
1950 		vrele(nd.ni_vp);
1951 		return (EEXIST);
1952 	}
1953 	VATTR_NULL(&vattr);
1954 	vattr.va_type = VFIFO;
1955 	/* We will read cwdi->cwdi_cmask unlocked. */
1956 	vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
1957 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1958 	if (error == 0)
1959 		vput(nd.ni_vp);
1960 	return (error);
1961 }
1962 
1963 /*
1964  * Make a hard file link.
1965  */
1966 /* ARGSUSED */
1967 int
1968 sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
1969 {
1970 	/* {
1971 		syscallarg(const char *) path;
1972 		syscallarg(const char *) link;
1973 	} */
1974 	struct vnode *vp;
1975 	struct nameidata nd;
1976 	int error;
1977 
1978 	error = namei_simple_user(SCARG(uap, path),
1979 				NSM_FOLLOW_TRYEMULROOT, &vp);
1980 	if (error != 0)
1981 		return (error);
1982 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, UIO_USERSPACE,
1983 	    SCARG(uap, link));
1984 	if ((error = namei(&nd)) != 0)
1985 		goto out;
1986 	if (nd.ni_vp) {
1987 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1988 		if (nd.ni_dvp == nd.ni_vp)
1989 			vrele(nd.ni_dvp);
1990 		else
1991 			vput(nd.ni_dvp);
1992 		vrele(nd.ni_vp);
1993 		error = EEXIST;
1994 		goto out;
1995 	}
1996 	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1997 out:
1998 	vrele(vp);
1999 	return (error);
2000 }
2001 
2002 /*
2003  * Make a symbolic link.
2004  */
2005 /* ARGSUSED */
2006 int
2007 sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
2008 {
2009 	/* {
2010 		syscallarg(const char *) path;
2011 		syscallarg(const char *) link;
2012 	} */
2013 	struct proc *p = l->l_proc;
2014 	struct vattr vattr;
2015 	char *path;
2016 	int error;
2017 	struct nameidata nd;
2018 
2019 	path = PNBUF_GET();
2020 	error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL);
2021 	if (error)
2022 		goto out;
2023 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, UIO_USERSPACE,
2024 	    SCARG(uap, link));
2025 	if ((error = namei(&nd)) != 0)
2026 		goto out;
2027 	if (nd.ni_vp) {
2028 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2029 		if (nd.ni_dvp == nd.ni_vp)
2030 			vrele(nd.ni_dvp);
2031 		else
2032 			vput(nd.ni_dvp);
2033 		vrele(nd.ni_vp);
2034 		error = EEXIST;
2035 		goto out;
2036 	}
2037 	VATTR_NULL(&vattr);
2038 	vattr.va_type = VLNK;
2039 	/* We will read cwdi->cwdi_cmask unlocked. */
2040 	vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
2041 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
2042 	if (error == 0)
2043 		vput(nd.ni_vp);
2044 out:
2045 	PNBUF_PUT(path);
2046 	return (error);
2047 }
2048 
2049 /*
2050  * Delete a whiteout from the filesystem.
2051  */
2052 /* ARGSUSED */
2053 int
2054 sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval)
2055 {
2056 	/* {
2057 		syscallarg(const char *) path;
2058 	} */
2059 	int error;
2060 	struct nameidata nd;
2061 
2062 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT,
2063 	    UIO_USERSPACE, SCARG(uap, path));
2064 	error = namei(&nd);
2065 	if (error)
2066 		return (error);
2067 
2068 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2069 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2070 		if (nd.ni_dvp == nd.ni_vp)
2071 			vrele(nd.ni_dvp);
2072 		else
2073 			vput(nd.ni_dvp);
2074 		if (nd.ni_vp)
2075 			vrele(nd.ni_vp);
2076 		return (EEXIST);
2077 	}
2078 	if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
2079 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2080 	vput(nd.ni_dvp);
2081 	return (error);
2082 }
2083 
2084 /*
2085  * Delete a name from the filesystem.
2086  */
2087 /* ARGSUSED */
2088 int
2089 sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval)
2090 {
2091 	/* {
2092 		syscallarg(const char *) path;
2093 	} */
2094 
2095 	return do_sys_unlink(SCARG(uap, path), UIO_USERSPACE);
2096 }
2097 
2098 int
2099 do_sys_unlink(const char *arg, enum uio_seg seg)
2100 {
2101 	struct vnode *vp;
2102 	int error;
2103 	struct nameidata nd;
2104 	char *path;
2105 	const char *cpath;
2106 
2107 	VERIEXEC_PATH_GET(arg, seg, cpath, path);
2108 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, seg, cpath);
2109 
2110 	if ((error = namei(&nd)) != 0)
2111 		goto out;
2112 	vp = nd.ni_vp;
2113 
2114 	/*
2115 	 * The root of a mounted filesystem cannot be deleted.
2116 	 */
2117 	if (vp->v_vflag & VV_ROOT) {
2118 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2119 		if (nd.ni_dvp == vp)
2120 			vrele(nd.ni_dvp);
2121 		else
2122 			vput(nd.ni_dvp);
2123 		vput(vp);
2124 		error = EBUSY;
2125 		goto out;
2126 	}
2127 
2128 #if NVERIEXEC > 0
2129 	/* Handle remove requests for veriexec entries. */
2130 	if ((error = veriexec_removechk(curlwp, nd.ni_vp, nd.ni_dirp)) != 0) {
2131 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2132 		if (nd.ni_dvp == vp)
2133 			vrele(nd.ni_dvp);
2134 		else
2135 			vput(nd.ni_dvp);
2136 		vput(vp);
2137 		goto out;
2138 	}
2139 #endif /* NVERIEXEC > 0 */
2140 
2141 #ifdef FILEASSOC
2142 	(void)fileassoc_file_delete(vp);
2143 #endif /* FILEASSOC */
2144 	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2145 out:
2146 	VERIEXEC_PATH_PUT(path);
2147 	return (error);
2148 }
2149 
2150 /*
2151  * Reposition read/write file offset.
2152  */
2153 int
2154 sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
2155 {
2156 	/* {
2157 		syscallarg(int) fd;
2158 		syscallarg(int) pad;
2159 		syscallarg(off_t) offset;
2160 		syscallarg(int) whence;
2161 	} */
2162 	kauth_cred_t cred = l->l_cred;
2163 	file_t *fp;
2164 	struct vnode *vp;
2165 	struct vattr vattr;
2166 	off_t newoff;
2167 	int error, fd;
2168 
2169 	fd = SCARG(uap, fd);
2170 
2171 	if ((fp = fd_getfile(fd)) == NULL)
2172 		return (EBADF);
2173 
2174 	vp = fp->f_data;
2175 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2176 		error = ESPIPE;
2177 		goto out;
2178 	}
2179 
2180 	switch (SCARG(uap, whence)) {
2181 	case SEEK_CUR:
2182 		newoff = fp->f_offset + SCARG(uap, offset);
2183 		break;
2184 	case SEEK_END:
2185 		error = VOP_GETATTR(vp, &vattr, cred);
2186 		if (error) {
2187 			goto out;
2188 		}
2189 		newoff = SCARG(uap, offset) + vattr.va_size;
2190 		break;
2191 	case SEEK_SET:
2192 		newoff = SCARG(uap, offset);
2193 		break;
2194 	default:
2195 		error = EINVAL;
2196 		goto out;
2197 	}
2198 	if ((error = VOP_SEEK(vp, fp->f_offset, newoff, cred)) == 0) {
2199 		*(off_t *)retval = fp->f_offset = newoff;
2200 	}
2201  out:
2202  	fd_putfile(fd);
2203 	return (error);
2204 }
2205 
2206 /*
2207  * Positional read system call.
2208  */
2209 int
2210 sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
2211 {
2212 	/* {
2213 		syscallarg(int) fd;
2214 		syscallarg(void *) buf;
2215 		syscallarg(size_t) nbyte;
2216 		syscallarg(off_t) offset;
2217 	} */
2218 	file_t *fp;
2219 	struct vnode *vp;
2220 	off_t offset;
2221 	int error, fd = SCARG(uap, fd);
2222 
2223 	if ((fp = fd_getfile(fd)) == NULL)
2224 		return (EBADF);
2225 
2226 	if ((fp->f_flag & FREAD) == 0) {
2227 		fd_putfile(fd);
2228 		return (EBADF);
2229 	}
2230 
2231 	vp = fp->f_data;
2232 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2233 		error = ESPIPE;
2234 		goto out;
2235 	}
2236 
2237 	offset = SCARG(uap, offset);
2238 
2239 	/*
2240 	 * XXX This works because no file systems actually
2241 	 * XXX take any action on the seek operation.
2242 	 */
2243 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2244 		goto out;
2245 
2246 	/* dofileread() will unuse the descriptor for us */
2247 	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2248 	    &offset, 0, retval));
2249 
2250  out:
2251 	fd_putfile(fd);
2252 	return (error);
2253 }
2254 
2255 /*
2256  * Positional scatter read system call.
2257  */
2258 int
2259 sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval)
2260 {
2261 	/* {
2262 		syscallarg(int) fd;
2263 		syscallarg(const struct iovec *) iovp;
2264 		syscallarg(int) iovcnt;
2265 		syscallarg(off_t) offset;
2266 	} */
2267 	off_t offset = SCARG(uap, offset);
2268 
2269 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
2270 	    SCARG(uap, iovcnt), &offset, 0, retval);
2271 }
2272 
2273 /*
2274  * Positional write system call.
2275  */
2276 int
2277 sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval)
2278 {
2279 	/* {
2280 		syscallarg(int) fd;
2281 		syscallarg(const void *) buf;
2282 		syscallarg(size_t) nbyte;
2283 		syscallarg(off_t) offset;
2284 	} */
2285 	file_t *fp;
2286 	struct vnode *vp;
2287 	off_t offset;
2288 	int error, fd = SCARG(uap, fd);
2289 
2290 	if ((fp = fd_getfile(fd)) == NULL)
2291 		return (EBADF);
2292 
2293 	if ((fp->f_flag & FWRITE) == 0) {
2294 		fd_putfile(fd);
2295 		return (EBADF);
2296 	}
2297 
2298 	vp = fp->f_data;
2299 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2300 		error = ESPIPE;
2301 		goto out;
2302 	}
2303 
2304 	offset = SCARG(uap, offset);
2305 
2306 	/*
2307 	 * XXX This works because no file systems actually
2308 	 * XXX take any action on the seek operation.
2309 	 */
2310 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2311 		goto out;
2312 
2313 	/* dofilewrite() will unuse the descriptor for us */
2314 	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2315 	    &offset, 0, retval));
2316 
2317  out:
2318 	fd_putfile(fd);
2319 	return (error);
2320 }
2321 
2322 /*
2323  * Positional gather write system call.
2324  */
2325 int
2326 sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval)
2327 {
2328 	/* {
2329 		syscallarg(int) fd;
2330 		syscallarg(const struct iovec *) iovp;
2331 		syscallarg(int) iovcnt;
2332 		syscallarg(off_t) offset;
2333 	} */
2334 	off_t offset = SCARG(uap, offset);
2335 
2336 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
2337 	    SCARG(uap, iovcnt), &offset, 0, retval);
2338 }
2339 
2340 /*
2341  * Check access permissions.
2342  */
2343 int
2344 sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval)
2345 {
2346 	/* {
2347 		syscallarg(const char *) path;
2348 		syscallarg(int) flags;
2349 	} */
2350 	kauth_cred_t cred;
2351 	struct vnode *vp;
2352 	int error, flags;
2353 	struct nameidata nd;
2354 
2355 	cred = kauth_cred_dup(l->l_cred);
2356 	kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
2357 	kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
2358 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
2359 	    SCARG(uap, path));
2360 	/* Override default credentials */
2361 	nd.ni_cnd.cn_cred = cred;
2362 	if ((error = namei(&nd)) != 0)
2363 		goto out;
2364 	vp = nd.ni_vp;
2365 
2366 	/* Flags == 0 means only check for existence. */
2367 	if (SCARG(uap, flags)) {
2368 		flags = 0;
2369 		if (SCARG(uap, flags) & R_OK)
2370 			flags |= VREAD;
2371 		if (SCARG(uap, flags) & W_OK)
2372 			flags |= VWRITE;
2373 		if (SCARG(uap, flags) & X_OK)
2374 			flags |= VEXEC;
2375 
2376 		error = VOP_ACCESS(vp, flags, cred);
2377 		if (!error && (flags & VWRITE))
2378 			error = vn_writechk(vp);
2379 	}
2380 	vput(vp);
2381 out:
2382 	kauth_cred_free(cred);
2383 	return (error);
2384 }
2385 
2386 /*
2387  * Common code for all sys_stat functions, including compat versions.
2388  */
2389 int
2390 do_sys_stat(const char *path, unsigned int nd_flags, struct stat *sb)
2391 {
2392 	int error;
2393 	struct nameidata nd;
2394 
2395 	NDINIT(&nd, LOOKUP, nd_flags | LOCKLEAF | TRYEMULROOT,
2396 	    UIO_USERSPACE, path);
2397 	error = namei(&nd);
2398 	if (error != 0)
2399 		return error;
2400 	error = vn_stat(nd.ni_vp, sb);
2401 	vput(nd.ni_vp);
2402 	return error;
2403 }
2404 
2405 /*
2406  * Get file status; this version follows links.
2407  */
2408 /* ARGSUSED */
2409 int
2410 sys___stat50(struct lwp *l, const struct sys___stat50_args *uap, register_t *retval)
2411 {
2412 	/* {
2413 		syscallarg(const char *) path;
2414 		syscallarg(struct stat *) ub;
2415 	} */
2416 	struct stat sb;
2417 	int error;
2418 
2419 	error = do_sys_stat(SCARG(uap, path), FOLLOW, &sb);
2420 	if (error)
2421 		return error;
2422 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
2423 }
2424 
2425 /*
2426  * Get file status; this version does not follow links.
2427  */
2428 /* ARGSUSED */
2429 int
2430 sys___lstat50(struct lwp *l, const struct sys___lstat50_args *uap, register_t *retval)
2431 {
2432 	/* {
2433 		syscallarg(const char *) path;
2434 		syscallarg(struct stat *) ub;
2435 	} */
2436 	struct stat sb;
2437 	int error;
2438 
2439 	error = do_sys_stat(SCARG(uap, path), NOFOLLOW, &sb);
2440 	if (error)
2441 		return error;
2442 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
2443 }
2444 
2445 /*
2446  * Get configurable pathname variables.
2447  */
2448 /* ARGSUSED */
2449 int
2450 sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap, register_t *retval)
2451 {
2452 	/* {
2453 		syscallarg(const char *) path;
2454 		syscallarg(int) name;
2455 	} */
2456 	int error;
2457 	struct nameidata nd;
2458 
2459 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
2460 	    SCARG(uap, path));
2461 	if ((error = namei(&nd)) != 0)
2462 		return (error);
2463 	error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), retval);
2464 	vput(nd.ni_vp);
2465 	return (error);
2466 }
2467 
2468 /*
2469  * Return target name of a symbolic link.
2470  */
2471 /* ARGSUSED */
2472 int
2473 sys_readlink(struct lwp *l, const struct sys_readlink_args *uap, register_t *retval)
2474 {
2475 	/* {
2476 		syscallarg(const char *) path;
2477 		syscallarg(char *) buf;
2478 		syscallarg(size_t) count;
2479 	} */
2480 	struct vnode *vp;
2481 	struct iovec aiov;
2482 	struct uio auio;
2483 	int error;
2484 	struct nameidata nd;
2485 
2486 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
2487 	    SCARG(uap, path));
2488 	if ((error = namei(&nd)) != 0)
2489 		return (error);
2490 	vp = nd.ni_vp;
2491 	if (vp->v_type != VLNK)
2492 		error = EINVAL;
2493 	else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
2494 	    (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
2495 		aiov.iov_base = SCARG(uap, buf);
2496 		aiov.iov_len = SCARG(uap, count);
2497 		auio.uio_iov = &aiov;
2498 		auio.uio_iovcnt = 1;
2499 		auio.uio_offset = 0;
2500 		auio.uio_rw = UIO_READ;
2501 		KASSERT(l == curlwp);
2502 		auio.uio_vmspace = l->l_proc->p_vmspace;
2503 		auio.uio_resid = SCARG(uap, count);
2504 		error = VOP_READLINK(vp, &auio, l->l_cred);
2505 	}
2506 	vput(vp);
2507 	*retval = SCARG(uap, count) - auio.uio_resid;
2508 	return (error);
2509 }
2510 
2511 /*
2512  * Change flags of a file given a path name.
2513  */
2514 /* ARGSUSED */
2515 int
2516 sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval)
2517 {
2518 	/* {
2519 		syscallarg(const char *) path;
2520 		syscallarg(u_long) flags;
2521 	} */
2522 	struct vnode *vp;
2523 	int error;
2524 
2525 	error = namei_simple_user(SCARG(uap, path),
2526 				NSM_FOLLOW_TRYEMULROOT, &vp);
2527 	if (error != 0)
2528 		return (error);
2529 	error = change_flags(vp, SCARG(uap, flags), l);
2530 	vput(vp);
2531 	return (error);
2532 }
2533 
2534 /*
2535  * Change flags of a file given a file descriptor.
2536  */
2537 /* ARGSUSED */
2538 int
2539 sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval)
2540 {
2541 	/* {
2542 		syscallarg(int) fd;
2543 		syscallarg(u_long) flags;
2544 	} */
2545 	struct vnode *vp;
2546 	file_t *fp;
2547 	int error;
2548 
2549 	/* fd_getvnode() will use the descriptor for us */
2550 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2551 		return (error);
2552 	vp = fp->f_data;
2553 	error = change_flags(vp, SCARG(uap, flags), l);
2554 	VOP_UNLOCK(vp, 0);
2555 	fd_putfile(SCARG(uap, fd));
2556 	return (error);
2557 }
2558 
2559 /*
2560  * Change flags of a file given a path name; this version does
2561  * not follow links.
2562  */
2563 int
2564 sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval)
2565 {
2566 	/* {
2567 		syscallarg(const char *) path;
2568 		syscallarg(u_long) flags;
2569 	} */
2570 	struct vnode *vp;
2571 	int error;
2572 
2573 	error = namei_simple_user(SCARG(uap, path),
2574 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
2575 	if (error != 0)
2576 		return (error);
2577 	error = change_flags(vp, SCARG(uap, flags), l);
2578 	vput(vp);
2579 	return (error);
2580 }
2581 
2582 /*
2583  * Common routine to change flags of a file.
2584  */
2585 int
2586 change_flags(struct vnode *vp, u_long flags, struct lwp *l)
2587 {
2588 	struct vattr vattr;
2589 	int error;
2590 
2591 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2592 	/*
2593 	 * Non-superusers cannot change the flags on devices, even if they
2594 	 * own them.
2595 	 */
2596 	if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, NULL)) {
2597 		if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
2598 			goto out;
2599 		if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
2600 			error = EINVAL;
2601 			goto out;
2602 		}
2603 	}
2604 	VATTR_NULL(&vattr);
2605 	vattr.va_flags = flags;
2606 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2607 out:
2608 	return (error);
2609 }
2610 
2611 /*
2612  * Change mode of a file given path name; this version follows links.
2613  */
2614 /* ARGSUSED */
2615 int
2616 sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
2617 {
2618 	/* {
2619 		syscallarg(const char *) path;
2620 		syscallarg(int) mode;
2621 	} */
2622 	int error;
2623 	struct vnode *vp;
2624 
2625 	error = namei_simple_user(SCARG(uap, path),
2626 				NSM_FOLLOW_TRYEMULROOT, &vp);
2627 	if (error != 0)
2628 		return (error);
2629 
2630 	error = change_mode(vp, SCARG(uap, mode), l);
2631 
2632 	vrele(vp);
2633 	return (error);
2634 }
2635 
2636 /*
2637  * Change mode of a file given a file descriptor.
2638  */
2639 /* ARGSUSED */
2640 int
2641 sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval)
2642 {
2643 	/* {
2644 		syscallarg(int) fd;
2645 		syscallarg(int) mode;
2646 	} */
2647 	file_t *fp;
2648 	int error;
2649 
2650 	/* fd_getvnode() will use the descriptor for us */
2651 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2652 		return (error);
2653 	error = change_mode(fp->f_data, SCARG(uap, mode), l);
2654 	fd_putfile(SCARG(uap, fd));
2655 	return (error);
2656 }
2657 
2658 /*
2659  * Change mode of a file given path name; this version does not follow links.
2660  */
2661 /* ARGSUSED */
2662 int
2663 sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval)
2664 {
2665 	/* {
2666 		syscallarg(const char *) path;
2667 		syscallarg(int) mode;
2668 	} */
2669 	int error;
2670 	struct vnode *vp;
2671 
2672 	error = namei_simple_user(SCARG(uap, path),
2673 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
2674 	if (error != 0)
2675 		return (error);
2676 
2677 	error = change_mode(vp, SCARG(uap, mode), l);
2678 
2679 	vrele(vp);
2680 	return (error);
2681 }
2682 
2683 /*
2684  * Common routine to set mode given a vnode.
2685  */
2686 static int
2687 change_mode(struct vnode *vp, int mode, struct lwp *l)
2688 {
2689 	struct vattr vattr;
2690 	int error;
2691 
2692 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2693 	VATTR_NULL(&vattr);
2694 	vattr.va_mode = mode & ALLPERMS;
2695 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2696 	VOP_UNLOCK(vp, 0);
2697 	return (error);
2698 }
2699 
2700 /*
2701  * Set ownership given a path name; this version follows links.
2702  */
2703 /* ARGSUSED */
2704 int
2705 sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
2706 {
2707 	/* {
2708 		syscallarg(const char *) path;
2709 		syscallarg(uid_t) uid;
2710 		syscallarg(gid_t) gid;
2711 	} */
2712 	int error;
2713 	struct vnode *vp;
2714 
2715 	error = namei_simple_user(SCARG(uap, path),
2716 				NSM_FOLLOW_TRYEMULROOT, &vp);
2717 	if (error != 0)
2718 		return (error);
2719 
2720 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
2721 
2722 	vrele(vp);
2723 	return (error);
2724 }
2725 
2726 /*
2727  * Set ownership given a path name; this version follows links.
2728  * Provides POSIX semantics.
2729  */
2730 /* ARGSUSED */
2731 int
2732 sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval)
2733 {
2734 	/* {
2735 		syscallarg(const char *) path;
2736 		syscallarg(uid_t) uid;
2737 		syscallarg(gid_t) gid;
2738 	} */
2739 	int error;
2740 	struct vnode *vp;
2741 
2742 	error = namei_simple_user(SCARG(uap, path),
2743 				NSM_FOLLOW_TRYEMULROOT, &vp);
2744 	if (error != 0)
2745 		return (error);
2746 
2747 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
2748 
2749 	vrele(vp);
2750 	return (error);
2751 }
2752 
2753 /*
2754  * Set ownership given a file descriptor.
2755  */
2756 /* ARGSUSED */
2757 int
2758 sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval)
2759 {
2760 	/* {
2761 		syscallarg(int) fd;
2762 		syscallarg(uid_t) uid;
2763 		syscallarg(gid_t) gid;
2764 	} */
2765 	int error;
2766 	file_t *fp;
2767 
2768 	/* fd_getvnode() will use the descriptor for us */
2769 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2770 		return (error);
2771 	error = change_owner(fp->f_data, SCARG(uap, uid), SCARG(uap, gid),
2772 	    l, 0);
2773 	fd_putfile(SCARG(uap, fd));
2774 	return (error);
2775 }
2776 
2777 /*
2778  * Set ownership given a file descriptor, providing POSIX/XPG semantics.
2779  */
2780 /* ARGSUSED */
2781 int
2782 sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval)
2783 {
2784 	/* {
2785 		syscallarg(int) fd;
2786 		syscallarg(uid_t) uid;
2787 		syscallarg(gid_t) gid;
2788 	} */
2789 	int error;
2790 	file_t *fp;
2791 
2792 	/* fd_getvnode() will use the descriptor for us */
2793 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2794 		return (error);
2795 	error = change_owner(fp->f_data, SCARG(uap, uid), SCARG(uap, gid),
2796 	    l, 1);
2797 	fd_putfile(SCARG(uap, fd));
2798 	return (error);
2799 }
2800 
2801 /*
2802  * Set ownership given a path name; this version does not follow links.
2803  */
2804 /* ARGSUSED */
2805 int
2806 sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval)
2807 {
2808 	/* {
2809 		syscallarg(const char *) path;
2810 		syscallarg(uid_t) uid;
2811 		syscallarg(gid_t) gid;
2812 	} */
2813 	int error;
2814 	struct vnode *vp;
2815 
2816 	error = namei_simple_user(SCARG(uap, path),
2817 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
2818 	if (error != 0)
2819 		return (error);
2820 
2821 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
2822 
2823 	vrele(vp);
2824 	return (error);
2825 }
2826 
2827 /*
2828  * Set ownership given a path name; this version does not follow links.
2829  * Provides POSIX/XPG semantics.
2830  */
2831 /* ARGSUSED */
2832 int
2833 sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval)
2834 {
2835 	/* {
2836 		syscallarg(const char *) path;
2837 		syscallarg(uid_t) uid;
2838 		syscallarg(gid_t) gid;
2839 	} */
2840 	int error;
2841 	struct vnode *vp;
2842 
2843 	error = namei_simple_user(SCARG(uap, path),
2844 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
2845 	if (error != 0)
2846 		return (error);
2847 
2848 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
2849 
2850 	vrele(vp);
2851 	return (error);
2852 }
2853 
2854 /*
2855  * Common routine to set ownership given a vnode.
2856  */
2857 static int
2858 change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
2859     int posix_semantics)
2860 {
2861 	struct vattr vattr;
2862 	mode_t newmode;
2863 	int error;
2864 
2865 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2866 	if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
2867 		goto out;
2868 
2869 #define CHANGED(x) ((int)(x) != -1)
2870 	newmode = vattr.va_mode;
2871 	if (posix_semantics) {
2872 		/*
2873 		 * POSIX/XPG semantics: if the caller is not the super-user,
2874 		 * clear set-user-id and set-group-id bits.  Both POSIX and
2875 		 * the XPG consider the behaviour for calls by the super-user
2876 		 * implementation-defined; we leave the set-user-id and set-
2877 		 * group-id settings intact in that case.
2878 		 */
2879 		if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
2880 				      NULL) != 0)
2881 			newmode &= ~(S_ISUID | S_ISGID);
2882 	} else {
2883 		/*
2884 		 * NetBSD semantics: when changing owner and/or group,
2885 		 * clear the respective bit(s).
2886 		 */
2887 		if (CHANGED(uid))
2888 			newmode &= ~S_ISUID;
2889 		if (CHANGED(gid))
2890 			newmode &= ~S_ISGID;
2891 	}
2892 	/* Update va_mode iff altered. */
2893 	if (vattr.va_mode == newmode)
2894 		newmode = VNOVAL;
2895 
2896 	VATTR_NULL(&vattr);
2897 	vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
2898 	vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
2899 	vattr.va_mode = newmode;
2900 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2901 #undef CHANGED
2902 
2903 out:
2904 	VOP_UNLOCK(vp, 0);
2905 	return (error);
2906 }
2907 
2908 /*
2909  * Set the access and modification times given a path name; this
2910  * version follows links.
2911  */
2912 /* ARGSUSED */
2913 int
2914 sys___utimes50(struct lwp *l, const struct sys___utimes50_args *uap,
2915     register_t *retval)
2916 {
2917 	/* {
2918 		syscallarg(const char *) path;
2919 		syscallarg(const struct timeval *) tptr;
2920 	} */
2921 
2922 	return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
2923 	    SCARG(uap, tptr), UIO_USERSPACE);
2924 }
2925 
2926 /*
2927  * Set the access and modification times given a file descriptor.
2928  */
2929 /* ARGSUSED */
2930 int
2931 sys___futimes50(struct lwp *l, const struct sys___futimes50_args *uap,
2932     register_t *retval)
2933 {
2934 	/* {
2935 		syscallarg(int) fd;
2936 		syscallarg(const struct timeval *) tptr;
2937 	} */
2938 	int error;
2939 	file_t *fp;
2940 
2941 	/* fd_getvnode() will use the descriptor for us */
2942 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2943 		return (error);
2944 	error = do_sys_utimes(l, fp->f_data, NULL, 0, SCARG(uap, tptr),
2945 	    UIO_USERSPACE);
2946 	fd_putfile(SCARG(uap, fd));
2947 	return (error);
2948 }
2949 
2950 /*
2951  * Set the access and modification times given a path name; this
2952  * version does not follow links.
2953  */
2954 int
2955 sys___lutimes50(struct lwp *l, const struct sys___lutimes50_args *uap,
2956     register_t *retval)
2957 {
2958 	/* {
2959 		syscallarg(const char *) path;
2960 		syscallarg(const struct timeval *) tptr;
2961 	} */
2962 
2963 	return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
2964 	    SCARG(uap, tptr), UIO_USERSPACE);
2965 }
2966 
2967 /*
2968  * Common routine to set access and modification times given a vnode.
2969  */
2970 int
2971 do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
2972     const struct timeval *tptr, enum uio_seg seg)
2973 {
2974 	struct vattr vattr;
2975 	int error, dorele = 0;
2976 	namei_simple_flags_t sflags;
2977 
2978 	bool vanull, setbirthtime;
2979 	struct timespec ts[2];
2980 
2981 	/*
2982 	 * I have checked all callers and they pass either FOLLOW,
2983 	 * NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW
2984 	 * is 0. More to the point, they don't pass anything else.
2985 	 * Let's keep it that way at least until the namei interfaces
2986 	 * are fully sanitized.
2987 	 */
2988 	KASSERT(flag == NOFOLLOW || flag == FOLLOW);
2989 	sflags = (flag == FOLLOW) ?
2990 		NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT;
2991 
2992 	if (tptr == NULL) {
2993 		vanull = true;
2994 		nanotime(&ts[0]);
2995 		ts[1] = ts[0];
2996 	} else {
2997 		struct timeval tv[2];
2998 
2999 		vanull = false;
3000 		if (seg != UIO_SYSSPACE) {
3001 			error = copyin(tptr, tv, sizeof (tv));
3002 			if (error != 0)
3003 				return error;
3004 			tptr = tv;
3005 		}
3006 		TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]);
3007 		TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]);
3008 	}
3009 
3010 	if (vp == NULL) {
3011 		/* note: SEG describes TPTR, not PATH; PATH is always user */
3012 		error = namei_simple_user(path, sflags, &vp);
3013 		if (error != 0)
3014 			return error;
3015 		dorele = 1;
3016 	}
3017 
3018 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3019 	setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 &&
3020 	    timespeccmp(&ts[1], &vattr.va_birthtime, <));
3021 	VATTR_NULL(&vattr);
3022 	vattr.va_atime = ts[0];
3023 	vattr.va_mtime = ts[1];
3024 	if (setbirthtime)
3025 		vattr.va_birthtime = ts[1];
3026 	if (vanull)
3027 		vattr.va_vaflags |= VA_UTIMES_NULL;
3028 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3029 	VOP_UNLOCK(vp, 0);
3030 
3031 	if (dorele != 0)
3032 		vrele(vp);
3033 
3034 	return error;
3035 }
3036 
3037 /*
3038  * Truncate a file given its path name.
3039  */
3040 /* ARGSUSED */
3041 int
3042 sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval)
3043 {
3044 	/* {
3045 		syscallarg(const char *) path;
3046 		syscallarg(int) pad;
3047 		syscallarg(off_t) length;
3048 	} */
3049 	struct vnode *vp;
3050 	struct vattr vattr;
3051 	int error;
3052 
3053 	error = namei_simple_user(SCARG(uap, path),
3054 				NSM_FOLLOW_TRYEMULROOT, &vp);
3055 	if (error != 0)
3056 		return (error);
3057 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3058 	if (vp->v_type == VDIR)
3059 		error = EISDIR;
3060 	else if ((error = vn_writechk(vp)) == 0 &&
3061 	    (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
3062 		VATTR_NULL(&vattr);
3063 		vattr.va_size = SCARG(uap, length);
3064 		error = VOP_SETATTR(vp, &vattr, l->l_cred);
3065 	}
3066 	vput(vp);
3067 	return (error);
3068 }
3069 
3070 /*
3071  * Truncate a file given a file descriptor.
3072  */
3073 /* ARGSUSED */
3074 int
3075 sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval)
3076 {
3077 	/* {
3078 		syscallarg(int) fd;
3079 		syscallarg(int) pad;
3080 		syscallarg(off_t) length;
3081 	} */
3082 	struct vattr vattr;
3083 	struct vnode *vp;
3084 	file_t *fp;
3085 	int error;
3086 
3087 	/* fd_getvnode() will use the descriptor for us */
3088 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3089 		return (error);
3090 	if ((fp->f_flag & FWRITE) == 0) {
3091 		error = EINVAL;
3092 		goto out;
3093 	}
3094 	vp = fp->f_data;
3095 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3096 	if (vp->v_type == VDIR)
3097 		error = EISDIR;
3098 	else if ((error = vn_writechk(vp)) == 0) {
3099 		VATTR_NULL(&vattr);
3100 		vattr.va_size = SCARG(uap, length);
3101 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
3102 	}
3103 	VOP_UNLOCK(vp, 0);
3104  out:
3105 	fd_putfile(SCARG(uap, fd));
3106 	return (error);
3107 }
3108 
3109 /*
3110  * Sync an open file.
3111  */
3112 /* ARGSUSED */
3113 int
3114 sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
3115 {
3116 	/* {
3117 		syscallarg(int) fd;
3118 	} */
3119 	struct vnode *vp;
3120 	file_t *fp;
3121 	int error;
3122 
3123 	/* fd_getvnode() will use the descriptor for us */
3124 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3125 		return (error);
3126 	vp = fp->f_data;
3127 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3128 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
3129 	VOP_UNLOCK(vp, 0);
3130 	fd_putfile(SCARG(uap, fd));
3131 	return (error);
3132 }
3133 
3134 /*
3135  * Sync a range of file data.  API modeled after that found in AIX.
3136  *
3137  * FDATASYNC indicates that we need only save enough metadata to be able
3138  * to re-read the written data.  Note we duplicate AIX's requirement that
3139  * the file be open for writing.
3140  */
3141 /* ARGSUSED */
3142 int
3143 sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval)
3144 {
3145 	/* {
3146 		syscallarg(int) fd;
3147 		syscallarg(int) flags;
3148 		syscallarg(off_t) start;
3149 		syscallarg(off_t) length;
3150 	} */
3151 	struct vnode *vp;
3152 	file_t *fp;
3153 	int flags, nflags;
3154 	off_t s, e, len;
3155 	int error;
3156 
3157 	/* fd_getvnode() will use the descriptor for us */
3158 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3159 		return (error);
3160 
3161 	if ((fp->f_flag & FWRITE) == 0) {
3162 		error = EBADF;
3163 		goto out;
3164 	}
3165 
3166 	flags = SCARG(uap, flags);
3167 	if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
3168 	    ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
3169 		error = EINVAL;
3170 		goto out;
3171 	}
3172 	/* Now set up the flags for value(s) to pass to VOP_FSYNC() */
3173 	if (flags & FDATASYNC)
3174 		nflags = FSYNC_DATAONLY | FSYNC_WAIT;
3175 	else
3176 		nflags = FSYNC_WAIT;
3177 	if (flags & FDISKSYNC)
3178 		nflags |= FSYNC_CACHE;
3179 
3180 	len = SCARG(uap, length);
3181 	/* If length == 0, we do the whole file, and s = l = 0 will do that */
3182 	if (len) {
3183 		s = SCARG(uap, start);
3184 		e = s + len;
3185 		if (e < s) {
3186 			error = EINVAL;
3187 			goto out;
3188 		}
3189 	} else {
3190 		e = 0;
3191 		s = 0;
3192 	}
3193 
3194 	vp = fp->f_data;
3195 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3196 	error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
3197 	VOP_UNLOCK(vp, 0);
3198 out:
3199 	fd_putfile(SCARG(uap, fd));
3200 	return (error);
3201 }
3202 
3203 /*
3204  * Sync the data of an open file.
3205  */
3206 /* ARGSUSED */
3207 int
3208 sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval)
3209 {
3210 	/* {
3211 		syscallarg(int) fd;
3212 	} */
3213 	struct vnode *vp;
3214 	file_t *fp;
3215 	int error;
3216 
3217 	/* fd_getvnode() will use the descriptor for us */
3218 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3219 		return (error);
3220 	if ((fp->f_flag & FWRITE) == 0) {
3221 		fd_putfile(SCARG(uap, fd));
3222 		return (EBADF);
3223 	}
3224 	vp = fp->f_data;
3225 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3226 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
3227 	VOP_UNLOCK(vp, 0);
3228 	fd_putfile(SCARG(uap, fd));
3229 	return (error);
3230 }
3231 
3232 /*
3233  * Rename files, (standard) BSD semantics frontend.
3234  */
3235 /* ARGSUSED */
3236 int
3237 sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval)
3238 {
3239 	/* {
3240 		syscallarg(const char *) from;
3241 		syscallarg(const char *) to;
3242 	} */
3243 
3244 	return (do_sys_rename(SCARG(uap, from), SCARG(uap, to), UIO_USERSPACE, 0));
3245 }
3246 
3247 /*
3248  * Rename files, POSIX semantics frontend.
3249  */
3250 /* ARGSUSED */
3251 int
3252 sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval)
3253 {
3254 	/* {
3255 		syscallarg(const char *) from;
3256 		syscallarg(const char *) to;
3257 	} */
3258 
3259 	return (do_sys_rename(SCARG(uap, from), SCARG(uap, to), UIO_USERSPACE, 1));
3260 }
3261 
3262 /*
3263  * Rename files.  Source and destination must either both be directories,
3264  * or both not be directories.  If target is a directory, it must be empty.
3265  * If `from' and `to' refer to the same object, the value of the `retain'
3266  * argument is used to determine whether `from' will be
3267  *
3268  * (retain == 0)	deleted unless `from' and `to' refer to the same
3269  *			object in the file system's name space (BSD).
3270  * (retain == 1)	always retained (POSIX).
3271  */
3272 int
3273 do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
3274 {
3275 	struct vnode *tvp, *fvp, *tdvp;
3276 	struct nameidata fromnd, tond;
3277 	struct mount *fs;
3278 	struct lwp *l = curlwp;
3279 	struct proc *p;
3280 	uint32_t saveflag;
3281 	int error;
3282 
3283 	NDINIT(&fromnd, DELETE, LOCKPARENT | SAVESTART | TRYEMULROOT,
3284 	    seg, from);
3285 	if ((error = namei(&fromnd)) != 0)
3286 		return (error);
3287 	if (fromnd.ni_dvp != fromnd.ni_vp)
3288 		VOP_UNLOCK(fromnd.ni_dvp, 0);
3289 	fvp = fromnd.ni_vp;
3290 
3291 	fs = fvp->v_mount;
3292 	error = VFS_RENAMELOCK_ENTER(fs);
3293 	if (error) {
3294 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3295 		vrele(fromnd.ni_dvp);
3296 		vrele(fvp);
3297 		goto out1;
3298 	}
3299 
3300 	/*
3301 	 * close, partially, yet another race - ideally we should only
3302 	 * go as far as getting fromnd.ni_dvp before getting the per-fs
3303 	 * lock, and then continue to get fromnd.ni_vp, but we can't do
3304 	 * that with namei as it stands.
3305 	 *
3306 	 * This still won't prevent rmdir from nuking fromnd.ni_vp
3307 	 * under us. The real fix is to get the locks in the right
3308 	 * order and do the lookups in the right places, but that's a
3309 	 * major rototill.
3310 	 *
3311 	 * Preserve the SAVESTART in cn_flags, because who knows what
3312 	 * might happen if we don't.
3313 	 *
3314 	 * Note: this logic (as well as this whole function) is cloned
3315 	 * in nfs_serv.c. Proceed accordingly.
3316 	 */
3317 	vrele(fvp);
3318 	if ((fromnd.ni_cnd.cn_namelen == 1 &&
3319 	     fromnd.ni_cnd.cn_nameptr[0] == '.') ||
3320 	    (fromnd.ni_cnd.cn_namelen == 2 &&
3321 	     fromnd.ni_cnd.cn_nameptr[0] == '.' &&
3322 	     fromnd.ni_cnd.cn_nameptr[1] == '.')) {
3323 		error = EINVAL;
3324 		VFS_RENAMELOCK_EXIT(fs);
3325 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3326 		vrele(fromnd.ni_dvp);
3327 		goto out1;
3328 	}
3329 	saveflag = fromnd.ni_cnd.cn_flags & SAVESTART;
3330 	fromnd.ni_cnd.cn_flags &= ~SAVESTART;
3331 	vn_lock(fromnd.ni_dvp, LK_EXCLUSIVE | LK_RETRY);
3332 	error = relookup(fromnd.ni_dvp, &fromnd.ni_vp, &fromnd.ni_cnd);
3333 	fromnd.ni_cnd.cn_flags |= saveflag;
3334 	if (error) {
3335 		VOP_UNLOCK(fromnd.ni_dvp, 0);
3336 		VFS_RENAMELOCK_EXIT(fs);
3337 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3338 		vrele(fromnd.ni_dvp);
3339 		goto out1;
3340 	}
3341 	VOP_UNLOCK(fromnd.ni_vp, 0);
3342 	if (fromnd.ni_dvp != fromnd.ni_vp)
3343 		VOP_UNLOCK(fromnd.ni_dvp, 0);
3344 	fvp = fromnd.ni_vp;
3345 
3346 	NDINIT(&tond, RENAME,
3347 	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | TRYEMULROOT
3348 	      | (fvp->v_type == VDIR ? CREATEDIR : 0),
3349 	    seg, to);
3350 	if ((error = namei(&tond)) != 0) {
3351 		VFS_RENAMELOCK_EXIT(fs);
3352 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3353 		vrele(fromnd.ni_dvp);
3354 		vrele(fvp);
3355 		goto out1;
3356 	}
3357 	tdvp = tond.ni_dvp;
3358 	tvp = tond.ni_vp;
3359 
3360 	if (tvp != NULL) {
3361 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
3362 			error = ENOTDIR;
3363 			goto out;
3364 		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
3365 			error = EISDIR;
3366 			goto out;
3367 		}
3368 	}
3369 
3370 	if (fvp == tdvp)
3371 		error = EINVAL;
3372 
3373 	/*
3374 	 * Source and destination refer to the same object.
3375 	 */
3376 	if (fvp == tvp) {
3377 		if (retain)
3378 			error = -1;
3379 		else if (fromnd.ni_dvp == tdvp &&
3380 		    fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen &&
3381 		    !memcmp(fromnd.ni_cnd.cn_nameptr,
3382 		          tond.ni_cnd.cn_nameptr,
3383 		          fromnd.ni_cnd.cn_namelen))
3384 		error = -1;
3385 	}
3386 
3387 #if NVERIEXEC > 0
3388 	if (!error) {
3389 		char *f1, *f2;
3390 		size_t f1_len;
3391 		size_t f2_len;
3392 
3393 		f1_len = fromnd.ni_cnd.cn_namelen + 1;
3394 		f1 = kmem_alloc(f1_len, KM_SLEEP);
3395 		strlcpy(f1, fromnd.ni_cnd.cn_nameptr, f1_len);
3396 
3397 		f2_len = tond.ni_cnd.cn_namelen + 1;
3398 		f2 = kmem_alloc(f2_len, KM_SLEEP);
3399 		strlcpy(f2, tond.ni_cnd.cn_nameptr, f2_len);
3400 
3401 		error = veriexec_renamechk(l, fvp, f1, tvp, f2);
3402 
3403 		kmem_free(f1, f1_len);
3404 		kmem_free(f2, f2_len);
3405 	}
3406 #endif /* NVERIEXEC > 0 */
3407 
3408 out:
3409 	p = l->l_proc;
3410 	if (!error) {
3411 		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
3412 				   tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
3413 		VFS_RENAMELOCK_EXIT(fs);
3414 	} else {
3415 		VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd);
3416 		if (tdvp == tvp)
3417 			vrele(tdvp);
3418 		else
3419 			vput(tdvp);
3420 		if (tvp)
3421 			vput(tvp);
3422 		VFS_RENAMELOCK_EXIT(fs);
3423 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3424 		vrele(fromnd.ni_dvp);
3425 		vrele(fvp);
3426 	}
3427 	vrele(tond.ni_startdir);
3428 	PNBUF_PUT(tond.ni_cnd.cn_pnbuf);
3429 out1:
3430 	if (fromnd.ni_startdir)
3431 		vrele(fromnd.ni_startdir);
3432 	PNBUF_PUT(fromnd.ni_cnd.cn_pnbuf);
3433 	return (error == -1 ? 0 : error);
3434 }
3435 
3436 /*
3437  * Make a directory file.
3438  */
3439 /* ARGSUSED */
3440 int
3441 sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
3442 {
3443 	/* {
3444 		syscallarg(const char *) path;
3445 		syscallarg(int) mode;
3446 	} */
3447 
3448 	return do_sys_mkdir(SCARG(uap, path), SCARG(uap, mode));
3449 }
3450 
3451 int
3452 do_sys_mkdir(const char *path, mode_t mode)
3453 {
3454 	struct proc *p = curlwp->l_proc;
3455 	struct vnode *vp;
3456 	struct vattr vattr;
3457 	int error;
3458 	struct nameidata nd;
3459 
3460 	NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT,
3461 	    UIO_USERSPACE, path);
3462 	if ((error = namei(&nd)) != 0)
3463 		return (error);
3464 	vp = nd.ni_vp;
3465 	if (vp != NULL) {
3466 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
3467 		if (nd.ni_dvp == vp)
3468 			vrele(nd.ni_dvp);
3469 		else
3470 			vput(nd.ni_dvp);
3471 		vrele(vp);
3472 		return (EEXIST);
3473 	}
3474 	VATTR_NULL(&vattr);
3475 	vattr.va_type = VDIR;
3476 	/* We will read cwdi->cwdi_cmask unlocked. */
3477 	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
3478 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3479 	if (!error)
3480 		vput(nd.ni_vp);
3481 	return (error);
3482 }
3483 
3484 /*
3485  * Remove a directory file.
3486  */
3487 /* ARGSUSED */
3488 int
3489 sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
3490 {
3491 	/* {
3492 		syscallarg(const char *) path;
3493 	} */
3494 	struct vnode *vp;
3495 	int error;
3496 	struct nameidata nd;
3497 
3498 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
3499 	    SCARG(uap, path));
3500 	if ((error = namei(&nd)) != 0)
3501 		return (error);
3502 	vp = nd.ni_vp;
3503 	if (vp->v_type != VDIR) {
3504 		error = ENOTDIR;
3505 		goto out;
3506 	}
3507 	/*
3508 	 * No rmdir "." please.
3509 	 */
3510 	if (nd.ni_dvp == vp) {
3511 		error = EINVAL;
3512 		goto out;
3513 	}
3514 	/*
3515 	 * The root of a mounted filesystem cannot be deleted.
3516 	 */
3517 	if ((vp->v_vflag & VV_ROOT) != 0 || vp->v_mountedhere != NULL) {
3518 		error = EBUSY;
3519 		goto out;
3520 	}
3521 	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3522 	return (error);
3523 
3524 out:
3525 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
3526 	if (nd.ni_dvp == vp)
3527 		vrele(nd.ni_dvp);
3528 	else
3529 		vput(nd.ni_dvp);
3530 	vput(vp);
3531 	return (error);
3532 }
3533 
3534 /*
3535  * Read a block of directory entries in a file system independent format.
3536  */
3537 int
3538 sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval)
3539 {
3540 	/* {
3541 		syscallarg(int) fd;
3542 		syscallarg(char *) buf;
3543 		syscallarg(size_t) count;
3544 	} */
3545 	file_t *fp;
3546 	int error, done;
3547 
3548 	/* fd_getvnode() will use the descriptor for us */
3549 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3550 		return (error);
3551 	if ((fp->f_flag & FREAD) == 0) {
3552 		error = EBADF;
3553 		goto out;
3554 	}
3555 	error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
3556 			SCARG(uap, count), &done, l, 0, 0);
3557 	ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
3558 	*retval = done;
3559  out:
3560 	fd_putfile(SCARG(uap, fd));
3561 	return (error);
3562 }
3563 
3564 /*
3565  * Set the mode mask for creation of filesystem nodes.
3566  */
3567 int
3568 sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
3569 {
3570 	/* {
3571 		syscallarg(mode_t) newmask;
3572 	} */
3573 	struct proc *p = l->l_proc;
3574 	struct cwdinfo *cwdi;
3575 
3576 	/*
3577 	 * cwdi->cwdi_cmask will be read unlocked elsewhere.  What's
3578 	 * important is that we serialize changes to the mask.  The
3579 	 * rw_exit() will issue a write memory barrier on our behalf,
3580 	 * and force the changes out to other CPUs (as it must use an
3581 	 * atomic operation, draining the local CPU's store buffers).
3582 	 */
3583 	cwdi = p->p_cwdi;
3584 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
3585 	*retval = cwdi->cwdi_cmask;
3586 	cwdi->cwdi_cmask = SCARG(uap, newmask) & ALLPERMS;
3587 	rw_exit(&cwdi->cwdi_lock);
3588 
3589 	return (0);
3590 }
3591 
3592 int
3593 dorevoke(struct vnode *vp, kauth_cred_t cred)
3594 {
3595 	struct vattr vattr;
3596 	int error;
3597 
3598 	if ((error = VOP_GETATTR(vp, &vattr, cred)) != 0)
3599 		return error;
3600 	if (kauth_cred_geteuid(cred) == vattr.va_uid ||
3601 	    (error = kauth_authorize_generic(cred,
3602 	    KAUTH_GENERIC_ISSUSER, NULL)) == 0)
3603 		VOP_REVOKE(vp, REVOKEALL);
3604 	return (error);
3605 }
3606 
3607 /*
3608  * Void all references to file by ripping underlying filesystem
3609  * away from vnode.
3610  */
3611 /* ARGSUSED */
3612 int
3613 sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval)
3614 {
3615 	/* {
3616 		syscallarg(const char *) path;
3617 	} */
3618 	struct vnode *vp;
3619 	int error;
3620 
3621 	error = namei_simple_user(SCARG(uap, path),
3622 				NSM_FOLLOW_TRYEMULROOT, &vp);
3623 	if (error != 0)
3624 		return (error);
3625 	error = dorevoke(vp, l->l_cred);
3626 	vrele(vp);
3627 	return (error);
3628 }
3629