xref: /netbsd-src/sys/kern/vfs_syscalls.c (revision 3816d47b2c42fcd6e549e3407f842a5b1a1d23ad)
1 /*	$NetBSD: vfs_syscalls.c,v 1.402 2010/01/08 11:35:10 pooka Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1989, 1993
34  *	The Regents of the University of California.  All rights reserved.
35  * (c) UNIX System Laboratories, Inc.
36  * All or some portions of this file are derived from material licensed
37  * to the University of California by American Telephone and Telegraph
38  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39  * the permission of UNIX System Laboratories, Inc.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.42 (Berkeley) 7/31/95
66  */
67 
68 #include <sys/cdefs.h>
69 __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.402 2010/01/08 11:35:10 pooka Exp $");
70 
71 #ifdef _KERNEL_OPT
72 #include "opt_fileassoc.h"
73 #include "veriexec.h"
74 #endif
75 
76 #include <sys/param.h>
77 #include <sys/systm.h>
78 #include <sys/namei.h>
79 #include <sys/filedesc.h>
80 #include <sys/kernel.h>
81 #include <sys/file.h>
82 #include <sys/stat.h>
83 #include <sys/vnode.h>
84 #include <sys/mount.h>
85 #include <sys/proc.h>
86 #include <sys/uio.h>
87 #include <sys/kmem.h>
88 #include <sys/dirent.h>
89 #include <sys/sysctl.h>
90 #include <sys/syscallargs.h>
91 #include <sys/vfs_syscalls.h>
92 #include <sys/ktrace.h>
93 #ifdef FILEASSOC
94 #include <sys/fileassoc.h>
95 #endif /* FILEASSOC */
96 #include <sys/verified_exec.h>
97 #include <sys/kauth.h>
98 #include <sys/atomic.h>
99 #include <sys/module.h>
100 #include <sys/buf.h>
101 
102 #include <miscfs/genfs/genfs.h>
103 #include <miscfs/syncfs/syncfs.h>
104 #include <miscfs/specfs/specdev.h>
105 
106 #include <nfs/rpcv2.h>
107 #include <nfs/nfsproto.h>
108 #include <nfs/nfs.h>
109 #include <nfs/nfs_var.h>
110 
111 MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount struct");
112 
113 static int change_flags(struct vnode *, u_long, struct lwp *);
114 static int change_mode(struct vnode *, int, struct lwp *l);
115 static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
116 
117 void checkdirs(struct vnode *);
118 
119 /*
120  * Virtual File System System Calls
121  */
122 
123 /*
124  * Mount a file system.
125  */
126 
127 /*
128  * This table is used to maintain compatibility with 4.3BSD
129  * and NetBSD 0.9 mount syscalls - and possibly other systems.
130  * Note, the order is important!
131  *
132  * Do not modify this table. It should only contain filesystems
133  * supported by NetBSD 0.9 and 4.3BSD.
134  */
135 const char * const mountcompatnames[] = {
136 	NULL,		/* 0 = MOUNT_NONE */
137 	MOUNT_FFS,	/* 1 = MOUNT_UFS */
138 	MOUNT_NFS,	/* 2 */
139 	MOUNT_MFS,	/* 3 */
140 	MOUNT_MSDOS,	/* 4 */
141 	MOUNT_CD9660,	/* 5 = MOUNT_ISOFS */
142 	MOUNT_FDESC,	/* 6 */
143 	MOUNT_KERNFS,	/* 7 */
144 	NULL,		/* 8 = MOUNT_DEVFS */
145 	MOUNT_AFS,	/* 9 */
146 };
147 const int nmountcompatnames = sizeof(mountcompatnames) /
148     sizeof(mountcompatnames[0]);
149 
150 static int
151 mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
152     void *data, size_t *data_len)
153 {
154 	struct mount *mp;
155 	int error = 0, saved_flags;
156 
157 	mp = vp->v_mount;
158 	saved_flags = mp->mnt_flag;
159 
160 	/* We can operate only on VV_ROOT nodes. */
161 	if ((vp->v_vflag & VV_ROOT) == 0) {
162 		error = EINVAL;
163 		goto out;
164 	}
165 
166 	/*
167 	 * We only allow the filesystem to be reloaded if it
168 	 * is currently mounted read-only.  Additionally, we
169 	 * prevent read-write to read-only downgrades.
170 	 */
171 	if ((flags & (MNT_RELOAD | MNT_RDONLY)) != 0 &&
172 	    (mp->mnt_flag & MNT_RDONLY) == 0) {
173 		error = EOPNOTSUPP;	/* Needs translation */
174 		goto out;
175 	}
176 
177 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
178 	    KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
179 	if (error)
180 		goto out;
181 
182 	if (vfs_busy(mp, NULL)) {
183 		error = EPERM;
184 		goto out;
185 	}
186 
187 	mutex_enter(&mp->mnt_updating);
188 
189 	mp->mnt_flag &= ~MNT_OP_FLAGS;
190 	mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
191 
192 	/*
193 	 * Set the mount level flags.
194 	 */
195 	if (flags & MNT_RDONLY)
196 		mp->mnt_flag |= MNT_RDONLY;
197 	else if (mp->mnt_flag & MNT_RDONLY)
198 		mp->mnt_iflag |= IMNT_WANTRDWR;
199 	mp->mnt_flag &=
200 	  ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
201 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
202 	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
203 	    MNT_LOG);
204 	mp->mnt_flag |= flags &
205 	   (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
206 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
207 	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
208 	    MNT_LOG | MNT_IGNORE);
209 
210 	error = VFS_MOUNT(mp, path, data, data_len);
211 
212 	if (error && data != NULL) {
213 		int error2;
214 
215 		/*
216 		 * Update failed; let's try and see if it was an
217 		 * export request.  For compat with 3.0 and earlier.
218 		 */
219 		error2 = vfs_hooks_reexport(mp, path, data);
220 
221 		/*
222 		 * Only update error code if the export request was
223 		 * understood but some problem occurred while
224 		 * processing it.
225 		 */
226 		if (error2 != EJUSTRETURN)
227 			error = error2;
228 	}
229 
230 	if (mp->mnt_iflag & IMNT_WANTRDWR)
231 		mp->mnt_flag &= ~MNT_RDONLY;
232 	if (error)
233 		mp->mnt_flag = saved_flags;
234 	mp->mnt_flag &= ~MNT_OP_FLAGS;
235 	mp->mnt_iflag &= ~IMNT_WANTRDWR;
236 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
237 		if (mp->mnt_syncer == NULL)
238 			error = vfs_allocate_syncvnode(mp);
239 	} else {
240 		if (mp->mnt_syncer != NULL)
241 			vfs_deallocate_syncvnode(mp);
242 	}
243 	mutex_exit(&mp->mnt_updating);
244 	vfs_unbusy(mp, false, NULL);
245 
246  out:
247 	return (error);
248 }
249 
250 static int
251 mount_get_vfsops(const char *fstype, struct vfsops **vfsops)
252 {
253 	char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
254 	int error;
255 
256 	/* Copy file-system type from userspace.  */
257 	error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL);
258 	if (error) {
259 		/*
260 		 * Historically, filesystem types were identified by numbers.
261 		 * If we get an integer for the filesystem type instead of a
262 		 * string, we check to see if it matches one of the historic
263 		 * filesystem types.
264 		 */
265 		u_long fsindex = (u_long)fstype;
266 		if (fsindex >= nmountcompatnames ||
267 		    mountcompatnames[fsindex] == NULL)
268 			return ENODEV;
269 		strlcpy(fstypename, mountcompatnames[fsindex],
270 		    sizeof(fstypename));
271 	}
272 
273 	/* Accept `ufs' as an alias for `ffs', for compatibility. */
274 	if (strcmp(fstypename, "ufs") == 0)
275 		fstypename[0] = 'f';
276 
277 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
278 		return 0;
279 
280 	/* If we can autoload a vfs module, try again */
281 	mutex_enter(&module_lock);
282 	(void)module_autoload(fstypename, MODULE_CLASS_VFS);
283 	mutex_exit(&module_lock);
284 
285 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
286 		return 0;
287 
288 	return ENODEV;
289 }
290 
291 static int
292 mount_domount(struct lwp *l, struct vnode **vpp, struct vfsops *vfsops,
293     const char *path, int flags, void *data, size_t *data_len, u_int recurse)
294 {
295 	struct mount *mp;
296 	struct vnode *vp = *vpp;
297 	struct vattr va;
298 	int error;
299 
300 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
301 	    KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data);
302 	if (error)
303 		return error;
304 
305 	/* Can't make a non-dir a mount-point (from here anyway). */
306 	if (vp->v_type != VDIR)
307 		return ENOTDIR;
308 
309 	/*
310 	 * If the user is not root, ensure that they own the directory
311 	 * onto which we are attempting to mount.
312 	 */
313 	if ((error = VOP_GETATTR(vp, &va, l->l_cred)) != 0 ||
314 	    (va.va_uid != kauth_cred_geteuid(l->l_cred) &&
315 	    (error = kauth_authorize_generic(l->l_cred,
316 	    KAUTH_GENERIC_ISSUSER, NULL)) != 0)) {
317 		return error;
318 	}
319 
320 	if (flags & MNT_EXPORTED)
321 		return EINVAL;
322 
323 	if ((error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0)) != 0)
324 		return error;
325 
326 	/*
327 	 * Check if a file-system is not already mounted on this vnode.
328 	 */
329 	if (vp->v_mountedhere != NULL)
330 		return EBUSY;
331 
332 	if ((mp = vfs_mountalloc(vfsops, vp)) == NULL)
333 		return ENOMEM;
334 
335 	mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred);
336 
337 	/*
338 	 * The underlying file system may refuse the mount for
339 	 * various reasons.  Allow the user to force it to happen.
340 	 *
341 	 * Set the mount level flags.
342 	 */
343 	mp->mnt_flag = flags &
344 	   (MNT_FORCE | MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
345 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
346 	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
347 	    MNT_LOG | MNT_IGNORE | MNT_RDONLY);
348 
349 	mutex_enter(&mp->mnt_updating);
350 	error = VFS_MOUNT(mp, path, data, data_len);
351 	mp->mnt_flag &= ~MNT_OP_FLAGS;
352 
353 	/*
354 	 * Put the new filesystem on the mount list after root.
355 	 */
356 	cache_purge(vp);
357 	if (error != 0) {
358 		vp->v_mountedhere = NULL;
359 		mutex_exit(&mp->mnt_updating);
360 		vfs_unbusy(mp, false, NULL);
361 		vfs_destroy(mp);
362 		return error;
363 	}
364 
365 	mp->mnt_iflag &= ~IMNT_WANTRDWR;
366 	mutex_enter(&mountlist_lock);
367 	vp->v_mountedhere = mp;
368 	CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
369 	mutex_exit(&mountlist_lock);
370     	vn_restorerecurse(vp, recurse);
371 	VOP_UNLOCK(vp, 0);
372 	checkdirs(vp);
373 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
374 		error = vfs_allocate_syncvnode(mp);
375 	/* Hold an additional reference to the mount across VFS_START(). */
376 	mutex_exit(&mp->mnt_updating);
377 	vfs_unbusy(mp, true, NULL);
378 	(void) VFS_STATVFS(mp, &mp->mnt_stat);
379 	error = VFS_START(mp, 0);
380 	if (error)
381 		vrele(vp);
382 	/* Drop reference held for VFS_START(). */
383 	vfs_destroy(mp);
384 	*vpp = NULL;
385 	return error;
386 }
387 
388 static int
389 mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
390     void *data, size_t *data_len)
391 {
392 	struct mount *mp;
393 	int error;
394 
395 	/* If MNT_GETARGS is specified, it should be the only flag. */
396 	if (flags & ~MNT_GETARGS)
397 		return EINVAL;
398 
399 	mp = vp->v_mount;
400 
401 	/* XXX: probably some notion of "can see" here if we want isolation. */
402 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
403 	    KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
404 	if (error)
405 		return error;
406 
407 	if ((vp->v_vflag & VV_ROOT) == 0)
408 		return EINVAL;
409 
410 	if (vfs_busy(mp, NULL))
411 		return EPERM;
412 
413 	mutex_enter(&mp->mnt_updating);
414 	mp->mnt_flag &= ~MNT_OP_FLAGS;
415 	mp->mnt_flag |= MNT_GETARGS;
416 	error = VFS_MOUNT(mp, path, data, data_len);
417 	mp->mnt_flag &= ~MNT_OP_FLAGS;
418 	mutex_exit(&mp->mnt_updating);
419 
420 	vfs_unbusy(mp, false, NULL);
421 	return (error);
422 }
423 
424 int
425 sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval)
426 {
427 	/* {
428 		syscallarg(const char *) type;
429 		syscallarg(const char *) path;
430 		syscallarg(int) flags;
431 		syscallarg(void *) data;
432 		syscallarg(size_t) data_len;
433 	} */
434 
435 	return do_sys_mount(l, NULL, SCARG(uap, type), SCARG(uap, path),
436 	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE,
437 	    SCARG(uap, data_len), retval);
438 }
439 
440 int
441 do_sys_mount(struct lwp *l, struct vfsops *vfsops, const char *type,
442     const char *path, int flags, void *data, enum uio_seg data_seg,
443     size_t data_len, register_t *retval)
444 {
445 	struct vnode *vp;
446 	void *data_buf = data;
447 	u_int recurse;
448 	int error;
449 
450 	/*
451 	 * Get vnode to be covered
452 	 */
453 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
454 	if (error != 0)
455 		return (error);
456 
457 	/*
458 	 * A lookup in VFS_MOUNT might result in an attempt to
459 	 * lock this vnode again, so make the lock recursive.
460 	 */
461 	if (vfsops == NULL) {
462 		if (flags & (MNT_GETARGS | MNT_UPDATE)) {
463 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
464 			recurse = vn_setrecurse(vp);
465 			vfsops = vp->v_mount->mnt_op;
466 		} else {
467 			/* 'type' is userspace */
468 			error = mount_get_vfsops(type, &vfsops);
469 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
470 			recurse = vn_setrecurse(vp);
471 			if (error != 0)
472 				goto done;
473 		}
474 	} else {
475 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
476 		recurse = vn_setrecurse(vp);
477 	}
478 
479 	if (data != NULL && data_seg == UIO_USERSPACE) {
480 		if (data_len == 0) {
481 			/* No length supplied, use default for filesystem */
482 			data_len = vfsops->vfs_min_mount_data;
483 			if (data_len > VFS_MAX_MOUNT_DATA) {
484 				error = EINVAL;
485 				goto done;
486 			}
487 			/*
488 			 * Hopefully a longer buffer won't make copyin() fail.
489 			 * For compatibility with 3.0 and earlier.
490 			 */
491 			if (flags & MNT_UPDATE
492 			    && data_len < sizeof (struct mnt_export_args30))
493 				data_len = sizeof (struct mnt_export_args30);
494 		}
495 		data_buf = kmem_alloc(data_len, KM_SLEEP);
496 
497 		/* NFS needs the buffer even for mnt_getargs .... */
498 		error = copyin(data, data_buf, data_len);
499 		if (error != 0)
500 			goto done;
501 	}
502 
503 	if (flags & MNT_GETARGS) {
504 		if (data_len == 0) {
505 			error = EINVAL;
506 			goto done;
507 		}
508 		error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
509 		if (error != 0)
510 			goto done;
511 		if (data_seg == UIO_USERSPACE)
512 			error = copyout(data_buf, data, data_len);
513 		*retval = data_len;
514 	} else if (flags & MNT_UPDATE) {
515 		error = mount_update(l, vp, path, flags, data_buf, &data_len);
516 	} else {
517 		/* Locking is handled internally in mount_domount(). */
518 		error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
519 		    &data_len, recurse);
520 	}
521 
522     done:
523     	if (vp != NULL) {
524 	    	vn_restorerecurse(vp, recurse);
525 	    	vput(vp);
526 	}
527 	if (data_buf != data)
528 		kmem_free(data_buf, data_len);
529 	return (error);
530 }
531 
532 /*
533  * Scan all active processes to see if any of them have a current
534  * or root directory onto which the new filesystem has just been
535  * mounted. If so, replace them with the new mount point.
536  */
537 void
538 checkdirs(struct vnode *olddp)
539 {
540 	struct cwdinfo *cwdi;
541 	struct vnode *newdp, *rele1, *rele2;
542 	struct proc *p;
543 	bool retry;
544 
545 	if (olddp->v_usecount == 1)
546 		return;
547 	if (VFS_ROOT(olddp->v_mountedhere, &newdp))
548 		panic("mount: lost mount");
549 
550 	do {
551 		retry = false;
552 		mutex_enter(proc_lock);
553 		PROCLIST_FOREACH(p, &allproc) {
554 			if ((p->p_flag & PK_MARKER) != 0)
555 				continue;
556 			if ((cwdi = p->p_cwdi) == NULL)
557 				continue;
558 			/*
559 			 * Can't change to the old directory any more,
560 			 * so even if we see a stale value it's not a
561 			 * problem.
562 			 */
563 			if (cwdi->cwdi_cdir != olddp &&
564 			    cwdi->cwdi_rdir != olddp)
565 			    	continue;
566 			retry = true;
567 			rele1 = NULL;
568 			rele2 = NULL;
569 			atomic_inc_uint(&cwdi->cwdi_refcnt);
570 			mutex_exit(proc_lock);
571 			rw_enter(&cwdi->cwdi_lock, RW_WRITER);
572 			if (cwdi->cwdi_cdir == olddp) {
573 				rele1 = cwdi->cwdi_cdir;
574 				vref(newdp);
575 				cwdi->cwdi_cdir = newdp;
576 			}
577 			if (cwdi->cwdi_rdir == olddp) {
578 				rele2 = cwdi->cwdi_rdir;
579 				vref(newdp);
580 				cwdi->cwdi_rdir = newdp;
581 			}
582 			rw_exit(&cwdi->cwdi_lock);
583 			cwdfree(cwdi);
584 			if (rele1 != NULL)
585 				vrele(rele1);
586 			if (rele2 != NULL)
587 				vrele(rele2);
588 			mutex_enter(proc_lock);
589 			break;
590 		}
591 		mutex_exit(proc_lock);
592 	} while (retry);
593 
594 	if (rootvnode == olddp) {
595 		vrele(rootvnode);
596 		vref(newdp);
597 		rootvnode = newdp;
598 	}
599 	vput(newdp);
600 }
601 
602 /*
603  * Unmount a file system.
604  *
605  * Note: unmount takes a path to the vnode mounted on as argument,
606  * not special file (as before).
607  */
608 /* ARGSUSED */
609 int
610 sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval)
611 {
612 	/* {
613 		syscallarg(const char *) path;
614 		syscallarg(int) flags;
615 	} */
616 	struct vnode *vp;
617 	struct mount *mp;
618 	int error;
619 	struct nameidata nd;
620 
621 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
622 	    SCARG(uap, path));
623 	if ((error = namei(&nd)) != 0)
624 		return (error);
625 	vp = nd.ni_vp;
626 	mp = vp->v_mount;
627 	atomic_inc_uint(&mp->mnt_refcnt);
628 	VOP_UNLOCK(vp, 0);
629 
630 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
631 	    KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
632 	if (error) {
633 		vrele(vp);
634 		vfs_destroy(mp);
635 		return (error);
636 	}
637 
638 	/*
639 	 * Don't allow unmounting the root file system.
640 	 */
641 	if (mp->mnt_flag & MNT_ROOTFS) {
642 		vrele(vp);
643 		vfs_destroy(mp);
644 		return (EINVAL);
645 	}
646 
647 	/*
648 	 * Must be the root of the filesystem
649 	 */
650 	if ((vp->v_vflag & VV_ROOT) == 0) {
651 		vrele(vp);
652 		vfs_destroy(mp);
653 		return (EINVAL);
654 	}
655 
656 	vrele(vp);
657 	error = dounmount(mp, SCARG(uap, flags), l);
658 	vfs_destroy(mp);
659 	return error;
660 }
661 
662 /*
663  * Do the actual file system unmount.  File system is assumed to have
664  * been locked by the caller.
665  *
666  * => Caller hold reference to the mount, explicitly for dounmount().
667  */
668 int
669 dounmount(struct mount *mp, int flags, struct lwp *l)
670 {
671 	struct vnode *coveredvp;
672 	int error;
673 	int async;
674 	int used_syncer;
675 
676 #if NVERIEXEC > 0
677 	error = veriexec_unmountchk(mp);
678 	if (error)
679 		return (error);
680 #endif /* NVERIEXEC > 0 */
681 
682 	/*
683 	 * XXX Freeze syncer.  Must do this before locking the
684 	 * mount point.  See dounmount() for details.
685 	 */
686 	mutex_enter(&syncer_mutex);
687 	rw_enter(&mp->mnt_unmounting, RW_WRITER);
688 	if ((mp->mnt_iflag & IMNT_GONE) != 0) {
689 		rw_exit(&mp->mnt_unmounting);
690 		mutex_exit(&syncer_mutex);
691 		return ENOENT;
692 	}
693 
694 	used_syncer = (mp->mnt_syncer != NULL);
695 
696 	/*
697 	 * XXX Syncer must be frozen when we get here.  This should really
698 	 * be done on a per-mountpoint basis, but the syncer doesn't work
699 	 * like that.
700 	 *
701 	 * The caller of dounmount() must acquire syncer_mutex because
702 	 * the syncer itself acquires locks in syncer_mutex -> vfs_busy
703 	 * order, and we must preserve that order to avoid deadlock.
704 	 *
705 	 * So, if the file system did not use the syncer, now is
706 	 * the time to release the syncer_mutex.
707 	 */
708 	if (used_syncer == 0)
709 		mutex_exit(&syncer_mutex);
710 
711 	mp->mnt_iflag |= IMNT_UNMOUNT;
712 	async = mp->mnt_flag & MNT_ASYNC;
713 	mp->mnt_flag &= ~MNT_ASYNC;
714 	cache_purgevfs(mp);	/* remove cache entries for this file sys */
715 	if (mp->mnt_syncer != NULL)
716 		vfs_deallocate_syncvnode(mp);
717 	error = 0;
718 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
719 		error = VFS_SYNC(mp, MNT_WAIT, l->l_cred);
720 	}
721 	vfs_scrubvnlist(mp);
722 	if (error == 0 || (flags & MNT_FORCE))
723 		error = VFS_UNMOUNT(mp, flags);
724 	if (error) {
725 		if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
726 			(void) vfs_allocate_syncvnode(mp);
727 		mp->mnt_iflag &= ~IMNT_UNMOUNT;
728 		mp->mnt_flag |= async;
729 		rw_exit(&mp->mnt_unmounting);
730 		if (used_syncer)
731 			mutex_exit(&syncer_mutex);
732 		return (error);
733 	}
734 	vfs_scrubvnlist(mp);
735 	mutex_enter(&mountlist_lock);
736 	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP)
737 		coveredvp->v_mountedhere = NULL;
738 	CIRCLEQ_REMOVE(&mountlist, mp, mnt_list);
739 	mp->mnt_iflag |= IMNT_GONE;
740 	mutex_exit(&mountlist_lock);
741 	if (TAILQ_FIRST(&mp->mnt_vnodelist) != NULL)
742 		panic("unmount: dangling vnode");
743 	if (used_syncer)
744 		mutex_exit(&syncer_mutex);
745 	vfs_hooks_unmount(mp);
746 	rw_exit(&mp->mnt_unmounting);
747 	vfs_destroy(mp);	/* reference from mount() */
748 	if (coveredvp != NULLVP)
749 		vrele(coveredvp);
750 	return (0);
751 }
752 
753 /*
754  * Sync each mounted filesystem.
755  */
756 #ifdef DEBUG
757 int syncprt = 0;
758 struct ctldebug debug0 = { "syncprt", &syncprt };
759 #endif
760 
761 /* ARGSUSED */
762 int
763 sys_sync(struct lwp *l, const void *v, register_t *retval)
764 {
765 	struct mount *mp, *nmp;
766 	int asyncflag;
767 
768 	if (l == NULL)
769 		l = &lwp0;
770 
771 	mutex_enter(&mountlist_lock);
772 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
773 	     mp = nmp) {
774 		if (vfs_busy(mp, &nmp)) {
775 			continue;
776 		}
777 		mutex_enter(&mp->mnt_updating);
778 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
779 			asyncflag = mp->mnt_flag & MNT_ASYNC;
780 			mp->mnt_flag &= ~MNT_ASYNC;
781 			VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
782 			if (asyncflag)
783 				 mp->mnt_flag |= MNT_ASYNC;
784 		}
785 		mutex_exit(&mp->mnt_updating);
786 		vfs_unbusy(mp, false, &nmp);
787 	}
788 	mutex_exit(&mountlist_lock);
789 #ifdef DEBUG
790 	if (syncprt)
791 		vfs_bufstats();
792 #endif /* DEBUG */
793 	return (0);
794 }
795 
796 /*
797  * Change filesystem quotas.
798  */
799 /* ARGSUSED */
800 int
801 sys_quotactl(struct lwp *l, const struct sys_quotactl_args *uap, register_t *retval)
802 {
803 	/* {
804 		syscallarg(const char *) path;
805 		syscallarg(int) cmd;
806 		syscallarg(int) uid;
807 		syscallarg(void *) arg;
808 	} */
809 	struct mount *mp;
810 	int error;
811 	struct vnode *vp;
812 
813 	error = namei_simple_user(SCARG(uap, path),
814 				NSM_FOLLOW_TRYEMULROOT, &vp);
815 	if (error != 0)
816 		return (error);
817 	mp = vp->v_mount;
818 	error = VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid),
819 	    SCARG(uap, arg));
820 	vrele(vp);
821 	return (error);
822 }
823 
824 int
825 dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
826     int root)
827 {
828 	struct cwdinfo *cwdi = l->l_proc->p_cwdi;
829 	int error = 0;
830 
831 	/*
832 	 * If MNT_NOWAIT or MNT_LAZY is specified, do not
833 	 * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
834 	 * overrides MNT_NOWAIT.
835 	 */
836 	if (flags == MNT_NOWAIT	|| flags == MNT_LAZY ||
837 	    (flags != MNT_WAIT && flags != 0)) {
838 		memcpy(sp, &mp->mnt_stat, sizeof(*sp));
839 		goto done;
840 	}
841 
842 	/* Get the filesystem stats now */
843 	memset(sp, 0, sizeof(*sp));
844 	if ((error = VFS_STATVFS(mp, sp)) != 0) {
845 		return error;
846 	}
847 
848 	if (cwdi->cwdi_rdir == NULL)
849 		(void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
850 done:
851 	if (cwdi->cwdi_rdir != NULL) {
852 		size_t len;
853 		char *bp;
854 		char c;
855 		char *path = PNBUF_GET();
856 
857 		bp = path + MAXPATHLEN;
858 		*--bp = '\0';
859 		rw_enter(&cwdi->cwdi_lock, RW_READER);
860 		error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
861 		    MAXPATHLEN / 2, 0, l);
862 		rw_exit(&cwdi->cwdi_lock);
863 		if (error) {
864 			PNBUF_PUT(path);
865 			return error;
866 		}
867 		len = strlen(bp);
868 		if (len != 1) {
869 			/*
870 			 * for mount points that are below our root, we can see
871 			 * them, so we fix up the pathname and return them. The
872 			 * rest we cannot see, so we don't allow viewing the
873 			 * data.
874 			 */
875 			if (strncmp(bp, sp->f_mntonname, len) == 0 &&
876 			    ((c = sp->f_mntonname[len]) == '/' || c == '\0')) {
877 				(void)strlcpy(sp->f_mntonname,
878 				    c == '\0' ? "/" : &sp->f_mntonname[len],
879 				    sizeof(sp->f_mntonname));
880 			} else {
881 				if (root)
882 					(void)strlcpy(sp->f_mntonname, "/",
883 					    sizeof(sp->f_mntonname));
884 				else
885 					error = EPERM;
886 			}
887 		}
888 		PNBUF_PUT(path);
889 	}
890 	sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
891 	return error;
892 }
893 
894 /*
895  * Get filesystem statistics by path.
896  */
897 int
898 do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
899 {
900 	struct mount *mp;
901 	int error;
902 	struct vnode *vp;
903 
904 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
905 	if (error != 0)
906 		return error;
907 	mp = vp->v_mount;
908 	error = dostatvfs(mp, sb, l, flags, 1);
909 	vrele(vp);
910 	return error;
911 }
912 
913 /* ARGSUSED */
914 int
915 sys_statvfs1(struct lwp *l, const struct sys_statvfs1_args *uap, register_t *retval)
916 {
917 	/* {
918 		syscallarg(const char *) path;
919 		syscallarg(struct statvfs *) buf;
920 		syscallarg(int) flags;
921 	} */
922 	struct statvfs *sb;
923 	int error;
924 
925 	sb = STATVFSBUF_GET();
926 	error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
927 	if (error == 0)
928 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
929 	STATVFSBUF_PUT(sb);
930 	return error;
931 }
932 
933 /*
934  * Get filesystem statistics by fd.
935  */
936 int
937 do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
938 {
939 	file_t *fp;
940 	struct mount *mp;
941 	int error;
942 
943 	/* fd_getvnode() will use the descriptor for us */
944 	if ((error = fd_getvnode(fd, &fp)) != 0)
945 		return (error);
946 	mp = ((struct vnode *)fp->f_data)->v_mount;
947 	error = dostatvfs(mp, sb, curlwp, flags, 1);
948 	fd_putfile(fd);
949 	return error;
950 }
951 
952 /* ARGSUSED */
953 int
954 sys_fstatvfs1(struct lwp *l, const struct sys_fstatvfs1_args *uap, register_t *retval)
955 {
956 	/* {
957 		syscallarg(int) fd;
958 		syscallarg(struct statvfs *) buf;
959 		syscallarg(int) flags;
960 	} */
961 	struct statvfs *sb;
962 	int error;
963 
964 	sb = STATVFSBUF_GET();
965 	error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
966 	if (error == 0)
967 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
968 	STATVFSBUF_PUT(sb);
969 	return error;
970 }
971 
972 
973 /*
974  * Get statistics on all filesystems.
975  */
976 int
977 do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
978     int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
979     register_t *retval)
980 {
981 	int root = 0;
982 	struct proc *p = l->l_proc;
983 	struct mount *mp, *nmp;
984 	struct statvfs *sb;
985 	size_t count, maxcount;
986 	int error = 0;
987 
988 	sb = STATVFSBUF_GET();
989 	maxcount = bufsize / entry_sz;
990 	mutex_enter(&mountlist_lock);
991 	count = 0;
992 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
993 	     mp = nmp) {
994 		if (vfs_busy(mp, &nmp)) {
995 			continue;
996 		}
997 		if (sfsp && count < maxcount) {
998 			error = dostatvfs(mp, sb, l, flags, 0);
999 			if (error) {
1000 				vfs_unbusy(mp, false, &nmp);
1001 				error = 0;
1002 				continue;
1003 			}
1004 			error = copyfn(sb, sfsp, entry_sz);
1005 			if (error) {
1006 				vfs_unbusy(mp, false, NULL);
1007 				goto out;
1008 			}
1009 			sfsp = (char *)sfsp + entry_sz;
1010 			root |= strcmp(sb->f_mntonname, "/") == 0;
1011 		}
1012 		count++;
1013 		vfs_unbusy(mp, false, &nmp);
1014 	}
1015 	mutex_exit(&mountlist_lock);
1016 
1017 	if (root == 0 && p->p_cwdi->cwdi_rdir) {
1018 		/*
1019 		 * fake a root entry
1020 		 */
1021 		error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
1022 		    sb, l, flags, 1);
1023 		if (error != 0)
1024 			goto out;
1025 		if (sfsp) {
1026 			error = copyfn(sb, sfsp, entry_sz);
1027 			if (error != 0)
1028 				goto out;
1029 		}
1030 		count++;
1031 	}
1032 	if (sfsp && count > maxcount)
1033 		*retval = maxcount;
1034 	else
1035 		*retval = count;
1036 out:
1037 	STATVFSBUF_PUT(sb);
1038 	return error;
1039 }
1040 
1041 int
1042 sys_getvfsstat(struct lwp *l, const struct sys_getvfsstat_args *uap, register_t *retval)
1043 {
1044 	/* {
1045 		syscallarg(struct statvfs *) buf;
1046 		syscallarg(size_t) bufsize;
1047 		syscallarg(int) flags;
1048 	} */
1049 
1050 	return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
1051 	    SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
1052 }
1053 
1054 /*
1055  * Change current working directory to a given file descriptor.
1056  */
1057 /* ARGSUSED */
1058 int
1059 sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval)
1060 {
1061 	/* {
1062 		syscallarg(int) fd;
1063 	} */
1064 	struct proc *p = l->l_proc;
1065 	struct cwdinfo *cwdi;
1066 	struct vnode *vp, *tdp;
1067 	struct mount *mp;
1068 	file_t *fp;
1069 	int error, fd;
1070 
1071 	/* fd_getvnode() will use the descriptor for us */
1072 	fd = SCARG(uap, fd);
1073 	if ((error = fd_getvnode(fd, &fp)) != 0)
1074 		return (error);
1075 	vp = fp->f_data;
1076 
1077 	vref(vp);
1078 	vn_lock(vp,  LK_EXCLUSIVE | LK_RETRY);
1079 	if (vp->v_type != VDIR)
1080 		error = ENOTDIR;
1081 	else
1082 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1083 	if (error) {
1084 		vput(vp);
1085 		goto out;
1086 	}
1087 	while ((mp = vp->v_mountedhere) != NULL) {
1088 		error = vfs_busy(mp, NULL);
1089 		vput(vp);
1090 		if (error != 0)
1091 			goto out;
1092 		error = VFS_ROOT(mp, &tdp);
1093 		vfs_unbusy(mp, false, NULL);
1094 		if (error)
1095 			goto out;
1096 		vp = tdp;
1097 	}
1098 	VOP_UNLOCK(vp, 0);
1099 
1100 	/*
1101 	 * Disallow changing to a directory not under the process's
1102 	 * current root directory (if there is one).
1103 	 */
1104 	cwdi = p->p_cwdi;
1105 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1106 	if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
1107 		vrele(vp);
1108 		error = EPERM;	/* operation not permitted */
1109 	} else {
1110 		vrele(cwdi->cwdi_cdir);
1111 		cwdi->cwdi_cdir = vp;
1112 	}
1113 	rw_exit(&cwdi->cwdi_lock);
1114 
1115  out:
1116 	fd_putfile(fd);
1117 	return (error);
1118 }
1119 
1120 /*
1121  * Change this process's notion of the root directory to a given file
1122  * descriptor.
1123  */
1124 int
1125 sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval)
1126 {
1127 	struct proc *p = l->l_proc;
1128 	struct vnode	*vp;
1129 	file_t	*fp;
1130 	int		 error, fd = SCARG(uap, fd);
1131 
1132 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1133  	    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
1134 		return error;
1135 	/* fd_getvnode() will use the descriptor for us */
1136 	if ((error = fd_getvnode(fd, &fp)) != 0)
1137 		return error;
1138 	vp = fp->f_data;
1139 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1140 	if (vp->v_type != VDIR)
1141 		error = ENOTDIR;
1142 	else
1143 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1144 	VOP_UNLOCK(vp, 0);
1145 	if (error)
1146 		goto out;
1147 	vref(vp);
1148 
1149 	change_root(p->p_cwdi, vp, l);
1150 
1151  out:
1152 	fd_putfile(fd);
1153 	return (error);
1154 }
1155 
1156 /*
1157  * Change current working directory (``.'').
1158  */
1159 /* ARGSUSED */
1160 int
1161 sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
1162 {
1163 	/* {
1164 		syscallarg(const char *) path;
1165 	} */
1166 	struct proc *p = l->l_proc;
1167 	struct cwdinfo *cwdi;
1168 	int error;
1169 	struct vnode *vp;
1170 
1171 	if ((error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE,
1172 				  &vp, l)) != 0)
1173 		return (error);
1174 	cwdi = p->p_cwdi;
1175 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1176 	vrele(cwdi->cwdi_cdir);
1177 	cwdi->cwdi_cdir = vp;
1178 	rw_exit(&cwdi->cwdi_lock);
1179 	return (0);
1180 }
1181 
1182 /*
1183  * Change notion of root (``/'') directory.
1184  */
1185 /* ARGSUSED */
1186 int
1187 sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval)
1188 {
1189 	/* {
1190 		syscallarg(const char *) path;
1191 	} */
1192 	struct proc *p = l->l_proc;
1193 	int error;
1194 	struct vnode *vp;
1195 
1196 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1197 	    KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
1198 		return (error);
1199 	if ((error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE,
1200 				  &vp, l)) != 0)
1201 		return (error);
1202 
1203 	change_root(p->p_cwdi, vp, l);
1204 
1205 	return (0);
1206 }
1207 
1208 /*
1209  * Common routine for chroot and fchroot.
1210  * NB: callers need to properly authorize the change root operation.
1211  */
1212 void
1213 change_root(struct cwdinfo *cwdi, struct vnode *vp, struct lwp *l)
1214 {
1215 
1216 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1217 	if (cwdi->cwdi_rdir != NULL)
1218 		vrele(cwdi->cwdi_rdir);
1219 	cwdi->cwdi_rdir = vp;
1220 
1221 	/*
1222 	 * Prevent escaping from chroot by putting the root under
1223 	 * the working directory.  Silently chdir to / if we aren't
1224 	 * already there.
1225 	 */
1226 	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1227 		/*
1228 		 * XXX would be more failsafe to change directory to a
1229 		 * deadfs node here instead
1230 		 */
1231 		vrele(cwdi->cwdi_cdir);
1232 		vref(vp);
1233 		cwdi->cwdi_cdir = vp;
1234 	}
1235 	rw_exit(&cwdi->cwdi_lock);
1236 }
1237 
1238 /*
1239  * Common routine for chroot and chdir.
1240  */
1241 int
1242 chdir_lookup(const char *path, int where, struct vnode **vpp, struct lwp *l)
1243 {
1244 	struct nameidata nd;
1245 	int error;
1246 
1247 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, where,
1248 	    path);
1249 	if ((error = namei(&nd)) != 0)
1250 		return (error);
1251 	*vpp = nd.ni_vp;
1252 	if ((*vpp)->v_type != VDIR)
1253 		error = ENOTDIR;
1254 	else
1255 		error = VOP_ACCESS(*vpp, VEXEC, l->l_cred);
1256 
1257 	if (error)
1258 		vput(*vpp);
1259 	else
1260 		VOP_UNLOCK(*vpp, 0);
1261 	return (error);
1262 }
1263 
1264 /*
1265  * Check permissions, allocate an open file structure,
1266  * and call the device open routine if any.
1267  */
1268 int
1269 sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1270 {
1271 	/* {
1272 		syscallarg(const char *) path;
1273 		syscallarg(int) flags;
1274 		syscallarg(int) mode;
1275 	} */
1276 	struct proc *p = l->l_proc;
1277 	struct cwdinfo *cwdi = p->p_cwdi;
1278 	file_t *fp;
1279 	struct vnode *vp;
1280 	int flags, cmode;
1281 	int type, indx, error;
1282 	struct flock lf;
1283 	struct nameidata nd;
1284 
1285 	flags = FFLAGS(SCARG(uap, flags));
1286 	if ((flags & (FREAD | FWRITE)) == 0)
1287 		return (EINVAL);
1288 	if ((error = fd_allocfile(&fp, &indx)) != 0)
1289 		return (error);
1290 	/* We're going to read cwdi->cwdi_cmask unlocked here. */
1291 	cmode = ((SCARG(uap, mode) &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1292 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
1293 	    SCARG(uap, path));
1294 	l->l_dupfd = -indx - 1;			/* XXX check for fdopen */
1295 	if ((error = vn_open(&nd, flags, cmode)) != 0) {
1296 		fd_abort(p, fp, indx);
1297 		if ((error == EDUPFD || error == EMOVEFD) &&
1298 		    l->l_dupfd >= 0 &&			/* XXX from fdopen */
1299 		    (error =
1300 			fd_dupopen(l->l_dupfd, &indx, flags, error)) == 0) {
1301 			*retval = indx;
1302 			return (0);
1303 		}
1304 		if (error == ERESTART)
1305 			error = EINTR;
1306 		return (error);
1307 	}
1308 
1309 	l->l_dupfd = 0;
1310 	vp = nd.ni_vp;
1311 	fp->f_flag = flags & FMASK;
1312 	fp->f_type = DTYPE_VNODE;
1313 	fp->f_ops = &vnops;
1314 	fp->f_data = vp;
1315 	if (flags & (O_EXLOCK | O_SHLOCK)) {
1316 		lf.l_whence = SEEK_SET;
1317 		lf.l_start = 0;
1318 		lf.l_len = 0;
1319 		if (flags & O_EXLOCK)
1320 			lf.l_type = F_WRLCK;
1321 		else
1322 			lf.l_type = F_RDLCK;
1323 		type = F_FLOCK;
1324 		if ((flags & FNONBLOCK) == 0)
1325 			type |= F_WAIT;
1326 		VOP_UNLOCK(vp, 0);
1327 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
1328 		if (error) {
1329 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
1330 			fd_abort(p, fp, indx);
1331 			return (error);
1332 		}
1333 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1334 		atomic_or_uint(&fp->f_flag, FHASLOCK);
1335 	}
1336 	VOP_UNLOCK(vp, 0);
1337 	*retval = indx;
1338 	fd_affix(p, fp, indx);
1339 	return (0);
1340 }
1341 
1342 static void
1343 vfs__fhfree(fhandle_t *fhp)
1344 {
1345 	size_t fhsize;
1346 
1347 	if (fhp == NULL) {
1348 		return;
1349 	}
1350 	fhsize = FHANDLE_SIZE(fhp);
1351 	kmem_free(fhp, fhsize);
1352 }
1353 
1354 /*
1355  * vfs_composefh: compose a filehandle.
1356  */
1357 
1358 int
1359 vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1360 {
1361 	struct mount *mp;
1362 	struct fid *fidp;
1363 	int error;
1364 	size_t needfhsize;
1365 	size_t fidsize;
1366 
1367 	mp = vp->v_mount;
1368 	fidp = NULL;
1369 	if (*fh_size < FHANDLE_SIZE_MIN) {
1370 		fidsize = 0;
1371 	} else {
1372 		fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1373 		if (fhp != NULL) {
1374 			memset(fhp, 0, *fh_size);
1375 			fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1376 			fidp = &fhp->fh_fid;
1377 		}
1378 	}
1379 	error = VFS_VPTOFH(vp, fidp, &fidsize);
1380 	needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1381 	if (error == 0 && *fh_size < needfhsize) {
1382 		error = E2BIG;
1383 	}
1384 	*fh_size = needfhsize;
1385 	return error;
1386 }
1387 
1388 int
1389 vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1390 {
1391 	struct mount *mp;
1392 	fhandle_t *fhp;
1393 	size_t fhsize;
1394 	size_t fidsize;
1395 	int error;
1396 
1397 	*fhpp = NULL;
1398 	mp = vp->v_mount;
1399 	fidsize = 0;
1400 	error = VFS_VPTOFH(vp, NULL, &fidsize);
1401 	KASSERT(error != 0);
1402 	if (error != E2BIG) {
1403 		goto out;
1404 	}
1405 	fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1406 	fhp = kmem_zalloc(fhsize, KM_SLEEP);
1407 	if (fhp == NULL) {
1408 		error = ENOMEM;
1409 		goto out;
1410 	}
1411 	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1412 	error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1413 	if (error == 0) {
1414 		KASSERT((FHANDLE_SIZE(fhp) == fhsize &&
1415 		    FHANDLE_FILEID(fhp)->fid_len == fidsize));
1416 		*fhpp = fhp;
1417 	} else {
1418 		kmem_free(fhp, fhsize);
1419 	}
1420 out:
1421 	return error;
1422 }
1423 
1424 void
1425 vfs_composefh_free(fhandle_t *fhp)
1426 {
1427 
1428 	vfs__fhfree(fhp);
1429 }
1430 
1431 /*
1432  * vfs_fhtovp: lookup a vnode by a filehandle.
1433  */
1434 
1435 int
1436 vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
1437 {
1438 	struct mount *mp;
1439 	int error;
1440 
1441 	*vpp = NULL;
1442 	mp = vfs_getvfs(FHANDLE_FSID(fhp));
1443 	if (mp == NULL) {
1444 		error = ESTALE;
1445 		goto out;
1446 	}
1447 	if (mp->mnt_op->vfs_fhtovp == NULL) {
1448 		error = EOPNOTSUPP;
1449 		goto out;
1450 	}
1451 	error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), vpp);
1452 out:
1453 	return error;
1454 }
1455 
1456 /*
1457  * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
1458  * the needed size.
1459  */
1460 
1461 int
1462 vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
1463 {
1464 	fhandle_t *fhp;
1465 	int error;
1466 
1467 	*fhpp = NULL;
1468 	if (fhsize > FHANDLE_SIZE_MAX) {
1469 		return EINVAL;
1470 	}
1471 	if (fhsize < FHANDLE_SIZE_MIN) {
1472 		return EINVAL;
1473 	}
1474 again:
1475 	fhp = kmem_alloc(fhsize, KM_SLEEP);
1476 	if (fhp == NULL) {
1477 		return ENOMEM;
1478 	}
1479 	error = copyin(ufhp, fhp, fhsize);
1480 	if (error == 0) {
1481 		/* XXX this check shouldn't be here */
1482 		if (FHANDLE_SIZE(fhp) == fhsize) {
1483 			*fhpp = fhp;
1484 			return 0;
1485 		} else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
1486 			/*
1487 			 * a kludge for nfsv2 padded handles.
1488 			 */
1489 			size_t sz;
1490 
1491 			sz = FHANDLE_SIZE(fhp);
1492 			kmem_free(fhp, fhsize);
1493 			fhsize = sz;
1494 			goto again;
1495 		} else {
1496 			/*
1497 			 * userland told us wrong size.
1498 			 */
1499 		    	error = EINVAL;
1500 		}
1501 	}
1502 	kmem_free(fhp, fhsize);
1503 	return error;
1504 }
1505 
1506 void
1507 vfs_copyinfh_free(fhandle_t *fhp)
1508 {
1509 
1510 	vfs__fhfree(fhp);
1511 }
1512 
1513 /*
1514  * Get file handle system call
1515  */
1516 int
1517 sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval)
1518 {
1519 	/* {
1520 		syscallarg(char *) fname;
1521 		syscallarg(fhandle_t *) fhp;
1522 		syscallarg(size_t *) fh_size;
1523 	} */
1524 	struct vnode *vp;
1525 	fhandle_t *fh;
1526 	int error;
1527 	struct nameidata nd;
1528 	size_t sz;
1529 	size_t usz;
1530 
1531 	/*
1532 	 * Must be super user
1533 	 */
1534 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1535 	    0, NULL, NULL, NULL);
1536 	if (error)
1537 		return (error);
1538 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
1539 	    SCARG(uap, fname));
1540 	error = namei(&nd);
1541 	if (error)
1542 		return (error);
1543 	vp = nd.ni_vp;
1544 	error = vfs_composefh_alloc(vp, &fh);
1545 	vput(vp);
1546 	if (error != 0) {
1547 		goto out;
1548 	}
1549 	error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
1550 	if (error != 0) {
1551 		goto out;
1552 	}
1553 	sz = FHANDLE_SIZE(fh);
1554 	error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
1555 	if (error != 0) {
1556 		goto out;
1557 	}
1558 	if (usz >= sz) {
1559 		error = copyout(fh, SCARG(uap, fhp), sz);
1560 	} else {
1561 		error = E2BIG;
1562 	}
1563 out:
1564 	vfs_composefh_free(fh);
1565 	return (error);
1566 }
1567 
1568 /*
1569  * Open a file given a file handle.
1570  *
1571  * Check permissions, allocate an open file structure,
1572  * and call the device open routine if any.
1573  */
1574 
1575 int
1576 dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
1577     register_t *retval)
1578 {
1579 	file_t *fp;
1580 	struct vnode *vp = NULL;
1581 	kauth_cred_t cred = l->l_cred;
1582 	file_t *nfp;
1583 	int type, indx, error=0;
1584 	struct flock lf;
1585 	struct vattr va;
1586 	fhandle_t *fh;
1587 	int flags;
1588 	proc_t *p;
1589 
1590 	p = curproc;
1591 
1592 	/*
1593 	 * Must be super user
1594 	 */
1595 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1596 	    0, NULL, NULL, NULL)))
1597 		return (error);
1598 
1599 	flags = FFLAGS(oflags);
1600 	if ((flags & (FREAD | FWRITE)) == 0)
1601 		return (EINVAL);
1602 	if ((flags & O_CREAT))
1603 		return (EINVAL);
1604 	if ((error = fd_allocfile(&nfp, &indx)) != 0)
1605 		return (error);
1606 	fp = nfp;
1607 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1608 	if (error != 0) {
1609 		goto bad;
1610 	}
1611 	error = vfs_fhtovp(fh, &vp);
1612 	if (error != 0) {
1613 		goto bad;
1614 	}
1615 
1616 	/* Now do an effective vn_open */
1617 
1618 	if (vp->v_type == VSOCK) {
1619 		error = EOPNOTSUPP;
1620 		goto bad;
1621 	}
1622 	error = vn_openchk(vp, cred, flags);
1623 	if (error != 0)
1624 		goto bad;
1625 	if (flags & O_TRUNC) {
1626 		VOP_UNLOCK(vp, 0);			/* XXX */
1627 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
1628 		vattr_null(&va);
1629 		va.va_size = 0;
1630 		error = VOP_SETATTR(vp, &va, cred);
1631 		if (error)
1632 			goto bad;
1633 	}
1634 	if ((error = VOP_OPEN(vp, flags, cred)) != 0)
1635 		goto bad;
1636 	if (flags & FWRITE) {
1637 		mutex_enter(&vp->v_interlock);
1638 		vp->v_writecount++;
1639 		mutex_exit(&vp->v_interlock);
1640 	}
1641 
1642 	/* done with modified vn_open, now finish what sys_open does. */
1643 
1644 	fp->f_flag = flags & FMASK;
1645 	fp->f_type = DTYPE_VNODE;
1646 	fp->f_ops = &vnops;
1647 	fp->f_data = vp;
1648 	if (flags & (O_EXLOCK | O_SHLOCK)) {
1649 		lf.l_whence = SEEK_SET;
1650 		lf.l_start = 0;
1651 		lf.l_len = 0;
1652 		if (flags & O_EXLOCK)
1653 			lf.l_type = F_WRLCK;
1654 		else
1655 			lf.l_type = F_RDLCK;
1656 		type = F_FLOCK;
1657 		if ((flags & FNONBLOCK) == 0)
1658 			type |= F_WAIT;
1659 		VOP_UNLOCK(vp, 0);
1660 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
1661 		if (error) {
1662 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
1663 			fd_abort(p, fp, indx);
1664 			return (error);
1665 		}
1666 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1667 		atomic_or_uint(&fp->f_flag, FHASLOCK);
1668 	}
1669 	VOP_UNLOCK(vp, 0);
1670 	*retval = indx;
1671 	fd_affix(p, fp, indx);
1672 	vfs_copyinfh_free(fh);
1673 	return (0);
1674 
1675 bad:
1676 	fd_abort(p, fp, indx);
1677 	if (vp != NULL)
1678 		vput(vp);
1679 	vfs_copyinfh_free(fh);
1680 	return (error);
1681 }
1682 
1683 int
1684 sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval)
1685 {
1686 	/* {
1687 		syscallarg(const void *) fhp;
1688 		syscallarg(size_t) fh_size;
1689 		syscallarg(int) flags;
1690 	} */
1691 
1692 	return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
1693 	    SCARG(uap, flags), retval);
1694 }
1695 
1696 int
1697 do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
1698 {
1699 	int error;
1700 	fhandle_t *fh;
1701 	struct vnode *vp;
1702 
1703 	/*
1704 	 * Must be super user
1705 	 */
1706 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1707 	    0, NULL, NULL, NULL)))
1708 		return (error);
1709 
1710 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1711 	if (error != 0)
1712 		return error;
1713 
1714 	error = vfs_fhtovp(fh, &vp);
1715 	vfs_copyinfh_free(fh);
1716 	if (error != 0)
1717 		return error;
1718 
1719 	error = vn_stat(vp, sb);
1720 	vput(vp);
1721 	return error;
1722 }
1723 
1724 
1725 /* ARGSUSED */
1726 int
1727 sys___fhstat50(struct lwp *l, const struct sys___fhstat50_args *uap, register_t *retval)
1728 {
1729 	/* {
1730 		syscallarg(const void *) fhp;
1731 		syscallarg(size_t) fh_size;
1732 		syscallarg(struct stat *) sb;
1733 	} */
1734 	struct stat sb;
1735 	int error;
1736 
1737 	error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
1738 	if (error)
1739 		return error;
1740 	return copyout(&sb, SCARG(uap, sb), sizeof(sb));
1741 }
1742 
1743 int
1744 do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb,
1745     int flags)
1746 {
1747 	fhandle_t *fh;
1748 	struct mount *mp;
1749 	struct vnode *vp;
1750 	int error;
1751 
1752 	/*
1753 	 * Must be super user
1754 	 */
1755 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1756 	    0, NULL, NULL, NULL)))
1757 		return error;
1758 
1759 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1760 	if (error != 0)
1761 		return error;
1762 
1763 	error = vfs_fhtovp(fh, &vp);
1764 	vfs_copyinfh_free(fh);
1765 	if (error != 0)
1766 		return error;
1767 
1768 	mp = vp->v_mount;
1769 	error = dostatvfs(mp, sb, l, flags, 1);
1770 	vput(vp);
1771 	return error;
1772 }
1773 
1774 /* ARGSUSED */
1775 int
1776 sys___fhstatvfs140(struct lwp *l, const struct sys___fhstatvfs140_args *uap, register_t *retval)
1777 {
1778 	/* {
1779 		syscallarg(const void *) fhp;
1780 		syscallarg(size_t) fh_size;
1781 		syscallarg(struct statvfs *) buf;
1782 		syscallarg(int)	flags;
1783 	} */
1784 	struct statvfs *sb = STATVFSBUF_GET();
1785 	int error;
1786 
1787 	error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
1788 	    SCARG(uap, flags));
1789 	if (error == 0)
1790 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1791 	STATVFSBUF_PUT(sb);
1792 	return error;
1793 }
1794 
1795 /*
1796  * Create a special file.
1797  */
1798 /* ARGSUSED */
1799 int
1800 sys___mknod50(struct lwp *l, const struct sys___mknod50_args *uap,
1801     register_t *retval)
1802 {
1803 	/* {
1804 		syscallarg(const char *) path;
1805 		syscallarg(mode_t) mode;
1806 		syscallarg(dev_t) dev;
1807 	} */
1808 	return do_sys_mknod(l, SCARG(uap, path), SCARG(uap, mode),
1809 	    SCARG(uap, dev), retval, UIO_USERSPACE);
1810 }
1811 
1812 int
1813 do_sys_mknod(struct lwp *l, const char *pathname, mode_t mode, dev_t dev,
1814     register_t *retval, enum uio_seg seg)
1815 {
1816 	struct proc *p = l->l_proc;
1817 	struct vnode *vp;
1818 	struct vattr vattr;
1819 	int error, optype;
1820 	struct nameidata nd;
1821 	char *path;
1822 	const char *cpath;
1823 
1824 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
1825 	    0, NULL, NULL, NULL)) != 0)
1826 		return (error);
1827 
1828 	optype = VOP_MKNOD_DESCOFFSET;
1829 
1830 	VERIEXEC_PATH_GET(pathname, seg, cpath, path);
1831 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, seg, cpath);
1832 
1833 	if ((error = namei(&nd)) != 0)
1834 		goto out;
1835 	vp = nd.ni_vp;
1836 	if (vp != NULL)
1837 		error = EEXIST;
1838 	else {
1839 		vattr_null(&vattr);
1840 		/* We will read cwdi->cwdi_cmask unlocked. */
1841 		vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
1842 		vattr.va_rdev = dev;
1843 
1844 		switch (mode & S_IFMT) {
1845 		case S_IFMT:	/* used by badsect to flag bad sectors */
1846 			vattr.va_type = VBAD;
1847 			break;
1848 		case S_IFCHR:
1849 			vattr.va_type = VCHR;
1850 			break;
1851 		case S_IFBLK:
1852 			vattr.va_type = VBLK;
1853 			break;
1854 		case S_IFWHT:
1855 			optype = VOP_WHITEOUT_DESCOFFSET;
1856 			break;
1857 		case S_IFREG:
1858 #if NVERIEXEC > 0
1859 			error = veriexec_openchk(l, nd.ni_vp, nd.ni_dirp,
1860 			    O_CREAT);
1861 #endif /* NVERIEXEC > 0 */
1862 			vattr.va_type = VREG;
1863 			vattr.va_rdev = VNOVAL;
1864 			optype = VOP_CREATE_DESCOFFSET;
1865 			break;
1866 		default:
1867 			error = EINVAL;
1868 			break;
1869 		}
1870 	}
1871 	if (!error) {
1872 		switch (optype) {
1873 		case VOP_WHITEOUT_DESCOFFSET:
1874 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1875 			if (error)
1876 				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1877 			vput(nd.ni_dvp);
1878 			break;
1879 
1880 		case VOP_MKNOD_DESCOFFSET:
1881 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1882 						&nd.ni_cnd, &vattr);
1883 			if (error == 0)
1884 				vput(nd.ni_vp);
1885 			break;
1886 
1887 		case VOP_CREATE_DESCOFFSET:
1888 			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
1889 						&nd.ni_cnd, &vattr);
1890 			if (error == 0)
1891 				vput(nd.ni_vp);
1892 			break;
1893 		}
1894 	} else {
1895 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1896 		if (nd.ni_dvp == vp)
1897 			vrele(nd.ni_dvp);
1898 		else
1899 			vput(nd.ni_dvp);
1900 		if (vp)
1901 			vrele(vp);
1902 	}
1903 out:
1904 	VERIEXEC_PATH_PUT(path);
1905 	return (error);
1906 }
1907 
1908 /*
1909  * Create a named pipe.
1910  */
1911 /* ARGSUSED */
1912 int
1913 sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval)
1914 {
1915 	/* {
1916 		syscallarg(const char *) path;
1917 		syscallarg(int) mode;
1918 	} */
1919 	struct proc *p = l->l_proc;
1920 	struct vattr vattr;
1921 	int error;
1922 	struct nameidata nd;
1923 
1924 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, UIO_USERSPACE,
1925 	    SCARG(uap, path));
1926 	if ((error = namei(&nd)) != 0)
1927 		return (error);
1928 	if (nd.ni_vp != NULL) {
1929 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1930 		if (nd.ni_dvp == nd.ni_vp)
1931 			vrele(nd.ni_dvp);
1932 		else
1933 			vput(nd.ni_dvp);
1934 		vrele(nd.ni_vp);
1935 		return (EEXIST);
1936 	}
1937 	vattr_null(&vattr);
1938 	vattr.va_type = VFIFO;
1939 	/* We will read cwdi->cwdi_cmask unlocked. */
1940 	vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
1941 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1942 	if (error == 0)
1943 		vput(nd.ni_vp);
1944 	return (error);
1945 }
1946 
1947 /*
1948  * Make a hard file link.
1949  */
1950 /* ARGSUSED */
1951 int
1952 sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
1953 {
1954 	/* {
1955 		syscallarg(const char *) path;
1956 		syscallarg(const char *) link;
1957 	} */
1958 	struct vnode *vp;
1959 	struct nameidata nd;
1960 	int error;
1961 
1962 	error = namei_simple_user(SCARG(uap, path),
1963 				NSM_FOLLOW_TRYEMULROOT, &vp);
1964 	if (error != 0)
1965 		return (error);
1966 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, UIO_USERSPACE,
1967 	    SCARG(uap, link));
1968 	if ((error = namei(&nd)) != 0)
1969 		goto out;
1970 	if (nd.ni_vp) {
1971 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1972 		if (nd.ni_dvp == nd.ni_vp)
1973 			vrele(nd.ni_dvp);
1974 		else
1975 			vput(nd.ni_dvp);
1976 		vrele(nd.ni_vp);
1977 		error = EEXIST;
1978 		goto out;
1979 	}
1980 	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1981 out:
1982 	vrele(vp);
1983 	return (error);
1984 }
1985 
1986 /*
1987  * Make a symbolic link.
1988  */
1989 /* ARGSUSED */
1990 int
1991 sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
1992 {
1993 	/* {
1994 		syscallarg(const char *) path;
1995 		syscallarg(const char *) link;
1996 	} */
1997 	struct proc *p = l->l_proc;
1998 	struct vattr vattr;
1999 	char *path;
2000 	int error;
2001 	struct nameidata nd;
2002 
2003 	path = PNBUF_GET();
2004 	error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL);
2005 	if (error)
2006 		goto out;
2007 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, UIO_USERSPACE,
2008 	    SCARG(uap, link));
2009 	if ((error = namei(&nd)) != 0)
2010 		goto out;
2011 	if (nd.ni_vp) {
2012 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2013 		if (nd.ni_dvp == nd.ni_vp)
2014 			vrele(nd.ni_dvp);
2015 		else
2016 			vput(nd.ni_dvp);
2017 		vrele(nd.ni_vp);
2018 		error = EEXIST;
2019 		goto out;
2020 	}
2021 	vattr_null(&vattr);
2022 	vattr.va_type = VLNK;
2023 	/* We will read cwdi->cwdi_cmask unlocked. */
2024 	vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
2025 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
2026 	if (error == 0)
2027 		vput(nd.ni_vp);
2028 out:
2029 	PNBUF_PUT(path);
2030 	return (error);
2031 }
2032 
2033 /*
2034  * Delete a whiteout from the filesystem.
2035  */
2036 /* ARGSUSED */
2037 int
2038 sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval)
2039 {
2040 	/* {
2041 		syscallarg(const char *) path;
2042 	} */
2043 	int error;
2044 	struct nameidata nd;
2045 
2046 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT,
2047 	    UIO_USERSPACE, SCARG(uap, path));
2048 	error = namei(&nd);
2049 	if (error)
2050 		return (error);
2051 
2052 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2053 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2054 		if (nd.ni_dvp == nd.ni_vp)
2055 			vrele(nd.ni_dvp);
2056 		else
2057 			vput(nd.ni_dvp);
2058 		if (nd.ni_vp)
2059 			vrele(nd.ni_vp);
2060 		return (EEXIST);
2061 	}
2062 	if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
2063 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2064 	vput(nd.ni_dvp);
2065 	return (error);
2066 }
2067 
2068 /*
2069  * Delete a name from the filesystem.
2070  */
2071 /* ARGSUSED */
2072 int
2073 sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval)
2074 {
2075 	/* {
2076 		syscallarg(const char *) path;
2077 	} */
2078 
2079 	return do_sys_unlink(SCARG(uap, path), UIO_USERSPACE);
2080 }
2081 
2082 int
2083 do_sys_unlink(const char *arg, enum uio_seg seg)
2084 {
2085 	struct vnode *vp;
2086 	int error;
2087 	struct nameidata nd;
2088 	char *path;
2089 	const char *cpath;
2090 
2091 	VERIEXEC_PATH_GET(arg, seg, cpath, path);
2092 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, seg, cpath);
2093 
2094 	if ((error = namei(&nd)) != 0)
2095 		goto out;
2096 	vp = nd.ni_vp;
2097 
2098 	/*
2099 	 * The root of a mounted filesystem cannot be deleted.
2100 	 */
2101 	if (vp->v_vflag & VV_ROOT) {
2102 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2103 		if (nd.ni_dvp == vp)
2104 			vrele(nd.ni_dvp);
2105 		else
2106 			vput(nd.ni_dvp);
2107 		vput(vp);
2108 		error = EBUSY;
2109 		goto out;
2110 	}
2111 
2112 #if NVERIEXEC > 0
2113 	/* Handle remove requests for veriexec entries. */
2114 	if ((error = veriexec_removechk(curlwp, nd.ni_vp, nd.ni_dirp)) != 0) {
2115 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2116 		if (nd.ni_dvp == vp)
2117 			vrele(nd.ni_dvp);
2118 		else
2119 			vput(nd.ni_dvp);
2120 		vput(vp);
2121 		goto out;
2122 	}
2123 #endif /* NVERIEXEC > 0 */
2124 
2125 #ifdef FILEASSOC
2126 	(void)fileassoc_file_delete(vp);
2127 #endif /* FILEASSOC */
2128 	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2129 out:
2130 	VERIEXEC_PATH_PUT(path);
2131 	return (error);
2132 }
2133 
2134 /*
2135  * Reposition read/write file offset.
2136  */
2137 int
2138 sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
2139 {
2140 	/* {
2141 		syscallarg(int) fd;
2142 		syscallarg(int) pad;
2143 		syscallarg(off_t) offset;
2144 		syscallarg(int) whence;
2145 	} */
2146 	kauth_cred_t cred = l->l_cred;
2147 	file_t *fp;
2148 	struct vnode *vp;
2149 	struct vattr vattr;
2150 	off_t newoff;
2151 	int error, fd;
2152 
2153 	fd = SCARG(uap, fd);
2154 
2155 	if ((fp = fd_getfile(fd)) == NULL)
2156 		return (EBADF);
2157 
2158 	vp = fp->f_data;
2159 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2160 		error = ESPIPE;
2161 		goto out;
2162 	}
2163 
2164 	switch (SCARG(uap, whence)) {
2165 	case SEEK_CUR:
2166 		newoff = fp->f_offset + SCARG(uap, offset);
2167 		break;
2168 	case SEEK_END:
2169 		error = VOP_GETATTR(vp, &vattr, cred);
2170 		if (error) {
2171 			goto out;
2172 		}
2173 		newoff = SCARG(uap, offset) + vattr.va_size;
2174 		break;
2175 	case SEEK_SET:
2176 		newoff = SCARG(uap, offset);
2177 		break;
2178 	default:
2179 		error = EINVAL;
2180 		goto out;
2181 	}
2182 	if ((error = VOP_SEEK(vp, fp->f_offset, newoff, cred)) == 0) {
2183 		*(off_t *)retval = fp->f_offset = newoff;
2184 	}
2185  out:
2186  	fd_putfile(fd);
2187 	return (error);
2188 }
2189 
2190 /*
2191  * Positional read system call.
2192  */
2193 int
2194 sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
2195 {
2196 	/* {
2197 		syscallarg(int) fd;
2198 		syscallarg(void *) buf;
2199 		syscallarg(size_t) nbyte;
2200 		syscallarg(off_t) offset;
2201 	} */
2202 	file_t *fp;
2203 	struct vnode *vp;
2204 	off_t offset;
2205 	int error, fd = SCARG(uap, fd);
2206 
2207 	if ((fp = fd_getfile(fd)) == NULL)
2208 		return (EBADF);
2209 
2210 	if ((fp->f_flag & FREAD) == 0) {
2211 		fd_putfile(fd);
2212 		return (EBADF);
2213 	}
2214 
2215 	vp = fp->f_data;
2216 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2217 		error = ESPIPE;
2218 		goto out;
2219 	}
2220 
2221 	offset = SCARG(uap, offset);
2222 
2223 	/*
2224 	 * XXX This works because no file systems actually
2225 	 * XXX take any action on the seek operation.
2226 	 */
2227 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2228 		goto out;
2229 
2230 	/* dofileread() will unuse the descriptor for us */
2231 	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2232 	    &offset, 0, retval));
2233 
2234  out:
2235 	fd_putfile(fd);
2236 	return (error);
2237 }
2238 
2239 /*
2240  * Positional scatter read system call.
2241  */
2242 int
2243 sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval)
2244 {
2245 	/* {
2246 		syscallarg(int) fd;
2247 		syscallarg(const struct iovec *) iovp;
2248 		syscallarg(int) iovcnt;
2249 		syscallarg(off_t) offset;
2250 	} */
2251 	off_t offset = SCARG(uap, offset);
2252 
2253 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
2254 	    SCARG(uap, iovcnt), &offset, 0, retval);
2255 }
2256 
2257 /*
2258  * Positional write system call.
2259  */
2260 int
2261 sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval)
2262 {
2263 	/* {
2264 		syscallarg(int) fd;
2265 		syscallarg(const void *) buf;
2266 		syscallarg(size_t) nbyte;
2267 		syscallarg(off_t) offset;
2268 	} */
2269 	file_t *fp;
2270 	struct vnode *vp;
2271 	off_t offset;
2272 	int error, fd = SCARG(uap, fd);
2273 
2274 	if ((fp = fd_getfile(fd)) == NULL)
2275 		return (EBADF);
2276 
2277 	if ((fp->f_flag & FWRITE) == 0) {
2278 		fd_putfile(fd);
2279 		return (EBADF);
2280 	}
2281 
2282 	vp = fp->f_data;
2283 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2284 		error = ESPIPE;
2285 		goto out;
2286 	}
2287 
2288 	offset = SCARG(uap, offset);
2289 
2290 	/*
2291 	 * XXX This works because no file systems actually
2292 	 * XXX take any action on the seek operation.
2293 	 */
2294 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2295 		goto out;
2296 
2297 	/* dofilewrite() will unuse the descriptor for us */
2298 	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2299 	    &offset, 0, retval));
2300 
2301  out:
2302 	fd_putfile(fd);
2303 	return (error);
2304 }
2305 
2306 /*
2307  * Positional gather write system call.
2308  */
2309 int
2310 sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval)
2311 {
2312 	/* {
2313 		syscallarg(int) fd;
2314 		syscallarg(const struct iovec *) iovp;
2315 		syscallarg(int) iovcnt;
2316 		syscallarg(off_t) offset;
2317 	} */
2318 	off_t offset = SCARG(uap, offset);
2319 
2320 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
2321 	    SCARG(uap, iovcnt), &offset, 0, retval);
2322 }
2323 
2324 /*
2325  * Check access permissions.
2326  */
2327 int
2328 sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval)
2329 {
2330 	/* {
2331 		syscallarg(const char *) path;
2332 		syscallarg(int) flags;
2333 	} */
2334 	kauth_cred_t cred;
2335 	struct vnode *vp;
2336 	int error, flags;
2337 	struct nameidata nd;
2338 
2339 	cred = kauth_cred_dup(l->l_cred);
2340 	kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
2341 	kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
2342 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
2343 	    SCARG(uap, path));
2344 	/* Override default credentials */
2345 	nd.ni_cnd.cn_cred = cred;
2346 	if ((error = namei(&nd)) != 0)
2347 		goto out;
2348 	vp = nd.ni_vp;
2349 
2350 	/* Flags == 0 means only check for existence. */
2351 	if (SCARG(uap, flags)) {
2352 		flags = 0;
2353 		if (SCARG(uap, flags) & R_OK)
2354 			flags |= VREAD;
2355 		if (SCARG(uap, flags) & W_OK)
2356 			flags |= VWRITE;
2357 		if (SCARG(uap, flags) & X_OK)
2358 			flags |= VEXEC;
2359 
2360 		error = VOP_ACCESS(vp, flags, cred);
2361 		if (!error && (flags & VWRITE))
2362 			error = vn_writechk(vp);
2363 	}
2364 	vput(vp);
2365 out:
2366 	kauth_cred_free(cred);
2367 	return (error);
2368 }
2369 
2370 /*
2371  * Common code for all sys_stat functions, including compat versions.
2372  */
2373 int
2374 do_sys_stat(const char *path, unsigned int nd_flags, struct stat *sb)
2375 {
2376 	int error;
2377 	struct nameidata nd;
2378 
2379 	NDINIT(&nd, LOOKUP, nd_flags | LOCKLEAF | TRYEMULROOT,
2380 	    UIO_USERSPACE, path);
2381 	error = namei(&nd);
2382 	if (error != 0)
2383 		return error;
2384 	error = vn_stat(nd.ni_vp, sb);
2385 	vput(nd.ni_vp);
2386 	return error;
2387 }
2388 
2389 /*
2390  * Get file status; this version follows links.
2391  */
2392 /* ARGSUSED */
2393 int
2394 sys___stat50(struct lwp *l, const struct sys___stat50_args *uap, register_t *retval)
2395 {
2396 	/* {
2397 		syscallarg(const char *) path;
2398 		syscallarg(struct stat *) ub;
2399 	} */
2400 	struct stat sb;
2401 	int error;
2402 
2403 	error = do_sys_stat(SCARG(uap, path), FOLLOW, &sb);
2404 	if (error)
2405 		return error;
2406 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
2407 }
2408 
2409 /*
2410  * Get file status; this version does not follow links.
2411  */
2412 /* ARGSUSED */
2413 int
2414 sys___lstat50(struct lwp *l, const struct sys___lstat50_args *uap, register_t *retval)
2415 {
2416 	/* {
2417 		syscallarg(const char *) path;
2418 		syscallarg(struct stat *) ub;
2419 	} */
2420 	struct stat sb;
2421 	int error;
2422 
2423 	error = do_sys_stat(SCARG(uap, path), NOFOLLOW, &sb);
2424 	if (error)
2425 		return error;
2426 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
2427 }
2428 
2429 /*
2430  * Get configurable pathname variables.
2431  */
2432 /* ARGSUSED */
2433 int
2434 sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap, register_t *retval)
2435 {
2436 	/* {
2437 		syscallarg(const char *) path;
2438 		syscallarg(int) name;
2439 	} */
2440 	int error;
2441 	struct nameidata nd;
2442 
2443 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
2444 	    SCARG(uap, path));
2445 	if ((error = namei(&nd)) != 0)
2446 		return (error);
2447 	error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), retval);
2448 	vput(nd.ni_vp);
2449 	return (error);
2450 }
2451 
2452 /*
2453  * Return target name of a symbolic link.
2454  */
2455 /* ARGSUSED */
2456 int
2457 sys_readlink(struct lwp *l, const struct sys_readlink_args *uap, register_t *retval)
2458 {
2459 	/* {
2460 		syscallarg(const char *) path;
2461 		syscallarg(char *) buf;
2462 		syscallarg(size_t) count;
2463 	} */
2464 	struct vnode *vp;
2465 	struct iovec aiov;
2466 	struct uio auio;
2467 	int error;
2468 	struct nameidata nd;
2469 
2470 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
2471 	    SCARG(uap, path));
2472 	if ((error = namei(&nd)) != 0)
2473 		return (error);
2474 	vp = nd.ni_vp;
2475 	if (vp->v_type != VLNK)
2476 		error = EINVAL;
2477 	else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
2478 	    (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
2479 		aiov.iov_base = SCARG(uap, buf);
2480 		aiov.iov_len = SCARG(uap, count);
2481 		auio.uio_iov = &aiov;
2482 		auio.uio_iovcnt = 1;
2483 		auio.uio_offset = 0;
2484 		auio.uio_rw = UIO_READ;
2485 		KASSERT(l == curlwp);
2486 		auio.uio_vmspace = l->l_proc->p_vmspace;
2487 		auio.uio_resid = SCARG(uap, count);
2488 		error = VOP_READLINK(vp, &auio, l->l_cred);
2489 	}
2490 	vput(vp);
2491 	*retval = SCARG(uap, count) - auio.uio_resid;
2492 	return (error);
2493 }
2494 
2495 /*
2496  * Change flags of a file given a path name.
2497  */
2498 /* ARGSUSED */
2499 int
2500 sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval)
2501 {
2502 	/* {
2503 		syscallarg(const char *) path;
2504 		syscallarg(u_long) flags;
2505 	} */
2506 	struct vnode *vp;
2507 	int error;
2508 
2509 	error = namei_simple_user(SCARG(uap, path),
2510 				NSM_FOLLOW_TRYEMULROOT, &vp);
2511 	if (error != 0)
2512 		return (error);
2513 	error = change_flags(vp, SCARG(uap, flags), l);
2514 	vput(vp);
2515 	return (error);
2516 }
2517 
2518 /*
2519  * Change flags of a file given a file descriptor.
2520  */
2521 /* ARGSUSED */
2522 int
2523 sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval)
2524 {
2525 	/* {
2526 		syscallarg(int) fd;
2527 		syscallarg(u_long) flags;
2528 	} */
2529 	struct vnode *vp;
2530 	file_t *fp;
2531 	int error;
2532 
2533 	/* fd_getvnode() will use the descriptor for us */
2534 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2535 		return (error);
2536 	vp = fp->f_data;
2537 	error = change_flags(vp, SCARG(uap, flags), l);
2538 	VOP_UNLOCK(vp, 0);
2539 	fd_putfile(SCARG(uap, fd));
2540 	return (error);
2541 }
2542 
2543 /*
2544  * Change flags of a file given a path name; this version does
2545  * not follow links.
2546  */
2547 int
2548 sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval)
2549 {
2550 	/* {
2551 		syscallarg(const char *) path;
2552 		syscallarg(u_long) flags;
2553 	} */
2554 	struct vnode *vp;
2555 	int error;
2556 
2557 	error = namei_simple_user(SCARG(uap, path),
2558 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
2559 	if (error != 0)
2560 		return (error);
2561 	error = change_flags(vp, SCARG(uap, flags), l);
2562 	vput(vp);
2563 	return (error);
2564 }
2565 
2566 /*
2567  * Common routine to change flags of a file.
2568  */
2569 int
2570 change_flags(struct vnode *vp, u_long flags, struct lwp *l)
2571 {
2572 	struct vattr vattr;
2573 	int error;
2574 
2575 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2576 	/*
2577 	 * Non-superusers cannot change the flags on devices, even if they
2578 	 * own them.
2579 	 */
2580 	if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, NULL)) {
2581 		if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
2582 			goto out;
2583 		if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
2584 			error = EINVAL;
2585 			goto out;
2586 		}
2587 	}
2588 	vattr_null(&vattr);
2589 	vattr.va_flags = flags;
2590 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2591 out:
2592 	return (error);
2593 }
2594 
2595 /*
2596  * Change mode of a file given path name; this version follows links.
2597  */
2598 /* ARGSUSED */
2599 int
2600 sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
2601 {
2602 	/* {
2603 		syscallarg(const char *) path;
2604 		syscallarg(int) mode;
2605 	} */
2606 	int error;
2607 	struct vnode *vp;
2608 
2609 	error = namei_simple_user(SCARG(uap, path),
2610 				NSM_FOLLOW_TRYEMULROOT, &vp);
2611 	if (error != 0)
2612 		return (error);
2613 
2614 	error = change_mode(vp, SCARG(uap, mode), l);
2615 
2616 	vrele(vp);
2617 	return (error);
2618 }
2619 
2620 /*
2621  * Change mode of a file given a file descriptor.
2622  */
2623 /* ARGSUSED */
2624 int
2625 sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval)
2626 {
2627 	/* {
2628 		syscallarg(int) fd;
2629 		syscallarg(int) mode;
2630 	} */
2631 	file_t *fp;
2632 	int error;
2633 
2634 	/* fd_getvnode() will use the descriptor for us */
2635 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2636 		return (error);
2637 	error = change_mode(fp->f_data, SCARG(uap, mode), l);
2638 	fd_putfile(SCARG(uap, fd));
2639 	return (error);
2640 }
2641 
2642 /*
2643  * Change mode of a file given path name; this version does not follow links.
2644  */
2645 /* ARGSUSED */
2646 int
2647 sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval)
2648 {
2649 	/* {
2650 		syscallarg(const char *) path;
2651 		syscallarg(int) mode;
2652 	} */
2653 	int error;
2654 	struct vnode *vp;
2655 
2656 	error = namei_simple_user(SCARG(uap, path),
2657 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
2658 	if (error != 0)
2659 		return (error);
2660 
2661 	error = change_mode(vp, SCARG(uap, mode), l);
2662 
2663 	vrele(vp);
2664 	return (error);
2665 }
2666 
2667 /*
2668  * Common routine to set mode given a vnode.
2669  */
2670 static int
2671 change_mode(struct vnode *vp, int mode, struct lwp *l)
2672 {
2673 	struct vattr vattr;
2674 	int error;
2675 
2676 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2677 	vattr_null(&vattr);
2678 	vattr.va_mode = mode & ALLPERMS;
2679 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2680 	VOP_UNLOCK(vp, 0);
2681 	return (error);
2682 }
2683 
2684 /*
2685  * Set ownership given a path name; this version follows links.
2686  */
2687 /* ARGSUSED */
2688 int
2689 sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
2690 {
2691 	/* {
2692 		syscallarg(const char *) path;
2693 		syscallarg(uid_t) uid;
2694 		syscallarg(gid_t) gid;
2695 	} */
2696 	int error;
2697 	struct vnode *vp;
2698 
2699 	error = namei_simple_user(SCARG(uap, path),
2700 				NSM_FOLLOW_TRYEMULROOT, &vp);
2701 	if (error != 0)
2702 		return (error);
2703 
2704 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
2705 
2706 	vrele(vp);
2707 	return (error);
2708 }
2709 
2710 /*
2711  * Set ownership given a path name; this version follows links.
2712  * Provides POSIX semantics.
2713  */
2714 /* ARGSUSED */
2715 int
2716 sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval)
2717 {
2718 	/* {
2719 		syscallarg(const char *) path;
2720 		syscallarg(uid_t) uid;
2721 		syscallarg(gid_t) gid;
2722 	} */
2723 	int error;
2724 	struct vnode *vp;
2725 
2726 	error = namei_simple_user(SCARG(uap, path),
2727 				NSM_FOLLOW_TRYEMULROOT, &vp);
2728 	if (error != 0)
2729 		return (error);
2730 
2731 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
2732 
2733 	vrele(vp);
2734 	return (error);
2735 }
2736 
2737 /*
2738  * Set ownership given a file descriptor.
2739  */
2740 /* ARGSUSED */
2741 int
2742 sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval)
2743 {
2744 	/* {
2745 		syscallarg(int) fd;
2746 		syscallarg(uid_t) uid;
2747 		syscallarg(gid_t) gid;
2748 	} */
2749 	int error;
2750 	file_t *fp;
2751 
2752 	/* fd_getvnode() will use the descriptor for us */
2753 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2754 		return (error);
2755 	error = change_owner(fp->f_data, SCARG(uap, uid), SCARG(uap, gid),
2756 	    l, 0);
2757 	fd_putfile(SCARG(uap, fd));
2758 	return (error);
2759 }
2760 
2761 /*
2762  * Set ownership given a file descriptor, providing POSIX/XPG semantics.
2763  */
2764 /* ARGSUSED */
2765 int
2766 sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval)
2767 {
2768 	/* {
2769 		syscallarg(int) fd;
2770 		syscallarg(uid_t) uid;
2771 		syscallarg(gid_t) gid;
2772 	} */
2773 	int error;
2774 	file_t *fp;
2775 
2776 	/* fd_getvnode() will use the descriptor for us */
2777 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2778 		return (error);
2779 	error = change_owner(fp->f_data, SCARG(uap, uid), SCARG(uap, gid),
2780 	    l, 1);
2781 	fd_putfile(SCARG(uap, fd));
2782 	return (error);
2783 }
2784 
2785 /*
2786  * Set ownership given a path name; this version does not follow links.
2787  */
2788 /* ARGSUSED */
2789 int
2790 sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval)
2791 {
2792 	/* {
2793 		syscallarg(const char *) path;
2794 		syscallarg(uid_t) uid;
2795 		syscallarg(gid_t) gid;
2796 	} */
2797 	int error;
2798 	struct vnode *vp;
2799 
2800 	error = namei_simple_user(SCARG(uap, path),
2801 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
2802 	if (error != 0)
2803 		return (error);
2804 
2805 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
2806 
2807 	vrele(vp);
2808 	return (error);
2809 }
2810 
2811 /*
2812  * Set ownership given a path name; this version does not follow links.
2813  * Provides POSIX/XPG semantics.
2814  */
2815 /* ARGSUSED */
2816 int
2817 sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval)
2818 {
2819 	/* {
2820 		syscallarg(const char *) path;
2821 		syscallarg(uid_t) uid;
2822 		syscallarg(gid_t) gid;
2823 	} */
2824 	int error;
2825 	struct vnode *vp;
2826 
2827 	error = namei_simple_user(SCARG(uap, path),
2828 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
2829 	if (error != 0)
2830 		return (error);
2831 
2832 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
2833 
2834 	vrele(vp);
2835 	return (error);
2836 }
2837 
2838 /*
2839  * Common routine to set ownership given a vnode.
2840  */
2841 static int
2842 change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
2843     int posix_semantics)
2844 {
2845 	struct vattr vattr;
2846 	mode_t newmode;
2847 	int error;
2848 
2849 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2850 	if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
2851 		goto out;
2852 
2853 #define CHANGED(x) ((int)(x) != -1)
2854 	newmode = vattr.va_mode;
2855 	if (posix_semantics) {
2856 		/*
2857 		 * POSIX/XPG semantics: if the caller is not the super-user,
2858 		 * clear set-user-id and set-group-id bits.  Both POSIX and
2859 		 * the XPG consider the behaviour for calls by the super-user
2860 		 * implementation-defined; we leave the set-user-id and set-
2861 		 * group-id settings intact in that case.
2862 		 */
2863 		if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
2864 				      NULL) != 0)
2865 			newmode &= ~(S_ISUID | S_ISGID);
2866 	} else {
2867 		/*
2868 		 * NetBSD semantics: when changing owner and/or group,
2869 		 * clear the respective bit(s).
2870 		 */
2871 		if (CHANGED(uid))
2872 			newmode &= ~S_ISUID;
2873 		if (CHANGED(gid))
2874 			newmode &= ~S_ISGID;
2875 	}
2876 	/* Update va_mode iff altered. */
2877 	if (vattr.va_mode == newmode)
2878 		newmode = VNOVAL;
2879 
2880 	vattr_null(&vattr);
2881 	vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
2882 	vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
2883 	vattr.va_mode = newmode;
2884 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2885 #undef CHANGED
2886 
2887 out:
2888 	VOP_UNLOCK(vp, 0);
2889 	return (error);
2890 }
2891 
2892 /*
2893  * Set the access and modification times given a path name; this
2894  * version follows links.
2895  */
2896 /* ARGSUSED */
2897 int
2898 sys___utimes50(struct lwp *l, const struct sys___utimes50_args *uap,
2899     register_t *retval)
2900 {
2901 	/* {
2902 		syscallarg(const char *) path;
2903 		syscallarg(const struct timeval *) tptr;
2904 	} */
2905 
2906 	return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
2907 	    SCARG(uap, tptr), UIO_USERSPACE);
2908 }
2909 
2910 /*
2911  * Set the access and modification times given a file descriptor.
2912  */
2913 /* ARGSUSED */
2914 int
2915 sys___futimes50(struct lwp *l, const struct sys___futimes50_args *uap,
2916     register_t *retval)
2917 {
2918 	/* {
2919 		syscallarg(int) fd;
2920 		syscallarg(const struct timeval *) tptr;
2921 	} */
2922 	int error;
2923 	file_t *fp;
2924 
2925 	/* fd_getvnode() will use the descriptor for us */
2926 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2927 		return (error);
2928 	error = do_sys_utimes(l, fp->f_data, NULL, 0, SCARG(uap, tptr),
2929 	    UIO_USERSPACE);
2930 	fd_putfile(SCARG(uap, fd));
2931 	return (error);
2932 }
2933 
2934 /*
2935  * Set the access and modification times given a path name; this
2936  * version does not follow links.
2937  */
2938 int
2939 sys___lutimes50(struct lwp *l, const struct sys___lutimes50_args *uap,
2940     register_t *retval)
2941 {
2942 	/* {
2943 		syscallarg(const char *) path;
2944 		syscallarg(const struct timeval *) tptr;
2945 	} */
2946 
2947 	return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
2948 	    SCARG(uap, tptr), UIO_USERSPACE);
2949 }
2950 
2951 /*
2952  * Common routine to set access and modification times given a vnode.
2953  */
2954 int
2955 do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
2956     const struct timeval *tptr, enum uio_seg seg)
2957 {
2958 	struct vattr vattr;
2959 	int error, dorele = 0;
2960 	namei_simple_flags_t sflags;
2961 
2962 	bool vanull, setbirthtime;
2963 	struct timespec ts[2];
2964 
2965 	/*
2966 	 * I have checked all callers and they pass either FOLLOW,
2967 	 * NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW
2968 	 * is 0. More to the point, they don't pass anything else.
2969 	 * Let's keep it that way at least until the namei interfaces
2970 	 * are fully sanitized.
2971 	 */
2972 	KASSERT(flag == NOFOLLOW || flag == FOLLOW);
2973 	sflags = (flag == FOLLOW) ?
2974 		NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT;
2975 
2976 	if (tptr == NULL) {
2977 		vanull = true;
2978 		nanotime(&ts[0]);
2979 		ts[1] = ts[0];
2980 	} else {
2981 		struct timeval tv[2];
2982 
2983 		vanull = false;
2984 		if (seg != UIO_SYSSPACE) {
2985 			error = copyin(tptr, tv, sizeof (tv));
2986 			if (error != 0)
2987 				return error;
2988 			tptr = tv;
2989 		}
2990 		TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]);
2991 		TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]);
2992 	}
2993 
2994 	if (vp == NULL) {
2995 		/* note: SEG describes TPTR, not PATH; PATH is always user */
2996 		error = namei_simple_user(path, sflags, &vp);
2997 		if (error != 0)
2998 			return error;
2999 		dorele = 1;
3000 	}
3001 
3002 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3003 	setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 &&
3004 	    timespeccmp(&ts[1], &vattr.va_birthtime, <));
3005 	vattr_null(&vattr);
3006 	vattr.va_atime = ts[0];
3007 	vattr.va_mtime = ts[1];
3008 	if (setbirthtime)
3009 		vattr.va_birthtime = ts[1];
3010 	if (vanull)
3011 		vattr.va_vaflags |= VA_UTIMES_NULL;
3012 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3013 	VOP_UNLOCK(vp, 0);
3014 
3015 	if (dorele != 0)
3016 		vrele(vp);
3017 
3018 	return error;
3019 }
3020 
3021 /*
3022  * Truncate a file given its path name.
3023  */
3024 /* ARGSUSED */
3025 int
3026 sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval)
3027 {
3028 	/* {
3029 		syscallarg(const char *) path;
3030 		syscallarg(int) pad;
3031 		syscallarg(off_t) length;
3032 	} */
3033 	struct vnode *vp;
3034 	struct vattr vattr;
3035 	int error;
3036 
3037 	error = namei_simple_user(SCARG(uap, path),
3038 				NSM_FOLLOW_TRYEMULROOT, &vp);
3039 	if (error != 0)
3040 		return (error);
3041 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3042 	if (vp->v_type == VDIR)
3043 		error = EISDIR;
3044 	else if ((error = vn_writechk(vp)) == 0 &&
3045 	    (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
3046 		vattr_null(&vattr);
3047 		vattr.va_size = SCARG(uap, length);
3048 		error = VOP_SETATTR(vp, &vattr, l->l_cred);
3049 	}
3050 	vput(vp);
3051 	return (error);
3052 }
3053 
3054 /*
3055  * Truncate a file given a file descriptor.
3056  */
3057 /* ARGSUSED */
3058 int
3059 sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval)
3060 {
3061 	/* {
3062 		syscallarg(int) fd;
3063 		syscallarg(int) pad;
3064 		syscallarg(off_t) length;
3065 	} */
3066 	struct vattr vattr;
3067 	struct vnode *vp;
3068 	file_t *fp;
3069 	int error;
3070 
3071 	/* fd_getvnode() will use the descriptor for us */
3072 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3073 		return (error);
3074 	if ((fp->f_flag & FWRITE) == 0) {
3075 		error = EINVAL;
3076 		goto out;
3077 	}
3078 	vp = fp->f_data;
3079 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3080 	if (vp->v_type == VDIR)
3081 		error = EISDIR;
3082 	else if ((error = vn_writechk(vp)) == 0) {
3083 		vattr_null(&vattr);
3084 		vattr.va_size = SCARG(uap, length);
3085 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
3086 	}
3087 	VOP_UNLOCK(vp, 0);
3088  out:
3089 	fd_putfile(SCARG(uap, fd));
3090 	return (error);
3091 }
3092 
3093 /*
3094  * Sync an open file.
3095  */
3096 /* ARGSUSED */
3097 int
3098 sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
3099 {
3100 	/* {
3101 		syscallarg(int) fd;
3102 	} */
3103 	struct vnode *vp;
3104 	file_t *fp;
3105 	int error;
3106 
3107 	/* fd_getvnode() will use the descriptor for us */
3108 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3109 		return (error);
3110 	vp = fp->f_data;
3111 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3112 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
3113 	VOP_UNLOCK(vp, 0);
3114 	fd_putfile(SCARG(uap, fd));
3115 	return (error);
3116 }
3117 
3118 /*
3119  * Sync a range of file data.  API modeled after that found in AIX.
3120  *
3121  * FDATASYNC indicates that we need only save enough metadata to be able
3122  * to re-read the written data.  Note we duplicate AIX's requirement that
3123  * the file be open for writing.
3124  */
3125 /* ARGSUSED */
3126 int
3127 sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval)
3128 {
3129 	/* {
3130 		syscallarg(int) fd;
3131 		syscallarg(int) flags;
3132 		syscallarg(off_t) start;
3133 		syscallarg(off_t) length;
3134 	} */
3135 	struct vnode *vp;
3136 	file_t *fp;
3137 	int flags, nflags;
3138 	off_t s, e, len;
3139 	int error;
3140 
3141 	/* fd_getvnode() will use the descriptor for us */
3142 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3143 		return (error);
3144 
3145 	if ((fp->f_flag & FWRITE) == 0) {
3146 		error = EBADF;
3147 		goto out;
3148 	}
3149 
3150 	flags = SCARG(uap, flags);
3151 	if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
3152 	    ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
3153 		error = EINVAL;
3154 		goto out;
3155 	}
3156 	/* Now set up the flags for value(s) to pass to VOP_FSYNC() */
3157 	if (flags & FDATASYNC)
3158 		nflags = FSYNC_DATAONLY | FSYNC_WAIT;
3159 	else
3160 		nflags = FSYNC_WAIT;
3161 	if (flags & FDISKSYNC)
3162 		nflags |= FSYNC_CACHE;
3163 
3164 	len = SCARG(uap, length);
3165 	/* If length == 0, we do the whole file, and s = l = 0 will do that */
3166 	if (len) {
3167 		s = SCARG(uap, start);
3168 		e = s + len;
3169 		if (e < s) {
3170 			error = EINVAL;
3171 			goto out;
3172 		}
3173 	} else {
3174 		e = 0;
3175 		s = 0;
3176 	}
3177 
3178 	vp = fp->f_data;
3179 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3180 	error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
3181 	VOP_UNLOCK(vp, 0);
3182 out:
3183 	fd_putfile(SCARG(uap, fd));
3184 	return (error);
3185 }
3186 
3187 /*
3188  * Sync the data of an open file.
3189  */
3190 /* ARGSUSED */
3191 int
3192 sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval)
3193 {
3194 	/* {
3195 		syscallarg(int) fd;
3196 	} */
3197 	struct vnode *vp;
3198 	file_t *fp;
3199 	int error;
3200 
3201 	/* fd_getvnode() will use the descriptor for us */
3202 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3203 		return (error);
3204 	if ((fp->f_flag & FWRITE) == 0) {
3205 		fd_putfile(SCARG(uap, fd));
3206 		return (EBADF);
3207 	}
3208 	vp = fp->f_data;
3209 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3210 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
3211 	VOP_UNLOCK(vp, 0);
3212 	fd_putfile(SCARG(uap, fd));
3213 	return (error);
3214 }
3215 
3216 /*
3217  * Rename files, (standard) BSD semantics frontend.
3218  */
3219 /* ARGSUSED */
3220 int
3221 sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval)
3222 {
3223 	/* {
3224 		syscallarg(const char *) from;
3225 		syscallarg(const char *) to;
3226 	} */
3227 
3228 	return (do_sys_rename(SCARG(uap, from), SCARG(uap, to), UIO_USERSPACE, 0));
3229 }
3230 
3231 /*
3232  * Rename files, POSIX semantics frontend.
3233  */
3234 /* ARGSUSED */
3235 int
3236 sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval)
3237 {
3238 	/* {
3239 		syscallarg(const char *) from;
3240 		syscallarg(const char *) to;
3241 	} */
3242 
3243 	return (do_sys_rename(SCARG(uap, from), SCARG(uap, to), UIO_USERSPACE, 1));
3244 }
3245 
3246 /*
3247  * Rename files.  Source and destination must either both be directories,
3248  * or both not be directories.  If target is a directory, it must be empty.
3249  * If `from' and `to' refer to the same object, the value of the `retain'
3250  * argument is used to determine whether `from' will be
3251  *
3252  * (retain == 0)	deleted unless `from' and `to' refer to the same
3253  *			object in the file system's name space (BSD).
3254  * (retain == 1)	always retained (POSIX).
3255  */
3256 int
3257 do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
3258 {
3259 	struct vnode *tvp, *fvp, *tdvp;
3260 	struct nameidata fromnd, tond;
3261 	struct mount *fs;
3262 	struct lwp *l = curlwp;
3263 	struct proc *p;
3264 	uint32_t saveflag;
3265 	int error;
3266 
3267 	NDINIT(&fromnd, DELETE, LOCKPARENT | SAVESTART | TRYEMULROOT | INRENAME,
3268 	    seg, from);
3269 	if ((error = namei(&fromnd)) != 0)
3270 		return (error);
3271 	if (fromnd.ni_dvp != fromnd.ni_vp)
3272 		VOP_UNLOCK(fromnd.ni_dvp, 0);
3273 	fvp = fromnd.ni_vp;
3274 
3275 	fs = fvp->v_mount;
3276 	error = VFS_RENAMELOCK_ENTER(fs);
3277 	if (error) {
3278 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3279 		vrele(fromnd.ni_dvp);
3280 		vrele(fvp);
3281 		goto out1;
3282 	}
3283 
3284 	/*
3285 	 * close, partially, yet another race - ideally we should only
3286 	 * go as far as getting fromnd.ni_dvp before getting the per-fs
3287 	 * lock, and then continue to get fromnd.ni_vp, but we can't do
3288 	 * that with namei as it stands.
3289 	 *
3290 	 * This still won't prevent rmdir from nuking fromnd.ni_vp
3291 	 * under us. The real fix is to get the locks in the right
3292 	 * order and do the lookups in the right places, but that's a
3293 	 * major rototill.
3294 	 *
3295 	 * Preserve the SAVESTART in cn_flags, because who knows what
3296 	 * might happen if we don't.
3297 	 *
3298 	 * Note: this logic (as well as this whole function) is cloned
3299 	 * in nfs_serv.c. Proceed accordingly.
3300 	 */
3301 	vrele(fvp);
3302 	if ((fromnd.ni_cnd.cn_namelen == 1 &&
3303 	     fromnd.ni_cnd.cn_nameptr[0] == '.') ||
3304 	    (fromnd.ni_cnd.cn_namelen == 2 &&
3305 	     fromnd.ni_cnd.cn_nameptr[0] == '.' &&
3306 	     fromnd.ni_cnd.cn_nameptr[1] == '.')) {
3307 		error = EINVAL;
3308 		VFS_RENAMELOCK_EXIT(fs);
3309 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3310 		vrele(fromnd.ni_dvp);
3311 		goto out1;
3312 	}
3313 	saveflag = fromnd.ni_cnd.cn_flags & SAVESTART;
3314 	fromnd.ni_cnd.cn_flags &= ~SAVESTART;
3315 	vn_lock(fromnd.ni_dvp, LK_EXCLUSIVE | LK_RETRY);
3316 	error = relookup(fromnd.ni_dvp, &fromnd.ni_vp, &fromnd.ni_cnd);
3317 	fromnd.ni_cnd.cn_flags |= saveflag;
3318 	if (error) {
3319 		VOP_UNLOCK(fromnd.ni_dvp, 0);
3320 		VFS_RENAMELOCK_EXIT(fs);
3321 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3322 		vrele(fromnd.ni_dvp);
3323 		goto out1;
3324 	}
3325 	VOP_UNLOCK(fromnd.ni_vp, 0);
3326 	if (fromnd.ni_dvp != fromnd.ni_vp)
3327 		VOP_UNLOCK(fromnd.ni_dvp, 0);
3328 	fvp = fromnd.ni_vp;
3329 
3330 	NDINIT(&tond, RENAME,
3331 	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | TRYEMULROOT
3332 	      | INRENAME | (fvp->v_type == VDIR ? CREATEDIR : 0),
3333 	    seg, to);
3334 	if ((error = namei(&tond)) != 0) {
3335 		VFS_RENAMELOCK_EXIT(fs);
3336 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3337 		vrele(fromnd.ni_dvp);
3338 		vrele(fvp);
3339 		goto out1;
3340 	}
3341 	tdvp = tond.ni_dvp;
3342 	tvp = tond.ni_vp;
3343 
3344 	if (tvp != NULL) {
3345 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
3346 			error = ENOTDIR;
3347 			goto out;
3348 		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
3349 			error = EISDIR;
3350 			goto out;
3351 		}
3352 	}
3353 
3354 	if (fvp == tdvp)
3355 		error = EINVAL;
3356 
3357 	/*
3358 	 * Source and destination refer to the same object.
3359 	 */
3360 	if (fvp == tvp) {
3361 		if (retain)
3362 			error = -1;
3363 		else if (fromnd.ni_dvp == tdvp &&
3364 		    fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen &&
3365 		    !memcmp(fromnd.ni_cnd.cn_nameptr,
3366 		          tond.ni_cnd.cn_nameptr,
3367 		          fromnd.ni_cnd.cn_namelen))
3368 		error = -1;
3369 	}
3370 
3371 #if NVERIEXEC > 0
3372 	if (!error) {
3373 		char *f1, *f2;
3374 		size_t f1_len;
3375 		size_t f2_len;
3376 
3377 		f1_len = fromnd.ni_cnd.cn_namelen + 1;
3378 		f1 = kmem_alloc(f1_len, KM_SLEEP);
3379 		strlcpy(f1, fromnd.ni_cnd.cn_nameptr, f1_len);
3380 
3381 		f2_len = tond.ni_cnd.cn_namelen + 1;
3382 		f2 = kmem_alloc(f2_len, KM_SLEEP);
3383 		strlcpy(f2, tond.ni_cnd.cn_nameptr, f2_len);
3384 
3385 		error = veriexec_renamechk(l, fvp, f1, tvp, f2);
3386 
3387 		kmem_free(f1, f1_len);
3388 		kmem_free(f2, f2_len);
3389 	}
3390 #endif /* NVERIEXEC > 0 */
3391 
3392 out:
3393 	p = l->l_proc;
3394 	if (!error) {
3395 		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
3396 				   tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
3397 		VFS_RENAMELOCK_EXIT(fs);
3398 	} else {
3399 		VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd);
3400 		if (tdvp == tvp)
3401 			vrele(tdvp);
3402 		else
3403 			vput(tdvp);
3404 		if (tvp)
3405 			vput(tvp);
3406 		VFS_RENAMELOCK_EXIT(fs);
3407 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3408 		vrele(fromnd.ni_dvp);
3409 		vrele(fvp);
3410 	}
3411 	vrele(tond.ni_startdir);
3412 	PNBUF_PUT(tond.ni_cnd.cn_pnbuf);
3413 out1:
3414 	if (fromnd.ni_startdir)
3415 		vrele(fromnd.ni_startdir);
3416 	PNBUF_PUT(fromnd.ni_cnd.cn_pnbuf);
3417 	return (error == -1 ? 0 : error);
3418 }
3419 
3420 /*
3421  * Make a directory file.
3422  */
3423 /* ARGSUSED */
3424 int
3425 sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
3426 {
3427 	/* {
3428 		syscallarg(const char *) path;
3429 		syscallarg(int) mode;
3430 	} */
3431 
3432 	return do_sys_mkdir(SCARG(uap, path), SCARG(uap, mode), UIO_USERSPACE);
3433 }
3434 
3435 int
3436 do_sys_mkdir(const char *path, mode_t mode, enum uio_seg seg)
3437 {
3438 	struct proc *p = curlwp->l_proc;
3439 	struct vnode *vp;
3440 	struct vattr vattr;
3441 	int error;
3442 	struct nameidata nd;
3443 
3444 	NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT,
3445 	    seg, path);
3446 	if ((error = namei(&nd)) != 0)
3447 		return (error);
3448 	vp = nd.ni_vp;
3449 	if (vp != NULL) {
3450 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
3451 		if (nd.ni_dvp == vp)
3452 			vrele(nd.ni_dvp);
3453 		else
3454 			vput(nd.ni_dvp);
3455 		vrele(vp);
3456 		return (EEXIST);
3457 	}
3458 	vattr_null(&vattr);
3459 	vattr.va_type = VDIR;
3460 	/* We will read cwdi->cwdi_cmask unlocked. */
3461 	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
3462 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3463 	if (!error)
3464 		vput(nd.ni_vp);
3465 	return (error);
3466 }
3467 
3468 /*
3469  * Remove a directory file.
3470  */
3471 /* ARGSUSED */
3472 int
3473 sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
3474 {
3475 	/* {
3476 		syscallarg(const char *) path;
3477 	} */
3478 	struct vnode *vp;
3479 	int error;
3480 	struct nameidata nd;
3481 
3482 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
3483 	    SCARG(uap, path));
3484 	if ((error = namei(&nd)) != 0)
3485 		return (error);
3486 	vp = nd.ni_vp;
3487 	if (vp->v_type != VDIR) {
3488 		error = ENOTDIR;
3489 		goto out;
3490 	}
3491 	/*
3492 	 * No rmdir "." please.
3493 	 */
3494 	if (nd.ni_dvp == vp) {
3495 		error = EINVAL;
3496 		goto out;
3497 	}
3498 	/*
3499 	 * The root of a mounted filesystem cannot be deleted.
3500 	 */
3501 	if ((vp->v_vflag & VV_ROOT) != 0 || vp->v_mountedhere != NULL) {
3502 		error = EBUSY;
3503 		goto out;
3504 	}
3505 	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3506 	return (error);
3507 
3508 out:
3509 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
3510 	if (nd.ni_dvp == vp)
3511 		vrele(nd.ni_dvp);
3512 	else
3513 		vput(nd.ni_dvp);
3514 	vput(vp);
3515 	return (error);
3516 }
3517 
3518 /*
3519  * Read a block of directory entries in a file system independent format.
3520  */
3521 int
3522 sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval)
3523 {
3524 	/* {
3525 		syscallarg(int) fd;
3526 		syscallarg(char *) buf;
3527 		syscallarg(size_t) count;
3528 	} */
3529 	file_t *fp;
3530 	int error, done;
3531 
3532 	/* fd_getvnode() will use the descriptor for us */
3533 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3534 		return (error);
3535 	if ((fp->f_flag & FREAD) == 0) {
3536 		error = EBADF;
3537 		goto out;
3538 	}
3539 	error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
3540 			SCARG(uap, count), &done, l, 0, 0);
3541 	ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
3542 	*retval = done;
3543  out:
3544 	fd_putfile(SCARG(uap, fd));
3545 	return (error);
3546 }
3547 
3548 /*
3549  * Set the mode mask for creation of filesystem nodes.
3550  */
3551 int
3552 sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
3553 {
3554 	/* {
3555 		syscallarg(mode_t) newmask;
3556 	} */
3557 	struct proc *p = l->l_proc;
3558 	struct cwdinfo *cwdi;
3559 
3560 	/*
3561 	 * cwdi->cwdi_cmask will be read unlocked elsewhere.  What's
3562 	 * important is that we serialize changes to the mask.  The
3563 	 * rw_exit() will issue a write memory barrier on our behalf,
3564 	 * and force the changes out to other CPUs (as it must use an
3565 	 * atomic operation, draining the local CPU's store buffers).
3566 	 */
3567 	cwdi = p->p_cwdi;
3568 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
3569 	*retval = cwdi->cwdi_cmask;
3570 	cwdi->cwdi_cmask = SCARG(uap, newmask) & ALLPERMS;
3571 	rw_exit(&cwdi->cwdi_lock);
3572 
3573 	return (0);
3574 }
3575 
3576 int
3577 dorevoke(struct vnode *vp, kauth_cred_t cred)
3578 {
3579 	struct vattr vattr;
3580 	int error;
3581 
3582 	if ((error = VOP_GETATTR(vp, &vattr, cred)) != 0)
3583 		return error;
3584 	if (kauth_cred_geteuid(cred) == vattr.va_uid ||
3585 	    (error = kauth_authorize_generic(cred,
3586 	    KAUTH_GENERIC_ISSUSER, NULL)) == 0)
3587 		VOP_REVOKE(vp, REVOKEALL);
3588 	return (error);
3589 }
3590 
3591 /*
3592  * Void all references to file by ripping underlying filesystem
3593  * away from vnode.
3594  */
3595 /* ARGSUSED */
3596 int
3597 sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval)
3598 {
3599 	/* {
3600 		syscallarg(const char *) path;
3601 	} */
3602 	struct vnode *vp;
3603 	int error;
3604 
3605 	error = namei_simple_user(SCARG(uap, path),
3606 				NSM_FOLLOW_TRYEMULROOT, &vp);
3607 	if (error != 0)
3608 		return (error);
3609 	error = dorevoke(vp, l->l_cred);
3610 	vrele(vp);
3611 	return (error);
3612 }
3613