xref: /netbsd-src/sys/kern/vfs_syscalls.c (revision cd22f25e6f6d1cc1f197fe8c5468a80f51d1c4e1)
1 /*	$NetBSD: vfs_syscalls.c,v 1.354 2008/04/30 12:49:17 ad Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Copyright (c) 1989, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  * (c) UNIX System Laboratories, Inc.
33  * All or some portions of this file are derived from material licensed
34  * to the University of California by American Telephone and Telegraph
35  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36  * the permission of UNIX System Laboratories, Inc.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. Neither the name of the University nor the names of its contributors
47  *    may be used to endorse or promote products derived from this software
48  *    without specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  *
62  *	@(#)vfs_syscalls.c	8.42 (Berkeley) 7/31/95
63  */
64 
65 #include <sys/cdefs.h>
66 __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.354 2008/04/30 12:49:17 ad Exp $");
67 
68 #include "opt_compat_netbsd.h"
69 #include "opt_compat_43.h"
70 #include "opt_fileassoc.h"
71 #include "fss.h"
72 #include "veriexec.h"
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file.h>
80 #include <sys/stat.h>
81 #include <sys/vnode.h>
82 #include <sys/mount.h>
83 #include <sys/proc.h>
84 #include <sys/uio.h>
85 #include <sys/malloc.h>
86 #include <sys/kmem.h>
87 #include <sys/dirent.h>
88 #include <sys/sysctl.h>
89 #include <sys/syscallargs.h>
90 #include <sys/vfs_syscalls.h>
91 #include <sys/ktrace.h>
92 #ifdef FILEASSOC
93 #include <sys/fileassoc.h>
94 #endif /* FILEASSOC */
95 #include <sys/verified_exec.h>
96 #include <sys/kauth.h>
97 #include <sys/atomic.h>
98 
99 #include <miscfs/genfs/genfs.h>
100 #include <miscfs/syncfs/syncfs.h>
101 #include <miscfs/specfs/specdev.h>
102 
103 #ifdef COMPAT_30
104 #include "opt_nfsserver.h"
105 #include <nfs/rpcv2.h>
106 #endif
107 #include <nfs/nfsproto.h>
108 #ifdef COMPAT_30
109 #include <nfs/nfs.h>
110 #include <nfs/nfs_var.h>
111 #endif
112 
113 #if NFSS > 0
114 #include <dev/fssvar.h>
115 #endif
116 
/* Define the M_MOUNT malloc type ("mount": "vfs mount struct"). */
117 MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount struct");
118 
/* Forward declarations of file-local helpers. */
119 static int change_dir(struct nameidata *, struct lwp *);
120 static int change_flags(struct vnode *, u_long, struct lwp *);
121 static int change_mode(struct vnode *, int, struct lwp *l);
122 static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
123 
/*
 * Re-points process current/root directories at a newly mounted file
 * system; called by mount_domount().
 */
124 void checkdirs(struct vnode *);
125 
/*
 * NOTE(review): presumably a knob for allowing mounts by ordinary users;
 * nothing in the nearby mount code reads it -- confirm where it is used.
 */
126 int dovfsusermount = 0;
127 
128 /*
129  * Virtual File System System Calls
130  */
131 
132 /*
133  * Mount a file system.
134  */
135 
136 #if defined(COMPAT_09) || defined(COMPAT_43)
137 /*
138  * This table is used to maintain compatibility with 4.3BSD
139  * and NetBSD 0.9 mount syscalls.  Note, the order is important!
140  *
141  * Do not modify this table. It should only contain filesystems
142  * supported by NetBSD 0.9 and 4.3BSD.
143  */
/*
 * Indexed by the historic numeric file system type.  A NULL slot is a
 * number with no usable file system; mount_get_vfsops() answers ENODEV
 * for it.
 */
144 const char * const mountcompatnames[] = {
145 	NULL,		/* 0 = MOUNT_NONE */
146 	MOUNT_FFS,	/* 1 = MOUNT_UFS */
147 	MOUNT_NFS,	/* 2 */
148 	MOUNT_MFS,	/* 3 */
149 	MOUNT_MSDOS,	/* 4 */
150 	MOUNT_CD9660,	/* 5 = MOUNT_ISOFS */
151 	MOUNT_FDESC,	/* 6 */
152 	MOUNT_KERNFS,	/* 7 */
153 	NULL,		/* 8 = MOUNT_DEVFS */
154 	MOUNT_AFS,	/* 9 */
155 };
/*
 * Number of slots in mountcompatnames[]; mount_get_vfsops() uses it to
 * range-check a legacy numeric file system type.
 */
156 const int nmountcompatnames = sizeof(mountcompatnames) /
157     sizeof(mountcompatnames[0]);
158 #endif /* COMPAT_09 || COMPAT_43 */
159 
/*
 * Handle a mount request carrying MNT_UPDATE: change the flags and/or the
 * file system specific arguments of the mount that vp belongs to.
 *
 *	l		calling LWP; its credentials are checked via kauth.
 *	vp		vnode named by the caller; must be the root of a mount.
 *	path		mount point path, handed through to VFS_MOUNT().
 *	flags		MNT_* flags from the caller.
 *	data, data_len	file system specific arguments for VFS_MOUNT().
 *
 * Returns 0 on success or an errno value: EINVAL if vp is not a file
 * system root, EOPNOTSUPP if MNT_RELOAD is requested for a mount that is
 * not read-only, EPERM if the mount cannot be busied, or whatever the
 * kauth check, VFS_MOUNT() or vfs_allocate_syncvnode() returned.  When
 * the update fails, the previous mnt_flag value is put back.
 */
160 static int
161 mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
162     void *data, size_t *data_len)
163 {
164 	struct mount *mp;
165 	int error = 0, saved_flags;
166 
167 	mp = vp->v_mount;
	/* Remember the current flags so that a failed update can be undone. */
168 	saved_flags = mp->mnt_flag;
169 
170 	/* We can operate only on VV_ROOT nodes. */
171 	if ((vp->v_vflag & VV_ROOT) == 0) {
172 		error = EINVAL;
173 		goto out;
174 	}
175 
176 	/*
177 	 * We only allow the filesystem to be reloaded if it
178 	 * is currently mounted read-only.
179 	 */
180 	if (flags & MNT_RELOAD && !(mp->mnt_flag & MNT_RDONLY)) {
181 		error = EOPNOTSUPP;	/* Needs translation */
182 		goto out;
183 	}
184 
185 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
186 	    KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
187 	if (error)
188 		goto out;
189 
	/* Any failure to busy the mount is reported as EPERM. */
190 	if (vfs_busy(mp, RW_WRITER)) {
191 		error = EPERM;
192 		goto out;
193 	}
194 
	/* Replace the operation flags with the ones for this request. */
195 	mp->mnt_flag &= ~MNT_OP_FLAGS;
196 	mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
197 
198 	/*
199 	 * Set the mount level flags.
	 *
	 * A read-only mount updated without MNT_RDONLY is a request to go
	 * read-write: that is recorded in mnt_iflag as IMNT_WANTRDWR and
	 * MNT_RDONLY itself is only cleared after VFS_MOUNT() has run.
200 	 */
201 	if (flags & MNT_RDONLY)
202 		mp->mnt_flag |= MNT_RDONLY;
203 	else if (mp->mnt_flag & MNT_RDONLY)
204 		mp->mnt_iflag |= IMNT_WANTRDWR;
	/*
	 * The settable flags are first all cleared, then taken from the
	 * request.  MNT_IGNORE is only in the second mask, so it can be
	 * turned on here but is never cleared by an update.
	 */
205 	mp->mnt_flag &=
206 	  ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
207 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
208 	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP);
209 	mp->mnt_flag |= flags &
210 	   (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
211 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
212 	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
213 	    MNT_IGNORE);
214 
215 	error = VFS_MOUNT(mp, path, data, data_len);
216 
217 #if defined(COMPAT_30) && defined(NFSSERVER)
218 	if (error && data != NULL) {
219 		int error2;
220 
221 		/* Update failed; let's try and see if it was an
222 		 * export request. */
223 		error2 = nfs_update_exports_30(mp, path, data, l);
224 
225 		/* Only update error code if the export request was
226 		 * understood but some problem occurred while
227 		 * processing it. */
228 		if (error2 != EJUSTRETURN)
229 			error = error2;
230 	}
231 #endif
	/*
	 * Carry out the read-write switch noted above.  On failure the
	 * whole flag word, MNT_RDONLY included, is restored from
	 * saved_flags on the following lines.
	 */
232 	if (mp->mnt_iflag & IMNT_WANTRDWR)
233 		mp->mnt_flag &= ~MNT_RDONLY;
234 	if (error)
235 		mp->mnt_flag = saved_flags;
236 	mp->mnt_flag &= ~MNT_OP_FLAGS;
237 	mp->mnt_iflag &= ~IMNT_WANTRDWR;
	/*
	 * A mount that is neither read-only nor async gets a syncer vnode;
	 * any other mount has its syncer vnode removed.
	 *
	 * NOTE(review): the assignment below replaces whatever error
	 * VFS_MOUNT() returned above -- confirm that this is intended.
	 */
238 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
239 		if (mp->mnt_syncer == NULL)
240 			error = vfs_allocate_syncvnode(mp);
241 	} else {
242 		if (mp->mnt_syncer != NULL)
243 			vfs_deallocate_syncvnode(mp);
244 	}
245 	vfs_unbusy(mp, false, NULL);
246 
247  out:
248 	return (error);
249 }
250 
/*
 * Translate the file system type given to mount into a vfsops pointer.
 *
 *	fstype	user-space pointer to the type name.  With COMPAT_09 or
 *		COMPAT_43, a value that cannot be copied in as a string is
 *		retried as a historic numeric type.
 *	vfsops	out: result of vfs_getopsbyname() for the resolved name.
 *
 * Returns 0 on success, ENODEV when no file system matches, or (without
 * the compat options) the copyinstr() error.
 */
251 static int
252 mount_get_vfsops(const char *fstype, struct vfsops **vfsops)
253 {
254 	char name[sizeof(((struct statvfs *)NULL)->f_fstypename)];
255 	int rv;
256 
257 	/* Fetch the type name from the caller's address space. */
258 	rv = copyinstr(fstype, name, sizeof(name), NULL);
259 	if (rv != 0) {
260 #if defined(COMPAT_09) || defined(COMPAT_43)
261 		/*
262 		 * Old binaries named the file system type by number
263 		 * rather than by string.  When no string could be
264 		 * copied in, reinterpret the pointer value as such a
265 		 * number and map it through the compatibility table.
266 		 */
267 		const u_long slot = (u_long)fstype;
268 		if (slot >= nmountcompatnames ||
269 		    mountcompatnames[slot] == NULL)
270 			return ENODEV;
271 		strlcpy(name, mountcompatnames[slot], sizeof(name));
272 #else
273 		return rv;
274 #endif
275 	}
276 
277 #ifdef	COMPAT_10
278 	/* "ufs" is the old spelling of "ffs"; patch the first letter. */
279 	if (strcmp(name, "ufs") == 0)
280 		name[0] = 'f';
281 #endif
282 
283 	*vfsops = vfs_getopsbyname(name);
284 	if (*vfsops == NULL)
285 		return ENODEV;
286 	return 0;
287 }
288 
/*
 * Mount a new file system of type vfsops on the directory vnode *vpp.
 *
 *	l		calling LWP (credentials for the permission checks).
 *	vpp		in: vnode to be covered; do_sys_mount() passes it locked.
 *			out: set to NULL once the mount has been entered on the
 *			mount list, so that the caller does not release it.
 *	vfsops		operations vector of the file system type to mount.
 *	path		mount point path, handed through to VFS_MOUNT().
 *	flags		MNT_* flags from the caller.
 *	data, data_len	file system specific arguments for VFS_MOUNT().
 *	recurse		value the caller got from vn_setrecurse(); restored
 *			here before the vnode is unlocked.
 *
 * Returns 0 on success or an errno value.  On every error return taken
 * before the mount is put on the mount list, *vpp is left untouched and
 * the caller still owns the locked vnode.
 */
289 static int
290 mount_domount(struct lwp *l, struct vnode **vpp, struct vfsops *vfsops,
291     const char *path, int flags, void *data, size_t *data_len, u_int recurse)
292 {
293 	struct mount *mp = NULL;
294 	struct vnode *vp = *vpp;
295 	struct vattr va;
296 	int error;
297 
298 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
299 	    KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data);
300 	if (error)
301 		return error;
302 
303 	/* Can't make a non-dir a mount-point (from here anyway). */
304 	if (vp->v_type != VDIR)
305 		return ENOTDIR;
306 
307 	/*
308 	 * If the user is not root, ensure that they own the directory
309 	 * onto which we are attempting to mount.
	 *
	 * The super-user check only runs when the directory's owner
	 * differs from the caller's effective uid.
310 	 */
311 	if ((error = VOP_GETATTR(vp, &va, l->l_cred)) != 0 ||
312 	    (va.va_uid != kauth_cred_geteuid(l->l_cred) &&
313 	    (error = kauth_authorize_generic(l->l_cred,
314 	    KAUTH_GENERIC_ISSUSER, NULL)) != 0)) {
315 		return error;
316 	}
317 
	/* MNT_EXPORTED is not accepted when creating a mount. */
318 	if (flags & MNT_EXPORTED)
319 		return EINVAL;
320 
321 	if ((error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0)) != 0)
322 		return error;
323 
324 	/*
325 	 * Check if a file-system is not already mounted on this vnode.
326 	 */
327 	if (vp->v_mountedhere != NULL)
328 		return EBUSY;
329 
	/*
	 * NOTE(review): a KM_SLEEP allocation presumably cannot return
	 * NULL, which would make the check below unreachable -- confirm.
	 */
330 	mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
331 	if (mp == NULL)
332 		return ENOMEM;
333 
334 	mp->mnt_op = vfsops;
335 	mp->mnt_refcnt = 1;
336 
337 	TAILQ_INIT(&mp->mnt_vnodelist);
338 	rw_init(&mp->mnt_lock);
339  	mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
	/* The mount is not on any list yet; the result is ignored. */
340 	(void)vfs_busy(mp, RW_WRITER);
341 
342 	mp->mnt_vnodecovered = vp;
343 	mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred);
344 	mount_initspecific(mp);
345 
346 	/*
347 	 * The underlying file system may refuse the mount for
348 	 * various reasons.  Allow the user to force it to happen.
349 	 *
350 	 * Set the mount level flags.
351 	 */
352 	mp->mnt_flag = flags &
353 	   (MNT_FORCE | MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
354 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
355 	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
356 	    MNT_IGNORE | MNT_RDONLY);
357 
358 	error = VFS_MOUNT(mp, path, data, data_len);
359 	mp->mnt_flag &= ~MNT_OP_FLAGS;
360 
361 	/*
362 	 * Put the new filesystem on the mount list after root.
	 *
	 * If VFS_MOUNT() failed, the half-built mount is unbusied and
	 * destroyed instead, and the covered vnode stays with the caller.
363 	 */
364 	cache_purge(vp);
365 	if (error != 0) {
366 		vp->v_mountedhere = NULL;
367 		vfs_unbusy(mp, false, NULL);
368 		vfs_destroy(mp, false);
369 		return error;
370 	}
371 
372 	mp->mnt_iflag &= ~IMNT_WANTRDWR;
373 	mutex_enter(&mountlist_lock);
374 	vp->v_mountedhere = mp;
375 	CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
376 	mp->mnt_iflag |= IMNT_ONLIST;
377 	mutex_exit(&mountlist_lock);
	/*
	 * The covered vnode is unlocked but not released: the mount keeps
	 * pointing at it through mnt_vnodecovered.
	 */
378     	vn_restorerecurse(vp, recurse);
379 	VOP_UNLOCK(vp, 0);
380 	checkdirs(vp);
	/*
	 * NOTE(review): the result stored here is overwritten by the
	 * VFS_START() call below before anything looks at it.
	 */
381 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
382 		error = vfs_allocate_syncvnode(mp);
383 	/* Hold an additional reference to the mount across VFS_START(). */
384 	vfs_unbusy(mp, true, NULL);
385 	(void) VFS_STATVFS(mp, &mp->mnt_stat);
386 	error = VFS_START(mp, 0);
	/*
	 * NOTE(review): on VFS_START() failure the vnode reference and one
	 * mount reference are dropped, yet the mount was already put on
	 * the mount list above -- confirm this teardown is complete.
	 */
387 	if (error) {
388 		vrele(vp);
389 		vfs_destroy(mp, false);
390 	}
391 	/* Drop reference held for VFS_START(). */
392 	vfs_destroy(mp, false);
	/* Tell the caller not to unlock or release the vnode. */
393 	*vpp = NULL;
394 	return error;
395 }
396 
/*
 * Handle a mount request carrying MNT_GETARGS: ask the file system that
 * vp belongs to for its current mount arguments.
 *
 *	l		calling LWP; its credentials are checked via kauth.
 *	vp		vnode named by the caller; must be the root of a mount.
 *	path		mount point path, handed through to VFS_MOUNT().
 *	flags		must be exactly MNT_GETARGS.
 *	data, data_len	argument buffer and its length for VFS_MOUNT().
 *
 * Returns 0 or an errno value: EINVAL for extra flags or a vnode that is
 * not a file system root, EPERM if the mount cannot be busied, otherwise
 * the result of the kauth check or of VFS_MOUNT().
 */
397 static int
398 mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
399     void *data, size_t *data_len)
400 {
401 	struct mount *mnt = vp->v_mount;
402 	int rv;
403 
404 	/* MNT_GETARGS does not combine with any other flag. */
405 	if ((flags & ~MNT_GETARGS) != 0)
406 		return EINVAL;
407 
408 	/* XXX: probably some notion of "can see" here if we want isolation. */
409 	rv = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
410 	    KAUTH_REQ_SYSTEM_MOUNT_GET, mnt, data, NULL);
411 	if (rv != 0)
412 		return rv;
413 
414 	/* Arguments can only be fetched through the root of a mount. */
415 	if (!(vp->v_vflag & VV_ROOT))
416 		return EINVAL;
417 
418 	/* Any failure to busy the mount is reported as EPERM. */
419 	if (vfs_busy(mnt, RW_WRITER) != 0)
420 		return EPERM;
421 
422 	/* Expose MNT_GETARGS as the sole operation flag for VFS_MOUNT(). */
423 	mnt->mnt_flag = (mnt->mnt_flag & ~MNT_OP_FLAGS) | MNT_GETARGS;
424 	rv = VFS_MOUNT(mnt, path, data, data_len);
425 	mnt->mnt_flag &= ~MNT_OP_FLAGS;
426 
427 	vfs_unbusy(mnt, false, NULL);
428 	return rv;
429 }
430 
431 #ifdef COMPAT_40
/*
 * Older mount system call, without a data length argument.  The request
 * is forwarded to do_sys_mount() with a length of 0.
 */
432 /* ARGSUSED */
433 int
434 compat_40_sys_mount(struct lwp *l, const struct compat_40_sys_mount_args *uap, register_t *retval)
435 {
436 	/* {
437 		syscallarg(const char *) type;
438 		syscallarg(const char *) path;
439 		syscallarg(int) flags;
440 		syscallarg(void *) data;
441 	} */
442 	register_t ignored;	/* absorbs what do_sys_mount() would return */
443 
444 	return do_sys_mount(l, NULL, SCARG(uap, type), SCARG(uap, path),
445 	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE, 0, &ignored);
446 }
447 #endif
448 
/*
 * mount system call entry point: unpack the arguments and hand them to
 * do_sys_mount().  The argument block is in user space; the file system
 * operations are left for do_sys_mount() to work out.
 */
449 int
450 sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval)
451 {
452 	/* {
453 		syscallarg(const char *) type;
454 		syscallarg(const char *) path;
455 		syscallarg(int) flags;
456 		syscallarg(void *) data;
457 		syscallarg(size_t) data_len;
458 	} */
459 	const size_t len = SCARG(uap, data_len);
460 
461 	return do_sys_mount(l, NULL, SCARG(uap, type), SCARG(uap, path),
462 	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE, len, retval);
463 }
464 
/*
 * Common implementation of the mount system calls.
 *
 *	l		calling LWP.
 *	vfsops		file system operations to use, or NULL to derive them:
 *			from the existing mount for MNT_GETARGS/MNT_UPDATE,
 *			otherwise from the user-space type name `type'.
 *	type		user-space pointer to the file system type name
 *			(only consulted when vfsops is NULL).
 *	path		user-space path of the mount point.
 *	flags		MNT_* flags; select between fetching arguments,
 *			updating a mount and creating a new one.
 *	data		file system specific arguments, in the address
 *			space given by data_seg.
 *	data_len	length of data; for user-space data, 0 means "use
 *			the file system's default length".
 *	retval		receives the argument length for MNT_GETARGS.
 *
 * Returns 0 or an errno value.  A user-space argument block longer than
 * VFS_MAX_MOUNT_DATA is rejected with EINVAL before any kernel memory is
 * allocated for it, whether the length came from the caller or from the
 * file system's default.
 */
465 int
466 do_sys_mount(struct lwp *l, struct vfsops *vfsops, const char *type,
467     const char *path, int flags, void *data, enum uio_seg data_seg,
468     size_t data_len, register_t *retval)
469 {
470 	struct vnode *vp;
471 	struct nameidata nd;
472 	void *data_buf = data;
473 	u_int recurse;
474 	int error;
475 
476 	/*
477 	 * Get vnode to be covered
478 	 */
479 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE, path);
480 	if ((error = namei(&nd)) != 0)
481 		return (error);
482 	vp = nd.ni_vp;
483 
484 	/*
485 	 * A lookup in VFS_MOUNT might result in an attempt to
486 	 * lock this vnode again, so make the lock recursive.
487 	 */
488 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
489 	recurse = vn_setrecurse(vp);
490 
491 	if (vfsops == NULL) {
492 		if (flags & (MNT_GETARGS | MNT_UPDATE))
493 			vfsops = vp->v_mount->mnt_op;
494 		else {
495 			/* 'type' is userspace */
496 			error = mount_get_vfsops(type, &vfsops);
497 			if (error != 0)
498 				goto done;
499 		}
500 	}
501 
502 	if (data != NULL && data_seg == UIO_USERSPACE) {
503 		if (data_len == 0) {
504 			/* No length supplied, use default for filesystem */
505 			data_len = vfsops->vfs_min_mount_data;
506 #ifdef COMPAT_30
507 			/* Hopefully a longer buffer won't make copyin() fail */
508 			if (flags & MNT_UPDATE
509 			    && data_len < sizeof (struct mnt_export_args30))
510 				data_len = sizeof (struct mnt_export_args30);
511 #endif
512 		}
		/*
		 * The length sizes a kernel allocation, so it is bounded
		 * no matter where it came from.
		 */
513 		if (data_len > VFS_MAX_MOUNT_DATA) {
514 			/* bogus user length, or maybe a force loaded old LKM */
515 			error = EINVAL;
516 			goto done;
517 		}
518 		data_buf = malloc(data_len, M_TEMP, M_WAITOK);
519 
520 		/* NFS needs the buffer even for mnt_getargs .... */
521 		error = copyin(data, data_buf, data_len);
522 		if (error != 0)
523 			goto done;
524 	}
525 
526 	if (flags & MNT_GETARGS) {
527 		if (data_len == 0) {
528 			error = EINVAL;
529 			goto done;
530 		}
		/*
		 * data_len is passed by reference, so the length copied
		 * out and returned is whatever mount_getargs() left in it.
		 */
531 		error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
532 		if (error != 0)
533 			goto done;
534 		if (data_seg == UIO_USERSPACE)
535 			error = copyout(data_buf, data, data_len);
536 		*retval = data_len;
537 	} else if (flags & MNT_UPDATE) {
538 		error = mount_update(l, vp, path, flags, data_buf, &data_len);
539 	} else {
540 		/* Locking is handled internally in mount_domount(). */
541 		error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
542 		    &data_len, recurse);
543 	}
544 
545  done:
	/* mount_domount() sets vp to NULL once the mount owns the vnode. */
546 	if (vp != NULL) {
547 		vn_restorerecurse(vp, recurse);
548 		vput(vp);
549 	}
	/* Only free the buffer if one was allocated for user-space data. */
550 	if (data_buf != data)
551 		free(data_buf, M_TEMP);
552 	return (error);
553 }
554 
555 /*
556  * Scan all active processes to see if any of them have a current
557  * or root directory onto which the new filesystem has just been
558  * mounted. If so, replace them with the new mount point.
 *
 *	olddp	the directory vnode that has just been covered; its
 *		v_mountedhere must already point at the new mount.
 *
 * The global rootvnode is switched over the same way.  Panics if the
 * root vnode of the new mount cannot be obtained.
559  */
560 void
561 checkdirs(struct vnode *olddp)
562 {
563 	struct cwdinfo *cwdi;
564 	struct vnode *newdp;
565 	struct proc *p;
566 
	/*
	 * With a single reference nothing else can be using olddp as a
	 * current or root directory.
	 */
567 	if (olddp->v_usecount == 1)
568 		return;
569 	if (VFS_ROOT(olddp->v_mountedhere, &newdp))
570 		panic("mount: lost mount");
571 	mutex_enter(proc_lock);
572 	/* XXXAD Should not be acquiring these locks with proc_lock held!! */
573 	PROCLIST_FOREACH(p, &allproc) {
		/* Skip list markers and processes without cwd information. */
574 		if ((p->p_flag & PK_MARKER) != 0)
575 			continue;
576 		cwdi = p->p_cwdi;
577 		if (!cwdi)
578 			continue;
579 		rw_enter(&cwdi->cwdi_lock, RW_WRITER);
		/* Trade the reference on olddp for a new one on newdp. */
580 		if (cwdi->cwdi_cdir == olddp) {
581 			vrele(cwdi->cwdi_cdir);
582 			VREF(newdp);
583 			cwdi->cwdi_cdir = newdp;
584 		}
585 		if (cwdi->cwdi_rdir == olddp) {
586 			vrele(cwdi->cwdi_rdir);
587 			VREF(newdp);
588 			cwdi->cwdi_rdir = newdp;
589 		}
590 		rw_exit(&cwdi->cwdi_lock);
591 	}
592 	mutex_exit(proc_lock);
593 	if (rootvnode == olddp) {
594 		vrele(rootvnode);
595 		VREF(newdp);
596 		rootvnode = newdp;
597 	}
	/*
	 * Let go of the vnode obtained from VFS_ROOT() above.
	 * NOTE(review): vput() implies VFS_ROOT() handed it back locked as
	 * well as referenced -- confirm.
	 */
598 	vput(newdp);
599 }
600 
601 /*
602  * Unmount a file system.
603  *
604  * Note: unmount takes a path to the vnode mounted on as argument,
605  * not special file (as before).
 *
 * Returns EINVAL for the root file system and for a path that is not
 * the root of a file system; otherwise the result of the kauth check,
 * of vfs_busy(), or of dounmount().
606  */
607 /* ARGSUSED */
608 int
609 sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval)
610 {
611 	/* {
612 		syscallarg(const char *) path;
613 		syscallarg(int) flags;
614 	} */
615 	struct vnode *vp;
616 	struct mount *mp;
617 	int error;
618 	struct nameidata nd;
619 
620 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
621 	    SCARG(uap, path));
622 	if ((error = namei(&nd)) != 0)
623 		return (error);
624 	vp = nd.ni_vp;
625 	mp = vp->v_mount;
	/* Only the reference is kept from here on, not the vnode lock. */
626 	VOP_UNLOCK(vp, 0);
627 
628 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
629 	    KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
630 	if (error) {
631 		vrele(vp);
632 		return (error);
633 	}
634 
635 	/*
636 	 * Don't allow unmounting the root file system.
637 	 */
638 	if (mp->mnt_flag & MNT_ROOTFS) {
639 		vrele(vp);
640 		return (EINVAL);
641 	}
642 
643 	/*
644 	 * Must be the root of the filesystem
645 	 */
646 	if ((vp->v_vflag & VV_ROOT) == 0) {
647 		vrele(vp);
648 		return (EINVAL);
649 	}
650 
651 	/*
652 	 * XXX Freeze syncer.  Must do this before locking the
653 	 * mount point.  See dounmount() for details.
	 *
	 * The vnode reference is dropped as soon as the attempt to
	 * busy the mount has been made.  If that attempt succeeded,
	 * syncer_mutex is still held when dounmount() is entered, and
	 * dounmount() is the one that releases it.
654 	 */
655 	mutex_enter(&syncer_mutex);
656 	error = vfs_busy(mp, RW_WRITER);
657 	vrele(vp);
658 	if (error != 0) {
659 		mutex_exit(&syncer_mutex);
660 		return (error);
661 	}
662 
663 	return (dounmount(mp, SCARG(uap, flags), l));
664 }
665 
666 /*
667  * Lock mount and keep additional reference across unmount.
 *
 * The caller must hold mp->mnt_lock as writer and no unmount may already
 * be recorded for the mount; both are asserted.  The calling LWP is
 * recorded in mnt_unmounter and the mount is then unbusied.
 * NOTE(review): going by the header above, the `true' argument makes
 * vfs_unbusy() leave a reference with the caller, to be dropped again by
 * the vfs_destroy() in dounmount_unlock() -- confirm against vfs_unbusy().
668  */
669 static void
670 dounmount_lock(struct mount *mp)
671 {
672 
673 	KASSERT(rw_write_held(&mp->mnt_lock));
674 	KASSERT(mp->mnt_unmounter == NULL);
675 
676 	mp->mnt_unmounter = curlwp;
677 	vfs_unbusy(mp, true, NULL);
678 }
679 
680 /*
681  * Unlock mount and drop additional reference.
 *
 * Must be called by the LWP recorded in mnt_unmounter (asserted).  The
 * record is cleared under mount_lock and mount_cv is broadcast so that
 * any waiters on it are woken; then one reference is dropped with
 * vfs_destroy().
682  */
683 static void
684 dounmount_unlock(struct mount *mp)
685 {
686 
687 	KASSERT(mp->mnt_unmounter == curlwp);
688 
689 	mutex_enter(&mount_lock);
690 	mp->mnt_unmounter = NULL;
691 	cv_broadcast(&mount_cv);
692 	mutex_exit(&mount_lock);
693 	vfs_destroy(mp, false);
694 }
695 
696 /*
697  * Do the actual file system unmount. File system is assumed to have been
698  * marked busy by the caller.
 *
 *	mp	mount to take down; mnt_lock must be write-held (asserted).
 *	flags	MNT_* flags; MNT_FORCE makes the unmount go ahead even if
 *		syncing the file system failed.
 *	l	calling LWP; its credentials are used for VFS_SYNC().
 *
 * The caller must also hold syncer_mutex; it is released in here on the
 * paths past the veriexec check.
 * NOTE(review): when veriexec_unmountchk() fails the function returns
 * with syncer_mutex and the busy state untouched -- confirm that callers
 * undo both.
 *
 * Returns 0 on success.  On failure the errno value is returned and the
 * mount is put back into service: async flag and syncer vnode restored,
 * IMNT_UNMOUNT cleared.
699  */
700 int
701 dounmount(struct mount *mp, int flags, struct lwp *l)
702 {
703 	struct vnode *coveredvp;
704 	int error;
705 	int async;
706 	int used_syncer;
707 
708 	KASSERT(rw_write_held(&mp->mnt_lock));
709 
710 #if NVERIEXEC > 0
711 	error = veriexec_unmountchk(mp);
712 	if (error)
713 		return (error);
714 #endif /* NVERIEXEC > 0 */
715 
716 	dounmount_lock(mp);
717 	used_syncer = (mp->mnt_syncer != NULL);
718 
719 	/*
720 	 * XXX Syncer must be frozen when we get here.  This should really
721 	 * be done on a per-mountpoint basis, but especially the softdep
722 	 * code possibly called from the syncer doesn't exactly work on a
723 	 * per-mountpoint basis, so the softdep code would become a maze
724 	 * of vfs_busy() calls.
725 	 *
726 	 * The caller of dounmount() must acquire syncer_mutex because
727 	 * the syncer itself acquires locks in syncer_mutex -> vfs_busy
728 	 * order, and we must preserve that order to avoid deadlock.
729 	 *
730 	 * So, if the file system did not use the syncer, now is
731 	 * the time to release the syncer_mutex.
732 	 */
733 	if (used_syncer == 0)
734 		mutex_exit(&syncer_mutex);
735 
	/*
	 * Mark the unmount as in progress and turn off async mode for
	 * its duration; the old async bit is kept for the failure path.
	 */
736 	mp->mnt_iflag |= IMNT_UNMOUNT;
737 	async = mp->mnt_flag & MNT_ASYNC;
738 	mp->mnt_flag &= ~MNT_ASYNC;
739 	cache_purgevfs(mp);	/* remove cache entries for this file sys */
740 	if (mp->mnt_syncer != NULL)
741 		vfs_deallocate_syncvnode(mp);
742 	error = 0;
	/* A writable file system is synced before it is taken down. */
743 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
744 #if NFSS > 0
745 		error = fss_umount_hook(mp, (flags & MNT_FORCE));
746 #endif
747 		if (error == 0)
748 			error = VFS_SYNC(mp, MNT_WAIT, l->l_cred);
749 	}
750 	vfs_scrubvnlist(mp);
751 	if (error == 0 || (flags & MNT_FORCE))
752 		error = VFS_UNMOUNT(mp, flags);
753 	if (error) {
		/* Unmount refused: undo the preparations made above. */
754 		if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
755 			(void) vfs_allocate_syncvnode(mp);
756 		mp->mnt_iflag &= ~IMNT_UNMOUNT;
757 		mp->mnt_flag |= async;
758 		if (used_syncer)
759 			mutex_exit(&syncer_mutex);
760 		dounmount_unlock(mp);
761 		return (error);
762 	}
763 	vfs_scrubvnlist(mp);
	/* Uncover the mount point, if there is one. */
764 	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP)
765 		coveredvp->v_mountedhere = NULL;
766 	if (TAILQ_FIRST(&mp->mnt_vnodelist) != NULL)
767 		panic("unmount: dangling vnode");
768 	mp->mnt_iflag |= IMNT_GONE;
769 	if (used_syncer)
770 		mutex_exit(&syncer_mutex);
771 	vfs_hooks_unmount(mp);
	/*
	 * Two references go away here: the one kept by dounmount_lock()
	 * and one more through the explicit vfs_destroy().
	 */
772 	dounmount_unlock(mp);
773 	vfs_destroy(mp, false);
	/* Release the reference the mount held on the covered vnode. */
774 	if (coveredvp != NULLVP)
775 		vrele(coveredvp);
776 	return (0);
777 }
778 
779 /*
780  * Sync each mounted filesystem.
781  */
782 #ifdef DEBUG
/* When non-zero, sys_sync() calls vfs_bufstats() after its pass. */
783 int syncprt = 0;
784 struct ctldebug debug0 = { "syncprt", &syncprt };
785 #endif
786 
/*
 * sync system call: start a non-waiting VFS_SYNC() on every mounted file
 * system that is not read-only and can be busied.
 *
 *	l	calling LWP; NULL is accepted and replaced by lwp0.
 *	v	unused.
 *	retval	unused.
 *
 * Always returns 0; the results of the individual VFS_SYNC() calls are
 * not looked at.
 */
787 /* ARGSUSED */
788 int
789 sys_sync(struct lwp *l, const void *v, register_t *retval)
790 {
791 	struct mount *mp, *nmp;
792 	int asyncflag;
793 
794 	if (l == NULL)
795 		l = &lwp0;
796 
797 	mutex_enter(&mountlist_lock);
798 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
799 	     mp = nmp) {
		/*
		 * NOTE(review): judging by the way it is used here,
		 * vfs_trybusy() stores the next list entry in nmp when it
		 * fails and returns with mountlist_lock dropped when it
		 * succeeds (the lock is retaken before vfs_unbusy()
		 * below) -- confirm against its definition.
		 */
800 		if (vfs_trybusy(mp, RW_READER, &nmp)) {
801 			continue;
802 		}
803 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			/* MNT_ASYNC is switched off just for this sync. */
804 			asyncflag = mp->mnt_flag & MNT_ASYNC;
805 			mp->mnt_flag &= ~MNT_ASYNC;
806 			VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
807 			if (asyncflag)
808 				 mp->mnt_flag |= MNT_ASYNC;
809 		}
810 		mutex_enter(&mountlist_lock);
811 		vfs_unbusy(mp, false, &nmp);
812 
813 	}
814 	mutex_exit(&mountlist_lock);
815 #ifdef DEBUG
816 	if (syncprt)
817 		vfs_bufstats();
818 #endif /* DEBUG */
819 	return (0);
820 }
821 
822 /*
823  * Change filesystem quotas.
 *
 * The path only serves to pick a file system: it is looked up, and the
 * command, id and argument are passed unchanged to VFS_QUOTACTL() of the
 * mount holding the resulting vnode.  Returns the lookup error or the
 * result of VFS_QUOTACTL().
824  */
825 /* ARGSUSED */
826 int
827 sys_quotactl(struct lwp *l, const struct sys_quotactl_args *uap, register_t *retval)
828 {
829 	/* {
830 		syscallarg(const char *) path;
831 		syscallarg(int) cmd;
832 		syscallarg(int) uid;
833 		syscallarg(void *) arg;
834 	} */
835 	struct nameidata nd;
836 	struct vnode *vp;
837 	int rv;
838 
839 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
840 	    SCARG(uap, path));
841 	if ((rv = namei(&nd)) != 0)
842 		return rv;
843 	vp = nd.ni_vp;
844 	rv = VFS_QUOTACTL(vp->v_mount, SCARG(uap, cmd), SCARG(uap, uid),
845 	    SCARG(uap, arg));
846 	vrele(vp);
847 	return rv;
848 }
849 
/*
 * Fill in *sp with the statistics of mount mp as seen by LWP l.
 *
 *	mp	mount to report on.
 *	sp	out: statistics buffer.
 *	l	LWP whose process's root directory decides how the mount
 *		point name is presented.
 *	flags	MNT_WAIT or 0 queries the file system; any other value
 *		returns the copy cached in mp->mnt_stat.
 *	root	what to do in a chroot when the mount point does not lie
 *		under the root directory: non-zero reports it as "/",
 *		zero fails with EPERM.
 *
 * Returns 0 or an errno value (from VFS_STATVFS() or getcwd_common(),
 * or EPERM as described above).
 */
850 int
851 dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
852     int root)
853 {
854 	struct cwdinfo *cwdi = l->l_proc->p_cwdi;
855 	int error = 0;
856 
857 	/*
858 	 * MNT_NOWAIT, MNT_LAZY and in fact any value other than
859 	 * MNT_WAIT or 0 are answered from the copy cached in
860 	 * mp->mnt_stat, without asking the file system.
861 	 */
862 	if (flags == MNT_NOWAIT	|| flags == MNT_LAZY ||
863 	    (flags != MNT_WAIT && flags != 0)) {
864 		memcpy(sp, &mp->mnt_stat, sizeof(*sp));
865 		goto done;
866 	}
867 
868 	/* Get the filesystem stats now */
869 	memset(sp, 0, sizeof(*sp));
870 	if ((error = VFS_STATVFS(mp, sp)) != 0) {
871 		return error;
872 	}
873 
	/*
	 * Refresh the cached copy, but only on behalf of a process that
	 * has no root directory of its own set.
	 */
874 	if (cwdi->cwdi_rdir == NULL)
875 		(void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
876 done:
877 	if (cwdi->cwdi_rdir != NULL) {
878 		size_t len;
879 		char *bp;
880 		char *path = PNBUF_GET();
881 
		/*
		 * Build the path of the process's root directory at the
		 * tail end of the buffer; bp ends up pointing at it.
		 */
882 		bp = path + MAXPATHLEN;
883 		*--bp = '\0';
884 		rw_enter(&cwdi->cwdi_lock, RW_READER);
885 		error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
886 		    MAXPATHLEN / 2, 0, l);
887 		rw_exit(&cwdi->cwdi_lock);
888 		if (error) {
889 			PNBUF_PUT(path);
890 			return error;
891 		}
892 		len = strlen(bp);
893 		/*
894 		 * for mount points that are below our root, we can see
895 		 * them, so we fix up the pathname and return them. The
896 		 * rest we cannot see, so we don't allow viewing the
897 		 * data.
		 *
		 * NOTE(review): the test is a plain prefix comparison of
		 * len bytes; it does not check that the prefix ends on a
		 * path component boundary -- confirm that is acceptable.
898 		 */
899 		if (strncmp(bp, sp->f_mntonname, len) == 0) {
900 			strlcpy(sp->f_mntonname, &sp->f_mntonname[len],
901 			    sizeof(sp->f_mntonname));
902 			if (sp->f_mntonname[0] == '\0')
903 				(void)strlcpy(sp->f_mntonname, "/",
904 				    sizeof(sp->f_mntonname));
905 		} else {
906 			if (root)
907 				(void)strlcpy(sp->f_mntonname, "/",
908 				    sizeof(sp->f_mntonname));
909 			else
910 				error = EPERM;
911 		}
912 		PNBUF_PUT(path);
913 	}
	/* Report only the flags selected by MNT_VISFLAGMASK. */
914 	sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
915 	return error;
916 }
917 
918 /*
919  * Get filesystem statistics by path.
 *
 * Looks up the user-space path and fills *sb, a kernel buffer supplied
 * by the caller, from the mount holding the resulting vnode.  Returns
 * the lookup error or the result of dostatvfs().
920  */
921 int
922 do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
923 {
924 	struct nameidata nd;
925 	struct vnode *vp;
926 	int rv;
927 
928 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE, path);
929 	if ((rv = namei(&nd)) == 0) {
930 		vp = nd.ni_vp;
931 		rv = dostatvfs(vp->v_mount, sb, l, flags, 1);
932 		vrele(vp);
933 	}
934 	return rv;
935 }
936 
/*
 * statvfs1 system call: gather the statistics for a path into a
 * temporary kernel buffer and, if that worked, copy them out to the
 * caller's buffer.
 */
937 /* ARGSUSED */
938 int
939 sys_statvfs1(struct lwp *l, const struct sys_statvfs1_args *uap, register_t *retval)
940 {
941 	/* {
942 		syscallarg(const char *) path;
943 		syscallarg(struct statvfs *) buf;
944 		syscallarg(int) flags;
945 	} */
946 	struct statvfs *stbuf = STATVFSBUF_GET();
947 	int rv;
948 
949 	rv = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), stbuf);
950 	if (!rv)
951 		rv = copyout(stbuf, SCARG(uap, buf), sizeof(*stbuf));
952 
953 	STATVFSBUF_PUT(stbuf);
954 	return (rv);
955 }
956 
957 /*
958  * Get filesystem statistics by fd.
 *
 * Fills *sb, a kernel buffer supplied by the caller, from the mount
 * holding the vnode behind descriptor fd.  Returns the fd_getvnode()
 * error or the result of dostatvfs().
959  */
960 int
961 do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
962 {
963 	struct vnode *vp;
964 	file_t *fp;
965 	int rv;
966 
967 	/* A successful fd_getvnode() leaves the descriptor in use for us. */
968 	if ((rv = fd_getvnode(fd, &fp)) != 0)
969 		return rv;
970 	vp = fp->f_data;
971 	rv = dostatvfs(vp->v_mount, sb, curlwp, flags, 1);
972 	fd_putfile(fd);
973 	return rv;
974 }
975 
/*
 * fstatvfs1 system call: gather the statistics for a descriptor into a
 * temporary kernel buffer and, if that worked, copy them out to the
 * caller's buffer.
 */
976 /* ARGSUSED */
977 int
978 sys_fstatvfs1(struct lwp *l, const struct sys_fstatvfs1_args *uap, register_t *retval)
979 {
980 	/* {
981 		syscallarg(int) fd;
982 		syscallarg(struct statvfs *) buf;
983 		syscallarg(int) flags;
984 	} */
985 	struct statvfs *kbuf = STATVFSBUF_GET();
986 	int rv = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), kbuf);
987 
988 	/* Nothing is copied out unless the statistics were obtained. */
989 	if (rv == 0)
990 		rv = copyout(kbuf, SCARG(uap, buf), sizeof(*kbuf));
991 
992 	STATVFSBUF_PUT(kbuf);
993 	return rv;
994 }
995 
996 
997 /*
998  * Get statistics on all filesystems.
 *
 *	l		calling LWP.
 *	sfsp		destination for the entries, or NULL to only count
 *			the mounted file systems.
 *	bufsize		size in bytes of the area at sfsp.
 *	flags		passed to dostatvfs() for every mount.
 *	copyfn		function used to store one entry at sfsp.
 *	entry_sz	size in bytes of one entry as stored by copyfn.
 *	retval		out: number of entries; never more than fit in
 *			bufsize when sfsp is given.
 *
 * Mounts that cannot be busied are skipped.  A mount for which
 * dostatvfs() fails is skipped as well and does not make the call fail.
 * If the process has a root directory set and no entry named "/" was
 * produced, one is faked from the file system holding that directory;
 * it is only stored if there is still room for it in the buffer.
 *
 * Returns 0, or the error from copyfn or from the dostatvfs() call for
 * the faked root entry.
999  */
1000 int
1001 do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
1002     int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
1003     register_t *retval)
1004 {
1005 	int root = 0;
1006 	struct proc *p = l->l_proc;
1007 	struct mount *mp, *nmp;
1008 	struct statvfs *sb;
1009 	size_t count, maxcount;
1010 	int error = 0;
1011 
1012 	sb = STATVFSBUF_GET();
1013 	maxcount = bufsize / entry_sz;
1014 	mutex_enter(&mountlist_lock);
1015 	count = 0;
1016 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
1017 	     mp = nmp) {
1018 		if (vfs_trybusy(mp, RW_READER, &nmp)) {
1019 			continue;
1020 		}
1021 		if (sfsp && count < maxcount) {
1022 			if (dostatvfs(mp, sb, l, flags, 0) != 0) {
1023 				/* Skip it, and keep `error' clear for later. */
1024 				mutex_enter(&mountlist_lock);
1025 				vfs_unbusy(mp, false, &nmp);
1026 				continue;
1027 			}
1028 			error = copyfn(sb, sfsp, entry_sz);
1029 			if (error) {
1030 				vfs_unbusy(mp, false, NULL);
1031 				goto out;
1032 			}
1033 			sfsp = (char *)sfsp + entry_sz;
1034 			root |= strcmp(sb->f_mntonname, "/") == 0;
1035 		}
1036 		count++;
1037 		mutex_enter(&mountlist_lock);
1038 		vfs_unbusy(mp, false, &nmp);
1039 	}
1040 
1041 	mutex_exit(&mountlist_lock);
1042 	if (root == 0 && p->p_cwdi->cwdi_rdir) {
1043 		/*
1044 		 * fake a root entry
1045 		 */
1046 		error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
1047 		    sb, l, flags, 1);
1048 		if (error != 0)
1049 			goto out;
		/* Never store an entry beyond the end of the buffer. */
1050 		if (sfsp && count < maxcount)
1051 			error = copyfn(sb, sfsp, entry_sz);
1052 		count++;
1053 	}
1054 	if (sfsp && count > maxcount)
1055 		*retval = maxcount;
1056 	else
1057 		*retval = count;
1058 out:
1059 	STATVFSBUF_PUT(sb);
1060 	return error;
1061 }
1062 
/*
 * getvfsstat system call: run do_sys_getvfsstat() with copyout() as the
 * store function and struct statvfs as the entry layout.
 */
1063 int
1064 sys_getvfsstat(struct lwp *l, const struct sys_getvfsstat_args *uap, register_t *retval)
1065 {
1066 	/* {
1067 		syscallarg(struct statvfs *) buf;
1068 		syscallarg(size_t) bufsize;
1069 		syscallarg(int) flags;
1070 	} */
1071 	const size_t entsz = sizeof(struct statvfs);
1072 	return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
1073 	    SCARG(uap, flags), copyout, entsz, retval);
1074 }
1075 
1076 /*
1077  * Change current working directory to a given file descriptor.
 *
 * The descriptor must refer to a directory that the caller may search.
 * If file systems are mounted on that directory, the root of the
 * topmost one becomes the new working directory.  In a process with a
 * root directory set, a target for which vn_isunder() fails is refused
 * with EPERM.
 *
 * Returns 0 or an errno value (ENOTDIR, EPERM, or the error from
 * fd_getvnode(), VOP_ACCESS() or VFS_ROOT()).
1078  */
1079 /* ARGSUSED */
1080 int
1081 sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval)
1082 {
1083 	/* {
1084 		syscallarg(int) fd;
1085 	} */
1086 	struct proc *p = l->l_proc;
1087 	struct cwdinfo *cwdi;
1088 	struct vnode *vp, *tdp;
1089 	struct mount *mp;
1090 	file_t *fp;
1091 	int error, fd;
1092 
1093 	/* fd_getvnode() will use the descriptor for us */
1094 	fd = SCARG(uap, fd);
1095 	if ((error = fd_getvnode(fd, &fp)) != 0)
1096 		return (error);
1097 	vp = fp->f_data;
1098 
	/* Take our own reference; it ends up in cwdi_cdir on success. */
1099 	VREF(vp);
1100 	vn_lock(vp,  LK_EXCLUSIVE | LK_RETRY);
1101 	if (vp->v_type != VDIR)
1102 		error = ENOTDIR;
1103 	else
1104 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1105 	if (error) {
1106 		vput(vp);
1107 		goto out;
1108 	}
	/*
	 * Step through any file systems mounted on this directory.  A
	 * mount that cannot be busied is simply tried again.  The code
	 * after the loop unlocks and later releases the vnode it ends up
	 * with, so VFS_ROOT() is expected to return the root locked and
	 * referenced.
	 */
1109 	while ((mp = vp->v_mountedhere) != NULL) {
1110 		if (vfs_busy(mp, RW_READER))
1111 			continue;
1112 		vput(vp);
1113 		error = VFS_ROOT(mp, &tdp);
1114 		vfs_unbusy(mp, false, NULL);
1115 		if (error)
1116 			goto out;
1117 		vp = tdp;
1118 	}
1119 	VOP_UNLOCK(vp, 0);
1120 
1121 	/*
1122 	 * Disallow changing to a directory not under the process's
1123 	 * current root directory (if there is one).
1124 	 */
1125 	cwdi = p->p_cwdi;
1126 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1127 	if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
1128 		vrele(vp);
1129 		error = EPERM;	/* operation not permitted */
1130 	} else {
		/* The old directory's reference is replaced by ours. */
1131 		vrele(cwdi->cwdi_cdir);
1132 		cwdi->cwdi_cdir = vp;
1133 	}
1134 	rw_exit(&cwdi->cwdi_lock);
1135 
1136  out:
1137 	fd_putfile(fd);
1138 	return (error);
1139 }
1140 
1141 /*
1142  * Change this process's notion of the root directory to a given file
1143  * descriptor.
1144  */
1145 int
1146 sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval)
1147 {
1148 	struct proc *p = l->l_proc;
1149 	struct cwdinfo *cwdi;
1150 	struct vnode	*vp;
1151 	file_t	*fp;
1152 	int		 error, fd = SCARG(uap, fd);
1153 
1154 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1155  	    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
1156 		return error;
1157 	/* fd_getvnode() will use the descriptor for us */
1158 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
1159 		return error;
1160 	vp = fp->f_data;
1161 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1162 	if (vp->v_type != VDIR)
1163 		error = ENOTDIR;
1164 	else
1165 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1166 	VOP_UNLOCK(vp, 0);
1167 	if (error)
1168 		goto out;
1169 	VREF(vp);
1170 
1171 	/*
1172 	 * Prevent escaping from chroot by putting the root under
1173 	 * the working directory.  Silently chdir to / if we aren't
1174 	 * already there.
1175 	 */
1176 	cwdi = p->p_cwdi;
1177 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1178 	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1179 		/*
1180 		 * XXX would be more failsafe to change directory to a
1181 		 * deadfs node here instead
1182 		 */
1183 		vrele(cwdi->cwdi_cdir);
1184 		VREF(vp);
1185 		cwdi->cwdi_cdir = vp;
1186 	}
1187 
1188 	if (cwdi->cwdi_rdir != NULL)
1189 		vrele(cwdi->cwdi_rdir);
1190 	cwdi->cwdi_rdir = vp;
1191 	rw_exit(&cwdi->cwdi_lock);
1192 
1193  out:
1194 	fd_putfile(fd);
1195 	return (error);
1196 }
1197 
1198 /*
1199  * Change current working directory (``.'').
1200  */
1201 /* ARGSUSED */
1202 int
1203 sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
1204 {
1205 	/* {
1206 		syscallarg(const char *) path;
1207 	} */
1208 	struct proc *p = l->l_proc;
1209 	struct cwdinfo *cwdi;
1210 	int error;
1211 	struct nameidata nd;
1212 
1213 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
1214 	    SCARG(uap, path));
1215 	if ((error = change_dir(&nd, l)) != 0)
1216 		return (error);
1217 	cwdi = p->p_cwdi;
1218 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1219 	vrele(cwdi->cwdi_cdir);
1220 	cwdi->cwdi_cdir = nd.ni_vp;
1221 	rw_exit(&cwdi->cwdi_lock);
1222 	return (0);
1223 }
1224 
1225 /*
1226  * Change notion of root (``/'') directory.
1227  */
1228 /* ARGSUSED */
1229 int
1230 sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval)
1231 {
1232 	/* {
1233 		syscallarg(const char *) path;
1234 	} */
1235 	struct proc *p = l->l_proc;
1236 	struct cwdinfo *cwdi;
1237 	struct vnode *vp;
1238 	int error;
1239 	struct nameidata nd;
1240 
1241 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1242 	    KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
1243 		return (error);
1244 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
1245 	    SCARG(uap, path));
1246 	if ((error = change_dir(&nd, l)) != 0)
1247 		return (error);
1248 
1249 	cwdi = p->p_cwdi;
1250 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1251 	if (cwdi->cwdi_rdir != NULL)
1252 		vrele(cwdi->cwdi_rdir);
1253 	vp = nd.ni_vp;
1254 	cwdi->cwdi_rdir = vp;
1255 
1256 	/*
1257 	 * Prevent escaping from chroot by putting the root under
1258 	 * the working directory.  Silently chdir to / if we aren't
1259 	 * already there.
1260 	 */
1261 	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1262 		/*
1263 		 * XXX would be more failsafe to change directory to a
1264 		 * deadfs node here instead
1265 		 */
1266 		vrele(cwdi->cwdi_cdir);
1267 		VREF(vp);
1268 		cwdi->cwdi_cdir = vp;
1269 	}
1270 	rw_exit(&cwdi->cwdi_lock);
1271 
1272 	return (0);
1273 }
1274 
1275 /*
1276  * Common routine for chroot and chdir.
1277  */
1278 static int
1279 change_dir(struct nameidata *ndp, struct lwp *l)
1280 {
1281 	struct vnode *vp;
1282 	int error;
1283 
1284 	if ((error = namei(ndp)) != 0)
1285 		return (error);
1286 	vp = ndp->ni_vp;
1287 	if (vp->v_type != VDIR)
1288 		error = ENOTDIR;
1289 	else
1290 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1291 
1292 	if (error)
1293 		vput(vp);
1294 	else
1295 		VOP_UNLOCK(vp, 0);
1296 	return (error);
1297 }
1298 
1299 /*
1300  * Check permissions, allocate an open file structure,
1301  * and call the device open routine if any.
1302  */
1303 int
1304 sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1305 {
1306 	/* {
1307 		syscallarg(const char *) path;
1308 		syscallarg(int) flags;
1309 		syscallarg(int) mode;
1310 	} */
1311 	struct proc *p = l->l_proc;
1312 	struct cwdinfo *cwdi = p->p_cwdi;
1313 	file_t *fp;
1314 	struct vnode *vp;
1315 	int flags, cmode;
1316 	int type, indx, error;
1317 	struct flock lf;
1318 	struct nameidata nd;
1319 
1320 	flags = FFLAGS(SCARG(uap, flags));
1321 	if ((flags & (FREAD | FWRITE)) == 0)
1322 		return (EINVAL);
1323 	if ((error = fd_allocfile(&fp, &indx)) != 0)
1324 		return (error);
1325 	/* We're going to read cwdi->cwdi_cmask unlocked here. */
1326 	cmode = ((SCARG(uap, mode) &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1327 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
1328 	    SCARG(uap, path));
1329 	l->l_dupfd = -indx - 1;			/* XXX check for fdopen */
1330 	if ((error = vn_open(&nd, flags, cmode)) != 0) {
1331 		fd_abort(p, fp, indx);
1332 		if ((error == EDUPFD || error == EMOVEFD) &&
1333 		    l->l_dupfd >= 0 &&			/* XXX from fdopen */
1334 		    (error =
1335 			fd_dupopen(l->l_dupfd, &indx, flags, error)) == 0) {
1336 			*retval = indx;
1337 			return (0);
1338 		}
1339 		if (error == ERESTART)
1340 			error = EINTR;
1341 		return (error);
1342 	}
1343 
1344 	l->l_dupfd = 0;
1345 	vp = nd.ni_vp;
1346 	fp->f_flag = flags & FMASK;
1347 	fp->f_type = DTYPE_VNODE;
1348 	fp->f_ops = &vnops;
1349 	fp->f_data = vp;
1350 	if (flags & (O_EXLOCK | O_SHLOCK)) {
1351 		lf.l_whence = SEEK_SET;
1352 		lf.l_start = 0;
1353 		lf.l_len = 0;
1354 		if (flags & O_EXLOCK)
1355 			lf.l_type = F_WRLCK;
1356 		else
1357 			lf.l_type = F_RDLCK;
1358 		type = F_FLOCK;
1359 		if ((flags & FNONBLOCK) == 0)
1360 			type |= F_WAIT;
1361 		VOP_UNLOCK(vp, 0);
1362 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
1363 		if (error) {
1364 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
1365 			fd_abort(p, fp, indx);
1366 			return (error);
1367 		}
1368 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1369 		atomic_or_uint(&fp->f_flag, FHASLOCK);
1370 	}
1371 	VOP_UNLOCK(vp, 0);
1372 	*retval = indx;
1373 	fd_affix(p, fp, indx);
1374 	return (0);
1375 }
1376 
1377 static void
1378 vfs__fhfree(fhandle_t *fhp)
1379 {
1380 	size_t fhsize;
1381 
1382 	if (fhp == NULL) {
1383 		return;
1384 	}
1385 	fhsize = FHANDLE_SIZE(fhp);
1386 	kmem_free(fhp, fhsize);
1387 }
1388 
1389 /*
1390  * vfs_composefh: compose a filehandle.
1391  */
1392 
1393 int
1394 vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1395 {
1396 	struct mount *mp;
1397 	struct fid *fidp;
1398 	int error;
1399 	size_t needfhsize;
1400 	size_t fidsize;
1401 
1402 	mp = vp->v_mount;
1403 	fidp = NULL;
1404 	if (*fh_size < FHANDLE_SIZE_MIN) {
1405 		fidsize = 0;
1406 	} else {
1407 		fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1408 		if (fhp != NULL) {
1409 			memset(fhp, 0, *fh_size);
1410 			fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1411 			fidp = &fhp->fh_fid;
1412 		}
1413 	}
1414 	error = VFS_VPTOFH(vp, fidp, &fidsize);
1415 	needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1416 	if (error == 0 && *fh_size < needfhsize) {
1417 		error = E2BIG;
1418 	}
1419 	*fh_size = needfhsize;
1420 	return error;
1421 }
1422 
1423 int
1424 vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1425 {
1426 	struct mount *mp;
1427 	fhandle_t *fhp;
1428 	size_t fhsize;
1429 	size_t fidsize;
1430 	int error;
1431 
1432 	*fhpp = NULL;
1433 	mp = vp->v_mount;
1434 	fidsize = 0;
1435 	error = VFS_VPTOFH(vp, NULL, &fidsize);
1436 	KASSERT(error != 0);
1437 	if (error != E2BIG) {
1438 		goto out;
1439 	}
1440 	fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1441 	fhp = kmem_zalloc(fhsize, KM_SLEEP);
1442 	if (fhp == NULL) {
1443 		error = ENOMEM;
1444 		goto out;
1445 	}
1446 	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1447 	error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1448 	if (error == 0) {
1449 		KASSERT((FHANDLE_SIZE(fhp) == fhsize &&
1450 		    FHANDLE_FILEID(fhp)->fid_len == fidsize));
1451 		*fhpp = fhp;
1452 	} else {
1453 		kmem_free(fhp, fhsize);
1454 	}
1455 out:
1456 	return error;
1457 }
1458 
1459 void
1460 vfs_composefh_free(fhandle_t *fhp)
1461 {
1462 
1463 	vfs__fhfree(fhp);
1464 }
1465 
1466 /*
1467  * vfs_fhtovp: lookup a vnode by a filehandle.
1468  */
1469 
1470 int
1471 vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
1472 {
1473 	struct mount *mp;
1474 	int error;
1475 
1476 	*vpp = NULL;
1477 	mp = vfs_getvfs(FHANDLE_FSID(fhp));
1478 	if (mp == NULL) {
1479 		error = ESTALE;
1480 		goto out;
1481 	}
1482 	if (mp->mnt_op->vfs_fhtovp == NULL) {
1483 		error = EOPNOTSUPP;
1484 		goto out;
1485 	}
1486 	error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), vpp);
1487 out:
1488 	return error;
1489 }
1490 
1491 /*
1492  * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
1493  * the needed size.
1494  */
1495 
1496 int
1497 vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
1498 {
1499 	fhandle_t *fhp;
1500 	int error;
1501 
1502 	*fhpp = NULL;
1503 	if (fhsize > FHANDLE_SIZE_MAX) {
1504 		return EINVAL;
1505 	}
1506 	if (fhsize < FHANDLE_SIZE_MIN) {
1507 		return EINVAL;
1508 	}
1509 again:
1510 	fhp = kmem_alloc(fhsize, KM_SLEEP);
1511 	if (fhp == NULL) {
1512 		return ENOMEM;
1513 	}
1514 	error = copyin(ufhp, fhp, fhsize);
1515 	if (error == 0) {
1516 		/* XXX this check shouldn't be here */
1517 		if (FHANDLE_SIZE(fhp) == fhsize) {
1518 			*fhpp = fhp;
1519 			return 0;
1520 		} else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
1521 			/*
1522 			 * a kludge for nfsv2 padded handles.
1523 			 */
1524 			size_t sz;
1525 
1526 			sz = FHANDLE_SIZE(fhp);
1527 			kmem_free(fhp, fhsize);
1528 			fhsize = sz;
1529 			goto again;
1530 		} else {
1531 			/*
1532 			 * userland told us wrong size.
1533 			 */
1534 		    	error = EINVAL;
1535 		}
1536 	}
1537 	kmem_free(fhp, fhsize);
1538 	return error;
1539 }
1540 
1541 void
1542 vfs_copyinfh_free(fhandle_t *fhp)
1543 {
1544 
1545 	vfs__fhfree(fhp);
1546 }
1547 
1548 /*
1549  * Get file handle system call
1550  */
1551 int
1552 sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval)
1553 {
1554 	/* {
1555 		syscallarg(char *) fname;
1556 		syscallarg(fhandle_t *) fhp;
1557 		syscallarg(size_t *) fh_size;
1558 	} */
1559 	struct vnode *vp;
1560 	fhandle_t *fh;
1561 	int error;
1562 	struct nameidata nd;
1563 	size_t sz;
1564 	size_t usz;
1565 
1566 	/*
1567 	 * Must be super user
1568 	 */
1569 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1570 	    0, NULL, NULL, NULL);
1571 	if (error)
1572 		return (error);
1573 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
1574 	    SCARG(uap, fname));
1575 	error = namei(&nd);
1576 	if (error)
1577 		return (error);
1578 	vp = nd.ni_vp;
1579 	error = vfs_composefh_alloc(vp, &fh);
1580 	vput(vp);
1581 	if (error != 0) {
1582 		goto out;
1583 	}
1584 	error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
1585 	if (error != 0) {
1586 		goto out;
1587 	}
1588 	sz = FHANDLE_SIZE(fh);
1589 	error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
1590 	if (error != 0) {
1591 		goto out;
1592 	}
1593 	if (usz >= sz) {
1594 		error = copyout(fh, SCARG(uap, fhp), sz);
1595 	} else {
1596 		error = E2BIG;
1597 	}
1598 out:
1599 	vfs_composefh_free(fh);
1600 	return (error);
1601 }
1602 
1603 /*
1604  * Open a file given a file handle.
1605  *
1606  * Check permissions, allocate an open file structure,
1607  * and call the device open routine if any.
1608  */
1609 
1610 int
1611 dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
1612     register_t *retval)
1613 {
1614 	file_t *fp;
1615 	struct vnode *vp = NULL;
1616 	kauth_cred_t cred = l->l_cred;
1617 	file_t *nfp;
1618 	int type, indx, error=0;
1619 	struct flock lf;
1620 	struct vattr va;
1621 	fhandle_t *fh;
1622 	int flags;
1623 	proc_t *p;
1624 
1625 	p = curproc;
1626 
1627 	/*
1628 	 * Must be super user
1629 	 */
1630 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1631 	    0, NULL, NULL, NULL)))
1632 		return (error);
1633 
1634 	flags = FFLAGS(oflags);
1635 	if ((flags & (FREAD | FWRITE)) == 0)
1636 		return (EINVAL);
1637 	if ((flags & O_CREAT))
1638 		return (EINVAL);
1639 	if ((error = fd_allocfile(&nfp, &indx)) != 0)
1640 		return (error);
1641 	fp = nfp;
1642 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1643 	if (error != 0) {
1644 		goto bad;
1645 	}
1646 	error = vfs_fhtovp(fh, &vp);
1647 	if (error != 0) {
1648 		goto bad;
1649 	}
1650 
1651 	/* Now do an effective vn_open */
1652 
1653 	if (vp->v_type == VSOCK) {
1654 		error = EOPNOTSUPP;
1655 		goto bad;
1656 	}
1657 	error = vn_openchk(vp, cred, flags);
1658 	if (error != 0)
1659 		goto bad;
1660 	if (flags & O_TRUNC) {
1661 		VOP_UNLOCK(vp, 0);			/* XXX */
1662 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
1663 		VATTR_NULL(&va);
1664 		va.va_size = 0;
1665 		error = VOP_SETATTR(vp, &va, cred);
1666 		if (error)
1667 			goto bad;
1668 	}
1669 	if ((error = VOP_OPEN(vp, flags, cred)) != 0)
1670 		goto bad;
1671 	if (flags & FWRITE) {
1672 		mutex_enter(&vp->v_interlock);
1673 		vp->v_writecount++;
1674 		mutex_exit(&vp->v_interlock);
1675 	}
1676 
1677 	/* done with modified vn_open, now finish what sys_open does. */
1678 
1679 	fp->f_flag = flags & FMASK;
1680 	fp->f_type = DTYPE_VNODE;
1681 	fp->f_ops = &vnops;
1682 	fp->f_data = vp;
1683 	if (flags & (O_EXLOCK | O_SHLOCK)) {
1684 		lf.l_whence = SEEK_SET;
1685 		lf.l_start = 0;
1686 		lf.l_len = 0;
1687 		if (flags & O_EXLOCK)
1688 			lf.l_type = F_WRLCK;
1689 		else
1690 			lf.l_type = F_RDLCK;
1691 		type = F_FLOCK;
1692 		if ((flags & FNONBLOCK) == 0)
1693 			type |= F_WAIT;
1694 		VOP_UNLOCK(vp, 0);
1695 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
1696 		if (error) {
1697 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
1698 			fd_abort(p, fp, indx);
1699 			return (error);
1700 		}
1701 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1702 		atomic_or_uint(&fp->f_flag, FHASLOCK);
1703 	}
1704 	VOP_UNLOCK(vp, 0);
1705 	*retval = indx;
1706 	fd_affix(p, fp, indx);
1707 	vfs_copyinfh_free(fh);
1708 	return (0);
1709 
1710 bad:
1711 	fd_abort(p, fp, indx);
1712 	if (vp != NULL)
1713 		vput(vp);
1714 	vfs_copyinfh_free(fh);
1715 	return (error);
1716 }
1717 
1718 int
1719 sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval)
1720 {
1721 	/* {
1722 		syscallarg(const void *) fhp;
1723 		syscallarg(size_t) fh_size;
1724 		syscallarg(int) flags;
1725 	} */
1726 
1727 	return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
1728 	    SCARG(uap, flags), retval);
1729 }
1730 
1731 int
1732 do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
1733 {
1734 	int error;
1735 	fhandle_t *fh;
1736 	struct vnode *vp;
1737 
1738 	/*
1739 	 * Must be super user
1740 	 */
1741 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1742 	    0, NULL, NULL, NULL)))
1743 		return (error);
1744 
1745 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1746 	if (error != 0)
1747 		return error;
1748 
1749 	error = vfs_fhtovp(fh, &vp);
1750 	vfs_copyinfh_free(fh);
1751 	if (error != 0)
1752 		return error;
1753 
1754 	error = vn_stat(vp, sb);
1755 	vput(vp);
1756 	return error;
1757 }
1758 
1759 
1760 /* ARGSUSED */
1761 int
1762 sys___fhstat40(struct lwp *l, const struct sys___fhstat40_args *uap, register_t *retval)
1763 {
1764 	/* {
1765 		syscallarg(const void *) fhp;
1766 		syscallarg(size_t) fh_size;
1767 		syscallarg(struct stat *) sb;
1768 	} */
1769 	struct stat sb;
1770 	int error;
1771 
1772 	error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
1773 	if (error)
1774 		return error;
1775 	return copyout(&sb, SCARG(uap, sb), sizeof(sb));
1776 }
1777 
1778 int
1779 do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb,
1780     int flags)
1781 {
1782 	fhandle_t *fh;
1783 	struct mount *mp;
1784 	struct vnode *vp;
1785 	int error;
1786 
1787 	/*
1788 	 * Must be super user
1789 	 */
1790 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1791 	    0, NULL, NULL, NULL)))
1792 		return error;
1793 
1794 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1795 	if (error != 0)
1796 		return error;
1797 
1798 	error = vfs_fhtovp(fh, &vp);
1799 	vfs_copyinfh_free(fh);
1800 	if (error != 0)
1801 		return error;
1802 
1803 	mp = vp->v_mount;
1804 	error = dostatvfs(mp, sb, l, flags, 1);
1805 	vput(vp);
1806 	return error;
1807 }
1808 
1809 /* ARGSUSED */
1810 int
1811 sys___fhstatvfs140(struct lwp *l, const struct sys___fhstatvfs140_args *uap, register_t *retval)
1812 {
1813 	/* {
1814 		syscallarg(const void *) fhp;
1815 		syscallarg(size_t) fh_size;
1816 		syscallarg(struct statvfs *) buf;
1817 		syscallarg(int)	flags;
1818 	} */
1819 	struct statvfs *sb = STATVFSBUF_GET();
1820 	int error;
1821 
1822 	error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
1823 	    SCARG(uap, flags));
1824 	if (error == 0)
1825 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1826 	STATVFSBUF_PUT(sb);
1827 	return error;
1828 }
1829 
1830 /*
1831  * Create a special file.
1832  */
1833 /* ARGSUSED */
1834 int
1835 sys_mknod(struct lwp *l, const struct sys_mknod_args *uap, register_t *retval)
1836 {
1837 	/* {
1838 		syscallarg(const char *) path;
1839 		syscallarg(int) mode;
1840 		syscallarg(int) dev;
1841 	} */
1842 	struct proc *p = l->l_proc;
1843 	struct vnode *vp;
1844 	struct vattr vattr;
1845 	int error, optype;
1846 	struct nameidata nd;
1847 	char *path;
1848 	const char *cpath;
1849 	enum uio_seg seg = UIO_USERSPACE;
1850 
1851 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
1852 	    0, NULL, NULL, NULL)) != 0)
1853 		return (error);
1854 
1855 	optype = VOP_MKNOD_DESCOFFSET;
1856 
1857 	VERIEXEC_PATH_GET(SCARG(uap, path), seg, cpath, path);
1858 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, seg, cpath);
1859 
1860 	if ((error = namei(&nd)) != 0)
1861 		goto out;
1862 	vp = nd.ni_vp;
1863 	if (vp != NULL)
1864 		error = EEXIST;
1865 	else {
1866 		VATTR_NULL(&vattr);
1867 		/* We will read cwdi->cwdi_cmask unlocked. */
1868 		vattr.va_mode =
1869 		    (SCARG(uap, mode) & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
1870 		vattr.va_rdev = SCARG(uap, dev);
1871 
1872 		switch (SCARG(uap, mode) & S_IFMT) {
1873 		case S_IFMT:	/* used by badsect to flag bad sectors */
1874 			vattr.va_type = VBAD;
1875 			break;
1876 		case S_IFCHR:
1877 			vattr.va_type = VCHR;
1878 			break;
1879 		case S_IFBLK:
1880 			vattr.va_type = VBLK;
1881 			break;
1882 		case S_IFWHT:
1883 			optype = VOP_WHITEOUT_DESCOFFSET;
1884 			break;
1885 		case S_IFREG:
1886 #if NVERIEXEC > 0
1887 			error = veriexec_openchk(l, nd.ni_vp, nd.ni_dirp,
1888 			    O_CREAT);
1889 #endif /* NVERIEXEC > 0 */
1890 			vattr.va_type = VREG;
1891 			vattr.va_rdev = VNOVAL;
1892 			optype = VOP_CREATE_DESCOFFSET;
1893 			break;
1894 		default:
1895 			error = EINVAL;
1896 			break;
1897 		}
1898 	}
1899 	if (!error) {
1900 		switch (optype) {
1901 		case VOP_WHITEOUT_DESCOFFSET:
1902 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1903 			if (error)
1904 				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1905 			vput(nd.ni_dvp);
1906 			break;
1907 
1908 		case VOP_MKNOD_DESCOFFSET:
1909 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1910 						&nd.ni_cnd, &vattr);
1911 			if (error == 0)
1912 				vput(nd.ni_vp);
1913 			break;
1914 
1915 		case VOP_CREATE_DESCOFFSET:
1916 			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
1917 						&nd.ni_cnd, &vattr);
1918 			if (error == 0)
1919 				vput(nd.ni_vp);
1920 			break;
1921 		}
1922 	} else {
1923 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1924 		if (nd.ni_dvp == vp)
1925 			vrele(nd.ni_dvp);
1926 		else
1927 			vput(nd.ni_dvp);
1928 		if (vp)
1929 			vrele(vp);
1930 	}
1931 out:
1932 	VERIEXEC_PATH_PUT(path);
1933 	return (error);
1934 }
1935 
1936 /*
1937  * Create a named pipe.
1938  */
1939 /* ARGSUSED */
1940 int
1941 sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval)
1942 {
1943 	/* {
1944 		syscallarg(const char *) path;
1945 		syscallarg(int) mode;
1946 	} */
1947 	struct proc *p = l->l_proc;
1948 	struct vattr vattr;
1949 	int error;
1950 	struct nameidata nd;
1951 
1952 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, UIO_USERSPACE,
1953 	    SCARG(uap, path));
1954 	if ((error = namei(&nd)) != 0)
1955 		return (error);
1956 	if (nd.ni_vp != NULL) {
1957 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1958 		if (nd.ni_dvp == nd.ni_vp)
1959 			vrele(nd.ni_dvp);
1960 		else
1961 			vput(nd.ni_dvp);
1962 		vrele(nd.ni_vp);
1963 		return (EEXIST);
1964 	}
1965 	VATTR_NULL(&vattr);
1966 	vattr.va_type = VFIFO;
1967 	/* We will read cwdi->cwdi_cmask unlocked. */
1968 	vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
1969 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1970 	if (error == 0)
1971 		vput(nd.ni_vp);
1972 	return (error);
1973 }
1974 
1975 /*
1976  * Make a hard file link.
1977  */
1978 /* ARGSUSED */
1979 int
1980 sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
1981 {
1982 	/* {
1983 		syscallarg(const char *) path;
1984 		syscallarg(const char *) link;
1985 	} */
1986 	struct vnode *vp;
1987 	struct nameidata nd;
1988 	int error;
1989 
1990 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
1991 	    SCARG(uap, path));
1992 	if ((error = namei(&nd)) != 0)
1993 		return (error);
1994 	vp = nd.ni_vp;
1995 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, UIO_USERSPACE,
1996 	    SCARG(uap, link));
1997 	if ((error = namei(&nd)) != 0)
1998 		goto out;
1999 	if (nd.ni_vp) {
2000 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2001 		if (nd.ni_dvp == nd.ni_vp)
2002 			vrele(nd.ni_dvp);
2003 		else
2004 			vput(nd.ni_dvp);
2005 		vrele(nd.ni_vp);
2006 		error = EEXIST;
2007 		goto out;
2008 	}
2009 	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
2010 out:
2011 	vrele(vp);
2012 	return (error);
2013 }
2014 
2015 /*
2016  * Make a symbolic link.
2017  */
2018 /* ARGSUSED */
2019 int
2020 sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
2021 {
2022 	/* {
2023 		syscallarg(const char *) path;
2024 		syscallarg(const char *) link;
2025 	} */
2026 	struct proc *p = l->l_proc;
2027 	struct vattr vattr;
2028 	char *path;
2029 	int error;
2030 	struct nameidata nd;
2031 
2032 	path = PNBUF_GET();
2033 	error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL);
2034 	if (error)
2035 		goto out;
2036 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, UIO_USERSPACE,
2037 	    SCARG(uap, link));
2038 	if ((error = namei(&nd)) != 0)
2039 		goto out;
2040 	if (nd.ni_vp) {
2041 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2042 		if (nd.ni_dvp == nd.ni_vp)
2043 			vrele(nd.ni_dvp);
2044 		else
2045 			vput(nd.ni_dvp);
2046 		vrele(nd.ni_vp);
2047 		error = EEXIST;
2048 		goto out;
2049 	}
2050 	VATTR_NULL(&vattr);
2051 	vattr.va_type = VLNK;
2052 	/* We will read cwdi->cwdi_cmask unlocked. */
2053 	vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
2054 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
2055 	if (error == 0)
2056 		vput(nd.ni_vp);
2057 out:
2058 	PNBUF_PUT(path);
2059 	return (error);
2060 }
2061 
2062 /*
2063  * Delete a whiteout from the filesystem.
2064  */
2065 /* ARGSUSED */
2066 int
2067 sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval)
2068 {
2069 	/* {
2070 		syscallarg(const char *) path;
2071 	} */
2072 	int error;
2073 	struct nameidata nd;
2074 
2075 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT,
2076 	    UIO_USERSPACE, SCARG(uap, path));
2077 	error = namei(&nd);
2078 	if (error)
2079 		return (error);
2080 
2081 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2082 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2083 		if (nd.ni_dvp == nd.ni_vp)
2084 			vrele(nd.ni_dvp);
2085 		else
2086 			vput(nd.ni_dvp);
2087 		if (nd.ni_vp)
2088 			vrele(nd.ni_vp);
2089 		return (EEXIST);
2090 	}
2091 	if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
2092 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2093 	vput(nd.ni_dvp);
2094 	return (error);
2095 }
2096 
2097 /*
2098  * Delete a name from the filesystem.
2099  */
2100 /* ARGSUSED */
2101 int
2102 sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval)
2103 {
2104 	/* {
2105 		syscallarg(const char *) path;
2106 	} */
2107 
2108 	return do_sys_unlink(SCARG(uap, path), UIO_USERSPACE);
2109 }
2110 
2111 int
2112 do_sys_unlink(const char *arg, enum uio_seg seg)
2113 {
2114 	struct vnode *vp;
2115 	int error;
2116 	struct nameidata nd;
2117 	kauth_cred_t cred;
2118 	char *path;
2119 	const char *cpath;
2120 
2121 	VERIEXEC_PATH_GET(arg, seg, cpath, path);
2122 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, seg, cpath);
2123 
2124 	if ((error = namei(&nd)) != 0)
2125 		goto out;
2126 	vp = nd.ni_vp;
2127 
2128 	/*
2129 	 * The root of a mounted filesystem cannot be deleted.
2130 	 */
2131 	if (vp->v_vflag & VV_ROOT) {
2132 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2133 		if (nd.ni_dvp == vp)
2134 			vrele(nd.ni_dvp);
2135 		else
2136 			vput(nd.ni_dvp);
2137 		vput(vp);
2138 		error = EBUSY;
2139 		goto out;
2140 	}
2141 
2142 #if NVERIEXEC > 0
2143 	/* Handle remove requests for veriexec entries. */
2144 	if ((error = veriexec_removechk(curlwp, nd.ni_vp, nd.ni_dirp)) != 0) {
2145 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2146 		if (nd.ni_dvp == vp)
2147 			vrele(nd.ni_dvp);
2148 		else
2149 			vput(nd.ni_dvp);
2150 		vput(vp);
2151 		goto out;
2152 	}
2153 #endif /* NVERIEXEC > 0 */
2154 
2155 	cred = kauth_cred_get();
2156 #ifdef FILEASSOC
2157 	(void)fileassoc_file_delete(vp);
2158 #endif /* FILEASSOC */
2159 	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2160 out:
2161 	VERIEXEC_PATH_PUT(path);
2162 	return (error);
2163 }
2164 
2165 /*
2166  * Reposition read/write file offset.
2167  */
2168 int
2169 sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
2170 {
2171 	/* {
2172 		syscallarg(int) fd;
2173 		syscallarg(int) pad;
2174 		syscallarg(off_t) offset;
2175 		syscallarg(int) whence;
2176 	} */
2177 	kauth_cred_t cred = l->l_cred;
2178 	file_t *fp;
2179 	struct vnode *vp;
2180 	struct vattr vattr;
2181 	off_t newoff;
2182 	int error, fd;
2183 
2184 	fd = SCARG(uap, fd);
2185 
2186 	if ((fp = fd_getfile(fd)) == NULL)
2187 		return (EBADF);
2188 
2189 	vp = fp->f_data;
2190 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2191 		error = ESPIPE;
2192 		goto out;
2193 	}
2194 
2195 	switch (SCARG(uap, whence)) {
2196 	case SEEK_CUR:
2197 		newoff = fp->f_offset + SCARG(uap, offset);
2198 		break;
2199 	case SEEK_END:
2200 		error = VOP_GETATTR(vp, &vattr, cred);
2201 		if (error) {
2202 			goto out;
2203 		}
2204 		newoff = SCARG(uap, offset) + vattr.va_size;
2205 		break;
2206 	case SEEK_SET:
2207 		newoff = SCARG(uap, offset);
2208 		break;
2209 	default:
2210 		error = EINVAL;
2211 		goto out;
2212 	}
2213 	if ((error = VOP_SEEK(vp, fp->f_offset, newoff, cred)) == 0) {
2214 		*(off_t *)retval = fp->f_offset = newoff;
2215 	}
2216  out:
2217  	fd_putfile(fd);
2218 	return (error);
2219 }
2220 
2221 /*
2222  * Positional read system call.
2223  */
2224 int
2225 sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
2226 {
2227 	/* {
2228 		syscallarg(int) fd;
2229 		syscallarg(void *) buf;
2230 		syscallarg(size_t) nbyte;
2231 		syscallarg(off_t) offset;
2232 	} */
2233 	file_t *fp;
2234 	struct vnode *vp;
2235 	off_t offset;
2236 	int error, fd = SCARG(uap, fd);
2237 
2238 	if ((fp = fd_getfile(fd)) == NULL)
2239 		return (EBADF);
2240 
2241 	if ((fp->f_flag & FREAD) == 0) {
2242 		fd_putfile(fd);
2243 		return (EBADF);
2244 	}
2245 
2246 	vp = fp->f_data;
2247 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2248 		error = ESPIPE;
2249 		goto out;
2250 	}
2251 
2252 	offset = SCARG(uap, offset);
2253 
2254 	/*
2255 	 * XXX This works because no file systems actually
2256 	 * XXX take any action on the seek operation.
2257 	 */
2258 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2259 		goto out;
2260 
2261 	/* dofileread() will unuse the descriptor for us */
2262 	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2263 	    &offset, 0, retval));
2264 
2265  out:
2266 	fd_putfile(fd);
2267 	return (error);
2268 }
2269 
2270 /*
2271  * Positional scatter read system call.
2272  */
2273 int
2274 sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval)
2275 {
2276 	/* {
2277 		syscallarg(int) fd;
2278 		syscallarg(const struct iovec *) iovp;
2279 		syscallarg(int) iovcnt;
2280 		syscallarg(off_t) offset;
2281 	} */
2282 	off_t offset = SCARG(uap, offset);
2283 
2284 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
2285 	    SCARG(uap, iovcnt), &offset, 0, retval);
2286 }
2287 
2288 /*
2289  * Positional write system call.
2290  */
2291 int
2292 sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval)
2293 {
2294 	/* {
2295 		syscallarg(int) fd;
2296 		syscallarg(const void *) buf;
2297 		syscallarg(size_t) nbyte;
2298 		syscallarg(off_t) offset;
2299 	} */
2300 	file_t *fp;
2301 	struct vnode *vp;
2302 	off_t offset;
2303 	int error, fd = SCARG(uap, fd);
2304 
2305 	if ((fp = fd_getfile(fd)) == NULL)
2306 		return (EBADF);
2307 
2308 	if ((fp->f_flag & FWRITE) == 0) {
2309 		fd_putfile(fd);
2310 		return (EBADF);
2311 	}
2312 
2313 	vp = fp->f_data;
2314 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2315 		error = ESPIPE;
2316 		goto out;
2317 	}
2318 
2319 	offset = SCARG(uap, offset);
2320 
2321 	/*
2322 	 * XXX This works because no file systems actually
2323 	 * XXX take any action on the seek operation.
2324 	 */
2325 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2326 		goto out;
2327 
2328 	/* dofilewrite() will unuse the descriptor for us */
2329 	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2330 	    &offset, 0, retval));
2331 
2332  out:
2333 	fd_putfile(fd);
2334 	return (error);
2335 }
2336 
2337 /*
2338  * Positional gather write system call.
2339  */
2340 int
2341 sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval)
2342 {
2343 	/* {
2344 		syscallarg(int) fd;
2345 		syscallarg(const struct iovec *) iovp;
2346 		syscallarg(int) iovcnt;
2347 		syscallarg(off_t) offset;
2348 	} */
2349 	off_t offset = SCARG(uap, offset);
2350 
2351 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
2352 	    SCARG(uap, iovcnt), &offset, 0, retval);
2353 }
2354 
2355 /*
2356  * Check access permissions.
2357  */
2358 int
2359 sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval)
2360 {
2361 	/* {
2362 		syscallarg(const char *) path;
2363 		syscallarg(int) flags;
2364 	} */
2365 	kauth_cred_t cred;
2366 	struct vnode *vp;
2367 	int error, flags;
2368 	struct nameidata nd;
2369 
2370 	cred = kauth_cred_dup(l->l_cred);
2371 	kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
2372 	kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
2373 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
2374 	    SCARG(uap, path));
2375 	/* Override default credentials */
2376 	nd.ni_cnd.cn_cred = cred;
2377 	if ((error = namei(&nd)) != 0)
2378 		goto out;
2379 	vp = nd.ni_vp;
2380 
2381 	/* Flags == 0 means only check for existence. */
2382 	if (SCARG(uap, flags)) {
2383 		flags = 0;
2384 		if (SCARG(uap, flags) & R_OK)
2385 			flags |= VREAD;
2386 		if (SCARG(uap, flags) & W_OK)
2387 			flags |= VWRITE;
2388 		if (SCARG(uap, flags) & X_OK)
2389 			flags |= VEXEC;
2390 
2391 		error = VOP_ACCESS(vp, flags, cred);
2392 		if (!error && (flags & VWRITE))
2393 			error = vn_writechk(vp);
2394 	}
2395 	vput(vp);
2396 out:
2397 	kauth_cred_free(cred);
2398 	return (error);
2399 }
2400 
2401 /*
2402  * Common code for all sys_stat functions, including compat versions.
2403  */
2404 int
2405 do_sys_stat(const char *path, unsigned int nd_flags, struct stat *sb)
2406 {
2407 	int error;
2408 	struct nameidata nd;
2409 
2410 	NDINIT(&nd, LOOKUP, nd_flags | LOCKLEAF | TRYEMULROOT,
2411 	    UIO_USERSPACE, path);
2412 	error = namei(&nd);
2413 	if (error != 0)
2414 		return error;
2415 	error = vn_stat(nd.ni_vp, sb);
2416 	vput(nd.ni_vp);
2417 	return error;
2418 }
2419 
2420 /*
2421  * Get file status; this version follows links.
2422  */
2423 /* ARGSUSED */
2424 int
2425 sys___stat30(struct lwp *l, const struct sys___stat30_args *uap, register_t *retval)
2426 {
2427 	/* {
2428 		syscallarg(const char *) path;
2429 		syscallarg(struct stat *) ub;
2430 	} */
2431 	struct stat sb;
2432 	int error;
2433 
2434 	error = do_sys_stat(SCARG(uap, path), FOLLOW, &sb);
2435 	if (error)
2436 		return error;
2437 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
2438 }
2439 
2440 /*
2441  * Get file status; this version does not follow links.
2442  */
2443 /* ARGSUSED */
2444 int
2445 sys___lstat30(struct lwp *l, const struct sys___lstat30_args *uap, register_t *retval)
2446 {
2447 	/* {
2448 		syscallarg(const char *) path;
2449 		syscallarg(struct stat *) ub;
2450 	} */
2451 	struct stat sb;
2452 	int error;
2453 
2454 	error = do_sys_stat(SCARG(uap, path), NOFOLLOW, &sb);
2455 	if (error)
2456 		return error;
2457 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
2458 }
2459 
2460 /*
2461  * Get configurable pathname variables.
2462  */
2463 /* ARGSUSED */
2464 int
2465 sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap, register_t *retval)
2466 {
2467 	/* {
2468 		syscallarg(const char *) path;
2469 		syscallarg(int) name;
2470 	} */
2471 	int error;
2472 	struct nameidata nd;
2473 
2474 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
2475 	    SCARG(uap, path));
2476 	if ((error = namei(&nd)) != 0)
2477 		return (error);
2478 	error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), retval);
2479 	vput(nd.ni_vp);
2480 	return (error);
2481 }
2482 
2483 /*
2484  * Return target name of a symbolic link.
2485  */
2486 /* ARGSUSED */
2487 int
2488 sys_readlink(struct lwp *l, const struct sys_readlink_args *uap, register_t *retval)
2489 {
2490 	/* {
2491 		syscallarg(const char *) path;
2492 		syscallarg(char *) buf;
2493 		syscallarg(size_t) count;
2494 	} */
2495 	struct vnode *vp;
2496 	struct iovec aiov;
2497 	struct uio auio;
2498 	int error;
2499 	struct nameidata nd;
2500 
2501 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
2502 	    SCARG(uap, path));
2503 	if ((error = namei(&nd)) != 0)
2504 		return (error);
2505 	vp = nd.ni_vp;
2506 	if (vp->v_type != VLNK)
2507 		error = EINVAL;
2508 	else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
2509 	    (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
2510 		aiov.iov_base = SCARG(uap, buf);
2511 		aiov.iov_len = SCARG(uap, count);
2512 		auio.uio_iov = &aiov;
2513 		auio.uio_iovcnt = 1;
2514 		auio.uio_offset = 0;
2515 		auio.uio_rw = UIO_READ;
2516 		KASSERT(l == curlwp);
2517 		auio.uio_vmspace = l->l_proc->p_vmspace;
2518 		auio.uio_resid = SCARG(uap, count);
2519 		error = VOP_READLINK(vp, &auio, l->l_cred);
2520 	}
2521 	vput(vp);
2522 	*retval = SCARG(uap, count) - auio.uio_resid;
2523 	return (error);
2524 }
2525 
2526 /*
2527  * Change flags of a file given a path name.
2528  */
2529 /* ARGSUSED */
2530 int
2531 sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval)
2532 {
2533 	/* {
2534 		syscallarg(const char *) path;
2535 		syscallarg(u_long) flags;
2536 	} */
2537 	struct vnode *vp;
2538 	int error;
2539 	struct nameidata nd;
2540 
2541 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
2542 	    SCARG(uap, path));
2543 	if ((error = namei(&nd)) != 0)
2544 		return (error);
2545 	vp = nd.ni_vp;
2546 	error = change_flags(vp, SCARG(uap, flags), l);
2547 	vput(vp);
2548 	return (error);
2549 }
2550 
2551 /*
2552  * Change flags of a file given a file descriptor.
2553  */
2554 /* ARGSUSED */
2555 int
2556 sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval)
2557 {
2558 	/* {
2559 		syscallarg(int) fd;
2560 		syscallarg(u_long) flags;
2561 	} */
2562 	struct vnode *vp;
2563 	file_t *fp;
2564 	int error;
2565 
2566 	/* fd_getvnode() will use the descriptor for us */
2567 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2568 		return (error);
2569 	vp = fp->f_data;
2570 	error = change_flags(vp, SCARG(uap, flags), l);
2571 	VOP_UNLOCK(vp, 0);
2572 	fd_putfile(SCARG(uap, fd));
2573 	return (error);
2574 }
2575 
2576 /*
2577  * Change flags of a file given a path name; this version does
2578  * not follow links.
2579  */
2580 int
2581 sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval)
2582 {
2583 	/* {
2584 		syscallarg(const char *) path;
2585 		syscallarg(u_long) flags;
2586 	} */
2587 	struct vnode *vp;
2588 	int error;
2589 	struct nameidata nd;
2590 
2591 	NDINIT(&nd, LOOKUP, NOFOLLOW | TRYEMULROOT, UIO_USERSPACE,
2592 	    SCARG(uap, path));
2593 	if ((error = namei(&nd)) != 0)
2594 		return (error);
2595 	vp = nd.ni_vp;
2596 	error = change_flags(vp, SCARG(uap, flags), l);
2597 	vput(vp);
2598 	return (error);
2599 }
2600 
2601 /*
2602  * Common routine to change flags of a file.
2603  */
2604 int
2605 change_flags(struct vnode *vp, u_long flags, struct lwp *l)
2606 {
2607 	struct vattr vattr;
2608 	int error;
2609 
2610 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2611 	/*
2612 	 * Non-superusers cannot change the flags on devices, even if they
2613 	 * own them.
2614 	 */
2615 	if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, NULL)) {
2616 		if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
2617 			goto out;
2618 		if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
2619 			error = EINVAL;
2620 			goto out;
2621 		}
2622 	}
2623 	VATTR_NULL(&vattr);
2624 	vattr.va_flags = flags;
2625 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2626 out:
2627 	return (error);
2628 }
2629 
2630 /*
2631  * Change mode of a file given path name; this version follows links.
2632  */
2633 /* ARGSUSED */
2634 int
2635 sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
2636 {
2637 	/* {
2638 		syscallarg(const char *) path;
2639 		syscallarg(int) mode;
2640 	} */
2641 	int error;
2642 	struct nameidata nd;
2643 
2644 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
2645 	    SCARG(uap, path));
2646 	if ((error = namei(&nd)) != 0)
2647 		return (error);
2648 
2649 	error = change_mode(nd.ni_vp, SCARG(uap, mode), l);
2650 
2651 	vrele(nd.ni_vp);
2652 	return (error);
2653 }
2654 
2655 /*
2656  * Change mode of a file given a file descriptor.
2657  */
2658 /* ARGSUSED */
2659 int
2660 sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval)
2661 {
2662 	/* {
2663 		syscallarg(int) fd;
2664 		syscallarg(int) mode;
2665 	} */
2666 	file_t *fp;
2667 	int error;
2668 
2669 	/* fd_getvnode() will use the descriptor for us */
2670 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2671 		return (error);
2672 	error = change_mode(fp->f_data, SCARG(uap, mode), l);
2673 	fd_putfile(SCARG(uap, fd));
2674 	return (error);
2675 }
2676 
2677 /*
2678  * Change mode of a file given path name; this version does not follow links.
2679  */
2680 /* ARGSUSED */
2681 int
2682 sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval)
2683 {
2684 	/* {
2685 		syscallarg(const char *) path;
2686 		syscallarg(int) mode;
2687 	} */
2688 	int error;
2689 	struct nameidata nd;
2690 
2691 	NDINIT(&nd, LOOKUP, NOFOLLOW | TRYEMULROOT, UIO_USERSPACE,
2692 	    SCARG(uap, path));
2693 	if ((error = namei(&nd)) != 0)
2694 		return (error);
2695 
2696 	error = change_mode(nd.ni_vp, SCARG(uap, mode), l);
2697 
2698 	vrele(nd.ni_vp);
2699 	return (error);
2700 }
2701 
2702 /*
2703  * Common routine to set mode given a vnode.
2704  */
2705 static int
2706 change_mode(struct vnode *vp, int mode, struct lwp *l)
2707 {
2708 	struct vattr vattr;
2709 	int error;
2710 
2711 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2712 	VATTR_NULL(&vattr);
2713 	vattr.va_mode = mode & ALLPERMS;
2714 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2715 	VOP_UNLOCK(vp, 0);
2716 	return (error);
2717 }
2718 
2719 /*
2720  * Set ownership given a path name; this version follows links.
2721  */
2722 /* ARGSUSED */
2723 int
2724 sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
2725 {
2726 	/* {
2727 		syscallarg(const char *) path;
2728 		syscallarg(uid_t) uid;
2729 		syscallarg(gid_t) gid;
2730 	} */
2731 	int error;
2732 	struct nameidata nd;
2733 
2734 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
2735 	    SCARG(uap, path));
2736 	if ((error = namei(&nd)) != 0)
2737 		return (error);
2738 
2739 	error = change_owner(nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
2740 
2741 	vrele(nd.ni_vp);
2742 	return (error);
2743 }
2744 
2745 /*
2746  * Set ownership given a path name; this version follows links.
2747  * Provides POSIX semantics.
2748  */
2749 /* ARGSUSED */
2750 int
2751 sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval)
2752 {
2753 	/* {
2754 		syscallarg(const char *) path;
2755 		syscallarg(uid_t) uid;
2756 		syscallarg(gid_t) gid;
2757 	} */
2758 	int error;
2759 	struct nameidata nd;
2760 
2761 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
2762 	    SCARG(uap, path));
2763 	if ((error = namei(&nd)) != 0)
2764 		return (error);
2765 
2766 	error = change_owner(nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
2767 
2768 	vrele(nd.ni_vp);
2769 	return (error);
2770 }
2771 
2772 /*
2773  * Set ownership given a file descriptor.
2774  */
2775 /* ARGSUSED */
2776 int
2777 sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval)
2778 {
2779 	/* {
2780 		syscallarg(int) fd;
2781 		syscallarg(uid_t) uid;
2782 		syscallarg(gid_t) gid;
2783 	} */
2784 	int error;
2785 	file_t *fp;
2786 
2787 	/* fd_getvnode() will use the descriptor for us */
2788 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2789 		return (error);
2790 	error = change_owner(fp->f_data, SCARG(uap, uid), SCARG(uap, gid),
2791 	    l, 0);
2792 	fd_putfile(SCARG(uap, fd));
2793 	return (error);
2794 }
2795 
2796 /*
2797  * Set ownership given a file descriptor, providing POSIX/XPG semantics.
2798  */
2799 /* ARGSUSED */
2800 int
2801 sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval)
2802 {
2803 	/* {
2804 		syscallarg(int) fd;
2805 		syscallarg(uid_t) uid;
2806 		syscallarg(gid_t) gid;
2807 	} */
2808 	int error;
2809 	file_t *fp;
2810 
2811 	/* fd_getvnode() will use the descriptor for us */
2812 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2813 		return (error);
2814 	error = change_owner(fp->f_data, SCARG(uap, uid), SCARG(uap, gid),
2815 	    l, 1);
2816 	fd_putfile(SCARG(uap, fd));
2817 	return (error);
2818 }
2819 
2820 /*
2821  * Set ownership given a path name; this version does not follow links.
2822  */
2823 /* ARGSUSED */
2824 int
2825 sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval)
2826 {
2827 	/* {
2828 		syscallarg(const char *) path;
2829 		syscallarg(uid_t) uid;
2830 		syscallarg(gid_t) gid;
2831 	} */
2832 	int error;
2833 	struct nameidata nd;
2834 
2835 	NDINIT(&nd, LOOKUP, NOFOLLOW | TRYEMULROOT, UIO_USERSPACE,
2836 	    SCARG(uap, path));
2837 	if ((error = namei(&nd)) != 0)
2838 		return (error);
2839 
2840 	error = change_owner(nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
2841 
2842 	vrele(nd.ni_vp);
2843 	return (error);
2844 }
2845 
2846 /*
2847  * Set ownership given a path name; this version does not follow links.
2848  * Provides POSIX/XPG semantics.
2849  */
2850 /* ARGSUSED */
2851 int
2852 sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval)
2853 {
2854 	/* {
2855 		syscallarg(const char *) path;
2856 		syscallarg(uid_t) uid;
2857 		syscallarg(gid_t) gid;
2858 	} */
2859 	int error;
2860 	struct nameidata nd;
2861 
2862 	NDINIT(&nd, LOOKUP, NOFOLLOW | TRYEMULROOT, UIO_USERSPACE,
2863 	    SCARG(uap, path));
2864 	if ((error = namei(&nd)) != 0)
2865 		return (error);
2866 
2867 	error = change_owner(nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
2868 
2869 	vrele(nd.ni_vp);
2870 	return (error);
2871 }
2872 
2873 /*
2874  * Common routine to set ownership given a vnode.
2875  */
2876 static int
2877 change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
2878     int posix_semantics)
2879 {
2880 	struct vattr vattr;
2881 	mode_t newmode;
2882 	int error;
2883 
2884 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2885 	if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
2886 		goto out;
2887 
2888 #define CHANGED(x) ((int)(x) != -1)
2889 	newmode = vattr.va_mode;
2890 	if (posix_semantics) {
2891 		/*
2892 		 * POSIX/XPG semantics: if the caller is not the super-user,
2893 		 * clear set-user-id and set-group-id bits.  Both POSIX and
2894 		 * the XPG consider the behaviour for calls by the super-user
2895 		 * implementation-defined; we leave the set-user-id and set-
2896 		 * group-id settings intact in that case.
2897 		 */
2898 		if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
2899 				      NULL) != 0)
2900 			newmode &= ~(S_ISUID | S_ISGID);
2901 	} else {
2902 		/*
2903 		 * NetBSD semantics: when changing owner and/or group,
2904 		 * clear the respective bit(s).
2905 		 */
2906 		if (CHANGED(uid))
2907 			newmode &= ~S_ISUID;
2908 		if (CHANGED(gid))
2909 			newmode &= ~S_ISGID;
2910 	}
2911 	/* Update va_mode iff altered. */
2912 	if (vattr.va_mode == newmode)
2913 		newmode = VNOVAL;
2914 
2915 	VATTR_NULL(&vattr);
2916 	vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
2917 	vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
2918 	vattr.va_mode = newmode;
2919 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2920 #undef CHANGED
2921 
2922 out:
2923 	VOP_UNLOCK(vp, 0);
2924 	return (error);
2925 }
2926 
2927 /*
2928  * Set the access and modification times given a path name; this
2929  * version follows links.
2930  */
2931 /* ARGSUSED */
2932 int
2933 sys_utimes(struct lwp *l, const struct sys_utimes_args *uap, register_t *retval)
2934 {
2935 	/* {
2936 		syscallarg(const char *) path;
2937 		syscallarg(const struct timeval *) tptr;
2938 	} */
2939 
2940 	return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
2941 	    SCARG(uap, tptr), UIO_USERSPACE);
2942 }
2943 
2944 /*
2945  * Set the access and modification times given a file descriptor.
2946  */
2947 /* ARGSUSED */
2948 int
2949 sys_futimes(struct lwp *l, const struct sys_futimes_args *uap, register_t *retval)
2950 {
2951 	/* {
2952 		syscallarg(int) fd;
2953 		syscallarg(const struct timeval *) tptr;
2954 	} */
2955 	int error;
2956 	file_t *fp;
2957 
2958 	/* fd_getvnode() will use the descriptor for us */
2959 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2960 		return (error);
2961 	error = do_sys_utimes(l, fp->f_data, NULL, 0, SCARG(uap, tptr),
2962 	    UIO_USERSPACE);
2963 	fd_putfile(SCARG(uap, fd));
2964 	return (error);
2965 }
2966 
2967 /*
2968  * Set the access and modification times given a path name; this
2969  * version does not follow links.
2970  */
2971 int
2972 sys_lutimes(struct lwp *l, const struct sys_lutimes_args *uap, register_t *retval)
2973 {
2974 	/* {
2975 		syscallarg(const char *) path;
2976 		syscallarg(const struct timeval *) tptr;
2977 	} */
2978 
2979 	return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
2980 	    SCARG(uap, tptr), UIO_USERSPACE);
2981 }
2982 
2983 /*
2984  * Common routine to set access and modification times given a vnode.
2985  */
2986 int
2987 do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
2988     const struct timeval *tptr, enum uio_seg seg)
2989 {
2990 	struct vattr vattr;
2991 	struct nameidata nd;
2992 	int error;
2993 
2994 	VATTR_NULL(&vattr);
2995 	if (tptr == NULL) {
2996 		nanotime(&vattr.va_atime);
2997 		vattr.va_mtime = vattr.va_atime;
2998 		vattr.va_vaflags |= VA_UTIMES_NULL;
2999 	} else {
3000 		struct timeval tv[2];
3001 
3002 		if (seg != UIO_SYSSPACE) {
3003 			error = copyin(tptr, &tv, sizeof (tv));
3004 			if (error != 0)
3005 				return error;
3006 			tptr = tv;
3007 		}
3008 		TIMEVAL_TO_TIMESPEC(tptr, &vattr.va_atime);
3009 		TIMEVAL_TO_TIMESPEC(tptr + 1, &vattr.va_mtime);
3010 	}
3011 
3012 	if (vp == NULL) {
3013 		NDINIT(&nd, LOOKUP, flag | TRYEMULROOT, UIO_USERSPACE, path);
3014 		if ((error = namei(&nd)) != 0)
3015 			return (error);
3016 		vp = nd.ni_vp;
3017 	} else
3018 		nd.ni_vp = NULL;
3019 
3020 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3021 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3022 	VOP_UNLOCK(vp, 0);
3023 
3024 	if (nd.ni_vp != NULL)
3025 		vrele(nd.ni_vp);
3026 
3027 	return (error);
3028 }
3029 
3030 /*
3031  * Truncate a file given its path name.
3032  */
3033 /* ARGSUSED */
3034 int
3035 sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval)
3036 {
3037 	/* {
3038 		syscallarg(const char *) path;
3039 		syscallarg(int) pad;
3040 		syscallarg(off_t) length;
3041 	} */
3042 	struct vnode *vp;
3043 	struct vattr vattr;
3044 	int error;
3045 	struct nameidata nd;
3046 
3047 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
3048 	    SCARG(uap, path));
3049 	if ((error = namei(&nd)) != 0)
3050 		return (error);
3051 	vp = nd.ni_vp;
3052 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3053 	if (vp->v_type == VDIR)
3054 		error = EISDIR;
3055 	else if ((error = vn_writechk(vp)) == 0 &&
3056 	    (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
3057 		VATTR_NULL(&vattr);
3058 		vattr.va_size = SCARG(uap, length);
3059 		error = VOP_SETATTR(vp, &vattr, l->l_cred);
3060 	}
3061 	vput(vp);
3062 	return (error);
3063 }
3064 
3065 /*
3066  * Truncate a file given a file descriptor.
3067  */
3068 /* ARGSUSED */
3069 int
3070 sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval)
3071 {
3072 	/* {
3073 		syscallarg(int) fd;
3074 		syscallarg(int) pad;
3075 		syscallarg(off_t) length;
3076 	} */
3077 	struct vattr vattr;
3078 	struct vnode *vp;
3079 	file_t *fp;
3080 	int error;
3081 
3082 	/* fd_getvnode() will use the descriptor for us */
3083 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3084 		return (error);
3085 	if ((fp->f_flag & FWRITE) == 0) {
3086 		error = EINVAL;
3087 		goto out;
3088 	}
3089 	vp = fp->f_data;
3090 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3091 	if (vp->v_type == VDIR)
3092 		error = EISDIR;
3093 	else if ((error = vn_writechk(vp)) == 0) {
3094 		VATTR_NULL(&vattr);
3095 		vattr.va_size = SCARG(uap, length);
3096 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
3097 	}
3098 	VOP_UNLOCK(vp, 0);
3099  out:
3100 	fd_putfile(SCARG(uap, fd));
3101 	return (error);
3102 }
3103 
3104 /*
3105  * Sync an open file.
3106  */
3107 /* ARGSUSED */
3108 int
3109 sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
3110 {
3111 	/* {
3112 		syscallarg(int) fd;
3113 	} */
3114 	struct vnode *vp;
3115 	file_t *fp;
3116 	int error;
3117 
3118 	/* fd_getvnode() will use the descriptor for us */
3119 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3120 		return (error);
3121 	vp = fp->f_data;
3122 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3123 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
3124 	if (error == 0 && bioopsp != NULL &&
3125 	    vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))
3126 		(*bioopsp->io_fsync)(vp, 0);
3127 	VOP_UNLOCK(vp, 0);
3128 	fd_putfile(SCARG(uap, fd));
3129 	return (error);
3130 }
3131 
3132 /*
3133  * Sync a range of file data.  API modeled after that found in AIX.
3134  *
3135  * FDATASYNC indicates that we need only save enough metadata to be able
3136  * to re-read the written data.  Note we duplicate AIX's requirement that
3137  * the file be open for writing.
3138  */
3139 /* ARGSUSED */
3140 int
3141 sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval)
3142 {
3143 	/* {
3144 		syscallarg(int) fd;
3145 		syscallarg(int) flags;
3146 		syscallarg(off_t) start;
3147 		syscallarg(off_t) length;
3148 	} */
3149 	struct vnode *vp;
3150 	file_t *fp;
3151 	int flags, nflags;
3152 	off_t s, e, len;
3153 	int error;
3154 
3155 	/* fd_getvnode() will use the descriptor for us */
3156 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3157 		return (error);
3158 
3159 	if ((fp->f_flag & FWRITE) == 0) {
3160 		error = EBADF;
3161 		goto out;
3162 	}
3163 
3164 	flags = SCARG(uap, flags);
3165 	if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
3166 	    ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
3167 		error = EINVAL;
3168 		goto out;
3169 	}
3170 	/* Now set up the flags for value(s) to pass to VOP_FSYNC() */
3171 	if (flags & FDATASYNC)
3172 		nflags = FSYNC_DATAONLY | FSYNC_WAIT;
3173 	else
3174 		nflags = FSYNC_WAIT;
3175 	if (flags & FDISKSYNC)
3176 		nflags |= FSYNC_CACHE;
3177 
3178 	len = SCARG(uap, length);
3179 	/* If length == 0, we do the whole file, and s = l = 0 will do that */
3180 	if (len) {
3181 		s = SCARG(uap, start);
3182 		e = s + len;
3183 		if (e < s) {
3184 			error = EINVAL;
3185 			goto out;
3186 		}
3187 	} else {
3188 		e = 0;
3189 		s = 0;
3190 	}
3191 
3192 	vp = fp->f_data;
3193 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3194 	error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
3195 
3196 	if (error == 0 && bioopsp != NULL &&
3197 	    vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))
3198 		(*bioopsp->io_fsync)(vp, nflags);
3199 
3200 	VOP_UNLOCK(vp, 0);
3201 out:
3202 	fd_putfile(SCARG(uap, fd));
3203 	return (error);
3204 }
3205 
3206 /*
3207  * Sync the data of an open file.
3208  */
3209 /* ARGSUSED */
3210 int
3211 sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval)
3212 {
3213 	/* {
3214 		syscallarg(int) fd;
3215 	} */
3216 	struct vnode *vp;
3217 	file_t *fp;
3218 	int error;
3219 
3220 	/* fd_getvnode() will use the descriptor for us */
3221 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3222 		return (error);
3223 	if ((fp->f_flag & FWRITE) == 0) {
3224 		fd_putfile(SCARG(uap, fd));
3225 		return (EBADF);
3226 	}
3227 	vp = fp->f_data;
3228 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3229 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
3230 	VOP_UNLOCK(vp, 0);
3231 	fd_putfile(SCARG(uap, fd));
3232 	return (error);
3233 }
3234 
3235 /*
3236  * Rename files, (standard) BSD semantics frontend.
3237  */
3238 /* ARGSUSED */
3239 int
3240 sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval)
3241 {
3242 	/* {
3243 		syscallarg(const char *) from;
3244 		syscallarg(const char *) to;
3245 	} */
3246 
3247 	return (do_sys_rename(SCARG(uap, from), SCARG(uap, to), UIO_USERSPACE, 0));
3248 }
3249 
3250 /*
3251  * Rename files, POSIX semantics frontend.
3252  */
3253 /* ARGSUSED */
3254 int
3255 sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval)
3256 {
3257 	/* {
3258 		syscallarg(const char *) from;
3259 		syscallarg(const char *) to;
3260 	} */
3261 
3262 	return (do_sys_rename(SCARG(uap, from), SCARG(uap, to), UIO_USERSPACE, 1));
3263 }
3264 
3265 /*
3266  * Rename files.  Source and destination must either both be directories,
3267  * or both not be directories.  If target is a directory, it must be empty.
3268  * If `from' and `to' refer to the same object, the value of the `retain'
3269  * argument is used to determine whether `from' will be
3270  *
3271  * (retain == 0)	deleted unless `from' and `to' refer to the same
3272  *			object in the file system's name space (BSD).
3273  * (retain == 1)	always retained (POSIX).
3274  */
3275 int
3276 do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
3277 {
3278 	struct vnode *tvp, *fvp, *tdvp;
3279 	struct nameidata fromnd, tond;
3280 	struct mount *fs;
3281 	struct lwp *l = curlwp;
3282 	struct proc *p;
3283 	uint32_t saveflag;
3284 	int error;
3285 
3286 	NDINIT(&fromnd, DELETE, LOCKPARENT | SAVESTART | TRYEMULROOT,
3287 	    seg, from);
3288 	if ((error = namei(&fromnd)) != 0)
3289 		return (error);
3290 	if (fromnd.ni_dvp != fromnd.ni_vp)
3291 		VOP_UNLOCK(fromnd.ni_dvp, 0);
3292 	fvp = fromnd.ni_vp;
3293 
3294 	fs = fvp->v_mount;
3295 	error = VFS_RENAMELOCK_ENTER(fs);
3296 	if (error) {
3297 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3298 		vrele(fromnd.ni_dvp);
3299 		vrele(fvp);
3300 		goto out1;
3301 	}
3302 
3303 	/*
3304 	 * close, partially, yet another race - ideally we should only
3305 	 * go as far as getting fromnd.ni_dvp before getting the per-fs
3306 	 * lock, and then continue to get fromnd.ni_vp, but we can't do
3307 	 * that with namei as it stands.
3308 	 *
3309 	 * This still won't prevent rmdir from nuking fromnd.ni_vp
3310 	 * under us. The real fix is to get the locks in the right
3311 	 * order and do the lookups in the right places, but that's a
3312 	 * major rototill.
3313 	 *
3314 	 * Preserve the SAVESTART in cn_flags, because who knows what
3315 	 * might happen if we don't.
3316 	 *
3317 	 * Note: this logic (as well as this whole function) is cloned
3318 	 * in nfs_serv.c. Proceed accordingly.
3319 	 */
3320 	vrele(fvp);
3321 	if ((fromnd.ni_cnd.cn_namelen == 1 &&
3322 	     fromnd.ni_cnd.cn_nameptr[0] == '.') ||
3323 	    (fromnd.ni_cnd.cn_namelen == 2 &&
3324 	     fromnd.ni_cnd.cn_nameptr[0] == '.' &&
3325 	     fromnd.ni_cnd.cn_nameptr[1] == '.')) {
3326 		error = EINVAL;
3327 		VFS_RENAMELOCK_EXIT(fs);
3328 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3329 		vrele(fromnd.ni_dvp);
3330 		goto out1;
3331 	}
3332 	saveflag = fromnd.ni_cnd.cn_flags & SAVESTART;
3333 	fromnd.ni_cnd.cn_flags &= ~SAVESTART;
3334 	vn_lock(fromnd.ni_dvp, LK_EXCLUSIVE | LK_RETRY);
3335 	error = relookup(fromnd.ni_dvp, &fromnd.ni_vp, &fromnd.ni_cnd);
3336 	fromnd.ni_cnd.cn_flags |= saveflag;
3337 	if (error) {
3338 		VOP_UNLOCK(fromnd.ni_dvp, 0);
3339 		VFS_RENAMELOCK_EXIT(fs);
3340 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3341 		vrele(fromnd.ni_dvp);
3342 		goto out1;
3343 	}
3344 	VOP_UNLOCK(fromnd.ni_vp, 0);
3345 	if (fromnd.ni_dvp != fromnd.ni_vp)
3346 		VOP_UNLOCK(fromnd.ni_dvp, 0);
3347 	fvp = fromnd.ni_vp;
3348 
3349 	NDINIT(&tond, RENAME,
3350 	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | TRYEMULROOT
3351 	      | (fvp->v_type == VDIR ? CREATEDIR : 0),
3352 	    seg, to);
3353 	if ((error = namei(&tond)) != 0) {
3354 		VFS_RENAMELOCK_EXIT(fs);
3355 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3356 		vrele(fromnd.ni_dvp);
3357 		vrele(fvp);
3358 		goto out1;
3359 	}
3360 	tdvp = tond.ni_dvp;
3361 	tvp = tond.ni_vp;
3362 
3363 	if (tvp != NULL) {
3364 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
3365 			error = ENOTDIR;
3366 			goto out;
3367 		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
3368 			error = EISDIR;
3369 			goto out;
3370 		}
3371 	}
3372 
3373 	if (fvp == tdvp)
3374 		error = EINVAL;
3375 
3376 	/*
3377 	 * Source and destination refer to the same object.
3378 	 */
3379 	if (fvp == tvp) {
3380 		if (retain)
3381 			error = -1;
3382 		else if (fromnd.ni_dvp == tdvp &&
3383 		    fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen &&
3384 		    !memcmp(fromnd.ni_cnd.cn_nameptr,
3385 		          tond.ni_cnd.cn_nameptr,
3386 		          fromnd.ni_cnd.cn_namelen))
3387 		error = -1;
3388 	}
3389 
3390 #if NVERIEXEC > 0
3391 	if (!error) {
3392 		char *f1, *f2;
3393 
3394 		f1 = malloc(fromnd.ni_cnd.cn_namelen + 1, M_TEMP, M_WAITOK);
3395 		strlcpy(f1, fromnd.ni_cnd.cn_nameptr, fromnd.ni_cnd.cn_namelen);
3396 
3397 		f2 = malloc(tond.ni_cnd.cn_namelen + 1, M_TEMP, M_WAITOK);
3398 		strlcpy(f2, tond.ni_cnd.cn_nameptr, tond.ni_cnd.cn_namelen);
3399 
3400 		error = veriexec_renamechk(l, fvp, f1, tvp, f2);
3401 
3402 		free(f1, M_TEMP);
3403 		free(f2, M_TEMP);
3404 	}
3405 #endif /* NVERIEXEC > 0 */
3406 
3407 out:
3408 	p = l->l_proc;
3409 	if (!error) {
3410 		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
3411 				   tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
3412 		VFS_RENAMELOCK_EXIT(fs);
3413 	} else {
3414 		VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd);
3415 		if (tdvp == tvp)
3416 			vrele(tdvp);
3417 		else
3418 			vput(tdvp);
3419 		if (tvp)
3420 			vput(tvp);
3421 		VFS_RENAMELOCK_EXIT(fs);
3422 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3423 		vrele(fromnd.ni_dvp);
3424 		vrele(fvp);
3425 	}
3426 	vrele(tond.ni_startdir);
3427 	PNBUF_PUT(tond.ni_cnd.cn_pnbuf);
3428 out1:
3429 	if (fromnd.ni_startdir)
3430 		vrele(fromnd.ni_startdir);
3431 	PNBUF_PUT(fromnd.ni_cnd.cn_pnbuf);
3432 	return (error == -1 ? 0 : error);
3433 }
3434 
3435 /*
3436  * Make a directory file.
3437  */
3438 /* ARGSUSED */
3439 int
3440 sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
3441 {
3442 	/* {
3443 		syscallarg(const char *) path;
3444 		syscallarg(int) mode;
3445 	} */
3446 	struct proc *p = l->l_proc;
3447 	struct vnode *vp;
3448 	struct vattr vattr;
3449 	int error;
3450 	struct nameidata nd;
3451 
3452 	NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT, UIO_USERSPACE,
3453 	    SCARG(uap, path));
3454 	if ((error = namei(&nd)) != 0)
3455 		return (error);
3456 	vp = nd.ni_vp;
3457 	if (vp != NULL) {
3458 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
3459 		if (nd.ni_dvp == vp)
3460 			vrele(nd.ni_dvp);
3461 		else
3462 			vput(nd.ni_dvp);
3463 		vrele(vp);
3464 		return (EEXIST);
3465 	}
3466 	VATTR_NULL(&vattr);
3467 	vattr.va_type = VDIR;
3468 	/* We will read cwdi->cwdi_cmask unlocked. */
3469 	vattr.va_mode =
3470 	    (SCARG(uap, mode) & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
3471 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3472 	if (!error)
3473 		vput(nd.ni_vp);
3474 	return (error);
3475 }
3476 
3477 /*
3478  * Remove a directory file.
3479  */
3480 /* ARGSUSED */
3481 int
3482 sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
3483 {
3484 	/* {
3485 		syscallarg(const char *) path;
3486 	} */
3487 	struct vnode *vp;
3488 	int error;
3489 	struct nameidata nd;
3490 
3491 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
3492 	    SCARG(uap, path));
3493 	if ((error = namei(&nd)) != 0)
3494 		return (error);
3495 	vp = nd.ni_vp;
3496 	if (vp->v_type != VDIR) {
3497 		error = ENOTDIR;
3498 		goto out;
3499 	}
3500 	/*
3501 	 * No rmdir "." please.
3502 	 */
3503 	if (nd.ni_dvp == vp) {
3504 		error = EINVAL;
3505 		goto out;
3506 	}
3507 	/*
3508 	 * The root of a mounted filesystem cannot be deleted.
3509 	 */
3510 	if ((vp->v_vflag & VV_ROOT) != 0 || vp->v_mountedhere != NULL) {
3511 		error = EBUSY;
3512 		goto out;
3513 	}
3514 	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3515 	return (error);
3516 
3517 out:
3518 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
3519 	if (nd.ni_dvp == vp)
3520 		vrele(nd.ni_dvp);
3521 	else
3522 		vput(nd.ni_dvp);
3523 	vput(vp);
3524 	return (error);
3525 }
3526 
3527 /*
3528  * Read a block of directory entries in a file system independent format.
3529  */
3530 int
3531 sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval)
3532 {
3533 	/* {
3534 		syscallarg(int) fd;
3535 		syscallarg(char *) buf;
3536 		syscallarg(size_t) count;
3537 	} */
3538 	file_t *fp;
3539 	int error, done;
3540 
3541 	/* fd_getvnode() will use the descriptor for us */
3542 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3543 		return (error);
3544 	if ((fp->f_flag & FREAD) == 0) {
3545 		error = EBADF;
3546 		goto out;
3547 	}
3548 	error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
3549 			SCARG(uap, count), &done, l, 0, 0);
3550 	ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
3551 	*retval = done;
3552  out:
3553 	fd_putfile(SCARG(uap, fd));
3554 	return (error);
3555 }
3556 
3557 /*
3558  * Set the mode mask for creation of filesystem nodes.
3559  */
3560 int
3561 sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
3562 {
3563 	/* {
3564 		syscallarg(mode_t) newmask;
3565 	} */
3566 	struct proc *p = l->l_proc;
3567 	struct cwdinfo *cwdi;
3568 
3569 	/*
3570 	 * cwdi->cwdi_cmask will be read unlocked elsewhere.  What's
3571 	 * important is that we serialize changes to the mask.  The
3572 	 * rw_exit() will issue a write memory barrier on our behalf,
3573 	 * and force the changes out to other CPUs (as it must use an
3574 	 * atomic operation, draining the local CPU's store buffers).
3575 	 */
3576 	cwdi = p->p_cwdi;
3577 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
3578 	*retval = cwdi->cwdi_cmask;
3579 	cwdi->cwdi_cmask = SCARG(uap, newmask) & ALLPERMS;
3580 	rw_exit(&cwdi->cwdi_lock);
3581 
3582 	return (0);
3583 }
3584 
3585 int
3586 dorevoke(struct vnode *vp, kauth_cred_t cred)
3587 {
3588 	struct vattr vattr;
3589 	int error;
3590 
3591 	if ((error = VOP_GETATTR(vp, &vattr, cred)) != 0)
3592 		return error;
3593 	if (kauth_cred_geteuid(cred) != vattr.va_uid &&
3594 	    (error = kauth_authorize_generic(cred,
3595 	    KAUTH_GENERIC_ISSUSER, NULL)) == 0)
3596 		VOP_REVOKE(vp, REVOKEALL);
3597 	return (error);
3598 }
3599 
3600 /*
3601  * Void all references to file by ripping underlying filesystem
3602  * away from vnode.
3603  */
3604 /* ARGSUSED */
3605 int
3606 sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval)
3607 {
3608 	/* {
3609 		syscallarg(const char *) path;
3610 	} */
3611 	struct vnode *vp;
3612 	int error;
3613 	struct nameidata nd;
3614 
3615 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
3616 	    SCARG(uap, path));
3617 	if ((error = namei(&nd)) != 0)
3618 		return (error);
3619 	vp = nd.ni_vp;
3620 	error = dorevoke(vp, l->l_cred);
3621 	vrele(vp);
3622 	return (error);
3623 }
3624 
3625 /*
3626  * Convert a user file descriptor to a kernel file entry.
3627  */
3628 int
3629 getvnode(int fd, file_t **fpp)
3630 {
3631 	struct vnode *vp;
3632 	file_t *fp;
3633 
3634 	if ((fp = fd_getfile(fd)) == NULL)
3635 		return (EBADF);
3636 
3637 	if (fp->f_type != DTYPE_VNODE) {
3638 		fd_putfile(fd);
3639 		return (EINVAL);
3640 	}
3641 
3642 	vp = fp->f_data;
3643 	if (vp->v_type == VBAD) {
3644 		fd_putfile(fd);
3645 		return (EBADF);
3646 	}
3647 
3648 	*fpp = fp;
3649 	return (0);
3650 }
3651