xref: /netbsd-src/sys/kern/vfs_syscalls.c (revision f82d7874c259b2a6cc59b714f844919f32bf7b51)
1 /*	$NetBSD: vfs_syscalls.c,v 1.363 2008/05/20 19:30:03 ad Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Copyright (c) 1989, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  * (c) UNIX System Laboratories, Inc.
33  * All or some portions of this file are derived from material licensed
34  * to the University of California by American Telephone and Telegraph
35  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36  * the permission of UNIX System Laboratories, Inc.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. Neither the name of the University nor the names of its contributors
47  *    may be used to endorse or promote products derived from this software
48  *    without specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  *
62  *	@(#)vfs_syscalls.c	8.42 (Berkeley) 7/31/95
63  */
64 
65 #include <sys/cdefs.h>
66 __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.363 2008/05/20 19:30:03 ad Exp $");
67 
68 #include "opt_compat_netbsd.h"
69 #include "opt_compat_43.h"
70 #include "opt_fileassoc.h"
71 #include "fss.h"
72 #include "veriexec.h"
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file.h>
80 #include <sys/stat.h>
81 #include <sys/vnode.h>
82 #include <sys/mount.h>
83 #include <sys/proc.h>
84 #include <sys/uio.h>
85 #include <sys/malloc.h>
86 #include <sys/kmem.h>
87 #include <sys/dirent.h>
88 #include <sys/sysctl.h>
89 #include <sys/syscallargs.h>
90 #include <sys/vfs_syscalls.h>
91 #include <sys/ktrace.h>
92 #ifdef FILEASSOC
93 #include <sys/fileassoc.h>
94 #endif /* FILEASSOC */
95 #include <sys/verified_exec.h>
96 #include <sys/kauth.h>
97 #include <sys/atomic.h>
98 #include <sys/module.h>
99 
100 #include <miscfs/genfs/genfs.h>
101 #include <miscfs/syncfs/syncfs.h>
102 #include <miscfs/specfs/specdev.h>
103 
104 #ifdef COMPAT_30
105 #include "opt_nfsserver.h"
106 #include <nfs/rpcv2.h>
107 #endif
108 #include <nfs/nfsproto.h>
109 #ifdef COMPAT_30
110 #include <nfs/nfs.h>
111 #include <nfs/nfs_var.h>
112 #endif
113 
114 #if NFSS > 0
115 #include <dev/fssvar.h>
116 #endif
117 
118 MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount struct");
119 
120 static int change_dir(struct nameidata *, struct lwp *);
121 static int change_flags(struct vnode *, u_long, struct lwp *);
122 static int change_mode(struct vnode *, int, struct lwp *l);
123 static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
124 
125 void checkdirs(struct vnode *);
126 
127 int dovfsusermount = 0;
128 
129 /*
130  * Virtual File System System Calls
131  */
132 
133 /*
134  * Mount a file system.
135  */
136 
137 #if defined(COMPAT_09) || defined(COMPAT_43)
138 /*
139  * This table is used to maintain compatibility with 4.3BSD
140  * and NetBSD 0.9 mount syscalls.  Note, the order is important!
141  *
142  * Do not modify this table. It should only contain filesystems
143  * supported by NetBSD 0.9 and 4.3BSD.
144  */
145 const char * const mountcompatnames[] = {
146 	NULL,		/* 0 = MOUNT_NONE */
147 	MOUNT_FFS,	/* 1 = MOUNT_UFS */
148 	MOUNT_NFS,	/* 2 */
149 	MOUNT_MFS,	/* 3 */
150 	MOUNT_MSDOS,	/* 4 */
151 	MOUNT_CD9660,	/* 5 = MOUNT_ISOFS */
152 	MOUNT_FDESC,	/* 6 */
153 	MOUNT_KERNFS,	/* 7 */
154 	NULL,		/* 8 = MOUNT_DEVFS */
155 	MOUNT_AFS,	/* 9 */
156 };
157 const int nmountcompatnames = sizeof(mountcompatnames) /
158     sizeof(mountcompatnames[0]);
159 #endif /* COMPAT_09 || COMPAT_43 */
160 
161 static int
162 mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
163     void *data, size_t *data_len)
164 {
165 	struct mount *mp;
166 	int error = 0, saved_flags;
167 
168 	mp = vp->v_mount;
169 	saved_flags = mp->mnt_flag;
170 
171 	/* We can operate only on VV_ROOT nodes. */
172 	if ((vp->v_vflag & VV_ROOT) == 0) {
173 		error = EINVAL;
174 		goto out;
175 	}
176 
177 	/*
178 	 * We only allow the filesystem to be reloaded if it
179 	 * is currently mounted read-only.
180 	 */
181 	if (flags & MNT_RELOAD && !(mp->mnt_flag & MNT_RDONLY)) {
182 		error = EOPNOTSUPP;	/* Needs translation */
183 		goto out;
184 	}
185 
186 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
187 	    KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
188 	if (error)
189 		goto out;
190 
191 	if (vfs_busy(mp, NULL)) {
192 		error = EPERM;
193 		goto out;
194 	}
195 
196 	mutex_enter(&mp->mnt_updating);
197 
198 	mp->mnt_flag &= ~MNT_OP_FLAGS;
199 	mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
200 
201 	/*
202 	 * Set the mount level flags.
203 	 */
204 	if (flags & MNT_RDONLY)
205 		mp->mnt_flag |= MNT_RDONLY;
206 	else if (mp->mnt_flag & MNT_RDONLY)
207 		mp->mnt_iflag |= IMNT_WANTRDWR;
208 	mp->mnt_flag &=
209 	  ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
210 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
211 	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP);
212 	mp->mnt_flag |= flags &
213 	   (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
214 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
215 	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
216 	    MNT_IGNORE);
217 
218 	error = VFS_MOUNT(mp, path, data, data_len);
219 
220 #if defined(COMPAT_30) && defined(NFSSERVER)
221 	if (error && data != NULL) {
222 		int error2;
223 
224 		/* Update failed; let's try and see if it was an
225 		 * export request. */
226 		error2 = nfs_update_exports_30(mp, path, data, l);
227 
228 		/* Only update error code if the export request was
229 		 * understood but some problem occurred while
230 		 * processing it. */
231 		if (error2 != EJUSTRETURN)
232 			error = error2;
233 	}
234 #endif
235 	if (mp->mnt_iflag & IMNT_WANTRDWR)
236 		mp->mnt_flag &= ~MNT_RDONLY;
237 	if (error)
238 		mp->mnt_flag = saved_flags;
239 	mp->mnt_flag &= ~MNT_OP_FLAGS;
240 	mp->mnt_iflag &= ~IMNT_WANTRDWR;
241 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
242 		if (mp->mnt_syncer == NULL)
243 			error = vfs_allocate_syncvnode(mp);
244 	} else {
245 		if (mp->mnt_syncer != NULL)
246 			vfs_deallocate_syncvnode(mp);
247 	}
248 	mutex_exit(&mp->mnt_updating);
249 	vfs_unbusy(mp, false, NULL);
250 
251  out:
252 	return (error);
253 }
254 
255 static int
256 mount_get_vfsops(const char *fstype, struct vfsops **vfsops)
257 {
258 	char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
259 	int error;
260 
261 	/* Copy file-system type from userspace.  */
262 	error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL);
263 	if (error) {
264 #if defined(COMPAT_09) || defined(COMPAT_43)
265 		/*
266 		 * Historically, filesystem types were identified by numbers.
267 		 * If we get an integer for the filesystem type instead of a
268 		 * string, we check to see if it matches one of the historic
269 		 * filesystem types.
270 		 */
271 		u_long fsindex = (u_long)fstype;
272 		if (fsindex >= nmountcompatnames ||
273 		    mountcompatnames[fsindex] == NULL)
274 			return ENODEV;
275 		strlcpy(fstypename, mountcompatnames[fsindex],
276 		    sizeof(fstypename));
277 #else
278 		return error;
279 #endif
280 	}
281 
282 #ifdef	COMPAT_10
283 	/* Accept `ufs' as an alias for `ffs'. */
284 	if (strcmp(fstypename, "ufs") == 0)
285 		fstypename[0] = 'f';
286 #endif
287 
288 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
289 		return 0;
290 
291 	/* If we can autoload a vfs module, try again */
292 	(void)module_load(fstype, 0, NULL, MODULE_CLASS_VFS, true);
293 
294 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
295 		return 0;
296 
297 	return ENODEV;
298 }
299 
300 static int
301 mount_domount(struct lwp *l, struct vnode **vpp, struct vfsops *vfsops,
302     const char *path, int flags, void *data, size_t *data_len, u_int recurse)
303 {
304 	struct mount *mp = NULL;
305 	struct vnode *vp = *vpp;
306 	struct vattr va;
307 	int error;
308 
309 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
310 	    KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data);
311 	if (error)
312 		return error;
313 
314 	/* Can't make a non-dir a mount-point (from here anyway). */
315 	if (vp->v_type != VDIR)
316 		return ENOTDIR;
317 
318 	/*
319 	 * If the user is not root, ensure that they own the directory
320 	 * onto which we are attempting to mount.
321 	 */
322 	if ((error = VOP_GETATTR(vp, &va, l->l_cred)) != 0 ||
323 	    (va.va_uid != kauth_cred_geteuid(l->l_cred) &&
324 	    (error = kauth_authorize_generic(l->l_cred,
325 	    KAUTH_GENERIC_ISSUSER, NULL)) != 0)) {
326 		return error;
327 	}
328 
329 	if (flags & MNT_EXPORTED)
330 		return EINVAL;
331 
332 	if ((error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0)) != 0)
333 		return error;
334 
335 	/*
336 	 * Check if a file-system is not already mounted on this vnode.
337 	 */
338 	if (vp->v_mountedhere != NULL)
339 		return EBUSY;
340 
341 	mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
342 	if (mp == NULL)
343 		return ENOMEM;
344 
345 	mp->mnt_op = vfsops;
346 	mp->mnt_refcnt = 1;
347 
348 	TAILQ_INIT(&mp->mnt_vnodelist);
349 	rw_init(&mp->mnt_unmounting);
350  	mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
351 	mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE);
352 	error = vfs_busy(mp, NULL);
353 	KASSERT(error == 0);
354 	mutex_enter(&mp->mnt_updating);
355 
356 	mp->mnt_vnodecovered = vp;
357 	mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred);
358 	mount_initspecific(mp);
359 
360 	/*
361 	 * The underlying file system may refuse the mount for
362 	 * various reasons.  Allow the user to force it to happen.
363 	 *
364 	 * Set the mount level flags.
365 	 */
366 	mp->mnt_flag = flags &
367 	   (MNT_FORCE | MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
368 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
369 	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
370 	    MNT_IGNORE | MNT_RDONLY);
371 
372 	error = VFS_MOUNT(mp, path, data, data_len);
373 	mp->mnt_flag &= ~MNT_OP_FLAGS;
374 
375 	/*
376 	 * Put the new filesystem on the mount list after root.
377 	 */
378 	cache_purge(vp);
379 	if (error != 0) {
380 		vp->v_mountedhere = NULL;
381 		mutex_exit(&mp->mnt_updating);
382 		vfs_unbusy(mp, false, NULL);
383 		vfs_destroy(mp);
384 		return error;
385 	}
386 
387 	mp->mnt_iflag &= ~IMNT_WANTRDWR;
388 	mutex_enter(&mountlist_lock);
389 	vp->v_mountedhere = mp;
390 	CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
391 	mutex_exit(&mountlist_lock);
392     	vn_restorerecurse(vp, recurse);
393 	VOP_UNLOCK(vp, 0);
394 	checkdirs(vp);
395 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
396 		error = vfs_allocate_syncvnode(mp);
397 	/* Hold an additional reference to the mount across VFS_START(). */
398 	mutex_exit(&mp->mnt_updating);
399 	vfs_unbusy(mp, true, NULL);
400 	(void) VFS_STATVFS(mp, &mp->mnt_stat);
401 	error = VFS_START(mp, 0);
402 	if (error) {
403 		vrele(vp);
404 		vfs_destroy(mp);
405 	}
406 	/* Drop reference held for VFS_START(). */
407 	vfs_destroy(mp);
408 	*vpp = NULL;
409 	return error;
410 }
411 
412 static int
413 mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
414     void *data, size_t *data_len)
415 {
416 	struct mount *mp;
417 	int error;
418 
419 	/* If MNT_GETARGS is specified, it should be the only flag. */
420 	if (flags & ~MNT_GETARGS)
421 		return EINVAL;
422 
423 	mp = vp->v_mount;
424 
425 	/* XXX: probably some notion of "can see" here if we want isolation. */
426 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
427 	    KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
428 	if (error)
429 		return error;
430 
431 	if ((vp->v_vflag & VV_ROOT) == 0)
432 		return EINVAL;
433 
434 	if (vfs_busy(mp, NULL))
435 		return EPERM;
436 
437 	mutex_enter(&mp->mnt_updating);
438 	mp->mnt_flag &= ~MNT_OP_FLAGS;
439 	mp->mnt_flag |= MNT_GETARGS;
440 	error = VFS_MOUNT(mp, path, data, data_len);
441 	mp->mnt_flag &= ~MNT_OP_FLAGS;
442 	mutex_exit(&mp->mnt_updating);
443 
444 	vfs_unbusy(mp, false, NULL);
445 	return (error);
446 }
447 
448 #ifdef COMPAT_40
449 /* ARGSUSED */
450 int
451 compat_40_sys_mount(struct lwp *l, const struct compat_40_sys_mount_args *uap, register_t *retval)
452 {
453 	/* {
454 		syscallarg(const char *) type;
455 		syscallarg(const char *) path;
456 		syscallarg(int) flags;
457 		syscallarg(void *) data;
458 	} */
459 	register_t dummy;
460 
461 	return do_sys_mount(l, NULL, SCARG(uap, type), SCARG(uap, path),
462 	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE, 0, &dummy);
463 }
464 #endif
465 
466 int
467 sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval)
468 {
469 	/* {
470 		syscallarg(const char *) type;
471 		syscallarg(const char *) path;
472 		syscallarg(int) flags;
473 		syscallarg(void *) data;
474 		syscallarg(size_t) data_len;
475 	} */
476 
477 	return do_sys_mount(l, NULL, SCARG(uap, type), SCARG(uap, path),
478 	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE,
479 	    SCARG(uap, data_len), retval);
480 }
481 
482 int
483 do_sys_mount(struct lwp *l, struct vfsops *vfsops, const char *type,
484     const char *path, int flags, void *data, enum uio_seg data_seg,
485     size_t data_len, register_t *retval)
486 {
487 	struct vnode *vp;
488 	struct nameidata nd;
489 	void *data_buf = data;
490 	u_int recurse;
491 	int error;
492 
493 	/*
494 	 * Get vnode to be covered
495 	 */
496 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE, path);
497 	if ((error = namei(&nd)) != 0)
498 		return (error);
499 	vp = nd.ni_vp;
500 
501 	/*
502 	 * A lookup in VFS_MOUNT might result in an attempt to
503 	 * lock this vnode again, so make the lock recursive.
504 	 */
505 	if (vfsops == NULL) {
506 		if (flags & (MNT_GETARGS | MNT_UPDATE)) {
507 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
508 			recurse = vn_setrecurse(vp);
509 			vfsops = vp->v_mount->mnt_op;
510 		} else {
511 			/* 'type' is userspace */
512 			error = mount_get_vfsops(type, &vfsops);
513 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
514 			recurse = vn_setrecurse(vp);
515 			if (error != 0)
516 				goto done;
517 		}
518 	} else {
519 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
520 		recurse = vn_setrecurse(vp);
521 	}
522 
523 	if (data != NULL && data_seg == UIO_USERSPACE) {
524 		if (data_len == 0) {
525 			/* No length supplied, use default for filesystem */
526 			data_len = vfsops->vfs_min_mount_data;
527 			if (data_len > VFS_MAX_MOUNT_DATA) {
528 				/* maybe a force loaded old LKM */
529 				error = EINVAL;
530 				goto done;
531 			}
532 #ifdef COMPAT_30
533 			/* Hopefully a longer buffer won't make copyin() fail */
534 			if (flags & MNT_UPDATE
535 			    && data_len < sizeof (struct mnt_export_args30))
536 				data_len = sizeof (struct mnt_export_args30);
537 #endif
538 		}
539 		data_buf = malloc(data_len, M_TEMP, M_WAITOK);
540 
541 		/* NFS needs the buffer even for mnt_getargs .... */
542 		error = copyin(data, data_buf, data_len);
543 		if (error != 0)
544 			goto done;
545 	}
546 
547 	if (flags & MNT_GETARGS) {
548 		if (data_len == 0) {
549 			error = EINVAL;
550 			goto done;
551 		}
552 		error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
553 		if (error != 0)
554 			goto done;
555 		if (data_seg == UIO_USERSPACE)
556 			error = copyout(data_buf, data, data_len);
557 		*retval = data_len;
558 	} else if (flags & MNT_UPDATE) {
559 		error = mount_update(l, vp, path, flags, data_buf, &data_len);
560 	} else {
561 		/* Locking is handled internally in mount_domount(). */
562 		error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
563 		    &data_len, recurse);
564 	}
565 
566     done:
567     	if (vp != NULL) {
568 	    	vn_restorerecurse(vp, recurse);
569 	    	vput(vp);
570 	}
571 	if (data_buf != data)
572 		free(data_buf, M_TEMP);
573 	return (error);
574 }
575 
576 /*
577  * Scan all active processes to see if any of them have a current
578  * or root directory onto which the new filesystem has just been
579  * mounted. If so, replace them with the new mount point.
580  */
581 void
582 checkdirs(struct vnode *olddp)
583 {
584 	struct cwdinfo *cwdi;
585 	struct vnode *newdp, *rele1, *rele2;
586 	struct proc *p;
587 	bool retry;
588 
589 	if (olddp->v_usecount == 1)
590 		return;
591 	if (VFS_ROOT(olddp->v_mountedhere, &newdp))
592 		panic("mount: lost mount");
593 
594 	do {
595 		retry = false;
596 		mutex_enter(proc_lock);
597 		PROCLIST_FOREACH(p, &allproc) {
598 			if ((p->p_flag & PK_MARKER) != 0)
599 				continue;
600 			if ((cwdi = p->p_cwdi) == NULL)
601 				continue;
602 			/*
603 			 * Can't change to the old directory any more,
604 			 * so even if we see a stale value it's not a
605 			 * problem.
606 			 */
607 			if (cwdi->cwdi_cdir != olddp &&
608 			    cwdi->cwdi_rdir != olddp)
609 			    	continue;
610 			retry = true;
611 			rele1 = NULL;
612 			rele2 = NULL;
613 			atomic_inc_uint(&cwdi->cwdi_refcnt);
614 			mutex_exit(proc_lock);
615 			rw_enter(&cwdi->cwdi_lock, RW_WRITER);
616 			if (cwdi->cwdi_cdir == olddp) {
617 				rele1 = cwdi->cwdi_cdir;
618 				VREF(newdp);
619 				cwdi->cwdi_cdir = newdp;
620 			}
621 			if (cwdi->cwdi_rdir == olddp) {
622 				rele2 = cwdi->cwdi_rdir;
623 				VREF(newdp);
624 				cwdi->cwdi_rdir = newdp;
625 			}
626 			rw_exit(&cwdi->cwdi_lock);
627 			cwdfree(cwdi);
628 			if (rele1 != NULL)
629 				vrele(rele1);
630 			if (rele2 != NULL)
631 				vrele(rele2);
632 			mutex_enter(proc_lock);
633 			break;
634 		}
635 		mutex_exit(proc_lock);
636 	} while (retry);
637 
638 	if (rootvnode == olddp) {
639 		vrele(rootvnode);
640 		VREF(newdp);
641 		rootvnode = newdp;
642 	}
643 	vput(newdp);
644 }
645 
646 /*
647  * Unmount a file system.
648  *
649  * Note: unmount takes a path to the vnode mounted on as argument,
650  * not special file (as before).
651  */
652 /* ARGSUSED */
653 int
654 sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval)
655 {
656 	/* {
657 		syscallarg(const char *) path;
658 		syscallarg(int) flags;
659 	} */
660 	struct vnode *vp;
661 	struct mount *mp;
662 	int error;
663 	struct nameidata nd;
664 
665 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
666 	    SCARG(uap, path));
667 	if ((error = namei(&nd)) != 0)
668 		return (error);
669 	vp = nd.ni_vp;
670 	mp = vp->v_mount;
671 	atomic_inc_uint(&mp->mnt_refcnt);
672 	VOP_UNLOCK(vp, 0);
673 
674 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
675 	    KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
676 	if (error) {
677 		vrele(vp);
678 		vfs_destroy(mp);
679 		return (error);
680 	}
681 
682 	/*
683 	 * Don't allow unmounting the root file system.
684 	 */
685 	if (mp->mnt_flag & MNT_ROOTFS) {
686 		vrele(vp);
687 		vfs_destroy(mp);
688 		return (EINVAL);
689 	}
690 
691 	/*
692 	 * Must be the root of the filesystem
693 	 */
694 	if ((vp->v_vflag & VV_ROOT) == 0) {
695 		vrele(vp);
696 		vfs_destroy(mp);
697 		return (EINVAL);
698 	}
699 
700 	vrele(vp);
701 	error = dounmount(mp, SCARG(uap, flags), l);
702 	return error;
703 }
704 
705 /*
706  * Do the actual file system unmount.  File system is assumed to have
707  * been locked by the caller.
708  *
709  * => Caller gain reference to the mount, explicility for unmount.
710  * => Reference will be dropped in all cases.
711  */
712 int
713 dounmount(struct mount *mp, int flags, struct lwp *l)
714 {
715 	struct vnode *coveredvp;
716 	int error;
717 	int async;
718 	int used_syncer;
719 
720 #if NVERIEXEC > 0
721 	error = veriexec_unmountchk(mp);
722 	if (error)
723 		return (error);
724 #endif /* NVERIEXEC > 0 */
725 
726 	/*
727 	 * XXX Freeze syncer.  Must do this before locking the
728 	 * mount point.  See dounmount() for details.
729 	 */
730 	mutex_enter(&syncer_mutex);
731 	rw_enter(&mp->mnt_unmounting, RW_WRITER);
732 	if ((mp->mnt_iflag & IMNT_GONE) != 0) {
733 		rw_exit(&mp->mnt_unmounting);
734 		mutex_exit(&syncer_mutex);
735 		vfs_destroy(mp);
736 		return ENOENT;
737 	}
738 
739 	used_syncer = (mp->mnt_syncer != NULL);
740 
741 	/*
742 	 * XXX Syncer must be frozen when we get here.  This should really
743 	 * be done on a per-mountpoint basis, but especially the softdep
744 	 * code possibly called from the syncer doesn't exactly work on a
745 	 * per-mountpoint basis, so the softdep code would become a maze
746 	 * of vfs_busy() calls.
747 	 *
748 	 * The caller of dounmount() must acquire syncer_mutex because
749 	 * the syncer itself acquires locks in syncer_mutex -> vfs_busy
750 	 * order, and we must preserve that order to avoid deadlock.
751 	 *
752 	 * So, if the file system did not use the syncer, now is
753 	 * the time to release the syncer_mutex.
754 	 */
755 	if (used_syncer == 0)
756 		mutex_exit(&syncer_mutex);
757 
758 	mp->mnt_iflag |= IMNT_UNMOUNT;
759 	async = mp->mnt_flag & MNT_ASYNC;
760 	mp->mnt_flag &= ~MNT_ASYNC;
761 	cache_purgevfs(mp);	/* remove cache entries for this file sys */
762 	if (mp->mnt_syncer != NULL)
763 		vfs_deallocate_syncvnode(mp);
764 	error = 0;
765 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
766 #if NFSS > 0
767 		error = fss_umount_hook(mp, (flags & MNT_FORCE));
768 #endif
769 		if (error == 0)
770 			error = VFS_SYNC(mp, MNT_WAIT, l->l_cred);
771 	}
772 	vfs_scrubvnlist(mp);
773 	if (error == 0 || (flags & MNT_FORCE))
774 		error = VFS_UNMOUNT(mp, flags);
775 	if (error) {
776 		if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
777 			(void) vfs_allocate_syncvnode(mp);
778 		mp->mnt_iflag &= ~IMNT_UNMOUNT;
779 		mp->mnt_flag |= async;
780 		rw_exit(&mp->mnt_unmounting);
781 		if (used_syncer)
782 			mutex_exit(&syncer_mutex);
783 		return (error);
784 	}
785 	vfs_scrubvnlist(mp);
786 	mutex_enter(&mountlist_lock);
787 	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP)
788 		coveredvp->v_mountedhere = NULL;
789 	CIRCLEQ_REMOVE(&mountlist, mp, mnt_list);
790 	mp->mnt_iflag |= IMNT_GONE;
791 	mutex_exit(&mountlist_lock);
792 	if (TAILQ_FIRST(&mp->mnt_vnodelist) != NULL)
793 		panic("unmount: dangling vnode");
794 	if (used_syncer)
795 		mutex_exit(&syncer_mutex);
796 	vfs_hooks_unmount(mp);
797 	rw_exit(&mp->mnt_unmounting);
798 	vfs_destroy(mp);	/* caller provided reference */
799 	vfs_destroy(mp);	/* from mount(), final nail in coffin */
800 	if (coveredvp != NULLVP)
801 		vrele(coveredvp);
802 	return (0);
803 }
804 
805 /*
806  * Sync each mounted filesystem.
807  */
808 #ifdef DEBUG
809 int syncprt = 0;
810 struct ctldebug debug0 = { "syncprt", &syncprt };
811 #endif
812 
813 /* ARGSUSED */
814 int
815 sys_sync(struct lwp *l, const void *v, register_t *retval)
816 {
817 	struct mount *mp, *nmp;
818 	int asyncflag;
819 
820 	if (l == NULL)
821 		l = &lwp0;
822 
823 	mutex_enter(&mountlist_lock);
824 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
825 	     mp = nmp) {
826 		if (vfs_busy(mp, &nmp)) {
827 			continue;
828 		}
829 		mutex_enter(&mp->mnt_updating);
830 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
831 			asyncflag = mp->mnt_flag & MNT_ASYNC;
832 			mp->mnt_flag &= ~MNT_ASYNC;
833 			VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
834 			if (asyncflag)
835 				 mp->mnt_flag |= MNT_ASYNC;
836 		}
837 		mutex_exit(&mp->mnt_updating);
838 		vfs_unbusy(mp, false, &nmp);
839 	}
840 	mutex_exit(&mountlist_lock);
841 #ifdef DEBUG
842 	if (syncprt)
843 		vfs_bufstats();
844 #endif /* DEBUG */
845 	return (0);
846 }
847 
848 /*
849  * Change filesystem quotas.
850  */
851 /* ARGSUSED */
852 int
853 sys_quotactl(struct lwp *l, const struct sys_quotactl_args *uap, register_t *retval)
854 {
855 	/* {
856 		syscallarg(const char *) path;
857 		syscallarg(int) cmd;
858 		syscallarg(int) uid;
859 		syscallarg(void *) arg;
860 	} */
861 	struct mount *mp;
862 	int error;
863 	struct nameidata nd;
864 
865 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
866 	    SCARG(uap, path));
867 	if ((error = namei(&nd)) != 0)
868 		return (error);
869 	mp = nd.ni_vp->v_mount;
870 	error = VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid),
871 	    SCARG(uap, arg));
872 	vrele(nd.ni_vp);
873 	return (error);
874 }
875 
876 int
877 dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
878     int root)
879 {
880 	struct cwdinfo *cwdi = l->l_proc->p_cwdi;
881 	int error = 0;
882 
883 	/*
884 	 * If MNT_NOWAIT or MNT_LAZY is specified, do not
885 	 * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
886 	 * overrides MNT_NOWAIT.
887 	 */
888 	if (flags == MNT_NOWAIT	|| flags == MNT_LAZY ||
889 	    (flags != MNT_WAIT && flags != 0)) {
890 		memcpy(sp, &mp->mnt_stat, sizeof(*sp));
891 		goto done;
892 	}
893 
894 	/* Get the filesystem stats now */
895 	memset(sp, 0, sizeof(*sp));
896 	if ((error = VFS_STATVFS(mp, sp)) != 0) {
897 		return error;
898 	}
899 
900 	if (cwdi->cwdi_rdir == NULL)
901 		(void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
902 done:
903 	if (cwdi->cwdi_rdir != NULL) {
904 		size_t len;
905 		char *bp;
906 		char *path = PNBUF_GET();
907 
908 		bp = path + MAXPATHLEN;
909 		*--bp = '\0';
910 		rw_enter(&cwdi->cwdi_lock, RW_READER);
911 		error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
912 		    MAXPATHLEN / 2, 0, l);
913 		rw_exit(&cwdi->cwdi_lock);
914 		if (error) {
915 			PNBUF_PUT(path);
916 			return error;
917 		}
918 		len = strlen(bp);
919 		/*
920 		 * for mount points that are below our root, we can see
921 		 * them, so we fix up the pathname and return them. The
922 		 * rest we cannot see, so we don't allow viewing the
923 		 * data.
924 		 */
925 		if (strncmp(bp, sp->f_mntonname, len) == 0) {
926 			strlcpy(sp->f_mntonname, &sp->f_mntonname[len],
927 			    sizeof(sp->f_mntonname));
928 			if (sp->f_mntonname[0] == '\0')
929 				(void)strlcpy(sp->f_mntonname, "/",
930 				    sizeof(sp->f_mntonname));
931 		} else {
932 			if (root)
933 				(void)strlcpy(sp->f_mntonname, "/",
934 				    sizeof(sp->f_mntonname));
935 			else
936 				error = EPERM;
937 		}
938 		PNBUF_PUT(path);
939 	}
940 	sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
941 	return error;
942 }
943 
944 /*
945  * Get filesystem statistics by path.
946  */
947 int
948 do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
949 {
950 	struct mount *mp;
951 	int error;
952 	struct nameidata nd;
953 
954 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE, path);
955 	if ((error = namei(&nd)) != 0)
956 		return error;
957 	mp = nd.ni_vp->v_mount;
958 	error = dostatvfs(mp, sb, l, flags, 1);
959 	vrele(nd.ni_vp);
960 	return error;
961 }
962 
963 /* ARGSUSED */
964 int
965 sys_statvfs1(struct lwp *l, const struct sys_statvfs1_args *uap, register_t *retval)
966 {
967 	/* {
968 		syscallarg(const char *) path;
969 		syscallarg(struct statvfs *) buf;
970 		syscallarg(int) flags;
971 	} */
972 	struct statvfs *sb;
973 	int error;
974 
975 	sb = STATVFSBUF_GET();
976 	error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
977 	if (error == 0)
978 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
979 	STATVFSBUF_PUT(sb);
980 	return error;
981 }
982 
983 /*
984  * Get filesystem statistics by fd.
985  */
986 int
987 do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
988 {
989 	file_t *fp;
990 	struct mount *mp;
991 	int error;
992 
993 	/* fd_getvnode() will use the descriptor for us */
994 	if ((error = fd_getvnode(fd, &fp)) != 0)
995 		return (error);
996 	mp = ((struct vnode *)fp->f_data)->v_mount;
997 	error = dostatvfs(mp, sb, curlwp, flags, 1);
998 	fd_putfile(fd);
999 	return error;
1000 }
1001 
1002 /* ARGSUSED */
1003 int
1004 sys_fstatvfs1(struct lwp *l, const struct sys_fstatvfs1_args *uap, register_t *retval)
1005 {
1006 	/* {
1007 		syscallarg(int) fd;
1008 		syscallarg(struct statvfs *) buf;
1009 		syscallarg(int) flags;
1010 	} */
1011 	struct statvfs *sb;
1012 	int error;
1013 
1014 	sb = STATVFSBUF_GET();
1015 	error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
1016 	if (error == 0)
1017 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1018 	STATVFSBUF_PUT(sb);
1019 	return error;
1020 }
1021 
1022 
1023 /*
1024  * Get statistics on all filesystems.
1025  */
1026 int
1027 do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
1028     int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
1029     register_t *retval)
1030 {
1031 	int root = 0;
1032 	struct proc *p = l->l_proc;
1033 	struct mount *mp, *nmp;
1034 	struct statvfs *sb;
1035 	size_t count, maxcount;
1036 	int error = 0;
1037 
1038 	sb = STATVFSBUF_GET();
1039 	maxcount = bufsize / entry_sz;
1040 	mutex_enter(&mountlist_lock);
1041 	count = 0;
1042 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
1043 	     mp = nmp) {
1044 		if (vfs_busy(mp, &nmp)) {
1045 			continue;
1046 		}
1047 		if (sfsp && count < maxcount) {
1048 			error = dostatvfs(mp, sb, l, flags, 0);
1049 			if (error) {
1050 				vfs_unbusy(mp, false, &nmp);
1051 				continue;
1052 			}
1053 			error = copyfn(sb, sfsp, entry_sz);
1054 			if (error) {
1055 				vfs_unbusy(mp, false, NULL);
1056 				goto out;
1057 			}
1058 			sfsp = (char *)sfsp + entry_sz;
1059 			root |= strcmp(sb->f_mntonname, "/") == 0;
1060 		}
1061 		count++;
1062 		vfs_unbusy(mp, false, &nmp);
1063 	}
1064 	mutex_exit(&mountlist_lock);
1065 
1066 	if (root == 0 && p->p_cwdi->cwdi_rdir) {
1067 		/*
1068 		 * fake a root entry
1069 		 */
1070 		error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
1071 		    sb, l, flags, 1);
1072 		if (error != 0)
1073 			goto out;
1074 		if (sfsp)
1075 			error = copyfn(sb, sfsp, entry_sz);
1076 		count++;
1077 	}
1078 	if (sfsp && count > maxcount)
1079 		*retval = maxcount;
1080 	else
1081 		*retval = count;
1082 out:
1083 	STATVFSBUF_PUT(sb);
1084 	return error;
1085 }
1086 
1087 int
1088 sys_getvfsstat(struct lwp *l, const struct sys_getvfsstat_args *uap, register_t *retval)
1089 {
1090 	/* {
1091 		syscallarg(struct statvfs *) buf;
1092 		syscallarg(size_t) bufsize;
1093 		syscallarg(int) flags;
1094 	} */
1095 
1096 	return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
1097 	    SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
1098 }
1099 
1100 /*
1101  * Change current working directory to a given file descriptor.
1102  */
1103 /* ARGSUSED */
1104 int
1105 sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval)
1106 {
1107 	/* {
1108 		syscallarg(int) fd;
1109 	} */
1110 	struct proc *p = l->l_proc;
1111 	struct cwdinfo *cwdi;
1112 	struct vnode *vp, *tdp;
1113 	struct mount *mp;
1114 	file_t *fp;
1115 	int error, fd;
1116 
1117 	/* fd_getvnode() will use the descriptor for us */
1118 	fd = SCARG(uap, fd);
1119 	if ((error = fd_getvnode(fd, &fp)) != 0)
1120 		return (error);
1121 	vp = fp->f_data;
1122 
1123 	VREF(vp);
1124 	vn_lock(vp,  LK_EXCLUSIVE | LK_RETRY);
1125 	if (vp->v_type != VDIR)
1126 		error = ENOTDIR;
1127 	else
1128 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1129 	if (error) {
1130 		vput(vp);
1131 		goto out;
1132 	}
1133 	while ((mp = vp->v_mountedhere) != NULL) {
1134 		error = vfs_busy(mp, NULL);
1135 		vput(vp);
1136 		if (error != 0)
1137 			goto out;
1138 		error = VFS_ROOT(mp, &tdp);
1139 		vfs_unbusy(mp, false, NULL);
1140 		if (error)
1141 			goto out;
1142 		vp = tdp;
1143 	}
1144 	VOP_UNLOCK(vp, 0);
1145 
1146 	/*
1147 	 * Disallow changing to a directory not under the process's
1148 	 * current root directory (if there is one).
1149 	 */
1150 	cwdi = p->p_cwdi;
1151 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1152 	if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
1153 		vrele(vp);
1154 		error = EPERM;	/* operation not permitted */
1155 	} else {
1156 		vrele(cwdi->cwdi_cdir);
1157 		cwdi->cwdi_cdir = vp;
1158 	}
1159 	rw_exit(&cwdi->cwdi_lock);
1160 
1161  out:
1162 	fd_putfile(fd);
1163 	return (error);
1164 }
1165 
1166 /*
1167  * Change this process's notion of the root directory to a given file
1168  * descriptor.
1169  */
1170 int
1171 sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval)
1172 {
1173 	struct proc *p = l->l_proc;
1174 	struct cwdinfo *cwdi;
1175 	struct vnode	*vp;
1176 	file_t	*fp;
1177 	int		 error, fd = SCARG(uap, fd);
1178 
1179 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1180  	    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
1181 		return error;
1182 	/* fd_getvnode() will use the descriptor for us */
1183 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
1184 		return error;
1185 	vp = fp->f_data;
1186 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1187 	if (vp->v_type != VDIR)
1188 		error = ENOTDIR;
1189 	else
1190 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1191 	VOP_UNLOCK(vp, 0);
1192 	if (error)
1193 		goto out;
1194 	VREF(vp);
1195 
1196 	/*
1197 	 * Prevent escaping from chroot by putting the root under
1198 	 * the working directory.  Silently chdir to / if we aren't
1199 	 * already there.
1200 	 */
1201 	cwdi = p->p_cwdi;
1202 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1203 	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1204 		/*
1205 		 * XXX would be more failsafe to change directory to a
1206 		 * deadfs node here instead
1207 		 */
1208 		vrele(cwdi->cwdi_cdir);
1209 		VREF(vp);
1210 		cwdi->cwdi_cdir = vp;
1211 	}
1212 
1213 	if (cwdi->cwdi_rdir != NULL)
1214 		vrele(cwdi->cwdi_rdir);
1215 	cwdi->cwdi_rdir = vp;
1216 	rw_exit(&cwdi->cwdi_lock);
1217 
1218  out:
1219 	fd_putfile(fd);
1220 	return (error);
1221 }
1222 
1223 /*
1224  * Change current working directory (``.'').
1225  */
1226 /* ARGSUSED */
1227 int
1228 sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
1229 {
1230 	/* {
1231 		syscallarg(const char *) path;
1232 	} */
1233 	struct proc *p = l->l_proc;
1234 	struct cwdinfo *cwdi;
1235 	int error;
1236 	struct nameidata nd;
1237 
1238 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
1239 	    SCARG(uap, path));
1240 	if ((error = change_dir(&nd, l)) != 0)
1241 		return (error);
1242 	cwdi = p->p_cwdi;
1243 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1244 	vrele(cwdi->cwdi_cdir);
1245 	cwdi->cwdi_cdir = nd.ni_vp;
1246 	rw_exit(&cwdi->cwdi_lock);
1247 	return (0);
1248 }
1249 
1250 /*
1251  * Change notion of root (``/'') directory.
1252  */
1253 /* ARGSUSED */
1254 int
1255 sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval)
1256 {
1257 	/* {
1258 		syscallarg(const char *) path;
1259 	} */
1260 	struct proc *p = l->l_proc;
1261 	struct cwdinfo *cwdi;
1262 	struct vnode *vp;
1263 	int error;
1264 	struct nameidata nd;
1265 
1266 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1267 	    KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
1268 		return (error);
1269 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
1270 	    SCARG(uap, path));
1271 	if ((error = change_dir(&nd, l)) != 0)
1272 		return (error);
1273 
1274 	cwdi = p->p_cwdi;
1275 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1276 	if (cwdi->cwdi_rdir != NULL)
1277 		vrele(cwdi->cwdi_rdir);
1278 	vp = nd.ni_vp;
1279 	cwdi->cwdi_rdir = vp;
1280 
1281 	/*
1282 	 * Prevent escaping from chroot by putting the root under
1283 	 * the working directory.  Silently chdir to / if we aren't
1284 	 * already there.
1285 	 */
1286 	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1287 		/*
1288 		 * XXX would be more failsafe to change directory to a
1289 		 * deadfs node here instead
1290 		 */
1291 		vrele(cwdi->cwdi_cdir);
1292 		VREF(vp);
1293 		cwdi->cwdi_cdir = vp;
1294 	}
1295 	rw_exit(&cwdi->cwdi_lock);
1296 
1297 	return (0);
1298 }
1299 
1300 /*
1301  * Common routine for chroot and chdir.
1302  */
1303 static int
1304 change_dir(struct nameidata *ndp, struct lwp *l)
1305 {
1306 	struct vnode *vp;
1307 	int error;
1308 
1309 	if ((error = namei(ndp)) != 0)
1310 		return (error);
1311 	vp = ndp->ni_vp;
1312 	if (vp->v_type != VDIR)
1313 		error = ENOTDIR;
1314 	else
1315 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1316 
1317 	if (error)
1318 		vput(vp);
1319 	else
1320 		VOP_UNLOCK(vp, 0);
1321 	return (error);
1322 }
1323 
1324 /*
1325  * Check permissions, allocate an open file structure,
1326  * and call the device open routine if any.
1327  */
1328 int
1329 sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1330 {
1331 	/* {
1332 		syscallarg(const char *) path;
1333 		syscallarg(int) flags;
1334 		syscallarg(int) mode;
1335 	} */
1336 	struct proc *p = l->l_proc;
1337 	struct cwdinfo *cwdi = p->p_cwdi;
1338 	file_t *fp;
1339 	struct vnode *vp;
1340 	int flags, cmode;
1341 	int type, indx, error;
1342 	struct flock lf;
1343 	struct nameidata nd;
1344 
1345 	flags = FFLAGS(SCARG(uap, flags));
1346 	if ((flags & (FREAD | FWRITE)) == 0)
1347 		return (EINVAL);
1348 	if ((error = fd_allocfile(&fp, &indx)) != 0)
1349 		return (error);
1350 	/* We're going to read cwdi->cwdi_cmask unlocked here. */
1351 	cmode = ((SCARG(uap, mode) &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1352 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
1353 	    SCARG(uap, path));
1354 	l->l_dupfd = -indx - 1;			/* XXX check for fdopen */
1355 	if ((error = vn_open(&nd, flags, cmode)) != 0) {
1356 		fd_abort(p, fp, indx);
1357 		if ((error == EDUPFD || error == EMOVEFD) &&
1358 		    l->l_dupfd >= 0 &&			/* XXX from fdopen */
1359 		    (error =
1360 			fd_dupopen(l->l_dupfd, &indx, flags, error)) == 0) {
1361 			*retval = indx;
1362 			return (0);
1363 		}
1364 		if (error == ERESTART)
1365 			error = EINTR;
1366 		return (error);
1367 	}
1368 
1369 	l->l_dupfd = 0;
1370 	vp = nd.ni_vp;
1371 	fp->f_flag = flags & FMASK;
1372 	fp->f_type = DTYPE_VNODE;
1373 	fp->f_ops = &vnops;
1374 	fp->f_data = vp;
1375 	if (flags & (O_EXLOCK | O_SHLOCK)) {
1376 		lf.l_whence = SEEK_SET;
1377 		lf.l_start = 0;
1378 		lf.l_len = 0;
1379 		if (flags & O_EXLOCK)
1380 			lf.l_type = F_WRLCK;
1381 		else
1382 			lf.l_type = F_RDLCK;
1383 		type = F_FLOCK;
1384 		if ((flags & FNONBLOCK) == 0)
1385 			type |= F_WAIT;
1386 		VOP_UNLOCK(vp, 0);
1387 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
1388 		if (error) {
1389 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
1390 			fd_abort(p, fp, indx);
1391 			return (error);
1392 		}
1393 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1394 		atomic_or_uint(&fp->f_flag, FHASLOCK);
1395 	}
1396 	VOP_UNLOCK(vp, 0);
1397 	*retval = indx;
1398 	fd_affix(p, fp, indx);
1399 	return (0);
1400 }
1401 
1402 static void
1403 vfs__fhfree(fhandle_t *fhp)
1404 {
1405 	size_t fhsize;
1406 
1407 	if (fhp == NULL) {
1408 		return;
1409 	}
1410 	fhsize = FHANDLE_SIZE(fhp);
1411 	kmem_free(fhp, fhsize);
1412 }
1413 
1414 /*
1415  * vfs_composefh: compose a filehandle.
1416  */
1417 
1418 int
1419 vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1420 {
1421 	struct mount *mp;
1422 	struct fid *fidp;
1423 	int error;
1424 	size_t needfhsize;
1425 	size_t fidsize;
1426 
1427 	mp = vp->v_mount;
1428 	fidp = NULL;
1429 	if (*fh_size < FHANDLE_SIZE_MIN) {
1430 		fidsize = 0;
1431 	} else {
1432 		fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1433 		if (fhp != NULL) {
1434 			memset(fhp, 0, *fh_size);
1435 			fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1436 			fidp = &fhp->fh_fid;
1437 		}
1438 	}
1439 	error = VFS_VPTOFH(vp, fidp, &fidsize);
1440 	needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1441 	if (error == 0 && *fh_size < needfhsize) {
1442 		error = E2BIG;
1443 	}
1444 	*fh_size = needfhsize;
1445 	return error;
1446 }
1447 
1448 int
1449 vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1450 {
1451 	struct mount *mp;
1452 	fhandle_t *fhp;
1453 	size_t fhsize;
1454 	size_t fidsize;
1455 	int error;
1456 
1457 	*fhpp = NULL;
1458 	mp = vp->v_mount;
1459 	fidsize = 0;
1460 	error = VFS_VPTOFH(vp, NULL, &fidsize);
1461 	KASSERT(error != 0);
1462 	if (error != E2BIG) {
1463 		goto out;
1464 	}
1465 	fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1466 	fhp = kmem_zalloc(fhsize, KM_SLEEP);
1467 	if (fhp == NULL) {
1468 		error = ENOMEM;
1469 		goto out;
1470 	}
1471 	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1472 	error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1473 	if (error == 0) {
1474 		KASSERT((FHANDLE_SIZE(fhp) == fhsize &&
1475 		    FHANDLE_FILEID(fhp)->fid_len == fidsize));
1476 		*fhpp = fhp;
1477 	} else {
1478 		kmem_free(fhp, fhsize);
1479 	}
1480 out:
1481 	return error;
1482 }
1483 
1484 void
1485 vfs_composefh_free(fhandle_t *fhp)
1486 {
1487 
1488 	vfs__fhfree(fhp);
1489 }
1490 
1491 /*
1492  * vfs_fhtovp: lookup a vnode by a filehandle.
1493  */
1494 
1495 int
1496 vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
1497 {
1498 	struct mount *mp;
1499 	int error;
1500 
1501 	*vpp = NULL;
1502 	mp = vfs_getvfs(FHANDLE_FSID(fhp));
1503 	if (mp == NULL) {
1504 		error = ESTALE;
1505 		goto out;
1506 	}
1507 	if (mp->mnt_op->vfs_fhtovp == NULL) {
1508 		error = EOPNOTSUPP;
1509 		goto out;
1510 	}
1511 	error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), vpp);
1512 out:
1513 	return error;
1514 }
1515 
1516 /*
1517  * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
1518  * the needed size.
1519  */
1520 
1521 int
1522 vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
1523 {
1524 	fhandle_t *fhp;
1525 	int error;
1526 
1527 	*fhpp = NULL;
1528 	if (fhsize > FHANDLE_SIZE_MAX) {
1529 		return EINVAL;
1530 	}
1531 	if (fhsize < FHANDLE_SIZE_MIN) {
1532 		return EINVAL;
1533 	}
1534 again:
1535 	fhp = kmem_alloc(fhsize, KM_SLEEP);
1536 	if (fhp == NULL) {
1537 		return ENOMEM;
1538 	}
1539 	error = copyin(ufhp, fhp, fhsize);
1540 	if (error == 0) {
1541 		/* XXX this check shouldn't be here */
1542 		if (FHANDLE_SIZE(fhp) == fhsize) {
1543 			*fhpp = fhp;
1544 			return 0;
1545 		} else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
1546 			/*
1547 			 * a kludge for nfsv2 padded handles.
1548 			 */
1549 			size_t sz;
1550 
1551 			sz = FHANDLE_SIZE(fhp);
1552 			kmem_free(fhp, fhsize);
1553 			fhsize = sz;
1554 			goto again;
1555 		} else {
1556 			/*
1557 			 * userland told us wrong size.
1558 			 */
1559 		    	error = EINVAL;
1560 		}
1561 	}
1562 	kmem_free(fhp, fhsize);
1563 	return error;
1564 }
1565 
1566 void
1567 vfs_copyinfh_free(fhandle_t *fhp)
1568 {
1569 
1570 	vfs__fhfree(fhp);
1571 }
1572 
1573 /*
1574  * Get file handle system call
1575  */
1576 int
1577 sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval)
1578 {
1579 	/* {
1580 		syscallarg(char *) fname;
1581 		syscallarg(fhandle_t *) fhp;
1582 		syscallarg(size_t *) fh_size;
1583 	} */
1584 	struct vnode *vp;
1585 	fhandle_t *fh;
1586 	int error;
1587 	struct nameidata nd;
1588 	size_t sz;
1589 	size_t usz;
1590 
1591 	/*
1592 	 * Must be super user
1593 	 */
1594 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1595 	    0, NULL, NULL, NULL);
1596 	if (error)
1597 		return (error);
1598 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
1599 	    SCARG(uap, fname));
1600 	error = namei(&nd);
1601 	if (error)
1602 		return (error);
1603 	vp = nd.ni_vp;
1604 	error = vfs_composefh_alloc(vp, &fh);
1605 	vput(vp);
1606 	if (error != 0) {
1607 		goto out;
1608 	}
1609 	error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
1610 	if (error != 0) {
1611 		goto out;
1612 	}
1613 	sz = FHANDLE_SIZE(fh);
1614 	error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
1615 	if (error != 0) {
1616 		goto out;
1617 	}
1618 	if (usz >= sz) {
1619 		error = copyout(fh, SCARG(uap, fhp), sz);
1620 	} else {
1621 		error = E2BIG;
1622 	}
1623 out:
1624 	vfs_composefh_free(fh);
1625 	return (error);
1626 }
1627 
1628 /*
1629  * Open a file given a file handle.
1630  *
1631  * Check permissions, allocate an open file structure,
1632  * and call the device open routine if any.
1633  */
1634 
1635 int
1636 dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
1637     register_t *retval)
1638 {
1639 	file_t *fp;
1640 	struct vnode *vp = NULL;
1641 	kauth_cred_t cred = l->l_cred;
1642 	file_t *nfp;
1643 	int type, indx, error=0;
1644 	struct flock lf;
1645 	struct vattr va;
1646 	fhandle_t *fh;
1647 	int flags;
1648 	proc_t *p;
1649 
1650 	p = curproc;
1651 
1652 	/*
1653 	 * Must be super user
1654 	 */
1655 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1656 	    0, NULL, NULL, NULL)))
1657 		return (error);
1658 
1659 	flags = FFLAGS(oflags);
1660 	if ((flags & (FREAD | FWRITE)) == 0)
1661 		return (EINVAL);
1662 	if ((flags & O_CREAT))
1663 		return (EINVAL);
1664 	if ((error = fd_allocfile(&nfp, &indx)) != 0)
1665 		return (error);
1666 	fp = nfp;
1667 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1668 	if (error != 0) {
1669 		goto bad;
1670 	}
1671 	error = vfs_fhtovp(fh, &vp);
1672 	if (error != 0) {
1673 		goto bad;
1674 	}
1675 
1676 	/* Now do an effective vn_open */
1677 
1678 	if (vp->v_type == VSOCK) {
1679 		error = EOPNOTSUPP;
1680 		goto bad;
1681 	}
1682 	error = vn_openchk(vp, cred, flags);
1683 	if (error != 0)
1684 		goto bad;
1685 	if (flags & O_TRUNC) {
1686 		VOP_UNLOCK(vp, 0);			/* XXX */
1687 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
1688 		VATTR_NULL(&va);
1689 		va.va_size = 0;
1690 		error = VOP_SETATTR(vp, &va, cred);
1691 		if (error)
1692 			goto bad;
1693 	}
1694 	if ((error = VOP_OPEN(vp, flags, cred)) != 0)
1695 		goto bad;
1696 	if (flags & FWRITE) {
1697 		mutex_enter(&vp->v_interlock);
1698 		vp->v_writecount++;
1699 		mutex_exit(&vp->v_interlock);
1700 	}
1701 
1702 	/* done with modified vn_open, now finish what sys_open does. */
1703 
1704 	fp->f_flag = flags & FMASK;
1705 	fp->f_type = DTYPE_VNODE;
1706 	fp->f_ops = &vnops;
1707 	fp->f_data = vp;
1708 	if (flags & (O_EXLOCK | O_SHLOCK)) {
1709 		lf.l_whence = SEEK_SET;
1710 		lf.l_start = 0;
1711 		lf.l_len = 0;
1712 		if (flags & O_EXLOCK)
1713 			lf.l_type = F_WRLCK;
1714 		else
1715 			lf.l_type = F_RDLCK;
1716 		type = F_FLOCK;
1717 		if ((flags & FNONBLOCK) == 0)
1718 			type |= F_WAIT;
1719 		VOP_UNLOCK(vp, 0);
1720 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
1721 		if (error) {
1722 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
1723 			fd_abort(p, fp, indx);
1724 			return (error);
1725 		}
1726 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1727 		atomic_or_uint(&fp->f_flag, FHASLOCK);
1728 	}
1729 	VOP_UNLOCK(vp, 0);
1730 	*retval = indx;
1731 	fd_affix(p, fp, indx);
1732 	vfs_copyinfh_free(fh);
1733 	return (0);
1734 
1735 bad:
1736 	fd_abort(p, fp, indx);
1737 	if (vp != NULL)
1738 		vput(vp);
1739 	vfs_copyinfh_free(fh);
1740 	return (error);
1741 }
1742 
1743 int
1744 sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval)
1745 {
1746 	/* {
1747 		syscallarg(const void *) fhp;
1748 		syscallarg(size_t) fh_size;
1749 		syscallarg(int) flags;
1750 	} */
1751 
1752 	return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
1753 	    SCARG(uap, flags), retval);
1754 }
1755 
1756 int
1757 do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
1758 {
1759 	int error;
1760 	fhandle_t *fh;
1761 	struct vnode *vp;
1762 
1763 	/*
1764 	 * Must be super user
1765 	 */
1766 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1767 	    0, NULL, NULL, NULL)))
1768 		return (error);
1769 
1770 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1771 	if (error != 0)
1772 		return error;
1773 
1774 	error = vfs_fhtovp(fh, &vp);
1775 	vfs_copyinfh_free(fh);
1776 	if (error != 0)
1777 		return error;
1778 
1779 	error = vn_stat(vp, sb);
1780 	vput(vp);
1781 	return error;
1782 }
1783 
1784 
1785 /* ARGSUSED */
1786 int
1787 sys___fhstat40(struct lwp *l, const struct sys___fhstat40_args *uap, register_t *retval)
1788 {
1789 	/* {
1790 		syscallarg(const void *) fhp;
1791 		syscallarg(size_t) fh_size;
1792 		syscallarg(struct stat *) sb;
1793 	} */
1794 	struct stat sb;
1795 	int error;
1796 
1797 	error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
1798 	if (error)
1799 		return error;
1800 	return copyout(&sb, SCARG(uap, sb), sizeof(sb));
1801 }
1802 
1803 int
1804 do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb,
1805     int flags)
1806 {
1807 	fhandle_t *fh;
1808 	struct mount *mp;
1809 	struct vnode *vp;
1810 	int error;
1811 
1812 	/*
1813 	 * Must be super user
1814 	 */
1815 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1816 	    0, NULL, NULL, NULL)))
1817 		return error;
1818 
1819 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1820 	if (error != 0)
1821 		return error;
1822 
1823 	error = vfs_fhtovp(fh, &vp);
1824 	vfs_copyinfh_free(fh);
1825 	if (error != 0)
1826 		return error;
1827 
1828 	mp = vp->v_mount;
1829 	error = dostatvfs(mp, sb, l, flags, 1);
1830 	vput(vp);
1831 	return error;
1832 }
1833 
1834 /* ARGSUSED */
1835 int
1836 sys___fhstatvfs140(struct lwp *l, const struct sys___fhstatvfs140_args *uap, register_t *retval)
1837 {
1838 	/* {
1839 		syscallarg(const void *) fhp;
1840 		syscallarg(size_t) fh_size;
1841 		syscallarg(struct statvfs *) buf;
1842 		syscallarg(int)	flags;
1843 	} */
1844 	struct statvfs *sb = STATVFSBUF_GET();
1845 	int error;
1846 
1847 	error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
1848 	    SCARG(uap, flags));
1849 	if (error == 0)
1850 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1851 	STATVFSBUF_PUT(sb);
1852 	return error;
1853 }
1854 
1855 /*
1856  * Create a special file.
1857  */
1858 /* ARGSUSED */
1859 int
1860 sys_mknod(struct lwp *l, const struct sys_mknod_args *uap, register_t *retval)
1861 {
1862 	/* {
1863 		syscallarg(const char *) path;
1864 		syscallarg(int) mode;
1865 		syscallarg(int) dev;
1866 	} */
1867 	struct proc *p = l->l_proc;
1868 	struct vnode *vp;
1869 	struct vattr vattr;
1870 	int error, optype;
1871 	struct nameidata nd;
1872 	char *path;
1873 	const char *cpath;
1874 	enum uio_seg seg = UIO_USERSPACE;
1875 
1876 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
1877 	    0, NULL, NULL, NULL)) != 0)
1878 		return (error);
1879 
1880 	optype = VOP_MKNOD_DESCOFFSET;
1881 
1882 	VERIEXEC_PATH_GET(SCARG(uap, path), seg, cpath, path);
1883 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, seg, cpath);
1884 
1885 	if ((error = namei(&nd)) != 0)
1886 		goto out;
1887 	vp = nd.ni_vp;
1888 	if (vp != NULL)
1889 		error = EEXIST;
1890 	else {
1891 		VATTR_NULL(&vattr);
1892 		/* We will read cwdi->cwdi_cmask unlocked. */
1893 		vattr.va_mode =
1894 		    (SCARG(uap, mode) & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
1895 		vattr.va_rdev = SCARG(uap, dev);
1896 
1897 		switch (SCARG(uap, mode) & S_IFMT) {
1898 		case S_IFMT:	/* used by badsect to flag bad sectors */
1899 			vattr.va_type = VBAD;
1900 			break;
1901 		case S_IFCHR:
1902 			vattr.va_type = VCHR;
1903 			break;
1904 		case S_IFBLK:
1905 			vattr.va_type = VBLK;
1906 			break;
1907 		case S_IFWHT:
1908 			optype = VOP_WHITEOUT_DESCOFFSET;
1909 			break;
1910 		case S_IFREG:
1911 #if NVERIEXEC > 0
1912 			error = veriexec_openchk(l, nd.ni_vp, nd.ni_dirp,
1913 			    O_CREAT);
1914 #endif /* NVERIEXEC > 0 */
1915 			vattr.va_type = VREG;
1916 			vattr.va_rdev = VNOVAL;
1917 			optype = VOP_CREATE_DESCOFFSET;
1918 			break;
1919 		default:
1920 			error = EINVAL;
1921 			break;
1922 		}
1923 	}
1924 	if (!error) {
1925 		switch (optype) {
1926 		case VOP_WHITEOUT_DESCOFFSET:
1927 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1928 			if (error)
1929 				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1930 			vput(nd.ni_dvp);
1931 			break;
1932 
1933 		case VOP_MKNOD_DESCOFFSET:
1934 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1935 						&nd.ni_cnd, &vattr);
1936 			if (error == 0)
1937 				vput(nd.ni_vp);
1938 			break;
1939 
1940 		case VOP_CREATE_DESCOFFSET:
1941 			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
1942 						&nd.ni_cnd, &vattr);
1943 			if (error == 0)
1944 				vput(nd.ni_vp);
1945 			break;
1946 		}
1947 	} else {
1948 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1949 		if (nd.ni_dvp == vp)
1950 			vrele(nd.ni_dvp);
1951 		else
1952 			vput(nd.ni_dvp);
1953 		if (vp)
1954 			vrele(vp);
1955 	}
1956 out:
1957 	VERIEXEC_PATH_PUT(path);
1958 	return (error);
1959 }
1960 
1961 /*
1962  * Create a named pipe.
1963  */
1964 /* ARGSUSED */
1965 int
1966 sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval)
1967 {
1968 	/* {
1969 		syscallarg(const char *) path;
1970 		syscallarg(int) mode;
1971 	} */
1972 	struct proc *p = l->l_proc;
1973 	struct vattr vattr;
1974 	int error;
1975 	struct nameidata nd;
1976 
1977 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, UIO_USERSPACE,
1978 	    SCARG(uap, path));
1979 	if ((error = namei(&nd)) != 0)
1980 		return (error);
1981 	if (nd.ni_vp != NULL) {
1982 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1983 		if (nd.ni_dvp == nd.ni_vp)
1984 			vrele(nd.ni_dvp);
1985 		else
1986 			vput(nd.ni_dvp);
1987 		vrele(nd.ni_vp);
1988 		return (EEXIST);
1989 	}
1990 	VATTR_NULL(&vattr);
1991 	vattr.va_type = VFIFO;
1992 	/* We will read cwdi->cwdi_cmask unlocked. */
1993 	vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
1994 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1995 	if (error == 0)
1996 		vput(nd.ni_vp);
1997 	return (error);
1998 }
1999 
2000 /*
2001  * Make a hard file link.
2002  */
2003 /* ARGSUSED */
2004 int
2005 sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
2006 {
2007 	/* {
2008 		syscallarg(const char *) path;
2009 		syscallarg(const char *) link;
2010 	} */
2011 	struct vnode *vp;
2012 	struct nameidata nd;
2013 	int error;
2014 
2015 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
2016 	    SCARG(uap, path));
2017 	if ((error = namei(&nd)) != 0)
2018 		return (error);
2019 	vp = nd.ni_vp;
2020 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, UIO_USERSPACE,
2021 	    SCARG(uap, link));
2022 	if ((error = namei(&nd)) != 0)
2023 		goto out;
2024 	if (nd.ni_vp) {
2025 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2026 		if (nd.ni_dvp == nd.ni_vp)
2027 			vrele(nd.ni_dvp);
2028 		else
2029 			vput(nd.ni_dvp);
2030 		vrele(nd.ni_vp);
2031 		error = EEXIST;
2032 		goto out;
2033 	}
2034 	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
2035 out:
2036 	vrele(vp);
2037 	return (error);
2038 }
2039 
2040 /*
2041  * Make a symbolic link.
2042  */
2043 /* ARGSUSED */
2044 int
2045 sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
2046 {
2047 	/* {
2048 		syscallarg(const char *) path;
2049 		syscallarg(const char *) link;
2050 	} */
2051 	struct proc *p = l->l_proc;
2052 	struct vattr vattr;
2053 	char *path;
2054 	int error;
2055 	struct nameidata nd;
2056 
2057 	path = PNBUF_GET();
2058 	error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL);
2059 	if (error)
2060 		goto out;
2061 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, UIO_USERSPACE,
2062 	    SCARG(uap, link));
2063 	if ((error = namei(&nd)) != 0)
2064 		goto out;
2065 	if (nd.ni_vp) {
2066 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2067 		if (nd.ni_dvp == nd.ni_vp)
2068 			vrele(nd.ni_dvp);
2069 		else
2070 			vput(nd.ni_dvp);
2071 		vrele(nd.ni_vp);
2072 		error = EEXIST;
2073 		goto out;
2074 	}
2075 	VATTR_NULL(&vattr);
2076 	vattr.va_type = VLNK;
2077 	/* We will read cwdi->cwdi_cmask unlocked. */
2078 	vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
2079 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
2080 	if (error == 0)
2081 		vput(nd.ni_vp);
2082 out:
2083 	PNBUF_PUT(path);
2084 	return (error);
2085 }
2086 
2087 /*
2088  * Delete a whiteout from the filesystem.
2089  */
2090 /* ARGSUSED */
2091 int
2092 sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval)
2093 {
2094 	/* {
2095 		syscallarg(const char *) path;
2096 	} */
2097 	int error;
2098 	struct nameidata nd;
2099 
2100 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT,
2101 	    UIO_USERSPACE, SCARG(uap, path));
2102 	error = namei(&nd);
2103 	if (error)
2104 		return (error);
2105 
2106 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2107 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2108 		if (nd.ni_dvp == nd.ni_vp)
2109 			vrele(nd.ni_dvp);
2110 		else
2111 			vput(nd.ni_dvp);
2112 		if (nd.ni_vp)
2113 			vrele(nd.ni_vp);
2114 		return (EEXIST);
2115 	}
2116 	if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
2117 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2118 	vput(nd.ni_dvp);
2119 	return (error);
2120 }
2121 
2122 /*
2123  * Delete a name from the filesystem.
2124  */
2125 /* ARGSUSED */
2126 int
2127 sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval)
2128 {
2129 	/* {
2130 		syscallarg(const char *) path;
2131 	} */
2132 
2133 	return do_sys_unlink(SCARG(uap, path), UIO_USERSPACE);
2134 }
2135 
2136 int
2137 do_sys_unlink(const char *arg, enum uio_seg seg)
2138 {
2139 	struct vnode *vp;
2140 	int error;
2141 	struct nameidata nd;
2142 	kauth_cred_t cred;
2143 	char *path;
2144 	const char *cpath;
2145 
2146 	VERIEXEC_PATH_GET(arg, seg, cpath, path);
2147 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, seg, cpath);
2148 
2149 	if ((error = namei(&nd)) != 0)
2150 		goto out;
2151 	vp = nd.ni_vp;
2152 
2153 	/*
2154 	 * The root of a mounted filesystem cannot be deleted.
2155 	 */
2156 	if (vp->v_vflag & VV_ROOT) {
2157 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2158 		if (nd.ni_dvp == vp)
2159 			vrele(nd.ni_dvp);
2160 		else
2161 			vput(nd.ni_dvp);
2162 		vput(vp);
2163 		error = EBUSY;
2164 		goto out;
2165 	}
2166 
2167 #if NVERIEXEC > 0
2168 	/* Handle remove requests for veriexec entries. */
2169 	if ((error = veriexec_removechk(curlwp, nd.ni_vp, nd.ni_dirp)) != 0) {
2170 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2171 		if (nd.ni_dvp == vp)
2172 			vrele(nd.ni_dvp);
2173 		else
2174 			vput(nd.ni_dvp);
2175 		vput(vp);
2176 		goto out;
2177 	}
2178 #endif /* NVERIEXEC > 0 */
2179 
2180 	cred = kauth_cred_get();
2181 #ifdef FILEASSOC
2182 	(void)fileassoc_file_delete(vp);
2183 #endif /* FILEASSOC */
2184 	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2185 out:
2186 	VERIEXEC_PATH_PUT(path);
2187 	return (error);
2188 }
2189 
2190 /*
2191  * Reposition read/write file offset.
2192  */
2193 int
2194 sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
2195 {
2196 	/* {
2197 		syscallarg(int) fd;
2198 		syscallarg(int) pad;
2199 		syscallarg(off_t) offset;
2200 		syscallarg(int) whence;
2201 	} */
2202 	kauth_cred_t cred = l->l_cred;
2203 	file_t *fp;
2204 	struct vnode *vp;
2205 	struct vattr vattr;
2206 	off_t newoff;
2207 	int error, fd;
2208 
2209 	fd = SCARG(uap, fd);
2210 
2211 	if ((fp = fd_getfile(fd)) == NULL)
2212 		return (EBADF);
2213 
2214 	vp = fp->f_data;
2215 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2216 		error = ESPIPE;
2217 		goto out;
2218 	}
2219 
2220 	switch (SCARG(uap, whence)) {
2221 	case SEEK_CUR:
2222 		newoff = fp->f_offset + SCARG(uap, offset);
2223 		break;
2224 	case SEEK_END:
2225 		error = VOP_GETATTR(vp, &vattr, cred);
2226 		if (error) {
2227 			goto out;
2228 		}
2229 		newoff = SCARG(uap, offset) + vattr.va_size;
2230 		break;
2231 	case SEEK_SET:
2232 		newoff = SCARG(uap, offset);
2233 		break;
2234 	default:
2235 		error = EINVAL;
2236 		goto out;
2237 	}
2238 	if ((error = VOP_SEEK(vp, fp->f_offset, newoff, cred)) == 0) {
2239 		*(off_t *)retval = fp->f_offset = newoff;
2240 	}
2241  out:
2242  	fd_putfile(fd);
2243 	return (error);
2244 }
2245 
2246 /*
2247  * Positional read system call.
2248  */
2249 int
2250 sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
2251 {
2252 	/* {
2253 		syscallarg(int) fd;
2254 		syscallarg(void *) buf;
2255 		syscallarg(size_t) nbyte;
2256 		syscallarg(off_t) offset;
2257 	} */
2258 	file_t *fp;
2259 	struct vnode *vp;
2260 	off_t offset;
2261 	int error, fd = SCARG(uap, fd);
2262 
2263 	if ((fp = fd_getfile(fd)) == NULL)
2264 		return (EBADF);
2265 
2266 	if ((fp->f_flag & FREAD) == 0) {
2267 		fd_putfile(fd);
2268 		return (EBADF);
2269 	}
2270 
2271 	vp = fp->f_data;
2272 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2273 		error = ESPIPE;
2274 		goto out;
2275 	}
2276 
2277 	offset = SCARG(uap, offset);
2278 
2279 	/*
2280 	 * XXX This works because no file systems actually
2281 	 * XXX take any action on the seek operation.
2282 	 */
2283 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2284 		goto out;
2285 
2286 	/* dofileread() will unuse the descriptor for us */
2287 	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2288 	    &offset, 0, retval));
2289 
2290  out:
2291 	fd_putfile(fd);
2292 	return (error);
2293 }
2294 
2295 /*
2296  * Positional scatter read system call.
2297  */
2298 int
2299 sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval)
2300 {
2301 	/* {
2302 		syscallarg(int) fd;
2303 		syscallarg(const struct iovec *) iovp;
2304 		syscallarg(int) iovcnt;
2305 		syscallarg(off_t) offset;
2306 	} */
2307 	off_t offset = SCARG(uap, offset);
2308 
2309 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
2310 	    SCARG(uap, iovcnt), &offset, 0, retval);
2311 }
2312 
2313 /*
2314  * Positional write system call.
2315  */
2316 int
2317 sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval)
2318 {
2319 	/* {
2320 		syscallarg(int) fd;
2321 		syscallarg(const void *) buf;
2322 		syscallarg(size_t) nbyte;
2323 		syscallarg(off_t) offset;
2324 	} */
2325 	file_t *fp;
2326 	struct vnode *vp;
2327 	off_t offset;
2328 	int error, fd = SCARG(uap, fd);
2329 
2330 	if ((fp = fd_getfile(fd)) == NULL)
2331 		return (EBADF);
2332 
2333 	if ((fp->f_flag & FWRITE) == 0) {
2334 		fd_putfile(fd);
2335 		return (EBADF);
2336 	}
2337 
2338 	vp = fp->f_data;
2339 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2340 		error = ESPIPE;
2341 		goto out;
2342 	}
2343 
2344 	offset = SCARG(uap, offset);
2345 
2346 	/*
2347 	 * XXX This works because no file systems actually
2348 	 * XXX take any action on the seek operation.
2349 	 */
2350 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2351 		goto out;
2352 
2353 	/* dofilewrite() will unuse the descriptor for us */
2354 	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2355 	    &offset, 0, retval));
2356 
2357  out:
2358 	fd_putfile(fd);
2359 	return (error);
2360 }
2361 
2362 /*
2363  * Positional gather write system call.
2364  */
2365 int
2366 sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval)
2367 {
2368 	/* {
2369 		syscallarg(int) fd;
2370 		syscallarg(const struct iovec *) iovp;
2371 		syscallarg(int) iovcnt;
2372 		syscallarg(off_t) offset;
2373 	} */
2374 	off_t offset = SCARG(uap, offset);
2375 
2376 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
2377 	    SCARG(uap, iovcnt), &offset, 0, retval);
2378 }
2379 
2380 /*
2381  * Check access permissions.
2382  */
2383 int
2384 sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval)
2385 {
2386 	/* {
2387 		syscallarg(const char *) path;
2388 		syscallarg(int) flags;
2389 	} */
2390 	kauth_cred_t cred;
2391 	struct vnode *vp;
2392 	int error, flags;
2393 	struct nameidata nd;
2394 
2395 	cred = kauth_cred_dup(l->l_cred);
2396 	kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
2397 	kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
2398 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
2399 	    SCARG(uap, path));
2400 	/* Override default credentials */
2401 	nd.ni_cnd.cn_cred = cred;
2402 	if ((error = namei(&nd)) != 0)
2403 		goto out;
2404 	vp = nd.ni_vp;
2405 
2406 	/* Flags == 0 means only check for existence. */
2407 	if (SCARG(uap, flags)) {
2408 		flags = 0;
2409 		if (SCARG(uap, flags) & R_OK)
2410 			flags |= VREAD;
2411 		if (SCARG(uap, flags) & W_OK)
2412 			flags |= VWRITE;
2413 		if (SCARG(uap, flags) & X_OK)
2414 			flags |= VEXEC;
2415 
2416 		error = VOP_ACCESS(vp, flags, cred);
2417 		if (!error && (flags & VWRITE))
2418 			error = vn_writechk(vp);
2419 	}
2420 	vput(vp);
2421 out:
2422 	kauth_cred_free(cred);
2423 	return (error);
2424 }
2425 
2426 /*
2427  * Common code for all sys_stat functions, including compat versions.
2428  */
2429 int
2430 do_sys_stat(const char *path, unsigned int nd_flags, struct stat *sb)
2431 {
2432 	int error;
2433 	struct nameidata nd;
2434 
2435 	NDINIT(&nd, LOOKUP, nd_flags | LOCKLEAF | TRYEMULROOT,
2436 	    UIO_USERSPACE, path);
2437 	error = namei(&nd);
2438 	if (error != 0)
2439 		return error;
2440 	error = vn_stat(nd.ni_vp, sb);
2441 	vput(nd.ni_vp);
2442 	return error;
2443 }
2444 
2445 /*
2446  * Get file status; this version follows links.
2447  */
2448 /* ARGSUSED */
2449 int
2450 sys___stat30(struct lwp *l, const struct sys___stat30_args *uap, register_t *retval)
2451 {
2452 	/* {
2453 		syscallarg(const char *) path;
2454 		syscallarg(struct stat *) ub;
2455 	} */
2456 	struct stat sb;
2457 	int error;
2458 
2459 	error = do_sys_stat(SCARG(uap, path), FOLLOW, &sb);
2460 	if (error)
2461 		return error;
2462 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
2463 }
2464 
2465 /*
2466  * Get file status; this version does not follow links.
2467  */
2468 /* ARGSUSED */
2469 int
2470 sys___lstat30(struct lwp *l, const struct sys___lstat30_args *uap, register_t *retval)
2471 {
2472 	/* {
2473 		syscallarg(const char *) path;
2474 		syscallarg(struct stat *) ub;
2475 	} */
2476 	struct stat sb;
2477 	int error;
2478 
2479 	error = do_sys_stat(SCARG(uap, path), NOFOLLOW, &sb);
2480 	if (error)
2481 		return error;
2482 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
2483 }
2484 
2485 /*
2486  * Get configurable pathname variables.
2487  */
2488 /* ARGSUSED */
2489 int
2490 sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap, register_t *retval)
2491 {
2492 	/* {
2493 		syscallarg(const char *) path;
2494 		syscallarg(int) name;
2495 	} */
2496 	int error;
2497 	struct nameidata nd;
2498 
2499 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
2500 	    SCARG(uap, path));
2501 	if ((error = namei(&nd)) != 0)
2502 		return (error);
2503 	error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), retval);
2504 	vput(nd.ni_vp);
2505 	return (error);
2506 }
2507 
2508 /*
2509  * Return target name of a symbolic link.
2510  */
2511 /* ARGSUSED */
2512 int
2513 sys_readlink(struct lwp *l, const struct sys_readlink_args *uap, register_t *retval)
2514 {
2515 	/* {
2516 		syscallarg(const char *) path;
2517 		syscallarg(char *) buf;
2518 		syscallarg(size_t) count;
2519 	} */
2520 	struct vnode *vp;
2521 	struct iovec aiov;
2522 	struct uio auio;
2523 	int error;
2524 	struct nameidata nd;
2525 
2526 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
2527 	    SCARG(uap, path));
2528 	if ((error = namei(&nd)) != 0)
2529 		return (error);
2530 	vp = nd.ni_vp;
2531 	if (vp->v_type != VLNK)
2532 		error = EINVAL;
2533 	else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
2534 	    (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
2535 		aiov.iov_base = SCARG(uap, buf);
2536 		aiov.iov_len = SCARG(uap, count);
2537 		auio.uio_iov = &aiov;
2538 		auio.uio_iovcnt = 1;
2539 		auio.uio_offset = 0;
2540 		auio.uio_rw = UIO_READ;
2541 		KASSERT(l == curlwp);
2542 		auio.uio_vmspace = l->l_proc->p_vmspace;
2543 		auio.uio_resid = SCARG(uap, count);
2544 		error = VOP_READLINK(vp, &auio, l->l_cred);
2545 	}
2546 	vput(vp);
2547 	*retval = SCARG(uap, count) - auio.uio_resid;
2548 	return (error);
2549 }
2550 
2551 /*
2552  * Change flags of a file given a path name.
2553  */
2554 /* ARGSUSED */
2555 int
2556 sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval)
2557 {
2558 	/* {
2559 		syscallarg(const char *) path;
2560 		syscallarg(u_long) flags;
2561 	} */
2562 	struct vnode *vp;
2563 	int error;
2564 	struct nameidata nd;
2565 
2566 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
2567 	    SCARG(uap, path));
2568 	if ((error = namei(&nd)) != 0)
2569 		return (error);
2570 	vp = nd.ni_vp;
2571 	error = change_flags(vp, SCARG(uap, flags), l);
2572 	vput(vp);
2573 	return (error);
2574 }
2575 
2576 /*
2577  * Change flags of a file given a file descriptor.
2578  */
2579 /* ARGSUSED */
2580 int
2581 sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval)
2582 {
2583 	/* {
2584 		syscallarg(int) fd;
2585 		syscallarg(u_long) flags;
2586 	} */
2587 	struct vnode *vp;
2588 	file_t *fp;
2589 	int error;
2590 
2591 	/* fd_getvnode() will use the descriptor for us */
2592 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2593 		return (error);
2594 	vp = fp->f_data;
2595 	error = change_flags(vp, SCARG(uap, flags), l);
2596 	VOP_UNLOCK(vp, 0);
2597 	fd_putfile(SCARG(uap, fd));
2598 	return (error);
2599 }
2600 
2601 /*
2602  * Change flags of a file given a path name; this version does
2603  * not follow links.
2604  */
2605 int
2606 sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval)
2607 {
2608 	/* {
2609 		syscallarg(const char *) path;
2610 		syscallarg(u_long) flags;
2611 	} */
2612 	struct vnode *vp;
2613 	int error;
2614 	struct nameidata nd;
2615 
2616 	NDINIT(&nd, LOOKUP, NOFOLLOW | TRYEMULROOT, UIO_USERSPACE,
2617 	    SCARG(uap, path));
2618 	if ((error = namei(&nd)) != 0)
2619 		return (error);
2620 	vp = nd.ni_vp;
2621 	error = change_flags(vp, SCARG(uap, flags), l);
2622 	vput(vp);
2623 	return (error);
2624 }
2625 
2626 /*
2627  * Common routine to change flags of a file.
2628  */
2629 int
2630 change_flags(struct vnode *vp, u_long flags, struct lwp *l)
2631 {
2632 	struct vattr vattr;
2633 	int error;
2634 
2635 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2636 	/*
2637 	 * Non-superusers cannot change the flags on devices, even if they
2638 	 * own them.
2639 	 */
2640 	if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, NULL)) {
2641 		if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
2642 			goto out;
2643 		if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
2644 			error = EINVAL;
2645 			goto out;
2646 		}
2647 	}
2648 	VATTR_NULL(&vattr);
2649 	vattr.va_flags = flags;
2650 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2651 out:
2652 	return (error);
2653 }
2654 
2655 /*
2656  * Change mode of a file given path name; this version follows links.
2657  */
2658 /* ARGSUSED */
2659 int
2660 sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
2661 {
2662 	/* {
2663 		syscallarg(const char *) path;
2664 		syscallarg(int) mode;
2665 	} */
2666 	int error;
2667 	struct nameidata nd;
2668 
2669 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
2670 	    SCARG(uap, path));
2671 	if ((error = namei(&nd)) != 0)
2672 		return (error);
2673 
2674 	error = change_mode(nd.ni_vp, SCARG(uap, mode), l);
2675 
2676 	vrele(nd.ni_vp);
2677 	return (error);
2678 }
2679 
2680 /*
2681  * Change mode of a file given a file descriptor.
2682  */
2683 /* ARGSUSED */
2684 int
2685 sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval)
2686 {
2687 	/* {
2688 		syscallarg(int) fd;
2689 		syscallarg(int) mode;
2690 	} */
2691 	file_t *fp;
2692 	int error;
2693 
2694 	/* fd_getvnode() will use the descriptor for us */
2695 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2696 		return (error);
2697 	error = change_mode(fp->f_data, SCARG(uap, mode), l);
2698 	fd_putfile(SCARG(uap, fd));
2699 	return (error);
2700 }
2701 
2702 /*
2703  * Change mode of a file given path name; this version does not follow links.
2704  */
2705 /* ARGSUSED */
2706 int
2707 sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval)
2708 {
2709 	/* {
2710 		syscallarg(const char *) path;
2711 		syscallarg(int) mode;
2712 	} */
2713 	int error;
2714 	struct nameidata nd;
2715 
2716 	NDINIT(&nd, LOOKUP, NOFOLLOW | TRYEMULROOT, UIO_USERSPACE,
2717 	    SCARG(uap, path));
2718 	if ((error = namei(&nd)) != 0)
2719 		return (error);
2720 
2721 	error = change_mode(nd.ni_vp, SCARG(uap, mode), l);
2722 
2723 	vrele(nd.ni_vp);
2724 	return (error);
2725 }
2726 
2727 /*
2728  * Common routine to set mode given a vnode.
2729  */
2730 static int
2731 change_mode(struct vnode *vp, int mode, struct lwp *l)
2732 {
2733 	struct vattr vattr;
2734 	int error;
2735 
2736 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2737 	VATTR_NULL(&vattr);
2738 	vattr.va_mode = mode & ALLPERMS;
2739 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2740 	VOP_UNLOCK(vp, 0);
2741 	return (error);
2742 }
2743 
2744 /*
2745  * Set ownership given a path name; this version follows links.
2746  */
2747 /* ARGSUSED */
2748 int
2749 sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
2750 {
2751 	/* {
2752 		syscallarg(const char *) path;
2753 		syscallarg(uid_t) uid;
2754 		syscallarg(gid_t) gid;
2755 	} */
2756 	int error;
2757 	struct nameidata nd;
2758 
2759 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
2760 	    SCARG(uap, path));
2761 	if ((error = namei(&nd)) != 0)
2762 		return (error);
2763 
2764 	error = change_owner(nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
2765 
2766 	vrele(nd.ni_vp);
2767 	return (error);
2768 }
2769 
2770 /*
2771  * Set ownership given a path name; this version follows links.
2772  * Provides POSIX semantics.
2773  */
2774 /* ARGSUSED */
2775 int
2776 sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval)
2777 {
2778 	/* {
2779 		syscallarg(const char *) path;
2780 		syscallarg(uid_t) uid;
2781 		syscallarg(gid_t) gid;
2782 	} */
2783 	int error;
2784 	struct nameidata nd;
2785 
2786 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
2787 	    SCARG(uap, path));
2788 	if ((error = namei(&nd)) != 0)
2789 		return (error);
2790 
2791 	error = change_owner(nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
2792 
2793 	vrele(nd.ni_vp);
2794 	return (error);
2795 }
2796 
2797 /*
2798  * Set ownership given a file descriptor.
2799  */
2800 /* ARGSUSED */
2801 int
2802 sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval)
2803 {
2804 	/* {
2805 		syscallarg(int) fd;
2806 		syscallarg(uid_t) uid;
2807 		syscallarg(gid_t) gid;
2808 	} */
2809 	int error;
2810 	file_t *fp;
2811 
2812 	/* fd_getvnode() will use the descriptor for us */
2813 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2814 		return (error);
2815 	error = change_owner(fp->f_data, SCARG(uap, uid), SCARG(uap, gid),
2816 	    l, 0);
2817 	fd_putfile(SCARG(uap, fd));
2818 	return (error);
2819 }
2820 
2821 /*
2822  * Set ownership given a file descriptor, providing POSIX/XPG semantics.
2823  */
2824 /* ARGSUSED */
2825 int
2826 sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval)
2827 {
2828 	/* {
2829 		syscallarg(int) fd;
2830 		syscallarg(uid_t) uid;
2831 		syscallarg(gid_t) gid;
2832 	} */
2833 	int error;
2834 	file_t *fp;
2835 
2836 	/* fd_getvnode() will use the descriptor for us */
2837 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2838 		return (error);
2839 	error = change_owner(fp->f_data, SCARG(uap, uid), SCARG(uap, gid),
2840 	    l, 1);
2841 	fd_putfile(SCARG(uap, fd));
2842 	return (error);
2843 }
2844 
2845 /*
2846  * Set ownership given a path name; this version does not follow links.
2847  */
2848 /* ARGSUSED */
2849 int
2850 sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval)
2851 {
2852 	/* {
2853 		syscallarg(const char *) path;
2854 		syscallarg(uid_t) uid;
2855 		syscallarg(gid_t) gid;
2856 	} */
2857 	int error;
2858 	struct nameidata nd;
2859 
2860 	NDINIT(&nd, LOOKUP, NOFOLLOW | TRYEMULROOT, UIO_USERSPACE,
2861 	    SCARG(uap, path));
2862 	if ((error = namei(&nd)) != 0)
2863 		return (error);
2864 
2865 	error = change_owner(nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
2866 
2867 	vrele(nd.ni_vp);
2868 	return (error);
2869 }
2870 
2871 /*
2872  * Set ownership given a path name; this version does not follow links.
2873  * Provides POSIX/XPG semantics.
2874  */
2875 /* ARGSUSED */
2876 int
2877 sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval)
2878 {
2879 	/* {
2880 		syscallarg(const char *) path;
2881 		syscallarg(uid_t) uid;
2882 		syscallarg(gid_t) gid;
2883 	} */
2884 	int error;
2885 	struct nameidata nd;
2886 
2887 	NDINIT(&nd, LOOKUP, NOFOLLOW | TRYEMULROOT, UIO_USERSPACE,
2888 	    SCARG(uap, path));
2889 	if ((error = namei(&nd)) != 0)
2890 		return (error);
2891 
2892 	error = change_owner(nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
2893 
2894 	vrele(nd.ni_vp);
2895 	return (error);
2896 }
2897 
2898 /*
2899  * Common routine to set ownership given a vnode.
2900  */
2901 static int
2902 change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
2903     int posix_semantics)
2904 {
2905 	struct vattr vattr;
2906 	mode_t newmode;
2907 	int error;
2908 
2909 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2910 	if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
2911 		goto out;
2912 
2913 #define CHANGED(x) ((int)(x) != -1)
2914 	newmode = vattr.va_mode;
2915 	if (posix_semantics) {
2916 		/*
2917 		 * POSIX/XPG semantics: if the caller is not the super-user,
2918 		 * clear set-user-id and set-group-id bits.  Both POSIX and
2919 		 * the XPG consider the behaviour for calls by the super-user
2920 		 * implementation-defined; we leave the set-user-id and set-
2921 		 * group-id settings intact in that case.
2922 		 */
2923 		if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
2924 				      NULL) != 0)
2925 			newmode &= ~(S_ISUID | S_ISGID);
2926 	} else {
2927 		/*
2928 		 * NetBSD semantics: when changing owner and/or group,
2929 		 * clear the respective bit(s).
2930 		 */
2931 		if (CHANGED(uid))
2932 			newmode &= ~S_ISUID;
2933 		if (CHANGED(gid))
2934 			newmode &= ~S_ISGID;
2935 	}
2936 	/* Update va_mode iff altered. */
2937 	if (vattr.va_mode == newmode)
2938 		newmode = VNOVAL;
2939 
2940 	VATTR_NULL(&vattr);
2941 	vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
2942 	vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
2943 	vattr.va_mode = newmode;
2944 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2945 #undef CHANGED
2946 
2947 out:
2948 	VOP_UNLOCK(vp, 0);
2949 	return (error);
2950 }
2951 
2952 /*
2953  * Set the access and modification times given a path name; this
2954  * version follows links.
2955  */
2956 /* ARGSUSED */
2957 int
2958 sys_utimes(struct lwp *l, const struct sys_utimes_args *uap, register_t *retval)
2959 {
2960 	/* {
2961 		syscallarg(const char *) path;
2962 		syscallarg(const struct timeval *) tptr;
2963 	} */
2964 
2965 	return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
2966 	    SCARG(uap, tptr), UIO_USERSPACE);
2967 }
2968 
2969 /*
2970  * Set the access and modification times given a file descriptor.
2971  */
2972 /* ARGSUSED */
2973 int
2974 sys_futimes(struct lwp *l, const struct sys_futimes_args *uap, register_t *retval)
2975 {
2976 	/* {
2977 		syscallarg(int) fd;
2978 		syscallarg(const struct timeval *) tptr;
2979 	} */
2980 	int error;
2981 	file_t *fp;
2982 
2983 	/* fd_getvnode() will use the descriptor for us */
2984 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2985 		return (error);
2986 	error = do_sys_utimes(l, fp->f_data, NULL, 0, SCARG(uap, tptr),
2987 	    UIO_USERSPACE);
2988 	fd_putfile(SCARG(uap, fd));
2989 	return (error);
2990 }
2991 
2992 /*
2993  * Set the access and modification times given a path name; this
2994  * version does not follow links.
2995  */
2996 int
2997 sys_lutimes(struct lwp *l, const struct sys_lutimes_args *uap, register_t *retval)
2998 {
2999 	/* {
3000 		syscallarg(const char *) path;
3001 		syscallarg(const struct timeval *) tptr;
3002 	} */
3003 
3004 	return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
3005 	    SCARG(uap, tptr), UIO_USERSPACE);
3006 }
3007 
3008 /*
3009  * Common routine to set access and modification times given a vnode.
3010  */
3011 int
3012 do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
3013     const struct timeval *tptr, enum uio_seg seg)
3014 {
3015 	struct vattr vattr;
3016 	struct nameidata nd;
3017 	int error;
3018 
3019 	VATTR_NULL(&vattr);
3020 	if (tptr == NULL) {
3021 		nanotime(&vattr.va_atime);
3022 		vattr.va_mtime = vattr.va_atime;
3023 		vattr.va_vaflags |= VA_UTIMES_NULL;
3024 	} else {
3025 		struct timeval tv[2];
3026 
3027 		if (seg != UIO_SYSSPACE) {
3028 			error = copyin(tptr, &tv, sizeof (tv));
3029 			if (error != 0)
3030 				return error;
3031 			tptr = tv;
3032 		}
3033 		TIMEVAL_TO_TIMESPEC(tptr, &vattr.va_atime);
3034 		TIMEVAL_TO_TIMESPEC(tptr + 1, &vattr.va_mtime);
3035 	}
3036 
3037 	if (vp == NULL) {
3038 		NDINIT(&nd, LOOKUP, flag | TRYEMULROOT, UIO_USERSPACE, path);
3039 		if ((error = namei(&nd)) != 0)
3040 			return (error);
3041 		vp = nd.ni_vp;
3042 	} else
3043 		nd.ni_vp = NULL;
3044 
3045 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3046 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3047 	VOP_UNLOCK(vp, 0);
3048 
3049 	if (nd.ni_vp != NULL)
3050 		vrele(nd.ni_vp);
3051 
3052 	return (error);
3053 }
3054 
3055 /*
3056  * Truncate a file given its path name.
3057  */
3058 /* ARGSUSED */
3059 int
3060 sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval)
3061 {
3062 	/* {
3063 		syscallarg(const char *) path;
3064 		syscallarg(int) pad;
3065 		syscallarg(off_t) length;
3066 	} */
3067 	struct vnode *vp;
3068 	struct vattr vattr;
3069 	int error;
3070 	struct nameidata nd;
3071 
3072 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
3073 	    SCARG(uap, path));
3074 	if ((error = namei(&nd)) != 0)
3075 		return (error);
3076 	vp = nd.ni_vp;
3077 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3078 	if (vp->v_type == VDIR)
3079 		error = EISDIR;
3080 	else if ((error = vn_writechk(vp)) == 0 &&
3081 	    (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
3082 		VATTR_NULL(&vattr);
3083 		vattr.va_size = SCARG(uap, length);
3084 		error = VOP_SETATTR(vp, &vattr, l->l_cred);
3085 	}
3086 	vput(vp);
3087 	return (error);
3088 }
3089 
3090 /*
3091  * Truncate a file given a file descriptor.
3092  */
3093 /* ARGSUSED */
3094 int
3095 sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval)
3096 {
3097 	/* {
3098 		syscallarg(int) fd;
3099 		syscallarg(int) pad;
3100 		syscallarg(off_t) length;
3101 	} */
3102 	struct vattr vattr;
3103 	struct vnode *vp;
3104 	file_t *fp;
3105 	int error;
3106 
3107 	/* fd_getvnode() will use the descriptor for us */
3108 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3109 		return (error);
3110 	if ((fp->f_flag & FWRITE) == 0) {
3111 		error = EINVAL;
3112 		goto out;
3113 	}
3114 	vp = fp->f_data;
3115 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3116 	if (vp->v_type == VDIR)
3117 		error = EISDIR;
3118 	else if ((error = vn_writechk(vp)) == 0) {
3119 		VATTR_NULL(&vattr);
3120 		vattr.va_size = SCARG(uap, length);
3121 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
3122 	}
3123 	VOP_UNLOCK(vp, 0);
3124  out:
3125 	fd_putfile(SCARG(uap, fd));
3126 	return (error);
3127 }
3128 
3129 /*
3130  * Sync an open file.
3131  */
3132 /* ARGSUSED */
3133 int
3134 sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
3135 {
3136 	/* {
3137 		syscallarg(int) fd;
3138 	} */
3139 	struct vnode *vp;
3140 	file_t *fp;
3141 	int error;
3142 
3143 	/* fd_getvnode() will use the descriptor for us */
3144 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3145 		return (error);
3146 	vp = fp->f_data;
3147 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3148 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
3149 	if (error == 0 && bioopsp != NULL &&
3150 	    vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))
3151 		(*bioopsp->io_fsync)(vp, 0);
3152 	VOP_UNLOCK(vp, 0);
3153 	fd_putfile(SCARG(uap, fd));
3154 	return (error);
3155 }
3156 
3157 /*
3158  * Sync a range of file data.  API modeled after that found in AIX.
3159  *
3160  * FDATASYNC indicates that we need only save enough metadata to be able
3161  * to re-read the written data.  Note we duplicate AIX's requirement that
3162  * the file be open for writing.
3163  */
3164 /* ARGSUSED */
3165 int
3166 sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval)
3167 {
3168 	/* {
3169 		syscallarg(int) fd;
3170 		syscallarg(int) flags;
3171 		syscallarg(off_t) start;
3172 		syscallarg(off_t) length;
3173 	} */
3174 	struct vnode *vp;
3175 	file_t *fp;
3176 	int flags, nflags;
3177 	off_t s, e, len;
3178 	int error;
3179 
3180 	/* fd_getvnode() will use the descriptor for us */
3181 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3182 		return (error);
3183 
3184 	if ((fp->f_flag & FWRITE) == 0) {
3185 		error = EBADF;
3186 		goto out;
3187 	}
3188 
3189 	flags = SCARG(uap, flags);
3190 	if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
3191 	    ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
3192 		error = EINVAL;
3193 		goto out;
3194 	}
3195 	/* Now set up the flags for value(s) to pass to VOP_FSYNC() */
3196 	if (flags & FDATASYNC)
3197 		nflags = FSYNC_DATAONLY | FSYNC_WAIT;
3198 	else
3199 		nflags = FSYNC_WAIT;
3200 	if (flags & FDISKSYNC)
3201 		nflags |= FSYNC_CACHE;
3202 
3203 	len = SCARG(uap, length);
3204 	/* If length == 0, we do the whole file, and s = l = 0 will do that */
3205 	if (len) {
3206 		s = SCARG(uap, start);
3207 		e = s + len;
3208 		if (e < s) {
3209 			error = EINVAL;
3210 			goto out;
3211 		}
3212 	} else {
3213 		e = 0;
3214 		s = 0;
3215 	}
3216 
3217 	vp = fp->f_data;
3218 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3219 	error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
3220 
3221 	if (error == 0 && bioopsp != NULL &&
3222 	    vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))
3223 		(*bioopsp->io_fsync)(vp, nflags);
3224 
3225 	VOP_UNLOCK(vp, 0);
3226 out:
3227 	fd_putfile(SCARG(uap, fd));
3228 	return (error);
3229 }
3230 
3231 /*
3232  * Sync the data of an open file.
3233  */
3234 /* ARGSUSED */
3235 int
3236 sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval)
3237 {
3238 	/* {
3239 		syscallarg(int) fd;
3240 	} */
3241 	struct vnode *vp;
3242 	file_t *fp;
3243 	int error;
3244 
3245 	/* fd_getvnode() will use the descriptor for us */
3246 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3247 		return (error);
3248 	if ((fp->f_flag & FWRITE) == 0) {
3249 		fd_putfile(SCARG(uap, fd));
3250 		return (EBADF);
3251 	}
3252 	vp = fp->f_data;
3253 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3254 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
3255 	VOP_UNLOCK(vp, 0);
3256 	fd_putfile(SCARG(uap, fd));
3257 	return (error);
3258 }
3259 
3260 /*
3261  * Rename files, (standard) BSD semantics frontend.
3262  */
3263 /* ARGSUSED */
3264 int
3265 sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval)
3266 {
3267 	/* {
3268 		syscallarg(const char *) from;
3269 		syscallarg(const char *) to;
3270 	} */
3271 
3272 	return (do_sys_rename(SCARG(uap, from), SCARG(uap, to), UIO_USERSPACE, 0));
3273 }
3274 
3275 /*
3276  * Rename files, POSIX semantics frontend.
3277  */
3278 /* ARGSUSED */
3279 int
3280 sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval)
3281 {
3282 	/* {
3283 		syscallarg(const char *) from;
3284 		syscallarg(const char *) to;
3285 	} */
3286 
3287 	return (do_sys_rename(SCARG(uap, from), SCARG(uap, to), UIO_USERSPACE, 1));
3288 }
3289 
3290 /*
3291  * Rename files.  Source and destination must either both be directories,
3292  * or both not be directories.  If target is a directory, it must be empty.
3293  * If `from' and `to' refer to the same object, the value of the `retain'
3294  * argument is used to determine whether `from' will be
3295  *
3296  * (retain == 0)	deleted unless `from' and `to' refer to the same
3297  *			object in the file system's name space (BSD).
3298  * (retain == 1)	always retained (POSIX).
3299  */
3300 int
3301 do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
3302 {
3303 	struct vnode *tvp, *fvp, *tdvp;
3304 	struct nameidata fromnd, tond;
3305 	struct mount *fs;
3306 	struct lwp *l = curlwp;
3307 	struct proc *p;
3308 	uint32_t saveflag;
3309 	int error;
3310 
3311 	NDINIT(&fromnd, DELETE, LOCKPARENT | SAVESTART | TRYEMULROOT,
3312 	    seg, from);
3313 	if ((error = namei(&fromnd)) != 0)
3314 		return (error);
3315 	if (fromnd.ni_dvp != fromnd.ni_vp)
3316 		VOP_UNLOCK(fromnd.ni_dvp, 0);
3317 	fvp = fromnd.ni_vp;
3318 
3319 	fs = fvp->v_mount;
3320 	error = VFS_RENAMELOCK_ENTER(fs);
3321 	if (error) {
3322 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3323 		vrele(fromnd.ni_dvp);
3324 		vrele(fvp);
3325 		goto out1;
3326 	}
3327 
3328 	/*
3329 	 * close, partially, yet another race - ideally we should only
3330 	 * go as far as getting fromnd.ni_dvp before getting the per-fs
3331 	 * lock, and then continue to get fromnd.ni_vp, but we can't do
3332 	 * that with namei as it stands.
3333 	 *
3334 	 * This still won't prevent rmdir from nuking fromnd.ni_vp
3335 	 * under us. The real fix is to get the locks in the right
3336 	 * order and do the lookups in the right places, but that's a
3337 	 * major rototill.
3338 	 *
3339 	 * Preserve the SAVESTART in cn_flags, because who knows what
3340 	 * might happen if we don't.
3341 	 *
3342 	 * Note: this logic (as well as this whole function) is cloned
3343 	 * in nfs_serv.c. Proceed accordingly.
3344 	 */
3345 	vrele(fvp);
3346 	if ((fromnd.ni_cnd.cn_namelen == 1 &&
3347 	     fromnd.ni_cnd.cn_nameptr[0] == '.') ||
3348 	    (fromnd.ni_cnd.cn_namelen == 2 &&
3349 	     fromnd.ni_cnd.cn_nameptr[0] == '.' &&
3350 	     fromnd.ni_cnd.cn_nameptr[1] == '.')) {
3351 		error = EINVAL;
3352 		VFS_RENAMELOCK_EXIT(fs);
3353 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3354 		vrele(fromnd.ni_dvp);
3355 		goto out1;
3356 	}
3357 	saveflag = fromnd.ni_cnd.cn_flags & SAVESTART;
3358 	fromnd.ni_cnd.cn_flags &= ~SAVESTART;
3359 	vn_lock(fromnd.ni_dvp, LK_EXCLUSIVE | LK_RETRY);
3360 	error = relookup(fromnd.ni_dvp, &fromnd.ni_vp, &fromnd.ni_cnd);
3361 	fromnd.ni_cnd.cn_flags |= saveflag;
3362 	if (error) {
3363 		VOP_UNLOCK(fromnd.ni_dvp, 0);
3364 		VFS_RENAMELOCK_EXIT(fs);
3365 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3366 		vrele(fromnd.ni_dvp);
3367 		goto out1;
3368 	}
3369 	VOP_UNLOCK(fromnd.ni_vp, 0);
3370 	if (fromnd.ni_dvp != fromnd.ni_vp)
3371 		VOP_UNLOCK(fromnd.ni_dvp, 0);
3372 	fvp = fromnd.ni_vp;
3373 
3374 	NDINIT(&tond, RENAME,
3375 	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | TRYEMULROOT
3376 	      | (fvp->v_type == VDIR ? CREATEDIR : 0),
3377 	    seg, to);
3378 	if ((error = namei(&tond)) != 0) {
3379 		VFS_RENAMELOCK_EXIT(fs);
3380 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3381 		vrele(fromnd.ni_dvp);
3382 		vrele(fvp);
3383 		goto out1;
3384 	}
3385 	tdvp = tond.ni_dvp;
3386 	tvp = tond.ni_vp;
3387 
3388 	if (tvp != NULL) {
3389 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
3390 			error = ENOTDIR;
3391 			goto out;
3392 		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
3393 			error = EISDIR;
3394 			goto out;
3395 		}
3396 	}
3397 
3398 	if (fvp == tdvp)
3399 		error = EINVAL;
3400 
3401 	/*
3402 	 * Source and destination refer to the same object.
3403 	 */
3404 	if (fvp == tvp) {
3405 		if (retain)
3406 			error = -1;
3407 		else if (fromnd.ni_dvp == tdvp &&
3408 		    fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen &&
3409 		    !memcmp(fromnd.ni_cnd.cn_nameptr,
3410 		          tond.ni_cnd.cn_nameptr,
3411 		          fromnd.ni_cnd.cn_namelen))
3412 		error = -1;
3413 	}
3414 
3415 #if NVERIEXEC > 0
3416 	if (!error) {
3417 		char *f1, *f2;
3418 
3419 		f1 = malloc(fromnd.ni_cnd.cn_namelen + 1, M_TEMP, M_WAITOK);
3420 		strlcpy(f1, fromnd.ni_cnd.cn_nameptr, fromnd.ni_cnd.cn_namelen);
3421 
3422 		f2 = malloc(tond.ni_cnd.cn_namelen + 1, M_TEMP, M_WAITOK);
3423 		strlcpy(f2, tond.ni_cnd.cn_nameptr, tond.ni_cnd.cn_namelen);
3424 
3425 		error = veriexec_renamechk(l, fvp, f1, tvp, f2);
3426 
3427 		free(f1, M_TEMP);
3428 		free(f2, M_TEMP);
3429 	}
3430 #endif /* NVERIEXEC > 0 */
3431 
3432 out:
3433 	p = l->l_proc;
3434 	if (!error) {
3435 		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
3436 				   tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
3437 		VFS_RENAMELOCK_EXIT(fs);
3438 	} else {
3439 		VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd);
3440 		if (tdvp == tvp)
3441 			vrele(tdvp);
3442 		else
3443 			vput(tdvp);
3444 		if (tvp)
3445 			vput(tvp);
3446 		VFS_RENAMELOCK_EXIT(fs);
3447 		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
3448 		vrele(fromnd.ni_dvp);
3449 		vrele(fvp);
3450 	}
3451 	vrele(tond.ni_startdir);
3452 	PNBUF_PUT(tond.ni_cnd.cn_pnbuf);
3453 out1:
3454 	if (fromnd.ni_startdir)
3455 		vrele(fromnd.ni_startdir);
3456 	PNBUF_PUT(fromnd.ni_cnd.cn_pnbuf);
3457 	return (error == -1 ? 0 : error);
3458 }
3459 
3460 /*
3461  * Make a directory file.
3462  */
3463 /* ARGSUSED */
3464 int
3465 sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
3466 {
3467 	/* {
3468 		syscallarg(const char *) path;
3469 		syscallarg(int) mode;
3470 	} */
3471 	struct proc *p = l->l_proc;
3472 	struct vnode *vp;
3473 	struct vattr vattr;
3474 	int error;
3475 	struct nameidata nd;
3476 
3477 	NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT, UIO_USERSPACE,
3478 	    SCARG(uap, path));
3479 	if ((error = namei(&nd)) != 0)
3480 		return (error);
3481 	vp = nd.ni_vp;
3482 	if (vp != NULL) {
3483 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
3484 		if (nd.ni_dvp == vp)
3485 			vrele(nd.ni_dvp);
3486 		else
3487 			vput(nd.ni_dvp);
3488 		vrele(vp);
3489 		return (EEXIST);
3490 	}
3491 	VATTR_NULL(&vattr);
3492 	vattr.va_type = VDIR;
3493 	/* We will read cwdi->cwdi_cmask unlocked. */
3494 	vattr.va_mode =
3495 	    (SCARG(uap, mode) & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
3496 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3497 	if (!error)
3498 		vput(nd.ni_vp);
3499 	return (error);
3500 }
3501 
3502 /*
3503  * Remove a directory file.
3504  */
3505 /* ARGSUSED */
3506 int
3507 sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
3508 {
3509 	/* {
3510 		syscallarg(const char *) path;
3511 	} */
3512 	struct vnode *vp;
3513 	int error;
3514 	struct nameidata nd;
3515 
3516 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, UIO_USERSPACE,
3517 	    SCARG(uap, path));
3518 	if ((error = namei(&nd)) != 0)
3519 		return (error);
3520 	vp = nd.ni_vp;
3521 	if (vp->v_type != VDIR) {
3522 		error = ENOTDIR;
3523 		goto out;
3524 	}
3525 	/*
3526 	 * No rmdir "." please.
3527 	 */
3528 	if (nd.ni_dvp == vp) {
3529 		error = EINVAL;
3530 		goto out;
3531 	}
3532 	/*
3533 	 * The root of a mounted filesystem cannot be deleted.
3534 	 */
3535 	if ((vp->v_vflag & VV_ROOT) != 0 || vp->v_mountedhere != NULL) {
3536 		error = EBUSY;
3537 		goto out;
3538 	}
3539 	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3540 	return (error);
3541 
3542 out:
3543 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
3544 	if (nd.ni_dvp == vp)
3545 		vrele(nd.ni_dvp);
3546 	else
3547 		vput(nd.ni_dvp);
3548 	vput(vp);
3549 	return (error);
3550 }
3551 
3552 /*
3553  * Read a block of directory entries in a file system independent format.
3554  */
3555 int
3556 sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval)
3557 {
3558 	/* {
3559 		syscallarg(int) fd;
3560 		syscallarg(char *) buf;
3561 		syscallarg(size_t) count;
3562 	} */
3563 	file_t *fp;
3564 	int error, done;
3565 
3566 	/* fd_getvnode() will use the descriptor for us */
3567 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3568 		return (error);
3569 	if ((fp->f_flag & FREAD) == 0) {
3570 		error = EBADF;
3571 		goto out;
3572 	}
3573 	error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
3574 			SCARG(uap, count), &done, l, 0, 0);
3575 	ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
3576 	*retval = done;
3577  out:
3578 	fd_putfile(SCARG(uap, fd));
3579 	return (error);
3580 }
3581 
3582 /*
3583  * Set the mode mask for creation of filesystem nodes.
3584  */
3585 int
3586 sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
3587 {
3588 	/* {
3589 		syscallarg(mode_t) newmask;
3590 	} */
3591 	struct proc *p = l->l_proc;
3592 	struct cwdinfo *cwdi;
3593 
3594 	/*
3595 	 * cwdi->cwdi_cmask will be read unlocked elsewhere.  What's
3596 	 * important is that we serialize changes to the mask.  The
3597 	 * rw_exit() will issue a write memory barrier on our behalf,
3598 	 * and force the changes out to other CPUs (as it must use an
3599 	 * atomic operation, draining the local CPU's store buffers).
3600 	 */
3601 	cwdi = p->p_cwdi;
3602 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
3603 	*retval = cwdi->cwdi_cmask;
3604 	cwdi->cwdi_cmask = SCARG(uap, newmask) & ALLPERMS;
3605 	rw_exit(&cwdi->cwdi_lock);
3606 
3607 	return (0);
3608 }
3609 
3610 int
3611 dorevoke(struct vnode *vp, kauth_cred_t cred)
3612 {
3613 	struct vattr vattr;
3614 	int error;
3615 
3616 	if ((error = VOP_GETATTR(vp, &vattr, cred)) != 0)
3617 		return error;
3618 	if (kauth_cred_geteuid(cred) != vattr.va_uid &&
3619 	    (error = kauth_authorize_generic(cred,
3620 	    KAUTH_GENERIC_ISSUSER, NULL)) == 0)
3621 		VOP_REVOKE(vp, REVOKEALL);
3622 	return (error);
3623 }
3624 
3625 /*
3626  * Void all references to file by ripping underlying filesystem
3627  * away from vnode.
3628  */
3629 /* ARGSUSED */
3630 int
3631 sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval)
3632 {
3633 	/* {
3634 		syscallarg(const char *) path;
3635 	} */
3636 	struct vnode *vp;
3637 	int error;
3638 	struct nameidata nd;
3639 
3640 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, UIO_USERSPACE,
3641 	    SCARG(uap, path));
3642 	if ((error = namei(&nd)) != 0)
3643 		return (error);
3644 	vp = nd.ni_vp;
3645 	error = dorevoke(vp, l->l_cred);
3646 	vrele(vp);
3647 	return (error);
3648 }
3649 
3650 /*
3651  * Convert a user file descriptor to a kernel file entry.
3652  */
3653 int
3654 getvnode(int fd, file_t **fpp)
3655 {
3656 	struct vnode *vp;
3657 	file_t *fp;
3658 
3659 	if ((fp = fd_getfile(fd)) == NULL)
3660 		return (EBADF);
3661 
3662 	if (fp->f_type != DTYPE_VNODE) {
3663 		fd_putfile(fd);
3664 		return (EINVAL);
3665 	}
3666 
3667 	vp = fp->f_data;
3668 	if (vp->v_type == VBAD) {
3669 		fd_putfile(fd);
3670 		return (EBADF);
3671 	}
3672 
3673 	*fpp = fp;
3674 	return (0);
3675 }
3676