xref: /netbsd-src/sys/kern/vfs_syscalls.c (revision 1897181a7231d5fc7ab48994d1447fcbc4e13a49)
1 /*	$NetBSD: vfs_syscalls.c,v 1.442 2011/12/02 12:30:14 yamt Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1989, 1993
34  *	The Regents of the University of California.  All rights reserved.
35  * (c) UNIX System Laboratories, Inc.
36  * All or some portions of this file are derived from material licensed
37  * to the University of California by American Telephone and Telegraph
38  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39  * the permission of UNIX System Laboratories, Inc.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.42 (Berkeley) 7/31/95
66  */
67 
68 /*
69  * Virtual File System System Calls
70  */
71 
72 #include <sys/cdefs.h>
73 __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.442 2011/12/02 12:30:14 yamt Exp $");
74 
75 #ifdef _KERNEL_OPT
76 #include "opt_fileassoc.h"
77 #include "veriexec.h"
78 #endif
79 
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/namei.h>
83 #include <sys/filedesc.h>
84 #include <sys/kernel.h>
85 #include <sys/file.h>
86 #include <sys/fcntl.h>
87 #include <sys/stat.h>
88 #include <sys/vnode.h>
89 #include <sys/mount.h>
90 #include <sys/proc.h>
91 #include <sys/uio.h>
92 #include <sys/kmem.h>
93 #include <sys/dirent.h>
94 #include <sys/sysctl.h>
95 #include <sys/syscallargs.h>
96 #include <sys/vfs_syscalls.h>
97 #include <sys/ktrace.h>
98 #ifdef FILEASSOC
99 #include <sys/fileassoc.h>
100 #endif /* FILEASSOC */
101 #include <sys/extattr.h>
102 #include <sys/verified_exec.h>
103 #include <sys/kauth.h>
104 #include <sys/atomic.h>
105 #include <sys/module.h>
106 #include <sys/buf.h>
107 
108 #include <miscfs/genfs/genfs.h>
109 #include <miscfs/syncfs/syncfs.h>
110 #include <miscfs/specfs/specdev.h>
111 
112 #include <nfs/rpcv2.h>
113 #include <nfs/nfsproto.h>
114 #include <nfs/nfs.h>
115 #include <nfs/nfs_var.h>
116 
/*
 * Prototypes for file-local (static) helpers that change a vnode's
 * flags, mode and ownership.  Their definitions are not part of this
 * section of the file.
 */
static int change_flags(struct vnode *, u_long, struct lwp *);
static int change_mode(struct vnode *, int, struct lwp *l);
static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
120 
/*
 * This table is used to maintain compatibility with 4.3BSD
 * and NetBSD 0.9 mount syscalls - and possibly other systems.
 * Note, the order is important!
 *
 * The array index is the historic numeric file system type; the entry
 * is the name of the corresponding file system.  mount_get_vfsops()
 * consults this table when the "type" argument cannot be copied in as
 * a string, and treats a NULL entry as "no such file system" (ENODEV).
 *
 * Do not modify this table. It should only contain filesystems
 * supported by NetBSD 0.9 and 4.3BSD.
 */
const char * const mountcompatnames[] = {
	NULL,		/* 0 = MOUNT_NONE */
	MOUNT_FFS,	/* 1 = MOUNT_UFS */
	MOUNT_NFS,	/* 2 */
	MOUNT_MFS,	/* 3 */
	MOUNT_MSDOS,	/* 4 */
	MOUNT_CD9660,	/* 5 = MOUNT_ISOFS */
	MOUNT_FDESC,	/* 6 */
	MOUNT_KERNFS,	/* 7 */
	NULL,		/* 8 = MOUNT_DEVFS */
	MOUNT_AFS,	/* 9 */
};

/* Number of slots in mountcompatnames[], including the NULL ones. */
const int nmountcompatnames = __arraycount(mountcompatnames);
143 
144 static int
145 open_setfp(struct lwp *l, file_t *fp, struct vnode *vp, int indx, int flags)
146 {
147 	int error;
148 
149 	fp->f_flag = flags & FMASK;
150 	fp->f_type = DTYPE_VNODE;
151 	fp->f_ops = &vnops;
152 	fp->f_data = vp;
153 
154 	if (flags & (O_EXLOCK | O_SHLOCK)) {
155 		struct flock lf;
156 		int type;
157 
158 		lf.l_whence = SEEK_SET;
159 		lf.l_start = 0;
160 		lf.l_len = 0;
161 		if (flags & O_EXLOCK)
162 			lf.l_type = F_WRLCK;
163 		else
164 			lf.l_type = F_RDLCK;
165 		type = F_FLOCK;
166 		if ((flags & FNONBLOCK) == 0)
167 			type |= F_WAIT;
168 		VOP_UNLOCK(vp);
169 		error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
170 		if (error) {
171 			(void) vn_close(vp, fp->f_flag, fp->f_cred);
172 			fd_abort(l->l_proc, fp, indx);
173 			return error;
174 		}
175 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
176 		atomic_or_uint(&fp->f_flag, FHASLOCK);
177 	}
178 	if (flags & O_CLOEXEC)
179 		fd_set_exclose(l, indx, true);
180 	return 0;
181 }
182 
183 static int
184 mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
185     void *data, size_t *data_len)
186 {
187 	struct mount *mp;
188 	int error = 0, saved_flags;
189 
190 	mp = vp->v_mount;
191 	saved_flags = mp->mnt_flag;
192 
193 	/* We can operate only on VV_ROOT nodes. */
194 	if ((vp->v_vflag & VV_ROOT) == 0) {
195 		error = EINVAL;
196 		goto out;
197 	}
198 
199 	/*
200 	 * We only allow the filesystem to be reloaded if it
201 	 * is currently mounted read-only.  Additionally, we
202 	 * prevent read-write to read-only downgrades.
203 	 */
204 	if ((flags & (MNT_RELOAD | MNT_RDONLY)) != 0 &&
205 	    (mp->mnt_flag & MNT_RDONLY) == 0 &&
206 	    (mp->mnt_iflag & IMNT_CAN_RWTORO) == 0) {
207 		error = EOPNOTSUPP;	/* Needs translation */
208 		goto out;
209 	}
210 
211 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
212 	    KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
213 	if (error)
214 		goto out;
215 
216 	if (vfs_busy(mp, NULL)) {
217 		error = EPERM;
218 		goto out;
219 	}
220 
221 	mutex_enter(&mp->mnt_updating);
222 
223 	mp->mnt_flag &= ~MNT_OP_FLAGS;
224 	mp->mnt_flag |= flags & MNT_OP_FLAGS;
225 
226 	/*
227 	 * Set the mount level flags.
228 	 */
229 	if (flags & MNT_RDONLY)
230 		mp->mnt_flag |= MNT_RDONLY;
231 	else if (mp->mnt_flag & MNT_RDONLY)
232 		mp->mnt_iflag |= IMNT_WANTRDWR;
233 	mp->mnt_flag &= ~MNT_BASIC_FLAGS;
234 	mp->mnt_flag |= flags & MNT_BASIC_FLAGS;
235 	error = VFS_MOUNT(mp, path, data, data_len);
236 
237 	if (error && data != NULL) {
238 		int error2;
239 
240 		/*
241 		 * Update failed; let's try and see if it was an
242 		 * export request.  For compat with 3.0 and earlier.
243 		 */
244 		error2 = vfs_hooks_reexport(mp, path, data);
245 
246 		/*
247 		 * Only update error code if the export request was
248 		 * understood but some problem occurred while
249 		 * processing it.
250 		 */
251 		if (error2 != EJUSTRETURN)
252 			error = error2;
253 	}
254 
255 	if (mp->mnt_iflag & IMNT_WANTRDWR)
256 		mp->mnt_flag &= ~MNT_RDONLY;
257 	if (error)
258 		mp->mnt_flag = saved_flags;
259 	mp->mnt_flag &= ~MNT_OP_FLAGS;
260 	mp->mnt_iflag &= ~IMNT_WANTRDWR;
261 	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
262 		if (mp->mnt_syncer == NULL)
263 			error = vfs_allocate_syncvnode(mp);
264 	} else {
265 		if (mp->mnt_syncer != NULL)
266 			vfs_deallocate_syncvnode(mp);
267 	}
268 	mutex_exit(&mp->mnt_updating);
269 	vfs_unbusy(mp, false, NULL);
270 
271 	if ((error == 0) && !(saved_flags & MNT_EXTATTR) &&
272 	    (flags & MNT_EXTATTR)) {
273 		if (VFS_EXTATTRCTL(vp->v_mount, EXTATTR_CMD_START,
274 				   NULL, 0, NULL) != 0) {
275 			printf("%s: failed to start extattr, error = %d",
276 			       vp->v_mount->mnt_stat.f_mntonname, error);
277 			mp->mnt_flag &= ~MNT_EXTATTR;
278 		}
279 	}
280 
281 	if ((error == 0) && (saved_flags & MNT_EXTATTR) &&
282 	    !(flags & MNT_EXTATTR)) {
283 		if (VFS_EXTATTRCTL(vp->v_mount, EXTATTR_CMD_STOP,
284 				   NULL, 0, NULL) != 0) {
285 			printf("%s: failed to stop extattr, error = %d",
286 			       vp->v_mount->mnt_stat.f_mntonname, error);
287 			mp->mnt_flag |= MNT_RDONLY;
288 		}
289 	}
290  out:
291 	return (error);
292 }
293 
294 static int
295 mount_get_vfsops(const char *fstype, struct vfsops **vfsops)
296 {
297 	char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
298 	int error;
299 
300 	/* Copy file-system type from userspace.  */
301 	error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL);
302 	if (error) {
303 		/*
304 		 * Historically, filesystem types were identified by numbers.
305 		 * If we get an integer for the filesystem type instead of a
306 		 * string, we check to see if it matches one of the historic
307 		 * filesystem types.
308 		 */
309 		u_long fsindex = (u_long)fstype;
310 		if (fsindex >= nmountcompatnames ||
311 		    mountcompatnames[fsindex] == NULL)
312 			return ENODEV;
313 		strlcpy(fstypename, mountcompatnames[fsindex],
314 		    sizeof(fstypename));
315 	}
316 
317 	/* Accept `ufs' as an alias for `ffs', for compatibility. */
318 	if (strcmp(fstypename, "ufs") == 0)
319 		fstypename[0] = 'f';
320 
321 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
322 		return 0;
323 
324 	/* If we can autoload a vfs module, try again */
325 	(void)module_autoload(fstypename, MODULE_CLASS_VFS);
326 
327 	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
328 		return 0;
329 
330 	return ENODEV;
331 }
332 
333 static int
334 mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
335     void *data, size_t *data_len)
336 {
337 	struct mount *mp;
338 	int error;
339 
340 	/* If MNT_GETARGS is specified, it should be the only flag. */
341 	if (flags & ~MNT_GETARGS)
342 		return EINVAL;
343 
344 	mp = vp->v_mount;
345 
346 	/* XXX: probably some notion of "can see" here if we want isolation. */
347 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
348 	    KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
349 	if (error)
350 		return error;
351 
352 	if ((vp->v_vflag & VV_ROOT) == 0)
353 		return EINVAL;
354 
355 	if (vfs_busy(mp, NULL))
356 		return EPERM;
357 
358 	mutex_enter(&mp->mnt_updating);
359 	mp->mnt_flag &= ~MNT_OP_FLAGS;
360 	mp->mnt_flag |= MNT_GETARGS;
361 	error = VFS_MOUNT(mp, path, data, data_len);
362 	mp->mnt_flag &= ~MNT_OP_FLAGS;
363 	mutex_exit(&mp->mnt_updating);
364 
365 	vfs_unbusy(mp, false, NULL);
366 	return (error);
367 }
368 
369 int
370 sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval)
371 {
372 	/* {
373 		syscallarg(const char *) type;
374 		syscallarg(const char *) path;
375 		syscallarg(int) flags;
376 		syscallarg(void *) data;
377 		syscallarg(size_t) data_len;
378 	} */
379 
380 	return do_sys_mount(l, NULL, SCARG(uap, type), SCARG(uap, path),
381 	    SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE,
382 	    SCARG(uap, data_len), retval);
383 }
384 
385 int
386 do_sys_mount(struct lwp *l, struct vfsops *vfsops, const char *type,
387     const char *path, int flags, void *data, enum uio_seg data_seg,
388     size_t data_len, register_t *retval)
389 {
390 	struct vnode *vp;
391 	void *data_buf = data;
392 	bool vfsopsrele = false;
393 	int error;
394 
395 	/* XXX: The calling convention of this routine is totally bizarre */
396 	if (vfsops)
397 		vfsopsrele = true;
398 
399 	/*
400 	 * Get vnode to be covered
401 	 */
402 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
403 	if (error != 0) {
404 		vp = NULL;
405 		goto done;
406 	}
407 
408 	if (vfsops == NULL) {
409 		if (flags & (MNT_GETARGS | MNT_UPDATE)) {
410 			vfsops = vp->v_mount->mnt_op;
411 		} else {
412 			/* 'type' is userspace */
413 			error = mount_get_vfsops(type, &vfsops);
414 			if (error != 0)
415 				goto done;
416 			vfsopsrele = true;
417 		}
418 	}
419 
420 	if (data != NULL && data_seg == UIO_USERSPACE) {
421 		if (data_len == 0) {
422 			/* No length supplied, use default for filesystem */
423 			data_len = vfsops->vfs_min_mount_data;
424 			if (data_len > VFS_MAX_MOUNT_DATA) {
425 				error = EINVAL;
426 				goto done;
427 			}
428 			/*
429 			 * Hopefully a longer buffer won't make copyin() fail.
430 			 * For compatibility with 3.0 and earlier.
431 			 */
432 			if (flags & MNT_UPDATE
433 			    && data_len < sizeof (struct mnt_export_args30))
434 				data_len = sizeof (struct mnt_export_args30);
435 		}
436 		data_buf = kmem_alloc(data_len, KM_SLEEP);
437 
438 		/* NFS needs the buffer even for mnt_getargs .... */
439 		error = copyin(data, data_buf, data_len);
440 		if (error != 0)
441 			goto done;
442 	}
443 
444 	if (flags & MNT_GETARGS) {
445 		if (data_len == 0) {
446 			error = EINVAL;
447 			goto done;
448 		}
449 		error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
450 		if (error != 0)
451 			goto done;
452 		if (data_seg == UIO_USERSPACE)
453 			error = copyout(data_buf, data, data_len);
454 		*retval = data_len;
455 	} else if (flags & MNT_UPDATE) {
456 		error = mount_update(l, vp, path, flags, data_buf, &data_len);
457 	} else {
458 		/* Locking is handled internally in mount_domount(). */
459 		KASSERT(vfsopsrele == true);
460 		error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
461 		    &data_len);
462 		vfsopsrele = false;
463 
464 		if ((error == 0) && (flags & MNT_EXTATTR)) {
465 			if (VFS_EXTATTRCTL(vp->v_mount, EXTATTR_CMD_START,
466 					   NULL, 0, NULL) != 0)
467 				printf("%s: failed to start extattr",
468 				       vp->v_mount->mnt_stat.f_mntonname);
469 				/* XXX remove flag */
470 		}
471 	}
472 
473     done:
474 	if (vfsopsrele)
475 		vfs_delref(vfsops);
476     	if (vp != NULL) {
477 	    	vrele(vp);
478 	}
479 	if (data_buf != data)
480 		kmem_free(data_buf, data_len);
481 	return (error);
482 }
483 
484 /*
485  * Unmount a file system.
486  *
487  * Note: unmount takes a path to the vnode mounted on as argument,
488  * not special file (as before).
489  */
490 /* ARGSUSED */
491 int
492 sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval)
493 {
494 	/* {
495 		syscallarg(const char *) path;
496 		syscallarg(int) flags;
497 	} */
498 	struct vnode *vp;
499 	struct mount *mp;
500 	int error;
501 	struct pathbuf *pb;
502 	struct nameidata nd;
503 
504 	error = pathbuf_copyin(SCARG(uap, path), &pb);
505 	if (error) {
506 		return error;
507 	}
508 
509 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
510 	if ((error = namei(&nd)) != 0) {
511 		pathbuf_destroy(pb);
512 		return error;
513 	}
514 	vp = nd.ni_vp;
515 	pathbuf_destroy(pb);
516 
517 	mp = vp->v_mount;
518 	atomic_inc_uint(&mp->mnt_refcnt);
519 	VOP_UNLOCK(vp);
520 
521 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
522 	    KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
523 	if (error) {
524 		vrele(vp);
525 		vfs_destroy(mp);
526 		return (error);
527 	}
528 
529 	/*
530 	 * Don't allow unmounting the root file system.
531 	 */
532 	if (mp->mnt_flag & MNT_ROOTFS) {
533 		vrele(vp);
534 		vfs_destroy(mp);
535 		return (EINVAL);
536 	}
537 
538 	/*
539 	 * Must be the root of the filesystem
540 	 */
541 	if ((vp->v_vflag & VV_ROOT) == 0) {
542 		vrele(vp);
543 		vfs_destroy(mp);
544 		return (EINVAL);
545 	}
546 
547 	vrele(vp);
548 	error = dounmount(mp, SCARG(uap, flags), l);
549 	vfs_destroy(mp);
550 	return error;
551 }
552 
553 /*
554  * Sync each mounted filesystem.
555  */
556 #ifdef DEBUG
557 int syncprt = 0;
558 struct ctldebug debug0 = { "syncprt", &syncprt };
559 #endif
560 
561 void
562 do_sys_sync(struct lwp *l)
563 {
564 	struct mount *mp, *nmp;
565 	int asyncflag;
566 
567 	mutex_enter(&mountlist_lock);
568 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
569 	     mp = nmp) {
570 		if (vfs_busy(mp, &nmp)) {
571 			continue;
572 		}
573 		mutex_enter(&mp->mnt_updating);
574 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
575 			asyncflag = mp->mnt_flag & MNT_ASYNC;
576 			mp->mnt_flag &= ~MNT_ASYNC;
577 			VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
578 			if (asyncflag)
579 				 mp->mnt_flag |= MNT_ASYNC;
580 		}
581 		mutex_exit(&mp->mnt_updating);
582 		vfs_unbusy(mp, false, &nmp);
583 	}
584 	mutex_exit(&mountlist_lock);
585 #ifdef DEBUG
586 	if (syncprt)
587 		vfs_bufstats();
588 #endif /* DEBUG */
589 }
590 
591 /* ARGSUSED */
592 int
593 sys_sync(struct lwp *l, const void *v, register_t *retval)
594 {
595 	do_sys_sync(l);
596 	return (0);
597 }
598 
599 
600 /*
601  * Change filesystem quotas.
602  */
603 /* ARGSUSED */
604 int
605 sys___quotactl50(struct lwp *l, const struct sys___quotactl50_args *uap,
606     register_t *retval)
607 {
608 	/* {
609 		syscallarg(const char *) path;
610 		syscallarg(struct plistref *) pref;
611 	} */
612 	struct mount *mp;
613 	int error;
614 	struct vnode *vp;
615 	prop_dictionary_t dict;
616 	struct plistref pref;
617 
618 	error = namei_simple_user(SCARG(uap, path),
619 				NSM_FOLLOW_TRYEMULROOT, &vp);
620 	if (error != 0)
621 		return (error);
622 	mp = vp->v_mount;
623 	error = copyin(SCARG(uap, pref), &pref, sizeof(pref));
624 	if (error)
625 		return error;
626 	error = prop_dictionary_copyin(&pref, &dict);
627 	if (error)
628 		return error;
629 	error = VFS_QUOTACTL(mp, dict);
630 	vrele(vp);
631 	if (!error)
632 		error = prop_dictionary_copyout(&pref, dict);
633 	if (!error)
634 		error = copyout(&pref, SCARG(uap, pref), sizeof(pref));
635 	prop_object_release(dict);
636 	return (error);
637 }
638 
639 int
640 dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
641     int root)
642 {
643 	struct cwdinfo *cwdi = l->l_proc->p_cwdi;
644 	int error = 0;
645 
646 	/*
647 	 * If MNT_NOWAIT or MNT_LAZY is specified, do not
648 	 * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
649 	 * overrides MNT_NOWAIT.
650 	 */
651 	if (flags == MNT_NOWAIT	|| flags == MNT_LAZY ||
652 	    (flags != MNT_WAIT && flags != 0)) {
653 		memcpy(sp, &mp->mnt_stat, sizeof(*sp));
654 		goto done;
655 	}
656 
657 	/* Get the filesystem stats now */
658 	memset(sp, 0, sizeof(*sp));
659 	if ((error = VFS_STATVFS(mp, sp)) != 0) {
660 		return error;
661 	}
662 
663 	if (cwdi->cwdi_rdir == NULL)
664 		(void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
665 done:
666 	if (cwdi->cwdi_rdir != NULL) {
667 		size_t len;
668 		char *bp;
669 		char c;
670 		char *path = PNBUF_GET();
671 
672 		bp = path + MAXPATHLEN;
673 		*--bp = '\0';
674 		rw_enter(&cwdi->cwdi_lock, RW_READER);
675 		error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
676 		    MAXPATHLEN / 2, 0, l);
677 		rw_exit(&cwdi->cwdi_lock);
678 		if (error) {
679 			PNBUF_PUT(path);
680 			return error;
681 		}
682 		len = strlen(bp);
683 		if (len != 1) {
684 			/*
685 			 * for mount points that are below our root, we can see
686 			 * them, so we fix up the pathname and return them. The
687 			 * rest we cannot see, so we don't allow viewing the
688 			 * data.
689 			 */
690 			if (strncmp(bp, sp->f_mntonname, len) == 0 &&
691 			    ((c = sp->f_mntonname[len]) == '/' || c == '\0')) {
692 				(void)strlcpy(sp->f_mntonname,
693 				    c == '\0' ? "/" : &sp->f_mntonname[len],
694 				    sizeof(sp->f_mntonname));
695 			} else {
696 				if (root)
697 					(void)strlcpy(sp->f_mntonname, "/",
698 					    sizeof(sp->f_mntonname));
699 				else
700 					error = EPERM;
701 			}
702 		}
703 		PNBUF_PUT(path);
704 	}
705 	sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
706 	return error;
707 }
708 
709 /*
710  * Get filesystem statistics by path.
711  */
712 int
713 do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
714 {
715 	struct mount *mp;
716 	int error;
717 	struct vnode *vp;
718 
719 	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
720 	if (error != 0)
721 		return error;
722 	mp = vp->v_mount;
723 	error = dostatvfs(mp, sb, l, flags, 1);
724 	vrele(vp);
725 	return error;
726 }
727 
728 /* ARGSUSED */
729 int
730 sys_statvfs1(struct lwp *l, const struct sys_statvfs1_args *uap, register_t *retval)
731 {
732 	/* {
733 		syscallarg(const char *) path;
734 		syscallarg(struct statvfs *) buf;
735 		syscallarg(int) flags;
736 	} */
737 	struct statvfs *sb;
738 	int error;
739 
740 	sb = STATVFSBUF_GET();
741 	error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
742 	if (error == 0)
743 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
744 	STATVFSBUF_PUT(sb);
745 	return error;
746 }
747 
748 /*
749  * Get filesystem statistics by fd.
750  */
751 int
752 do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
753 {
754 	file_t *fp;
755 	struct mount *mp;
756 	int error;
757 
758 	/* fd_getvnode() will use the descriptor for us */
759 	if ((error = fd_getvnode(fd, &fp)) != 0)
760 		return (error);
761 	mp = ((struct vnode *)fp->f_data)->v_mount;
762 	error = dostatvfs(mp, sb, curlwp, flags, 1);
763 	fd_putfile(fd);
764 	return error;
765 }
766 
767 /* ARGSUSED */
768 int
769 sys_fstatvfs1(struct lwp *l, const struct sys_fstatvfs1_args *uap, register_t *retval)
770 {
771 	/* {
772 		syscallarg(int) fd;
773 		syscallarg(struct statvfs *) buf;
774 		syscallarg(int) flags;
775 	} */
776 	struct statvfs *sb;
777 	int error;
778 
779 	sb = STATVFSBUF_GET();
780 	error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
781 	if (error == 0)
782 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
783 	STATVFSBUF_PUT(sb);
784 	return error;
785 }
786 
787 
788 /*
789  * Get statistics on all filesystems.
790  */
791 int
792 do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
793     int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
794     register_t *retval)
795 {
796 	int root = 0;
797 	struct proc *p = l->l_proc;
798 	struct mount *mp, *nmp;
799 	struct statvfs *sb;
800 	size_t count, maxcount;
801 	int error = 0;
802 
803 	sb = STATVFSBUF_GET();
804 	maxcount = bufsize / entry_sz;
805 	mutex_enter(&mountlist_lock);
806 	count = 0;
807 	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
808 	     mp = nmp) {
809 		if (vfs_busy(mp, &nmp)) {
810 			continue;
811 		}
812 		if (sfsp && count < maxcount) {
813 			error = dostatvfs(mp, sb, l, flags, 0);
814 			if (error) {
815 				vfs_unbusy(mp, false, &nmp);
816 				error = 0;
817 				continue;
818 			}
819 			error = copyfn(sb, sfsp, entry_sz);
820 			if (error) {
821 				vfs_unbusy(mp, false, NULL);
822 				goto out;
823 			}
824 			sfsp = (char *)sfsp + entry_sz;
825 			root |= strcmp(sb->f_mntonname, "/") == 0;
826 		}
827 		count++;
828 		vfs_unbusy(mp, false, &nmp);
829 	}
830 	mutex_exit(&mountlist_lock);
831 
832 	if (root == 0 && p->p_cwdi->cwdi_rdir) {
833 		/*
834 		 * fake a root entry
835 		 */
836 		error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
837 		    sb, l, flags, 1);
838 		if (error != 0)
839 			goto out;
840 		if (sfsp) {
841 			error = copyfn(sb, sfsp, entry_sz);
842 			if (error != 0)
843 				goto out;
844 		}
845 		count++;
846 	}
847 	if (sfsp && count > maxcount)
848 		*retval = maxcount;
849 	else
850 		*retval = count;
851 out:
852 	STATVFSBUF_PUT(sb);
853 	return error;
854 }
855 
856 int
857 sys_getvfsstat(struct lwp *l, const struct sys_getvfsstat_args *uap, register_t *retval)
858 {
859 	/* {
860 		syscallarg(struct statvfs *) buf;
861 		syscallarg(size_t) bufsize;
862 		syscallarg(int) flags;
863 	} */
864 
865 	return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
866 	    SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
867 }
868 
869 /*
870  * Change current working directory to a given file descriptor.
871  */
872 /* ARGSUSED */
873 int
874 sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval)
875 {
876 	/* {
877 		syscallarg(int) fd;
878 	} */
879 	struct proc *p = l->l_proc;
880 	struct cwdinfo *cwdi;
881 	struct vnode *vp, *tdp;
882 	struct mount *mp;
883 	file_t *fp;
884 	int error, fd;
885 
886 	/* fd_getvnode() will use the descriptor for us */
887 	fd = SCARG(uap, fd);
888 	if ((error = fd_getvnode(fd, &fp)) != 0)
889 		return (error);
890 	vp = fp->f_data;
891 
892 	vref(vp);
893 	vn_lock(vp,  LK_EXCLUSIVE | LK_RETRY);
894 	if (vp->v_type != VDIR)
895 		error = ENOTDIR;
896 	else
897 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
898 	if (error) {
899 		vput(vp);
900 		goto out;
901 	}
902 	while ((mp = vp->v_mountedhere) != NULL) {
903 		error = vfs_busy(mp, NULL);
904 		vput(vp);
905 		if (error != 0)
906 			goto out;
907 		error = VFS_ROOT(mp, &tdp);
908 		vfs_unbusy(mp, false, NULL);
909 		if (error)
910 			goto out;
911 		vp = tdp;
912 	}
913 	VOP_UNLOCK(vp);
914 
915 	/*
916 	 * Disallow changing to a directory not under the process's
917 	 * current root directory (if there is one).
918 	 */
919 	cwdi = p->p_cwdi;
920 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
921 	if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
922 		vrele(vp);
923 		error = EPERM;	/* operation not permitted */
924 	} else {
925 		vrele(cwdi->cwdi_cdir);
926 		cwdi->cwdi_cdir = vp;
927 	}
928 	rw_exit(&cwdi->cwdi_lock);
929 
930  out:
931 	fd_putfile(fd);
932 	return (error);
933 }
934 
935 /*
936  * Change this process's notion of the root directory to a given file
937  * descriptor.
938  */
939 int
940 sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval)
941 {
942 	struct proc *p = l->l_proc;
943 	struct vnode	*vp;
944 	file_t	*fp;
945 	int		 error, fd = SCARG(uap, fd);
946 
947 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
948  	    KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
949 		return error;
950 	/* fd_getvnode() will use the descriptor for us */
951 	if ((error = fd_getvnode(fd, &fp)) != 0)
952 		return error;
953 	vp = fp->f_data;
954 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
955 	if (vp->v_type != VDIR)
956 		error = ENOTDIR;
957 	else
958 		error = VOP_ACCESS(vp, VEXEC, l->l_cred);
959 	VOP_UNLOCK(vp);
960 	if (error)
961 		goto out;
962 	vref(vp);
963 
964 	change_root(p->p_cwdi, vp, l);
965 
966  out:
967 	fd_putfile(fd);
968 	return (error);
969 }
970 
971 /*
972  * Change current working directory (``.'').
973  */
974 /* ARGSUSED */
975 int
976 sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
977 {
978 	/* {
979 		syscallarg(const char *) path;
980 	} */
981 	struct proc *p = l->l_proc;
982 	struct cwdinfo *cwdi;
983 	int error;
984 	struct vnode *vp;
985 
986 	if ((error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE,
987 				  &vp, l)) != 0)
988 		return (error);
989 	cwdi = p->p_cwdi;
990 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
991 	vrele(cwdi->cwdi_cdir);
992 	cwdi->cwdi_cdir = vp;
993 	rw_exit(&cwdi->cwdi_lock);
994 	return (0);
995 }
996 
997 /*
998  * Change notion of root (``/'') directory.
999  */
1000 /* ARGSUSED */
1001 int
1002 sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval)
1003 {
1004 	/* {
1005 		syscallarg(const char *) path;
1006 	} */
1007 	struct proc *p = l->l_proc;
1008 	int error;
1009 	struct vnode *vp;
1010 
1011 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1012 	    KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
1013 		return (error);
1014 	if ((error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE,
1015 				  &vp, l)) != 0)
1016 		return (error);
1017 
1018 	change_root(p->p_cwdi, vp, l);
1019 
1020 	return (0);
1021 }
1022 
1023 /*
1024  * Common routine for chroot and fchroot.
1025  * NB: callers need to properly authorize the change root operation.
1026  */
1027 void
1028 change_root(struct cwdinfo *cwdi, struct vnode *vp, struct lwp *l)
1029 {
1030 
1031 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1032 	if (cwdi->cwdi_rdir != NULL)
1033 		vrele(cwdi->cwdi_rdir);
1034 	cwdi->cwdi_rdir = vp;
1035 
1036 	/*
1037 	 * Prevent escaping from chroot by putting the root under
1038 	 * the working directory.  Silently chdir to / if we aren't
1039 	 * already there.
1040 	 */
1041 	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1042 		/*
1043 		 * XXX would be more failsafe to change directory to a
1044 		 * deadfs node here instead
1045 		 */
1046 		vrele(cwdi->cwdi_cdir);
1047 		vref(vp);
1048 		cwdi->cwdi_cdir = vp;
1049 	}
1050 	rw_exit(&cwdi->cwdi_lock);
1051 }
1052 
1053 /*
1054  * Common routine for chroot and chdir.
1055  * XXX "where" should be enum uio_seg
1056  */
1057 int
1058 chdir_lookup(const char *path, int where, struct vnode **vpp, struct lwp *l)
1059 {
1060 	struct pathbuf *pb;
1061 	struct nameidata nd;
1062 	int error;
1063 
1064 	error = pathbuf_maybe_copyin(path, where, &pb);
1065 	if (error) {
1066 		return error;
1067 	}
1068 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
1069 	if ((error = namei(&nd)) != 0) {
1070 		pathbuf_destroy(pb);
1071 		return error;
1072 	}
1073 	*vpp = nd.ni_vp;
1074 	pathbuf_destroy(pb);
1075 
1076 	if ((*vpp)->v_type != VDIR)
1077 		error = ENOTDIR;
1078 	else
1079 		error = VOP_ACCESS(*vpp, VEXEC, l->l_cred);
1080 
1081 	if (error)
1082 		vput(*vpp);
1083 	else
1084 		VOP_UNLOCK(*vpp);
1085 	return (error);
1086 }
1087 
1088 /*
1089  * Check permissions, allocate an open file structure,
1090  * and call the device open routine if any.
1091  */
1092 int
1093 sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1094 {
1095 	/* {
1096 		syscallarg(const char *) path;
1097 		syscallarg(int) flags;
1098 		syscallarg(int) mode;
1099 	} */
1100 	struct proc *p = l->l_proc;
1101 	struct cwdinfo *cwdi = p->p_cwdi;
1102 	file_t *fp;
1103 	struct vnode *vp;
1104 	int flags, cmode;
1105 	int indx, error;
1106 	struct pathbuf *pb;
1107 	struct nameidata nd;
1108 
1109 	flags = FFLAGS(SCARG(uap, flags));
1110 	if ((flags & (FREAD | FWRITE)) == 0)
1111 		return (EINVAL);
1112 
1113 	error = pathbuf_copyin(SCARG(uap, path), &pb);
1114 	if (error) {
1115 		return error;
1116 	}
1117 
1118 	if ((error = fd_allocfile(&fp, &indx)) != 0) {
1119 		pathbuf_destroy(pb);
1120 		return error;
1121 	}
1122 	/* We're going to read cwdi->cwdi_cmask unlocked here. */
1123 	cmode = ((SCARG(uap, mode) &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1124 	NDINIT(&nd, LOOKUP, FOLLOW | TRYEMULROOT, pb);
1125 	l->l_dupfd = -indx - 1;			/* XXX check for fdopen */
1126 	if ((error = vn_open(&nd, flags, cmode)) != 0) {
1127 		fd_abort(p, fp, indx);
1128 		if ((error == EDUPFD || error == EMOVEFD) &&
1129 		    l->l_dupfd >= 0 &&			/* XXX from fdopen */
1130 		    (error =
1131 			fd_dupopen(l->l_dupfd, &indx, flags, error)) == 0) {
1132 			*retval = indx;
1133 			pathbuf_destroy(pb);
1134 			return (0);
1135 		}
1136 		if (error == ERESTART)
1137 			error = EINTR;
1138 		pathbuf_destroy(pb);
1139 		return (error);
1140 	}
1141 
1142 	l->l_dupfd = 0;
1143 	vp = nd.ni_vp;
1144 	pathbuf_destroy(pb);
1145 
1146 	if ((error = open_setfp(l, fp, vp, indx, flags)))
1147 		return error;
1148 
1149 	VOP_UNLOCK(vp);
1150 	*retval = indx;
1151 	fd_affix(p, fp, indx);
1152 	return (0);
1153 }
1154 
1155 int
1156 sys_openat(struct lwp *l, const struct sys_openat_args *uap, register_t *retval)
1157 {
1158 	/* {
1159 		syscallarg(int) fd;
1160 		syscallarg(const char *) path;
1161 		syscallarg(int) flags;
1162 		syscallarg(int) mode;
1163 	} */
1164 
1165 	return ENOSYS;
1166 }
1167 
1168 static void
1169 vfs__fhfree(fhandle_t *fhp)
1170 {
1171 	size_t fhsize;
1172 
1173 	if (fhp == NULL) {
1174 		return;
1175 	}
1176 	fhsize = FHANDLE_SIZE(fhp);
1177 	kmem_free(fhp, fhsize);
1178 }
1179 
1180 /*
1181  * vfs_composefh: compose a filehandle.
1182  */
1183 
1184 int
1185 vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1186 {
1187 	struct mount *mp;
1188 	struct fid *fidp;
1189 	int error;
1190 	size_t needfhsize;
1191 	size_t fidsize;
1192 
1193 	mp = vp->v_mount;
1194 	fidp = NULL;
1195 	if (*fh_size < FHANDLE_SIZE_MIN) {
1196 		fidsize = 0;
1197 	} else {
1198 		fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1199 		if (fhp != NULL) {
1200 			memset(fhp, 0, *fh_size);
1201 			fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1202 			fidp = &fhp->fh_fid;
1203 		}
1204 	}
1205 	error = VFS_VPTOFH(vp, fidp, &fidsize);
1206 	needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1207 	if (error == 0 && *fh_size < needfhsize) {
1208 		error = E2BIG;
1209 	}
1210 	*fh_size = needfhsize;
1211 	return error;
1212 }
1213 
1214 int
1215 vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1216 {
1217 	struct mount *mp;
1218 	fhandle_t *fhp;
1219 	size_t fhsize;
1220 	size_t fidsize;
1221 	int error;
1222 
1223 	*fhpp = NULL;
1224 	mp = vp->v_mount;
1225 	fidsize = 0;
1226 	error = VFS_VPTOFH(vp, NULL, &fidsize);
1227 	KASSERT(error != 0);
1228 	if (error != E2BIG) {
1229 		goto out;
1230 	}
1231 	fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1232 	fhp = kmem_zalloc(fhsize, KM_SLEEP);
1233 	if (fhp == NULL) {
1234 		error = ENOMEM;
1235 		goto out;
1236 	}
1237 	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1238 	error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1239 	if (error == 0) {
1240 		KASSERT((FHANDLE_SIZE(fhp) == fhsize &&
1241 		    FHANDLE_FILEID(fhp)->fid_len == fidsize));
1242 		*fhpp = fhp;
1243 	} else {
1244 		kmem_free(fhp, fhsize);
1245 	}
1246 out:
1247 	return error;
1248 }
1249 
1250 void
1251 vfs_composefh_free(fhandle_t *fhp)
1252 {
1253 
1254 	vfs__fhfree(fhp);
1255 }
1256 
1257 /*
1258  * vfs_fhtovp: lookup a vnode by a filehandle.
1259  */
1260 
1261 int
1262 vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
1263 {
1264 	struct mount *mp;
1265 	int error;
1266 
1267 	*vpp = NULL;
1268 	mp = vfs_getvfs(FHANDLE_FSID(fhp));
1269 	if (mp == NULL) {
1270 		error = ESTALE;
1271 		goto out;
1272 	}
1273 	if (mp->mnt_op->vfs_fhtovp == NULL) {
1274 		error = EOPNOTSUPP;
1275 		goto out;
1276 	}
1277 	error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), vpp);
1278 out:
1279 	return error;
1280 }
1281 
1282 /*
1283  * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
1284  * the needed size.
1285  */
1286 
1287 int
1288 vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
1289 {
1290 	fhandle_t *fhp;
1291 	int error;
1292 
1293 	*fhpp = NULL;
1294 	if (fhsize > FHANDLE_SIZE_MAX) {
1295 		return EINVAL;
1296 	}
1297 	if (fhsize < FHANDLE_SIZE_MIN) {
1298 		return EINVAL;
1299 	}
1300 again:
1301 	fhp = kmem_alloc(fhsize, KM_SLEEP);
1302 	if (fhp == NULL) {
1303 		return ENOMEM;
1304 	}
1305 	error = copyin(ufhp, fhp, fhsize);
1306 	if (error == 0) {
1307 		/* XXX this check shouldn't be here */
1308 		if (FHANDLE_SIZE(fhp) == fhsize) {
1309 			*fhpp = fhp;
1310 			return 0;
1311 		} else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
1312 			/*
1313 			 * a kludge for nfsv2 padded handles.
1314 			 */
1315 			size_t sz;
1316 
1317 			sz = FHANDLE_SIZE(fhp);
1318 			kmem_free(fhp, fhsize);
1319 			fhsize = sz;
1320 			goto again;
1321 		} else {
1322 			/*
1323 			 * userland told us wrong size.
1324 			 */
1325 		    	error = EINVAL;
1326 		}
1327 	}
1328 	kmem_free(fhp, fhsize);
1329 	return error;
1330 }
1331 
1332 void
1333 vfs_copyinfh_free(fhandle_t *fhp)
1334 {
1335 
1336 	vfs__fhfree(fhp);
1337 }
1338 
1339 /*
1340  * Get file handle system call
1341  */
1342 int
1343 sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval)
1344 {
1345 	/* {
1346 		syscallarg(char *) fname;
1347 		syscallarg(fhandle_t *) fhp;
1348 		syscallarg(size_t *) fh_size;
1349 	} */
1350 	struct vnode *vp;
1351 	fhandle_t *fh;
1352 	int error;
1353 	struct pathbuf *pb;
1354 	struct nameidata nd;
1355 	size_t sz;
1356 	size_t usz;
1357 
1358 	/*
1359 	 * Must be super user
1360 	 */
1361 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1362 	    0, NULL, NULL, NULL);
1363 	if (error)
1364 		return (error);
1365 
1366 	error = pathbuf_copyin(SCARG(uap, fname), &pb);
1367 	if (error) {
1368 		return error;
1369 	}
1370 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
1371 	error = namei(&nd);
1372 	if (error) {
1373 		pathbuf_destroy(pb);
1374 		return error;
1375 	}
1376 	vp = nd.ni_vp;
1377 	pathbuf_destroy(pb);
1378 
1379 	error = vfs_composefh_alloc(vp, &fh);
1380 	vput(vp);
1381 	if (error != 0) {
1382 		goto out;
1383 	}
1384 	error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
1385 	if (error != 0) {
1386 		goto out;
1387 	}
1388 	sz = FHANDLE_SIZE(fh);
1389 	error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
1390 	if (error != 0) {
1391 		goto out;
1392 	}
1393 	if (usz >= sz) {
1394 		error = copyout(fh, SCARG(uap, fhp), sz);
1395 	} else {
1396 		error = E2BIG;
1397 	}
1398 out:
1399 	vfs_composefh_free(fh);
1400 	return (error);
1401 }
1402 
1403 /*
1404  * Open a file given a file handle.
1405  *
1406  * Check permissions, allocate an open file structure,
1407  * and call the device open routine if any.
1408  */
1409 
1410 int
1411 dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
1412     register_t *retval)
1413 {
1414 	file_t *fp;
1415 	struct vnode *vp = NULL;
1416 	kauth_cred_t cred = l->l_cred;
1417 	file_t *nfp;
1418 	int indx, error = 0;
1419 	struct vattr va;
1420 	fhandle_t *fh;
1421 	int flags;
1422 	proc_t *p;
1423 
1424 	p = curproc;
1425 
1426 	/*
1427 	 * Must be super user
1428 	 */
1429 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1430 	    0, NULL, NULL, NULL)))
1431 		return (error);
1432 
1433 	flags = FFLAGS(oflags);
1434 	if ((flags & (FREAD | FWRITE)) == 0)
1435 		return (EINVAL);
1436 	if ((flags & O_CREAT))
1437 		return (EINVAL);
1438 	if ((error = fd_allocfile(&nfp, &indx)) != 0)
1439 		return (error);
1440 	fp = nfp;
1441 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1442 	if (error != 0) {
1443 		goto bad;
1444 	}
1445 	error = vfs_fhtovp(fh, &vp);
1446 	if (error != 0) {
1447 		goto bad;
1448 	}
1449 
1450 	/* Now do an effective vn_open */
1451 
1452 	if (vp->v_type == VSOCK) {
1453 		error = EOPNOTSUPP;
1454 		goto bad;
1455 	}
1456 	error = vn_openchk(vp, cred, flags);
1457 	if (error != 0)
1458 		goto bad;
1459 	if (flags & O_TRUNC) {
1460 		VOP_UNLOCK(vp);			/* XXX */
1461 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
1462 		vattr_null(&va);
1463 		va.va_size = 0;
1464 		error = VOP_SETATTR(vp, &va, cred);
1465 		if (error)
1466 			goto bad;
1467 	}
1468 	if ((error = VOP_OPEN(vp, flags, cred)) != 0)
1469 		goto bad;
1470 	if (flags & FWRITE) {
1471 		mutex_enter(vp->v_interlock);
1472 		vp->v_writecount++;
1473 		mutex_exit(vp->v_interlock);
1474 	}
1475 
1476 	/* done with modified vn_open, now finish what sys_open does. */
1477 	if ((error = open_setfp(l, fp, vp, indx, flags)))
1478 		return error;
1479 
1480 	VOP_UNLOCK(vp);
1481 	*retval = indx;
1482 	fd_affix(p, fp, indx);
1483 	vfs_copyinfh_free(fh);
1484 	return (0);
1485 
1486 bad:
1487 	fd_abort(p, fp, indx);
1488 	if (vp != NULL)
1489 		vput(vp);
1490 	vfs_copyinfh_free(fh);
1491 	return (error);
1492 }
1493 
1494 int
1495 sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval)
1496 {
1497 	/* {
1498 		syscallarg(const void *) fhp;
1499 		syscallarg(size_t) fh_size;
1500 		syscallarg(int) flags;
1501 	} */
1502 
1503 	return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
1504 	    SCARG(uap, flags), retval);
1505 }
1506 
1507 int
1508 do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
1509 {
1510 	int error;
1511 	fhandle_t *fh;
1512 	struct vnode *vp;
1513 
1514 	/*
1515 	 * Must be super user
1516 	 */
1517 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1518 	    0, NULL, NULL, NULL)))
1519 		return (error);
1520 
1521 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1522 	if (error != 0)
1523 		return error;
1524 
1525 	error = vfs_fhtovp(fh, &vp);
1526 	vfs_copyinfh_free(fh);
1527 	if (error != 0)
1528 		return error;
1529 
1530 	error = vn_stat(vp, sb);
1531 	vput(vp);
1532 	return error;
1533 }
1534 
1535 
1536 /* ARGSUSED */
1537 int
1538 sys___fhstat50(struct lwp *l, const struct sys___fhstat50_args *uap, register_t *retval)
1539 {
1540 	/* {
1541 		syscallarg(const void *) fhp;
1542 		syscallarg(size_t) fh_size;
1543 		syscallarg(struct stat *) sb;
1544 	} */
1545 	struct stat sb;
1546 	int error;
1547 
1548 	error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
1549 	if (error)
1550 		return error;
1551 	return copyout(&sb, SCARG(uap, sb), sizeof(sb));
1552 }
1553 
1554 int
1555 do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb,
1556     int flags)
1557 {
1558 	fhandle_t *fh;
1559 	struct mount *mp;
1560 	struct vnode *vp;
1561 	int error;
1562 
1563 	/*
1564 	 * Must be super user
1565 	 */
1566 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1567 	    0, NULL, NULL, NULL)))
1568 		return error;
1569 
1570 	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1571 	if (error != 0)
1572 		return error;
1573 
1574 	error = vfs_fhtovp(fh, &vp);
1575 	vfs_copyinfh_free(fh);
1576 	if (error != 0)
1577 		return error;
1578 
1579 	mp = vp->v_mount;
1580 	error = dostatvfs(mp, sb, l, flags, 1);
1581 	vput(vp);
1582 	return error;
1583 }
1584 
1585 /* ARGSUSED */
1586 int
1587 sys___fhstatvfs140(struct lwp *l, const struct sys___fhstatvfs140_args *uap, register_t *retval)
1588 {
1589 	/* {
1590 		syscallarg(const void *) fhp;
1591 		syscallarg(size_t) fh_size;
1592 		syscallarg(struct statvfs *) buf;
1593 		syscallarg(int)	flags;
1594 	} */
1595 	struct statvfs *sb = STATVFSBUF_GET();
1596 	int error;
1597 
1598 	error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
1599 	    SCARG(uap, flags));
1600 	if (error == 0)
1601 		error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1602 	STATVFSBUF_PUT(sb);
1603 	return error;
1604 }
1605 
1606 /*
1607  * Create a special file.
1608  */
1609 /* ARGSUSED */
1610 int
1611 sys___mknod50(struct lwp *l, const struct sys___mknod50_args *uap,
1612     register_t *retval)
1613 {
1614 	/* {
1615 		syscallarg(const char *) path;
1616 		syscallarg(mode_t) mode;
1617 		syscallarg(dev_t) dev;
1618 	} */
1619 	return do_sys_mknod(l, SCARG(uap, path), SCARG(uap, mode),
1620 	    SCARG(uap, dev), retval, UIO_USERSPACE);
1621 }
1622 
1623 int
1624 sys_mknodat(struct lwp *l, const struct sys_mknodat_args *uap,
1625     register_t *retval)
1626 {
1627 	/* {
1628 		syscallarg(int) fd;
1629 		syscallarg(const char *) path;
1630 		syscallarg(mode_t) mode;
1631 		syscallarg(uint32_t) dev;
1632 	} */
1633 
1634 	return ENOSYS;
1635 }
1636 
/*
 * Common back end for mknod: create the node "pathname" of the type
 * selected by (mode & S_IFMT).
 *
 * pathname/seg are handed to pathbuf_maybe_copyin(); retval is not
 * used by this function.  Returns 0 or an errno value: EEXIST if the
 * name already exists, EINVAL for an unsupported file type or when the
 * VOP_MKNOD path is taken with a device number of VNOVAL.
 */
int
do_sys_mknod(struct lwp *l, const char *pathname, mode_t mode, dev_t dev,
    register_t *retval, enum uio_seg seg)
{
	struct proc *p = l->l_proc;
	struct vnode *vp;
	struct vattr vattr;
	int error, optype;
	struct pathbuf *pb;
	struct nameidata nd;
	const char *pathstring;

	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
	    0, NULL, NULL, NULL)) != 0)
		return (error);

	/* Default operation; the switch below may select another one. */
	optype = VOP_MKNOD_DESCOFFSET;

	error = pathbuf_maybe_copyin(pathname, seg, &pb);
	if (error) {
		return error;
	}
	/* String form of the path; only used by the veriexec check below. */
	pathstring = pathbuf_stringcopy_get(pb);
	if (pathstring == NULL) {
		pathbuf_destroy(pb);
		return ENOMEM;
	}

	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
	if ((error = namei(&nd)) != 0)
		goto out;
	vp = nd.ni_vp;

	if (vp != NULL)
		error = EEXIST;
	else {
		vattr_null(&vattr);
		/* We will read cwdi->cwdi_cmask unlocked. */
		vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
		vattr.va_rdev = dev;

		/* Map the requested file type to a vnode type and operation. */
		switch (mode & S_IFMT) {
		case S_IFMT:	/* used by badsect to flag bad sectors */
			vattr.va_type = VBAD;
			break;
		case S_IFCHR:
			vattr.va_type = VCHR;
			break;
		case S_IFBLK:
			vattr.va_type = VBLK;
			break;
		case S_IFWHT:
			optype = VOP_WHITEOUT_DESCOFFSET;
			break;
		case S_IFREG:
#if NVERIEXEC > 0
			error = veriexec_openchk(l, nd.ni_vp, pathstring,
			    O_CREAT);
#endif /* NVERIEXEC > 0 */
			/* Regular files go through VOP_CREATE with no device. */
			vattr.va_type = VREG;
			vattr.va_rdev = VNOVAL;
			optype = VOP_CREATE_DESCOFFSET;
			break;
		default:
			error = EINVAL;
			break;
		}
	}
	/* A node made through VOP_MKNOD needs a real device number. */
	if (error == 0 && optype == VOP_MKNOD_DESCOFFSET
	    && vattr.va_rdev == VNOVAL)
		error = EINVAL;
	if (!error) {
		switch (optype) {
		case VOP_WHITEOUT_DESCOFFSET:
			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
			if (error)
				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
			/* The parent directory is released here in this case. */
			vput(nd.ni_dvp);
			break;

		case VOP_MKNOD_DESCOFFSET:
			/*
			 * nd.ni_dvp is not released by this function after
			 * the call (presumably VOP_MKNOD does so -- confirm).
			 */
			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
						&nd.ni_cnd, &vattr);
			if (error == 0)
				vput(nd.ni_vp);
			break;

		case VOP_CREATE_DESCOFFSET:
			/* Same release pattern as the VOP_MKNOD case. */
			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
						&nd.ni_cnd, &vattr);
			if (error == 0)
				vput(nd.ni_vp);
			break;
		}
	} else {
		/* Nothing will be created: abort and drop parent and target. */
		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		if (vp)
			vrele(vp);
	}
out:
	pathbuf_stringcopy_put(pb, pathstring);
	pathbuf_destroy(pb);
	return (error);
}
1745 
1746 /*
1747  * Create a named pipe.
1748  */
1749 /* ARGSUSED */
1750 int
1751 sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval)
1752 {
1753 	/* {
1754 		syscallarg(const char *) path;
1755 		syscallarg(int) mode;
1756 	} */
1757 	struct proc *p = l->l_proc;
1758 	struct vattr vattr;
1759 	int error;
1760 	struct pathbuf *pb;
1761 	struct nameidata nd;
1762 
1763 	error = pathbuf_copyin(SCARG(uap, path), &pb);
1764 	if (error) {
1765 		return error;
1766 	}
1767 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
1768 	if ((error = namei(&nd)) != 0) {
1769 		pathbuf_destroy(pb);
1770 		return error;
1771 	}
1772 	if (nd.ni_vp != NULL) {
1773 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1774 		if (nd.ni_dvp == nd.ni_vp)
1775 			vrele(nd.ni_dvp);
1776 		else
1777 			vput(nd.ni_dvp);
1778 		vrele(nd.ni_vp);
1779 		pathbuf_destroy(pb);
1780 		return (EEXIST);
1781 	}
1782 	vattr_null(&vattr);
1783 	vattr.va_type = VFIFO;
1784 	/* We will read cwdi->cwdi_cmask unlocked. */
1785 	vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
1786 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1787 	if (error == 0)
1788 		vput(nd.ni_vp);
1789 	pathbuf_destroy(pb);
1790 	return (error);
1791 }
1792 
1793 int
1794 sys_mkfifoat(struct lwp *l, const struct sys_mkfifoat_args *uap,
1795     register_t *retval)
1796 {
1797 	/* {
1798 		syscallarg(int) fd;
1799 		syscallarg(const char *) path;
1800 		syscallarg(int) mode;
1801 	} */
1802 
1803 	return ENOSYS;
1804 }
1805 /*
1806  * Make a hard file link.
1807  */
1808 /* ARGSUSED */
1809 static int
1810 do_sys_link(struct lwp *l, const char *path, const char *link,
1811 	    int follow, register_t *retval)
1812 {
1813 	struct vnode *vp;
1814 	struct pathbuf *linkpb;
1815 	struct nameidata nd;
1816 	namei_simple_flags_t namei_simple_flags;
1817 	int error;
1818 
1819 	if (follow)
1820 		namei_simple_flags = NSM_FOLLOW_TRYEMULROOT;
1821 	else
1822 		namei_simple_flags =  NSM_NOFOLLOW_TRYEMULROOT;
1823 
1824 	error = namei_simple_user(path, namei_simple_flags, &vp);
1825 	if (error != 0)
1826 		return (error);
1827 	error = pathbuf_copyin(link, &linkpb);
1828 	if (error) {
1829 		goto out1;
1830 	}
1831 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
1832 	if ((error = namei(&nd)) != 0)
1833 		goto out2;
1834 	if (nd.ni_vp) {
1835 		error = EEXIST;
1836 		goto abortop;
1837 	}
1838 	/* Prevent hard links on directories. */
1839 	if (vp->v_type == VDIR) {
1840 		error = EPERM;
1841 		goto abortop;
1842 	}
1843 	/* Prevent cross-mount operation. */
1844 	if (nd.ni_dvp->v_mount != vp->v_mount) {
1845 		error = EXDEV;
1846 		goto abortop;
1847 	}
1848 	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1849 out2:
1850 	pathbuf_destroy(linkpb);
1851 out1:
1852 	vrele(vp);
1853 	return (error);
1854 abortop:
1855 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1856 	if (nd.ni_dvp == nd.ni_vp)
1857 		vrele(nd.ni_dvp);
1858 	else
1859 		vput(nd.ni_dvp);
1860 	if (nd.ni_vp != NULL)
1861 		vrele(nd.ni_vp);
1862 	goto out2;
1863 }
1864 
1865 int
1866 sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
1867 {
1868 	/* {
1869 		syscallarg(const char *) path;
1870 		syscallarg(const char *) link;
1871 	} */
1872 	const char *path = SCARG(uap, path);
1873 	const char *link = SCARG(uap, link);
1874 
1875 	return do_sys_link(l, path, link, 1, retval);
1876 }
1877 
1878 int
1879 sys_linkat(struct lwp *l, const struct sys_linkat_args *uap,
1880     register_t *retval)
1881 {
1882 	/* {
1883 		syscallarg(int) fd1;
1884 		syscallarg(const char *) name1;
1885 		syscallarg(int) fd2;
1886 		syscallarg(const char *) name2;
1887 		syscallarg(int) flags;
1888 	} */
1889 	const char *name1 = SCARG(uap, name1);
1890 	const char *name2 = SCARG(uap, name2);
1891 	int follow;
1892 
1893 	/*
1894 	 * Specified fd1 and fd2 are not yet implemented
1895 	 */
1896 	if ((SCARG(uap, fd1) != AT_FDCWD) || (SCARG(uap, fd2) != AT_FDCWD))
1897 		return ENOSYS;
1898 
1899 	follow = SCARG(uap, flags) & AT_SYMLINK_FOLLOW;
1900 
1901 	return do_sys_link(l, name1, name2, follow, retval);
1902 }
1903 
1904 
1905 int
1906 do_sys_symlink(const char *patharg, const char *link, enum uio_seg seg)
1907 {
1908 	struct proc *p = curproc;
1909 	struct vattr vattr;
1910 	char *path;
1911 	int error;
1912 	struct pathbuf *linkpb;
1913 	struct nameidata nd;
1914 
1915 	path = PNBUF_GET();
1916 	if (seg == UIO_USERSPACE) {
1917 		if ((error = copyinstr(patharg, path, MAXPATHLEN, NULL)) != 0)
1918 			goto out1;
1919 		if ((error = pathbuf_copyin(link, &linkpb)) != 0)
1920 			goto out1;
1921 	} else {
1922 		KASSERT(strlen(patharg) < MAXPATHLEN);
1923 		strcpy(path, patharg);
1924 		linkpb = pathbuf_create(link);
1925 		if (linkpb == NULL) {
1926 			error = ENOMEM;
1927 			goto out1;
1928 		}
1929 	}
1930 	ktrkuser("symlink-target", path, strlen(path));
1931 
1932 	NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
1933 	if ((error = namei(&nd)) != 0)
1934 		goto out2;
1935 	if (nd.ni_vp) {
1936 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1937 		if (nd.ni_dvp == nd.ni_vp)
1938 			vrele(nd.ni_dvp);
1939 		else
1940 			vput(nd.ni_dvp);
1941 		vrele(nd.ni_vp);
1942 		error = EEXIST;
1943 		goto out2;
1944 	}
1945 	vattr_null(&vattr);
1946 	vattr.va_type = VLNK;
1947 	/* We will read cwdi->cwdi_cmask unlocked. */
1948 	vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
1949 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
1950 	if (error == 0)
1951 		vput(nd.ni_vp);
1952 out2:
1953 	pathbuf_destroy(linkpb);
1954 out1:
1955 	PNBUF_PUT(path);
1956 	return (error);
1957 }
1958 
1959 /*
1960  * Make a symbolic link.
1961  */
1962 /* ARGSUSED */
1963 int
1964 sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
1965 {
1966 	/* {
1967 		syscallarg(const char *) path;
1968 		syscallarg(const char *) link;
1969 	} */
1970 
1971 	return do_sys_symlink(SCARG(uap, path), SCARG(uap, link),
1972 	    UIO_USERSPACE);
1973 }
1974 
1975 int
1976 sys_symlinkat(struct lwp *l, const struct sys_symlinkat_args *uap,
1977     register_t *retval)
1978 {
1979 	/* {
1980 		syscallarg(int) fd;
1981 		syscallarg(const char *) path;
1982 		syscallarg(const char *) link;
1983 	} */
1984 
1985 	return ENOSYS;
1986 }
1987 
1988 /*
1989  * Delete a whiteout from the filesystem.
1990  */
1991 /* ARGSUSED */
1992 int
1993 sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval)
1994 {
1995 	/* {
1996 		syscallarg(const char *) path;
1997 	} */
1998 	int error;
1999 	struct pathbuf *pb;
2000 	struct nameidata nd;
2001 
2002 	error = pathbuf_copyin(SCARG(uap, path), &pb);
2003 	if (error) {
2004 		return error;
2005 	}
2006 
2007 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT, pb);
2008 	error = namei(&nd);
2009 	if (error) {
2010 		pathbuf_destroy(pb);
2011 		return (error);
2012 	}
2013 
2014 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2015 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2016 		if (nd.ni_dvp == nd.ni_vp)
2017 			vrele(nd.ni_dvp);
2018 		else
2019 			vput(nd.ni_dvp);
2020 		if (nd.ni_vp)
2021 			vrele(nd.ni_vp);
2022 		pathbuf_destroy(pb);
2023 		return (EEXIST);
2024 	}
2025 	if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
2026 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2027 	vput(nd.ni_dvp);
2028 	pathbuf_destroy(pb);
2029 	return (error);
2030 }
2031 
2032 /*
2033  * Delete a name from the filesystem.
2034  */
2035 /* ARGSUSED */
2036 int
2037 sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval)
2038 {
2039 	/* {
2040 		syscallarg(const char *) path;
2041 	} */
2042 
2043 	return do_sys_unlink(SCARG(uap, path), UIO_USERSPACE);
2044 }
2045 
2046 int
2047 sys_unlinkat(struct lwp *l, const struct sys_unlinkat_args *uap,
2048     register_t *retval)
2049 {
2050 	/* {
2051 		syscallarg(int) fd;
2052 		syscallarg(const char *) path;
2053 	} */
2054 
2055 	return ENOSYS;
2056 }
2057 
2058 int
2059 do_sys_unlink(const char *arg, enum uio_seg seg)
2060 {
2061 	struct vnode *vp;
2062 	int error;
2063 	struct pathbuf *pb;
2064 	struct nameidata nd;
2065 	const char *pathstring;
2066 
2067 	error = pathbuf_maybe_copyin(arg, seg, &pb);
2068 	if (error) {
2069 		return error;
2070 	}
2071 	pathstring = pathbuf_stringcopy_get(pb);
2072 	if (pathstring == NULL) {
2073 		pathbuf_destroy(pb);
2074 		return ENOMEM;
2075 	}
2076 
2077 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, pb);
2078 	if ((error = namei(&nd)) != 0)
2079 		goto out;
2080 	vp = nd.ni_vp;
2081 
2082 	/*
2083 	 * The root of a mounted filesystem cannot be deleted.
2084 	 */
2085 	if (vp->v_vflag & VV_ROOT) {
2086 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2087 		if (nd.ni_dvp == vp)
2088 			vrele(nd.ni_dvp);
2089 		else
2090 			vput(nd.ni_dvp);
2091 		vput(vp);
2092 		error = EBUSY;
2093 		goto out;
2094 	}
2095 
2096 #if NVERIEXEC > 0
2097 	/* Handle remove requests for veriexec entries. */
2098 	if ((error = veriexec_removechk(curlwp, nd.ni_vp, pathstring)) != 0) {
2099 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2100 		if (nd.ni_dvp == vp)
2101 			vrele(nd.ni_dvp);
2102 		else
2103 			vput(nd.ni_dvp);
2104 		vput(vp);
2105 		goto out;
2106 	}
2107 #endif /* NVERIEXEC > 0 */
2108 
2109 #ifdef FILEASSOC
2110 	(void)fileassoc_file_delete(vp);
2111 #endif /* FILEASSOC */
2112 	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2113 out:
2114 	pathbuf_stringcopy_put(pb, pathstring);
2115 	pathbuf_destroy(pb);
2116 	return (error);
2117 }
2118 
2119 /*
2120  * Reposition read/write file offset.
2121  */
2122 int
2123 sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
2124 {
2125 	/* {
2126 		syscallarg(int) fd;
2127 		syscallarg(int) pad;
2128 		syscallarg(off_t) offset;
2129 		syscallarg(int) whence;
2130 	} */
2131 	kauth_cred_t cred = l->l_cred;
2132 	file_t *fp;
2133 	struct vnode *vp;
2134 	struct vattr vattr;
2135 	off_t newoff;
2136 	int error, fd;
2137 
2138 	fd = SCARG(uap, fd);
2139 
2140 	if ((fp = fd_getfile(fd)) == NULL)
2141 		return (EBADF);
2142 
2143 	vp = fp->f_data;
2144 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2145 		error = ESPIPE;
2146 		goto out;
2147 	}
2148 
2149 	switch (SCARG(uap, whence)) {
2150 	case SEEK_CUR:
2151 		newoff = fp->f_offset + SCARG(uap, offset);
2152 		break;
2153 	case SEEK_END:
2154 		vn_lock(vp, LK_SHARED | LK_RETRY);
2155 		error = VOP_GETATTR(vp, &vattr, cred);
2156 		VOP_UNLOCK(vp);
2157 		if (error) {
2158 			goto out;
2159 		}
2160 		newoff = SCARG(uap, offset) + vattr.va_size;
2161 		break;
2162 	case SEEK_SET:
2163 		newoff = SCARG(uap, offset);
2164 		break;
2165 	default:
2166 		error = EINVAL;
2167 		goto out;
2168 	}
2169 	if ((error = VOP_SEEK(vp, fp->f_offset, newoff, cred)) == 0) {
2170 		*(off_t *)retval = fp->f_offset = newoff;
2171 	}
2172  out:
2173  	fd_putfile(fd);
2174 	return (error);
2175 }
2176 
2177 /*
2178  * Positional read system call.
2179  */
2180 int
2181 sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
2182 {
2183 	/* {
2184 		syscallarg(int) fd;
2185 		syscallarg(void *) buf;
2186 		syscallarg(size_t) nbyte;
2187 		syscallarg(off_t) offset;
2188 	} */
2189 	file_t *fp;
2190 	struct vnode *vp;
2191 	off_t offset;
2192 	int error, fd = SCARG(uap, fd);
2193 
2194 	if ((fp = fd_getfile(fd)) == NULL)
2195 		return (EBADF);
2196 
2197 	if ((fp->f_flag & FREAD) == 0) {
2198 		fd_putfile(fd);
2199 		return (EBADF);
2200 	}
2201 
2202 	vp = fp->f_data;
2203 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2204 		error = ESPIPE;
2205 		goto out;
2206 	}
2207 
2208 	offset = SCARG(uap, offset);
2209 
2210 	/*
2211 	 * XXX This works because no file systems actually
2212 	 * XXX take any action on the seek operation.
2213 	 */
2214 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2215 		goto out;
2216 
2217 	/* dofileread() will unuse the descriptor for us */
2218 	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2219 	    &offset, 0, retval));
2220 
2221  out:
2222 	fd_putfile(fd);
2223 	return (error);
2224 }
2225 
2226 /*
2227  * Positional scatter read system call.
2228  */
2229 int
2230 sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval)
2231 {
2232 	/* {
2233 		syscallarg(int) fd;
2234 		syscallarg(const struct iovec *) iovp;
2235 		syscallarg(int) iovcnt;
2236 		syscallarg(off_t) offset;
2237 	} */
2238 	off_t offset = SCARG(uap, offset);
2239 
2240 	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
2241 	    SCARG(uap, iovcnt), &offset, 0, retval);
2242 }
2243 
2244 /*
2245  * Positional write system call.
2246  */
2247 int
2248 sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval)
2249 {
2250 	/* {
2251 		syscallarg(int) fd;
2252 		syscallarg(const void *) buf;
2253 		syscallarg(size_t) nbyte;
2254 		syscallarg(off_t) offset;
2255 	} */
2256 	file_t *fp;
2257 	struct vnode *vp;
2258 	off_t offset;
2259 	int error, fd = SCARG(uap, fd);
2260 
2261 	if ((fp = fd_getfile(fd)) == NULL)
2262 		return (EBADF);
2263 
2264 	if ((fp->f_flag & FWRITE) == 0) {
2265 		fd_putfile(fd);
2266 		return (EBADF);
2267 	}
2268 
2269 	vp = fp->f_data;
2270 	if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
2271 		error = ESPIPE;
2272 		goto out;
2273 	}
2274 
2275 	offset = SCARG(uap, offset);
2276 
2277 	/*
2278 	 * XXX This works because no file systems actually
2279 	 * XXX take any action on the seek operation.
2280 	 */
2281 	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0)
2282 		goto out;
2283 
2284 	/* dofilewrite() will unuse the descriptor for us */
2285 	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2286 	    &offset, 0, retval));
2287 
2288  out:
2289 	fd_putfile(fd);
2290 	return (error);
2291 }
2292 
2293 /*
2294  * Positional gather write system call.
2295  */
2296 int
2297 sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval)
2298 {
2299 	/* {
2300 		syscallarg(int) fd;
2301 		syscallarg(const struct iovec *) iovp;
2302 		syscallarg(int) iovcnt;
2303 		syscallarg(off_t) offset;
2304 	} */
2305 	off_t offset = SCARG(uap, offset);
2306 
2307 	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
2308 	    SCARG(uap, iovcnt), &offset, 0, retval);
2309 }
2310 
2311 /*
2312  * Check access permissions.
2313  */
2314 int
2315 sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval)
2316 {
2317 	/* {
2318 		syscallarg(const char *) path;
2319 		syscallarg(int) flags;
2320 	} */
2321 	kauth_cred_t cred;
2322 	struct vnode *vp;
2323 	int error, flags;
2324 	struct pathbuf *pb;
2325 	struct nameidata nd;
2326 
2327 	CTASSERT(F_OK == 0);
2328 	if ((SCARG(uap, flags) & ~(R_OK | W_OK | X_OK)) != 0) {
2329 		/* nonsense flags */
2330 		return EINVAL;
2331 	}
2332 
2333 	error = pathbuf_copyin(SCARG(uap, path), &pb);
2334 	if (error) {
2335 		return error;
2336 	}
2337 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
2338 
2339 	/* Override default credentials */
2340 	cred = kauth_cred_dup(l->l_cred);
2341 	kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
2342 	kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
2343 	nd.ni_cnd.cn_cred = cred;
2344 
2345 	if ((error = namei(&nd)) != 0) {
2346 		pathbuf_destroy(pb);
2347 		goto out;
2348 	}
2349 	vp = nd.ni_vp;
2350 	pathbuf_destroy(pb);
2351 
2352 	/* Flags == 0 means only check for existence. */
2353 	if (SCARG(uap, flags)) {
2354 		flags = 0;
2355 		if (SCARG(uap, flags) & R_OK)
2356 			flags |= VREAD;
2357 		if (SCARG(uap, flags) & W_OK)
2358 			flags |= VWRITE;
2359 		if (SCARG(uap, flags) & X_OK)
2360 			flags |= VEXEC;
2361 
2362 		error = VOP_ACCESS(vp, flags, cred);
2363 		if (!error && (flags & VWRITE))
2364 			error = vn_writechk(vp);
2365 	}
2366 	vput(vp);
2367 out:
2368 	kauth_cred_free(cred);
2369 	return (error);
2370 }
2371 
2372 int
2373 sys_faccessat(struct lwp *l, const struct sys_faccessat_args *uap,
2374     register_t *retval)
2375 {
2376 	/* {
2377 		syscallarg(int) fd;
2378 		syscallarg(const char *) path;
2379 		syscallarg(int) amode;
2380 		syscallarg(int) flag;
2381 	} */
2382 
2383 	return ENOSYS;
2384 }
2385 
2386 /*
2387  * Common code for all sys_stat functions, including compat versions.
2388  */
2389 int
2390 do_sys_stat(const char *userpath, unsigned int nd_flags, struct stat *sb)
2391 {
2392 	int error;
2393 	struct pathbuf *pb;
2394 	struct nameidata nd;
2395 
2396 	error = pathbuf_copyin(userpath, &pb);
2397 	if (error) {
2398 		return error;
2399 	}
2400 	NDINIT(&nd, LOOKUP, nd_flags | LOCKLEAF | TRYEMULROOT, pb);
2401 	error = namei(&nd);
2402 	if (error != 0) {
2403 		pathbuf_destroy(pb);
2404 		return error;
2405 	}
2406 	error = vn_stat(nd.ni_vp, sb);
2407 	vput(nd.ni_vp);
2408 	pathbuf_destroy(pb);
2409 	return error;
2410 }
2411 
2412 /*
2413  * Get file status; this version follows links.
2414  */
2415 /* ARGSUSED */
2416 int
2417 sys___stat50(struct lwp *l, const struct sys___stat50_args *uap, register_t *retval)
2418 {
2419 	/* {
2420 		syscallarg(const char *) path;
2421 		syscallarg(struct stat *) ub;
2422 	} */
2423 	struct stat sb;
2424 	int error;
2425 
2426 	error = do_sys_stat(SCARG(uap, path), FOLLOW, &sb);
2427 	if (error)
2428 		return error;
2429 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
2430 }
2431 
2432 /*
2433  * Get file status; this version does not follow links.
2434  */
2435 /* ARGSUSED */
2436 int
2437 sys___lstat50(struct lwp *l, const struct sys___lstat50_args *uap, register_t *retval)
2438 {
2439 	/* {
2440 		syscallarg(const char *) path;
2441 		syscallarg(struct stat *) ub;
2442 	} */
2443 	struct stat sb;
2444 	int error;
2445 
2446 	error = do_sys_stat(SCARG(uap, path), NOFOLLOW, &sb);
2447 	if (error)
2448 		return error;
2449 	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
2450 }
2451 
2452 int
2453 sys_fstatat(struct lwp *l, const struct sys_fstatat_args *uap,
2454     register_t *retval)
2455 {
2456 	/* {
2457 		syscallarg(int) fd;
2458 		syscallarg(const char *) path;
2459 		syscallarg(struct stat *) ub;
2460 		syscallarg(int) flag;
2461 	} */
2462 
2463 	return ENOSYS;
2464 }
2465 /*
2466  * Get configurable pathname variables.
2467  */
2468 /* ARGSUSED */
2469 int
2470 sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap, register_t *retval)
2471 {
2472 	/* {
2473 		syscallarg(const char *) path;
2474 		syscallarg(int) name;
2475 	} */
2476 	int error;
2477 	struct pathbuf *pb;
2478 	struct nameidata nd;
2479 
2480 	error = pathbuf_copyin(SCARG(uap, path), &pb);
2481 	if (error) {
2482 		return error;
2483 	}
2484 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
2485 	if ((error = namei(&nd)) != 0) {
2486 		pathbuf_destroy(pb);
2487 		return (error);
2488 	}
2489 	error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), retval);
2490 	vput(nd.ni_vp);
2491 	pathbuf_destroy(pb);
2492 	return (error);
2493 }
2494 
2495 /*
2496  * Return target name of a symbolic link.
2497  */
2498 /* ARGSUSED */
2499 int
2500 sys_readlink(struct lwp *l, const struct sys_readlink_args *uap, register_t *retval)
2501 {
2502 	/* {
2503 		syscallarg(const char *) path;
2504 		syscallarg(char *) buf;
2505 		syscallarg(size_t) count;
2506 	} */
2507 	struct vnode *vp;
2508 	struct iovec aiov;
2509 	struct uio auio;
2510 	int error;
2511 	struct pathbuf *pb;
2512 	struct nameidata nd;
2513 
2514 	error = pathbuf_copyin(SCARG(uap, path), &pb);
2515 	if (error) {
2516 		return error;
2517 	}
2518 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb);
2519 	if ((error = namei(&nd)) != 0) {
2520 		pathbuf_destroy(pb);
2521 		return error;
2522 	}
2523 	vp = nd.ni_vp;
2524 	pathbuf_destroy(pb);
2525 	if (vp->v_type != VLNK)
2526 		error = EINVAL;
2527 	else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
2528 	    (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
2529 		aiov.iov_base = SCARG(uap, buf);
2530 		aiov.iov_len = SCARG(uap, count);
2531 		auio.uio_iov = &aiov;
2532 		auio.uio_iovcnt = 1;
2533 		auio.uio_offset = 0;
2534 		auio.uio_rw = UIO_READ;
2535 		KASSERT(l == curlwp);
2536 		auio.uio_vmspace = l->l_proc->p_vmspace;
2537 		auio.uio_resid = SCARG(uap, count);
2538 		error = VOP_READLINK(vp, &auio, l->l_cred);
2539 	}
2540 	vput(vp);
2541 	*retval = SCARG(uap, count) - auio.uio_resid;
2542 	return (error);
2543 }
2544 
2545 int
2546 sys_readlinkat(struct lwp *l, const struct sys_readlinkat_args *uap,
2547     register_t *retval)
2548 {
2549 	/* {
2550 		syscallarg(int) fd;
2551 		syscallarg(const char *) path;
2552 		syscallarg(char *) buf;
2553 		syscallarg(size_t) count;
2554 	} */
2555 
2556 	return ENOSYS;
2557 }
2558 
2559 /*
2560  * Change flags of a file given a path name.
2561  */
2562 /* ARGSUSED */
2563 int
2564 sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval)
2565 {
2566 	/* {
2567 		syscallarg(const char *) path;
2568 		syscallarg(u_long) flags;
2569 	} */
2570 	struct vnode *vp;
2571 	int error;
2572 
2573 	error = namei_simple_user(SCARG(uap, path),
2574 				NSM_FOLLOW_TRYEMULROOT, &vp);
2575 	if (error != 0)
2576 		return (error);
2577 	error = change_flags(vp, SCARG(uap, flags), l);
2578 	vput(vp);
2579 	return (error);
2580 }
2581 
2582 /*
2583  * Change flags of a file given a file descriptor.
2584  */
2585 /* ARGSUSED */
2586 int
2587 sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval)
2588 {
2589 	/* {
2590 		syscallarg(int) fd;
2591 		syscallarg(u_long) flags;
2592 	} */
2593 	struct vnode *vp;
2594 	file_t *fp;
2595 	int error;
2596 
2597 	/* fd_getvnode() will use the descriptor for us */
2598 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2599 		return (error);
2600 	vp = fp->f_data;
2601 	error = change_flags(vp, SCARG(uap, flags), l);
2602 	VOP_UNLOCK(vp);
2603 	fd_putfile(SCARG(uap, fd));
2604 	return (error);
2605 }
2606 
2607 /*
2608  * Change flags of a file given a path name; this version does
2609  * not follow links.
2610  */
2611 int
2612 sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval)
2613 {
2614 	/* {
2615 		syscallarg(const char *) path;
2616 		syscallarg(u_long) flags;
2617 	} */
2618 	struct vnode *vp;
2619 	int error;
2620 
2621 	error = namei_simple_user(SCARG(uap, path),
2622 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
2623 	if (error != 0)
2624 		return (error);
2625 	error = change_flags(vp, SCARG(uap, flags), l);
2626 	vput(vp);
2627 	return (error);
2628 }
2629 
2630 /*
2631  * Common routine to change flags of a file.
2632  */
2633 int
2634 change_flags(struct vnode *vp, u_long flags, struct lwp *l)
2635 {
2636 	struct vattr vattr;
2637 	int error;
2638 
2639 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2640 	/*
2641 	 * Non-superusers cannot change the flags on devices, even if they
2642 	 * own them.
2643 	 */
2644 	if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, NULL)) {
2645 		if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
2646 			goto out;
2647 		if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
2648 			error = EINVAL;
2649 			goto out;
2650 		}
2651 	}
2652 	vattr_null(&vattr);
2653 	vattr.va_flags = flags;
2654 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2655 out:
2656 	return (error);
2657 }
2658 
2659 /*
2660  * Change mode of a file given path name; this version follows links.
2661  */
2662 /* ARGSUSED */
2663 int
2664 sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
2665 {
2666 	/* {
2667 		syscallarg(const char *) path;
2668 		syscallarg(int) mode;
2669 	} */
2670 	int error;
2671 	struct vnode *vp;
2672 
2673 	error = namei_simple_user(SCARG(uap, path),
2674 				NSM_FOLLOW_TRYEMULROOT, &vp);
2675 	if (error != 0)
2676 		return (error);
2677 
2678 	error = change_mode(vp, SCARG(uap, mode), l);
2679 
2680 	vrele(vp);
2681 	return (error);
2682 }
2683 
2684 /*
2685  * Change mode of a file given a file descriptor.
2686  */
2687 /* ARGSUSED */
2688 int
2689 sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval)
2690 {
2691 	/* {
2692 		syscallarg(int) fd;
2693 		syscallarg(int) mode;
2694 	} */
2695 	file_t *fp;
2696 	int error;
2697 
2698 	/* fd_getvnode() will use the descriptor for us */
2699 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2700 		return (error);
2701 	error = change_mode(fp->f_data, SCARG(uap, mode), l);
2702 	fd_putfile(SCARG(uap, fd));
2703 	return (error);
2704 }
2705 
2706 int
2707 sys_fchmodat(struct lwp *l, const struct sys_fchmodat_args *uap,
2708     register_t *retval)
2709 {
2710 	/* {
2711 		syscallarg(int) fd;
2712 		syscallarg(const char *) path;
2713 		syscallarg(int) mode;
2714 		syscallarg(int) flag;
2715 	} */
2716 
2717 	return ENOSYS;
2718 }
2719 
2720 /*
2721  * Change mode of a file given path name; this version does not follow links.
2722  */
2723 /* ARGSUSED */
2724 int
2725 sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval)
2726 {
2727 	/* {
2728 		syscallarg(const char *) path;
2729 		syscallarg(int) mode;
2730 	} */
2731 	int error;
2732 	struct vnode *vp;
2733 
2734 	error = namei_simple_user(SCARG(uap, path),
2735 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
2736 	if (error != 0)
2737 		return (error);
2738 
2739 	error = change_mode(vp, SCARG(uap, mode), l);
2740 
2741 	vrele(vp);
2742 	return (error);
2743 }
2744 
2745 /*
2746  * Common routine to set mode given a vnode.
2747  */
2748 static int
2749 change_mode(struct vnode *vp, int mode, struct lwp *l)
2750 {
2751 	struct vattr vattr;
2752 	int error;
2753 
2754 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2755 	vattr_null(&vattr);
2756 	vattr.va_mode = mode & ALLPERMS;
2757 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2758 	VOP_UNLOCK(vp);
2759 	return (error);
2760 }
2761 
2762 /*
2763  * Set ownership given a path name; this version follows links.
2764  */
2765 /* ARGSUSED */
2766 int
2767 sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
2768 {
2769 	/* {
2770 		syscallarg(const char *) path;
2771 		syscallarg(uid_t) uid;
2772 		syscallarg(gid_t) gid;
2773 	} */
2774 	int error;
2775 	struct vnode *vp;
2776 
2777 	error = namei_simple_user(SCARG(uap, path),
2778 				NSM_FOLLOW_TRYEMULROOT, &vp);
2779 	if (error != 0)
2780 		return (error);
2781 
2782 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
2783 
2784 	vrele(vp);
2785 	return (error);
2786 }
2787 
2788 /*
2789  * Set ownership given a path name; this version follows links.
2790  * Provides POSIX semantics.
2791  */
2792 /* ARGSUSED */
2793 int
2794 sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval)
2795 {
2796 	/* {
2797 		syscallarg(const char *) path;
2798 		syscallarg(uid_t) uid;
2799 		syscallarg(gid_t) gid;
2800 	} */
2801 	int error;
2802 	struct vnode *vp;
2803 
2804 	error = namei_simple_user(SCARG(uap, path),
2805 				NSM_FOLLOW_TRYEMULROOT, &vp);
2806 	if (error != 0)
2807 		return (error);
2808 
2809 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
2810 
2811 	vrele(vp);
2812 	return (error);
2813 }
2814 
2815 /*
2816  * Set ownership given a file descriptor.
2817  */
2818 /* ARGSUSED */
2819 int
2820 sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval)
2821 {
2822 	/* {
2823 		syscallarg(int) fd;
2824 		syscallarg(uid_t) uid;
2825 		syscallarg(gid_t) gid;
2826 	} */
2827 	int error;
2828 	file_t *fp;
2829 
2830 	/* fd_getvnode() will use the descriptor for us */
2831 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2832 		return (error);
2833 	error = change_owner(fp->f_data, SCARG(uap, uid), SCARG(uap, gid),
2834 	    l, 0);
2835 	fd_putfile(SCARG(uap, fd));
2836 	return (error);
2837 }
2838 
2839 int
2840 sys_fchownat(struct lwp *l, const struct sys_fchownat_args *uap,
2841     register_t *retval)
2842 {
2843 	/* {
2844 		syscallarg(int) fd;
2845 		syscallarg(const char *) path;
2846 		syscallarg(uid_t) uid;
2847 		syscallarg(gid_t) gid;
2848 		syscallarg(int) flag;
2849 	} */
2850 
2851 	return ENOSYS;
2852 }
2853 
2854 /*
2855  * Set ownership given a file descriptor, providing POSIX/XPG semantics.
2856  */
2857 /* ARGSUSED */
2858 int
2859 sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval)
2860 {
2861 	/* {
2862 		syscallarg(int) fd;
2863 		syscallarg(uid_t) uid;
2864 		syscallarg(gid_t) gid;
2865 	} */
2866 	int error;
2867 	file_t *fp;
2868 
2869 	/* fd_getvnode() will use the descriptor for us */
2870 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
2871 		return (error);
2872 	error = change_owner(fp->f_data, SCARG(uap, uid), SCARG(uap, gid),
2873 	    l, 1);
2874 	fd_putfile(SCARG(uap, fd));
2875 	return (error);
2876 }
2877 
2878 /*
2879  * Set ownership given a path name; this version does not follow links.
2880  */
2881 /* ARGSUSED */
2882 int
2883 sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval)
2884 {
2885 	/* {
2886 		syscallarg(const char *) path;
2887 		syscallarg(uid_t) uid;
2888 		syscallarg(gid_t) gid;
2889 	} */
2890 	int error;
2891 	struct vnode *vp;
2892 
2893 	error = namei_simple_user(SCARG(uap, path),
2894 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
2895 	if (error != 0)
2896 		return (error);
2897 
2898 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
2899 
2900 	vrele(vp);
2901 	return (error);
2902 }
2903 
2904 /*
2905  * Set ownership given a path name; this version does not follow links.
2906  * Provides POSIX/XPG semantics.
2907  */
2908 /* ARGSUSED */
2909 int
2910 sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval)
2911 {
2912 	/* {
2913 		syscallarg(const char *) path;
2914 		syscallarg(uid_t) uid;
2915 		syscallarg(gid_t) gid;
2916 	} */
2917 	int error;
2918 	struct vnode *vp;
2919 
2920 	error = namei_simple_user(SCARG(uap, path),
2921 				NSM_NOFOLLOW_TRYEMULROOT, &vp);
2922 	if (error != 0)
2923 		return (error);
2924 
2925 	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
2926 
2927 	vrele(vp);
2928 	return (error);
2929 }
2930 
2931 /*
2932  * Common routine to set ownership given a vnode.
2933  */
2934 static int
2935 change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
2936     int posix_semantics)
2937 {
2938 	struct vattr vattr;
2939 	mode_t newmode;
2940 	int error;
2941 
2942 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2943 	if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
2944 		goto out;
2945 
2946 #define CHANGED(x) ((int)(x) != -1)
2947 	newmode = vattr.va_mode;
2948 	if (posix_semantics) {
2949 		/*
2950 		 * POSIX/XPG semantics: if the caller is not the super-user,
2951 		 * clear set-user-id and set-group-id bits.  Both POSIX and
2952 		 * the XPG consider the behaviour for calls by the super-user
2953 		 * implementation-defined; we leave the set-user-id and set-
2954 		 * group-id settings intact in that case.
2955 		 */
2956 		if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
2957 				      NULL) != 0)
2958 			newmode &= ~(S_ISUID | S_ISGID);
2959 	} else {
2960 		/*
2961 		 * NetBSD semantics: when changing owner and/or group,
2962 		 * clear the respective bit(s).
2963 		 */
2964 		if (CHANGED(uid))
2965 			newmode &= ~S_ISUID;
2966 		if (CHANGED(gid))
2967 			newmode &= ~S_ISGID;
2968 	}
2969 	/* Update va_mode iff altered. */
2970 	if (vattr.va_mode == newmode)
2971 		newmode = VNOVAL;
2972 
2973 	vattr_null(&vattr);
2974 	vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
2975 	vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
2976 	vattr.va_mode = newmode;
2977 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
2978 #undef CHANGED
2979 
2980 out:
2981 	VOP_UNLOCK(vp);
2982 	return (error);
2983 }
2984 
2985 /*
2986  * Set the access and modification times given a path name; this
2987  * version follows links.
2988  */
2989 /* ARGSUSED */
2990 int
2991 sys___utimes50(struct lwp *l, const struct sys___utimes50_args *uap,
2992     register_t *retval)
2993 {
2994 	/* {
2995 		syscallarg(const char *) path;
2996 		syscallarg(const struct timeval *) tptr;
2997 	} */
2998 
2999 	return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
3000 	    SCARG(uap, tptr), UIO_USERSPACE);
3001 }
3002 
3003 /*
3004  * Set the access and modification times given a file descriptor.
3005  */
3006 /* ARGSUSED */
3007 int
3008 sys___futimes50(struct lwp *l, const struct sys___futimes50_args *uap,
3009     register_t *retval)
3010 {
3011 	/* {
3012 		syscallarg(int) fd;
3013 		syscallarg(const struct timeval *) tptr;
3014 	} */
3015 	int error;
3016 	file_t *fp;
3017 
3018 	/* fd_getvnode() will use the descriptor for us */
3019 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3020 		return (error);
3021 	error = do_sys_utimes(l, fp->f_data, NULL, 0, SCARG(uap, tptr),
3022 	    UIO_USERSPACE);
3023 	fd_putfile(SCARG(uap, fd));
3024 	return (error);
3025 }
3026 
3027 int
3028 sys_futimens(struct lwp *l, const struct sys_futimens_args *uap,
3029     register_t *retval)
3030 {
3031 	/* {
3032 		syscallarg(int) fd;
3033 		syscallarg(const struct timespec *) tptr;
3034 	} */
3035 	int error;
3036 	file_t *fp;
3037 
3038 	/* fd_getvnode() will use the descriptor for us */
3039 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3040 		return (error);
3041 	error = do_sys_utimens(l, fp->f_data, NULL, 0, SCARG(uap, tptr),
3042 	    UIO_USERSPACE);
3043 	fd_putfile(SCARG(uap, fd));
3044 	return (error);
3045 }
3046 
3047 /*
3048  * Set the access and modification times given a path name; this
3049  * version does not follow links.
3050  */
3051 int
3052 sys___lutimes50(struct lwp *l, const struct sys___lutimes50_args *uap,
3053     register_t *retval)
3054 {
3055 	/* {
3056 		syscallarg(const char *) path;
3057 		syscallarg(const struct timeval *) tptr;
3058 	} */
3059 
3060 	return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
3061 	    SCARG(uap, tptr), UIO_USERSPACE);
3062 }
3063 
3064 int
3065 sys_utimensat(struct lwp *l, const struct sys_utimensat_args *uap,
3066     register_t *retval)
3067 {
3068 	/* {
3069 		syscallarg(int) fd;
3070 		syscallarg(const char *) path;
3071 		syscallarg(const struct timespec *) tptr;
3072 		syscallarg(int) flag;
3073 	} */
3074 	int follow;
3075 	const struct timespec *tptr;
3076 
3077 	/*
3078 	 * Specified fd is not yet implemented
3079 	 */
3080 	if (SCARG(uap, fd) != AT_FDCWD)
3081 		return ENOSYS;
3082 
3083 	tptr = SCARG(uap, tptr);
3084 	follow = (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3085 
3086 	return do_sys_utimens(l, NULL, SCARG(uap, path), follow,
3087 	    tptr, UIO_USERSPACE);
3088 }
3089 
3090 /*
3091  * Common routine to set access and modification times given a vnode.
3092  */
3093 int
3094 do_sys_utimens(struct lwp *l, struct vnode *vp, const char *path, int flag,
3095     const struct timespec *tptr, enum uio_seg seg)
3096 {
3097 	struct vattr vattr;
3098 	int error, dorele = 0;
3099 	namei_simple_flags_t sflags;
3100 
3101 	bool vanull, setbirthtime;
3102 	struct timespec ts[2];
3103 
3104 	/*
3105 	 * I have checked all callers and they pass either FOLLOW,
3106 	 * NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW
3107 	 * is 0. More to the point, they don't pass anything else.
3108 	 * Let's keep it that way at least until the namei interfaces
3109 	 * are fully sanitized.
3110 	 */
3111 	KASSERT(flag == NOFOLLOW || flag == FOLLOW);
3112 	sflags = (flag == FOLLOW) ?
3113 		NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT;
3114 
3115 	if (tptr == NULL) {
3116 		vanull = true;
3117 		nanotime(&ts[0]);
3118 		ts[1] = ts[0];
3119 	} else {
3120 		vanull = false;
3121 		if (seg != UIO_SYSSPACE) {
3122 			error = copyin(tptr, ts, sizeof (ts));
3123 			if (error != 0)
3124 				return error;
3125 		} else {
3126 			ts[0] = tptr[0];
3127 			ts[1] = tptr[1];
3128 		}
3129 	}
3130 
3131 	if (ts[0].tv_nsec == UTIME_NOW) {
3132 		nanotime(&ts[0]);
3133 		if (ts[1].tv_nsec == UTIME_NOW) {
3134 			vanull = true;
3135 			ts[1] = ts[0];
3136 		}
3137 	} else if (ts[1].tv_nsec == UTIME_NOW)
3138 		nanotime(&ts[1]);
3139 
3140 	if (vp == NULL) {
3141 		/* note: SEG describes TPTR, not PATH; PATH is always user */
3142 		error = namei_simple_user(path, sflags, &vp);
3143 		if (error != 0)
3144 			return error;
3145 		dorele = 1;
3146 	}
3147 
3148 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3149 	setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 &&
3150 	    timespeccmp(&ts[1], &vattr.va_birthtime, <));
3151 	vattr_null(&vattr);
3152 
3153 	if (ts[0].tv_nsec != UTIME_OMIT)
3154 		vattr.va_atime = ts[0];
3155 
3156 	if (ts[1].tv_nsec != UTIME_OMIT) {
3157 		vattr.va_mtime = ts[1];
3158 		if (setbirthtime)
3159 			vattr.va_birthtime = ts[1];
3160 	}
3161 
3162 	if (vanull)
3163 		vattr.va_vaflags |= VA_UTIMES_NULL;
3164 	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3165 	VOP_UNLOCK(vp);
3166 
3167 	if (dorele != 0)
3168 		vrele(vp);
3169 
3170 	return error;
3171 }
3172 
3173 int
3174 do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
3175     const struct timeval *tptr, enum uio_seg seg)
3176 {
3177 	struct timespec ts[2];
3178 	struct timespec *tsptr = NULL;
3179 	int error;
3180 
3181 	if (tptr != NULL) {
3182 		struct timeval tv[2];
3183 
3184 		if (seg != UIO_SYSSPACE) {
3185 			error = copyin(tptr, tv, sizeof (tv));
3186 			if (error != 0)
3187 				return error;
3188 			tptr = tv;
3189 		}
3190 
3191 		if ((tv[0].tv_usec == UTIME_NOW) ||
3192 		    (tv[0].tv_usec == UTIME_OMIT))
3193 			ts[0].tv_nsec = tv[0].tv_usec;
3194 		else
3195 			TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]);
3196 
3197 		if ((tv[1].tv_usec == UTIME_NOW) ||
3198 		    (tv[1].tv_usec == UTIME_OMIT))
3199 			ts[1].tv_nsec = tv[1].tv_usec;
3200 		else
3201 			TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]);
3202 
3203 		tsptr = &ts[0];
3204 	}
3205 
3206 	return do_sys_utimens(l, vp, path, flag, tsptr, UIO_SYSSPACE);
3207 }
3208 
3209 /*
3210  * Truncate a file given its path name.
3211  */
3212 /* ARGSUSED */
3213 int
3214 sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval)
3215 {
3216 	/* {
3217 		syscallarg(const char *) path;
3218 		syscallarg(int) pad;
3219 		syscallarg(off_t) length;
3220 	} */
3221 	struct vnode *vp;
3222 	struct vattr vattr;
3223 	int error;
3224 
3225 	error = namei_simple_user(SCARG(uap, path),
3226 				NSM_FOLLOW_TRYEMULROOT, &vp);
3227 	if (error != 0)
3228 		return (error);
3229 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3230 	if (vp->v_type == VDIR)
3231 		error = EISDIR;
3232 	else if ((error = vn_writechk(vp)) == 0 &&
3233 	    (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
3234 		vattr_null(&vattr);
3235 		vattr.va_size = SCARG(uap, length);
3236 		error = VOP_SETATTR(vp, &vattr, l->l_cred);
3237 	}
3238 	vput(vp);
3239 	return (error);
3240 }
3241 
3242 /*
3243  * Truncate a file given a file descriptor.
3244  */
3245 /* ARGSUSED */
3246 int
3247 sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval)
3248 {
3249 	/* {
3250 		syscallarg(int) fd;
3251 		syscallarg(int) pad;
3252 		syscallarg(off_t) length;
3253 	} */
3254 	struct vattr vattr;
3255 	struct vnode *vp;
3256 	file_t *fp;
3257 	int error;
3258 
3259 	/* fd_getvnode() will use the descriptor for us */
3260 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3261 		return (error);
3262 	if ((fp->f_flag & FWRITE) == 0) {
3263 		error = EINVAL;
3264 		goto out;
3265 	}
3266 	vp = fp->f_data;
3267 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3268 	if (vp->v_type == VDIR)
3269 		error = EISDIR;
3270 	else if ((error = vn_writechk(vp)) == 0) {
3271 		vattr_null(&vattr);
3272 		vattr.va_size = SCARG(uap, length);
3273 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
3274 	}
3275 	VOP_UNLOCK(vp);
3276  out:
3277 	fd_putfile(SCARG(uap, fd));
3278 	return (error);
3279 }
3280 
3281 /*
3282  * Sync an open file.
3283  */
3284 /* ARGSUSED */
3285 int
3286 sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
3287 {
3288 	/* {
3289 		syscallarg(int) fd;
3290 	} */
3291 	struct vnode *vp;
3292 	file_t *fp;
3293 	int error;
3294 
3295 	/* fd_getvnode() will use the descriptor for us */
3296 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3297 		return (error);
3298 	vp = fp->f_data;
3299 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3300 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
3301 	VOP_UNLOCK(vp);
3302 	fd_putfile(SCARG(uap, fd));
3303 	return (error);
3304 }
3305 
3306 /*
3307  * Sync a range of file data.  API modeled after that found in AIX.
3308  *
3309  * FDATASYNC indicates that we need only save enough metadata to be able
3310  * to re-read the written data.  Note we duplicate AIX's requirement that
3311  * the file be open for writing.
3312  */
3313 /* ARGSUSED */
3314 int
3315 sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval)
3316 {
3317 	/* {
3318 		syscallarg(int) fd;
3319 		syscallarg(int) flags;
3320 		syscallarg(off_t) start;
3321 		syscallarg(off_t) length;
3322 	} */
3323 	struct vnode *vp;
3324 	file_t *fp;
3325 	int flags, nflags;
3326 	off_t s, e, len;
3327 	int error;
3328 
3329 	/* fd_getvnode() will use the descriptor for us */
3330 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3331 		return (error);
3332 
3333 	if ((fp->f_flag & FWRITE) == 0) {
3334 		error = EBADF;
3335 		goto out;
3336 	}
3337 
3338 	flags = SCARG(uap, flags);
3339 	if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
3340 	    ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
3341 		error = EINVAL;
3342 		goto out;
3343 	}
3344 	/* Now set up the flags for value(s) to pass to VOP_FSYNC() */
3345 	if (flags & FDATASYNC)
3346 		nflags = FSYNC_DATAONLY | FSYNC_WAIT;
3347 	else
3348 		nflags = FSYNC_WAIT;
3349 	if (flags & FDISKSYNC)
3350 		nflags |= FSYNC_CACHE;
3351 
3352 	len = SCARG(uap, length);
3353 	/* If length == 0, we do the whole file, and s = e = 0 will do that */
3354 	if (len) {
3355 		s = SCARG(uap, start);
3356 		e = s + len;
3357 		if (e < s) {
3358 			error = EINVAL;
3359 			goto out;
3360 		}
3361 	} else {
3362 		e = 0;
3363 		s = 0;
3364 	}
3365 
3366 	vp = fp->f_data;
3367 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3368 	error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
3369 	VOP_UNLOCK(vp);
3370 out:
3371 	fd_putfile(SCARG(uap, fd));
3372 	return (error);
3373 }
3374 
3375 /*
3376  * Sync the data of an open file.
3377  */
3378 /* ARGSUSED */
3379 int
3380 sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval)
3381 {
3382 	/* {
3383 		syscallarg(int) fd;
3384 	} */
3385 	struct vnode *vp;
3386 	file_t *fp;
3387 	int error;
3388 
3389 	/* fd_getvnode() will use the descriptor for us */
3390 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3391 		return (error);
3392 	if ((fp->f_flag & FWRITE) == 0) {
3393 		fd_putfile(SCARG(uap, fd));
3394 		return (EBADF);
3395 	}
3396 	vp = fp->f_data;
3397 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3398 	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
3399 	VOP_UNLOCK(vp);
3400 	fd_putfile(SCARG(uap, fd));
3401 	return (error);
3402 }
3403 
3404 /*
3405  * Rename files, (standard) BSD semantics frontend.
3406  */
3407 /* ARGSUSED */
3408 int
3409 sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval)
3410 {
3411 	/* {
3412 		syscallarg(const char *) from;
3413 		syscallarg(const char *) to;
3414 	} */
3415 
3416 	return (do_sys_rename(SCARG(uap, from), SCARG(uap, to), UIO_USERSPACE, 0));
3417 }
3418 
3419 int
3420 sys_renameat(struct lwp *l, const struct sys_renameat_args *uap,
3421     register_t *retval)
3422 {
3423 	/* {
3424 		syscallarg(int) fromfd;
3425 		syscallarg(const char *) from;
3426 		syscallarg(int) tofd;
3427 		syscallarg(const char *) to;
3428 	} */
3429 
3430 	return ENOSYS;
3431 }
3432 
3433 /*
3434  * Rename files, POSIX semantics frontend.
3435  */
3436 /* ARGSUSED */
3437 int
3438 sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval)
3439 {
3440 	/* {
3441 		syscallarg(const char *) from;
3442 		syscallarg(const char *) to;
3443 	} */
3444 
3445 	return (do_sys_rename(SCARG(uap, from), SCARG(uap, to), UIO_USERSPACE, 1));
3446 }
3447 
/*
 * Rename files.  Source and destination must either both be directories,
 * or both not be directories.  If target is a directory, it must be empty.
 * If `from' and `to' refer to the same object, the value of the `retain'
 * argument is used to determine whether `from' will be
 *
 * (retain == 0)	deleted unless `from' and `to' refer to the same
 *			object in the file system's name space (BSD).
 * (retain == 1)	always retained (POSIX).
 *
 * `seg' is handed to pathbuf_maybe_copyin() for both `from' and `to'.
 *
 * Returns 0 on success or an errno value.  Internally, error == -1
 * marks the "same object, nothing to do" case: it takes the
 * abort/cleanup branch at `out' like a real error and is turned into 0
 * by the final return.
 */
int
do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
{
	struct vnode *tvp, *fvp, *tdvp;
	struct pathbuf *frompb, *topb;
	struct nameidata fromnd, tond;
	struct mount *fs;
	int error;

	/* Get both path buffers; frompb must be freed if the second fails. */
	error = pathbuf_maybe_copyin(from, seg, &frompb);
	if (error) {
		return error;
	}
	error = pathbuf_maybe_copyin(to, seg, &topb);
	if (error) {
		pathbuf_destroy(frompb);
		return error;
	}

	/* First lookup of the source. */
	NDINIT(&fromnd, DELETE, LOCKPARENT | TRYEMULROOT | INRENAME,
	    frompb);
	if ((error = namei(&fromnd)) != 0) {
		pathbuf_destroy(frompb);
		pathbuf_destroy(topb);
		return (error);
	}
	/* Unlock the parent unless parent and child are the same vnode. */
	if (fromnd.ni_dvp != fromnd.ni_vp)
		VOP_UNLOCK(fromnd.ni_dvp);
	fvp = fromnd.ni_vp;

	/* Take the rename lock of the mount the source lives on. */
	fs = fvp->v_mount;
	error = VFS_RENAMELOCK_ENTER(fs);
	if (error) {
		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		goto out1;
	}

	/*
	 * close, partially, yet another race - ideally we should only
	 * go as far as getting fromnd.ni_dvp before getting the per-fs
	 * lock, and then continue to get fromnd.ni_vp, but we can't do
	 * that with namei as it stands.
	 *
	 * This still won't prevent rmdir from nuking fromnd.ni_vp
	 * under us. The real fix is to get the locks in the right
	 * order and do the lookups in the right places, but that's a
	 * major rototill.
	 *
	 * Note: this logic (as well as this whole function) is cloned
	 * in nfs_serv.c. Proceed accordingly.
	 */
	vrele(fvp);
	/* A last component of "." or ".." is refused with EINVAL. */
	if ((fromnd.ni_cnd.cn_namelen == 1 &&
	     fromnd.ni_cnd.cn_nameptr[0] == '.') ||
	    (fromnd.ni_cnd.cn_namelen == 2 &&
	     fromnd.ni_cnd.cn_nameptr[0] == '.' &&
	     fromnd.ni_cnd.cn_nameptr[1] == '.')) {
		error = EINVAL;
		VFS_RENAMELOCK_EXIT(fs);
		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
		vrele(fromnd.ni_dvp);
		goto out1;
	}
	/* Look the source up again, now that the rename lock is held. */
	vn_lock(fromnd.ni_dvp, LK_EXCLUSIVE | LK_RETRY);
	error = relookup(fromnd.ni_dvp, &fromnd.ni_vp, &fromnd.ni_cnd, 0);
	if (error) {
		VOP_UNLOCK(fromnd.ni_dvp);
		VFS_RENAMELOCK_EXIT(fs);
		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
		vrele(fromnd.ni_dvp);
		goto out1;
	}
	/* Unlock the source vnode, and its parent if that is distinct. */
	VOP_UNLOCK(fromnd.ni_vp);
	if (fromnd.ni_dvp != fromnd.ni_vp)
		VOP_UNLOCK(fromnd.ni_dvp);
	fvp = fromnd.ni_vp;

	/* Look up the target; CREATEDIR is added when moving a directory. */
	NDINIT(&tond, RENAME,
	    LOCKPARENT | LOCKLEAF | NOCACHE | TRYEMULROOT
	      | INRENAME | (fvp->v_type == VDIR ? CREATEDIR : 0),
	    topb);
	if ((error = namei(&tond)) != 0) {
		VFS_RENAMELOCK_EXIT(fs);
		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target must match the source's directory-ness. */
	if (tvp != NULL) {
		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
			error = ENOTDIR;
			goto out;
		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
			error = EISDIR;
			goto out;
		}
	}

	/* The source may not be the directory that holds the target. */
	if (fvp == tdvp)
		error = EINVAL;

	/*
	 * Source and destination refer to the same object.
	 *
	 * error = -1 requests a silent no-op: always when `retain' is
	 * set, otherwise only when both names are the same entry (same
	 * parent directory, same name).
	 */
	if (fvp == tvp) {
		if (retain)
			error = -1;
		else if (fromnd.ni_dvp == tdvp &&
		    fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen &&
		    !memcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr,
		          fromnd.ni_cnd.cn_namelen))
			error = -1;
	}
	/*
	 * Prevent cross-mount operation.
	 */
	if (error == 0) {
		if (tond.ni_dvp->v_mount != fromnd.ni_dvp->v_mount) {
			error = EXDEV;
		}
	}
#if NVERIEXEC > 0
	if (!error) {
		char *f1, *f2;
		size_t f1_len;
		size_t f2_len;

		/*
		 * Pass NUL-terminated copies of both last components to
		 * veriexec_renamechk(); its result becomes the error.
		 */
		f1_len = fromnd.ni_cnd.cn_namelen + 1;
		f1 = kmem_alloc(f1_len, KM_SLEEP);
		strlcpy(f1, fromnd.ni_cnd.cn_nameptr, f1_len);

		f2_len = tond.ni_cnd.cn_namelen + 1;
		f2 = kmem_alloc(f2_len, KM_SLEEP);
		strlcpy(f2, tond.ni_cnd.cn_nameptr, f2_len);

		error = veriexec_renamechk(curlwp, fvp, f1, tvp, f2);

		kmem_free(f1, f1_len);
		kmem_free(f2, f2_len);
	}
#endif /* NVERIEXEC > 0 */

out:
	if (!error) {
		/*
		 * No vnode is released here after VOP_RENAME() --
		 * NOTE(review): presumably VOP_RENAME() disposes of all
		 * four vnodes itself; confirm against vnodeops(9).
		 */
		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
				   tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
		VFS_RENAMELOCK_EXIT(fs);
	} else {
		/* Error or no-op: abort both operations and let go. */
		VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd);
		if (tdvp == tvp)
			vrele(tdvp);
		else
			vput(tdvp);
		if (tvp)
			vput(tvp);
		VFS_RENAMELOCK_EXIT(fs);
		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
	}
out1:
	pathbuf_destroy(frompb);
	pathbuf_destroy(topb);
	/* -1 is the internal "nothing to do" marker, not a failure. */
	return (error == -1 ? 0 : error);
}
3628 
3629 /*
3630  * Make a directory file.
3631  */
3632 /* ARGSUSED */
3633 int
3634 sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
3635 {
3636 	/* {
3637 		syscallarg(const char *) path;
3638 		syscallarg(int) mode;
3639 	} */
3640 
3641 	return do_sys_mkdir(SCARG(uap, path), SCARG(uap, mode), UIO_USERSPACE);
3642 }
3643 
3644 int
3645 sys_mkdirat(struct lwp *l, const struct sys_mkdirat_args *uap,
3646     register_t *retval)
3647 {
3648 	/* {
3649 		syscallarg(int) fd;
3650 		syscallarg(const char *) path;
3651 		syscallarg(int) mode;
3652 	} */
3653 
3654 	return ENOSYS;
3655 }
3656 
3657 
3658 int
3659 do_sys_mkdir(const char *path, mode_t mode, enum uio_seg seg)
3660 {
3661 	struct proc *p = curlwp->l_proc;
3662 	struct vnode *vp;
3663 	struct vattr vattr;
3664 	int error;
3665 	struct pathbuf *pb;
3666 	struct nameidata nd;
3667 
3668 	/* XXX bollocks, should pass in a pathbuf */
3669 	error = pathbuf_maybe_copyin(path, seg, &pb);
3670 	if (error) {
3671 		return error;
3672 	}
3673 
3674 	NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT, pb);
3675 	if ((error = namei(&nd)) != 0) {
3676 		pathbuf_destroy(pb);
3677 		return (error);
3678 	}
3679 	vp = nd.ni_vp;
3680 	if (vp != NULL) {
3681 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
3682 		if (nd.ni_dvp == vp)
3683 			vrele(nd.ni_dvp);
3684 		else
3685 			vput(nd.ni_dvp);
3686 		vrele(vp);
3687 		pathbuf_destroy(pb);
3688 		return (EEXIST);
3689 	}
3690 	vattr_null(&vattr);
3691 	vattr.va_type = VDIR;
3692 	/* We will read cwdi->cwdi_cmask unlocked. */
3693 	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
3694 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3695 	if (!error)
3696 		vput(nd.ni_vp);
3697 	pathbuf_destroy(pb);
3698 	return (error);
3699 }
3700 
3701 /*
3702  * Remove a directory file.
3703  */
3704 /* ARGSUSED */
3705 int
3706 sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
3707 {
3708 	/* {
3709 		syscallarg(const char *) path;
3710 	} */
3711 	struct vnode *vp;
3712 	int error;
3713 	struct pathbuf *pb;
3714 	struct nameidata nd;
3715 
3716 	error = pathbuf_copyin(SCARG(uap, path), &pb);
3717 	if (error) {
3718 		return error;
3719 	}
3720 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, pb);
3721 	if ((error = namei(&nd)) != 0) {
3722 		pathbuf_destroy(pb);
3723 		return error;
3724 	}
3725 	vp = nd.ni_vp;
3726 	if (vp->v_type != VDIR) {
3727 		error = ENOTDIR;
3728 		goto out;
3729 	}
3730 	/*
3731 	 * No rmdir "." please.
3732 	 */
3733 	if (nd.ni_dvp == vp) {
3734 		error = EINVAL;
3735 		goto out;
3736 	}
3737 	/*
3738 	 * The root of a mounted filesystem cannot be deleted.
3739 	 */
3740 	if ((vp->v_vflag & VV_ROOT) != 0 || vp->v_mountedhere != NULL) {
3741 		error = EBUSY;
3742 		goto out;
3743 	}
3744 	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3745 	pathbuf_destroy(pb);
3746 	return (error);
3747 
3748 out:
3749 	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
3750 	if (nd.ni_dvp == vp)
3751 		vrele(nd.ni_dvp);
3752 	else
3753 		vput(nd.ni_dvp);
3754 	vput(vp);
3755 	pathbuf_destroy(pb);
3756 	return (error);
3757 }
3758 
3759 /*
3760  * Read a block of directory entries in a file system independent format.
3761  */
3762 int
3763 sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval)
3764 {
3765 	/* {
3766 		syscallarg(int) fd;
3767 		syscallarg(char *) buf;
3768 		syscallarg(size_t) count;
3769 	} */
3770 	file_t *fp;
3771 	int error, done;
3772 
3773 	/* fd_getvnode() will use the descriptor for us */
3774 	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3775 		return (error);
3776 	if ((fp->f_flag & FREAD) == 0) {
3777 		error = EBADF;
3778 		goto out;
3779 	}
3780 	error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
3781 			SCARG(uap, count), &done, l, 0, 0);
3782 	ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
3783 	*retval = done;
3784  out:
3785 	fd_putfile(SCARG(uap, fd));
3786 	return (error);
3787 }
3788 
3789 /*
3790  * Set the mode mask for creation of filesystem nodes.
3791  */
3792 int
3793 sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
3794 {
3795 	/* {
3796 		syscallarg(mode_t) newmask;
3797 	} */
3798 	struct proc *p = l->l_proc;
3799 	struct cwdinfo *cwdi;
3800 
3801 	/*
3802 	 * cwdi->cwdi_cmask will be read unlocked elsewhere.  What's
3803 	 * important is that we serialize changes to the mask.  The
3804 	 * rw_exit() will issue a write memory barrier on our behalf,
3805 	 * and force the changes out to other CPUs (as it must use an
3806 	 * atomic operation, draining the local CPU's store buffers).
3807 	 */
3808 	cwdi = p->p_cwdi;
3809 	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
3810 	*retval = cwdi->cwdi_cmask;
3811 	cwdi->cwdi_cmask = SCARG(uap, newmask) & ALLPERMS;
3812 	rw_exit(&cwdi->cwdi_lock);
3813 
3814 	return (0);
3815 }
3816 
3817 int
3818 dorevoke(struct vnode *vp, kauth_cred_t cred)
3819 {
3820 	struct vattr vattr;
3821 	int error;
3822 
3823 	vn_lock(vp, LK_SHARED | LK_RETRY);
3824 	error = VOP_GETATTR(vp, &vattr, cred);
3825 	VOP_UNLOCK(vp);
3826 	if (error != 0)
3827 		return error;
3828 	if (kauth_cred_geteuid(cred) == vattr.va_uid ||
3829 	    (error = kauth_authorize_generic(cred,
3830 	    KAUTH_GENERIC_ISSUSER, NULL)) == 0)
3831 		VOP_REVOKE(vp, REVOKEALL);
3832 	return (error);
3833 }
3834 
3835 /*
3836  * Void all references to file by ripping underlying filesystem
3837  * away from vnode.
3838  */
3839 /* ARGSUSED */
3840 int
3841 sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval)
3842 {
3843 	/* {
3844 		syscallarg(const char *) path;
3845 	} */
3846 	struct vnode *vp;
3847 	int error;
3848 
3849 	error = namei_simple_user(SCARG(uap, path),
3850 				NSM_FOLLOW_TRYEMULROOT, &vp);
3851 	if (error != 0)
3852 		return (error);
3853 	error = dorevoke(vp, l->l_cred);
3854 	vrele(vp);
3855 	return (error);
3856 }
3857