xref: /dflybsd-src/sys/kern/vfs_syscalls.c (revision 16e9ff28733d8bd9941b9770d79be966ba221f5f)
1 /*
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
39  * $FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.151.2.18 2003/04/04 20:35:58 tegge Exp $
40  */
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/buf.h>
45 #include <sys/conf.h>
46 #include <sys/sysent.h>
47 #include <sys/malloc.h>
48 #include <sys/mount.h>
49 #include <sys/mountctl.h>
50 #include <sys/sysproto.h>
51 #include <sys/filedesc.h>
52 #include <sys/kernel.h>
53 #include <sys/fcntl.h>
54 #include <sys/file.h>
55 #include <sys/linker.h>
56 #include <sys/stat.h>
57 #include <sys/unistd.h>
58 #include <sys/vnode.h>
59 #include <sys/proc.h>
60 #include <sys/priv.h>
61 #include <sys/jail.h>
62 #include <sys/namei.h>
63 #include <sys/nlookup.h>
64 #include <sys/dirent.h>
65 #include <sys/extattr.h>
66 #include <sys/spinlock.h>
67 #include <sys/kern_syscall.h>
68 #include <sys/objcache.h>
69 #include <sys/sysctl.h>
70 
71 #include <sys/buf2.h>
72 #include <sys/file2.h>
73 #include <sys/spinlock2.h>
74 #include <sys/mplock2.h>
75 
76 #include <vm/vm.h>
77 #include <vm/vm_object.h>
78 #include <vm/vm_page.h>
79 
80 #include <machine/limits.h>
81 #include <machine/stdarg.h>
82 
83 #include <vfs/union/union.h>
84 
85 static void mount_warning(struct mount *mp, const char *ctl, ...)
86 		__printflike(2, 3);
87 static int mount_path(struct proc *p, struct mount *mp, char **rb, char **fb);
88 static int checkvp_chdir (struct vnode *vn, struct thread *td);
89 static void checkdirs (struct nchandle *old_nch, struct nchandle *new_nch);
90 static int chroot_refuse_vdir_fds (struct filedesc *fdp);
91 static int chroot_visible_mnt(struct mount *mp, struct proc *p);
92 static int getutimes (const struct timeval *, struct timespec *);
93 static int setfown (struct vnode *, uid_t, gid_t);
94 static int setfmode (struct vnode *, int);
95 static int setfflags (struct vnode *, int);
96 static int setutimes (struct vnode *, struct vattr *,
97 			const struct timespec *, int);
98 static int	usermount = 0;	/* if 1, non-root can mount fs. */
99 
100 int (*union_dircheckp) (struct thread *, struct vnode **, struct file *);
101 
102 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
103     "Allow non-root users to mount filesystems");
104 
105 /*
106  * Virtual File System System Calls
107  */
108 
109 /*
110  * Mount a file system.
111  *
112  * mount_args(char *type, char *path, int flags, caddr_t data)
113  *
114  * MPALMOSTSAFE
115  */
116 int
117 sys_mount(struct mount_args *uap)
118 {
119 	struct thread *td = curthread;
120 	struct vnode *vp;
121 	struct nchandle nch;
122 	struct mount *mp, *nullmp;
123 	struct vfsconf *vfsp;
124 	int error, flag = 0, flag2 = 0;
125 	int hasmount;
126 	struct vattr va;
127 	struct nlookupdata nd;
128 	char fstypename[MFSNAMELEN];
129 	struct ucred *cred;
130 
131 	get_mplock();
132 	cred = td->td_ucred;
133 	if (jailed(cred)) {
134 		error = EPERM;
135 		goto done;
136 	}
137 	if (usermount == 0 && (error = priv_check(td, PRIV_ROOT)))
138 		goto done;
139 
140 	/*
141 	 * Do not allow NFS export by non-root users.
142 	 */
143 	if (uap->flags & MNT_EXPORTED) {
144 		error = priv_check(td, PRIV_ROOT);
145 		if (error)
146 			goto done;
147 	}
148 	/*
149 	 * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users
150 	 */
151 	if (priv_check(td, PRIV_ROOT))
152 		uap->flags |= MNT_NOSUID | MNT_NODEV;
153 
154 	/*
155 	 * Lookup the requested path and extract the nch and vnode.
156 	 */
157 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
158 	if (error == 0) {
159 		if ((error = nlookup(&nd)) == 0) {
160 			if (nd.nl_nch.ncp->nc_vp == NULL)
161 				error = ENOENT;
162 		}
163 	}
164 	if (error) {
165 		nlookup_done(&nd);
166 		goto done;
167 	}
168 
169 	/*
170 	 * If the target filesystem is resolved via a nullfs mount, then
171 	 * nd.nl_nch.mount will be pointing to the nullfs mount structure
172 	 * instead of the target file system. We need it in case we are
173 	 * doing an update.
174 	 */
175 	nullmp = nd.nl_nch.mount;
176 
177 	/*
178 	 * Extract the locked+refd ncp and cleanup the nd structure
179 	 */
180 	nch = nd.nl_nch;
181 	cache_zero(&nd.nl_nch);
182 	nlookup_done(&nd);
183 
184 	if ((nch.ncp->nc_flag & NCF_ISMOUNTPT) && cache_findmount(&nch))
185 		hasmount = 1;
186 	else
187 		hasmount = 0;
188 
189 
190 	/*
191 	 * Now we have the locked ref'd nch and unreferenced vnode.
192 	 */
193 	vp = nch.ncp->nc_vp;
194 	if ((error = vget(vp, LK_EXCLUSIVE)) != 0) {
195 		cache_put(&nch);
196 		goto done;
197 	}
198 	cache_unlock(&nch);
199 
200 	/*
201 	 * Extract the file system type. We need to know this early, to take
202 	 * appropriate actions if we are dealing with a nullfs.
203 	 */
204 	if ((error = copyinstr(uap->type, fstypename, MFSNAMELEN, NULL)) != 0) {
205 		cache_drop(&nch);
206 		vput(vp);
207 		goto done;
208 	}
209 
210 	/*
211 	 * Now we have an unlocked ref'd nch and a locked ref'd vp
212 	 */
213 	if (uap->flags & MNT_UPDATE) {
214 		if ((vp->v_flag & (VROOT|VPFSROOT)) == 0) {
215 			cache_drop(&nch);
216 			vput(vp);
217 			error = EINVAL;
218 			goto done;
219 		}
220 
221 		if (strncmp(fstypename, "null", 5) == 0) {
222 			KKASSERT(nullmp);
223 			mp = nullmp;
224 		} else {
225 			mp = vp->v_mount;
226 		}
227 
228 		flag = mp->mnt_flag;
229 		flag2 = mp->mnt_kern_flag;
230 		/*
231 		 * We only allow the filesystem to be reloaded if it
232 		 * is currently mounted read-only.
233 		 */
234 		if ((uap->flags & MNT_RELOAD) &&
235 		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
236 			cache_drop(&nch);
237 			vput(vp);
238 			error = EOPNOTSUPP;	/* Needs translation */
239 			goto done;
240 		}
241 		/*
242 		 * Only root, or the user that did the original mount is
243 		 * permitted to update it.
244 		 */
245 		if (mp->mnt_stat.f_owner != cred->cr_uid &&
246 		    (error = priv_check(td, PRIV_ROOT))) {
247 			cache_drop(&nch);
248 			vput(vp);
249 			goto done;
250 		}
251 		if (vfs_busy(mp, LK_NOWAIT)) {
252 			cache_drop(&nch);
253 			vput(vp);
254 			error = EBUSY;
255 			goto done;
256 		}
257 		if ((vp->v_flag & VMOUNT) != 0 || hasmount) {
258 			cache_drop(&nch);
259 			vfs_unbusy(mp);
260 			vput(vp);
261 			error = EBUSY;
262 			goto done;
263 		}
264 		vsetflags(vp, VMOUNT);
265 		mp->mnt_flag |=
266 		    uap->flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
267 		vn_unlock(vp);
268 		goto update;
269 	}
270 	/*
271 	 * If the user is not root, ensure that they own the directory
272 	 * onto which we are attempting to mount.
273 	 */
274 	if ((error = VOP_GETATTR(vp, &va)) ||
275 	    (va.va_uid != cred->cr_uid && (error = priv_check(td, PRIV_ROOT)))) {
276 		cache_drop(&nch);
277 		vput(vp);
278 		goto done;
279 	}
280 	if ((error = vinvalbuf(vp, V_SAVE, 0, 0)) != 0) {
281 		cache_drop(&nch);
282 		vput(vp);
283 		goto done;
284 	}
285 	if (vp->v_type != VDIR) {
286 		cache_drop(&nch);
287 		vput(vp);
288 		error = ENOTDIR;
289 		goto done;
290 	}
291 	if (vp->v_mount->mnt_kern_flag & MNTK_NOSTKMNT) {
292 		cache_drop(&nch);
293 		vput(vp);
294 		error = EPERM;
295 		goto done;
296 	}
297 	vfsp = vfsconf_find_by_name(fstypename);
298 	if (vfsp == NULL) {
299 		linker_file_t lf;
300 
301 		/* Only load modules for root (very important!) */
302 		if ((error = priv_check(td, PRIV_ROOT)) != 0) {
303 			cache_drop(&nch);
304 			vput(vp);
305 			goto done;
306 		}
307 		error = linker_load_file(fstypename, &lf);
308 		if (error || lf == NULL) {
309 			cache_drop(&nch);
310 			vput(vp);
311 			if (lf == NULL)
312 				error = ENODEV;
313 			goto done;
314 		}
315 		lf->userrefs++;
316 		/* lookup again, see if the VFS was loaded */
317 		vfsp = vfsconf_find_by_name(fstypename);
318 		if (vfsp == NULL) {
319 			lf->userrefs--;
320 			linker_file_unload(lf);
321 			cache_drop(&nch);
322 			vput(vp);
323 			error = ENODEV;
324 			goto done;
325 		}
326 	}
327 	if ((vp->v_flag & VMOUNT) != 0 || hasmount) {
328 		cache_drop(&nch);
329 		vput(vp);
330 		error = EBUSY;
331 		goto done;
332 	}
333 	vsetflags(vp, VMOUNT);
334 
335 	/*
336 	 * Allocate and initialize the filesystem.
337 	 */
338 	mp = kmalloc(sizeof(struct mount), M_MOUNT, M_ZERO|M_WAITOK);
339 	mount_init(mp);
340 	vfs_busy(mp, LK_NOWAIT);
341 	mp->mnt_op = vfsp->vfc_vfsops;
342 	mp->mnt_vfc = vfsp;
343 	vfsp->vfc_refcount++;
344 	mp->mnt_stat.f_type = vfsp->vfc_typenum;
345 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
346 	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
347 	mp->mnt_stat.f_owner = cred->cr_uid;
348 	vn_unlock(vp);
349 update:
350 	/*
351 	 * Set the mount level flags.
352 	 */
353 	if (uap->flags & MNT_RDONLY)
354 		mp->mnt_flag |= MNT_RDONLY;
355 	else if (mp->mnt_flag & MNT_RDONLY)
356 		mp->mnt_kern_flag |= MNTK_WANTRDWR;
357 	mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
358 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME |
359 	    MNT_NOSYMFOLLOW | MNT_IGNORE |
360 	    MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR);
361 	mp->mnt_flag |= uap->flags & (MNT_NOSUID | MNT_NOEXEC |
362 	    MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE |
363 	    MNT_NOSYMFOLLOW | MNT_IGNORE |
364 	    MNT_NOATIME | MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR);
365 	/*
366 	 * Mount the filesystem.
367 	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
368 	 * get.
369 	 */
370 	error = VFS_MOUNT(mp, uap->path, uap->data, cred);
371 	if (mp->mnt_flag & MNT_UPDATE) {
372 		if (mp->mnt_kern_flag & MNTK_WANTRDWR)
373 			mp->mnt_flag &= ~MNT_RDONLY;
374 		mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
375 		mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
376 		if (error) {
377 			mp->mnt_flag = flag;
378 			mp->mnt_kern_flag = flag2;
379 		}
380 		vfs_unbusy(mp);
381 		vclrflags(vp, VMOUNT);
382 		vrele(vp);
383 		cache_drop(&nch);
384 		goto done;
385 	}
386 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
387 	/*
388 	 * Put the new filesystem on the mount list after root.  The mount
389 	 * point gets its own mnt_ncmountpt (unless the VFS already set one
390 	 * up) which represents the root of the mount.  The lookup code
391 	 * detects the mount point going forward and checks the root of
392 	 * the mount going backwards.
393 	 *
394 	 * It is not necessary to invalidate or purge the vnode underneath
395 	 * because elements under the mount will be given their own glue
396 	 * namecache record.
397 	 */
398 	if (!error) {
399 		if (mp->mnt_ncmountpt.ncp == NULL) {
400 			/*
401 			 * allocate, then unlock, but leave the ref intact
402 			 */
403 			cache_allocroot(&mp->mnt_ncmountpt, mp, NULL);
404 			cache_unlock(&mp->mnt_ncmountpt);
405 		}
406 		mp->mnt_ncmounton = nch;		/* inherits ref */
407 		nch.ncp->nc_flag |= NCF_ISMOUNTPT;
408 
409 		/* XXX get the root of the fs and cache_setvp(mnt_ncmountpt...) */
410 		vclrflags(vp, VMOUNT);
411 		mountlist_insert(mp, MNTINS_LAST);
412 		vn_unlock(vp);
413 		checkdirs(&mp->mnt_ncmounton, &mp->mnt_ncmountpt);
414 		error = vfs_allocate_syncvnode(mp);
415 		vfs_unbusy(mp);
416 		error = VFS_START(mp, 0);
417 		vrele(vp);
418 	} else {
419 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
420 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
421 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
422 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
423 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
424 		vclrflags(vp, VMOUNT);
425 		mp->mnt_vfc->vfc_refcount--;
426 		vfs_unbusy(mp);
427 		kfree(mp, M_MOUNT);
428 		cache_drop(&nch);
429 		vput(vp);
430 	}
431 done:
432 	rel_mplock();
433 	return (error);
434 }
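
/*
 * Illustrative userland sketch (not part of the original source): how
 * sys_mount() is normally reached through mount(2).  The "ufs" type
 * string, the device path and the struct ufs_args private data are
 * assumptions for the example only.
 *
 *	struct ufs_args args = { .fspec = "/dev/ad0s1a" };
 *
 *	if (mount("ufs", "/mnt", MNT_RDONLY, &args) < 0)
 *		err(1, "mount");
 */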
435 
436 /*
437  * Scan all active processes to see if any of them have a current
438  * or root directory onto which the new filesystem has just been
439  * mounted. If so, replace them with the new mount point.
440  *
441  * The passed ncp is ref'd and locked (from the mount code) and
442  * must be associated with the vnode representing the root of the
443  * mount point.
444  */
445 struct checkdirs_info {
446 	struct nchandle old_nch;
447 	struct nchandle new_nch;
448 	struct vnode *old_vp;
449 	struct vnode *new_vp;
450 };
451 
452 static int checkdirs_callback(struct proc *p, void *data);
453 
454 static void
455 checkdirs(struct nchandle *old_nch, struct nchandle *new_nch)
456 {
457 	struct checkdirs_info info;
458 	struct vnode *olddp;
459 	struct vnode *newdp;
460 	struct mount *mp;
461 
462 	/*
463 	 * If the old mount point's vnode has a usecount of 1, it is not
464 	 * being held as a descriptor anywhere.
465 	 */
466 	olddp = old_nch->ncp->nc_vp;
467 	if (olddp == NULL || olddp->v_sysref.refcnt == 1)
468 		return;
469 
470 	/*
471 	 * Force the root vnode of the new mount point to be resolved
472 	 * so we can update any matching processes.
473 	 */
474 	mp = new_nch->mount;
475 	if (VFS_ROOT(mp, &newdp))
476 		panic("mount: lost mount");
477 	cache_setunresolved(new_nch);
478 	cache_setvp(new_nch, newdp);
479 
480 	/*
481 	 * Special handling of the root node
482 	 */
483 	if (rootvnode == olddp) {
484 		vref(newdp);
485 		vfs_cache_setroot(newdp, cache_hold(new_nch));
486 	}
487 
488 	/*
489 	 * Pass newdp separately so the callback does not have to access
490 	 * it via new_nch->ncp->nc_vp.
491 	 */
492 	info.old_nch = *old_nch;
493 	info.new_nch = *new_nch;
494 	info.new_vp = newdp;
495 	allproc_scan(checkdirs_callback, &info);
496 	vput(newdp);
497 }
498 
499 /*
500  * NOTE: callback is not MP safe because the scanned process's filedesc
501  * structure can be ripped out from under us, among other things.
502  */
503 static int
504 checkdirs_callback(struct proc *p, void *data)
505 {
506 	struct checkdirs_info *info = data;
507 	struct filedesc *fdp;
508 	struct nchandle ncdrop1;
509 	struct nchandle ncdrop2;
510 	struct vnode *vprele1;
511 	struct vnode *vprele2;
512 
513 	if ((fdp = p->p_fd) != NULL) {
514 		cache_zero(&ncdrop1);
515 		cache_zero(&ncdrop2);
516 		vprele1 = NULL;
517 		vprele2 = NULL;
518 
519 		/*
520 		 * MPUNSAFE - XXX fdp can be pulled out from under a
521 		 * foreign process.
522 		 *
523 		 * A shared filedesc is ok, we don't have to copy it
524 		 * because we are making this change globally.
525 		 */
526 		spin_lock(&fdp->fd_spin);
527 		if (fdp->fd_ncdir.mount == info->old_nch.mount &&
528 		    fdp->fd_ncdir.ncp == info->old_nch.ncp) {
529 			vprele1 = fdp->fd_cdir;
530 			vref(info->new_vp);
531 			fdp->fd_cdir = info->new_vp;
532 			ncdrop1 = fdp->fd_ncdir;
533 			cache_copy(&info->new_nch, &fdp->fd_ncdir);
534 		}
535 		if (fdp->fd_nrdir.mount == info->old_nch.mount &&
536 		    fdp->fd_nrdir.ncp == info->old_nch.ncp) {
537 			vprele2 = fdp->fd_rdir;
538 			vref(info->new_vp);
539 			fdp->fd_rdir = info->new_vp;
540 			ncdrop2 = fdp->fd_nrdir;
541 			cache_copy(&info->new_nch, &fdp->fd_nrdir);
542 		}
543 		spin_unlock(&fdp->fd_spin);
544 		if (ncdrop1.ncp)
545 			cache_drop(&ncdrop1);
546 		if (ncdrop2.ncp)
547 			cache_drop(&ncdrop2);
548 		if (vprele1)
549 			vrele(vprele1);
550 		if (vprele2)
551 			vrele(vprele2);
552 	}
553 	return(0);
554 }
555 
556 /*
557  * Unmount a file system.
558  *
559  * Note: unmount takes a path to the vnode mounted on as its argument,
560  * not the special file (as it did historically).
561  *
562  * umount_args(char *path, int flags)
563  *
564  * MPALMOSTSAFE
565  */
566 int
567 sys_unmount(struct unmount_args *uap)
568 {
569 	struct thread *td = curthread;
570 	struct proc *p __debugvar = td->td_proc;
571 	struct mount *mp = NULL;
572 	struct nlookupdata nd;
573 	int error;
574 
575 	KKASSERT(p);
576 	get_mplock();
577 	if (td->td_ucred->cr_prison != NULL) {
578 		error = EPERM;
579 		goto done;
580 	}
581 	if (usermount == 0 && (error = priv_check(td, PRIV_ROOT)))
582 		goto done;
583 
584 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
585 	if (error == 0)
586 		error = nlookup(&nd);
587 	if (error)
588 		goto out;
589 
590 	mp = nd.nl_nch.mount;
591 
592 	/*
593 	 * Only root, or the user that did the original mount is
594 	 * permitted to unmount this filesystem.
595 	 */
596 	if ((mp->mnt_stat.f_owner != td->td_ucred->cr_uid) &&
597 	    (error = priv_check(td, PRIV_ROOT)))
598 		goto out;
599 
600 	/*
601 	 * Don't allow unmounting the root file system.
602 	 */
603 	if (mp->mnt_flag & MNT_ROOTFS) {
604 		error = EINVAL;
605 		goto out;
606 	}
607 
608 	/*
609 	 * Must be the root of the filesystem
610 	 */
611 	if (nd.nl_nch.ncp != mp->mnt_ncmountpt.ncp) {
612 		error = EINVAL;
613 		goto out;
614 	}
615 
616 out:
617 	nlookup_done(&nd);
618 	if (error == 0)
619 		error = dounmount(mp, uap->flags);
620 done:
621 	rel_mplock();
622 	return (error);
623 }
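
/*
 * Illustrative userland sketch (assumptions only): per the note above,
 * unmount(2) is handed the mount point path, not the device special
 * file.
 *
 *	if (unmount("/mnt", 0) < 0 && errno == EBUSY)
 *		(void)unmount("/mnt", MNT_FORCE);
 */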
624 
625 /*
626  * Do the actual file system unmount.
627  */
628 static int
629 dounmount_interlock(struct mount *mp)
630 {
631 	if (mp->mnt_kern_flag & MNTK_UNMOUNT)
632 		return (EBUSY);
633 	mp->mnt_kern_flag |= MNTK_UNMOUNT;
634 	return(0);
635 }
636 
637 static int
638 unmount_allproc_cb(struct proc *p, void *arg)
639 {
640 	struct mount *mp;
641 
642 	if (p->p_textnch.ncp == NULL)
643 		return 0;
644 
645 	mp = (struct mount *)arg;
646 	if (p->p_textnch.mount == mp)
647 		cache_drop(&p->p_textnch);
648 
649 	return 0;
650 }
651 
652 int
653 dounmount(struct mount *mp, int flags)
654 {
655 	struct namecache *ncp;
656 	struct nchandle nch;
657 	struct vnode *vp;
658 	int error;
659 	int async_flag;
660 	int lflags;
661 	int freeok = 1;
662 
663 	/*
664 	 * Exclusive access for unmounting purposes
665 	 */
666 	if ((error = mountlist_interlock(dounmount_interlock, mp)) != 0)
667 		return (error);
668 
669 	/*
670 	 * Allow filesystems to detect that a forced unmount is in progress.
671 	 */
672 	if (flags & MNT_FORCE)
673 		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
674 	lflags = LK_EXCLUSIVE | ((flags & MNT_FORCE) ? 0 : LK_NOWAIT);
675 	error = lockmgr(&mp->mnt_lock, lflags);
676 	if (error) {
677 		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
678 		if (mp->mnt_kern_flag & MNTK_MWAIT)
679 			wakeup(mp);
680 		return (error);
681 	}
682 
683 	if (mp->mnt_flag & MNT_EXPUBLIC)
684 		vfs_setpublicfs(NULL, NULL, NULL);
685 
686 	vfs_msync(mp, MNT_WAIT);
687 	async_flag = mp->mnt_flag & MNT_ASYNC;
688 	mp->mnt_flag &=~ MNT_ASYNC;
689 
690 	/*
691 	 * If this filesystem isn't aliasing other filesystems,
692 	 * try to invalidate any remaining namecache entries and
693 	 * check the count afterwards.
694 	 */
695 	if ((mp->mnt_kern_flag & MNTK_NCALIASED) == 0) {
696 		cache_lock(&mp->mnt_ncmountpt);
697 		cache_inval(&mp->mnt_ncmountpt, CINV_DESTROY|CINV_CHILDREN);
698 		cache_unlock(&mp->mnt_ncmountpt);
699 
700 		if ((ncp = mp->mnt_ncmountpt.ncp) != NULL &&
701 		    (ncp->nc_refs != 1 || TAILQ_FIRST(&ncp->nc_list))) {
702 			allproc_scan(&unmount_allproc_cb, mp);
703 		}
704 
705 		if ((ncp = mp->mnt_ncmountpt.ncp) != NULL &&
706 		    (ncp->nc_refs != 1 || TAILQ_FIRST(&ncp->nc_list))) {
707 
708 			if ((flags & MNT_FORCE) == 0) {
709 				error = EBUSY;
710 				mount_warning(mp, "Cannot unmount: "
711 						  "%d namecache "
712 						  "references still "
713 						  "present",
714 						  ncp->nc_refs - 1);
715 			} else {
716 				mount_warning(mp, "Forced unmount: "
717 						  "%d namecache "
718 						  "references still "
719 						  "present",
720 						  ncp->nc_refs - 1);
721 				freeok = 0;
722 			}
723 		}
724 	}
725 
726 	/*
727 	 * nchandle records ref the mount structure.  Expect a count of 1
728 	 * (our mount->mnt_ncmountpt).
729 	 */
730 	if (mp->mnt_refs != 1) {
731 		if ((flags & MNT_FORCE) == 0) {
732 			mount_warning(mp, "Cannot unmount: "
733 					  "%d process references still "
734 					  "present", mp->mnt_refs);
735 			error = EBUSY;
736 		} else {
737 			mount_warning(mp, "Forced unmount: "
738 					  "%d process references still "
739 					  "present", mp->mnt_refs);
740 			freeok = 0;
741 		}
742 	}
743 
744 	/*
745 	 * Decommission our special mnt_syncer vnode.  This also stops
746 	 * the vnlru code.  If we are unable to unmount we recommission
747 	 * the vnode.
748 	 */
749 	if (error == 0) {
750 		if ((vp = mp->mnt_syncer) != NULL) {
751 			mp->mnt_syncer = NULL;
752 			vrele(vp);
753 		}
754 		if (((mp->mnt_flag & MNT_RDONLY) ||
755 		     (error = VFS_SYNC(mp, MNT_WAIT)) == 0) ||
756 		    (flags & MNT_FORCE)) {
757 			error = VFS_UNMOUNT(mp, flags);
758 		}
759 	}
760 	if (error) {
761 		if (mp->mnt_syncer == NULL)
762 			vfs_allocate_syncvnode(mp);
763 		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
764 		mp->mnt_flag |= async_flag;
765 		lockmgr(&mp->mnt_lock, LK_RELEASE);
766 		if (mp->mnt_kern_flag & MNTK_MWAIT)
767 			wakeup(mp);
768 		return (error);
769 	}
770 	/*
771 	 * Clean up any journals still associated with the mount after
772 	 * filesystem activity has ceased.
773 	 */
774 	journal_remove_all_journals(mp,
775 	    ((flags & MNT_FORCE) ? MC_JOURNAL_STOP_IMM : 0));
776 
777 	mountlist_remove(mp);
778 
779 	/*
780 	 * Remove any installed vnode ops here so the individual VFSs don't
781 	 * have to.
782 	 */
783 	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
784 	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
785 	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
786 	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
787 	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
788 
789 	if (mp->mnt_ncmountpt.ncp != NULL) {
790 		nch = mp->mnt_ncmountpt;
791 		cache_zero(&mp->mnt_ncmountpt);
792 		cache_clrmountpt(&nch);
793 		cache_drop(&nch);
794 	}
795 	if (mp->mnt_ncmounton.ncp != NULL) {
796 		nch = mp->mnt_ncmounton;
797 		cache_zero(&mp->mnt_ncmounton);
798 		cache_clrmountpt(&nch);
799 		cache_drop(&nch);
800 	}
801 
802 	mp->mnt_vfc->vfc_refcount--;
803 	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist))
804 		panic("unmount: dangling vnode");
805 	lockmgr(&mp->mnt_lock, LK_RELEASE);
806 	if (mp->mnt_kern_flag & MNTK_MWAIT)
807 		wakeup(mp);
808 	if (freeok)
809 		kfree(mp, M_MOUNT);
810 	return (0);
811 }
812 
813 static
814 void
815 mount_warning(struct mount *mp, const char *ctl, ...)
816 {
817 	char *ptr;
818 	char *buf;
819 	__va_list va;
820 
821 	__va_start(va, ctl);
822 	if (cache_fullpath(NULL, &mp->mnt_ncmounton, &ptr, &buf, 0) == 0) {
823 		kprintf("unmount(%s): ", ptr);
824 		kvprintf(ctl, va);
825 		kprintf("\n");
826 		kfree(buf, M_TEMP);
827 	} else {
828 		kprintf("unmount(%p", mp);
829 		if (mp->mnt_ncmounton.ncp && mp->mnt_ncmounton.ncp->nc_name)
830 			kprintf(",%s", mp->mnt_ncmounton.ncp->nc_name);
831 		kprintf("): ");
832 		kvprintf(ctl, va);
833 		kprintf("\n");
834 	}
835 	__va_end(va);
836 }
837 
838 /*
839  * Shim cache_fullpath() to handle the case where a process is chrooted into
840  * a subdirectory of a mount.  In that case, if the mount matches the
841  * mount of the process's root directory, we resolve the path from the
842  * process's root directory instead of the mount point, because the
843  * mount point might be above the root directory.
844  */
845 static
846 int
847 mount_path(struct proc *p, struct mount *mp, char **rb, char **fb)
848 {
849 	struct nchandle *nch;
850 
851 	if (p && p->p_fd->fd_nrdir.mount == mp)
852 		nch = &p->p_fd->fd_nrdir;
853 	else
854 		nch = &mp->mnt_ncmountpt;
855 	return(cache_fullpath(p, nch, rb, fb, 0));
856 }
857 
858 /*
859  * Sync each mounted filesystem.
860  */
861 
862 #ifdef DEBUG
863 static int syncprt = 0;
864 SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
865 #endif /* DEBUG */
866 
867 static int sync_callback(struct mount *mp, void *data);
868 
869 int
870 sys_sync(struct sync_args *uap)
871 {
872 	mountlist_scan(sync_callback, NULL, MNTSCAN_FORWARD);
873 #ifdef DEBUG
874 	/*
875 	 * print out buffer pool stat information on each sync() call.
876 	 */
877 	if (syncprt)
878 		vfs_bufstats();
879 #endif /* DEBUG */
880 	return (0);
881 }
882 
883 static
884 int
885 sync_callback(struct mount *mp, void *data __unused)
886 {
887 	int asyncflag;
888 
889 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
890 		asyncflag = mp->mnt_flag & MNT_ASYNC;
891 		mp->mnt_flag &= ~MNT_ASYNC;
892 		vfs_msync(mp, MNT_NOWAIT);
893 		VFS_SYNC(mp, MNT_NOWAIT | MNT_LAZY);
894 		mp->mnt_flag |= asyncflag;
895 	}
896 	return(0);
897 }
898 
899 /* XXX PRISON: could be per prison flag */
900 static int prison_quotas;
901 #if 0
902 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
903 #endif
904 
905 /*
906  * quotactl_args(char *path, int cmd, int uid, caddr_t arg)
907  *
908  * Change filesystem quotas.
909  *
910  * MPALMOSTSAFE
911  */
912 int
913 sys_quotactl(struct quotactl_args *uap)
914 {
915 	struct nlookupdata nd;
916 	struct thread *td;
917 	struct mount *mp;
918 	int error;
919 
920 	get_mplock();
921 	td = curthread;
922 	if (td->td_ucred->cr_prison && !prison_quotas) {
923 		error = EPERM;
924 		goto done;
925 	}
926 
927 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
928 	if (error == 0)
929 		error = nlookup(&nd);
930 	if (error == 0) {
931 		mp = nd.nl_nch.mount;
932 		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid,
933 				    uap->arg, nd.nl_cred);
934 	}
935 	nlookup_done(&nd);
936 done:
937 	rel_mplock();
938 	return (error);
939 }
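
/*
 * Illustrative userland sketch (hedged; QCMD, Q_GETQUOTA, USRQUOTA and
 * struct dqblk come from the UFS quota headers and are assumptions
 * here, not something this file defines):
 *
 *	struct dqblk dq;
 *
 *	if (quotactl("/home", QCMD(Q_GETQUOTA, USRQUOTA),
 *	    getuid(), (caddr_t)&dq) == 0)
 *		printf("blocks in use: %u\n", dq.dqb_curblocks);
 */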
940 
941 /*
942  * mountctl(char *path, int op, int fd, const void *ctl, int ctllen,
943  *		void *buf, int buflen)
944  *
945  * This function operates on a mount point and executes the specified
946  * operation using the specified control data, and possibly returns data.
947  *
948  * On success the actual number of bytes stored in the result buffer is
949  * returned (0 if none); on failure an error is returned.
950  *
951  * MPALMOSTSAFE
952  */
953 int
954 sys_mountctl(struct mountctl_args *uap)
955 {
956 	struct thread *td = curthread;
957 	struct proc *p = td->td_proc;
958 	struct file *fp;
959 	void *ctl = NULL;
960 	void *buf = NULL;
961 	char *path = NULL;
962 	int error;
963 
964 	/*
965 	 * Sanity and permissions checks.  We must be root except for the
966 	 * MOUNTCTL_MOUNTFLAGS op.
966 	 */
967 	KKASSERT(p);
968 	if (td->td_ucred->cr_prison != NULL)
969 		return (EPERM);
970 	if ((uap->op != MOUNTCTL_MOUNTFLAGS) &&
971 	    (error = priv_check(td, PRIV_ROOT)) != 0)
972 		return (error);
973 
974 	/*
975 	 * Argument length checks
976 	 */
977 	if (uap->ctllen < 0 || uap->ctllen > 1024)
978 		return (EINVAL);
979 	if (uap->buflen < 0 || uap->buflen > 16 * 1024)
980 		return (EINVAL);
981 	if (uap->path == NULL)
982 		return (EINVAL);
983 
984 	/*
985 	 * Allocate the necessary buffers and copyin data
986 	 */
987 	path = objcache_get(namei_oc, M_WAITOK);
988 	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
989 	if (error)
990 		goto done;
991 
992 	if (uap->ctllen) {
993 		ctl = kmalloc(uap->ctllen + 1, M_TEMP, M_WAITOK|M_ZERO);
994 		error = copyin(uap->ctl, ctl, uap->ctllen);
995 		if (error)
996 			goto done;
997 	}
998 	if (uap->buflen)
999 		buf = kmalloc(uap->buflen + 1, M_TEMP, M_WAITOK|M_ZERO);
1000 
1001 	/*
1002 	 * Validate the descriptor
1003 	 */
1004 	if (uap->fd >= 0) {
1005 		fp = holdfp(p->p_fd, uap->fd, -1);
1006 		if (fp == NULL) {
1007 			error = EBADF;
1008 			goto done;
1009 		}
1010 	} else {
1011 		fp = NULL;
1012 	}
1013 
1014 	/*
1015 	 * Execute the internal kernel function and clean up.
1016 	 */
1017 	get_mplock();
1018 	error = kern_mountctl(path, uap->op, fp, ctl, uap->ctllen, buf, uap->buflen, &uap->sysmsg_result);
1019 	rel_mplock();
1020 	if (fp)
1021 		fdrop(fp);
1022 	if (error == 0 && uap->sysmsg_result > 0)
1023 		error = copyout(buf, uap->buf, uap->sysmsg_result);
1024 done:
1025 	if (path)
1026 		objcache_put(namei_oc, path);
1027 	if (ctl)
1028 		kfree(ctl, M_TEMP);
1029 	if (buf)
1030 		kfree(buf, M_TEMP);
1031 	return (error);
1032 }
1033 
1034 /*
1035  * Execute a mount control operation by resolving the path to a mount point
1036  * and calling vop_mountctl().
1037  *
1038  * Use the mount point from the nch instead of the vnode so nullfs mounts
1039  * can properly spike the VOP.
1040  */
1041 int
1042 kern_mountctl(const char *path, int op, struct file *fp,
1043 		const void *ctl, int ctllen,
1044 		void *buf, int buflen, int *res)
1045 {
1046 	struct vnode *vp;
1047 	struct mount *mp;
1048 	struct nlookupdata nd;
1049 	int error;
1050 
1051 	*res = 0;
1052 	vp = NULL;
1053 	error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_FOLLOW);
1054 	if (error == 0)
1055 		error = nlookup(&nd);
1056 	if (error == 0)
1057 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
1058 	mp = nd.nl_nch.mount;
1059 	nlookup_done(&nd);
1060 	if (error)
1061 		return (error);
1062 	vn_unlock(vp);
1063 
1064 	/*
1065 	 * Must be the root of the filesystem
1066 	 */
1067 	if ((vp->v_flag & (VROOT|VPFSROOT)) == 0) {
1068 		vrele(vp);
1069 		return (EINVAL);
1070 	}
1071 	error = vop_mountctl(mp->mnt_vn_use_ops, vp, op, fp, ctl, ctllen,
1072 			     buf, buflen, res);
1073 	vrele(vp);
1074 	return (error);
1075 }
1076 
1077 int
1078 kern_statfs(struct nlookupdata *nd, struct statfs *buf)
1079 {
1080 	struct thread *td = curthread;
1081 	struct proc *p = td->td_proc;
1082 	struct mount *mp;
1083 	struct statfs *sp;
1084 	char *fullpath, *freepath;
1085 	int error;
1086 
1087 	if ((error = nlookup(nd)) != 0)
1088 		return (error);
1089 	mp = nd->nl_nch.mount;
1090 	sp = &mp->mnt_stat;
1091 	if ((error = VFS_STATFS(mp, sp, nd->nl_cred)) != 0)
1092 		return (error);
1093 
1094 	error = mount_path(p, mp, &fullpath, &freepath);
1095 	if (error)
1096 		return(error);
1097 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1098 	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1099 	kfree(freepath, M_TEMP);
1100 
1101 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1102 	bcopy(sp, buf, sizeof(*buf));
1103 	/* Only root should have access to the fsid's. */
1104 	if (priv_check(td, PRIV_ROOT))
1105 		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
1106 	return (0);
1107 }
1108 
1109 /*
1110  * statfs_args(char *path, struct statfs *buf)
1111  *
1112  * Get filesystem statistics.
1113  */
1114 int
1115 sys_statfs(struct statfs_args *uap)
1116 {
1117 	struct nlookupdata nd;
1118 	struct statfs buf;
1119 	int error;
1120 
1121 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1122 	if (error == 0)
1123 		error = kern_statfs(&nd, &buf);
1124 	nlookup_done(&nd);
1125 	if (error == 0)
1126 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1127 	return (error);
1128 }
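
/*
 * Illustrative userland sketch (not part of the original source) of
 * the statfs(2) path handled above:
 *
 *	struct statfs sb;
 *
 *	if (statfs("/usr", &sb) == 0)
 *		printf("%s on %s (%s)\n",
 *		    sb.f_mntfromname, sb.f_mntonname, sb.f_fstypename);
 */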
1129 
1130 int
1131 kern_fstatfs(int fd, struct statfs *buf)
1132 {
1133 	struct thread *td = curthread;
1134 	struct proc *p = td->td_proc;
1135 	struct file *fp;
1136 	struct mount *mp;
1137 	struct statfs *sp;
1138 	char *fullpath, *freepath;
1139 	int error;
1140 
1141 	KKASSERT(p);
1142 	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
1143 		return (error);
1144 	mp = ((struct vnode *)fp->f_data)->v_mount;
1145 	if (mp == NULL) {
1146 		error = EBADF;
1147 		goto done;
1148 	}
1149 	if (fp->f_cred == NULL) {
1150 		error = EINVAL;
1151 		goto done;
1152 	}
1153 	sp = &mp->mnt_stat;
1154 	if ((error = VFS_STATFS(mp, sp, fp->f_cred)) != 0)
1155 		goto done;
1156 
1157 	if ((error = mount_path(p, mp, &fullpath, &freepath)) != 0)
1158 		goto done;
1159 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1160 	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1161 	kfree(freepath, M_TEMP);
1162 
1163 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1164 	bcopy(sp, buf, sizeof(*buf));
1165 
1166 	/* Only root should have access to the fsid's. */
1167 	if (priv_check(td, PRIV_ROOT))
1168 		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
1169 	error = 0;
1170 done:
1171 	fdrop(fp);
1172 	return (error);
1173 }
1174 
1175 /*
1176  * fstatfs_args(int fd, struct statfs *buf)
1177  *
1178  * Get filesystem statistics.
1179  */
1180 int
1181 sys_fstatfs(struct fstatfs_args *uap)
1182 {
1183 	struct statfs buf;
1184 	int error;
1185 
1186 	error = kern_fstatfs(uap->fd, &buf);
1187 
1188 	if (error == 0)
1189 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1190 	return (error);
1191 }
1192 
1193 int
1194 kern_statvfs(struct nlookupdata *nd, struct statvfs *buf)
1195 {
1196 	struct mount *mp;
1197 	struct statvfs *sp;
1198 	int error;
1199 
1200 	if ((error = nlookup(nd)) != 0)
1201 		return (error);
1202 	mp = nd->nl_nch.mount;
1203 	sp = &mp->mnt_vstat;
1204 	if ((error = VFS_STATVFS(mp, sp, nd->nl_cred)) != 0)
1205 		return (error);
1206 
1207 	sp->f_flag = 0;
1208 	if (mp->mnt_flag & MNT_RDONLY)
1209 		sp->f_flag |= ST_RDONLY;
1210 	if (mp->mnt_flag & MNT_NOSUID)
1211 		sp->f_flag |= ST_NOSUID;
1212 	bcopy(sp, buf, sizeof(*buf));
1213 	return (0);
1214 }
1215 
1216 /*
1217  * statvfs_args(char *path, struct statvfs *buf)
1218  *
1219  * Get filesystem statistics.
1220  */
1221 int
1222 sys_statvfs(struct statvfs_args *uap)
1223 {
1224 	struct nlookupdata nd;
1225 	struct statvfs buf;
1226 	int error;
1227 
1228 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1229 	if (error == 0)
1230 		error = kern_statvfs(&nd, &buf);
1231 	nlookup_done(&nd);
1232 	if (error == 0)
1233 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1234 	return (error);
1235 }
1236 
1237 int
1238 kern_fstatvfs(int fd, struct statvfs *buf)
1239 {
1240 	struct thread *td = curthread;
1241 	struct proc *p = td->td_proc;
1242 	struct file *fp;
1243 	struct mount *mp;
1244 	struct statvfs *sp;
1245 	int error;
1246 
1247 	KKASSERT(p);
1248 	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
1249 		return (error);
1250 	mp = ((struct vnode *)fp->f_data)->v_mount;
1251 	if (mp == NULL) {
1252 		error = EBADF;
1253 		goto done;
1254 	}
1255 	if (fp->f_cred == NULL) {
1256 		error = EINVAL;
1257 		goto done;
1258 	}
1259 	sp = &mp->mnt_vstat;
1260 	if ((error = VFS_STATVFS(mp, sp, fp->f_cred)) != 0)
1261 		goto done;
1262 
1263 	sp->f_flag = 0;
1264 	if (mp->mnt_flag & MNT_RDONLY)
1265 		sp->f_flag |= ST_RDONLY;
1266 	if (mp->mnt_flag & MNT_NOSUID)
1267 		sp->f_flag |= ST_NOSUID;
1268 
1269 	bcopy(sp, buf, sizeof(*buf));
1270 	error = 0;
1271 done:
1272 	fdrop(fp);
1273 	return (error);
1274 }
1275 
1276 /*
1277  * fstatvfs_args(int fd, struct statvfs *buf)
1278  *
1279  * Get filesystem statistics.
1280  */
1281 int
1282 sys_fstatvfs(struct fstatvfs_args *uap)
1283 {
1284 	struct statvfs buf;
1285 	int error;
1286 
1287 	error = kern_fstatvfs(uap->fd, &buf);
1288 
1289 	if (error == 0)
1290 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1291 	return (error);
1292 }
1293 
1294 /*
1295  * getfsstat_args(struct statfs *buf, long bufsize, int flags)
1296  *
1297  * Get statistics on all filesystems.
1298  */
1299 
1300 struct getfsstat_info {
1301 	struct statfs *sfsp;
1302 	long count;
1303 	long maxcount;
1304 	int error;
1305 	int flags;
1306 	struct thread *td;
1307 };
1308 
1309 static int getfsstat_callback(struct mount *, void *);
1310 
1311 int
1312 sys_getfsstat(struct getfsstat_args *uap)
1313 {
1314 	struct thread *td = curthread;
1315 	struct getfsstat_info info;
1316 
1317 	bzero(&info, sizeof(info));
1318 
1319 	info.maxcount = uap->bufsize / sizeof(struct statfs);
1320 	info.sfsp = uap->buf;
1321 	info.count = 0;
1322 	info.flags = uap->flags;
1323 	info.td = td;
1324 
1325 	mountlist_scan(getfsstat_callback, &info, MNTSCAN_FORWARD);
1326 	if (info.sfsp && info.count > info.maxcount)
1327 		uap->sysmsg_result = info.maxcount;
1328 	else
1329 		uap->sysmsg_result = info.count;
1330 	return (info.error);
1331 }
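
/*
 * Illustrative userland sketch (assumption): the usual two-pass use of
 * getfsstat(2), first sizing the buffer and then fetching the entries
 * without refreshing the per-mount statistics (MNT_NOWAIT).
 *
 *	int n = getfsstat(NULL, 0, MNT_NOWAIT);
 *	struct statfs *sf = calloc(n, sizeof(*sf));
 *
 *	n = getfsstat(sf, n * sizeof(*sf), MNT_NOWAIT);
 */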
1332 
1333 static int
1334 getfsstat_callback(struct mount *mp, void *data)
1335 {
1336 	struct getfsstat_info *info = data;
1337 	struct statfs *sp;
1338 	char *freepath;
1339 	char *fullpath;
1340 	int error;
1341 
1342 	if (info->sfsp && info->count < info->maxcount) {
1343 		if (info->td->td_proc &&
1344 		    !chroot_visible_mnt(mp, info->td->td_proc)) {
1345 			return(0);
1346 		}
1347 		sp = &mp->mnt_stat;
1348 
1349 		/*
1350 		 * If MNT_NOWAIT or MNT_LAZY is specified, do not
1351 		 * refresh the fsstat cache unless MNT_WAIT is also
1352 		 * given; MNT_WAIT overrides MNT_NOWAIT and MNT_LAZY.
1353 		 */
1354 		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
1355 		    (info->flags & MNT_WAIT)) &&
1356 		    (error = VFS_STATFS(mp, sp, info->td->td_ucred))) {
1357 			return(0);
1358 		}
1359 		sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1360 
1361 		error = mount_path(info->td->td_proc, mp, &fullpath, &freepath);
1362 		if (error) {
1363 			info->error = error;
1364 			return(-1);
1365 		}
1366 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1367 		strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1368 		kfree(freepath, M_TEMP);
1369 
1370 		error = copyout(sp, info->sfsp, sizeof(*sp));
1371 		if (error) {
1372 			info->error = error;
1373 			return (-1);
1374 		}
1375 		++info->sfsp;
1376 	}
1377 	info->count++;
1378 	return(0);
1379 }
1380 
1381 /*
1382  * getvfsstat_args(struct statfs *buf, struct statvfs *vbuf,
1383  *		   long vbufsize, int flags)
1384  *
1385  * Get statistics on all filesystems.
1386  */
1387 
1388 struct getvfsstat_info {
1389 	struct statfs *sfsp;
1390 	struct statvfs *vsfsp;
1391 	long count;
1392 	long maxcount;
1393 	int error;
1394 	int flags;
1395 	struct thread *td;
1396 };
1397 
1398 static int getvfsstat_callback(struct mount *, void *);
1399 
1400 int
1401 sys_getvfsstat(struct getvfsstat_args *uap)
1402 {
1403 	struct thread *td = curthread;
1404 	struct getvfsstat_info info;
1405 
1406 	bzero(&info, sizeof(info));
1407 
1408 	info.maxcount = uap->vbufsize / sizeof(struct statvfs);
1409 	info.sfsp = uap->buf;
1410 	info.vsfsp = uap->vbuf;
1411 	info.count = 0;
1412 	info.flags = uap->flags;
1413 	info.td = td;
1414 
1415 	mountlist_scan(getvfsstat_callback, &info, MNTSCAN_FORWARD);
1416 	if (info.vsfsp && info.count > info.maxcount)
1417 		uap->sysmsg_result = info.maxcount;
1418 	else
1419 		uap->sysmsg_result = info.count;
1420 	return (info.error);
1421 }
1422 
1423 static int
1424 getvfsstat_callback(struct mount *mp, void *data)
1425 {
1426 	struct getvfsstat_info *info = data;
1427 	struct statfs *sp;
1428 	struct statvfs *vsp;
1429 	char *freepath;
1430 	char *fullpath;
1431 	int error;
1432 
1433 	if (info->vsfsp && info->count < info->maxcount) {
1434 		if (info->td->td_proc &&
1435 		    !chroot_visible_mnt(mp, info->td->td_proc)) {
1436 			return(0);
1437 		}
1438 		sp = &mp->mnt_stat;
1439 		vsp = &mp->mnt_vstat;
1440 
1441 		/*
1442 		 * If MNT_NOWAIT or MNT_LAZY is specified, do not
1443 		 * refresh the fsstat cache unless MNT_WAIT is also
1444 		 * given; MNT_WAIT overrides MNT_NOWAIT and MNT_LAZY.
1445 		 */
1446 		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
1447 		    (info->flags & MNT_WAIT)) &&
1448 		    (error = VFS_STATFS(mp, sp, info->td->td_ucred))) {
1449 			return(0);
1450 		}
1451 		sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1452 
1453 		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
1454 		    (info->flags & MNT_WAIT)) &&
1455 		    (error = VFS_STATVFS(mp, vsp, info->td->td_ucred))) {
1456 			return(0);
1457 		}
1458 		vsp->f_flag = 0;
1459 		if (mp->mnt_flag & MNT_RDONLY)
1460 			vsp->f_flag |= ST_RDONLY;
1461 		if (mp->mnt_flag & MNT_NOSUID)
1462 			vsp->f_flag |= ST_NOSUID;
1463 
1464 		error = mount_path(info->td->td_proc, mp, &fullpath, &freepath);
1465 		if (error) {
1466 			info->error = error;
1467 			return(-1);
1468 		}
1469 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1470 		strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1471 		kfree(freepath, M_TEMP);
1472 
1473 		error = copyout(sp, info->sfsp, sizeof(*sp));
1474 		if (error == 0)
1475 			error = copyout(vsp, info->vsfsp, sizeof(*vsp));
1476 		if (error) {
1477 			info->error = error;
1478 			return (-1);
1479 		}
1480 		++info->sfsp;
1481 		++info->vsfsp;
1482 	}
1483 	info->count++;
1484 	return(0);
1485 }
1486 
1487 
1488 /*
1489  * fchdir_args(int fd)
1490  *
1491  * Change current working directory to a given file descriptor.
1492  */
1493 int
1494 sys_fchdir(struct fchdir_args *uap)
1495 {
1496 	struct thread *td = curthread;
1497 	struct proc *p = td->td_proc;
1498 	struct filedesc *fdp = p->p_fd;
1499 	struct vnode *vp, *ovp;
1500 	struct mount *mp;
1501 	struct file *fp;
1502 	struct nchandle nch, onch, tnch;
1503 	int error;
1504 
1505 	if ((error = holdvnode(fdp, uap->fd, &fp)) != 0)
1506 		return (error);
1507 	lwkt_gettoken(&p->p_token);
1508 	vp = (struct vnode *)fp->f_data;
1509 	vref(vp);
1510 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1511 	if (fp->f_nchandle.ncp == NULL)
1512 		error = ENOTDIR;
1513 	else
1514 		error = checkvp_chdir(vp, td);
1515 	if (error) {
1516 		vput(vp);
1517 		goto done;
1518 	}
1519 	cache_copy(&fp->f_nchandle, &nch);
1520 
1521 	/*
1522 	 * If the ncp has become a mount point, traverse through
1523 	 * the mount point.
1524 	 */
1525 
1526 	while (!error && (nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
1527 	       (mp = cache_findmount(&nch)) != NULL
1528 	) {
1529 		error = nlookup_mp(mp, &tnch);
1530 		if (error == 0) {
1531 			cache_unlock(&tnch);	/* leave ref intact */
1532 			vput(vp);
1533 			vp = tnch.ncp->nc_vp;
1534 			error = vget(vp, LK_SHARED);
1535 			KKASSERT(error == 0);
1536 			cache_drop(&nch);
1537 			nch = tnch;
1538 		}
1539 	}
1540 	if (error == 0) {
1541 		ovp = fdp->fd_cdir;
1542 		onch = fdp->fd_ncdir;
1543 		vn_unlock(vp);		/* leave ref intact */
1544 		fdp->fd_cdir = vp;
1545 		fdp->fd_ncdir = nch;
1546 		cache_drop(&onch);
1547 		vrele(ovp);
1548 	} else {
1549 		cache_drop(&nch);
1550 		vput(vp);
1551 	}
1552 	fdrop(fp);
1553 done:
1554 	lwkt_reltoken(&p->p_token);
1555 	return (error);
1556 }
1557 
1558 int
1559 kern_chdir(struct nlookupdata *nd)
1560 {
1561 	struct thread *td = curthread;
1562 	struct proc *p = td->td_proc;
1563 	struct filedesc *fdp = p->p_fd;
1564 	struct vnode *vp, *ovp;
1565 	struct nchandle onch;
1566 	int error;
1567 
1568 	if ((error = nlookup(nd)) != 0)
1569 		return (error);
1570 	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
1571 		return (ENOENT);
1572 	if ((error = vget(vp, LK_SHARED)) != 0)
1573 		return (error);
1574 
1575 	lwkt_gettoken(&p->p_token);
1576 	error = checkvp_chdir(vp, td);
1577 	vn_unlock(vp);
1578 	if (error == 0) {
1579 		ovp = fdp->fd_cdir;
1580 		onch = fdp->fd_ncdir;
1581 		cache_unlock(&nd->nl_nch);	/* leave reference intact */
1582 		fdp->fd_ncdir = nd->nl_nch;
1583 		fdp->fd_cdir = vp;
1584 		cache_drop(&onch);
1585 		vrele(ovp);
1586 		cache_zero(&nd->nl_nch);
1587 	} else {
1588 		vrele(vp);
1589 	}
1590 	lwkt_reltoken(&p->p_token);
1591 	return (error);
1592 }
1593 
1594 /*
1595  * chdir_args(char *path)
1596  *
1597  * Change current working directory (``.'').
1598  */
1599 int
1600 sys_chdir(struct chdir_args *uap)
1601 {
1602 	struct nlookupdata nd;
1603 	int error;
1604 
1605 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1606 	if (error == 0)
1607 		error = kern_chdir(&nd);
1608 	nlookup_done(&nd);
1609 	return (error);
1610 }
1611 
1612 /*
1613  * Helper function for the raised chroot(2) security level: refuse if
1614  * any file descriptors are open directories.
1615  */
1616 static int
1617 chroot_refuse_vdir_fds(struct filedesc *fdp)
1618 {
1619 	struct vnode *vp;
1620 	struct file *fp;
1621 	int error;
1622 	int fd;
1623 
1624 	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
1625 		if ((error = holdvnode(fdp, fd, &fp)) != 0)
1626 			continue;
1627 		vp = (struct vnode *)fp->f_data;
1628 		if (vp->v_type != VDIR) {
1629 			fdrop(fp);
1630 			continue;
1631 		}
1632 		fdrop(fp);
1633 		return(EPERM);
1634 	}
1635 	return (0);
1636 }
1637 
1638 /*
1639  * This sysctl determines if we will allow a process to chroot(2) if it
1640  * has a directory open:
1641  *	0: disallowed for all processes.
1642  *	1: allowed for processes that were not already chroot(2)'ed.
1643  *	2: allowed for all processes.
1644  */
1645 
1646 static int chroot_allow_open_directories = 1;
1647 
1648 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
1649      &chroot_allow_open_directories, 0, "");
1650 
1651 /*
1652  * chroot to the specified namecache entry.  We obtain the vp from the
1653  * namecache data.  The passed ncp must be locked and referenced and will
1654  * remain locked and referenced on return.
1655  */
1656 int
1657 kern_chroot(struct nchandle *nch)
1658 {
1659 	struct thread *td = curthread;
1660 	struct proc *p = td->td_proc;
1661 	struct filedesc *fdp = p->p_fd;
1662 	struct vnode *vp;
1663 	int error;
1664 
1665 	/*
1666 	 * Only privileged user can chroot
1667 	 */
1668 	error = priv_check_cred(td->td_ucred, PRIV_VFS_CHROOT, 0);
1669 	if (error)
1670 		return (error);
1671 
1672 	/*
1673 	 * Disallow open directory descriptors (fchdir() breakouts).
1674 	 */
1675 	if (chroot_allow_open_directories == 0 ||
1676 	   (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
1677 		if ((error = chroot_refuse_vdir_fds(fdp)) != 0)
1678 			return (error);
1679 	}
1680 	if ((vp = nch->ncp->nc_vp) == NULL)
1681 		return (ENOENT);
1682 
1683 	if ((error = vget(vp, LK_SHARED)) != 0)
1684 		return (error);
1685 
1686 	/*
1687 	 * Check the validity of vp as a directory to change to and
1688 	 * associate it with rdir/jdir.
1689 	 */
1690 	error = checkvp_chdir(vp, td);
1691 	vn_unlock(vp);			/* leave reference intact */
1692 	if (error == 0) {
1693 		vrele(fdp->fd_rdir);
1694 		fdp->fd_rdir = vp;	/* reference inherited by fd_rdir */
1695 		cache_drop(&fdp->fd_nrdir);
1696 		cache_copy(nch, &fdp->fd_nrdir);
1697 		if (fdp->fd_jdir == NULL) {
1698 			fdp->fd_jdir = vp;
1699 			vref(fdp->fd_jdir);
1700 			cache_copy(nch, &fdp->fd_njdir);
1701 		}
1702 	} else {
1703 		vrele(vp);
1704 	}
1705 	return (error);
1706 }
1707 
1708 /*
1709  * chroot_args(char *path)
1710  *
1711  * Change notion of root (``/'') directory.
1712  */
1713 int
1714 sys_chroot(struct chroot_args *uap)
1715 {
1716 	struct thread *td __debugvar = curthread;
1717 	struct nlookupdata nd;
1718 	int error;
1719 
1720 	KKASSERT(td->td_proc);
1721 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1722 	if (error == 0) {
1723 		nd.nl_flags |= NLC_EXEC;
1724 		error = nlookup(&nd);
1725 		if (error == 0)
1726 			error = kern_chroot(&nd.nl_nch);
1727 	}
1728 	nlookup_done(&nd);
1729 	return(error);
1730 }
1731 
1732 int
1733 sys_chroot_kernel(struct chroot_kernel_args *uap)
1734 {
1735 	struct thread *td = curthread;
1736 	struct nlookupdata nd;
1737 	struct nchandle *nch;
1738 	struct vnode *vp;
1739 	int error;
1740 
1741 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1742 	if (error)
1743 		goto error_nond;
1744 
1745 	error = nlookup(&nd);
1746 	if (error)
1747 		goto error_out;
1748 
1749 	nch = &nd.nl_nch;
1750 
1751 	error = priv_check_cred(td->td_ucred, PRIV_VFS_CHROOT, 0);
1752 	if (error)
1753 		goto error_out;
1754 
1755 	if ((vp = nch->ncp->nc_vp) == NULL) {
1756 		error = ENOENT;
1757 		goto error_out;
1758 	}
1759 
1760 	if ((error = cache_vref(nch, nd.nl_cred, &vp)) != 0)
1761 		goto error_out;
1762 
1763 	kprintf("chroot_kernel: set new rootnch/rootvnode to %s\n", uap->path);
1764 	get_mplock();
1765 	vfs_cache_setroot(vp, cache_hold(nch));
1766 	rel_mplock();
1767 
1768 error_out:
1769 	nlookup_done(&nd);
1770 error_nond:
1771 	return(error);
1772 }
1773 
1774 /*
1775  * Common routine for chroot and chdir.  Given a locked, referenced vnode,
1776  * determine whether it is legal to chdir to the vnode.  The vnode's state
1777  * is not changed by this call.
1778  */
1779 int
1780 checkvp_chdir(struct vnode *vp, struct thread *td)
1781 {
1782 	int error;
1783 
1784 	if (vp->v_type != VDIR)
1785 		error = ENOTDIR;
1786 	else
1787 		error = VOP_EACCESS(vp, VEXEC, td->td_ucred);
1788 	return (error);
1789 }
1790 
1791 int
1792 kern_open(struct nlookupdata *nd, int oflags, int mode, int *res)
1793 {
1794 	struct thread *td = curthread;
1795 	struct proc *p = td->td_proc;
1796 	struct lwp *lp = td->td_lwp;
1797 	struct filedesc *fdp = p->p_fd;
1798 	int cmode, flags;
1799 	struct file *nfp;
1800 	struct file *fp;
1801 	struct vnode *vp;
1802 	int type, indx, error;
1803 	struct flock lf;
1804 
1805 	if ((oflags & O_ACCMODE) == O_ACCMODE)
1806 		return (EINVAL);
1807 	flags = FFLAGS(oflags);
1808 	error = falloc(lp, &nfp, NULL);
1809 	if (error)
1810 		return (error);
1811 	fp = nfp;
1812 	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
1813 
1814 	/*
1815 	 * XXX p_dupfd is a real mess.  It allows a device to return a
1816 	 * file descriptor to be duplicated rather than doing the open
1817 	 * itself.
1818 	 */
1819 	lp->lwp_dupfd = -1;
1820 
1821 	/*
1822 	 * Call vn_open() to do the lookup and assign the vnode to the
1823 	 * file pointer.  vn_open() does not change the ref count on fp
1824 	 * and the vnode, on success, will be inherited by the file pointer
1825 	 * and unlocked.
1826 	 */
1827 	nd->nl_flags |= NLC_LOCKVP;
1828 	error = vn_open(nd, fp, flags, cmode);
1829 	nlookup_done(nd);
1830 	if (error) {
1831 		/*
1832 		 * handle special fdopen() case.  bleh.  dupfdopen() is
1833 		 * responsible for dropping the old contents of ofiles[indx]
1834 		 * if it succeeds.
1835 		 *
1836 		 * Note that fsetfd() will add a ref to fp which represents
1837 		 * the fd_files[] assignment.  We must still drop our
1838 		 * reference.
1839 		 */
1840 		if ((error == ENODEV || error == ENXIO) && lp->lwp_dupfd >= 0) {
1841 			if (fdalloc(p, 0, &indx) == 0) {
1842 				error = dupfdopen(fdp, indx, lp->lwp_dupfd, flags, error);
1843 				if (error == 0) {
1844 					*res = indx;
1845 					fdrop(fp);	/* our ref */
1846 					return (0);
1847 				}
1848 				fsetfd(fdp, NULL, indx);
1849 			}
1850 		}
1851 		fdrop(fp);	/* our ref */
1852 		if (error == ERESTART)
1853 			error = EINTR;
1854 		return (error);
1855 	}
1856 
1857 	/*
1858 	 * ref the vnode for ourselves so it can't be ripped out from under
1859 	 * us.  XXX need an ND flag to request that the vnode be returned
1860 	 * anyway.
1861 	 *
1862 	 * Reserve a file descriptor but do not assign it until the open
1863 	 * succeeds.
1864 	 */
1865 	vp = (struct vnode *)fp->f_data;
1866 	vref(vp);
1867 	if ((error = fdalloc(p, 0, &indx)) != 0) {
1868 		fdrop(fp);
1869 		vrele(vp);
1870 		return (error);
1871 	}
1872 
1873 	/*
1874 	 * If no error occurs the vp will have been assigned to the file
1875 	 * pointer.
1876 	 */
1877 	lp->lwp_dupfd = 0;
1878 
1879 	if (flags & (O_EXLOCK | O_SHLOCK)) {
1880 		lf.l_whence = SEEK_SET;
1881 		lf.l_start = 0;
1882 		lf.l_len = 0;
1883 		if (flags & O_EXLOCK)
1884 			lf.l_type = F_WRLCK;
1885 		else
1886 			lf.l_type = F_RDLCK;
1887 		if (flags & FNONBLOCK)
1888 			type = 0;
1889 		else
1890 			type = F_WAIT;
1891 
1892 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) {
1893 			/*
1894 			 * lock request failed.  Clean up the reserved
1895 			 * descriptor.
1896 			 */
1897 			vrele(vp);
1898 			fsetfd(fdp, NULL, indx);
1899 			fdrop(fp);
1900 			return (error);
1901 		}
1902 		fp->f_flag |= FHASLOCK;
1903 	}
1904 #if 0
1905 	/*
1906 	 * Assert that all regular file vnodes were created with an object.
1907 	 */
1908 	KASSERT(vp->v_type != VREG || vp->v_object != NULL,
1909 		("open: regular file has no backing object after vn_open"));
1910 #endif
1911 
1912 	vrele(vp);
1913 
1914 	/*
1915 	 * release our private reference, leaving the one associated with the
1916 	 * descriptor table intact.
1917 	 */
1918 	fsetfd(fdp, fp, indx);
1919 	fdrop(fp);
1920 	*res = indx;
1921 	return (0);
1922 }
1923 
1924 /*
1925  * open_args(char *path, int flags, int mode)
1926  *
1927  * Check permissions, allocate an open file structure,
1928  * and call the device open routine if any.
1929  */
1930 int
1931 sys_open(struct open_args *uap)
1932 {
1933 	struct nlookupdata nd;
1934 	int error;
1935 
1936 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1937 	if (error == 0) {
1938 		error = kern_open(&nd, uap->flags,
1939 				    uap->mode, &uap->sysmsg_result);
1940 	}
1941 	nlookup_done(&nd);
1942 	return (error);
1943 }
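
/*
 * Illustrative userland sketch (path is an assumption) exercising the
 * O_EXLOCK handling in kern_open() above; with O_NONBLOCK the advisory
 * lock request does not wait, so a busy lock fails the open:
 *
 *	int fd = open("/var/run/example.pid",
 *	    O_RDWR | O_CREAT | O_EXLOCK | O_NONBLOCK, 0644);
 *
 *	if (fd < 0 && errno == EAGAIN)
 *		errx(1, "another instance holds the lock");
 */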
1944 
1945 /*
1946  * openat_args(int fd, char *path, int flags, int mode)
1947  */
1948 int
1949 sys_openat(struct openat_args *uap)
1950 {
1951 	struct nlookupdata nd;
1952 	int error;
1953 	struct file *fp;
1954 
1955 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
1956 	if (error == 0) {
1957 		error = kern_open(&nd, uap->flags, uap->mode,
1958 					&uap->sysmsg_result);
1959 	}
1960 	nlookup_done_at(&nd, fp);
1961 	return (error);
1962 }
1963 
1964 int
1965 kern_mknod(struct nlookupdata *nd, int mode, int rmajor, int rminor)
1966 {
1967 	struct thread *td = curthread;
1968 	struct proc *p = td->td_proc;
1969 	struct vnode *vp;
1970 	struct vattr vattr;
1971 	int error;
1972 	int whiteout = 0;
1973 
1974 	KKASSERT(p);
1975 
1976 	VATTR_NULL(&vattr);
1977 	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
1978 	vattr.va_rmajor = rmajor;
1979 	vattr.va_rminor = rminor;
1980 
1981 	switch (mode & S_IFMT) {
1982 	case S_IFMT:	/* used by badsect to flag bad sectors */
1983 		error = priv_check_cred(td->td_ucred, PRIV_VFS_MKNOD_BAD, 0);
1984 		vattr.va_type = VBAD;
1985 		break;
1986 	case S_IFCHR:
1987 		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1988 		vattr.va_type = VCHR;
1989 		break;
1990 	case S_IFBLK:
1991 		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1992 		vattr.va_type = VBLK;
1993 		break;
1994 	case S_IFWHT:
1995 		error = priv_check_cred(td->td_ucred, PRIV_VFS_MKNOD_WHT, 0);
1996 		whiteout = 1;
1997 		break;
1998 	case S_IFDIR:	/* special directory support for HAMMER */
1999 		error = priv_check_cred(td->td_ucred, PRIV_VFS_MKNOD_DIR, 0);
2000 		vattr.va_type = VDIR;
2001 		break;
2002 	default:
2003 		error = EINVAL;
2004 		break;
2005 	}
2006 
2007 	if (error)
2008 		return (error);
2009 
2010 	bwillinode(1);
2011 	nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2012 	if ((error = nlookup(nd)) != 0)
2013 		return (error);
2014 	if (nd->nl_nch.ncp->nc_vp)
2015 		return (EEXIST);
2016 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2017 		return (error);
2018 
2019 	if (whiteout) {
2020 		error = VOP_NWHITEOUT(&nd->nl_nch, nd->nl_dvp,
2021 				      nd->nl_cred, NAMEI_CREATE);
2022 	} else {
2023 		vp = NULL;
2024 		error = VOP_NMKNOD(&nd->nl_nch, nd->nl_dvp,
2025 				   &vp, nd->nl_cred, &vattr);
2026 		if (error == 0)
2027 			vput(vp);
2028 	}
2029 	return (error);
2030 }
2031 
2032 /*
2033  * mknod_args(char *path, int mode, int dev)
2034  *
2035  * Create a special file.
2036  */
2037 int
2038 sys_mknod(struct mknod_args *uap)
2039 {
2040 	struct nlookupdata nd;
2041 	int error;
2042 
2043 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2044 	if (error == 0) {
2045 		error = kern_mknod(&nd, uap->mode,
2046 				   umajor(uap->dev), uminor(uap->dev));
2047 	}
2048 	nlookup_done(&nd);
2049 	return (error);
2050 }
2051 
2052 /*
2053  * mknodat_args(int fd, char *path, mode_t mode, dev_t dev)
2054  *
2055  * Create a special file.  The path is relative to the directory associated
2056  * with fd.
2057  */
2058 int
2059 sys_mknodat(struct mknodat_args *uap)
2060 {
2061 	struct nlookupdata nd;
2062 	struct file *fp;
2063 	int error;
2064 
2065 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2066 	if (error == 0) {
2067 		error = kern_mknod(&nd, uap->mode,
2068 				   umajor(uap->dev), uminor(uap->dev));
2069 	}
2070 	nlookup_done_at(&nd, fp);
2071 	return (error);
2072 }
2073 
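/*
 * kern_mkfifo() - common code for mkfifo() and mkfifoat().
 *
 * Creates a VFIFO node at the looked-up path, failing with EEXIST if
 * the target already exists.
 */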
2074 int
2075 kern_mkfifo(struct nlookupdata *nd, int mode)
2076 {
2077 	struct thread *td = curthread;
2078 	struct proc *p = td->td_proc;
2079 	struct vattr vattr;
2080 	struct vnode *vp;
2081 	int error;
2082 
2083 	bwillinode(1);
2084 
2085 	nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2086 	if ((error = nlookup(nd)) != 0)
2087 		return (error);
2088 	if (nd->nl_nch.ncp->nc_vp)
2089 		return (EEXIST);
2090 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2091 		return (error);
2092 
2093 	VATTR_NULL(&vattr);
2094 	vattr.va_type = VFIFO;
2095 	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
2096 	vp = NULL;
2097 	error = VOP_NMKNOD(&nd->nl_nch, nd->nl_dvp, &vp, nd->nl_cred, &vattr);
2098 	if (error == 0)
2099 		vput(vp);
2100 	return (error);
2101 }
2102 
2103 /*
2104  * mkfifo_args(char *path, int mode)
2105  *
2106  * Create a named pipe.
2107  */
2108 int
2109 sys_mkfifo(struct mkfifo_args *uap)
2110 {
2111 	struct nlookupdata nd;
2112 	int error;
2113 
2114 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2115 	if (error == 0)
2116 		error = kern_mkfifo(&nd, uap->mode);
2117 	nlookup_done(&nd);
2118 	return (error);
2119 }
2120 
2121 /*
2122  * mkfifoat_args(int fd, char *path, mode_t mode)
2123  *
2124  * Create a named pipe.  The path is relative to the directory associated
2125  * with fd.
2126  */
2127 int
2128 sys_mkfifoat(struct mkfifoat_args *uap)
2129 {
2130 	struct nlookupdata nd;
2131 	struct file *fp;
2132 	int error;
2133 
2134 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2135 	if (error == 0)
2136 		error = kern_mkfifo(&nd, uap->mode);
2137 	nlookup_done_at(&nd, fp);
2138 	return (error);
2139 }
2140 
2141 static int hardlink_check_uid = 0;
2142 SYSCTL_INT(_security, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
2143     &hardlink_check_uid, 0,
2144     "Unprivileged processes cannot create hard links to files owned by other "
2145     "users");
2146 static int hardlink_check_gid = 0;
2147 SYSCTL_INT(_security, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
2148     &hardlink_check_gid, 0,
2149     "Unprivileged processes cannot create hard links to files owned by other "
2150     "groups");
2151 
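/*
 * Illustrative note: both knobs default to 0 (disabled).  An
 * administrator would typically enable them from userland with
 * something like "sysctl security.hardlink_check_uid=1" (hypothetical
 * invocation shown only as an example).
 */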
2152 static int
2153 can_hardlink(struct vnode *vp, struct thread *td, struct ucred *cred)
2154 {
2155 	struct vattr va;
2156 	int error;
2157 
2158 	/*
2159 	 * Shortcut if disabled
2160 	 */
2161 	if (hardlink_check_uid == 0 && hardlink_check_gid == 0)
2162 		return (0);
2163 
2164 	/*
2165 	 * Privileged user can always hardlink
2166 	 */
2167 	if (priv_check_cred(cred, PRIV_VFS_LINK, 0) == 0)
2168 		return (0);
2169 
2170 	/*
2171 	 * Otherwise only if the originating file is owned by the
2172 	 * same user or group.  Note that any group is allowed if
2173 	 * the file is owned by the caller.
2174 	 */
2175 	error = VOP_GETATTR(vp, &va);
2176 	if (error != 0)
2177 		return (error);
2178 
2179 	if (hardlink_check_uid) {
2180 		if (cred->cr_uid != va.va_uid)
2181 			return (EPERM);
2182 	}
2183 
2184 	if (hardlink_check_gid) {
2185 		if (cred->cr_uid != va.va_uid && !groupmember(va.va_gid, cred))
2186 			return (EPERM);
2187 	}
2188 
2189 	return (0);
2190 }
2191 
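/*
 * kern_link() - common code for link().
 *
 * nd names the existing file, linknd the new hard link to create.
 */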
2192 int
2193 kern_link(struct nlookupdata *nd, struct nlookupdata *linknd)
2194 {
2195 	struct thread *td = curthread;
2196 	struct vnode *vp;
2197 	int error;
2198 
2199 	/*
2200 	 * Lookup the source and obtain a locked vnode.
2201 	 *
2202 	 * You may only hardlink a file which you have write permission
2203 	 * on or which you own.
2204 	 *
2205 	 * XXX relookup on vget failure / race ?
2206 	 */
2207 	bwillinode(1);
2208 	nd->nl_flags |= NLC_WRITE | NLC_OWN | NLC_HLINK;
2209 	if ((error = nlookup(nd)) != 0)
2210 		return (error);
2211 	vp = nd->nl_nch.ncp->nc_vp;
2212 	KKASSERT(vp != NULL);
2213 	if (vp->v_type == VDIR)
2214 		return (EPERM);		/* POSIX */
2215 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2216 		return (error);
2217 	if ((error = vget(vp, LK_EXCLUSIVE)) != 0)
2218 		return (error);
2219 
2220 	/*
2221 	 * Unlock the source so we can lookup the target without deadlocking
2222 	 * (XXX vp is locked already, possible other deadlock?).  The target
2223 	 * must not exist.
2224 	 */
2225 	KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);
2226 	nd->nl_flags &= ~NLC_NCPISLOCKED;
2227 	cache_unlock(&nd->nl_nch);
2228 	vn_unlock(vp);
2229 
2230 	linknd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2231 	if ((error = nlookup(linknd)) != 0) {
2232 		vrele(vp);
2233 		return (error);
2234 	}
2235 	if (linknd->nl_nch.ncp->nc_vp) {
2236 		vrele(vp);
2237 		return (EEXIST);
2238 	}
2239 	if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY)) != 0) {
2240 		vrele(vp);
2241 		return (error);
2242 	}
2243 
2244 	/*
2245 	 * Finally run the new API VOP.
2246 	 */
2247 	error = can_hardlink(vp, td, td->td_ucred);
2248 	if (error == 0) {
2249 		error = VOP_NLINK(&linknd->nl_nch, linknd->nl_dvp,
2250 				  vp, linknd->nl_cred);
2251 	}
2252 	vput(vp);
2253 	return (error);
2254 }
2255 
2256 /*
2257  * link_args(char *path, char *link)
2258  *
2259  * Make a hard file link.
2260  */
2261 int
2262 sys_link(struct link_args *uap)
2263 {
2264 	struct nlookupdata nd, linknd;
2265 	int error;
2266 
2267 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2268 	if (error == 0) {
2269 		error = nlookup_init(&linknd, uap->link, UIO_USERSPACE, 0);
2270 		if (error == 0)
2271 			error = kern_link(&nd, &linknd);
2272 		nlookup_done(&linknd);
2273 	}
2274 	nlookup_done(&nd);
2275 	return (error);
2276 }
2277 
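/*
 * kern_symlink() - common code for symlink() and symlinkat().
 *
 * path is the target text stored in the new symlink, mode its
 * permission bits.
 */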
2278 int
2279 kern_symlink(struct nlookupdata *nd, char *path, int mode)
2280 {
2281 	struct vattr vattr;
2282 	struct vnode *vp;
2283 	struct vnode *dvp;
2284 	int error;
2285 
2286 	bwillinode(1);
2287 	nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2288 	if ((error = nlookup(nd)) != 0)
2289 		return (error);
2290 	if (nd->nl_nch.ncp->nc_vp)
2291 		return (EEXIST);
2292 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2293 		return (error);
2294 	dvp = nd->nl_dvp;
2295 	VATTR_NULL(&vattr);
2296 	vattr.va_mode = mode;
2297 	error = VOP_NSYMLINK(&nd->nl_nch, dvp, &vp, nd->nl_cred, &vattr, path);
2298 	if (error == 0)
2299 		vput(vp);
2300 	return (error);
2301 }
2302 
2303 /*
2304  * symlink(char *path, char *link)
2305  *
2306  * Make a symbolic link.
2307  */
2308 int
2309 sys_symlink(struct symlink_args *uap)
2310 {
2311 	struct thread *td = curthread;
2312 	struct nlookupdata nd;
2313 	char *path;
2314 	int error;
2315 	int mode;
2316 
2317 	path = objcache_get(namei_oc, M_WAITOK);
2318 	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
2319 	if (error == 0) {
2320 		error = nlookup_init(&nd, uap->link, UIO_USERSPACE, 0);
2321 		if (error == 0) {
2322 			mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
2323 			error = kern_symlink(&nd, path, mode);
2324 		}
2325 		nlookup_done(&nd);
2326 	}
2327 	objcache_put(namei_oc, path);
2328 	return (error);
2329 }
2330 
2331 /*
2332  * symlinkat_args(char *path1, int fd, char *path2)
2333  *
2334  * Make a symbolic link.  The path2 argument is relative to the directory
2335  * associated with fd.
2336  */
2337 int
2338 sys_symlinkat(struct symlinkat_args *uap)
2339 {
2340 	struct thread *td = curthread;
2341 	struct nlookupdata nd;
2342 	struct file *fp;
2343 	char *path1;
2344 	int error;
2345 	int mode;
2346 
2347 	path1 = objcache_get(namei_oc, M_WAITOK);
2348 	error = copyinstr(uap->path1, path1, MAXPATHLEN, NULL);
2349 	if (error == 0) {
2350 		error = nlookup_init_at(&nd, &fp, uap->fd, uap->path2,
2351 		    UIO_USERSPACE, 0);
2352 		if (error == 0) {
2353 			mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
2354 			error = kern_symlink(&nd, path1, mode);
2355 		}
2356 		nlookup_done_at(&nd, fp);
2357 	}
2358 	objcache_put(namei_oc, path1);
2359 	return (error);
2360 }
2361 
2362 /*
2363  * undelete_args(char *path)
2364  *
2365  * Delete a whiteout from the filesystem.
2366  */
2367 int
2368 sys_undelete(struct undelete_args *uap)
2369 {
2370 	struct nlookupdata nd;
2371 	int error;
2372 
2373 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2374 	bwillinode(1);
2375 	nd.nl_flags |= NLC_DELETE | NLC_REFDVP;
2376 	if (error == 0)
2377 		error = nlookup(&nd);
2378 	if (error == 0)
2379 		error = ncp_writechk(&nd.nl_nch);
2380 	if (error == 0) {
2381 		error = VOP_NWHITEOUT(&nd.nl_nch, nd.nl_dvp, nd.nl_cred,
2382 				      NAMEI_DELETE);
2383 	}
2384 	nlookup_done(&nd);
2385 	return (error);
2386 }
2387 
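/*
 * kern_unlink() - common code for unlink() and unlinkat().
 */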
2388 int
2389 kern_unlink(struct nlookupdata *nd)
2390 {
2391 	int error;
2392 
2393 	bwillinode(1);
2394 	nd->nl_flags |= NLC_DELETE | NLC_REFDVP;
2395 	if ((error = nlookup(nd)) != 0)
2396 		return (error);
2397 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2398 		return (error);
2399 	error = VOP_NREMOVE(&nd->nl_nch, nd->nl_dvp, nd->nl_cred);
2400 	return (error);
2401 }
2402 
2403 /*
2404  * unlink_args(char *path)
2405  *
2406  * Delete a name from the filesystem.
2407  */
2408 int
2409 sys_unlink(struct unlink_args *uap)
2410 {
2411 	struct nlookupdata nd;
2412 	int error;
2413 
2414 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2415 	if (error == 0)
2416 		error = kern_unlink(&nd);
2417 	nlookup_done(&nd);
2418 	return (error);
2419 }
2420 
2421 
2422 /*
2423  * unlinkat_args(int fd, char *path, int flags)
2424  *
2425  * Delete the file or directory entry pointed to by fd/path.
2426  */
2427 int
2428 sys_unlinkat(struct unlinkat_args *uap)
2429 {
2430 	struct nlookupdata nd;
2431 	struct file *fp;
2432 	int error;
2433 
2434 	if (uap->flags & ~AT_REMOVEDIR)
2435 		return (EINVAL);
2436 
2437 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2438 	if (error == 0) {
2439 		if (uap->flags & AT_REMOVEDIR)
2440 			error = kern_rmdir(&nd);
2441 		else
2442 			error = kern_unlink(&nd);
2443 	}
2444 	nlookup_done_at(&nd, fp);
2445 	return (error);
2446 }
2447 
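/*
 * kern_lseek() - common code for lseek().
 *
 * whence is one of L_SET, L_INCR, or L_XTND (the kernel-side spellings
 * of SEEK_SET, SEEK_CUR, and SEEK_END); the resulting offset is
 * returned in *res.
 */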
2448 int
2449 kern_lseek(int fd, off_t offset, int whence, off_t *res)
2450 {
2451 	struct thread *td = curthread;
2452 	struct proc *p = td->td_proc;
2453 	struct file *fp;
2454 	struct vnode *vp;
2455 	struct vattr vattr;
2456 	off_t new_offset;
2457 	int error;
2458 
2459 	fp = holdfp(p->p_fd, fd, -1);
2460 	if (fp == NULL)
2461 		return (EBADF);
2462 	if (fp->f_type != DTYPE_VNODE) {
2463 		error = ESPIPE;
2464 		goto done;
2465 	}
2466 	vp = (struct vnode *)fp->f_data;
2467 
2468 	switch (whence) {
2469 	case L_INCR:
2470 		spin_lock(&fp->f_spin);
2471 		new_offset = fp->f_offset + offset;
2472 		error = 0;
2473 		break;
2474 	case L_XTND:
2475 		error = VOP_GETATTR(vp, &vattr);
2476 		spin_lock(&fp->f_spin);
2477 		new_offset = offset + vattr.va_size;
2478 		break;
2479 	case L_SET:
2480 		new_offset = offset;
2481 		error = 0;
2482 		spin_lock(&fp->f_spin);
2483 		break;
2484 	default:
2485 		new_offset = 0;
2486 		error = EINVAL;
2487 		spin_lock(&fp->f_spin);
2488 		break;
2489 	}
2490 
2491 	/*
2492 	 * Validate the seek position.  Negative offsets are not allowed
2493 	 * for regular files or directories.
2494 	 *
2495 	 * Normally we would also not want to allow negative offsets for
2496 	 * character and block-special devices.  However kvm addresses
2497 	 * on 64 bit architectures might appear to be negative and must
2498 	 * be allowed.
2499 	 */
2500 	if (error == 0) {
2501 		if (new_offset < 0 &&
2502 		    (vp->v_type == VREG || vp->v_type == VDIR)) {
2503 			error = EINVAL;
2504 		} else {
2505 			fp->f_offset = new_offset;
2506 		}
2507 	}
2508 	*res = fp->f_offset;
2509 	spin_unlock(&fp->f_spin);
2510 done:
2511 	fdrop(fp);
2512 	return (error);
2513 }
2514 
2515 /*
2516  * lseek_args(int fd, int pad, off_t offset, int whence)
2517  *
2518  * Reposition read/write file offset.
2519  */
2520 int
2521 sys_lseek(struct lseek_args *uap)
2522 {
2523 	int error;
2524 
2525 	error = kern_lseek(uap->fd, uap->offset, uap->whence,
2526 			   &uap->sysmsg_offset);
2527 
2528 	return (error);
2529 }
2530 
2531 /*
2532  * Check if current process can access given file.  amode is a bitmask of *_OK
2533  * access bits.  flags is a bitmask of AT_* flags.
2534  */
2535 int
2536 kern_access(struct nlookupdata *nd, int amode, int flags)
2537 {
2538 	struct vnode *vp;
2539 	int error, mode;
2540 
2541 	if (flags & ~AT_EACCESS)
2542 		return (EINVAL);
2543 	if ((error = nlookup(nd)) != 0)
2544 		return (error);
2545 retry:
2546 	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_EXCLUSIVE, &vp);
2547 	if (error)
2548 		return (error);
2549 
2550 	/* amode == 0 means only check for existence. */
2551 	if (amode) {
2552 		mode = 0;
2553 		if (amode & R_OK)
2554 			mode |= VREAD;
2555 		if (amode & W_OK)
2556 			mode |= VWRITE;
2557 		if (amode & X_OK)
2558 			mode |= VEXEC;
2559 		if ((mode & VWRITE) == 0 ||
2560 		    (error = vn_writechk(vp, &nd->nl_nch)) == 0)
2561 			error = VOP_ACCESS_FLAGS(vp, mode, flags, nd->nl_cred);
2562 
2563 		/*
2564 		 * If the file handle is stale we have to re-resolve the
2565 		 * entry.  This is a hack at the moment.
2566 		 */
2567 		if (error == ESTALE) {
2568 			vput(vp);
2569 			cache_setunresolved(&nd->nl_nch);
2570 			error = cache_resolve(&nd->nl_nch, nd->nl_cred);
2571 			if (error == 0) {
2572 				vp = NULL;
2573 				goto retry;
2574 			}
2575 			return(error);
2576 		}
2577 	}
2578 	vput(vp);
2579 	return (error);
2580 }
2581 
2582 /*
2583  * access_args(char *path, int flags)
2584  *
2585  * Check access permissions.
2586  */
2587 int
2588 sys_access(struct access_args *uap)
2589 {
2590 	struct nlookupdata nd;
2591 	int error;
2592 
2593 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2594 	if (error == 0)
2595 		error = kern_access(&nd, uap->flags, 0);
2596 	nlookup_done(&nd);
2597 	return (error);
2598 }
2599 
2600 
2601 /*
2602  * faccessat_args(int fd, char *path, int amode, int flags)
2603  *
2604  * Check access permissions.
2605  */
2606 int
2607 sys_faccessat(struct faccessat_args *uap)
2608 {
2609 	struct nlookupdata nd;
2610 	struct file *fp;
2611 	int error;
2612 
2613 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE,
2614 				NLC_FOLLOW);
2615 	if (error == 0)
2616 		error = kern_access(&nd, uap->amode, uap->flags);
2617 	nlookup_done_at(&nd, fp);
2618 	return (error);
2619 }
2620 
2621 
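/*
 * kern_stat() - common code for stat(), lstat(), and fstatat().
 *
 * Fills in *st from the looked-up vnode, re-resolving the namecache
 * entry and retrying if the file handle turns out to be stale.
 */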
2622 int
2623 kern_stat(struct nlookupdata *nd, struct stat *st)
2624 {
2625 	int error;
2626 	struct vnode *vp;
2627 
2628 	if ((error = nlookup(nd)) != 0)
2629 		return (error);
2630 again:
2631 	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
2632 		return (ENOENT);
2633 
2634 	if ((error = vget(vp, LK_SHARED)) != 0)
2635 		return (error);
2636 	error = vn_stat(vp, st, nd->nl_cred);
2637 
2638 	/*
2639 	 * If the file handle is stale we have to re-resolve the entry.  This
2640 	 * is a hack at the moment.
2641 	 */
2642 	if (error == ESTALE) {
2643 		vput(vp);
2644 		cache_setunresolved(&nd->nl_nch);
2645 		error = cache_resolve(&nd->nl_nch, nd->nl_cred);
2646 		if (error == 0)
2647 			goto again;
2648 	} else {
2649 		vput(vp);
2650 	}
2651 	return (error);
2652 }
2653 
2654 /*
2655  * stat_args(char *path, struct stat *ub)
2656  *
2657  * Get file status; this version follows links.
2658  */
2659 int
2660 sys_stat(struct stat_args *uap)
2661 {
2662 	struct nlookupdata nd;
2663 	struct stat st;
2664 	int error;
2665 
2666 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2667 	if (error == 0) {
2668 		error = kern_stat(&nd, &st);
2669 		if (error == 0)
2670 			error = copyout(&st, uap->ub, sizeof(*uap->ub));
2671 	}
2672 	nlookup_done(&nd);
2673 	return (error);
2674 }
2675 
2676 /*
2677  * lstat_args(char *path, struct stat *ub)
2678  *
2679  * Get file status; this version does not follow links.
2680  */
2681 int
2682 sys_lstat(struct lstat_args *uap)
2683 {
2684 	struct nlookupdata nd;
2685 	struct stat st;
2686 	int error;
2687 
2688 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2689 	if (error == 0) {
2690 		error = kern_stat(&nd, &st);
2691 		if (error == 0)
2692 			error = copyout(&st, uap->ub, sizeof(*uap->ub));
2693 	}
2694 	nlookup_done(&nd);
2695 	return (error);
2696 }
2697 
2698 /*
2699  * fstatat_args(int fd, char *path, struct stat *sb, int flags)
2700  *
2701  * Get status of file pointed to by fd/path.
2702  */
2703 int
2704 sys_fstatat(struct fstatat_args *uap)
2705 {
2706 	struct nlookupdata nd;
2707 	struct stat st;
2708 	int error;
2709 	int flags;
2710 	struct file *fp;
2711 
2712 	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
2713 		return (EINVAL);
2714 
2715 	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
2716 
2717 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
2718 				UIO_USERSPACE, flags);
2719 	if (error == 0) {
2720 		error = kern_stat(&nd, &st);
2721 		if (error == 0)
2722 			error = copyout(&st, uap->sb, sizeof(*uap->sb));
2723 	}
2724 	nlookup_done_at(&nd, fp);
2725 	return (error);
2726 }
2727 
2728 /*
2729  * pathconf_args(char *path, int name)
2730  *
2731  * Get configurable pathname variables.
2732  */
2733 int
2734 sys_pathconf(struct pathconf_args *uap)
2735 {
2736 	struct nlookupdata nd;
2737 	struct vnode *vp;
2738 	int error;
2739 
2740 	vp = NULL;
2741 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2742 	if (error == 0)
2743 		error = nlookup(&nd);
2744 	if (error == 0)
2745 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
2746 	nlookup_done(&nd);
2747 	if (error == 0) {
2748 		error = VOP_PATHCONF(vp, uap->name, &uap->sysmsg_reg);
2749 		vput(vp);
2750 	}
2751 	return (error);
2752 }
2753 
2754 /*
2755  * XXX: daver
2756  * kern_readlink isn't properly split yet.  There is a copyin buried
2757  * in VOP_READLINK().
2758  */
2759 int
2760 kern_readlink(struct nlookupdata *nd, char *buf, int count, int *res)
2761 {
2762 	struct thread *td = curthread;
2763 	struct vnode *vp;
2764 	struct iovec aiov;
2765 	struct uio auio;
2766 	int error;
2767 
2768 	if ((error = nlookup(nd)) != 0)
2769 		return (error);
2770 	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_EXCLUSIVE, &vp);
2771 	if (error)
2772 		return (error);
2773 	if (vp->v_type != VLNK) {
2774 		error = EINVAL;
2775 	} else {
2776 		aiov.iov_base = buf;
2777 		aiov.iov_len = count;
2778 		auio.uio_iov = &aiov;
2779 		auio.uio_iovcnt = 1;
2780 		auio.uio_offset = 0;
2781 		auio.uio_rw = UIO_READ;
2782 		auio.uio_segflg = UIO_USERSPACE;
2783 		auio.uio_td = td;
2784 		auio.uio_resid = count;
2785 		error = VOP_READLINK(vp, &auio, td->td_ucred);
2786 	}
2787 	vput(vp);
2788 	*res = count - auio.uio_resid;
2789 	return (error);
2790 }
2791 
2792 /*
2793  * readlink_args(char *path, char *buf, int count)
2794  *
2795  * Return target name of a symbolic link.
2796  */
2797 int
2798 sys_readlink(struct readlink_args *uap)
2799 {
2800 	struct nlookupdata nd;
2801 	int error;
2802 
2803 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2804 	if (error == 0) {
2805 		error = kern_readlink(&nd, uap->buf, uap->count,
2806 					&uap->sysmsg_result);
2807 	}
2808 	nlookup_done(&nd);
2809 	return (error);
2810 }
2811 
2812 /*
2813  * readlinkat_args(int fd, char *path, char *buf, size_t bufsize)
2814  *
2815  * Return target name of a symbolic link.  The path is relative to the
2816  * directory associated with fd.
2817  */
2818 int
2819 sys_readlinkat(struct readlinkat_args *uap)
2820 {
2821 	struct nlookupdata nd;
2822 	struct file *fp;
2823 	int error;
2824 
2825 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2826 	if (error == 0) {
2827 		error = kern_readlink(&nd, uap->buf, uap->bufsize,
2828 					&uap->sysmsg_result);
2829 	}
2830 	nlookup_done_at(&nd, fp);
2831 	return (error);
2832 }
2833 
2834 static int
2835 setfflags(struct vnode *vp, int flags)
2836 {
2837 	struct thread *td = curthread;
2838 	int error;
2839 	struct vattr vattr;
2840 
2841 	/*
2842 	 * Prevent non-root users from setting flags on devices.  When
2843 	 * a device is reused, users can retain ownership of the device
2844 	 * if they are allowed to set flags, and programs assume that
2845 	 * chown can't fail when done as root.
2846 	 */
2847 	if ((vp->v_type == VCHR || vp->v_type == VBLK) &&
2848 	    ((error = priv_check_cred(td->td_ucred, PRIV_VFS_CHFLAGS_DEV, 0)) != 0))
2849 		return (error);
2850 
2851 	/*
2852 	 * note: vget is required for any operation that might mod the vnode
2853 	 * so VINACTIVE is properly cleared.
2854 	 */
2855 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
2856 		VATTR_NULL(&vattr);
2857 		vattr.va_flags = flags;
2858 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2859 		vput(vp);
2860 	}
2861 	return (error);
2862 }
2863 
2864 /*
2865  * chflags(char *path, int flags)
2866  *
2867  * Change flags of a file given a path name.
2868  */
2869 int
2870 sys_chflags(struct chflags_args *uap)
2871 {
2872 	struct nlookupdata nd;
2873 	struct vnode *vp;
2874 	int error;
2875 
2876 	vp = NULL;
2877 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2878 	if (error == 0)
2879 		error = nlookup(&nd);
2880 	if (error == 0)
2881 		error = ncp_writechk(&nd.nl_nch);
2882 	if (error == 0)
2883 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
2884 	nlookup_done(&nd);
2885 	if (error == 0) {
2886 		error = setfflags(vp, uap->flags);
2887 		vrele(vp);
2888 	}
2889 	return (error);
2890 }
2891 
2892 /*
2893  * lchflags(char *path, int flags)
2894  *
2895  * Change flags of a file given a path name, but don't follow symlinks.
2896  */
2897 int
2898 sys_lchflags(struct lchflags_args *uap)
2899 {
2900 	struct nlookupdata nd;
2901 	struct vnode *vp;
2902 	int error;
2903 
2904 	vp = NULL;
2905 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2906 	if (error == 0)
2907 		error = nlookup(&nd);
2908 	if (error == 0)
2909 		error = ncp_writechk(&nd.nl_nch);
2910 	if (error == 0)
2911 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
2912 	nlookup_done(&nd);
2913 	if (error == 0) {
2914 		error = setfflags(vp, uap->flags);
2915 		vrele(vp);
2916 	}
2917 	return (error);
2918 }
2919 
2920 /*
2921  * fchflags_args(int fd, int flags)
2922  *
2923  * Change flags of a file given a file descriptor.
2924  */
2925 int
2926 sys_fchflags(struct fchflags_args *uap)
2927 {
2928 	struct thread *td = curthread;
2929 	struct proc *p = td->td_proc;
2930 	struct file *fp;
2931 	int error;
2932 
2933 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
2934 		return (error);
2935 	if (fp->f_nchandle.ncp)
2936 		error = ncp_writechk(&fp->f_nchandle);
2937 	if (error == 0)
2938 		error = setfflags((struct vnode *) fp->f_data, uap->flags);
2939 	fdrop(fp);
2940 	return (error);
2941 }
2942 
2943 static int
2944 setfmode(struct vnode *vp, int mode)
2945 {
2946 	struct thread *td = curthread;
2947 	int error;
2948 	struct vattr vattr;
2949 
2950 	/*
2951 	 * note: vget is required for any operation that might mod the vnode
2952 	 * so VINACTIVE is properly cleared.
2953 	 */
2954 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
2955 		VATTR_NULL(&vattr);
2956 		vattr.va_mode = mode & ALLPERMS;
2957 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2958 		vput(vp);
2959 	}
2960 	return error;
2961 }
2962 
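/*
 * kern_chmod() - common code for chmod(), lchmod(), and fchmodat().
 */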
2963 int
2964 kern_chmod(struct nlookupdata *nd, int mode)
2965 {
2966 	struct vnode *vp;
2967 	int error;
2968 
2969 	if ((error = nlookup(nd)) != 0)
2970 		return (error);
2971 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
2972 		return (error);
2973 	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
2974 		error = setfmode(vp, mode);
2975 	vrele(vp);
2976 	return (error);
2977 }
2978 
2979 /*
2980  * chmod_args(char *path, int mode)
2981  *
2982  * Change mode of a file given path name.
2983  */
2984 int
2985 sys_chmod(struct chmod_args *uap)
2986 {
2987 	struct nlookupdata nd;
2988 	int error;
2989 
2990 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2991 	if (error == 0)
2992 		error = kern_chmod(&nd, uap->mode);
2993 	nlookup_done(&nd);
2994 	return (error);
2995 }
2996 
2997 /*
2998  * lchmod_args(char *path, int mode)
2999  *
3000  * Change mode of a file given path name (don't follow links.)
3001  */
3002 int
3003 sys_lchmod(struct lchmod_args *uap)
3004 {
3005 	struct nlookupdata nd;
3006 	int error;
3007 
3008 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3009 	if (error == 0)
3010 		error = kern_chmod(&nd, uap->mode);
3011 	nlookup_done(&nd);
3012 	return (error);
3013 }
3014 
3015 /*
3016  * fchmod_args(int fd, int mode)
3017  *
3018  * Change mode of a file given a file descriptor.
3019  */
3020 int
3021 sys_fchmod(struct fchmod_args *uap)
3022 {
3023 	struct thread *td = curthread;
3024 	struct proc *p = td->td_proc;
3025 	struct file *fp;
3026 	int error;
3027 
3028 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
3029 		return (error);
3030 	if (fp->f_nchandle.ncp)
3031 		error = ncp_writechk(&fp->f_nchandle);
3032 	if (error == 0)
3033 		error = setfmode((struct vnode *)fp->f_data, uap->mode);
3034 	fdrop(fp);
3035 	return (error);
3036 }
3037 
3038 /*
3039  * fchmodat_args(int fd, char *path, mode_t mode, int flags)
3040  *
3041  * Change mode of a file pointed to by fd/path.
3042  */
3043 int
3044 sys_fchmodat(struct fchmodat_args *uap)
3045 {
3046 	struct nlookupdata nd;
3047 	struct file *fp;
3048 	int error;
3049 	int flags;
3050 
3051 	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
3052 		return (EINVAL);
3053 	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3054 
3055 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3056 				UIO_USERSPACE, flags);
3057 	if (error == 0)
3058 		error = kern_chmod(&nd, uap->mode);
3059 	nlookup_done_at(&nd, fp);
3060 	return (error);
3061 }
3062 
3063 static int
3064 setfown(struct vnode *vp, uid_t uid, gid_t gid)
3065 {
3066 	struct thread *td = curthread;
3067 	int error;
3068 	struct vattr vattr;
3069 
3070 	/*
3071 	 * note: vget is required for any operation that might mod the vnode
3072 	 * so VINACTIVE is properly cleared.
3073 	 */
3074 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
3075 		VATTR_NULL(&vattr);
3076 		vattr.va_uid = uid;
3077 		vattr.va_gid = gid;
3078 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3079 		vput(vp);
3080 	}
3081 	return error;
3082 }
3083 
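/*
 * kern_chown() - common code for chown(), lchown(), and fchownat().
 */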
3084 int
3085 kern_chown(struct nlookupdata *nd, int uid, int gid)
3086 {
3087 	struct vnode *vp;
3088 	int error;
3089 
3090 	if ((error = nlookup(nd)) != 0)
3091 		return (error);
3092 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3093 		return (error);
3094 	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
3095 		error = setfown(vp, uid, gid);
3096 	vrele(vp);
3097 	return (error);
3098 }
3099 
3100 /*
3101  * chown(char *path, int uid, int gid)
3102  *
3103  * Set ownership given a path name.
3104  */
3105 int
3106 sys_chown(struct chown_args *uap)
3107 {
3108 	struct nlookupdata nd;
3109 	int error;
3110 
3111 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3112 	if (error == 0)
3113 		error = kern_chown(&nd, uap->uid, uap->gid);
3114 	nlookup_done(&nd);
3115 	return (error);
3116 }
3117 
3118 /*
3119  * lchown_args(char *path, int uid, int gid)
3120  *
3121  * Set ownership given a path name, do not cross symlinks.
3122  */
3123 int
3124 sys_lchown(struct lchown_args *uap)
3125 {
3126 	struct nlookupdata nd;
3127 	int error;
3128 
3129 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3130 	if (error == 0)
3131 		error = kern_chown(&nd, uap->uid, uap->gid);
3132 	nlookup_done(&nd);
3133 	return (error);
3134 }
3135 
3136 /*
3137  * fchown_args(int fd, int uid, int gid)
3138  *
3139  * Set ownership given a file descriptor.
3140  */
3141 int
3142 sys_fchown(struct fchown_args *uap)
3143 {
3144 	struct thread *td = curthread;
3145 	struct proc *p = td->td_proc;
3146 	struct file *fp;
3147 	int error;
3148 
3149 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
3150 		return (error);
3151 	if (fp->f_nchandle.ncp)
3152 		error = ncp_writechk(&fp->f_nchandle);
3153 	if (error == 0)
3154 		error = setfown((struct vnode *)fp->f_data, uap->uid, uap->gid);
3155 	fdrop(fp);
3156 	return (error);
3157 }
3158 
3159 /*
3160  * fchownat(int fd, char *path, int uid, int gid, int flags)
3161  *
3162  * Set ownership of file pointed to by fd/path.
3163  */
3164 int
3165 sys_fchownat(struct fchownat_args *uap)
3166 {
3167 	struct nlookupdata nd;
3168 	struct file *fp;
3169 	int error;
3170 	int flags;
3171 
3172 	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
3173 		return (EINVAL);
3174 	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3175 
3176 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3177 				UIO_USERSPACE, flags);
3178 	if (error == 0)
3179 		error = kern_chown(&nd, uap->uid, uap->gid);
3180 	nlookup_done_at(&nd, fp);
3181 	return (error);
3182 }
3183 
3184 
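/*
 * getutimes() - convert a utimes()-style timeval pair to timespecs.
 *
 * A NULL tvp means "use the current time for both the access and
 * modification time".
 */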
3185 static int
3186 getutimes(const struct timeval *tvp, struct timespec *tsp)
3187 {
3188 	struct timeval tv[2];
3189 
3190 	if (tvp == NULL) {
3191 		microtime(&tv[0]);
3192 		TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
3193 		tsp[1] = tsp[0];
3194 	} else {
3195 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3196 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3197 	}
3198 	return 0;
3199 }
3200 
3201 static int
3202 setutimes(struct vnode *vp, struct vattr *vattr,
3203 	  const struct timespec *ts, int nullflag)
3204 {
3205 	struct thread *td = curthread;
3206 	int error;
3207 
3208 	VATTR_NULL(vattr);
3209 	vattr->va_atime = ts[0];
3210 	vattr->va_mtime = ts[1];
3211 	if (nullflag)
3212 		vattr->va_vaflags |= VA_UTIMES_NULL;
3213 	error = VOP_SETATTR(vp, vattr, td->td_ucred);
3214 
3215 	return error;
3216 }
3217 
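/*
 * kern_utimes() - common code for utimes() and lutimes().
 *
 * A NULL tptr sets both times to the current time and marks the
 * change with VA_UTIMES_NULL.
 */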
3218 int
3219 kern_utimes(struct nlookupdata *nd, struct timeval *tptr)
3220 {
3221 	struct timespec ts[2];
3222 	struct vnode *vp;
3223 	struct vattr vattr;
3224 	int error;
3225 
3226 	if ((error = getutimes(tptr, ts)) != 0)
3227 		return (error);
3228 
3229 	/*
3230 	 * NOTE: utimes() succeeds for the owner even if the file
3231 	 * is not user-writable.
3232 	 */
3233 	nd->nl_flags |= NLC_OWN | NLC_WRITE;
3234 
3235 	if ((error = nlookup(nd)) != 0)
3236 		return (error);
3237 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3238 		return (error);
3239 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3240 		return (error);
3241 
3242 	/*
3243 	 * note: vget is required for any operation that might mod the vnode
3244 	 * so VINACTIVE is properly cleared.
3245 	 */
3246 	if ((error = vn_writechk(vp, &nd->nl_nch)) == 0) {
3247 		error = vget(vp, LK_EXCLUSIVE);
3248 		if (error == 0) {
3249 			error = setutimes(vp, &vattr, ts, (tptr == NULL));
3250 			vput(vp);
3251 		}
3252 	}
3253 	vrele(vp);
3254 	return (error);
3255 }
3256 
3257 /*
3258  * utimes_args(char *path, struct timeval *tptr)
3259  *
3260  * Set the access and modification times of a file.
3261  */
3262 int
3263 sys_utimes(struct utimes_args *uap)
3264 {
3265 	struct timeval tv[2];
3266 	struct nlookupdata nd;
3267 	int error;
3268 
3269 	if (uap->tptr) {
3270  		error = copyin(uap->tptr, tv, sizeof(tv));
3271 		if (error)
3272 			return (error);
3273 	}
3274 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3275 	if (error == 0)
3276 		error = kern_utimes(&nd, uap->tptr ? tv : NULL);
3277 	nlookup_done(&nd);
3278 	return (error);
3279 }
3280 
3281 /*
3282  * lutimes_args(char *path, struct timeval *tptr)
3283  *
3284  * Set the access and modification times of a file.
3285  */
3286 int
3287 sys_lutimes(struct lutimes_args *uap)
3288 {
3289 	struct timeval tv[2];
3290 	struct nlookupdata nd;
3291 	int error;
3292 
3293 	if (uap->tptr) {
3294 		error = copyin(uap->tptr, tv, sizeof(tv));
3295 		if (error)
3296 			return (error);
3297 	}
3298 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3299 	if (error == 0)
3300 		error = kern_utimes(&nd, uap->tptr ? tv : NULL);
3301 	nlookup_done(&nd);
3302 	return (error);
3303 }
3304 
3305 /*
3306  * Set utimes on a file descriptor.  The creds used to open the
3307  * file are used to determine whether the operation is allowed
3308  * or not.
3309  */
3310 int
3311 kern_futimes(int fd, struct timeval *tptr)
3312 {
3313 	struct thread *td = curthread;
3314 	struct proc *p = td->td_proc;
3315 	struct timespec ts[2];
3316 	struct file *fp;
3317 	struct vnode *vp;
3318 	struct vattr vattr;
3319 	int error;
3320 
3321 	error = getutimes(tptr, ts);
3322 	if (error)
3323 		return (error);
3324 	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
3325 		return (error);
3326 	if (fp->f_nchandle.ncp)
3327 		error = ncp_writechk(&fp->f_nchandle);
3328 	if (error == 0) {
3329 		vp = fp->f_data;
3330 		error = vget(vp, LK_EXCLUSIVE);
3331 		if (error == 0) {
3332 			error = VOP_GETATTR(vp, &vattr);
3333 			if (error == 0) {
3334 				error = naccess_va(&vattr, NLC_OWN | NLC_WRITE,
3335 						   fp->f_cred);
3336 			}
3337 			if (error == 0) {
3338 				error = setutimes(vp, &vattr, ts,
3339 						  (tptr == NULL));
3340 			}
3341 			vput(vp);
3342 		}
3343 	}
3344 	fdrop(fp);
3345 	return (error);
3346 }
3347 
3348 /*
3349  * futimes_args(int fd, struct timeval *tptr)
3350  *
3351  * Set the access and modification times of a file.
3352  */
3353 int
3354 sys_futimes(struct futimes_args *uap)
3355 {
3356 	struct timeval tv[2];
3357 	int error;
3358 
3359 	if (uap->tptr) {
3360 		error = copyin(uap->tptr, tv, sizeof(tv));
3361 		if (error)
3362 			return (error);
3363 	}
3364 	error = kern_futimes(uap->fd, uap->tptr ? tv : NULL);
3365 
3366 	return (error);
3367 }
3368 
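/*
 * kern_truncate() - common code for truncate().
 *
 * Negative lengths are rejected with EINVAL and directories with
 * EISDIR.
 */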
3369 int
3370 kern_truncate(struct nlookupdata *nd, off_t length)
3371 {
3372 	struct vnode *vp;
3373 	struct vattr vattr;
3374 	int error;
3375 
3376 	if (length < 0)
3377 		return(EINVAL);
3378 	nd->nl_flags |= NLC_WRITE | NLC_TRUNCATE;
3379 	if ((error = nlookup(nd)) != 0)
3380 		return (error);
3381 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3382 		return (error);
3383 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3384 		return (error);
3385 	if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY)) != 0) {
3386 		vrele(vp);
3387 		return (error);
3388 	}
3389 	if (vp->v_type == VDIR) {
3390 		error = EISDIR;
3391 	} else if ((error = vn_writechk(vp, &nd->nl_nch)) == 0) {
3392 		VATTR_NULL(&vattr);
3393 		vattr.va_size = length;
3394 		error = VOP_SETATTR(vp, &vattr, nd->nl_cred);
3395 	}
3396 	vput(vp);
3397 	return (error);
3398 }
3399 
3400 /*
3401  * truncate(char *path, int pad, off_t length)
3402  *
3403  * Truncate a file given its path name.
3404  */
3405 int
3406 sys_truncate(struct truncate_args *uap)
3407 {
3408 	struct nlookupdata nd;
3409 	int error;
3410 
3411 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3412 	if (error == 0)
3413 		error = kern_truncate(&nd, uap->length);
3414 	nlookup_done(&nd);
3415 	return error;
3416 }
3417 
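/*
 * kern_ftruncate() - common code for ftruncate().
 *
 * The descriptor must be open for writing and the inode must not be
 * marked append-only.
 */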
3418 int
3419 kern_ftruncate(int fd, off_t length)
3420 {
3421 	struct thread *td = curthread;
3422 	struct proc *p = td->td_proc;
3423 	struct vattr vattr;
3424 	struct vnode *vp;
3425 	struct file *fp;
3426 	int error;
3427 
3428 	if (length < 0)
3429 		return(EINVAL);
3430 	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
3431 		return (error);
3432 	if (fp->f_nchandle.ncp) {
3433 		error = ncp_writechk(&fp->f_nchandle);
3434 		if (error)
3435 			goto done;
3436 	}
3437 	if ((fp->f_flag & FWRITE) == 0) {
3438 		error = EINVAL;
3439 		goto done;
3440 	}
3441 	if (fp->f_flag & FAPPENDONLY) {	/* inode was set sappnd/uappnd (append-only) */
3442 		error = EINVAL;
3443 		goto done;
3444 	}
3445 	vp = (struct vnode *)fp->f_data;
3446 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3447 	if (vp->v_type == VDIR) {
3448 		error = EISDIR;
3449 	} else if ((error = vn_writechk(vp, NULL)) == 0) {
3450 		VATTR_NULL(&vattr);
3451 		vattr.va_size = length;
3452 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
3453 	}
3454 	vn_unlock(vp);
3455 done:
3456 	fdrop(fp);
3457 	return (error);
3458 }
3459 
3460 /*
3461  * ftruncate_args(int fd, int pad, off_t length)
3462  *
3463  * Truncate a file given a file descriptor.
3464  */
3465 int
3466 sys_ftruncate(struct ftruncate_args *uap)
3467 {
3468 	int error;
3469 
3470 	error = kern_ftruncate(uap->fd, uap->length);
3471 
3472 	return (error);
3473 }
3474 
3475 /*
3476  * fsync(int fd)
3477  *
3478  * Sync an open file.
3479  */
3480 int
3481 sys_fsync(struct fsync_args *uap)
3482 {
3483 	struct thread *td = curthread;
3484 	struct proc *p = td->td_proc;
3485 	struct vnode *vp;
3486 	struct file *fp;
3487 	vm_object_t obj;
3488 	int error;
3489 
3490 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
3491 		return (error);
3492 	vp = (struct vnode *)fp->f_data;
3493 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3494 	if ((obj = vp->v_object) != NULL)
3495 		vm_object_page_clean(obj, 0, 0, 0);
3496 	error = VOP_FSYNC(vp, MNT_WAIT, VOP_FSYNC_SYSCALL);
3497 	if (error == 0 && vp->v_mount)
3498 		error = buf_fsync(vp);
3499 	vn_unlock(vp);
3500 	fdrop(fp);
3501 
3502 	return (error);
3503 }
3504 
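/*
 * kern_rename() - common code for rename() and renameat().
 *
 * Source and target must reside on the same filesystem.  If both names
 * turn out to be hardlinks to the same file the source name is simply
 * removed instead of renamed.
 */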
3505 int
3506 kern_rename(struct nlookupdata *fromnd, struct nlookupdata *tond)
3507 {
3508 	struct nchandle fnchd;
3509 	struct nchandle tnchd;
3510 	struct namecache *ncp;
3511 	struct vnode *fdvp;
3512 	struct vnode *tdvp;
3513 	struct mount *mp;
3514 	int error;
3515 
3516 	bwillinode(1);
3517 	fromnd->nl_flags |= NLC_REFDVP | NLC_RENAME_SRC;
3518 	if ((error = nlookup(fromnd)) != 0)
3519 		return (error);
3520 	if ((fnchd.ncp = fromnd->nl_nch.ncp->nc_parent) == NULL)
3521 		return (ENOENT);
3522 	fnchd.mount = fromnd->nl_nch.mount;
3523 	cache_hold(&fnchd);
3524 
3525 	/*
3526 	 * unlock the source nch so we can lookup the target nch without
3527 	 * deadlocking.  The target may or may not exist so we do not check
3528 	 * for a target vp like kern_mkdir() and other creation functions do.
3529 	 *
3530 	 * The source and target directories are ref'd and rechecked after
3531 	 * everything is relocked to determine if the source or target file
3532 	 * has been renamed.
3533 	 */
3534 	KKASSERT(fromnd->nl_flags & NLC_NCPISLOCKED);
3535 	fromnd->nl_flags &= ~NLC_NCPISLOCKED;
3536 	cache_unlock(&fromnd->nl_nch);
3537 
3538 	tond->nl_flags |= NLC_RENAME_DST | NLC_REFDVP;
3539 	if ((error = nlookup(tond)) != 0) {
3540 		cache_drop(&fnchd);
3541 		return (error);
3542 	}
3543 	if ((tnchd.ncp = tond->nl_nch.ncp->nc_parent) == NULL) {
3544 		cache_drop(&fnchd);
3545 		return (ENOENT);
3546 	}
3547 	tnchd.mount = tond->nl_nch.mount;
3548 	cache_hold(&tnchd);
3549 
3550 	/*
3551 	 * If the source and target are the same there is nothing to do
3552 	 */
3553 	if (fromnd->nl_nch.ncp == tond->nl_nch.ncp) {
3554 		cache_drop(&fnchd);
3555 		cache_drop(&tnchd);
3556 		return (0);
3557 	}
3558 
3559 	/*
3560 	 * Mount points cannot be renamed or overwritten
3561 	 */
3562 	if ((fromnd->nl_nch.ncp->nc_flag | tond->nl_nch.ncp->nc_flag) &
3563 	    NCF_ISMOUNTPT
3564 	) {
3565 		cache_drop(&fnchd);
3566 		cache_drop(&tnchd);
3567 		return (EINVAL);
3568 	}
3569 
3570 	/*
3571 	 * Relock the source ncp.  cache_relock() will deal with any
3572 	 * deadlocks against the already-locked tond and will also
3573 	 * make sure both are resolved.
3574 	 *
3575 	 * NOTE AFTER RELOCKING: The source or target ncp may have become
3576 	 * invalid while they were unlocked, nc_vp and nc_mount could
3577 	 * be NULL.
3578 	 */
3579 	cache_relock(&fromnd->nl_nch, fromnd->nl_cred,
3580 		     &tond->nl_nch, tond->nl_cred);
3581 	fromnd->nl_flags |= NLC_NCPISLOCKED;
3582 
3583 	/*
3584 	 * make sure the parent directories linkages are the same
3585 	 */
3586 	if (fnchd.ncp != fromnd->nl_nch.ncp->nc_parent ||
3587 	    tnchd.ncp != tond->nl_nch.ncp->nc_parent) {
3588 		cache_drop(&fnchd);
3589 		cache_drop(&tnchd);
3590 		return (ENOENT);
3591 	}
3592 
3593 	/*
3594 	 * Both the source and target must be within the same filesystem and
3595 	 * in the same filesystem as their parent directories within the
3596 	 * namecache topology.
3597 	 *
3598 	 * NOTE: fromnd's nc_mount or nc_vp could be NULL.
3599 	 */
3600 	mp = fnchd.mount;
3601 	if (mp != tnchd.mount || mp != fromnd->nl_nch.mount ||
3602 	    mp != tond->nl_nch.mount) {
3603 		cache_drop(&fnchd);
3604 		cache_drop(&tnchd);
3605 		return (EXDEV);
3606 	}
3607 
3608 	/*
3609 	 * Make sure the mount point is writable
3610 	 */
3611 	if ((error = ncp_writechk(&tond->nl_nch)) != 0) {
3612 		cache_drop(&fnchd);
3613 		cache_drop(&tnchd);
3614 		return (error);
3615 	}
3616 
3617 	/*
3618 	 * If the target exists and either the source or target is a directory,
3619 	 * then both must be directories.
3620 	 *
3621 	 * Due to relocking of the source, fromnd->nl_nch.ncp->nc_vp might
3622 	 * have become NULL.
3623 	 */
3624 	if (tond->nl_nch.ncp->nc_vp) {
3625 		if (fromnd->nl_nch.ncp->nc_vp == NULL) {
3626 			error = ENOENT;
3627 		} else if (fromnd->nl_nch.ncp->nc_vp->v_type == VDIR) {
3628 			if (tond->nl_nch.ncp->nc_vp->v_type != VDIR)
3629 				error = ENOTDIR;
3630 		} else if (tond->nl_nch.ncp->nc_vp->v_type == VDIR) {
3631 			error = EISDIR;
3632 		}
3633 	}
3634 
3635 	/*
3636 	 * You cannot rename a source into itself or a subdirectory of itself.
3637 	 * We check this by traversing the target directory upwards looking
3638 	 * for a match against the source.
3639 	 *
3640 	 * XXX MPSAFE
3641 	 */
3642 	if (error == 0) {
3643 		for (ncp = tnchd.ncp; ncp; ncp = ncp->nc_parent) {
3644 			if (fromnd->nl_nch.ncp == ncp) {
3645 				error = EINVAL;
3646 				break;
3647 			}
3648 		}
3649 	}
3650 
3651 	cache_drop(&fnchd);
3652 	cache_drop(&tnchd);
3653 
3654 	/*
3655 	 * Even though the namespaces are different, they may still represent
3656 	 * hardlinks to the same file.  The filesystem might have a hard time
3657 	 * with this so we issue a NREMOVE of the source instead of a NRENAME
3658 	 * when we detect the situation.
3659 	 */
3660 	if (error == 0) {
3661 		fdvp = fromnd->nl_dvp;
3662 		tdvp = tond->nl_dvp;
3663 		if (fdvp == NULL || tdvp == NULL) {
3664 			error = EPERM;
3665 		} else if (fromnd->nl_nch.ncp->nc_vp == tond->nl_nch.ncp->nc_vp) {
3666 			error = VOP_NREMOVE(&fromnd->nl_nch, fdvp,
3667 					    fromnd->nl_cred);
3668 		} else {
3669 			error = VOP_NRENAME(&fromnd->nl_nch, &tond->nl_nch,
3670 					    fdvp, tdvp, tond->nl_cred);
3671 		}
3672 	}
3673 	return (error);
3674 }
3675 
3676 /*
3677  * rename_args(char *from, char *to)
3678  *
3679  * Rename files.  Source and destination must either both be directories,
3680  * or both not be directories.  If target is a directory, it must be empty.
3681  */
3682 int
3683 sys_rename(struct rename_args *uap)
3684 {
3685 	struct nlookupdata fromnd, tond;
3686 	int error;
3687 
3688 	error = nlookup_init(&fromnd, uap->from, UIO_USERSPACE, 0);
3689 	if (error == 0) {
3690 		error = nlookup_init(&tond, uap->to, UIO_USERSPACE, 0);
3691 		if (error == 0)
3692 			error = kern_rename(&fromnd, &tond);
3693 		nlookup_done(&tond);
3694 	}
3695 	nlookup_done(&fromnd);
3696 	return (error);
3697 }
3698 
3699 /*
3700  * renameat_args(int oldfd, char *old, int newfd, char *new)
3701  *
3702  * Rename files using paths relative to the directories associated with
3703  * oldfd and newfd.  Source and destination must either both be directories,
3704  * or both not be directories.  If target is a directory, it must be empty.
3705  */
3706 int
3707 sys_renameat(struct renameat_args *uap)
3708 {
3709 	struct nlookupdata oldnd, newnd;
3710 	struct file *oldfp, *newfp;
3711 	int error;
3712 
3713 	error = nlookup_init_at(&oldnd, &oldfp, uap->oldfd, uap->old,
3714 	    UIO_USERSPACE, 0);
3715 	if (error == 0) {
3716 		error = nlookup_init_at(&newnd, &newfp, uap->newfd, uap->new,
3717 		    UIO_USERSPACE, 0);
3718 		if (error == 0)
3719 			error = kern_rename(&oldnd, &newnd);
3720 		nlookup_done_at(&newnd, newfp);
3721 	}
3722 	nlookup_done_at(&oldnd, oldfp);
3723 	return (error);
3724 }
3725 
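/*
 * kern_mkdir() - common code for mkdir() and mkdirat().
 */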
3726 int
3727 kern_mkdir(struct nlookupdata *nd, int mode)
3728 {
3729 	struct thread *td = curthread;
3730 	struct proc *p = td->td_proc;
3731 	struct vnode *vp;
3732 	struct vattr vattr;
3733 	int error;
3734 
3735 	bwillinode(1);
3736 	nd->nl_flags |= NLC_WILLBEDIR | NLC_CREATE | NLC_REFDVP;
3737 	if ((error = nlookup(nd)) != 0)
3738 		return (error);
3739 
3740 	if (nd->nl_nch.ncp->nc_vp)
3741 		return (EEXIST);
3742 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3743 		return (error);
3744 	VATTR_NULL(&vattr);
3745 	vattr.va_type = VDIR;
3746 	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_fd->fd_cmask;
3747 
3748 	vp = NULL;
3749 	error = VOP_NMKDIR(&nd->nl_nch, nd->nl_dvp, &vp, td->td_ucred, &vattr);
3750 	if (error == 0)
3751 		vput(vp);
3752 	return (error);
3753 }
3754 
3755 /*
3756  * mkdir_args(char *path, int mode)
3757  *
3758  * Make a directory file.
3759  */
3760 int
3761 sys_mkdir(struct mkdir_args *uap)
3762 {
3763 	struct nlookupdata nd;
3764 	int error;
3765 
3766 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3767 	if (error == 0)
3768 		error = kern_mkdir(&nd, uap->mode);
3769 	nlookup_done(&nd);
3770 	return (error);
3771 }
3772 
3773 /*
3774  * mkdirat_args(int fd, char *path, mode_t mode)
3775  *
3776  * Make a directory file.  The path is relative to the directory associated
3777  * with fd.
3778  */
3779 int
3780 sys_mkdirat(struct mkdirat_args *uap)
3781 {
3782 	struct nlookupdata nd;
3783 	struct file *fp;
3784 	int error;
3785 
3786 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
3787 	if (error == 0)
3788 		error = kern_mkdir(&nd, uap->mode);
3789 	nlookup_done_at(&nd, fp);
3790 	return (error);
3791 }
3792 
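/*
 * kern_rmdir() - common code for rmdir() and unlinkat() with
 * AT_REMOVEDIR.
 */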
3793 int
3794 kern_rmdir(struct nlookupdata *nd)
3795 {
3796 	int error;
3797 
3798 	bwillinode(1);
3799 	nd->nl_flags |= NLC_DELETE | NLC_REFDVP;
3800 	if ((error = nlookup(nd)) != 0)
3801 		return (error);
3802 
3803 	/*
3804 	 * Do not allow directories representing mount points to be
3805 	 * deleted, even if empty.  Check write perms on mount point
3806 	 * in case the vnode is aliased (aka nullfs).
3807 	 */
3808 	if (nd->nl_nch.ncp->nc_flag & (NCF_ISMOUNTPT))
3809 		return (EINVAL);
3810 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3811 		return (error);
3812 	error = VOP_NRMDIR(&nd->nl_nch, nd->nl_dvp, nd->nl_cred);
3813 	return (error);
3814 }
3815 
3816 /*
3817  * rmdir_args(char *path)
3818  *
3819  * Remove a directory file.
3820  */
3821 int
3822 sys_rmdir(struct rmdir_args *uap)
3823 {
3824 	struct nlookupdata nd;
3825 	int error;
3826 
3827 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3828 	if (error == 0)
3829 		error = kern_rmdir(&nd);
3830 	nlookup_done(&nd);
3831 	return (error);
3832 }
3833 
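/*
 * kern_getdirentries() - common code for getdirentries() and getdents().
 *
 * Reads directory entries into buf; the pre-read seek offset is
 * returned via *basep (when non-NULL) and the number of bytes read
 * via *res.
 */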
3834 int
3835 kern_getdirentries(int fd, char *buf, u_int count, long *basep, int *res,
3836 		   enum uio_seg direction)
3837 {
3838 	struct thread *td = curthread;
3839 	struct proc *p = td->td_proc;
3840 	struct vnode *vp;
3841 	struct file *fp;
3842 	struct uio auio;
3843 	struct iovec aiov;
3844 	off_t loff;
3845 	int error, eofflag;
3846 
3847 	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
3848 		return (error);
3849 	if ((fp->f_flag & FREAD) == 0) {
3850 		error = EBADF;
3851 		goto done;
3852 	}
3853 	vp = (struct vnode *)fp->f_data;
3854 unionread:
3855 	if (vp->v_type != VDIR) {
3856 		error = EINVAL;
3857 		goto done;
3858 	}
3859 	aiov.iov_base = buf;
3860 	aiov.iov_len = count;
3861 	auio.uio_iov = &aiov;
3862 	auio.uio_iovcnt = 1;
3863 	auio.uio_rw = UIO_READ;
3864 	auio.uio_segflg = direction;
3865 	auio.uio_td = td;
3866 	auio.uio_resid = count;
3867 	loff = auio.uio_offset = fp->f_offset;
3868 	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL);
3869 	fp->f_offset = auio.uio_offset;
3870 	if (error)
3871 		goto done;
3872 	if (count == auio.uio_resid) {
3873 		if (union_dircheckp) {
3874 			error = union_dircheckp(td, &vp, fp);
3875 			if (error == -1)
3876 				goto unionread;
3877 			if (error)
3878 				goto done;
3879 		}
3880 #if 0
3881 		if ((vp->v_flag & VROOT) &&
3882 		    (vp->v_mount->mnt_flag & MNT_UNION)) {
3883 			struct vnode *tvp = vp;
3884 			vp = vp->v_mount->mnt_vnodecovered;
3885 			vref(vp);
3886 			fp->f_data = vp;
3887 			fp->f_offset = 0;
3888 			vrele(tvp);
3889 			goto unionread;
3890 		}
3891 #endif
3892 	}
3893 
3894 	/*
3895 	 * WARNING!  *basep may not be wide enough to accommodate the
3896 	 * seek offset.   XXX should we hack this to return the upper 32 bits
3897 	 * for offsets greater than 4G?
3898 	 */
3899 	if (basep) {
3900 		*basep = (long)loff;
3901 	}
3902 	*res = count - auio.uio_resid;
3903 done:
3904 	fdrop(fp);
3905 	return (error);
3906 }
3907 
3908 /*
3909  * getdirentries_args(int fd, char *buf, u_int count, long *basep)
3910  *
3911  * Read a block of directory entries in a file system independent format.
3912  */
3913 int
3914 sys_getdirentries(struct getdirentries_args *uap)
3915 {
3916 	long base;
3917 	int error;
3918 
3919 	error = kern_getdirentries(uap->fd, uap->buf, uap->count, &base,
3920 				   &uap->sysmsg_result, UIO_USERSPACE);
3921 
3922 	if (error == 0 && uap->basep)
3923 		error = copyout(&base, uap->basep, sizeof(*uap->basep));
3924 	return (error);
3925 }
3926 
3927 /*
3928  * getdents_args(int fd, char *buf, size_t count)
3929  */
3930 int
3931 sys_getdents(struct getdents_args *uap)
3932 {
3933 	int error;
3934 
3935 	error = kern_getdirentries(uap->fd, uap->buf, uap->count, NULL,
3936 				   &uap->sysmsg_result, UIO_USERSPACE);
3937 
3938 	return (error);
3939 }
3940 
3941 /*
3942  * Set the mode mask for creation of filesystem nodes.
3943  *
3944  * umask(int newmask)
3945  */
3946 int
3947 sys_umask(struct umask_args *uap)
3948 {
3949 	struct thread *td = curthread;
3950 	struct proc *p = td->td_proc;
3951 	struct filedesc *fdp;
3952 
3953 	fdp = p->p_fd;
3954 	uap->sysmsg_result = fdp->fd_cmask;
3955 	fdp->fd_cmask = uap->newmask & ALLPERMS;
3956 	return (0);
3957 }
3958 
3959 /*
3960  * revoke(char *path)
3961  *
3962  * Void all references to file by ripping underlying filesystem
3963  * away from vnode.
3964  */
3965 int
3966 sys_revoke(struct revoke_args *uap)
3967 {
3968 	struct nlookupdata nd;
3969 	struct vattr vattr;
3970 	struct vnode *vp;
3971 	struct ucred *cred;
3972 	int error;
3973 
3974 	vp = NULL;
3975 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3976 	if (error == 0)
3977 		error = nlookup(&nd);
3978 	if (error == 0)
3979 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3980 	cred = crhold(nd.nl_cred);
3981 	nlookup_done(&nd);
3982 	if (error == 0) {
3983 		if (error == 0)
3984 			error = VOP_GETATTR(vp, &vattr);
3985 		if (error == 0 && cred->cr_uid != vattr.va_uid)
3986 			error = priv_check_cred(cred, PRIV_VFS_REVOKE, 0);
3987 		if (error == 0 && (vp->v_type == VCHR || vp->v_type == VBLK)) {
3988 			if (vcount(vp) > 0)
3989 				error = vrevoke(vp, cred);
3990 		} else if (error == 0) {
3991 			error = vrevoke(vp, cred);
3992 		}
3993 		vrele(vp);
3994 	}
3995 	if (cred)
3996 		crfree(cred);
3997 	return (error);
3998 }
3999 
4000 /*
4001  * getfh_args(char *fname, fhandle_t *fhp)
4002  *
4003  * Get (NFS) file handle
4004  *
4005  * NOTE: We use the fsid of the covering mount, even if it is a nullfs
4006  * mount.  This allows nullfs mounts to be explicitly exported.
4007  *
4008  * WARNING: nullfs mounts of HAMMER PFS ROOTs are safe.
4009  *
4010  * 	    nullfs mounts of subdirectories are not safe.  That is, it will
4011  *	    work, but you do not really have protection against access to
4012  *	    the related parent directories.
4013  */
4014 int
4015 sys_getfh(struct getfh_args *uap)
4016 {
4017 	struct thread *td = curthread;
4018 	struct nlookupdata nd;
4019 	fhandle_t fh;
4020 	struct vnode *vp;
4021 	struct mount *mp;
4022 	int error;
4023 
4024 	/*
4025 	 * Must be super user
4026 	 */
4027 	if ((error = priv_check(td, PRIV_ROOT)) != 0)
4028 		return (error);
4029 
4030 	vp = NULL;
4031 	error = nlookup_init(&nd, uap->fname, UIO_USERSPACE, NLC_FOLLOW);
4032 	if (error == 0)
4033 		error = nlookup(&nd);
4034 	if (error == 0)
4035 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
4036 	mp = nd.nl_nch.mount;
4037 	nlookup_done(&nd);
4038 	if (error == 0) {
4039 		bzero(&fh, sizeof(fh));
4040 		fh.fh_fsid = mp->mnt_stat.f_fsid;
4041 		error = VFS_VPTOFH(vp, &fh.fh_fid);
4042 		vput(vp);
4043 		if (error == 0)
4044 			error = copyout(&fh, uap->fhp, sizeof(fh));
4045 	}
4046 	return (error);
4047 }
4048 
4049 /*
4050  * fhopen_args(const struct fhandle *u_fhp, int flags)
4051  *
4052  * syscall for the rpc.lockd to use to translate a NFS file handle into
4053  * an open descriptor.
4054  *
4055  * warning: do not remove the priv_check() call or this becomes one giant
4056  * security hole.
4057  */
4058 int
4059 sys_fhopen(struct fhopen_args *uap)
4060 {
4061 	struct thread *td = curthread;
4062 	struct filedesc *fdp = td->td_proc->p_fd;
4063 	struct mount *mp;
4064 	struct vnode *vp;
4065 	struct fhandle fhp;
4066 	struct vattr vat;
4067 	struct vattr *vap = &vat;
4068 	struct flock lf;
4069 	int fmode, mode, error, type;
4070 	struct file *nfp;
4071 	struct file *fp;
4072 	int indx;
4073 
4074 	/*
4075 	 * Must be super user
4076 	 */
4077 	error = priv_check(td, PRIV_ROOT);
4078 	if (error)
4079 		return (error);
4080 
4081 	fmode = FFLAGS(uap->flags);
4082 
4083 	/*
4084 	 * Why not allow a non-read/write open for our lockd?
4085 	 */
4086 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4087 		return (EINVAL);
4088 	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4089 	if (error)
4090 		return(error);
4091 
4092 	/*
4093 	 * Find the mount point
4094 	 */
4095 	mp = vfs_getvfs(&fhp.fh_fsid);
4096 	if (mp == NULL) {
4097 		error = ESTALE;
4098 		goto  done;
4099 	}
4100 	/* now give me my vnode, it gets returned to me locked */
4101 	error = VFS_FHTOVP(mp, NULL, &fhp.fh_fid, &vp);
4102 	if (error)
4103 		goto done;
4104  	/*
4105 	 * From now on we have to make sure not
4106 	 * to forget about the vnode.  Any error
4107 	 * that causes an abort must vput(vp);
4108 	 * just set error = err and 'goto bad;'.
4109 	 */
4110 
4111 	/*
4112 	 * from vn_open
4113 	 */
4114 	if (vp->v_type == VLNK) {
4115 		error = EMLINK;
4116 		goto bad;
4117 	}
4118 	if (vp->v_type == VSOCK) {
4119 		error = EOPNOTSUPP;
4120 		goto bad;
4121 	}
4122 	mode = 0;
4123 	if (fmode & (FWRITE | O_TRUNC)) {
4124 		if (vp->v_type == VDIR) {
4125 			error = EISDIR;
4126 			goto bad;
4127 		}
4128 		error = vn_writechk(vp, NULL);
4129 		if (error)
4130 			goto bad;
4131 		mode |= VWRITE;
4132 	}
4133 	if (fmode & FREAD)
4134 		mode |= VREAD;
4135 	if (mode) {
4136 		error = VOP_ACCESS(vp, mode, td->td_ucred);
4137 		if (error)
4138 			goto bad;
4139 	}
4140 	if (fmode & O_TRUNC) {
4141 		vn_unlock(vp);				/* XXX */
4142 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
4143 		VATTR_NULL(vap);
4144 		vap->va_size = 0;
4145 		error = VOP_SETATTR(vp, vap, td->td_ucred);
4146 		if (error)
4147 			goto bad;
4148 	}
4149 
4150 	/*
4151 	 * VOP_OPEN needs the file pointer so it can potentially override
4152 	 * it.
4153 	 *
4154 	 * WARNING! no f_nchandle will be associated when fhopen()ing a
4155 	 * directory.  XXX
4156 	 */
4157 	if ((error = falloc(td->td_lwp, &nfp, &indx)) != 0)
4158 		goto bad;
4159 	fp = nfp;
4160 
4161 	error = VOP_OPEN(vp, fmode, td->td_ucred, fp);
4162 	if (error) {
4163 		/*
4164 		 * setting f_ops this way prevents VOP_CLOSE from being
4165 		 * called or fdrop() releasing the vp from v_data.   Since
4166 		 * the VOP_OPEN failed we don't want to VOP_CLOSE.
4167 		 */
4168 		fp->f_ops = &badfileops;
4169 		fp->f_data = NULL;
4170 		goto bad_drop;
4171 	}
4172 
4173 	/*
4174 	 * The fp is given its own reference, we still have our ref and lock.
4175 	 *
4176 	 * Assert that all regular files must be created with a VM object.
4177 	 */
4178 	if (vp->v_type == VREG && vp->v_object == NULL) {
4179 		kprintf("fhopen: regular file did not have VM object: %p\n", vp);
4180 		goto bad_drop;
4181 	}
4182 
4183 	/*
4184 	 * The open was successful.  Handle any locking requirements.
4185 	 */
4186 	if (fmode & (O_EXLOCK | O_SHLOCK)) {
4187 		lf.l_whence = SEEK_SET;
4188 		lf.l_start = 0;
4189 		lf.l_len = 0;
4190 		if (fmode & O_EXLOCK)
4191 			lf.l_type = F_WRLCK;
4192 		else
4193 			lf.l_type = F_RDLCK;
4194 		if (fmode & FNONBLOCK)
4195 			type = 0;
4196 		else
4197 			type = F_WAIT;
4198 		vn_unlock(vp);
4199 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) {
4200 			/*
4201 			 * release our private reference.
4202 			 */
4203 			fsetfd(fdp, NULL, indx);
4204 			fdrop(fp);
4205 			vrele(vp);
4206 			goto done;
4207 		}
4208 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4209 		fp->f_flag |= FHASLOCK;
4210 	}
4211 
4212 	/*
4213 	 * Clean up.  Associate the file pointer with the previously
4214 	 * reserved descriptor and return it.
4215 	 */
4216 	vput(vp);
4217 	fsetfd(fdp, fp, indx);
4218 	fdrop(fp);
4219 	uap->sysmsg_result = indx;
4220 	return (0);
4221 
4222 bad_drop:
4223 	fsetfd(fdp, NULL, indx);
4224 	fdrop(fp);
4225 bad:
4226 	vput(vp);
4227 done:
4228 	return (error);
4229 }
4230 
4231 /*
4232  * fhstat_args(struct fhandle *u_fhp, struct stat *sb)
4233  */
4234 int
4235 sys_fhstat(struct fhstat_args *uap)
4236 {
4237 	struct thread *td = curthread;
4238 	struct stat sb;
4239 	fhandle_t fh;
4240 	struct mount *mp;
4241 	struct vnode *vp;
4242 	int error;
4243 
4244 	/*
4245 	 * Must be super user
4246 	 */
4247 	error = priv_check(td, PRIV_ROOT);
4248 	if (error)
4249 		return (error);
4250 
4251 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4252 	if (error)
4253 		return (error);
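	/*
	 * Translate the handle: fsid -> mount, fid -> locked vnode.
	 * Stat the vnode and copy the result out to userspace.
	 */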
4254 
4255 	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
4256 		error = ESTALE;
4257 	if (error == 0) {
4258 		if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)) == 0) {
4259 			error = vn_stat(vp, &sb, td->td_ucred);
4260 			vput(vp);
4261 		}
4262 	}
4263 	if (error == 0)
4264 		error = copyout(&sb, uap->sb, sizeof(sb));
4265 	return (error);
4266 }
4267 
4268 /*
4269  * fhstatfs_args(struct fhandle *u_fhp, struct statfs *buf)
4270  */
4271 int
4272 sys_fhstatfs(struct fhstatfs_args *uap)
4273 {
4274 	struct thread *td = curthread;
4275 	struct proc *p = td->td_proc;
4276 	struct statfs *sp;
4277 	struct mount *mp;
4278 	struct vnode *vp;
4279 	struct statfs sb;
4280 	char *fullpath, *freepath;
4281 	fhandle_t fh;
4282 	int error;
4283 
4284 	/*
4285 	 * Must be super user
4286 	 */
4287 	if ((error = priv_check(td, PRIV_ROOT)))
4288 		return (error);
4289 
4290 	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
4291 		return (error);
4292 
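	/*
	 * Translate the handle to a mount and vnode, verify that the
	 * mount is visible from the process root, then statfs the
	 * mount and rewrite f_mntonname relative to the caller.
	 */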
4293 	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
4294 		error = ESTALE;
4295 		goto done;
4296 	}
4297 	if (p != NULL && !chroot_visible_mnt(mp, p)) {
4298 		error = ESTALE;
4299 		goto done;
4300 	}
4301 
4302 	if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)) != 0)
4303 		goto done;
4304 	mp = vp->v_mount;
4305 	sp = &mp->mnt_stat;
4306 	vput(vp);
4307 	if ((error = VFS_STATFS(mp, sp, td->td_ucred)) != 0)
4308 		goto done;
4309 
4310 	error = mount_path(p, mp, &fullpath, &freepath);
4311 	if (error)
4312 		goto done;
4313 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
4314 	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
4315 	kfree(freepath, M_TEMP);
4316 
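	/*
	 * Export only the user-visible mount flags and hide the fsid
	 * from callers lacking PRIV_ROOT.
	 */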
4317 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4318 	if (priv_check(td, PRIV_ROOT)) {
4319 		bcopy(sp, &sb, sizeof(sb));
4320 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
4321 		sp = &sb;
4322 	}
4323 	error = copyout(sp, uap->buf, sizeof(*sp));
4324 done:
4325 	return (error);
4326 }
4327 
4328 /*
4329  * fhstatvfs_args(struct fhandle *u_fhp, struct statvfs *buf)
4330  */
4331 int
4332 sys_fhstatvfs(struct fhstatvfs_args *uap)
4333 {
4334 	struct thread *td = curthread;
4335 	struct proc *p = td->td_proc;
4336 	struct statvfs *sp;
4337 	struct mount *mp;
4338 	struct vnode *vp;
4339 	fhandle_t fh;
4340 	int error;
4341 
4342 	/*
4343 	 * Must be super user
4344 	 */
4345 	if ((error = priv_check(td, PRIV_ROOT)))
4346 		return (error);
4347 
4348 	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
4349 		return (error);
4350 
4351 	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
4352 		error = ESTALE;
4353 		goto done;
4354 	}
4355 	if (p != NULL && !chroot_visible_mnt(mp, p)) {
4356 		error = ESTALE;
4357 		goto done;
4358 	}
4359 
4360 	if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)))
4361 		goto done;
4362 	mp = vp->v_mount;
4363 	sp = &mp->mnt_vstat;
4364 	vput(vp);
4365 	if ((error = VFS_STATVFS(mp, sp, td->td_ucred)) != 0)
4366 		goto done;
4367 
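	/*
	 * Synthesize the statvfs flag word from the mount flags.
	 */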
4368 	sp->f_flag = 0;
4369 	if (mp->mnt_flag & MNT_RDONLY)
4370 		sp->f_flag |= ST_RDONLY;
4371 	if (mp->mnt_flag & MNT_NOSUID)
4372 		sp->f_flag |= ST_NOSUID;
4373 	error = copyout(sp, uap->buf, sizeof(*sp));
4374 done:
4375 	return (error);
4376 }
4377 
4378 
4379 /*
4380  * Syscall to push extended attribute configuration information into the
4381  * VFS.  Accepts a path, which it converts to a mountpoint, as well as
4382  * a command (int cmd), and attribute name and misc data.  For now, the
4383  * attribute name is left in userspace for consumption by the VFS_op.
4384  * It will probably be changed so that the syscall copies the name
4385  * into kernel space in the future, once the various consumers of
4386  * the attribute code have weighed in.
4387  *
4388  * Currently this is used only by UFS Extended Attributes.
4389  */
4390 int
4391 sys_extattrctl(struct extattrctl_args *uap)
4392 {
4393 	struct nlookupdata nd;
4394 	struct vnode *vp;
4395 	char attrname[EXTATTR_MAXNAMELEN];
4396 	int error;
4397 	size_t size;
4398 
4399 	attrname[0] = 0;
4400 	vp = NULL;
4401 	error = 0;
4402 
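	/*
	 * Resolve the optional filename to a vnode and validate the
	 * attribute name (the userspace pointer itself is what gets
	 * passed down), then resolve the target path and hand the
	 * request to the filesystem via VFS_EXTATTRCTL().
	 */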
4403 	if (error == 0 && uap->filename) {
4404 		error = nlookup_init(&nd, uap->filename, UIO_USERSPACE,
4405 				     NLC_FOLLOW);
4406 		if (error == 0)
4407 			error = nlookup(&nd);
4408 		if (error == 0)
4409 			error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
4410 		nlookup_done(&nd);
4411 	}
4412 
4413 	if (error == 0 && uap->attrname) {
4414 		error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
4415 				  &size);
4416 	}
4417 
4418 	if (error == 0) {
4419 		error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4420 		if (error == 0)
4421 			error = nlookup(&nd);
4422 		if (error == 0)
4423 			error = ncp_writechk(&nd.nl_nch);
4424 		if (error == 0) {
4425 			error = VFS_EXTATTRCTL(nd.nl_nch.mount, uap->cmd, vp,
4426 					       uap->attrnamespace,
4427 					       uap->attrname, nd.nl_cred);
4428 		}
4429 		nlookup_done(&nd);
4430 	}
4431 
4432 	return (error);
4433 }
4434 
4435 /*
4436  * Syscall to set a named extended attribute on a file or directory.
4437  */
4438 int
4439 sys_extattr_set_file(struct extattr_set_file_args *uap)
4440 {
4441 	char attrname[EXTATTR_MAXNAMELEN];
4442 	struct nlookupdata nd;
4443 	struct vnode *vp;
4444 	struct uio auio;
4445 	struct iovec aiov;
4446 	int error;
4447 
4448 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
4449 	if (error)
4450 		return (error);
4451 
4452 	vp = NULL;
4453 
4454 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4455 	if (error == 0)
4456 		error = nlookup(&nd);
4457 	if (error == 0)
4458 		error = ncp_writechk(&nd.nl_nch);
4459 	if (error == 0)
4460 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
4461 	if (error) {
4462 		nlookup_done(&nd);
4463 		return (error);
4464 	}
4465 
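	/*
	 * Describe the user buffer with a single-segment uio; the
	 * attribute data is written from it by VOP_SETEXTATTR().
	 */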
4466 	bzero(&auio, sizeof(auio));
4467 	aiov.iov_base = uap->data;
4468 	aiov.iov_len = uap->nbytes;
4469 	auio.uio_iov = &aiov;
4470 	auio.uio_iovcnt = 1;
4471 	auio.uio_offset = 0;
4472 	auio.uio_resid = uap->nbytes;
4473 	auio.uio_rw = UIO_WRITE;
4474 	auio.uio_td = curthread;
4475 
4476 	error = VOP_SETEXTATTR(vp, uap->attrnamespace, attrname,
4477 			       &auio, nd.nl_cred);
4478 
4479 	vput(vp);
4480 	nlookup_done(&nd);
4481 	return (error);
4482 }
4483 
4484 /*
4485  * Syscall to get a named extended attribute on a file or directory.
4486  */
4487 int
4488 sys_extattr_get_file(struct extattr_get_file_args *uap)
4489 {
4490 	char attrname[EXTATTR_MAXNAMELEN];
4491 	struct nlookupdata nd;
4492 	struct uio auio;
4493 	struct iovec aiov;
4494 	struct vnode *vp;
4495 	int error;
4496 
4497 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
4498 	if (error)
4499 		return (error);
4500 
4501 	vp = NULL;
4502 
4503 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4504 	if (error == 0)
4505 		error = nlookup(&nd);
4506 	if (error == 0)
4507 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
4508 	if (error) {
4509 		nlookup_done(&nd);
4510 		return (error);
4511 	}
4512 
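	/*
	 * Describe the user buffer with a single-segment uio; the
	 * number of bytes returned is nbytes minus the uio residual.
	 */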
4513 	bzero(&auio, sizeof(auio));
4514 	aiov.iov_base = uap->data;
4515 	aiov.iov_len = uap->nbytes;
4516 	auio.uio_iov = &aiov;
4517 	auio.uio_iovcnt = 1;
4518 	auio.uio_offset = 0;
4519 	auio.uio_resid = uap->nbytes;
4520 	auio.uio_rw = UIO_READ;
4521 	auio.uio_td = curthread;
4522 
4523 	error = VOP_GETEXTATTR(vp, uap->attrnamespace, attrname,
4524 				&auio, nd.nl_cred);
4525 	uap->sysmsg_result = uap->nbytes - auio.uio_resid;
4526 
4527 	vput(vp);
4528 	nlookup_done(&nd);
4529 	return(error);
4530 }
4531 
4532 /*
4533  * Syscall to delete a named extended attribute from a file or directory.
4534  * Accepts attribute name.  The real work happens in VOP_SETEXTATTR().
4535  */
4536 int
4537 sys_extattr_delete_file(struct extattr_delete_file_args *uap)
4538 {
4539 	char attrname[EXTATTR_MAXNAMELEN];
4540 	struct nlookupdata nd;
4541 	struct vnode *vp;
4542 	int error;
4543 
4544 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
4545 	if (error)
4546 		return(error);
4547 
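	/*
	 * Deletion is requested by passing a NULL uio to
	 * VOP_SETEXTATTR().
	 */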
4548 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4549 	if (error == 0)
4550 		error = nlookup(&nd);
4551 	if (error == 0)
4552 		error = ncp_writechk(&nd.nl_nch);
4553 	if (error == 0) {
4554 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
4555 		if (error == 0) {
4556 			error = VOP_SETEXTATTR(vp, uap->attrnamespace,
4557 					       attrname, NULL, nd.nl_cred);
4558 			vput(vp);
4559 		}
4560 	}
4561 	nlookup_done(&nd);
4562 	return(error);
4563 }
4564 
4565 /*
4566  * Determine if the mount is visible to the process.
4567  */
4568 static int
4569 chroot_visible_mnt(struct mount *mp, struct proc *p)
4570 {
4571 	struct nchandle nch;
4572 
4573 	/*
4574 	 * Traverse from the mount point upwards.  If we hit the process
4575 	 * root then the mount point is visible to the process.
4576 	 */
4577 	nch = mp->mnt_ncmountpt;
4578 	while (nch.ncp) {
4579 		if (nch.mount == p->p_fd->fd_nrdir.mount &&
4580 		    nch.ncp == p->p_fd->fd_nrdir.ncp) {
4581 			return(1);
4582 		}
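		/*
		 * At the root of the current mount, cross over to the
		 * directory the mount is mounted on; otherwise step up
		 * to the parent directory.
		 */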
4583 		if (nch.ncp == nch.mount->mnt_ncmountpt.ncp) {
4584 			nch = nch.mount->mnt_ncmounton;
4585 		} else {
4586 			nch.ncp = nch.ncp->nc_parent;
4587 		}
4588 	}
4589 
4590 	/*
4591 	 * If the mount point is not visible to the process, but the
4592 	 * process root is in a subdirectory of the mount, return
4593 	 * TRUE anyway.
4594 	 */
4595 	if (p->p_fd->fd_nrdir.mount == mp)
4596 		return(1);
4597 
4598 	return(0);
4599 }
4600 
4601