xref: /dflybsd-src/sys/kern/vfs_syscalls.c (revision 14e5be3a8c11f2df26dba50c4a8aaa5e8d88133e)
1 /*
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
35  * $FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.151.2.18 2003/04/04 20:35:58 tegge Exp $
36  */
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/buf.h>
41 #include <sys/conf.h>
42 #include <sys/sysent.h>
43 #include <sys/malloc.h>
44 #include <sys/mount.h>
45 #include <sys/mountctl.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/kernel.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/linker.h>
52 #include <sys/stat.h>
53 #include <sys/unistd.h>
54 #include <sys/vnode.h>
55 #include <sys/proc.h>
56 #include <sys/priv.h>
57 #include <sys/jail.h>
58 #include <sys/namei.h>
59 #include <sys/nlookup.h>
60 #include <sys/dirent.h>
61 #include <sys/extattr.h>
62 #include <sys/spinlock.h>
63 #include <sys/kern_syscall.h>
64 #include <sys/objcache.h>
65 #include <sys/sysctl.h>
66 
67 #include <sys/buf2.h>
68 #include <sys/file2.h>
69 #include <sys/spinlock2.h>
70 
71 #include <vm/vm.h>
72 #include <vm/vm_object.h>
73 #include <vm/vm_page.h>
74 
75 #include <machine/limits.h>
76 #include <machine/stdarg.h>
77 
78 static void mount_warning(struct mount *mp, const char *ctl, ...)
79 		__printflike(2, 3);
80 static int mount_path(struct proc *p, struct mount *mp, char **rb, char **fb);
81 static int checkvp_chdir (struct vnode *vn, struct thread *td);
82 static void checkdirs (struct nchandle *old_nch, struct nchandle *new_nch);
83 static int chroot_refuse_vdir_fds (struct filedesc *fdp);
84 static int chroot_visible_mnt(struct mount *mp, struct proc *p);
85 static int getutimes (struct timeval *, struct timespec *);
86 static int getutimens (const struct timespec *, struct timespec *, int *);
87 static int setfown (struct mount *, struct vnode *, uid_t, gid_t);
88 static int setfmode (struct vnode *, int);
89 static int setfflags (struct vnode *, int);
90 static int setutimes (struct vnode *, struct vattr *,
91 			const struct timespec *, int);
92 static int	usermount = 0;	/* if 1, non-root can mount fs. */
93 
94 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
95     "Allow non-root users to mount filesystems");
96 
97 /*
98  * Virtual File System System Calls
99  */
100 
101 /*
102  * Mount a file system.
103  *
104  * mount_args(char *type, char *path, int flags, caddr_t data)
105  *
106  * MPALMOSTSAFE
107  */
int
sys_mount(struct mount_args *uap)
{
	struct thread *td = curthread;
	struct vnode *vp;
	struct nchandle nch;
	struct mount *mp, *nullmp;
	struct vfsconf *vfsp;
	int error, flag = 0, flag2 = 0;	/* saved mount flags for MNT_UPDATE rollback */
	int hasmount;			/* target dir already covered by a mount */
	struct vattr va;
	struct nlookupdata nd;
	char fstypename[MFSNAMELEN];
	struct ucred *cred;

	cred = td->td_ucred;
	/* Mounting from within a jail is never permitted */
	if (jailed(cred)) {
		error = EPERM;
		goto done;
	}
	if (usermount == 0 && (error = priv_check(td, PRIV_ROOT)))
		goto done;

	/*
	 * Do not allow NFS export by non-root users.
	 */
	if (uap->flags & MNT_EXPORTED) {
		error = priv_check(td, PRIV_ROOT);
		if (error)
			goto done;
	}
	/*
	 * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users
	 */
	if (priv_check(td, PRIV_ROOT))
		uap->flags |= MNT_NOSUID | MNT_NODEV;

	/*
	 * Lookup the requested path and extract the nch and vnode.
	 */
	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	if (error == 0) {
		if ((error = nlookup(&nd)) == 0) {
			if (nd.nl_nch.ncp->nc_vp == NULL)
				error = ENOENT;
		}
	}
	if (error) {
		nlookup_done(&nd);
		goto done;
	}

	/*
	 * If the target filesystem is resolved via a nullfs mount, then
	 * nd.nl_nch.mount will be pointing to the nullfs mount structure
	 * instead of the target file system. We need it in case we are
	 * doing an update.
	 */
	nullmp = nd.nl_nch.mount;

	/*
	 * Extract the locked+refd ncp and cleanup the nd structure
	 */
	nch = nd.nl_nch;
	cache_zero(&nd.nl_nch);
	nlookup_done(&nd);

	if ((nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
	    (mp = cache_findmount(&nch)) != NULL) {
		cache_dropmount(mp);
		hasmount = 1;
	} else {
		hasmount = 0;
	}


	/*
	 * now we have the locked ref'd nch and unreferenced vnode.
	 */
	vp = nch.ncp->nc_vp;
	if ((error = vget(vp, LK_EXCLUSIVE)) != 0) {
		cache_put(&nch);
		goto done;
	}
	cache_unlock(&nch);

	/*
	 * Extract the file system type. We need to know this early, to take
	 * appropriate actions if we are dealing with a nullfs.
	 */
        if ((error = copyinstr(uap->type, fstypename, MFSNAMELEN, NULL)) != 0) {
                cache_drop(&nch);
                vput(vp);
		goto done;
        }

	/*
	 * Now we have an unlocked ref'd nch and a locked ref'd vp
	 */
	if (uap->flags & MNT_UPDATE) {
		if ((vp->v_flag & (VROOT|VPFSROOT)) == 0) {
			cache_drop(&nch);
			vput(vp);
			error = EINVAL;
			goto done;
		}

		/* length 5 includes the NUL, so this is an exact match */
		if (strncmp(fstypename, "null", 5) == 0) {
			KKASSERT(nullmp);
			mp = nullmp;
		} else {
			mp = vp->v_mount;
		}

		flag = mp->mnt_flag;
		flag2 = mp->mnt_kern_flag;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((uap->flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			cache_drop(&nch);
			vput(vp);
			error = EOPNOTSUPP;	/* Needs translation */
			goto done;
		}
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_stat.f_owner != cred->cr_uid &&
		    (error = priv_check(td, PRIV_ROOT))) {
			cache_drop(&nch);
			vput(vp);
			goto done;
		}
		if (vfs_busy(mp, LK_NOWAIT)) {
			cache_drop(&nch);
			vput(vp);
			error = EBUSY;
			goto done;
		}
		if (hasmount) {
			cache_drop(&nch);
			vfs_unbusy(mp);
			vput(vp);
			error = EBUSY;
			goto done;
		}
		mp->mnt_flag |=
		    uap->flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
		lwkt_gettoken(&mp->mnt_token);
		vn_unlock(vp);
		vfsp = mp->mnt_vfc;
		goto update;
	}

	/*
	 * If the user is not root, ensure that they own the directory
	 * onto which we are attempting to mount.
	 */
	if ((error = VOP_GETATTR(vp, &va)) ||
	    (va.va_uid != cred->cr_uid &&
	     (error = priv_check(td, PRIV_ROOT)))) {
		cache_drop(&nch);
		vput(vp);
		goto done;
	}
	if ((error = vinvalbuf(vp, V_SAVE, 0, 0)) != 0) {
		cache_drop(&nch);
		vput(vp);
		goto done;
	}
	if (vp->v_type != VDIR) {
		cache_drop(&nch);
		vput(vp);
		error = ENOTDIR;
		goto done;
	}
	if (vp->v_mount->mnt_kern_flag & MNTK_NOSTKMNT) {
		cache_drop(&nch);
		vput(vp);
		error = EPERM;
		goto done;
	}
	vfsp = vfsconf_find_by_name(fstypename);
	if (vfsp == NULL) {
		linker_file_t lf;

		/* Only load modules for root (very important!) */
		if ((error = priv_check(td, PRIV_ROOT)) != 0) {
			cache_drop(&nch);
			vput(vp);
			goto done;
		}
		error = linker_load_file(fstypename, &lf);
		if (error || lf == NULL) {
			cache_drop(&nch);
			vput(vp);
			if (lf == NULL)
				error = ENODEV;
			goto done;
		}
		lf->userrefs++;
		/* lookup again, see if the VFS was loaded */
		vfsp = vfsconf_find_by_name(fstypename);
		if (vfsp == NULL) {
			lf->userrefs--;
			linker_file_unload(lf);
			cache_drop(&nch);
			vput(vp);
			error = ENODEV;
			goto done;
		}
	}
	if (hasmount) {
		cache_drop(&nch);
		vput(vp);
		error = EBUSY;
		goto done;
	}

	/*
	 * Allocate and initialize the filesystem.
	 */
	mp = kmalloc(sizeof(struct mount), M_MOUNT, M_ZERO|M_WAITOK);
	mount_init(mp);
	vfs_busy(mp, LK_NOWAIT);
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vfc = vfsp;
	mp->mnt_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;
	vfsp->vfc_refcount++;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_owner = cred->cr_uid;
	lwkt_gettoken(&mp->mnt_token);
	vn_unlock(vp);
update:
	/*
	 * (per-mount token acquired at this point)
	 *
	 * Set the mount level flags.
	 */
	if (uap->flags & MNT_RDONLY)
		mp->mnt_flag |= MNT_RDONLY;
	else if (mp->mnt_flag & MNT_RDONLY)
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_ASYNC | MNT_NOATIME |
	    MNT_NOSYMFOLLOW | MNT_IGNORE | MNT_TRIM |
	    MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR |
	    MNT_AUTOMOUNTED);
	mp->mnt_flag |= uap->flags & (MNT_NOSUID | MNT_NOEXEC |
	    MNT_NODEV | MNT_SYNCHRONOUS | MNT_ASYNC | MNT_FORCE |
	    MNT_NOSYMFOLLOW | MNT_IGNORE | MNT_TRIM |
	    MNT_NOATIME | MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR |
	    MNT_AUTOMOUNTED);

	/*
	 * Pre-set the mount's ALL_MPSAFE flags if specified in the vfsconf.
	 * This way the initial VFS_MOUNT() call will also be MPSAFE.
	 */
	if (vfsp->vfc_flags & VFCF_MPSAFE)
		mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;

	/*
	 * Mount the filesystem.
	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
	 * get.
	 */
	error = VFS_MOUNT(mp, uap->path, uap->data, cred);
	if (mp->mnt_flag & MNT_UPDATE) {
		if (mp->mnt_kern_flag & MNTK_WANTRDWR)
			mp->mnt_flag &= ~MNT_RDONLY;
		mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
		if (error) {
			/* Update failed; restore the pre-update flags */
			mp->mnt_flag = flag;
			mp->mnt_kern_flag = flag2;
		}
		lwkt_reltoken(&mp->mnt_token);
		vfs_unbusy(mp);
		vrele(vp);
		cache_drop(&nch);
		goto done;
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	/*
	 * Put the new filesystem on the mount list after root.  The mount
	 * point gets its own mnt_ncmountpt (unless the VFS already set one
	 * up) which represents the root of the mount.  The lookup code
	 * detects the mount point going forward and checks the root of
	 * the mount going backwards.
	 *
	 * It is not necessary to invalidate or purge the vnode underneath
	 * because elements under the mount will be given their own glue
	 * namecache record.
	 */
	if (!error) {
		if (mp->mnt_ncmountpt.ncp == NULL) {
			/*
			 * Allocate, then unlock, but leave the ref intact.
			 * This is the mnt_refs (1) that we will retain
			 * through to the unmount.
			 */
			cache_allocroot(&mp->mnt_ncmountpt, mp, NULL);
			cache_unlock(&mp->mnt_ncmountpt);
		}
		vn_unlock(vp);
		mp->mnt_ncmounton = nch;		/* inherits ref */
		cache_lock(&nch);
		nch.ncp->nc_flag |= NCF_ISMOUNTPT;
		cache_unlock(&nch);
		cache_ismounting(mp);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

		mountlist_insert(mp, MNTINS_LAST);
		vn_unlock(vp);
		checkdirs(&mp->mnt_ncmounton, &mp->mnt_ncmountpt);
		error = vfs_allocate_syncvnode(mp);
		lwkt_reltoken(&mp->mnt_token);
		vfs_unbusy(mp);
		/*
		 * NOTE(review): any error from vfs_allocate_syncvnode()
		 * above is overwritten by VFS_START() here -- confirm
		 * this is intended.
		 */
		error = VFS_START(mp, 0);
		vrele(vp);
		KNOTE(&fs_klist, VQ_MOUNT);
	} else {
		/* Mount failed; tear down the half-constructed mount */
		vn_syncer_thr_stop(mp);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
		mp->mnt_vfc->vfc_refcount--;
		lwkt_reltoken(&mp->mnt_token);
		vfs_unbusy(mp);
		kfree(mp, M_MOUNT);
		cache_drop(&nch);
		vput(vp);
	}
done:
	return (error);
}
453 
454 /*
455  * Scan all active processes to see if any of them have a current
456  * or root directory onto which the new filesystem has just been
457  * mounted. If so, replace them with the new mount point.
458  *
459  * Both old_nch and new_nch are ref'd on call but not locked.
460  * new_nch must be temporarily locked so it can be associated with the
461  * vnode representing the root of the mount point.
462  */
/*
 * Argument package passed to checkdirs_callback() via allproc_scan().
 */
struct checkdirs_info {
	struct nchandle old_nch;	/* nch of the directory just covered */
	struct nchandle new_nch;	/* nch of the new mount's root */
	struct vnode *old_vp;		/* NOTE(review): never assigned or read */
	struct vnode *new_vp;		/* resolved root vnode of the new mount */
};
469 
470 static int checkdirs_callback(struct proc *p, void *data);
471 
static void
checkdirs(struct nchandle *old_nch, struct nchandle *new_nch)
{
	struct checkdirs_info info;
	struct vnode *olddp;
	struct vnode *newdp;
	struct mount *mp;

	/*
	 * If the old mount point's vnode has a usecount of 1, it is not
	 * being held as a descriptor anywhere.
	 */
	olddp = old_nch->ncp->nc_vp;
	if (olddp == NULL || VREFCNT(olddp) == 1)
		return;

	/*
	 * Force the root vnode of the new mount point to be resolved
	 * so we can update any matching processes.
	 */
	mp = new_nch->mount;
	if (VFS_ROOT(mp, &newdp))
		panic("mount: lost mount");
	vn_unlock(newdp);
	cache_lock(new_nch);
	vn_lock(newdp, LK_EXCLUSIVE | LK_RETRY);
	cache_setunresolved(new_nch);
	cache_setvp(new_nch, newdp);
	cache_unlock(new_nch);

	/*
	 * Special handling of the root node
	 */
	if (rootvnode == olddp) {
		vref(newdp);
		vfs_cache_setroot(newdp, cache_hold(new_nch));
	}

	/*
	 * Pass newdp separately so the callback does not have to access
	 * it via new_nch->ncp->nc_vp.
	 *
	 * NOTE(review): info.old_vp is deliberately left unset; the
	 * callback matches on old_nch only.
	 */
	info.old_nch = *old_nch;
	info.new_nch = *new_nch;
	info.new_vp = newdp;
	allproc_scan(checkdirs_callback, &info, 0);
	vput(newdp);
}
520 
521 /*
522  * NOTE: callback is not MP safe because the scanned process's filedesc
 * structure can be ripped out from under us, among other things.
524  */
static int
checkdirs_callback(struct proc *p, void *data)
{
	struct checkdirs_info *info = data;
	struct filedesc *fdp;
	struct nchandle ncdrop1;
	struct nchandle ncdrop2;
	struct vnode *vprele1;
	struct vnode *vprele2;

	if ((fdp = p->p_fd) != NULL) {
		cache_zero(&ncdrop1);
		cache_zero(&ncdrop2);
		vprele1 = NULL;
		vprele2 = NULL;

		/*
		 * MPUNSAFE - XXX fdp can be pulled out from under a
		 * foreign process.
		 *
		 * A shared filedesc is ok, we don't have to copy it
		 * because we are making this change globally.
		 */
		spin_lock(&fdp->fd_spin);
		/* Retarget the current directory if it was on the covered mount point */
		if (fdp->fd_ncdir.mount == info->old_nch.mount &&
		    fdp->fd_ncdir.ncp == info->old_nch.ncp) {
			vprele1 = fdp->fd_cdir;
			vref(info->new_vp);
			fdp->fd_cdir = info->new_vp;
			ncdrop1 = fdp->fd_ncdir;
			cache_copy(&info->new_nch, &fdp->fd_ncdir);
		}
		/* Retarget the root directory if it was on the covered mount point */
		if (fdp->fd_nrdir.mount == info->old_nch.mount &&
		    fdp->fd_nrdir.ncp == info->old_nch.ncp) {
			vprele2 = fdp->fd_rdir;
			vref(info->new_vp);
			fdp->fd_rdir = info->new_vp;
			ncdrop2 = fdp->fd_nrdir;
			cache_copy(&info->new_nch, &fdp->fd_nrdir);
		}
		spin_unlock(&fdp->fd_spin);
		/* Drop the displaced refs outside the spinlock */
		if (ncdrop1.ncp)
			cache_drop(&ncdrop1);
		if (ncdrop2.ncp)
			cache_drop(&ncdrop2);
		if (vprele1)
			vrele(vprele1);
		if (vprele2)
			vrele(vprele2);
	}
	return(0);
}
577 
578 /*
579  * Unmount a file system.
580  *
581  * Note: unmount takes a path to the vnode mounted on as argument,
582  * not special file (as before).
583  *
584  * umount_args(char *path, int flags)
585  *
586  * MPALMOSTSAFE
587  */
int
sys_unmount(struct unmount_args *uap)
{
	struct thread *td = curthread;
	struct proc *p __debugvar = td->td_proc;
	struct mount *mp = NULL;
	struct nlookupdata nd;
	int error;

	KKASSERT(p);
	/* Unmounting from within a jail is never permitted */
	if (td->td_ucred->cr_prison != NULL) {
		error = EPERM;
		goto done;
	}
	if (usermount == 0 && (error = priv_check(td, PRIV_ROOT)))
		goto done;

	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	if (error == 0)
		error = nlookup(&nd);
	if (error)
		goto out;

	mp = nd.nl_nch.mount;

	/*
	 * Only root, or the user that did the original mount is
	 * permitted to unmount this filesystem.
	 */
	if ((mp->mnt_stat.f_owner != td->td_ucred->cr_uid) &&
	    (error = priv_check(td, PRIV_ROOT)))
		goto out;

	/*
	 * Don't allow unmounting the root file system.
	 */
	if (mp->mnt_flag & MNT_ROOTFS) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Must be the root of the filesystem
	 */
	if (nd.nl_nch.ncp != mp->mnt_ncmountpt.ncp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * If no error try to issue the unmount.  We lose our cache
	 * ref when we call nlookup_done so we must hold the mount point
	 * to prevent use-after-free races.
	 */
out:
	if (error == 0) {
		mount_hold(mp);
		nlookup_done(&nd);
		error = dounmount(mp, uap->flags, 0);
		mount_drop(mp);
	} else {
		nlookup_done(&nd);
	}
done:
	return (error);
}
654 
655 /*
656  * Do the actual file system unmount (interlocked against the mountlist
657  * token and mp->mnt_token).
658  */
659 static int
660 dounmount_interlock(struct mount *mp)
661 {
662 	if (mp->mnt_kern_flag & MNTK_UNMOUNT)
663 		return (EBUSY);
664 	mp->mnt_kern_flag |= MNTK_UNMOUNT;
665 	return(0);
666 }
667 
668 static int
669 unmount_allproc_cb(struct proc *p, void *arg)
670 {
671 	struct mount *mp;
672 
673 	if (p->p_textnch.ncp == NULL)
674 		return 0;
675 
676 	mp = (struct mount *)arg;
677 	if (p->p_textnch.mount == mp)
678 		cache_drop(&p->p_textnch);
679 
680 	return 0;
681 }
682 
683 /*
684  * The guts of the unmount code.  The mount owns one ref and one hold
685  * count.  If we successfully interlock the unmount, those refs are ours.
686  * (The ref is from mnt_ncmountpt).
687  *
688  * When halting we shortcut certain mount types such as devfs by not actually
689  * issuing the VFS_SYNC() or VFS_UNMOUNT().  They are still disconnected
 * from the mountlist so higher-level filesystems can unmount cleanly.
691  *
692  * The mount types that allow QUICKHALT are: devfs, tmpfs, procfs.
693  */
int
dounmount(struct mount *mp, int flags, int halting)
{
	struct namecache *ncp;
	struct nchandle nch;
	struct vnode *vp;
	int error;
	int async_flag;
	int lflags;
	int freeok = 1;		/* cleared when refs remain; mp must not be freed */
	int retry;
	int quickhalt;

	lwkt_gettoken(&mp->mnt_token);

	/*
	 * When halting, certain mount points can essentially just
	 * be unhooked and otherwise ignored.
	 */
	if (halting && (mp->mnt_kern_flag & MNTK_QUICKHALT)) {
		quickhalt = 1;
		freeok = 0;
	} else {
		quickhalt = 0;
	}


	/*
	 * Exclusive access for unmounting purposes.
	 */
	if ((error = mountlist_interlock(dounmount_interlock, mp)) != 0)
		goto out;

	/*
	 * We now 'own' the last mp->mnt_refs
	 *
	 * Allow filesystems to detect that a forced unmount is in progress.
	 */
	if (flags & MNT_FORCE)
		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
	lflags = LK_EXCLUSIVE | ((flags & MNT_FORCE) ? 0 : LK_TIMELOCK);
	error = lockmgr(&mp->mnt_lock, lflags);
	if (error) {
		/* Lock failed; back out the interlock and wake waiters */
		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
		if (mp->mnt_kern_flag & MNTK_MWAIT) {
			mp->mnt_kern_flag &= ~MNTK_MWAIT;
			wakeup(mp);
		}
		goto out;
	}

	if (mp->mnt_flag & MNT_EXPUBLIC)
		vfs_setpublicfs(NULL, NULL, NULL);

	vfs_msync(mp, MNT_WAIT);
	async_flag = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &=~ MNT_ASYNC;

	/*
	 * If this filesystem isn't aliasing other filesystems,
	 * try to invalidate any remaining namecache entries and
	 * check the count afterwards.
	 *
	 * We own the last mnt_refs by owning mnt_ncmountpt.
	 */
	if ((mp->mnt_kern_flag & MNTK_NCALIASED) == 0) {
		cache_lock(&mp->mnt_ncmountpt);
		cache_inval(&mp->mnt_ncmountpt, CINV_DESTROY|CINV_CHILDREN);
		cache_unlock(&mp->mnt_ncmountpt);

		cache_clearmntcache();
		if ((ncp = mp->mnt_ncmountpt.ncp) != NULL &&
		    (ncp->nc_refs != 1 || TAILQ_FIRST(&ncp->nc_list))) {
			/* Drop per-process text references on this mount */
			allproc_scan(&unmount_allproc_cb, mp, 0);
		}

		cache_clearmntcache();
		if ((ncp = mp->mnt_ncmountpt.ncp) != NULL &&
		    (ncp->nc_refs != 1 || TAILQ_FIRST(&ncp->nc_list))) {

			if ((flags & MNT_FORCE) == 0) {
				error = EBUSY;
				mount_warning(mp, "Cannot unmount: "
						  "%d namecache "
						  "references still "
						  "present",
						  ncp->nc_refs - 1);
			} else {
				mount_warning(mp, "Forced unmount: "
						  "%d namecache "
						  "references still "
						  "present",
						  ncp->nc_refs - 1);
				freeok = 0;
			}
		}
	}

	/*
	 * Decommission our special mnt_syncer vnode.  This also stops
	 * the vnlru code.  If we are unable to unmount we recommission
	 * the vnode.
	 *
	 * Then sync the filesystem.
	 */
	if ((vp = mp->mnt_syncer) != NULL) {
		mp->mnt_syncer = NULL;
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
		vrele(vp);
	}

	if (quickhalt == 0) {
		if ((mp->mnt_flag & MNT_RDONLY) == 0)
			VFS_SYNC(mp, MNT_WAIT);
	}

	/*
	 * nchandle records ref the mount structure.  Expect a count of 1
	 * (our mount->mnt_ncmountpt).
	 *
	 * Scans can get temporary refs on a mountpoint (though really
	 * heavy duty stuff like cache_findmount() do not).
	 */
	if (mp->mnt_refs != 1)
		cache_clearmntcache();
	for (retry = 0; retry < 10 && mp->mnt_refs != 1; ++retry) {
		cache_unmounting(mp);
		tsleep(&mp->mnt_refs, 0, "mntbsy", hz / 10 + 1);
		cache_clearmntcache();
	}
	if (mp->mnt_refs != 1) {
		if ((flags & MNT_FORCE) == 0) {
			mount_warning(mp, "Cannot unmount: "
					  "%d mount refs still present",
					  mp->mnt_refs - 1);
			error = EBUSY;
		} else {
			mount_warning(mp, "Forced unmount: "
					  "%d mount refs still present",
					  mp->mnt_refs - 1);
			freeok = 0;
		}
	}

	/*
	 * So far so good, sync the filesystem once more and
	 * call the VFS unmount code if the sync succeeds.
	 */
	if (error == 0 && quickhalt == 0) {
		if (mp->mnt_flag & MNT_RDONLY) {
			error = VFS_UNMOUNT(mp, flags);
		} else {
			error = VFS_SYNC(mp, MNT_WAIT);
			if ((error == 0) ||
			    (error == EOPNOTSUPP) || /* No sync */
			    (flags & MNT_FORCE)) {
				error = VFS_UNMOUNT(mp, flags);
			}
		}
	}

	/*
	 * If an error occurred we can still recover, restoring the
	 * syncer vnode and misc flags.
	 */
	if (error) {
		if (mp->mnt_syncer == NULL)
			vfs_allocate_syncvnode(mp);
		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
		mp->mnt_flag |= async_flag;
		lockmgr(&mp->mnt_lock, LK_RELEASE);
		if (mp->mnt_kern_flag & MNTK_MWAIT) {
			mp->mnt_kern_flag &= ~MNTK_MWAIT;
			wakeup(mp);
		}
		goto out;
	}
	/*
	 * Clean up any journals still associated with the mount after
	 * filesystem activity has ceased.
	 */
	journal_remove_all_journals(mp,
	    ((flags & MNT_FORCE) ? MC_JOURNAL_STOP_IMM : 0));

	mountlist_remove(mp);

	/*
	 * Remove any installed vnode ops here so the individual VFSs don't
	 * have to.
	 *
	 * mnt_refs should go to zero when we scrap mnt_ncmountpt.
	 *
	 * When quickhalting we have to keep these intact because the
	 * underlying vnodes have not been destroyed, and some might be
	 * dirty.
	 */
	if (quickhalt == 0) {
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
	}

	if (mp->mnt_ncmountpt.ncp != NULL) {
		nch = mp->mnt_ncmountpt;
		cache_zero(&mp->mnt_ncmountpt);
		cache_clrmountpt(&nch);
		cache_drop(&nch);
	}
	if (mp->mnt_ncmounton.ncp != NULL) {
		cache_unmounting(mp);
		nch = mp->mnt_ncmounton;
		cache_zero(&mp->mnt_ncmounton);
		cache_clrmountpt(&nch);
		cache_drop(&nch);
	}

	mp->mnt_vfc->vfc_refcount--;

	/*
	 * If not quickhalting the mount, we expect there to be no
	 * vnodes left.
	 */
	if (quickhalt == 0 && !TAILQ_EMPTY(&mp->mnt_nvnodelist))
		panic("unmount: dangling vnode");

	/*
	 * Release the lock
	 */
	lockmgr(&mp->mnt_lock, LK_RELEASE);
	if (mp->mnt_kern_flag & MNTK_MWAIT) {
		mp->mnt_kern_flag &= ~MNTK_MWAIT;
		wakeup(mp);
	}

	/*
	 * If we reach here and freeok != 0 we must free the mount.
	 * mnt_refs should already have dropped to 0, so if it is not
	 * zero we must cycle the caches and wait.
	 *
	 * When we are satisfied that the mount has disconnected we can
	 * drop the hold on the mp that represented the mount (though the
	 * caller might actually have another, so the caller's drop may
	 * do the actual free).
	 */
	if (freeok) {
		if (mp->mnt_refs > 0)
			cache_clearmntcache();
		while (mp->mnt_refs > 0) {
			cache_unmounting(mp);
			wakeup(mp);
			tsleep(&mp->mnt_refs, 0, "umntrwait", hz / 10 + 1);
			cache_clearmntcache();
		}
		lwkt_reltoken(&mp->mnt_token);
		mount_drop(mp);
		mp = NULL;
	} else {
		cache_clearmntcache();
	}
	error = 0;
	KNOTE(&fs_klist, VQ_UNMOUNT);
out:
	if (mp)
		lwkt_reltoken(&mp->mnt_token);
	return (error);
}
962 
/*
 * Emit a kernel warning about a mount/unmount problem, prefixed with
 * the mount's resolved path when available, otherwise with the mount
 * pointer (and last path component, if known).
 */
static
void
mount_warning(struct mount *mp, const char *ctl, ...)
{
	char *ptr;
	char *buf;
	__va_list va;

	__va_start(va, ctl);
	if (cache_fullpath(NULL, &mp->mnt_ncmounton, NULL,
			   &ptr, &buf, 0) == 0) {
		kprintf("unmount(%s): ", ptr);
		kvprintf(ctl, va);
		kprintf("\n");
		/* cache_fullpath() allocated buf; we must free it */
		kfree(buf, M_TEMP);
	} else {
		kprintf("unmount(%p", mp);
		if (mp->mnt_ncmounton.ncp && mp->mnt_ncmounton.ncp->nc_name)
			kprintf(",%s", mp->mnt_ncmounton.ncp->nc_name);
		kprintf("): ");
		kvprintf(ctl, va);
		kprintf("\n");
	}
	__va_end(va);
}
988 
989 /*
990  * Shim cache_fullpath() to handle the case where a process is chrooted into
991  * a subdirectory of a mount.  In this case if the root mount matches the
992  * process root directory's mount we have to specify the process's root
993  * directory instead of the mount point, because the mount point might
994  * be above the root directory.
995  */
996 static
997 int
998 mount_path(struct proc *p, struct mount *mp, char **rb, char **fb)
999 {
1000 	struct nchandle *nch;
1001 
1002 	if (p && p->p_fd->fd_nrdir.mount == mp)
1003 		nch = &p->p_fd->fd_nrdir;
1004 	else
1005 		nch = &mp->mnt_ncmountpt;
1006 	return(cache_fullpath(p, nch, NULL, rb, fb, 0));
1007 }
1008 
1009 /*
1010  * Sync each mounted filesystem.
1011  */
1012 
1013 #ifdef DEBUG
1014 static int syncprt = 0;
1015 SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
1016 #endif /* DEBUG */
1017 
1018 static int sync_callback(struct mount *mp, void *data);
1019 
int
sys_sync(struct sync_args *uap)
{
	/* Flush every mounted filesystem in mount order; cannot fail */
	mountlist_scan(sync_callback, NULL, MNTSCAN_FORWARD);
	return (0);
}
1026 
1027 static
1028 int
1029 sync_callback(struct mount *mp, void *data __unused)
1030 {
1031 	int asyncflag;
1032 
1033 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1034 		lwkt_gettoken(&mp->mnt_token);
1035 		asyncflag = mp->mnt_flag & MNT_ASYNC;
1036 		mp->mnt_flag &= ~MNT_ASYNC;
1037 		lwkt_reltoken(&mp->mnt_token);
1038 		vfs_msync(mp, MNT_NOWAIT);
1039 		VFS_SYNC(mp, MNT_NOWAIT);
1040 		lwkt_gettoken(&mp->mnt_token);
1041 		mp->mnt_flag |= asyncflag;
1042 		lwkt_reltoken(&mp->mnt_token);
1043 	}
1044 	return(0);
1045 }
1046 
1047 /* XXX PRISON: could be per prison flag */
1048 static int prison_quotas;
1049 #if 0
1050 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
1051 #endif
1052 
1053 /*
1054  *  quotactl_args(char *path, int fcmd, int uid, caddr_t arg)
1055  *
1056  * Change filesystem quotas.
1057  *
1058  * MPALMOSTSAFE
1059  */
1060 int
1061 sys_quotactl(struct quotactl_args *uap)
1062 {
1063 	struct nlookupdata nd;
1064 	struct thread *td;
1065 	struct mount *mp;
1066 	int error;
1067 
1068 	td = curthread;
1069 	if (td->td_ucred->cr_prison && !prison_quotas) {
1070 		error = EPERM;
1071 		goto done;
1072 	}
1073 
1074 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1075 	if (error == 0)
1076 		error = nlookup(&nd);
1077 	if (error == 0) {
1078 		mp = nd.nl_nch.mount;
1079 		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid,
1080 				    uap->arg, nd.nl_cred);
1081 	}
1082 	nlookup_done(&nd);
1083 done:
1084 	return (error);
1085 }
1086 
1087 /*
1088  * mountctl(char *path, int op, int fd, const void *ctl, int ctllen,
1089  *		void *buf, int buflen)
1090  *
1091  * This function operates on a mount point and executes the specified
1092  * operation using the specified control data, and possibly returns data.
1093  *
1094  * The actual number of bytes stored in the result buffer is returned, 0
1095  * if none, otherwise an error is returned.
1096  *
1097  * MPALMOSTSAFE
1098  */
int
sys_mountctl(struct mountctl_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	void *ctl = NULL;		/* kernel copy of the control input */
	void *buf = NULL;		/* kernel result buffer */
	char *path = NULL;		/* kernel copy of the mount path */
	int error;

	/*
	 * Sanity and permissions checks.  We must be root.
	 * MOUNTCTL_MOUNTFLAGS is the one op allowed without PRIV_ROOT.
	 */
	KKASSERT(p);
	if (td->td_ucred->cr_prison != NULL)
		return (EPERM);
	if ((uap->op != MOUNTCTL_MOUNTFLAGS) &&
	    (error = priv_check(td, PRIV_ROOT)) != 0)
		return (error);

	/*
	 * Argument length checks (ctl limited to 1KB, buf to 16KB)
	 */
	if (uap->ctllen < 0 || uap->ctllen > 1024)
		return (EINVAL);
	if (uap->buflen < 0 || uap->buflen > 16 * 1024)
		return (EINVAL);
	if (uap->path == NULL)
		return (EINVAL);

	/*
	 * Allocate the necessary buffers and copyin data
	 */
	path = objcache_get(namei_oc, M_WAITOK);
	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
	if (error)
		goto done;

	if (uap->ctllen) {
		/* +1 zeroed byte — presumably guarantees NUL termination for
		 * string-form controls; TODO confirm consumers rely on it. */
		ctl = kmalloc(uap->ctllen + 1, M_TEMP, M_WAITOK|M_ZERO);
		error = copyin(uap->ctl, ctl, uap->ctllen);
		if (error)
			goto done;
	}
	if (uap->buflen)
		buf = kmalloc(uap->buflen + 1, M_TEMP, M_WAITOK|M_ZERO);

	/*
	 * Validate the descriptor (fd < 0 means "no descriptor supplied")
	 */
	if (uap->fd >= 0) {
		fp = holdfp(p->p_fd, uap->fd, -1);
		if (fp == NULL) {
			error = EBADF;
			goto done;
		}
	} else {
		fp = NULL;
	}

	/*
	 * Execute the internal kernel function and clean up.  On success
	 * sysmsg_result holds the number of result bytes to copy out.
	 */
	error = kern_mountctl(path, uap->op, fp, ctl, uap->ctllen, buf, uap->buflen, &uap->sysmsg_result);
	if (fp)
		fdrop(fp);
	if (error == 0 && uap->sysmsg_result > 0)
		error = copyout(buf, uap->buf, uap->sysmsg_result);
done:
	/* Common cleanup for all paths; NULL checks gate the allocations. */
	if (path)
		objcache_put(namei_oc, path);
	if (ctl)
		kfree(ctl, M_TEMP);
	if (buf)
		kfree(buf, M_TEMP);
	return (error);
}
1177 
1178 /*
1179  * Execute a mount control operation by resolving the path to a mount point
1180  * and calling vop_mountctl().
1181  *
1182  * Use the mount point from the nch instead of the vnode so nullfs mounts
1183  * can properly spike the VOP.
1184  */
int
kern_mountctl(const char *path, int op, struct file *fp,
		const void *ctl, int ctllen,
		void *buf, int buflen, int *res)
{
	struct vnode *vp;
	struct nlookupdata nd;
	struct nchandle nch;
	struct mount *mp;
	int error;

	*res = 0;	/* result byte count, filled in by vop_mountctl() */
	vp = NULL;
	error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_FOLLOW);
	if (error)
		return (error);
	error = nlookup(&nd);
	if (error) {
		nlookup_done(&nd);
		return (error);
	}
	/* Obtain a referenced, exclusively-locked vnode for the path. */
	error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
	if (error) {
		nlookup_done(&nd);
		return (error);
	}

	/*
	 * Yes, all this is needed to use the nch.mount below, because
	 * we must maintain a ref on the mount to avoid ripouts (e.g.
	 * due to heavy mount/unmount use by synth or poudriere).
	 *
	 * Steal the nchandle out of nd (cache_zero prevents nlookup_done
	 * from dropping it), unlock it, and keep only the reference.
	 */
	nch = nd.nl_nch;
	cache_zero(&nd.nl_nch);
	cache_unlock(&nch);
	nlookup_done(&nd);
	vn_unlock(vp);		/* keep the vnode ref, drop the lock */

	mp = nch.mount;

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & (VROOT|VPFSROOT)) == 0) {
		cache_drop(&nch);
		vrele(vp);
		return (EINVAL);
	}
	/* Refuse if the mount is gone or an unmount is already in progress. */
	if (mp == NULL || mp->mnt_kern_flag & MNTK_UNMOUNT) {
		kprintf("kern_mountctl: Warning, \"%s\" racing unmount\n",
			path);
		cache_drop(&nch);
		vrele(vp);
		return (EINVAL);
	}
	error = vop_mountctl(mp->mnt_vn_use_ops, vp, op, fp, ctl, ctllen,
			     buf, buflen, res);
	vrele(vp);
	cache_drop(&nch);

	return (error);
}
1247 
1248 int
1249 kern_statfs(struct nlookupdata *nd, struct statfs *buf)
1250 {
1251 	struct thread *td = curthread;
1252 	struct proc *p = td->td_proc;
1253 	struct mount *mp;
1254 	struct statfs *sp;
1255 	char *fullpath, *freepath;
1256 	int error;
1257 
1258 	if ((error = nlookup(nd)) != 0)
1259 		return (error);
1260 	mp = nd->nl_nch.mount;
1261 	sp = &mp->mnt_stat;
1262 	if ((error = VFS_STATFS(mp, sp, nd->nl_cred)) != 0)
1263 		return (error);
1264 
1265 	error = mount_path(p, mp, &fullpath, &freepath);
1266 	if (error)
1267 		return(error);
1268 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1269 	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1270 	kfree(freepath, M_TEMP);
1271 
1272 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1273 	bcopy(sp, buf, sizeof(*buf));
1274 	/* Only root should have access to the fsid's. */
1275 	if (priv_check(td, PRIV_ROOT))
1276 		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
1277 	return (0);
1278 }
1279 
1280 /*
1281  * statfs_args(char *path, struct statfs *buf)
1282  *
1283  * Get filesystem statistics.
1284  */
1285 int
1286 sys_statfs(struct statfs_args *uap)
1287 {
1288 	struct nlookupdata nd;
1289 	struct statfs buf;
1290 	int error;
1291 
1292 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1293 	if (error == 0)
1294 		error = kern_statfs(&nd, &buf);
1295 	nlookup_done(&nd);
1296 	if (error == 0)
1297 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1298 	return (error);
1299 }
1300 
/*
 * Backend for fstatfs(): return statfs information for the filesystem
 * backing file descriptor fd in *buf.
 */
int
kern_fstatfs(int fd, struct statfs *buf)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct mount *mp;
	struct statfs *sp;
	char *fullpath, *freepath;
	int error;

	KKASSERT(p);
	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
		return (error);

	/*
	 * Try to use mount info from any overlays rather than the
	 * mount info for the underlying vnode, otherwise we will
	 * fail when operating on null-mounted paths inside a chroot.
	 */
	if ((mp = fp->f_nchandle.mount) == NULL)
		mp = ((struct vnode *)fp->f_data)->v_mount;
	if (mp == NULL) {
		error = EBADF;
		goto done;
	}
	if (fp->f_cred == NULL) {
		error = EINVAL;
		goto done;
	}
	sp = &mp->mnt_stat;
	/* Stats are gathered with the credentials the file was opened with. */
	if ((error = VFS_STATFS(mp, sp, fp->f_cred)) != 0)
		goto done;

	/* Rewrite f_mntonname relative to the caller's root. */
	if ((error = mount_path(p, mp, &fullpath, &freepath)) != 0)
		goto done;
	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
	kfree(freepath, M_TEMP);

	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	bcopy(sp, buf, sizeof(*buf));

	/* Only root should have access to the fsid's. */
	if (priv_check(td, PRIV_ROOT))
		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
	error = 0;
done:
	fdrop(fp);
	return (error);
}
1352 
1353 /*
1354  * fstatfs_args(int fd, struct statfs *buf)
1355  *
1356  * Get filesystem statistics.
1357  */
1358 int
1359 sys_fstatfs(struct fstatfs_args *uap)
1360 {
1361 	struct statfs buf;
1362 	int error;
1363 
1364 	error = kern_fstatfs(uap->fd, &buf);
1365 
1366 	if (error == 0)
1367 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1368 	return (error);
1369 }
1370 
1371 int
1372 kern_statvfs(struct nlookupdata *nd, struct statvfs *buf)
1373 {
1374 	struct mount *mp;
1375 	struct statvfs *sp;
1376 	int error;
1377 
1378 	if ((error = nlookup(nd)) != 0)
1379 		return (error);
1380 	mp = nd->nl_nch.mount;
1381 	sp = &mp->mnt_vstat;
1382 	if ((error = VFS_STATVFS(mp, sp, nd->nl_cred)) != 0)
1383 		return (error);
1384 
1385 	sp->f_flag = 0;
1386 	if (mp->mnt_flag & MNT_RDONLY)
1387 		sp->f_flag |= ST_RDONLY;
1388 	if (mp->mnt_flag & MNT_NOSUID)
1389 		sp->f_flag |= ST_NOSUID;
1390 	bcopy(sp, buf, sizeof(*buf));
1391 	return (0);
1392 }
1393 
/*
 * statvfs_args(char *path, struct statvfs *buf)
 *
 * Get filesystem statistics.
 */
1399 int
1400 sys_statvfs(struct statvfs_args *uap)
1401 {
1402 	struct nlookupdata nd;
1403 	struct statvfs buf;
1404 	int error;
1405 
1406 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1407 	if (error == 0)
1408 		error = kern_statvfs(&nd, &buf);
1409 	nlookup_done(&nd);
1410 	if (error == 0)
1411 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1412 	return (error);
1413 }
1414 
/*
 * Backend for fstatvfs(): return statvfs information for the filesystem
 * backing file descriptor fd in *buf.
 */
int
kern_fstatvfs(int fd, struct statvfs *buf)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct mount *mp;
	struct statvfs *sp;
	int error;

	KKASSERT(p);
	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
		return (error);
	/* Prefer overlay mount info (see kern_fstatfs for rationale). */
	if ((mp = fp->f_nchandle.mount) == NULL)
		mp = ((struct vnode *)fp->f_data)->v_mount;
	if (mp == NULL) {
		error = EBADF;
		goto done;
	}
	if (fp->f_cred == NULL) {
		error = EINVAL;
		goto done;
	}
	sp = &mp->mnt_vstat;
	/* Stats are gathered with the credentials the file was opened with. */
	if ((error = VFS_STATVFS(mp, sp, fp->f_cred)) != 0)
		goto done;

	/* Translate the mount flags into the user-visible flag word. */
	sp->f_flag = 0;
	if (mp->mnt_flag & MNT_RDONLY)
		sp->f_flag |= ST_RDONLY;
	if (mp->mnt_flag & MNT_NOSUID)
		sp->f_flag |= ST_NOSUID;

	bcopy(sp, buf, sizeof(*buf));
	error = 0;
done:
	fdrop(fp);
	return (error);
}
1454 
/*
 * fstatvfs_args(int fd, struct statvfs *buf)
 *
 * Get filesystem statistics.
 */
1460 int
1461 sys_fstatvfs(struct fstatvfs_args *uap)
1462 {
1463 	struct statvfs buf;
1464 	int error;
1465 
1466 	error = kern_fstatvfs(uap->fd, &buf);
1467 
1468 	if (error == 0)
1469 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1470 	return (error);
1471 }
1472 
1473 /*
1474  * getfsstat_args(struct statfs *buf, long bufsize, int flags)
1475  *
1476  * Get statistics on all filesystems.
1477  */
1478 
/*
 * Context passed from sys_getfsstat() to getfsstat_callback() via
 * mountlist_scan().
 */
struct getfsstat_info {
	struct statfs *sfsp;	/* next user slot to fill; NULL = count only */
	long count;		/* mounts visited (may exceed maxcount) */
	long maxcount;		/* user buffer capacity in entries */
	int error;		/* first error hit by the callback */
	int flags;		/* MNT_WAIT/MNT_NOWAIT/MNT_LAZY from caller */
	struct thread *td;	/* caller, for creds and chroot visibility */
};
1487 
1488 static int getfsstat_callback(struct mount *, void *);
1489 
1490 int
1491 sys_getfsstat(struct getfsstat_args *uap)
1492 {
1493 	struct thread *td = curthread;
1494 	struct getfsstat_info info;
1495 
1496 	bzero(&info, sizeof(info));
1497 
1498 	info.maxcount = uap->bufsize / sizeof(struct statfs);
1499 	info.sfsp = uap->buf;
1500 	info.count = 0;
1501 	info.flags = uap->flags;
1502 	info.td = td;
1503 
1504 	mountlist_scan(getfsstat_callback, &info, MNTSCAN_FORWARD);
1505 	if (info.sfsp && info.count > info.maxcount)
1506 		uap->sysmsg_result = info.maxcount;
1507 	else
1508 		uap->sysmsg_result = info.count;
1509 	return (info.error);
1510 }
1511 
/*
 * mountlist_scan() callback for sys_getfsstat(): copy one mount's statfs
 * into the next user buffer slot, skipping mounts not visible from the
 * caller's chroot.  Returns -1 to abort the scan on hard errors.
 */
static int
getfsstat_callback(struct mount *mp, void *data)
{
	struct getfsstat_info *info = data;
	struct statfs *sp;
	char *freepath;
	char *fullpath;
	int error;

	if (info->sfsp && info->count < info->maxcount) {
		/* Skip mounts outside the caller's chroot. */
		if (info->td->td_proc &&
		    !chroot_visible_mnt(mp, info->td->td_proc)) {
			return(0);
		}
		sp = &mp->mnt_stat;

		/*
		 * If MNT_NOWAIT or MNT_LAZY is specified, do not
		 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
		 * overrides MNT_WAIT.
		 */
		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
		    (info->flags & MNT_WAIT)) &&
		    (error = VFS_STATFS(mp, sp, info->td->td_ucred))) {
			/* Stat failure just skips this mount. */
			return(0);
		}
		sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;

		/* Rewrite f_mntonname relative to the caller's root. */
		error = mount_path(info->td->td_proc, mp, &fullpath, &freepath);
		if (error) {
			info->error = error;
			return(-1);
		}
		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
		strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
		kfree(freepath, M_TEMP);

		error = copyout(sp, info->sfsp, sizeof(*sp));
		if (error) {
			info->error = error;
			return (-1);
		}
		++info->sfsp;
	}
	/* Counted even when the buffer is full so the total is reported. */
	info->count++;
	return(0);
}
1559 
/*
 * getvfsstat_args(struct statfs *buf, struct statvfs *vbuf,
 *		   long vbufsize, int flags)
 *
 * Get statistics on all filesystems.
 */
1566 
/*
 * Context passed from sys_getvfsstat() to getvfsstat_callback() via
 * mountlist_scan().
 */
struct getvfsstat_info {
	struct statfs *sfsp;	/* next statfs slot in the user buffer */
	struct statvfs *vsfsp;	/* next statvfs slot; NULL = count only */
	long count;		/* mounts visited (may exceed maxcount) */
	long maxcount;		/* user buffer capacity in entries */
	int error;		/* first error hit by the callback */
	int flags;		/* MNT_WAIT/MNT_NOWAIT/MNT_LAZY from caller */
	struct thread *td;	/* caller, for creds and chroot visibility */
};
1576 
1577 static int getvfsstat_callback(struct mount *, void *);
1578 
1579 int
1580 sys_getvfsstat(struct getvfsstat_args *uap)
1581 {
1582 	struct thread *td = curthread;
1583 	struct getvfsstat_info info;
1584 
1585 	bzero(&info, sizeof(info));
1586 
1587 	info.maxcount = uap->vbufsize / sizeof(struct statvfs);
1588 	info.sfsp = uap->buf;
1589 	info.vsfsp = uap->vbuf;
1590 	info.count = 0;
1591 	info.flags = uap->flags;
1592 	info.td = td;
1593 
1594 	mountlist_scan(getvfsstat_callback, &info, MNTSCAN_FORWARD);
1595 	if (info.vsfsp && info.count > info.maxcount)
1596 		uap->sysmsg_result = info.maxcount;
1597 	else
1598 		uap->sysmsg_result = info.count;
1599 	return (info.error);
1600 }
1601 
/*
 * mountlist_scan() callback for sys_getvfsstat(): copy one mount's
 * statfs AND statvfs data into the next user buffer slots, skipping
 * mounts not visible from the caller's chroot.  Returns -1 to abort
 * the scan on hard errors.
 */
static int
getvfsstat_callback(struct mount *mp, void *data)
{
	struct getvfsstat_info *info = data;
	struct statfs *sp;
	struct statvfs *vsp;
	char *freepath;
	char *fullpath;
	int error;

	if (info->vsfsp && info->count < info->maxcount) {
		/* Skip mounts outside the caller's chroot. */
		if (info->td->td_proc &&
		    !chroot_visible_mnt(mp, info->td->td_proc)) {
			return(0);
		}
		sp = &mp->mnt_stat;
		vsp = &mp->mnt_vstat;

		/*
		 * If MNT_NOWAIT or MNT_LAZY is specified, do not
		 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
		 * overrides MNT_WAIT.
		 */
		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
		    (info->flags & MNT_WAIT)) &&
		    (error = VFS_STATFS(mp, sp, info->td->td_ucred))) {
			return(0);
		}
		sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;

		/* Same refresh policy for the statvfs side. */
		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
		    (info->flags & MNT_WAIT)) &&
		    (error = VFS_STATVFS(mp, vsp, info->td->td_ucred))) {
			return(0);
		}
		vsp->f_flag = 0;
		if (mp->mnt_flag & MNT_RDONLY)
			vsp->f_flag |= ST_RDONLY;
		if (mp->mnt_flag & MNT_NOSUID)
			vsp->f_flag |= ST_NOSUID;

		/* Rewrite f_mntonname relative to the caller's root. */
		error = mount_path(info->td->td_proc, mp, &fullpath, &freepath);
		if (error) {
			info->error = error;
			return(-1);
		}
		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
		strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
		kfree(freepath, M_TEMP);

		error = copyout(sp, info->sfsp, sizeof(*sp));
		if (error == 0)
			error = copyout(vsp, info->vsfsp, sizeof(*vsp));
		if (error) {
			info->error = error;
			return (-1);
		}
		++info->sfsp;
		++info->vsfsp;
	}
	/* Counted even when the buffer is full so the total is reported. */
	info->count++;
	return(0);
}
1665 
1666 
1667 /*
1668  * fchdir_args(int fd)
1669  *
1670  * Change current working directory to a given file descriptor.
1671  */
1672 int
1673 sys_fchdir(struct fchdir_args *uap)
1674 {
1675 	struct thread *td = curthread;
1676 	struct proc *p = td->td_proc;
1677 	struct filedesc *fdp = p->p_fd;
1678 	struct vnode *vp, *ovp;
1679 	struct mount *mp;
1680 	struct file *fp;
1681 	struct nchandle nch, onch, tnch;
1682 	int error;
1683 
1684 	if ((error = holdvnode(fdp, uap->fd, &fp)) != 0)
1685 		return (error);
1686 	lwkt_gettoken(&p->p_token);
1687 	vp = (struct vnode *)fp->f_data;
1688 	vref(vp);
1689 	vn_lock(vp, LK_SHARED | LK_RETRY);
1690 	if (fp->f_nchandle.ncp == NULL)
1691 		error = ENOTDIR;
1692 	else
1693 		error = checkvp_chdir(vp, td);
1694 	if (error) {
1695 		vput(vp);
1696 		goto done;
1697 	}
1698 	cache_copy(&fp->f_nchandle, &nch);
1699 
1700 	/*
1701 	 * If the ncp has become a mount point, traverse through
1702 	 * the mount point.
1703 	 */
1704 
1705 	while (!error && (nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
1706 	       (mp = cache_findmount(&nch)) != NULL
1707 	) {
1708 		error = nlookup_mp(mp, &tnch);
1709 		if (error == 0) {
1710 			cache_unlock(&tnch);	/* leave ref intact */
1711 			vput(vp);
1712 			vp = tnch.ncp->nc_vp;
1713 			error = vget(vp, LK_SHARED);
1714 			KKASSERT(error == 0);
1715 			cache_drop(&nch);
1716 			nch = tnch;
1717 		}
1718 		cache_dropmount(mp);
1719 	}
1720 	if (error == 0) {
1721 		spin_lock(&fdp->fd_spin);
1722 		ovp = fdp->fd_cdir;
1723 		onch = fdp->fd_ncdir;
1724 		fdp->fd_cdir = vp;
1725 		fdp->fd_ncdir = nch;
1726 		spin_unlock(&fdp->fd_spin);
1727 		vn_unlock(vp);		/* leave ref intact */
1728 		cache_drop(&onch);
1729 		vrele(ovp);
1730 	} else {
1731 		cache_drop(&nch);
1732 		vput(vp);
1733 	}
1734 	fdrop(fp);
1735 done:
1736 	lwkt_reltoken(&p->p_token);
1737 	return (error);
1738 }
1739 
/*
 * Backend for chdir(): resolve nd and install the result as the calling
 * process's current directory.  On success the nchandle reference held
 * by nd is transferred into fd_ncdir and nd->nl_nch is zeroed so
 * nlookup_done() (in the caller) does not drop it.
 */
int
kern_chdir(struct nlookupdata *nd)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	struct vnode *vp, *ovp;
	struct nchandle onch;
	int error;

	nd->nl_flags |= NLC_SHAREDLOCK;
	if ((error = nlookup(nd)) != 0)
		return (error);
	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
		return (ENOENT);
	if ((error = vget(vp, LK_SHARED)) != 0)
		return (error);

	lwkt_gettoken(&p->p_token);
	error = checkvp_chdir(vp, td);
	vn_unlock(vp);		/* keep the vref, drop the lock */
	if (error == 0) {
		/* Swap in the new cwd under the fd spinlock. */
		spin_lock(&fdp->fd_spin);
		ovp = fdp->fd_cdir;
		onch = fdp->fd_ncdir;
		fdp->fd_ncdir = nd->nl_nch;
		fdp->fd_cdir = vp;
		spin_unlock(&fdp->fd_spin);
		cache_unlock(&nd->nl_nch);	/* leave reference intact */
		cache_drop(&onch);
		vrele(ovp);
		cache_zero(&nd->nl_nch);
	} else {
		vrele(vp);
	}
	lwkt_reltoken(&p->p_token);
	return (error);
}
1778 
1779 /*
1780  * chdir_args(char *path)
1781  *
1782  * Change current working directory (``.'').
1783  */
1784 int
1785 sys_chdir(struct chdir_args *uap)
1786 {
1787 	struct nlookupdata nd;
1788 	int error;
1789 
1790 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1791 	if (error == 0)
1792 		error = kern_chdir(&nd);
1793 	nlookup_done(&nd);
1794 	return (error);
1795 }
1796 
1797 /*
1798  * Helper function for raised chroot(2) security function:  Refuse if
1799  * any filedescriptors are open directories.
1800  */
1801 static int
1802 chroot_refuse_vdir_fds(struct filedesc *fdp)
1803 {
1804 	struct vnode *vp;
1805 	struct file *fp;
1806 	int error;
1807 	int fd;
1808 
1809 	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
1810 		if ((error = holdvnode(fdp, fd, &fp)) != 0)
1811 			continue;
1812 		vp = (struct vnode *)fp->f_data;
1813 		if (vp->v_type != VDIR) {
1814 			fdrop(fp);
1815 			continue;
1816 		}
1817 		fdrop(fp);
1818 		return(EPERM);
1819 	}
1820 	return (0);
1821 }
1822 
1823 /*
1824  * This sysctl determines if we will allow a process to chroot(2) if it
1825  * has a directory open:
1826  *	0: disallowed for all processes.
1827  *	1: allowed for processes that were not already chroot(2)'ed.
1828  *	2: allowed for all processes.
1829  */
1830 
/* Default 1: permit open dirs only for not-yet-chrooted processes. */
static int chroot_allow_open_directories = 1;

SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
     &chroot_allow_open_directories, 0, "");
1835 
1836 /*
1837  * chroot to the specified namecache entry.  We obtain the vp from the
1838  * namecache data.  The passed ncp must be locked and referenced and will
1839  * remain locked and referenced on return.
1840  */
int
kern_chroot(struct nchandle *nch)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	struct vnode *vp;
	int error;

	/*
	 * Only privileged user can chroot
	 */
	error = priv_check_cred(td->td_ucred, PRIV_VFS_CHROOT, 0);
	if (error)
		return (error);

	/*
	 * Disallow open directory descriptors (fchdir() breakouts).
	 */
	if (chroot_allow_open_directories == 0 ||
	   (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
		if ((error = chroot_refuse_vdir_fds(fdp)) != 0)
			return (error);
	}
	if ((vp = nch->ncp->nc_vp) == NULL)
		return (ENOENT);

	if ((error = vget(vp, LK_SHARED)) != 0)
		return (error);

	/*
	 * Check the validity of vp as a directory to change to and
	 * associate it with rdir/jdir.
	 */
	error = checkvp_chdir(vp, td);
	vn_unlock(vp);			/* leave reference intact */
	if (error == 0) {
		lwkt_gettoken(&p->p_token);
		vrele(fdp->fd_rdir);
		fdp->fd_rdir = vp;	/* reference inherited by fd_rdir */
		cache_drop(&fdp->fd_nrdir);
		cache_copy(nch, &fdp->fd_nrdir);
		/* Record the first chroot directory as jdir if not yet set. */
		if (fdp->fd_jdir == NULL) {
			fdp->fd_jdir = vp;
			vref(fdp->fd_jdir);
			cache_copy(nch, &fdp->fd_njdir);
		}
		if ((p->p_flags & P_DIDCHROOT) == 0) {
			p->p_flags |= P_DIDCHROOT;
			/* NOTE(review): p_depth bump appears to track chroot
			 * nesting for path resolution heuristics — confirm. */
			if (p->p_depth <= 65535 - 32)
				p->p_depth += 32;
		}
		lwkt_reltoken(&p->p_token);
	} else {
		vrele(vp);
	}
	return (error);
}
1899 
1900 /*
1901  * chroot_args(char *path)
1902  *
1903  * Change notion of root (``/'') directory.
1904  */
1905 int
1906 sys_chroot(struct chroot_args *uap)
1907 {
1908 	struct thread *td __debugvar = curthread;
1909 	struct nlookupdata nd;
1910 	int error;
1911 
1912 	KKASSERT(td->td_proc);
1913 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1914 	if (error == 0) {
1915 		nd.nl_flags |= NLC_EXEC;
1916 		error = nlookup(&nd);
1917 		if (error == 0)
1918 			error = kern_chroot(&nd.nl_nch);
1919 	}
1920 	nlookup_done(&nd);
1921 	return(error);
1922 }
1923 
/*
 * chroot_kernel(char *path)
 *
 * Set the system-wide root vnode/nchandle (vfs_cache_setroot) to the
 * given path.  Requires PRIV_VFS_CHROOT.
 */
int
sys_chroot_kernel(struct chroot_kernel_args *uap)
{
	struct thread *td = curthread;
	struct nlookupdata nd;
	struct nchandle *nch;
	struct vnode *vp;
	int error;

	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	if (error)
		goto error_nond;

	error = nlookup(&nd);
	if (error)
		goto error_out;

	nch = &nd.nl_nch;

	error = priv_check_cred(td->td_ucred, PRIV_VFS_CHROOT, 0);
	if (error)
		goto error_out;

	/* Existence check only; vp is re-obtained with a ref below. */
	if ((vp = nch->ncp->nc_vp) == NULL) {
		error = ENOENT;
		goto error_out;
	}

	if ((error = cache_vref(nch, nd.nl_cred, &vp)) != 0)
		goto error_out;

	kprintf("chroot_kernel: set new rootnch/rootvnode to %s\n", uap->path);
	/* vfs_cache_setroot takes over the vp ref and the held nch copy. */
	vfs_cache_setroot(vp, cache_hold(nch));

error_out:
	nlookup_done(&nd);
error_nond:
	return(error);
}
1963 
1964 /*
1965  * Common routine for chroot and chdir.  Given a locked, referenced vnode,
1966  * determine whether it is legal to chdir to the vnode.  The vnode's state
1967  * is not changed by this call.
1968  */
1969 static int
1970 checkvp_chdir(struct vnode *vp, struct thread *td)
1971 {
1972 	int error;
1973 
1974 	if (vp->v_type != VDIR)
1975 		error = ENOTDIR;
1976 	else
1977 		error = VOP_EACCESS(vp, VEXEC, td->td_ucred);
1978 	return (error);
1979 }
1980 
/*
 * Backend for open(2)/openat(2).
 *
 * nd:     initialized nlookupdata describing the path (consumed here via
 *         nlookup_done()).
 * oflags: user O_xxx flags; converted to kernel fflags via FFLAGS().
 * mode:   creation permission bits, filtered through the process cmask.
 * res:    receives the new descriptor index on success.
 */
int
kern_open(struct nlookupdata *nd, int oflags, int mode, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct lwp *lp = td->td_lwp;
	struct filedesc *fdp = p->p_fd;
	int cmode, flags;
	struct file *nfp;
	struct file *fp;
	struct vnode *vp;
	int type, indx, error = 0;
	struct flock lf;

	/* O_ACCMODE == both O_RDONLY|O_WRONLY bits set: invalid combo. */
	if ((oflags & O_ACCMODE) == O_ACCMODE)
		return (EINVAL);
	flags = FFLAGS(oflags);
	error = falloc(lp, &nfp, NULL);
	if (error)
		return (error);
	fp = nfp;
	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;

	/*
	 * XXX p_dupfd is a real mess.  It allows a device to return a
	 * file descriptor to be duplicated rather then doing the open
	 * itself.
	 */
	lp->lwp_dupfd = -1;

	/*
	 * Call vn_open() to do the lookup and assign the vnode to the
	 * file pointer.  vn_open() does not change the ref count on fp
	 * and the vnode, on success, will be inherited by the file pointer
	 * and unlocked.
	 *
	 * Request a shared lock on the vnode if possible.
	 *
	 * Executable binaries can race VTEXT against O_RDWR opens, so
	 * use an exclusive lock for O_RDWR opens as well.
	 *
	 * NOTE: We need a flag to separate terminal vnode locking from
	 *	 parent locking.  O_CREAT needs parent locking, but O_TRUNC
	 *	 and O_RDWR only need to lock the terminal vnode exclusively.
	 */
	nd->nl_flags |= NLC_LOCKVP;
	if ((flags & (O_CREAT|O_TRUNC|O_RDWR)) == 0)
		nd->nl_flags |= NLC_SHAREDLOCK;

	error = vn_open(nd, fp, flags, cmode);
	nlookup_done(nd);

	if (error) {
		/*
		 * handle special fdopen() case.  bleh.  dupfdopen() is
		 * responsible for dropping the old contents of ofiles[indx]
		 * if it succeeds.
		 *
		 * Note that fsetfd() will add a ref to fp which represents
		 * the fd_files[] assignment.  We must still drop our
		 * reference.
		 */
		if ((error == ENODEV || error == ENXIO) && lp->lwp_dupfd >= 0) {
			if (fdalloc(p, 0, &indx) == 0) {
				error = dupfdopen(fdp, indx, lp->lwp_dupfd, flags, error);
				if (error == 0) {
					*res = indx;
					fdrop(fp);	/* our ref */
					return (0);
				}
				fsetfd(fdp, NULL, indx);
			}
		}
		fdrop(fp);	/* our ref */
		if (error == ERESTART)
			error = EINTR;
		return (error);
	}

	/*
	 * ref the vnode for ourselves so it can't be ripped out from under
	 * is.  XXX need an ND flag to request that the vnode be returned
	 * anyway.
	 *
	 * Reserve a file descriptor but do not assign it until the open
	 * succeeds.
	 */
	vp = (struct vnode *)fp->f_data;
	vref(vp);
	if ((error = fdalloc(p, 0, &indx)) != 0) {
		fdrop(fp);
		vrele(vp);
		return (error);
	}

	/*
	 * If no error occurs the vp will have been assigned to the file
	 * pointer.
	 */
	lp->lwp_dupfd = 0;

	/* Apply any requested advisory lock before publishing the fd. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		if (flags & O_EXLOCK)
			lf.l_type = F_WRLCK;
		else
			lf.l_type = F_RDLCK;
		if (flags & FNONBLOCK)
			type = 0;
		else
			type = F_WAIT;

		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) {
			/*
			 * lock request failed.  Clean up the reserved
			 * descriptor.
			 */
			vrele(vp);
			fsetfd(fdp, NULL, indx);
			fdrop(fp);
			return (error);
		}
		atomic_set_int(&fp->f_flag, FHASLOCK); /* race ok */
	}
#if 0
	/*
	 * Assert that all regular file vnodes were created with a object.
	 */
	KASSERT(vp->v_type != VREG || vp->v_object != NULL,
		("open: regular file has no backing object after vn_open"));
#endif

	vrele(vp);

	/*
	 * release our private reference, leaving the one associated with the
	 * descriptor table intact.  Close-on-exec is set before the fd is
	 * published so there is no window where it is visible without it.
	 */
	if (oflags & O_CLOEXEC)
		fdp->fd_files[indx].fileflags |= UF_EXCLOSE;
	fsetfd(fdp, fp, indx);
	fdrop(fp);
	*res = indx;
	return (error);
}
2128 
2129 /*
2130  * open_args(char *path, int flags, int mode)
2131  *
2132  * Check permissions, allocate an open file structure,
2133  * and call the device open routine if any.
2134  */
2135 int
2136 sys_open(struct open_args *uap)
2137 {
2138 	struct nlookupdata nd;
2139 	int error;
2140 
2141 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2142 	if (error == 0) {
2143 		error = kern_open(&nd, uap->flags,
2144 				    uap->mode, &uap->sysmsg_result);
2145 	}
2146 	nlookup_done(&nd);
2147 	return (error);
2148 }
2149 
2150 /*
2151  * openat_args(int fd, char *path, int flags, int mode)
2152  */
2153 int
2154 sys_openat(struct openat_args *uap)
2155 {
2156 	struct nlookupdata nd;
2157 	int error;
2158 	struct file *fp;
2159 
2160 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2161 	if (error == 0) {
2162 		error = kern_open(&nd, uap->flags, uap->mode,
2163 					&uap->sysmsg_result);
2164 	}
2165 	nlookup_done_at(&nd, fp);
2166 	return (error);
2167 }
2168 
/*
 * Backend for mknod(2)/mknodat(2): create a special file at the path
 * described by nd.  The S_IFMT bits of mode select the node type and
 * the privilege required; rmajor/rminor supply the device numbers for
 * character/block nodes.
 */
int
kern_mknod(struct nlookupdata *nd, int mode, int rmajor, int rminor)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vnode *vp;
	struct vattr vattr;
	int error;
	int whiteout = 0;

	KKASSERT(p);

	VATTR_NULL(&vattr);
	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
	vattr.va_rmajor = rmajor;
	vattr.va_rminor = rminor;

	/* Map the file type to a vnode type and check the needed privilege. */
	switch (mode & S_IFMT) {
	case S_IFMT:	/* used by badsect to flag bad sectors */
		error = priv_check_cred(td->td_ucred, PRIV_VFS_MKNOD_BAD, 0);
		vattr.va_type = VBAD;
		break;
	case S_IFCHR:
		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
		vattr.va_type = VCHR;
		break;
	case S_IFBLK:
		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
		vattr.va_type = VBLK;
		break;
	case S_IFWHT:
		error = priv_check_cred(td->td_ucred, PRIV_VFS_MKNOD_WHT, 0);
		whiteout = 1;
		break;
	case S_IFDIR:	/* special directories support for HAMMER */
		error = priv_check_cred(td->td_ucred, PRIV_VFS_MKNOD_DIR, 0);
		vattr.va_type = VDIR;
		break;
	default:
		error = EINVAL;
		break;
	}

	if (error)
		return (error);

	bwillinode(1);
	nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
	if ((error = nlookup(nd)) != 0)
		return (error);
	/* The target must not already exist and its dir must be writable. */
	if (nd->nl_nch.ncp->nc_vp)
		return (EEXIST);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);

	if (whiteout) {
		error = VOP_NWHITEOUT(&nd->nl_nch, nd->nl_dvp,
				      nd->nl_cred, NAMEI_CREATE);
	} else {
		vp = NULL;
		error = VOP_NMKNOD(&nd->nl_nch, nd->nl_dvp,
				   &vp, nd->nl_cred, &vattr);
		if (error == 0)
			vput(vp);
	}
	return (error);
}
2236 
2237 /*
2238  * mknod_args(char *path, int mode, int dev)
2239  *
2240  * Create a special file.
2241  */
2242 int
2243 sys_mknod(struct mknod_args *uap)
2244 {
2245 	struct nlookupdata nd;
2246 	int error;
2247 
2248 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2249 	if (error == 0) {
2250 		error = kern_mknod(&nd, uap->mode,
2251 				   umajor(uap->dev), uminor(uap->dev));
2252 	}
2253 	nlookup_done(&nd);
2254 	return (error);
2255 }
2256 
2257 /*
2258  * mknodat_args(int fd, char *path, mode_t mode, dev_t dev)
2259  *
2260  * Create a special file.  The path is relative to the directory associated
2261  * with fd.
2262  */
2263 int
2264 sys_mknodat(struct mknodat_args *uap)
2265 {
2266 	struct nlookupdata nd;
2267 	struct file *fp;
2268 	int error;
2269 
2270 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2271 	if (error == 0) {
2272 		error = kern_mknod(&nd, uap->mode,
2273 				   umajor(uap->dev), uminor(uap->dev));
2274 	}
2275 	nlookup_done_at(&nd, fp);
2276 	return (error);
2277 }
2278 
/*
 * kern_mkfifo: worker for mkfifo(2)/mkfifoat(2).  Creates a FIFO node
 * at the path named by nd with the given mode (masked by the process
 * umask).  Returns 0 or an errno.
 */
int
kern_mkfifo(struct nlookupdata *nd, int mode)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vattr vattr;
	struct vnode *vp;
	int error;

	/* hint the flusher that an inode is about to be created */
	bwillinode(1);

	/* resolve with create intent; NLC_REFDVP supplies nd->nl_dvp */
	nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
	if ((error = nlookup(nd)) != 0)
		return (error);
	/* the target name must not already exist */
	if (nd->nl_nch.ncp->nc_vp)
		return (EEXIST);
	/* reject creation on read-only mounts and the like */
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);

	VATTR_NULL(&vattr);
	vattr.va_type = VFIFO;
	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
	vp = NULL;
	/* on success vp comes back locked+referenced; just release it */
	error = VOP_NMKNOD(&nd->nl_nch, nd->nl_dvp, &vp, nd->nl_cred, &vattr);
	if (error == 0)
		vput(vp);
	return (error);
}
2307 
2308 /*
2309  * mkfifo_args(char *path, int mode)
2310  *
2311  * Create a named pipe.
2312  */
2313 int
2314 sys_mkfifo(struct mkfifo_args *uap)
2315 {
2316 	struct nlookupdata nd;
2317 	int error;
2318 
2319 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2320 	if (error == 0)
2321 		error = kern_mkfifo(&nd, uap->mode);
2322 	nlookup_done(&nd);
2323 	return (error);
2324 }
2325 
2326 /*
2327  * mkfifoat_args(int fd, char *path, mode_t mode)
2328  *
2329  * Create a named pipe.  The path is relative to the directory associated
2330  * with fd.
2331  */
2332 int
2333 sys_mkfifoat(struct mkfifoat_args *uap)
2334 {
2335 	struct nlookupdata nd;
2336 	struct file *fp;
2337 	int error;
2338 
2339 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2340 	if (error == 0)
2341 		error = kern_mkfifo(&nd, uap->mode);
2342 	nlookup_done_at(&nd, fp);
2343 	return (error);
2344 }
2345 
/*
 * Optional restrictions on hard link creation by unprivileged
 * processes, exported as security.hardlink_check_uid and
 * security.hardlink_check_gid.  Both default to off (0) and are
 * consulted by can_hardlink().
 */
static int hardlink_check_uid = 0;
SYSCTL_INT(_security, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
    &hardlink_check_uid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "users");
static int hardlink_check_gid = 0;
SYSCTL_INT(_security, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
    &hardlink_check_gid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "groups");
2356 
2357 static int
2358 can_hardlink(struct vnode *vp, struct thread *td, struct ucred *cred)
2359 {
2360 	struct vattr va;
2361 	int error;
2362 
2363 	/*
2364 	 * Shortcut if disabled
2365 	 */
2366 	if (hardlink_check_uid == 0 && hardlink_check_gid == 0)
2367 		return (0);
2368 
2369 	/*
2370 	 * Privileged user can always hardlink
2371 	 */
2372 	if (priv_check_cred(cred, PRIV_VFS_LINK, 0) == 0)
2373 		return (0);
2374 
2375 	/*
2376 	 * Otherwise only if the originating file is owned by the
2377 	 * same user or group.  Note that any group is allowed if
2378 	 * the file is owned by the caller.
2379 	 */
2380 	error = VOP_GETATTR(vp, &va);
2381 	if (error != 0)
2382 		return (error);
2383 
2384 	if (hardlink_check_uid) {
2385 		if (cred->cr_uid != va.va_uid)
2386 			return (EPERM);
2387 	}
2388 
2389 	if (hardlink_check_gid) {
2390 		if (cred->cr_uid != va.va_uid && !groupmember(va.va_gid, cred))
2391 			return (EPERM);
2392 	}
2393 
2394 	return (0);
2395 }
2396 
/*
 * kern_link: worker for link(2)/linkat(2).  nd names the existing
 * file, linknd the new link to create.  Returns 0 or an errno.
 */
int
kern_link(struct nlookupdata *nd, struct nlookupdata *linknd)
{
	struct thread *td = curthread;
	struct vnode *vp;
	int error;

	/*
	 * Lookup the source and obtained a locked vnode.
	 *
	 * You may only hardlink a file which you have write permission
	 * on or which you own.
	 *
	 * XXX relookup on vget failure / race ?
	 */
	bwillinode(1);
	nd->nl_flags |= NLC_WRITE | NLC_OWN | NLC_HLINK;
	if ((error = nlookup(nd)) != 0)
		return (error);
	vp = nd->nl_nch.ncp->nc_vp;
	KKASSERT(vp != NULL);
	/* hard links to directories are refused */
	if (vp->v_type == VDIR)
		return (EPERM);		/* POSIX */
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	/* take our own reference + exclusive lock on the source vnode */
	if ((error = vget(vp, LK_EXCLUSIVE)) != 0)
		return (error);

	/*
	 * Unlock the source so we can lookup the target without deadlocking
	 * (XXX vp is locked already, possible other deadlock?).  The target
	 * must not exist.
	 */
	KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);
	nd->nl_flags &= ~NLC_NCPISLOCKED;
	cache_unlock(&nd->nl_nch);
	vn_unlock(vp);

	linknd->nl_flags |= NLC_CREATE | NLC_REFDVP;
	if ((error = nlookup(linknd)) != 0) {
		vrele(vp);
		return (error);
	}
	/* the target name must not already exist */
	if (linknd->nl_nch.ncp->nc_vp) {
		vrele(vp);
		return (EEXIST);
	}
	/* re-lock the source (reference retained) for the VOP call */
	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_FAILRECLAIM);
	if (error) {
		vrele(vp);
		return (error);
	}

	/*
	 * Finally run the new API VOP.
	 */
	error = can_hardlink(vp, td, td->td_ucred);
	if (error == 0) {
		error = VOP_NLINK(&linknd->nl_nch, linknd->nl_dvp,
				  vp, linknd->nl_cred);
	}
	vput(vp);
	return (error);
}
2461 
2462 /*
2463  * link_args(char *path, char *link)
2464  *
2465  * Make a hard file link.
2466  */
2467 int
2468 sys_link(struct link_args *uap)
2469 {
2470 	struct nlookupdata nd, linknd;
2471 	int error;
2472 
2473 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2474 	if (error == 0) {
2475 		error = nlookup_init(&linknd, uap->link, UIO_USERSPACE, 0);
2476 		if (error == 0)
2477 			error = kern_link(&nd, &linknd);
2478 		nlookup_done(&linknd);
2479 	}
2480 	nlookup_done(&nd);
2481 	return (error);
2482 }
2483 
2484 /*
2485  * linkat_args(int fd1, char *path1, int fd2, char *path2, int flags)
2486  *
2487  * Make a hard file link. The path1 argument is relative to the directory
2488  * associated with fd1, and similarly the path2 argument is relative to
2489  * the directory associated with fd2.
2490  */
2491 int
2492 sys_linkat(struct linkat_args *uap)
2493 {
2494 	struct nlookupdata nd, linknd;
2495 	struct file *fp1, *fp2;
2496 	int error;
2497 
2498 	error = nlookup_init_at(&nd, &fp1, uap->fd1, uap->path1, UIO_USERSPACE,
2499 	    (uap->flags & AT_SYMLINK_FOLLOW) ? NLC_FOLLOW : 0);
2500 	if (error == 0) {
2501 		error = nlookup_init_at(&linknd, &fp2, uap->fd2,
2502 		    uap->path2, UIO_USERSPACE, 0);
2503 		if (error == 0)
2504 			error = kern_link(&nd, &linknd);
2505 		nlookup_done_at(&linknd, fp2);
2506 	}
2507 	nlookup_done_at(&nd, fp1);
2508 	return (error);
2509 }
2510 
/*
 * kern_symlink: worker for symlink(2)/symlinkat(2).  Creates a symlink
 * at the name resolved by nd whose target text is path, with the given
 * mode.  Returns 0 or an errno.
 */
int
kern_symlink(struct nlookupdata *nd, char *path, int mode)
{
	struct vattr vattr;
	struct vnode *vp;
	struct vnode *dvp;
	int error;

	/* hint the flusher that an inode is about to be created */
	bwillinode(1);
	/* resolve with create intent; NLC_REFDVP supplies nd->nl_dvp */
	nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
	if ((error = nlookup(nd)) != 0)
		return (error);
	/* the link name must not already exist */
	if (nd->nl_nch.ncp->nc_vp)
		return (EEXIST);
	/* reject creation on read-only mounts and the like */
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	dvp = nd->nl_dvp;
	VATTR_NULL(&vattr);
	vattr.va_mode = mode;
	/* on success vp comes back locked+referenced; just release it */
	error = VOP_NSYMLINK(&nd->nl_nch, dvp, &vp, nd->nl_cred, &vattr, path);
	if (error == 0)
		vput(vp);
	return (error);
}
2535 
2536 /*
2537  * symlink(char *path, char *link)
2538  *
2539  * Make a symbolic link.
2540  */
2541 int
2542 sys_symlink(struct symlink_args *uap)
2543 {
2544 	struct thread *td = curthread;
2545 	struct nlookupdata nd;
2546 	char *path;
2547 	int error;
2548 	int mode;
2549 
2550 	path = objcache_get(namei_oc, M_WAITOK);
2551 	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
2552 	if (error == 0) {
2553 		error = nlookup_init(&nd, uap->link, UIO_USERSPACE, 0);
2554 		if (error == 0) {
2555 			mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
2556 			error = kern_symlink(&nd, path, mode);
2557 		}
2558 		nlookup_done(&nd);
2559 	}
2560 	objcache_put(namei_oc, path);
2561 	return (error);
2562 }
2563 
2564 /*
2565  * symlinkat_args(char *path1, int fd, char *path2)
2566  *
2567  * Make a symbolic link.  The path2 argument is relative to the directory
2568  * associated with fd.
2569  */
2570 int
2571 sys_symlinkat(struct symlinkat_args *uap)
2572 {
2573 	struct thread *td = curthread;
2574 	struct nlookupdata nd;
2575 	struct file *fp;
2576 	char *path1;
2577 	int error;
2578 	int mode;
2579 
2580 	path1 = objcache_get(namei_oc, M_WAITOK);
2581 	error = copyinstr(uap->path1, path1, MAXPATHLEN, NULL);
2582 	if (error == 0) {
2583 		error = nlookup_init_at(&nd, &fp, uap->fd, uap->path2,
2584 		    UIO_USERSPACE, 0);
2585 		if (error == 0) {
2586 			mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
2587 			error = kern_symlink(&nd, path1, mode);
2588 		}
2589 		nlookup_done_at(&nd, fp);
2590 	}
2591 	objcache_put(namei_oc, path1);
2592 	return (error);
2593 }
2594 
2595 /*
2596  * undelete_args(char *path)
2597  *
2598  * Delete a whiteout from the filesystem.
2599  */
2600 int
2601 sys_undelete(struct undelete_args *uap)
2602 {
2603 	struct nlookupdata nd;
2604 	int error;
2605 
2606 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2607 	bwillinode(1);
2608 	nd.nl_flags |= NLC_DELETE | NLC_REFDVP;
2609 	if (error == 0)
2610 		error = nlookup(&nd);
2611 	if (error == 0)
2612 		error = ncp_writechk(&nd.nl_nch);
2613 	if (error == 0) {
2614 		error = VOP_NWHITEOUT(&nd.nl_nch, nd.nl_dvp, nd.nl_cred,
2615 				      NAMEI_DELETE);
2616 	}
2617 	nlookup_done(&nd);
2618 	return (error);
2619 }
2620 
2621 int
2622 kern_unlink(struct nlookupdata *nd)
2623 {
2624 	int error;
2625 
2626 	bwillinode(1);
2627 	nd->nl_flags |= NLC_DELETE | NLC_REFDVP;
2628 	if ((error = nlookup(nd)) != 0)
2629 		return (error);
2630 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2631 		return (error);
2632 	error = VOP_NREMOVE(&nd->nl_nch, nd->nl_dvp, nd->nl_cred);
2633 	return (error);
2634 }
2635 
2636 /*
2637  * unlink_args(char *path)
2638  *
2639  * Delete a name from the filesystem.
2640  */
2641 int
2642 sys_unlink(struct unlink_args *uap)
2643 {
2644 	struct nlookupdata nd;
2645 	int error;
2646 
2647 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2648 	if (error == 0)
2649 		error = kern_unlink(&nd);
2650 	nlookup_done(&nd);
2651 	return (error);
2652 }
2653 
2654 
2655 /*
2656  * unlinkat_args(int fd, char *path, int flags)
2657  *
2658  * Delete the file or directory entry pointed to by fd/path.
2659  */
2660 int
2661 sys_unlinkat(struct unlinkat_args *uap)
2662 {
2663 	struct nlookupdata nd;
2664 	struct file *fp;
2665 	int error;
2666 
2667 	if (uap->flags & ~AT_REMOVEDIR)
2668 		return (EINVAL);
2669 
2670 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2671 	if (error == 0) {
2672 		if (uap->flags & AT_REMOVEDIR)
2673 			error = kern_rmdir(&nd);
2674 		else
2675 			error = kern_unlink(&nd);
2676 	}
2677 	nlookup_done_at(&nd, fp);
2678 	return (error);
2679 }
2680 
/*
 * kern_lseek: worker for lseek(2).  Computes and installs the new file
 * offset for fd and returns it in *res.  Returns 0 or an errno.
 */
int
kern_lseek(int fd, off_t offset, int whence, off_t *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct vnode *vp;
	struct vattr vattr;
	off_t new_offset;
	int error;

	fp = holdfp(p->p_fd, fd, -1);
	if (fp == NULL)
		return (EBADF);
	/* seeking only makes sense on vnode-backed descriptors */
	if (fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
		goto done;
	}
	vp = (struct vnode *)fp->f_data;

	/*
	 * Compute the candidate offset.  Every case acquires fp->f_spin
	 * before falling into the common validate/update code below,
	 * which releases it.
	 */
	switch (whence) {
	case L_INCR:
		/* relative to the current offset; read it under the lock */
		spin_lock(&fp->f_spin);
		new_offset = fp->f_offset + offset;
		error = 0;
		break;
	case L_XTND:
		/* relative to end-of-file; needs the current size */
		error = VOP_GETATTR(vp, &vattr);
		spin_lock(&fp->f_spin);
		new_offset = offset + vattr.va_size;
		break;
	case L_SET:
		new_offset = offset;
		error = 0;
		spin_lock(&fp->f_spin);
		break;
	default:
		new_offset = 0;
		error = EINVAL;
		spin_lock(&fp->f_spin);
		break;
	}

	/*
	 * Validate the seek position.  Negative offsets are not allowed
	 * for regular files or directories.
	 *
	 * Normally we would also not want to allow negative offsets for
	 * character and block-special devices.  However kvm addresses
	 * on 64 bit architectures might appear to be negative and must
	 * be allowed.
	 */
	if (error == 0) {
		if (new_offset < 0 &&
		    (vp->v_type == VREG || vp->v_type == VDIR)) {
			error = EINVAL;
		} else {
			fp->f_offset = new_offset;
		}
	}
	/* report the (possibly unchanged) offset, even on error */
	*res = fp->f_offset;
	spin_unlock(&fp->f_spin);
done:
	fdrop(fp);
	return (error);
}
2747 
2748 /*
2749  * lseek_args(int fd, int pad, off_t offset, int whence)
2750  *
2751  * Reposition read/write file offset.
2752  */
2753 int
2754 sys_lseek(struct lseek_args *uap)
2755 {
2756 	int error;
2757 
2758 	error = kern_lseek(uap->fd, uap->offset, uap->whence,
2759 			   &uap->sysmsg_offset);
2760 
2761 	return (error);
2762 }
2763 
2764 /*
2765  * Check if current process can access given file.  amode is a bitmask of *_OK
2766  * access bits.  flags is a bitmask of AT_* flags.
2767  */
int
kern_access(struct nlookupdata *nd, int amode, int flags)
{
	struct vnode *vp;
	int error, mode;

	/* AT_EACCESS is the only flag passed down to VOP_ACCESS_FLAGS */
	if (flags & ~AT_EACCESS)
		return (EINVAL);
	/* a shared namecache lock suffices for a read-only check */
	nd->nl_flags |= NLC_SHAREDLOCK;
	if ((error = nlookup(nd)) != 0)
		return (error);
retry:
	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_SHARED, &vp);
	if (error)
		return (error);

	/* Flags == 0 means only check for existence. */
	if (amode) {
		/* translate user-level *_OK bits into VOP access bits */
		mode = 0;
		if (amode & R_OK)
			mode |= VREAD;
		if (amode & W_OK)
			mode |= VWRITE;
		if (amode & X_OK)
			mode |= VEXEC;
		/* write access additionally requires a writable vnode */
		if ((mode & VWRITE) == 0 ||
		    (error = vn_writechk(vp, &nd->nl_nch)) == 0)
			error = VOP_ACCESS_FLAGS(vp, mode, flags, nd->nl_cred);

		/*
		 * If the file handle is stale we have to re-resolve the
		 * entry with the ncp held exclusively.  This is a hack
		 * at the moment.
		 */
		if (error == ESTALE) {
			vput(vp);
			cache_unlock(&nd->nl_nch);
			cache_lock(&nd->nl_nch);
			cache_setunresolved(&nd->nl_nch);
			error = cache_resolve(&nd->nl_nch, nd->nl_cred);
			if (error == 0) {
				vp = NULL;
				goto retry;
			}
			return(error);
		}
	}
	vput(vp);
	return (error);
}
2818 
2819 /*
2820  * access_args(char *path, int flags)
2821  *
2822  * Check access permissions.
2823  */
2824 int
2825 sys_access(struct access_args *uap)
2826 {
2827 	struct nlookupdata nd;
2828 	int error;
2829 
2830 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2831 	if (error == 0)
2832 		error = kern_access(&nd, uap->flags, 0);
2833 	nlookup_done(&nd);
2834 	return (error);
2835 }
2836 
2837 
2838 /*
2839  * eaccess_args(char *path, int flags)
2840  *
2841  * Check access permissions.
2842  */
2843 int
2844 sys_eaccess(struct eaccess_args *uap)
2845 {
2846 	struct nlookupdata nd;
2847 	int error;
2848 
2849 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2850 	if (error == 0)
2851 		error = kern_access(&nd, uap->flags, AT_EACCESS);
2852 	nlookup_done(&nd);
2853 	return (error);
2854 }
2855 
2856 
2857 /*
2858  * faccessat_args(int fd, char *path, int amode, int flags)
2859  *
2860  * Check access permissions.
2861  */
2862 int
2863 sys_faccessat(struct faccessat_args *uap)
2864 {
2865 	struct nlookupdata nd;
2866 	struct file *fp;
2867 	int error;
2868 
2869 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE,
2870 				NLC_FOLLOW);
2871 	if (error == 0)
2872 		error = kern_access(&nd, uap->amode, uap->flags);
2873 	nlookup_done_at(&nd, fp);
2874 	return (error);
2875 }
2876 
/*
 * kern_stat: resolve nd and fill in *st with the file's attributes.
 * Returns 0 or an errno.
 */
int
kern_stat(struct nlookupdata *nd, struct stat *st)
{
	int error;
	struct vnode *vp;

	/* a shared lock suffices for a read-only attribute fetch */
	nd->nl_flags |= NLC_SHAREDLOCK;
	if ((error = nlookup(nd)) != 0)
		return (error);
again:
	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
		return (ENOENT);

	if ((error = vget(vp, LK_SHARED)) != 0)
		return (error);
	error = vn_stat(vp, st, nd->nl_cred);

	/*
	 * If the file handle is stale we have to re-resolve the
	 * entry with the ncp held exclusively.  This is a hack
	 * at the moment.
	 */
	if (error == ESTALE) {
		vput(vp);
		cache_unlock(&nd->nl_nch);
		cache_lock(&nd->nl_nch);
		cache_setunresolved(&nd->nl_nch);
		error = cache_resolve(&nd->nl_nch, nd->nl_cred);
		if (error == 0)
			goto again;
	} else {
		vput(vp);
	}
	return (error);
}
2912 
2913 /*
2914  * stat_args(char *path, struct stat *ub)
2915  *
2916  * Get file status; this version follows links.
2917  */
2918 int
2919 sys_stat(struct stat_args *uap)
2920 {
2921 	struct nlookupdata nd;
2922 	struct stat st;
2923 	int error;
2924 
2925 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2926 	if (error == 0) {
2927 		error = kern_stat(&nd, &st);
2928 		if (error == 0)
2929 			error = copyout(&st, uap->ub, sizeof(*uap->ub));
2930 	}
2931 	nlookup_done(&nd);
2932 	return (error);
2933 }
2934 
2935 /*
2936  * lstat_args(char *path, struct stat *ub)
2937  *
2938  * Get file status; this version does not follow links.
2939  */
2940 int
2941 sys_lstat(struct lstat_args *uap)
2942 {
2943 	struct nlookupdata nd;
2944 	struct stat st;
2945 	int error;
2946 
2947 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2948 	if (error == 0) {
2949 		error = kern_stat(&nd, &st);
2950 		if (error == 0)
2951 			error = copyout(&st, uap->ub, sizeof(*uap->ub));
2952 	}
2953 	nlookup_done(&nd);
2954 	return (error);
2955 }
2956 
2957 /*
2958  * fstatat_args(int fd, char *path, struct stat *sb, int flags)
2959  *
2960  * Get status of file pointed to by fd/path.
2961  */
2962 int
2963 sys_fstatat(struct fstatat_args *uap)
2964 {
2965 	struct nlookupdata nd;
2966 	struct stat st;
2967 	int error;
2968 	int flags;
2969 	struct file *fp;
2970 
2971 	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
2972 		return (EINVAL);
2973 
2974 	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
2975 
2976 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
2977 				UIO_USERSPACE, flags);
2978 	if (error == 0) {
2979 		error = kern_stat(&nd, &st);
2980 		if (error == 0)
2981 			error = copyout(&st, uap->sb, sizeof(*uap->sb));
2982 	}
2983 	nlookup_done_at(&nd, fp);
2984 	return (error);
2985 }
2986 
2987 static int
2988 kern_pathconf(char *path, int name, int flags, register_t *sysmsg_regp)
2989 {
2990 	struct nlookupdata nd;
2991 	struct vnode *vp;
2992 	int error;
2993 
2994 	vp = NULL;
2995 	error = nlookup_init(&nd, path, UIO_USERSPACE, flags);
2996 	if (error == 0)
2997 		error = nlookup(&nd);
2998 	if (error == 0)
2999 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
3000 	nlookup_done(&nd);
3001 	if (error == 0) {
3002 		error = VOP_PATHCONF(vp, name, sysmsg_regp);
3003 		vput(vp);
3004 	}
3005 	return (error);
3006 }
3007 
3008 /*
3009  * pathconf_Args(char *path, int name)
3010  *
3011  * Get configurable pathname variables.
3012  */
3013 int
3014 sys_pathconf(struct pathconf_args *uap)
3015 {
3016 	return (kern_pathconf(uap->path, uap->name, NLC_FOLLOW,
3017 		&uap->sysmsg_reg));
3018 }
3019 
3020 /*
3021  * lpathconf_Args(char *path, int name)
3022  *
3023  * Get configurable pathname variables, but don't follow symlinks.
3024  */
3025 int
3026 sys_lpathconf(struct lpathconf_args *uap)
3027 {
3028 	return (kern_pathconf(uap->path, uap->name, 0, &uap->sysmsg_reg));
3029 }
3030 
3031 /*
3032  * XXX: daver
3033  * kern_readlink isn't properly split yet.  There is a copyin burried
3034  * in VOP_READLINK().
3035  */
3036 int
3037 kern_readlink(struct nlookupdata *nd, char *buf, int count, int *res)
3038 {
3039 	struct thread *td = curthread;
3040 	struct vnode *vp;
3041 	struct iovec aiov;
3042 	struct uio auio;
3043 	int error;
3044 
3045 	nd->nl_flags |= NLC_SHAREDLOCK;
3046 	if ((error = nlookup(nd)) != 0)
3047 		return (error);
3048 	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_SHARED, &vp);
3049 	if (error)
3050 		return (error);
3051 	if (vp->v_type != VLNK) {
3052 		error = EINVAL;
3053 	} else {
3054 		aiov.iov_base = buf;
3055 		aiov.iov_len = count;
3056 		auio.uio_iov = &aiov;
3057 		auio.uio_iovcnt = 1;
3058 		auio.uio_offset = 0;
3059 		auio.uio_rw = UIO_READ;
3060 		auio.uio_segflg = UIO_USERSPACE;
3061 		auio.uio_td = td;
3062 		auio.uio_resid = count;
3063 		error = VOP_READLINK(vp, &auio, td->td_ucred);
3064 	}
3065 	vput(vp);
3066 	*res = count - auio.uio_resid;
3067 	return (error);
3068 }
3069 
3070 /*
3071  * readlink_args(char *path, char *buf, int count)
3072  *
3073  * Return target name of a symbolic link.
3074  */
3075 int
3076 sys_readlink(struct readlink_args *uap)
3077 {
3078 	struct nlookupdata nd;
3079 	int error;
3080 
3081 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3082 	if (error == 0) {
3083 		error = kern_readlink(&nd, uap->buf, uap->count,
3084 					&uap->sysmsg_result);
3085 	}
3086 	nlookup_done(&nd);
3087 	return (error);
3088 }
3089 
3090 /*
3091  * readlinkat_args(int fd, char *path, char *buf, size_t bufsize)
3092  *
3093  * Return target name of a symbolic link.  The path is relative to the
3094  * directory associated with fd.
3095  */
3096 int
3097 sys_readlinkat(struct readlinkat_args *uap)
3098 {
3099 	struct nlookupdata nd;
3100 	struct file *fp;
3101 	int error;
3102 
3103 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
3104 	if (error == 0) {
3105 		error = kern_readlink(&nd, uap->buf, uap->bufsize,
3106 					&uap->sysmsg_result);
3107 	}
3108 	nlookup_done_at(&nd, fp);
3109 	return (error);
3110 }
3111 
/*
 * setfflags: common backend for the chflags(2) family.  Sets the
 * fs-specific file flags on vp.  Returns 0 or an errno.
 */
static int
setfflags(struct vnode *vp, int flags)
{
	struct thread *td = curthread;
	int error;
	struct vattr vattr;

	/*
	 * Prevent non-root users from setting flags on devices.  When
	 * a device is reused, users can retain ownership of the device
	 * if they are allowed to set flags and programs assume that
	 * chown can't fail when done as root.
	 */
	if ((vp->v_type == VCHR || vp->v_type == VBLK) &&
	    ((error = priv_check_cred(td->td_ucred, PRIV_VFS_CHFLAGS_DEV, 0)) != 0))
		return (error);

	/*
	 * note: vget is required for any operation that might mod the vnode
	 * so VINACTIVE is properly cleared.
	 */
	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
		VATTR_NULL(&vattr);
		vattr.va_flags = flags;
		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
		vput(vp);
	}
	return (error);
}
3141 
3142 /*
3143  * chflags(char *path, int flags)
3144  *
3145  * Change flags of a file given a path name.
3146  */
3147 int
3148 sys_chflags(struct chflags_args *uap)
3149 {
3150 	struct nlookupdata nd;
3151 	struct vnode *vp;
3152 	int error;
3153 
3154 	vp = NULL;
3155 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3156 	if (error == 0)
3157 		error = nlookup(&nd);
3158 	if (error == 0)
3159 		error = ncp_writechk(&nd.nl_nch);
3160 	if (error == 0)
3161 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3162 	nlookup_done(&nd);
3163 	if (error == 0) {
3164 		error = setfflags(vp, uap->flags);
3165 		vrele(vp);
3166 	}
3167 	return (error);
3168 }
3169 
3170 /*
3171  * lchflags(char *path, int flags)
3172  *
3173  * Change flags of a file given a path name, but don't follow symlinks.
3174  */
3175 int
3176 sys_lchflags(struct lchflags_args *uap)
3177 {
3178 	struct nlookupdata nd;
3179 	struct vnode *vp;
3180 	int error;
3181 
3182 	vp = NULL;
3183 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3184 	if (error == 0)
3185 		error = nlookup(&nd);
3186 	if (error == 0)
3187 		error = ncp_writechk(&nd.nl_nch);
3188 	if (error == 0)
3189 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3190 	nlookup_done(&nd);
3191 	if (error == 0) {
3192 		error = setfflags(vp, uap->flags);
3193 		vrele(vp);
3194 	}
3195 	return (error);
3196 }
3197 
3198 /*
3199  * fchflags_args(int fd, int flags)
3200  *
3201  * Change flags of a file given a file descriptor.
3202  */
3203 int
3204 sys_fchflags(struct fchflags_args *uap)
3205 {
3206 	struct thread *td = curthread;
3207 	struct proc *p = td->td_proc;
3208 	struct file *fp;
3209 	int error;
3210 
3211 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
3212 		return (error);
3213 	if (fp->f_nchandle.ncp)
3214 		error = ncp_writechk(&fp->f_nchandle);
3215 	if (error == 0)
3216 		error = setfflags((struct vnode *) fp->f_data, uap->flags);
3217 	fdrop(fp);
3218 	return (error);
3219 }
3220 
3221 /*
3222  * chflagsat_args(int fd, const char *path, int flags, int atflags)
3223  * change flags given a pathname relative to a filedescriptor
3224  */
3225 int sys_chflagsat(struct chflagsat_args *uap)
3226 {
3227 	struct nlookupdata nd;
3228 	struct vnode *vp;
3229 	struct file *fp;
3230 	int error;
3231 	int lookupflags;
3232 
3233 	if (uap->atflags & ~AT_SYMLINK_NOFOLLOW)
3234 		return (EINVAL);
3235 
3236 	lookupflags = (uap->atflags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3237 
3238 	vp = NULL;
3239 	error = nlookup_init_at(&nd, &fp, uap->fd,  uap->path, UIO_USERSPACE, lookupflags);
3240 	if (error == 0)
3241 		error = nlookup(&nd);
3242 	if (error == 0)
3243 		error = ncp_writechk(&nd.nl_nch);
3244 	if (error == 0)
3245 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3246 	nlookup_done_at(&nd, fp);
3247 	if (error == 0) {
3248 		error = setfflags(vp, uap->flags);
3249 		vrele(vp);
3250 	}
3251 	return (error);
3252 }
3253 
3254 
/*
 * setfmode: common backend for the chmod(2) family.  Sets the file
 * permission bits on vp (masked to ALLPERMS).  Returns 0 or an errno.
 */
static int
setfmode(struct vnode *vp, int mode)
{
	struct thread *td = curthread;
	int error;
	struct vattr vattr;

	/*
	 * note: vget is required for any operation that might mod the vnode
	 * so VINACTIVE is properly cleared.
	 */
	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
		VATTR_NULL(&vattr);
		vattr.va_mode = mode & ALLPERMS;
		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
		/* mode changes invalidate cached write/execute-ok state */
		cache_inval_wxok(vp);
		vput(vp);
	}
	return error;
}
3275 
3276 int
3277 kern_chmod(struct nlookupdata *nd, int mode)
3278 {
3279 	struct vnode *vp;
3280 	int error;
3281 
3282 	if ((error = nlookup(nd)) != 0)
3283 		return (error);
3284 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3285 		return (error);
3286 	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
3287 		error = setfmode(vp, mode);
3288 	vrele(vp);
3289 	return (error);
3290 }
3291 
3292 /*
3293  * chmod_args(char *path, int mode)
3294  *
3295  * Change mode of a file given path name.
3296  */
3297 int
3298 sys_chmod(struct chmod_args *uap)
3299 {
3300 	struct nlookupdata nd;
3301 	int error;
3302 
3303 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3304 	if (error == 0)
3305 		error = kern_chmod(&nd, uap->mode);
3306 	nlookup_done(&nd);
3307 	return (error);
3308 }
3309 
3310 /*
3311  * lchmod_args(char *path, int mode)
3312  *
3313  * Change mode of a file given path name (don't follow links.)
3314  */
3315 int
3316 sys_lchmod(struct lchmod_args *uap)
3317 {
3318 	struct nlookupdata nd;
3319 	int error;
3320 
3321 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3322 	if (error == 0)
3323 		error = kern_chmod(&nd, uap->mode);
3324 	nlookup_done(&nd);
3325 	return (error);
3326 }
3327 
3328 /*
3329  * fchmod_args(int fd, int mode)
3330  *
3331  * Change mode of a file given a file descriptor.
3332  */
3333 int
3334 sys_fchmod(struct fchmod_args *uap)
3335 {
3336 	struct thread *td = curthread;
3337 	struct proc *p = td->td_proc;
3338 	struct file *fp;
3339 	int error;
3340 
3341 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
3342 		return (error);
3343 	if (fp->f_nchandle.ncp)
3344 		error = ncp_writechk(&fp->f_nchandle);
3345 	if (error == 0)
3346 		error = setfmode((struct vnode *)fp->f_data, uap->mode);
3347 	fdrop(fp);
3348 	return (error);
3349 }
3350 
3351 /*
3352  * fchmodat_args(char *path, int mode)
3353  *
3354  * Change mode of a file pointed to by fd/path.
3355  */
3356 int
3357 sys_fchmodat(struct fchmodat_args *uap)
3358 {
3359 	struct nlookupdata nd;
3360 	struct file *fp;
3361 	int error;
3362 	int flags;
3363 
3364 	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
3365 		return (EINVAL);
3366 	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3367 
3368 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3369 				UIO_USERSPACE, flags);
3370 	if (error == 0)
3371 		error = kern_chmod(&nd, uap->mode);
3372 	nlookup_done_at(&nd, fp);
3373 	return (error);
3374 }
3375 
3376 static int
3377 setfown(struct mount *mp, struct vnode *vp, uid_t uid, gid_t gid)
3378 {
3379 	struct thread *td = curthread;
3380 	int error;
3381 	struct vattr vattr;
3382 	uid_t o_uid;
3383 	gid_t o_gid;
3384 	uint64_t size;
3385 
3386 	/*
3387 	 * note: vget is required for any operation that might mod the vnode
3388 	 * so VINACTIVE is properly cleared.
3389 	 */
3390 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
3391 		if ((error = VOP_GETATTR(vp, &vattr)) != 0)
3392 			return error;
3393 		o_uid = vattr.va_uid;
3394 		o_gid = vattr.va_gid;
3395 		size = vattr.va_size;
3396 
3397 		VATTR_NULL(&vattr);
3398 		vattr.va_uid = uid;
3399 		vattr.va_gid = gid;
3400 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3401 		vput(vp);
3402 	}
3403 
3404 	if (error == 0) {
3405 		if (uid == -1)
3406 			uid = o_uid;
3407 		if (gid == -1)
3408 			gid = o_gid;
3409 		VFS_ACCOUNT(mp, o_uid, o_gid, -size);
3410 		VFS_ACCOUNT(mp,   uid,   gid,  size);
3411 	}
3412 
3413 	return error;
3414 }
3415 
3416 int
3417 kern_chown(struct nlookupdata *nd, int uid, int gid)
3418 {
3419 	struct vnode *vp;
3420 	int error;
3421 
3422 	if ((error = nlookup(nd)) != 0)
3423 		return (error);
3424 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3425 		return (error);
3426 	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
3427 		error = setfown(nd->nl_nch.mount, vp, uid, gid);
3428 	vrele(vp);
3429 	return (error);
3430 }
3431 
3432 /*
3433  * chown(char *path, int uid, int gid)
3434  *
3435  * Set ownership given a path name.
3436  */
3437 int
3438 sys_chown(struct chown_args *uap)
3439 {
3440 	struct nlookupdata nd;
3441 	int error;
3442 
3443 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3444 	if (error == 0)
3445 		error = kern_chown(&nd, uap->uid, uap->gid);
3446 	nlookup_done(&nd);
3447 	return (error);
3448 }
3449 
3450 /*
3451  * lchown_args(char *path, int uid, int gid)
3452  *
3453  * Set ownership given a path name, do not cross symlinks.
3454  */
3455 int
3456 sys_lchown(struct lchown_args *uap)
3457 {
3458 	struct nlookupdata nd;
3459 	int error;
3460 
3461 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3462 	if (error == 0)
3463 		error = kern_chown(&nd, uap->uid, uap->gid);
3464 	nlookup_done(&nd);
3465 	return (error);
3466 }
3467 
3468 /*
3469  * fchown_args(int fd, int uid, int gid)
3470  *
3471  * Set ownership given a file descriptor.
3472  */
3473 int
3474 sys_fchown(struct fchown_args *uap)
3475 {
3476 	struct thread *td = curthread;
3477 	struct proc *p = td->td_proc;
3478 	struct file *fp;
3479 	int error;
3480 
3481 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
3482 		return (error);
3483 	if (fp->f_nchandle.ncp)
3484 		error = ncp_writechk(&fp->f_nchandle);
3485 	if (error == 0)
3486 		error = setfown(p->p_fd->fd_ncdir.mount,
3487 			(struct vnode *)fp->f_data, uap->uid, uap->gid);
3488 	fdrop(fp);
3489 	return (error);
3490 }
3491 
3492 /*
3493  * fchownat(int fd, char *path, int uid, int gid, int flags)
3494  *
3495  * Set ownership of file pointed to by fd/path.
3496  */
3497 int
3498 sys_fchownat(struct fchownat_args *uap)
3499 {
3500 	struct nlookupdata nd;
3501 	struct file *fp;
3502 	int error;
3503 	int flags;
3504 
3505 	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
3506 		return (EINVAL);
3507 	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3508 
3509 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3510 				UIO_USERSPACE, flags);
3511 	if (error == 0)
3512 		error = kern_chown(&nd, uap->uid, uap->gid);
3513 	nlookup_done_at(&nd, fp);
3514 	return (error);
3515 }
3516 
3517 
3518 static int
3519 getutimes(struct timeval *tvp, struct timespec *tsp)
3520 {
3521 	struct timeval tv[2];
3522 	int error;
3523 
3524 	if (tvp == NULL) {
3525 		microtime(&tv[0]);
3526 		TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
3527 		tsp[1] = tsp[0];
3528 	} else {
3529 		if ((error = itimerfix(tvp)) != 0)
3530 			return (error);
3531 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3532 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3533 	}
3534 	return 0;
3535 }
3536 
/*
 * Normalize an optional utimensat()-style timespec pair into newts[2],
 * resolving the UTIME_NOW / UTIME_OMIT markers.  *nullflag is set when
 * the operation is equivalent to "set both times to now" (a NULL ts or
 * both entries UTIME_NOW), which relaxes the permission check via
 * VA_UTIMES_NULL downstream.
 */
static int
getutimens(const struct timespec *ts, struct timespec *newts, int *nullflag)
{
	struct timespec tsnow;
	int error;

	*nullflag = 0;
	nanotime(&tsnow);
	if (ts == NULL) {
		/* NULL means "both times = now" */
		newts[0] = tsnow;
		newts[1] = tsnow;
		*nullflag = 1;
		return (0);
	}

	newts[0] = ts[0];
	newts[1] = ts[1];
	/* Both omitted: nothing to change, leave values untouched */
	if (newts[0].tv_nsec == UTIME_OMIT && newts[1].tv_nsec == UTIME_OMIT)
		return (0);
	if (newts[0].tv_nsec == UTIME_NOW && newts[1].tv_nsec == UTIME_NOW)
		*nullflag = 1;

	/* atime: OMIT is encoded as VNOVAL for VOP_SETATTR, NOW becomes tsnow */
	if (newts[0].tv_nsec == UTIME_OMIT)
		newts[0].tv_sec = VNOVAL;
	else if (newts[0].tv_nsec == UTIME_NOW)
		newts[0] = tsnow;
	else if ((error = itimespecfix(&newts[0])) != 0)
		return (error);

	/* mtime: same treatment */
	if (newts[1].tv_nsec == UTIME_OMIT)
		newts[1].tv_sec = VNOVAL;
	else if (newts[1].tv_nsec == UTIME_NOW)
		newts[1] = tsnow;
	else if ((error = itimespecfix(&newts[1])) != 0)
		return (error);

	return (0);
}
3575 
3576 static int
3577 setutimes(struct vnode *vp, struct vattr *vattr,
3578 	  const struct timespec *ts, int nullflag)
3579 {
3580 	struct thread *td = curthread;
3581 	int error;
3582 
3583 	VATTR_NULL(vattr);
3584 	vattr->va_atime = ts[0];
3585 	vattr->va_mtime = ts[1];
3586 	if (nullflag)
3587 		vattr->va_vaflags |= VA_UTIMES_NULL;
3588 	error = VOP_SETATTR(vp, vattr, td->td_ucred);
3589 
3590 	return error;
3591 }
3592 
3593 int
3594 kern_utimes(struct nlookupdata *nd, struct timeval *tptr)
3595 {
3596 	struct timespec ts[2];
3597 	int error;
3598 
3599 	if (tptr) {
3600 		if ((error = getutimes(tptr, ts)) != 0)
3601 			return (error);
3602 	}
3603 	error = kern_utimensat(nd, tptr ? ts : NULL, 0);
3604 	return (error);
3605 }
3606 
3607 /*
3608  * utimes_args(char *path, struct timeval *tptr)
3609  *
3610  * Set the access and modification times of a file.
3611  */
3612 int
3613 sys_utimes(struct utimes_args *uap)
3614 {
3615 	struct timeval tv[2];
3616 	struct nlookupdata nd;
3617 	int error;
3618 
3619 	if (uap->tptr) {
3620  		error = copyin(uap->tptr, tv, sizeof(tv));
3621 		if (error)
3622 			return (error);
3623 	}
3624 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3625 	if (error == 0)
3626 		error = kern_utimes(&nd, uap->tptr ? tv : NULL);
3627 	nlookup_done(&nd);
3628 	return (error);
3629 }
3630 
3631 /*
3632  * lutimes_args(char *path, struct timeval *tptr)
3633  *
3634  * Set the access and modification times of a file.
3635  */
3636 int
3637 sys_lutimes(struct lutimes_args *uap)
3638 {
3639 	struct timeval tv[2];
3640 	struct nlookupdata nd;
3641 	int error;
3642 
3643 	if (uap->tptr) {
3644 		error = copyin(uap->tptr, tv, sizeof(tv));
3645 		if (error)
3646 			return (error);
3647 	}
3648 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3649 	if (error == 0)
3650 		error = kern_utimes(&nd, uap->tptr ? tv : NULL);
3651 	nlookup_done(&nd);
3652 	return (error);
3653 }
3654 
3655 /*
3656  * Set utimes on a file descriptor.  The creds used to open the
3657  * file are used to determine whether the operation is allowed
3658  * or not.
3659  */
int
kern_futimens(int fd, struct timespec *ts)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct timespec newts[2];
	struct file *fp;
	struct vnode *vp;
	struct vattr vattr;
	int nullflag;
	int error;

	/* Resolve UTIME_NOW/UTIME_OMIT before taking any references */
	error = getutimens(ts, newts, &nullflag);
	if (error)
		return (error);
	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
		return (error);
	/* Check mount writability if a namecache handle is attached */
	if (fp->f_nchandle.ncp)
		error = ncp_writechk(&fp->f_nchandle);
	if (error == 0) {
		vp = fp->f_data;
		/* vget adds a ref and the exclusive lock for VOP calls */
		error = vget(vp, LK_EXCLUSIVE);
		if (error == 0) {
			error = VOP_GETATTR(vp, &vattr);
			/*
			 * Permission check uses the open-time credentials
			 * (fp->f_cred), not the current thread's.
			 */
			if (error == 0) {
				error = naccess_va(&vattr, NLC_OWN | NLC_WRITE,
						   fp->f_cred);
			}
			if (error == 0) {
				error = setutimes(vp, &vattr, newts, nullflag);
			}
			vput(vp);
		}
	}
	fdrop(fp);
	return (error);
}
3697 
3698 /*
3699  * futimens_args(int fd, struct timespec *ts)
3700  *
3701  * Set the access and modification times of a file.
3702  */
3703 int
3704 sys_futimens(struct futimens_args *uap)
3705 {
3706 	struct timespec ts[2];
3707 	int error;
3708 
3709 	if (uap->ts) {
3710 		error = copyin(uap->ts, ts, sizeof(ts));
3711 		if (error)
3712 			return (error);
3713 	}
3714 	error = kern_futimens(uap->fd, uap->ts ? ts : NULL);
3715 	return (error);
3716 }
3717 
3718 int
3719 kern_futimes(int fd, struct timeval *tptr)
3720 {
3721 	struct timespec ts[2];
3722 	int error;
3723 
3724 	if (tptr) {
3725 		if ((error = getutimes(tptr, ts)) != 0)
3726 			return (error);
3727 	}
3728 	error = kern_futimens(fd, tptr ? ts : NULL);
3729 	return (error);
3730 }
3731 
3732 /*
3733  * futimes_args(int fd, struct timeval *tptr)
3734  *
3735  * Set the access and modification times of a file.
3736  */
3737 int
3738 sys_futimes(struct futimes_args *uap)
3739 {
3740 	struct timeval tv[2];
3741 	int error;
3742 
3743 	if (uap->tptr) {
3744 		error = copyin(uap->tptr, tv, sizeof(tv));
3745 		if (error)
3746 			return (error);
3747 	}
3748 	error = kern_futimes(uap->fd, uap->tptr ? tv : NULL);
3749 	return (error);
3750 }
3751 
/*
 * Path-based backend for utimes()/lutimes()/utimensat().  The lookup
 * (already initialized in *nd) carries the permission requirements;
 * the caller is responsible for nlookup_done().
 */
int
kern_utimensat(struct nlookupdata *nd, const struct timespec *ts, int flags)
{
	struct timespec newts[2];
	struct vnode *vp;
	struct vattr vattr;
	int nullflag;
	int error;

	if (flags & ~AT_SYMLINK_NOFOLLOW)
		return (EINVAL);

	/* Resolve UTIME_NOW/UTIME_OMIT markers up front */
	error = getutimens(ts, newts, &nullflag);
	if (error)
		return (error);

	/* Require ownership-or-write permission during the lookup itself */
	nd->nl_flags |= NLC_OWN | NLC_WRITE;
	if ((error = nlookup(nd)) != 0)
		return (error);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
		return (error);
	if ((error = vn_writechk(vp, &nd->nl_nch)) == 0) {
		/* vget adds a second ref plus the lock; vput undoes both */
		error = vget(vp, LK_EXCLUSIVE);
		if (error == 0) {
			error = setutimes(vp, &vattr, newts, nullflag);
			vput(vp);
		}
	}
	/* Drop the cache_vref() reference */
	vrele(vp);
	return (error);
}
3785 
3786 /*
3787  * utimensat_args(int fd, const char *path, const struct timespec *ts, int flags);
3788  *
3789  * Set file access and modification times of a file.
3790  */
int
sys_utimensat(struct utimensat_args *uap)
{
	struct timespec ts[2];
	struct nlookupdata nd;
	struct file *fp;
	int error;
	int flags;

	/* Copy in the optional timespec pair */
	if (uap->ts) {
		error = copyin(uap->ts, ts, sizeof(ts));
		if (error)
			return (error);
	}

	/*
	 * Flag validation (rejecting anything other than
	 * AT_SYMLINK_NOFOLLOW) is performed by kern_utimensat().
	 */
	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
	                        UIO_USERSPACE, flags);
	if (error == 0)
		error = kern_utimensat(&nd, uap->ts ? ts : NULL, uap->flags);
	nlookup_done_at(&nd, fp);
	return (error);
}
3814 
/*
 * Path-based truncate backend.  Performs the lookup with write/truncate
 * permission requirements, locks the vnode, and issues the size change
 * via VOP_SETATTR.  The caller handles nlookup_done().
 */
int
kern_truncate(struct nlookupdata *nd, off_t length)
{
	struct vnode *vp;
	struct vattr vattr;
	int error;
	uid_t uid = 0;
	gid_t gid = 0;
	uint64_t old_size = 0;

	if (length < 0)
		return(EINVAL);
	nd->nl_flags |= NLC_WRITE | NLC_TRUNCATE;
	if ((error = nlookup(nd)) != 0)
		return (error);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
		return (error);
	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_FAILRECLAIM);
	if (error) {
		vrele(vp);
		return (error);
	}
	if (vp->v_type == VDIR) {
		error = EISDIR;
		goto done;
	}
	/* Capture pre-truncate owner/size so quota accounting sees the delta */
	if (vfs_quota_enabled) {
		error = VOP_GETATTR(vp, &vattr);
		KASSERT(error == 0, ("kern_truncate(): VOP_GETATTR didn't return 0"));
		uid = vattr.va_uid;
		gid = vattr.va_gid;
		old_size = vattr.va_size;
	}

	if ((error = vn_writechk(vp, &nd->nl_nch)) == 0) {
		VATTR_NULL(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, nd->nl_cred);
		VFS_ACCOUNT(nd->nl_nch.mount, uid, gid, length - old_size);
	}
done:
	/* Releases both the lock and the cache_vref() reference */
	vput(vp);
	return (error);
}
3861 
3862 /*
3863  * truncate(char *path, int pad, off_t length)
3864  *
3865  * Truncate a file given its path name.
3866  */
3867 int
3868 sys_truncate(struct truncate_args *uap)
3869 {
3870 	struct nlookupdata nd;
3871 	int error;
3872 
3873 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3874 	if (error == 0)
3875 		error = kern_truncate(&nd, uap->length);
3876 	nlookup_done(&nd);
3877 	return error;
3878 }
3879 
/*
 * Descriptor-based truncate backend.  Unlike kern_truncate() the
 * descriptor must have been opened for writing (FWRITE) and the
 * append-only flag must not be set.
 */
int
kern_ftruncate(int fd, off_t length)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vattr vattr;
	struct vnode *vp;
	struct file *fp;
	int error;
	uid_t uid = 0;
	gid_t gid = 0;
	uint64_t old_size = 0;
	struct mount *mp;

	if (length < 0)
		return(EINVAL);
	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
		return (error);
	if (fp->f_nchandle.ncp) {
		error = ncp_writechk(&fp->f_nchandle);
		if (error)
			goto done;
	}
	if ((fp->f_flag & FWRITE) == 0) {
		error = EINVAL;
		goto done;
	}
	if (fp->f_flag & FAPPENDONLY) {	/* inode was set append-only */
		error = EINVAL;
		goto done;
	}
	vp = (struct vnode *)fp->f_data;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type == VDIR) {
		error = EISDIR;
		vn_unlock(vp);
		goto done;
	}

	/* Capture pre-truncate owner/size so quota accounting sees the delta */
	if (vfs_quota_enabled) {
		error = VOP_GETATTR(vp, &vattr);
		KASSERT(error == 0, ("kern_ftruncate(): VOP_GETATTR didn't return 0"));
		uid = vattr.va_uid;
		gid = vattr.va_gid;
		old_size = vattr.va_size;
	}

	if ((error = vn_writechk(vp, NULL)) == 0) {
		VATTR_NULL(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
		mp = vq_vptomp(vp);
		VFS_ACCOUNT(mp, uid, gid, length - old_size);
	}
	vn_unlock(vp);
done:
	fdrop(fp);
	return (error);
}
3939 
3940 /*
3941  * ftruncate_args(int fd, int pad, off_t length)
3942  *
3943  * Truncate a file given a file descriptor.
3944  */
3945 int
3946 sys_ftruncate(struct ftruncate_args *uap)
3947 {
3948 	int error;
3949 
3950 	error = kern_ftruncate(uap->fd, uap->length);
3951 
3952 	return (error);
3953 }
3954 
3955 /*
3956  * fsync(int fd)
3957  *
3958  * Sync an open file.
3959  */
int
sys_fsync(struct fsync_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vnode *vp;
	struct file *fp;
	vm_object_t obj;
	int error;

	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
		return (error);
	vp = (struct vnode *)fp->f_data;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/*
	 * Flush dirty VM pages first unless the filesystem opts out
	 * via MNTK_NOMSYNC.
	 */
	if ((obj = vp->v_object) != NULL) {
		if (vp->v_mount == NULL ||
		    (vp->v_mount->mnt_kern_flag & MNTK_NOMSYNC) == 0) {
			vm_object_page_clean(obj, 0, 0, 0);
		}
	}
	error = VOP_FSYNC(vp, MNT_WAIT, VOP_FSYNC_SYSCALL);
	/* Also synchronously flush any dirty buffers attached to the vnode */
	if (error == 0 && vp->v_mount)
		error = buf_fsync(vp);
	vn_unlock(vp);
	fdrop(fp);

	return (error);
}
3988 
/*
 * Backend for rename()/renameat().  Both lookups must have been
 * initialized by the caller (who also handles nlookup_done()).
 * Returns EAGAIN when the namecache topology changed underneath us
 * and the caller should retry the entire operation.
 */
int
kern_rename(struct nlookupdata *fromnd, struct nlookupdata *tond)
{
	struct nchandle fnchd;
	struct nchandle tnchd;
	struct namecache *ncp;
	struct vnode *fdvp;
	struct vnode *tdvp;
	struct mount *mp;
	int error;
	u_int fncp_gen;
	u_int tncp_gen;

	bwillinode(1);
	fromnd->nl_flags |= NLC_REFDVP | NLC_RENAME_SRC;
	if ((error = nlookup(fromnd)) != 0)
		return (error);
	if ((fnchd.ncp = fromnd->nl_nch.ncp->nc_parent) == NULL)
		return (ENOENT);
	fnchd.mount = fromnd->nl_nch.mount;
	cache_hold(&fnchd);

	/*
	 * unlock the source nch so we can lookup the target nch without
	 * deadlocking.  The target may or may not exist so we do not check
	 * for a target vp like kern_mkdir() and other creation functions do.
	 *
	 * The source and target directories are ref'd and rechecked after
	 * everything is relocked to determine if the source or target file
	 * has been renamed.
	 */
	KKASSERT(fromnd->nl_flags & NLC_NCPISLOCKED);
	fromnd->nl_flags &= ~NLC_NCPISLOCKED;

	/* Remember the source generation so we can detect races below */
	fncp_gen = fromnd->nl_nch.ncp->nc_generation;

	cache_unlock(&fromnd->nl_nch);

	tond->nl_flags |= NLC_RENAME_DST | NLC_REFDVP;
	if ((error = nlookup(tond)) != 0) {
		cache_drop(&fnchd);
		return (error);
	}
	tncp_gen = tond->nl_nch.ncp->nc_generation;

	if ((tnchd.ncp = tond->nl_nch.ncp->nc_parent) == NULL) {
		cache_drop(&fnchd);
		return (ENOENT);
	}
	tnchd.mount = tond->nl_nch.mount;
	cache_hold(&tnchd);

	/*
	 * If the source and target are the same there is nothing to do
	 */
	if (fromnd->nl_nch.ncp == tond->nl_nch.ncp) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (0);
	}

	/*
	 * Mount points cannot be renamed or overwritten
	 */
	if ((fromnd->nl_nch.ncp->nc_flag | tond->nl_nch.ncp->nc_flag) &
	    NCF_ISMOUNTPT
	) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (EINVAL);
	}

	/*
	 * Relock the source ncp.  cache_relock() will deal with any
	 * deadlocks against the already-locked tond and will also
	 * make sure both are resolved.
	 *
	 * NOTE AFTER RELOCKING: The source or target ncp may have become
	 * invalid while they were unlocked, nc_vp and nc_mount could
	 * be NULL.
	 */
	cache_relock(&fromnd->nl_nch, fromnd->nl_cred,
		     &tond->nl_nch, tond->nl_cred);
	fromnd->nl_flags |= NLC_NCPISLOCKED;

	/*
	 * If the namecache generation changed for either fromnd or tond,
	 * we must retry.
	 */
	if (fromnd->nl_nch.ncp->nc_generation != fncp_gen ||
	    tond->nl_nch.ncp->nc_generation != tncp_gen) {
		kprintf("kern_rename: retry due to gen on: "
			"\"%s\" -> \"%s\"\n",
			fromnd->nl_nch.ncp->nc_name,
			tond->nl_nch.ncp->nc_name);
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (EAGAIN);
	}

	/*
	 * If either fromnd or tond are marked destroyed a ripout occurred
	 * out from under us and we must retry.
	 */
	if ((fromnd->nl_nch.ncp->nc_flag & (NCF_DESTROYED | NCF_UNRESOLVED)) ||
	    fromnd->nl_nch.ncp->nc_vp == NULL ||
	    (tond->nl_nch.ncp->nc_flag & NCF_DESTROYED)) {
		kprintf("kern_rename: retry due to ripout on: "
			"\"%s\" -> \"%s\"\n",
			fromnd->nl_nch.ncp->nc_name,
			tond->nl_nch.ncp->nc_name);
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (EAGAIN);
	}

	/*
	 * Make sure the parent directories linkages are the same.
	 * XXX shouldn't be needed any more w/ generation check above.
	 */
	if (fnchd.ncp != fromnd->nl_nch.ncp->nc_parent ||
	    tnchd.ncp != tond->nl_nch.ncp->nc_parent) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (ENOENT);
	}

	/*
	 * Both the source and target must be within the same filesystem and
	 * in the same filesystem as their parent directories within the
	 * namecache topology.
	 *
	 * NOTE: fromnd's nc_mount or nc_vp could be NULL.
	 */
	mp = fnchd.mount;
	if (mp != tnchd.mount || mp != fromnd->nl_nch.mount ||
	    mp != tond->nl_nch.mount) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (EXDEV);
	}

	/*
	 * Make sure the mount point is writable
	 */
	if ((error = ncp_writechk(&tond->nl_nch)) != 0) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (error);
	}

	/*
	 * If the target exists and either the source or target is a directory,
	 * then both must be directories.
	 *
	 * Due to relocking of the source, fromnd->nl_nch.ncp->nc_vp may
	 * have become NULL.
	 */
	if (tond->nl_nch.ncp->nc_vp) {
		if (fromnd->nl_nch.ncp->nc_vp == NULL) {
			error = ENOENT;
		} else if (fromnd->nl_nch.ncp->nc_vp->v_type == VDIR) {
			if (tond->nl_nch.ncp->nc_vp->v_type != VDIR)
				error = ENOTDIR;
		} else if (tond->nl_nch.ncp->nc_vp->v_type == VDIR) {
			error = EISDIR;
		}
	}

	/*
	 * You cannot rename a source into itself or a subdirectory of itself.
	 * We check this by traversing the target directory upwards looking
	 * for a match against the source.
	 *
	 * XXX MPSAFE
	 */
	if (error == 0) {
		for (ncp = tnchd.ncp; ncp; ncp = ncp->nc_parent) {
			if (fromnd->nl_nch.ncp == ncp) {
				error = EINVAL;
				break;
			}
		}
	}

	cache_drop(&fnchd);
	cache_drop(&tnchd);

	/*
	 * Even though the namespaces are different, they may still represent
	 * hardlinks to the same file.  The filesystem might have a hard time
	 * with this so we issue a NREMOVE of the source instead of a NRENAME
	 * when we detect the situation.
	 */
	if (error == 0) {
		fdvp = fromnd->nl_dvp;
		tdvp = tond->nl_dvp;
		if (fdvp == NULL || tdvp == NULL) {
			error = EPERM;
		} else if (fromnd->nl_nch.ncp->nc_vp == tond->nl_nch.ncp->nc_vp) {
			error = VOP_NREMOVE(&fromnd->nl_nch, fdvp,
					    fromnd->nl_cred);
		} else {
			error = VOP_NRENAME(&fromnd->nl_nch, &tond->nl_nch,
					    fdvp, tdvp, tond->nl_cred);
		}
	}
	return (error);
}
4198 
4199 /*
4200  * rename_args(char *from, char *to)
4201  *
4202  * Rename files.  Source and destination must either both be directories,
4203  * or both not be directories.  If target is a directory, it must be empty.
4204  */
4205 int
4206 sys_rename(struct rename_args *uap)
4207 {
4208 	struct nlookupdata fromnd, tond;
4209 	int error;
4210 
4211 	do {
4212 		error = nlookup_init(&fromnd, uap->from, UIO_USERSPACE, 0);
4213 		if (error == 0) {
4214 			error = nlookup_init(&tond, uap->to, UIO_USERSPACE, 0);
4215 			if (error == 0)
4216 				error = kern_rename(&fromnd, &tond);
4217 			nlookup_done(&tond);
4218 		}
4219 		nlookup_done(&fromnd);
4220 	} while (error == EAGAIN);
4221 	return (error);
4222 }
4223 
4224 /*
4225  * renameat_args(int oldfd, char *old, int newfd, char *new)
4226  *
4227  * Rename files using paths relative to the directories associated with
4228  * oldfd and newfd.  Source and destination must either both be directories,
4229  * or both not be directories.  If target is a directory, it must be empty.
4230  */
4231 int
4232 sys_renameat(struct renameat_args *uap)
4233 {
4234 	struct nlookupdata oldnd, newnd;
4235 	struct file *oldfp, *newfp;
4236 	int error;
4237 
4238 	do {
4239 		error = nlookup_init_at(&oldnd, &oldfp,
4240 					uap->oldfd, uap->old,
4241 					UIO_USERSPACE, 0);
4242 		if (error == 0) {
4243 			error = nlookup_init_at(&newnd, &newfp,
4244 						uap->newfd, uap->new,
4245 						UIO_USERSPACE, 0);
4246 			if (error == 0)
4247 				error = kern_rename(&oldnd, &newnd);
4248 			nlookup_done_at(&newnd, newfp);
4249 		}
4250 		nlookup_done_at(&oldnd, oldfp);
4251 	} while (error == EAGAIN);
4252 	return (error);
4253 }
4254 
/*
 * Backend for mkdir()/mkdirat().  The caller initialized *nd and
 * handles nlookup_done(), which releases the namecache state on the
 * early-return paths below.
 */
int
kern_mkdir(struct nlookupdata *nd, int mode)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vnode *vp;
	struct vattr vattr;
	int error;

	bwillinode(1);
	nd->nl_flags |= NLC_WILLBEDIR | NLC_CREATE | NLC_REFDVP;
	if ((error = nlookup(nd)) != 0)
		return (error);

	if (nd->nl_nch.ncp->nc_vp)
		return (EEXIST);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	VATTR_NULL(&vattr);
	vattr.va_type = VDIR;
	/* Requested mode, restricted by the process umask */
	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_fd->fd_cmask;

	vp = NULL;
	/* On success the new directory vnode is returned locked+ref'd */
	error = VOP_NMKDIR(&nd->nl_nch, nd->nl_dvp, &vp, td->td_ucred, &vattr);
	if (error == 0)
		vput(vp);
	return (error);
}
4283 
4284 /*
4285  * mkdir_args(char *path, int mode)
4286  *
4287  * Make a directory file.
4288  */
4289 int
4290 sys_mkdir(struct mkdir_args *uap)
4291 {
4292 	struct nlookupdata nd;
4293 	int error;
4294 
4295 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
4296 	if (error == 0)
4297 		error = kern_mkdir(&nd, uap->mode);
4298 	nlookup_done(&nd);
4299 	return (error);
4300 }
4301 
4302 /*
4303  * mkdirat_args(int fd, char *path, mode_t mode)
4304  *
4305  * Make a directory file.  The path is relative to the directory associated
4306  * with fd.
4307  */
4308 int
4309 sys_mkdirat(struct mkdirat_args *uap)
4310 {
4311 	struct nlookupdata nd;
4312 	struct file *fp;
4313 	int error;
4314 
4315 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
4316 	if (error == 0)
4317 		error = kern_mkdir(&nd, uap->mode);
4318 	nlookup_done_at(&nd, fp);
4319 	return (error);
4320 }
4321 
/*
 * Backend for rmdir().  The caller initialized *nd and handles
 * nlookup_done().
 */
int
kern_rmdir(struct nlookupdata *nd)
{
	int error;

	bwillinode(1);
	nd->nl_flags |= NLC_DELETE | NLC_REFDVP;
	if ((error = nlookup(nd)) != 0)
		return (error);

	/*
	 * Do not allow directories representing mount points to be
	 * deleted, even if empty.  Check write perms on mount point
	 * in case the vnode is aliased (aka nullfs).
	 */
	if (nd->nl_nch.ncp->nc_flag & (NCF_ISMOUNTPT))
		return (EBUSY);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	error = VOP_NRMDIR(&nd->nl_nch, nd->nl_dvp, nd->nl_cred);
	return (error);
}
4344 
4345 /*
4346  * rmdir_args(char *path)
4347  *
4348  * Remove a directory file.
4349  */
4350 int
4351 sys_rmdir(struct rmdir_args *uap)
4352 {
4353 	struct nlookupdata nd;
4354 	int error;
4355 
4356 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
4357 	if (error == 0)
4358 		error = kern_rmdir(&nd);
4359 	nlookup_done(&nd);
4360 	return (error);
4361 }
4362 
/*
 * Backend for getdirentries()/getdents().  Reads directory entries
 * from fd into buf (in the given uio_seg) starting at the descriptor's
 * current seek position.  On success *basep (if non-NULL) receives the
 * pre-read seek offset and *res the number of bytes transferred.
 */
int
kern_getdirentries(int fd, char *buf, u_int count, long *basep, int *res,
		   enum uio_seg direction)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vnode *vp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	off_t loff;
	int error, eofflag;

	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
		return (error);
	if ((fp->f_flag & FREAD) == 0) {
		error = EBADF;
		goto done;
	}
	vp = (struct vnode *)fp->f_data;
	if (vp->v_type != VDIR) {
		error = EINVAL;
		goto done;
	}
	/* Single-segment uio describing the caller's buffer */
	aiov.iov_base = buf;
	aiov.iov_len = count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = direction;
	auio.uio_td = td;
	auio.uio_resid = count;
	loff = auio.uio_offset = fp->f_offset;
	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL);
	fp->f_offset = auio.uio_offset;
	if (error)
		goto done;

	/*
	 * WARNING!  *basep may not be wide enough to accommodate the
	 * seek offset.   XXX should we hack this to return the upper 32 bits
	 * for offsets greater then 4G?
	 */
	if (basep) {
		*basep = (long)loff;
	}
	*res = count - auio.uio_resid;
done:
	fdrop(fp);
	return (error);
}
4414 
4415 /*
 * getdirentries_args(int fd, char *buf, u_int count, long *basep)
4417  *
4418  * Read a block of directory entries in a file system independent format.
4419  */
4420 int
4421 sys_getdirentries(struct getdirentries_args *uap)
4422 {
4423 	long base;
4424 	int error;
4425 
4426 	error = kern_getdirentries(uap->fd, uap->buf, uap->count, &base,
4427 				   &uap->sysmsg_result, UIO_USERSPACE);
4428 
4429 	if (error == 0 && uap->basep)
4430 		error = copyout(&base, uap->basep, sizeof(*uap->basep));
4431 	return (error);
4432 }
4433 
4434 /*
4435  * getdents_args(int fd, char *buf, size_t count)
4436  */
4437 int
4438 sys_getdents(struct getdents_args *uap)
4439 {
4440 	int error;
4441 
4442 	error = kern_getdirentries(uap->fd, uap->buf, uap->count, NULL,
4443 				   &uap->sysmsg_result, UIO_USERSPACE);
4444 
4445 	return (error);
4446 }
4447 
4448 /*
4449  * Set the mode mask for creation of filesystem nodes.
4450  *
4451  * umask(int newmask)
4452  */
4453 int
4454 sys_umask(struct umask_args *uap)
4455 {
4456 	struct thread *td = curthread;
4457 	struct proc *p = td->td_proc;
4458 	struct filedesc *fdp;
4459 
4460 	fdp = p->p_fd;
4461 	uap->sysmsg_result = fdp->fd_cmask;
4462 	fdp->fd_cmask = uap->newmask & ALLPERMS;
4463 	return (0);
4464 }
4465 
4466 /*
4467  * revoke(char *path)
4468  *
4469  * Void all references to file by ripping underlying filesystem
4470  * away from vnode.
4471  */
4472 int
4473 sys_revoke(struct revoke_args *uap)
4474 {
4475 	struct nlookupdata nd;
4476 	struct vattr vattr;
4477 	struct vnode *vp;
4478 	struct ucred *cred;
4479 	int error;
4480 
4481 	vp = NULL;
4482 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4483 	if (error == 0)
4484 		error = nlookup(&nd);
4485 	if (error == 0)
4486 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
4487 	cred = crhold(nd.nl_cred);
4488 	nlookup_done(&nd);
4489 	if (error == 0) {
4490 		if (error == 0)
4491 			error = VOP_GETATTR(vp, &vattr);
4492 		if (error == 0 && cred->cr_uid != vattr.va_uid)
4493 			error = priv_check_cred(cred, PRIV_VFS_REVOKE, 0);
4494 		if (error == 0 && (vp->v_type == VCHR || vp->v_type == VBLK)) {
4495 			if (vcount(vp) > 0)
4496 				error = vrevoke(vp, cred);
4497 		} else if (error == 0) {
4498 			error = vrevoke(vp, cred);
4499 		}
4500 		vrele(vp);
4501 	}
4502 	if (cred)
4503 		crfree(cred);
4504 	return (error);
4505 }
4506 
4507 /*
4508  * getfh_args(char *fname, fhandle_t *fhp)
4509  *
4510  * Get (NFS) file handle
4511  *
4512  * NOTE: We use the fsid of the covering mount, even if it is a nullfs
4513  * mount.  This allows nullfs mounts to be explicitly exported.
4514  *
4515  * WARNING: nullfs mounts of HAMMER PFS ROOTs are safe.
4516  *
4517  * 	    nullfs mounts of subdirectories are not safe.  That is, it will
4518  *	    work, but you do not really have protection against access to
4519  *	    the related parent directories.
4520  */
int
sys_getfh(struct getfh_args *uap)
{
	struct thread *td = curthread;
	struct nlookupdata nd;
	fhandle_t fh;
	struct vnode *vp;
	struct mount *mp;
	int error;

	/*
	 * Must be super user
	 */
	if ((error = priv_check(td, PRIV_ROOT)) != 0)
		return (error);

	vp = NULL;
	error = nlookup_init(&nd, uap->fname, UIO_USERSPACE, NLC_FOLLOW);
	if (error == 0)
		error = nlookup(&nd);
	if (error == 0)
		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
	/*
	 * Capture the covering mount before nlookup_done() releases the
	 * namecache state (see the function header comment about nullfs).
	 */
	mp = nd.nl_nch.mount;
	nlookup_done(&nd);
	if (error == 0) {
		bzero(&fh, sizeof(fh));
		fh.fh_fsid = mp->mnt_stat.f_fsid;
		error = VFS_VPTOFH(vp, &fh.fh_fid);
		vput(vp);
		if (error == 0)
			error = copyout(&fh, uap->fhp, sizeof(fh));
	}
	return (error);
}
4555 
4556 /*
4557  * fhopen_args(const struct fhandle *u_fhp, int flags)
4558  *
4559  * syscall for the rpc.lockd to use to translate a NFS file handle into
4560  * an open descriptor.
4561  *
4562  * warning: do not remove the priv_check() call or this becomes one giant
4563  * security hole.
4564  */
4565 int
4566 sys_fhopen(struct fhopen_args *uap)
4567 {
4568 	struct thread *td = curthread;
4569 	struct filedesc *fdp = td->td_proc->p_fd;
4570 	struct mount *mp;
4571 	struct vnode *vp;
4572 	struct fhandle fhp;
4573 	struct vattr vat;
4574 	struct vattr *vap = &vat;
4575 	struct flock lf;
4576 	int fmode, mode, error = 0, type;
4577 	struct file *nfp;
4578 	struct file *fp;
4579 	int indx;
4580 
4581 	/*
4582 	 * Must be super user
4583 	 */
4584 	error = priv_check(td, PRIV_ROOT);
4585 	if (error)
4586 		return (error);
4587 
4588 	fmode = FFLAGS(uap->flags);
4589 
4590 	/*
4591 	 * Why not allow a non-read/write open for our lockd?
4592 	 */
4593 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4594 		return (EINVAL);
4595 	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4596 	if (error)
4597 		return(error);
4598 
4599 	/*
4600 	 * Find the mount point
4601 	 */
4602 	mp = vfs_getvfs(&fhp.fh_fsid);
4603 	if (mp == NULL) {
4604 		error = ESTALE;
4605 		goto done2;
4606 	}
4607 	/* now give me my vnode, it gets returned to me locked */
4608 	error = VFS_FHTOVP(mp, NULL, &fhp.fh_fid, &vp);
4609 	if (error)
4610 		goto done;
4611  	/*
4612 	 * from now on we have to make sure not
4613 	 * to forget about the vnode
4614 	 * any error that causes an abort must vput(vp)
4615 	 * just set error = err and 'goto bad;'.
4616 	 */
4617 
4618 	/*
4619 	 * from vn_open
4620 	 */
4621 	if (vp->v_type == VLNK) {
4622 		error = EMLINK;
4623 		goto bad;
4624 	}
4625 	if (vp->v_type == VSOCK) {
4626 		error = EOPNOTSUPP;
4627 		goto bad;
4628 	}
4629 	mode = 0;
4630 	if (fmode & (FWRITE | O_TRUNC)) {
4631 		if (vp->v_type == VDIR) {
4632 			error = EISDIR;
4633 			goto bad;
4634 		}
4635 		error = vn_writechk(vp, NULL);
4636 		if (error)
4637 			goto bad;
4638 		mode |= VWRITE;
4639 	}
4640 	if (fmode & FREAD)
4641 		mode |= VREAD;
4642 	if (mode) {
4643 		error = VOP_ACCESS(vp, mode, td->td_ucred);
4644 		if (error)
4645 			goto bad;
4646 	}
4647 	if (fmode & O_TRUNC) {
4648 		vn_unlock(vp);				/* XXX */
4649 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
4650 		VATTR_NULL(vap);
4651 		vap->va_size = 0;
4652 		error = VOP_SETATTR(vp, vap, td->td_ucred);
4653 		if (error)
4654 			goto bad;
4655 	}
4656 
4657 	/*
4658 	 * VOP_OPEN needs the file pointer so it can potentially override
4659 	 * it.
4660 	 *
4661 	 * WARNING! no f_nchandle will be associated when fhopen()ing a
4662 	 * directory.  XXX
4663 	 */
4664 	if ((error = falloc(td->td_lwp, &nfp, &indx)) != 0)
4665 		goto bad;
4666 	fp = nfp;
4667 
4668 	error = VOP_OPEN(vp, fmode, td->td_ucred, fp);
4669 	if (error) {
4670 		/*
4671 		 * setting f_ops this way prevents VOP_CLOSE from being
4672 		 * called or fdrop() releasing the vp from v_data.   Since
4673 		 * the VOP_OPEN failed we don't want to VOP_CLOSE.
4674 		 */
4675 		fp->f_ops = &badfileops;
4676 		fp->f_data = NULL;
4677 		goto bad_drop;
4678 	}
4679 
4680 	/*
4681 	 * The fp is given its own reference, we still have our ref and lock.
4682 	 *
4683 	 * Assert that all regular files must be created with a VM object.
4684 	 */
4685 	if (vp->v_type == VREG && vp->v_object == NULL) {
4686 		kprintf("fhopen: regular file did not "
4687 			"have VM object: %p\n",
4688 			vp);
4689 		goto bad_drop;
4690 	}
4691 
4692 	/*
4693 	 * The open was successful.  Handle any locking requirements.
4694 	 */
4695 	if (fmode & (O_EXLOCK | O_SHLOCK)) {
4696 		lf.l_whence = SEEK_SET;
4697 		lf.l_start = 0;
4698 		lf.l_len = 0;
4699 		if (fmode & O_EXLOCK)
4700 			lf.l_type = F_WRLCK;
4701 		else
4702 			lf.l_type = F_RDLCK;
4703 		if (fmode & FNONBLOCK)
4704 			type = 0;
4705 		else
4706 			type = F_WAIT;
4707 		vn_unlock(vp);
4708 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK,
4709 					 &lf, type)) != 0) {
4710 			/*
4711 			 * release our private reference.
4712 			 */
4713 			fsetfd(fdp, NULL, indx);
4714 			fdrop(fp);
4715 			vrele(vp);
4716 			goto done;
4717 		}
4718 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4719 		atomic_set_int(&fp->f_flag, FHASLOCK);	/* race ok */
4720 	}
4721 
4722 	/*
4723 	 * Clean up.  Associate the file pointer with the previously
4724 	 * reserved descriptor and return it.
4725 	 */
4726 	vput(vp);
4727 	if (uap->flags & O_CLOEXEC)
4728 		fdp->fd_files[indx].fileflags |= UF_EXCLOSE;
4729 	fsetfd(fdp, fp, indx);
4730 	fdrop(fp);
4731 	uap->sysmsg_result = indx;
4732 	mount_drop(mp);
4733 
4734 	return (error);
4735 
4736 bad_drop:
4737 	fsetfd(fdp, NULL, indx);
4738 	fdrop(fp);
4739 bad:
4740 	vput(vp);
4741 done:
4742 	mount_drop(mp);
4743 done2:
4744 	return (error);
4745 }
4746 
4747 /*
4748  * fhstat_args(struct fhandle *u_fhp, struct stat *sb)
4749  */
4750 int
4751 sys_fhstat(struct fhstat_args *uap)
4752 {
4753 	struct thread *td = curthread;
4754 	struct stat sb;
4755 	fhandle_t fh;
4756 	struct mount *mp;
4757 	struct vnode *vp;
4758 	int error;
4759 
4760 	/*
4761 	 * Must be super user
4762 	 */
4763 	error = priv_check(td, PRIV_ROOT);
4764 	if (error)
4765 		return (error);
4766 
4767 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4768 	if (error)
4769 		return (error);
4770 
4771 	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
4772 		error = ESTALE;
4773 	if (error == 0) {
4774 		if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)) == 0) {
4775 			error = vn_stat(vp, &sb, td->td_ucred);
4776 			vput(vp);
4777 		}
4778 	}
4779 	if (error == 0)
4780 		error = copyout(&sb, uap->sb, sizeof(sb));
4781 	if (mp)
4782 		mount_drop(mp);
4783 
4784 	return (error);
4785 }
4786 
4787 /*
4788  * fhstatfs_args(struct fhandle *u_fhp, struct statfs *buf)
4789  */
int
sys_fhstatfs(struct fhstatfs_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct statfs *sp;
	struct mount *mp;
	struct vnode *vp;
	struct statfs sb;
	char *fullpath, *freepath;
	fhandle_t fh;
	int error;

	/*
	 * Must be super user
	 */
	if ((error = priv_check(td, PRIV_ROOT)))
		return (error);

	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
		return (error);

	/*
	 * Resolve the fsid to a mount (vfs_getvfs() returns it referenced;
	 * the mount_drop() at 'done' pairs with that reference) and make
	 * sure it is visible from the process's chroot, if any.
	 */
	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
		error = ESTALE;
		goto done;
	}
	if (p != NULL && !chroot_visible_mnt(mp, p)) {
		error = ESTALE;
		goto done;
	}

	if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)) != 0)
		goto done;
	/*
	 * NOTE(review): mp is rebound to the resolved vnode's mount, and
	 * the mount_drop() at 'done' then releases the reference through
	 * this pointer.  This assumes vp->v_mount is the same mount that
	 * vfs_getvfs() referenced -- confirm.
	 */
	mp = vp->v_mount;
	sp = &mp->mnt_stat;
	vput(vp);
	if ((error = VFS_STATFS(mp, sp, td->td_ucred)) != 0)
		goto done;

	/*
	 * Rewrite f_mntonname with the mount path as seen from the
	 * caller's namespace.  mount_path() allocates; freed via freepath.
	 */
	error = mount_path(p, mp, &fullpath, &freepath);
	if (error)
		goto done;
	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
	kfree(freepath, M_TEMP);

	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	/*
	 * Hide the fsid from unprivileged callers by copying out a scrubbed
	 * copy.  NOTE(review): this branch appears dead in practice since
	 * PRIV_ROOT was already required at the top of the function --
	 * confirm whether it is kept for symmetry with sys_statfs().
	 */
	if (priv_check(td, PRIV_ROOT)) {
		bcopy(sp, &sb, sizeof(sb));
		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
		sp = &sb;
	}
	error = copyout(sp, uap->buf, sizeof(*sp));
done:
	if (mp)
		mount_drop(mp);

	return (error);
}
4849 
4850 /*
4851  * fhstatvfs_args(struct fhandle *u_fhp, struct statvfs *buf)
4852  */
int
sys_fhstatvfs(struct fhstatvfs_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct statvfs *sp;
	struct mount *mp;
	struct vnode *vp;
	fhandle_t fh;
	int error;

	/*
	 * Must be super user
	 */
	if ((error = priv_check(td, PRIV_ROOT)))
		return (error);

	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
		return (error);

	/*
	 * Resolve the fsid to a referenced mount and verify it is visible
	 * from the process's chroot, if any.  The mount_drop() at 'done'
	 * pairs with the reference from vfs_getvfs().
	 */
	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
		error = ESTALE;
		goto done;
	}
	if (p != NULL && !chroot_visible_mnt(mp, p)) {
		error = ESTALE;
		goto done;
	}

	if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)))
		goto done;
	/*
	 * NOTE(review): mp is rebound to vp->v_mount before the vnode is
	 * released; the final mount_drop() assumes this matches the mount
	 * referenced above -- confirm.
	 */
	mp = vp->v_mount;
	sp = &mp->mnt_vstat;
	vput(vp);
	if ((error = VFS_STATVFS(mp, sp, td->td_ucred)) != 0)
		goto done;

	/* Translate the mount flags into statvfs f_flag bits. */
	sp->f_flag = 0;
	if (mp->mnt_flag & MNT_RDONLY)
		sp->f_flag |= ST_RDONLY;
	if (mp->mnt_flag & MNT_NOSUID)
		sp->f_flag |= ST_NOSUID;
	error = copyout(sp, uap->buf, sizeof(*sp));
done:
	if (mp)
		mount_drop(mp);
	return (error);
}
4901 
4902 
4903 /*
4904  * Syscall to push extended attribute configuration information into the
4905  * VFS.  Accepts a path, which it converts to a mountpoint, as well as
4906  * a command (int cmd), and attribute name and misc data.  For now, the
4907  * attribute name is left in userspace for consumption by the VFS_op.
4908  * It will probably be changed to be copied into sysspace by the
4909  * syscall in the future, once issues with various consumers of the
4910  * attribute code have raised their hands.
4911  *
4912  * Currently this is used only by UFS Extended Attributes.
4913  */
4914 int
4915 sys_extattrctl(struct extattrctl_args *uap)
4916 {
4917 	struct nlookupdata nd;
4918 	struct vnode *vp;
4919 	char attrname[EXTATTR_MAXNAMELEN];
4920 	int error;
4921 	size_t size;
4922 
4923 	attrname[0] = 0;
4924 	vp = NULL;
4925 	error = 0;
4926 
4927 	if (error == 0 && uap->filename) {
4928 		error = nlookup_init(&nd, uap->filename, UIO_USERSPACE,
4929 				     NLC_FOLLOW);
4930 		if (error == 0)
4931 			error = nlookup(&nd);
4932 		if (error == 0)
4933 			error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
4934 		nlookup_done(&nd);
4935 	}
4936 
4937 	if (error == 0 && uap->attrname) {
4938 		error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
4939 				  &size);
4940 	}
4941 
4942 	if (error == 0) {
4943 		error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4944 		if (error == 0)
4945 			error = nlookup(&nd);
4946 		if (error == 0)
4947 			error = ncp_writechk(&nd.nl_nch);
4948 		if (error == 0) {
4949 			error = VFS_EXTATTRCTL(nd.nl_nch.mount, uap->cmd, vp,
4950 					       uap->attrnamespace,
4951 					       uap->attrname, nd.nl_cred);
4952 		}
4953 		nlookup_done(&nd);
4954 	}
4955 
4956 	return (error);
4957 }
4958 
4959 /*
 * Syscall to set a named extended attribute on a file or directory.
4961  */
4962 int
4963 sys_extattr_set_file(struct extattr_set_file_args *uap)
4964 {
4965 	char attrname[EXTATTR_MAXNAMELEN];
4966 	struct nlookupdata nd;
4967 	struct vnode *vp;
4968 	struct uio auio;
4969 	struct iovec aiov;
4970 	int error;
4971 
4972 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
4973 	if (error)
4974 		return (error);
4975 
4976 	vp = NULL;
4977 
4978 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4979 	if (error == 0)
4980 		error = nlookup(&nd);
4981 	if (error == 0)
4982 		error = ncp_writechk(&nd.nl_nch);
4983 	if (error == 0)
4984 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
4985 	if (error) {
4986 		nlookup_done(&nd);
4987 		return (error);
4988 	}
4989 
4990 	bzero(&auio, sizeof(auio));
4991 	aiov.iov_base = uap->data;
4992 	aiov.iov_len = uap->nbytes;
4993 	auio.uio_iov = &aiov;
4994 	auio.uio_iovcnt = 1;
4995 	auio.uio_offset = 0;
4996 	auio.uio_resid = uap->nbytes;
4997 	auio.uio_rw = UIO_WRITE;
4998 	auio.uio_td = curthread;
4999 
5000 	error = VOP_SETEXTATTR(vp, uap->attrnamespace, attrname,
5001 			       &auio, nd.nl_cred);
5002 
5003 	vput(vp);
5004 	nlookup_done(&nd);
5005 	return (error);
5006 }
5007 
5008 /*
5009  * Syscall to get a named extended attribute on a file or directory.
5010  */
5011 int
5012 sys_extattr_get_file(struct extattr_get_file_args *uap)
5013 {
5014 	char attrname[EXTATTR_MAXNAMELEN];
5015 	struct nlookupdata nd;
5016 	struct uio auio;
5017 	struct iovec aiov;
5018 	struct vnode *vp;
5019 	int error;
5020 
5021 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
5022 	if (error)
5023 		return (error);
5024 
5025 	vp = NULL;
5026 
5027 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
5028 	if (error == 0)
5029 		error = nlookup(&nd);
5030 	if (error == 0)
5031 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_SHARED, &vp);
5032 	if (error) {
5033 		nlookup_done(&nd);
5034 		return (error);
5035 	}
5036 
5037 	bzero(&auio, sizeof(auio));
5038 	aiov.iov_base = uap->data;
5039 	aiov.iov_len = uap->nbytes;
5040 	auio.uio_iov = &aiov;
5041 	auio.uio_iovcnt = 1;
5042 	auio.uio_offset = 0;
5043 	auio.uio_resid = uap->nbytes;
5044 	auio.uio_rw = UIO_READ;
5045 	auio.uio_td = curthread;
5046 
5047 	error = VOP_GETEXTATTR(vp, uap->attrnamespace, attrname,
5048 				&auio, nd.nl_cred);
5049 	uap->sysmsg_result = uap->nbytes - auio.uio_resid;
5050 
5051 	vput(vp);
5052 	nlookup_done(&nd);
5053 	return(error);
5054 }
5055 
5056 /*
5057  * Syscall to delete a named extended attribute from a file or directory.
5058  * Accepts attribute name.  The real work happens in VOP_SETEXTATTR().
5059  */
5060 int
5061 sys_extattr_delete_file(struct extattr_delete_file_args *uap)
5062 {
5063 	char attrname[EXTATTR_MAXNAMELEN];
5064 	struct nlookupdata nd;
5065 	struct vnode *vp;
5066 	int error;
5067 
5068 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
5069 	if (error)
5070 		return(error);
5071 
5072 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
5073 	if (error == 0)
5074 		error = nlookup(&nd);
5075 	if (error == 0)
5076 		error = ncp_writechk(&nd.nl_nch);
5077 	if (error == 0) {
5078 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
5079 		if (error == 0) {
5080 			error = VOP_SETEXTATTR(vp, uap->attrnamespace,
5081 					       attrname, NULL, nd.nl_cred);
5082 			vput(vp);
5083 		}
5084 	}
5085 	nlookup_done(&nd);
5086 	return(error);
5087 }
5088 
5089 /*
5090  * Determine if the mount is visible to the process.
5091  */
5092 static int
5093 chroot_visible_mnt(struct mount *mp, struct proc *p)
5094 {
5095 	struct nchandle nch;
5096 
5097 	/*
5098 	 * Traverse from the mount point upwards.  If we hit the process
5099 	 * root then the mount point is visible to the process.
5100 	 */
5101 	nch = mp->mnt_ncmountpt;
5102 	while (nch.ncp) {
5103 		if (nch.mount == p->p_fd->fd_nrdir.mount &&
5104 		    nch.ncp == p->p_fd->fd_nrdir.ncp) {
5105 			return(1);
5106 		}
5107 		if (nch.ncp == nch.mount->mnt_ncmountpt.ncp) {
5108 			nch = nch.mount->mnt_ncmounton;
5109 		} else {
5110 			nch.ncp = nch.ncp->nc_parent;
5111 		}
5112 	}
5113 
5114 	/*
5115 	 * If the mount point is not visible to the process, but the
5116 	 * process root is in a subdirectory of the mount, return
5117 	 * TRUE anyway.
5118 	 */
5119 	if (p->p_fd->fd_nrdir.mount == mp)
5120 		return(1);
5121 
5122 	return(0);
5123 }
5124 
5125