xref: /dflybsd-src/sys/kern/vfs_syscalls.c (revision 9d62a0f98f0de1bb82fb9d9c510d597e1e9a188d)
1 /*
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
39  * $FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.151.2.18 2003/04/04 20:35:58 tegge Exp $
40  * $DragonFly: src/sys/kern/vfs_syscalls.c,v 1.121 2007/09/10 15:08:43 dillon Exp $
41  */
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/buf.h>
46 #include <sys/conf.h>
47 #include <sys/sysent.h>
48 #include <sys/malloc.h>
49 #include <sys/mount.h>
50 #include <sys/mountctl.h>
51 #include <sys/sysproto.h>
52 #include <sys/filedesc.h>
53 #include <sys/kernel.h>
54 #include <sys/fcntl.h>
55 #include <sys/file.h>
56 #include <sys/linker.h>
57 #include <sys/stat.h>
58 #include <sys/unistd.h>
59 #include <sys/vnode.h>
60 #include <sys/proc.h>
61 #include <sys/namei.h>
62 #include <sys/nlookup.h>
63 #include <sys/dirent.h>
64 #include <sys/extattr.h>
65 #include <sys/spinlock.h>
66 #include <sys/kern_syscall.h>
67 #include <sys/objcache.h>
68 #include <sys/sysctl.h>
69 #include <sys/file2.h>
70 #include <sys/spinlock2.h>
71 
72 #include <vm/vm.h>
73 #include <vm/vm_object.h>
74 #include <vm/vm_page.h>
75 
76 #include <machine/limits.h>
77 #include <machine/stdarg.h>
78 
79 #include <vfs/union/union.h>
80 
81 static void mount_warning(struct mount *mp, const char *ctl, ...);
82 static int mount_path(struct proc *p, struct mount *mp, char **rb, char **fb);
83 static int checkvp_chdir (struct vnode *vn, struct thread *td);
84 static void checkdirs (struct nchandle *old_nch, struct nchandle *new_nch);
85 static int chroot_refuse_vdir_fds (struct filedesc *fdp);
86 static int chroot_visible_mnt(struct mount *mp, struct proc *p);
87 static int getutimes (const struct timeval *, struct timespec *);
88 static int setfown (struct vnode *, uid_t, gid_t);
89 static int setfmode (struct vnode *, int);
90 static int setfflags (struct vnode *, int);
91 static int setutimes (struct vnode *, const struct timespec *, int);
92 static int	usermount = 0;	/* if 1, non-root can mount fs. */
93 
94 int (*union_dircheckp) (struct thread *, struct vnode **, struct file *);
95 
96 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "");
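
/*
 * Illustrative usage (not part of this file): setting the sysctl
 * vfs.usermount to 1 allows unprivileged users to mount filesystems;
 * sys_mount() below still silently forces MNT_NOSUID and MNT_NODEV
 * for non-root callers.
 */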
97 
98 /*
99  * Virtual File System System Calls
100  */
101 
102 /*
103  * Mount a file system.
104  */
105 /*
106  * mount_args(char *type, char *path, int flags, caddr_t data)
107  */
108 /* ARGSUSED */
109 int
110 sys_mount(struct mount_args *uap)
111 {
112 	struct thread *td = curthread;
113 	struct proc *p = td->td_proc;
114 	struct vnode *vp;
115 	struct nchandle nch;
116 	struct mount *mp;
117 	struct vfsconf *vfsp;
118 	int error, flag = 0, flag2 = 0;
119 	int hasmount;
120 	struct vattr va;
121 	struct nlookupdata nd;
122 	char fstypename[MFSNAMELEN];
123 	struct ucred *cred = p->p_ucred;
124 
125 	KKASSERT(p);
126 	if (cred->cr_prison != NULL)
127 		return (EPERM);
128 	if (usermount == 0 && (error = suser(td)))
129 		return (error);
130 	/*
131 	 * Do not allow NFS export by non-root users.
132 	 */
133 	if (uap->flags & MNT_EXPORTED) {
134 		error = suser(td);
135 		if (error)
136 			return (error);
137 	}
138 	/*
139 	 * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users
140 	 */
141 	if (suser(td))
142 		uap->flags |= MNT_NOSUID | MNT_NODEV;
143 
144 	/*
145 	 * Lookup the requested path and extract the nch and vnode.
146 	 */
147 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
148 	if (error == 0) {
149 		if ((error = nlookup(&nd)) == 0) {
150 			if (nd.nl_nch.ncp->nc_vp == NULL)
151 				error = ENOENT;
152 		}
153 	}
154 	if (error) {
155 		nlookup_done(&nd);
156 		return (error);
157 	}
158 
159 	/*
160 	 * Extract the locked+refd ncp and cleanup the nd structure
161 	 */
162 	nch = nd.nl_nch;
163 	cache_zero(&nd.nl_nch);
164 	nlookup_done(&nd);
165 
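	/*
	 * Note whether something is already mounted on this directory.
	 * Stacking a new mount on top of an existing mount point is
	 * refused with EBUSY below.
	 */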
166 	if ((nch.ncp->nc_flag & NCF_ISMOUNTPT) && cache_findmount(&nch))
167 		hasmount = 1;
168 	else
169 		hasmount = 0;
170 
171 
172 	/*
173 	 * now we have the locked ref'd nch and unreferenced vnode.
174 	 */
175 	vp = nch.ncp->nc_vp;
176 	if ((error = vget(vp, LK_EXCLUSIVE)) != 0) {
177 		cache_put(&nch);
178 		return (error);
179 	}
180 	cache_unlock(&nch);
181 
182 	/*
183 	 * Now we have an unlocked ref'd nch and a locked ref'd vp
184 	 */
185 	if (uap->flags & MNT_UPDATE) {
186 		if ((vp->v_flag & VROOT) == 0) {
187 			cache_drop(&nch);
188 			vput(vp);
189 			return (EINVAL);
190 		}
191 		mp = vp->v_mount;
192 		flag = mp->mnt_flag;
193 		flag2 = mp->mnt_kern_flag;
194 		/*
195 		 * We only allow the filesystem to be reloaded if it
196 		 * is currently mounted read-only.
197 		 */
198 		if ((uap->flags & MNT_RELOAD) &&
199 		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
200 			cache_drop(&nch);
201 			vput(vp);
202 			return (EOPNOTSUPP);	/* Needs translation */
203 		}
204 		/*
205 		 * Only root, or the user that did the original mount, is
206 		 * permitted to update it.
207 		 */
208 		if (mp->mnt_stat.f_owner != cred->cr_uid &&
209 		    (error = suser(td))) {
210 			cache_drop(&nch);
211 			vput(vp);
212 			return (error);
213 		}
214 		if (vfs_busy(mp, LK_NOWAIT)) {
215 			cache_drop(&nch);
216 			vput(vp);
217 			return (EBUSY);
218 		}
219 		if ((vp->v_flag & VMOUNT) != 0 || hasmount) {
220 			cache_drop(&nch);
221 			vfs_unbusy(mp);
222 			vput(vp);
223 			return (EBUSY);
224 		}
225 		vp->v_flag |= VMOUNT;
226 		mp->mnt_flag |=
227 		    uap->flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
228 		vn_unlock(vp);
229 		goto update;
230 	}
231 	/*
232 	 * If the user is not root, ensure that they own the directory
233 	 * onto which we are attempting to mount.
234 	 */
235 	if ((error = VOP_GETATTR(vp, &va)) ||
236 	    (va.va_uid != cred->cr_uid && (error = suser(td)))) {
237 		cache_drop(&nch);
238 		vput(vp);
239 		return (error);
240 	}
241 	if ((error = vinvalbuf(vp, V_SAVE, 0, 0)) != 0) {
242 		cache_drop(&nch);
243 		vput(vp);
244 		return (error);
245 	}
246 	if (vp->v_type != VDIR) {
247 		cache_drop(&nch);
248 		vput(vp);
249 		return (ENOTDIR);
250 	}
251 	if (vp->v_mount->mnt_kern_flag & MNTK_NOSTKMNT) {
252 		cache_drop(&nch);
253 		vput(vp);
254 		return (EPERM);
255 	}
256 	if ((error = copyinstr(uap->type, fstypename, MFSNAMELEN, NULL)) != 0) {
257 		cache_drop(&nch);
258 		vput(vp);
259 		return (error);
260 	}
261 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
262 		if (!strcmp(vfsp->vfc_name, fstypename))
263 			break;
264 	}
265 	if (vfsp == NULL) {
266 		linker_file_t lf;
267 
268 		/* Only load modules for root (very important!) */
269 		if ((error = suser(td)) != 0) {
270 			cache_drop(&nch);
271 			vput(vp);
272 			return error;
273 		}
274 		error = linker_load_file(fstypename, &lf);
275 		if (error || lf == NULL) {
276 			cache_drop(&nch);
277 			vput(vp);
278 			if (lf == NULL)
279 				error = ENODEV;
280 			return error;
281 		}
282 		lf->userrefs++;
283 		/* lookup again, see if the VFS was loaded */
284 		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
285 			if (!strcmp(vfsp->vfc_name, fstypename))
286 				break;
287 		}
288 		if (vfsp == NULL) {
289 			lf->userrefs--;
290 			linker_file_unload(lf);
291 			cache_drop(&nch);
292 			vput(vp);
293 			return (ENODEV);
294 		}
295 	}
296 	if ((vp->v_flag & VMOUNT) != 0 || hasmount) {
297 		cache_drop(&nch);
298 		vput(vp);
299 		return (EBUSY);
300 	}
301 	vp->v_flag |= VMOUNT;
302 
303 	/*
304 	 * Allocate and initialize the filesystem.
305 	 */
306 	mp = kmalloc(sizeof(struct mount), M_MOUNT, M_ZERO|M_WAITOK);
307 	TAILQ_INIT(&mp->mnt_nvnodelist);
308 	TAILQ_INIT(&mp->mnt_reservedvnlist);
309 	TAILQ_INIT(&mp->mnt_jlist);
310 	mp->mnt_nvnodelistsize = 0;
311 	lockinit(&mp->mnt_lock, "vfslock", 0, 0);
312 	vfs_busy(mp, LK_NOWAIT);
313 	mp->mnt_op = vfsp->vfc_vfsops;
314 	mp->mnt_vfc = vfsp;
315 	vfsp->vfc_refcount++;
316 	mp->mnt_stat.f_type = vfsp->vfc_typenum;
317 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
318 	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
319 	mp->mnt_stat.f_owner = cred->cr_uid;
320 	mp->mnt_iosize_max = DFLTPHYS;
321 	vn_unlock(vp);
322 update:
323 	/*
324 	 * Set the mount level flags.
325 	 */
326 	if (uap->flags & MNT_RDONLY)
327 		mp->mnt_flag |= MNT_RDONLY;
328 	else if (mp->mnt_flag & MNT_RDONLY)
329 		mp->mnt_kern_flag |= MNTK_WANTRDWR;
330 	mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
331 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME |
332 	    MNT_NOSYMFOLLOW | MNT_IGNORE |
333 	    MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR);
334 	mp->mnt_flag |= uap->flags & (MNT_NOSUID | MNT_NOEXEC |
335 	    MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE |
336 	    MNT_NOSYMFOLLOW | MNT_IGNORE |
337 	    MNT_NOATIME | MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR);
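	/*
	 * Note that the user-tunable flags are cleared above and then
	 * re-set from uap->flags, so an MNT_UPDATE can turn options off
	 * as well as on.
	 */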
338 	/*
339 	 * Mount the filesystem.
340 	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
341 	 * get.
342 	 */
343 	error = VFS_MOUNT(mp, uap->path, uap->data, cred);
344 	if (mp->mnt_flag & MNT_UPDATE) {
345 		if (mp->mnt_kern_flag & MNTK_WANTRDWR)
346 			mp->mnt_flag &= ~MNT_RDONLY;
347 		mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
348 		mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
349 		if (error) {
350 			mp->mnt_flag = flag;
351 			mp->mnt_kern_flag = flag2;
352 		}
353 		vfs_unbusy(mp);
354 		vp->v_flag &= ~VMOUNT;
355 		vrele(vp);
356 		cache_drop(&nch);
357 		return (error);
358 	}
359 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
360 	/*
361 	 * Put the new filesystem on the mount list after root.  The mount
362 	 * point gets its own mnt_ncmountpt (unless the VFS already set one
363 	 * up) which represents the root of the mount.  The lookup code
364 	 * detects the mount point going forward and checks the root of
365 	 * the mount going backwards.
366 	 *
367 	 * It is not necessary to invalidate or purge the vnode underneath
368 	 * because elements under the mount will be given their own glue
369 	 * namecache record.
370 	 */
371 	if (!error) {
372 		if (mp->mnt_ncmountpt.ncp == NULL) {
373 			/*
374 			 * allocate, then unlock, but leave the ref intact
375 			 */
376 			cache_allocroot(&mp->mnt_ncmountpt, mp, NULL);
377 			cache_unlock(&mp->mnt_ncmountpt);
378 		}
379 		mp->mnt_ncmounton = nch;		/* inherits ref */
380 		nch.ncp->nc_flag |= NCF_ISMOUNTPT;
381 
382 		/* XXX get the root of the fs and cache_setvp(mnt_ncmountpt...) */
383 		vp->v_flag &= ~VMOUNT;
384 		mountlist_insert(mp, MNTINS_LAST);
385 		checkdirs(&mp->mnt_ncmounton, &mp->mnt_ncmountpt);
386 		vn_unlock(vp);
387 		error = vfs_allocate_syncvnode(mp);
388 		vfs_unbusy(mp);
389 		error = VFS_START(mp, 0);
390 		vrele(vp);
391 	} else {
392 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
393 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
394 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
395 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
396 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
397 		vp->v_flag &= ~VMOUNT;
398 		mp->mnt_vfc->vfc_refcount--;
399 		vfs_unbusy(mp);
400 		kfree(mp, M_MOUNT);
401 		cache_drop(&nch);
402 		vput(vp);
403 	}
404 	return (error);
405 }
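
/*
 * Illustrative only (not part of the kernel sources): a userland mount
 * of a UFS volume that ends up in sys_mount() above might look something
 * like
 *
 *	struct ufs_args args = { .fspec = "/dev/ad0s1a" };
 *	mount("ufs", "/mnt", MNT_RDONLY, &args);
 *
 * This layer only copies in the fstype name and resolves the path; the
 * "data" argument is interpreted by the individual VFS in VFS_MOUNT().
 */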
406 
407 /*
408  * Scan all active processes to see if any of them have a current
409  * or root directory onto which the new filesystem has just been
410  * mounted. If so, replace them with the new mount point.
411  *
412  * The passed ncp is ref'd and locked (from the mount code) and
413  * must be associated with the vnode representing the root of the
414  * mount point.
415  */
416 struct checkdirs_info {
417 	struct nchandle old_nch;
418 	struct nchandle new_nch;
419 	struct vnode *old_vp;
420 	struct vnode *new_vp;
421 };
422 
423 static int checkdirs_callback(struct proc *p, void *data);
424 
425 static void
426 checkdirs(struct nchandle *old_nch, struct nchandle *new_nch)
427 {
428 	struct checkdirs_info info;
429 	struct vnode *olddp;
430 	struct vnode *newdp;
431 	struct mount *mp;
432 
433 	/*
434 	 * If the old mount point's vnode has a usecount of 1, it is not
435 	 * being held as a descriptor anywhere.
436 	 */
437 	olddp = old_nch->ncp->nc_vp;
438 	if (olddp == NULL || olddp->v_sysref.refcnt == 1)
439 		return;
440 
441 	/*
442 	 * Force the root vnode of the new mount point to be resolved
443 	 * so we can update any matching processes.
444 	 */
445 	mp = new_nch->mount;
446 	if (VFS_ROOT(mp, &newdp))
447 		panic("mount: lost mount");
448 	cache_setunresolved(new_nch);
449 	cache_setvp(new_nch, newdp);
450 
451 	/*
452 	 * Special handling of the root node
453 	 */
454 	if (rootvnode == olddp) {
455 		vref(newdp);
456 		vfs_cache_setroot(newdp, cache_hold(new_nch));
457 	}
458 
459 	/*
460 	 * Pass newdp separately so the callback does not have to access
461 	 * it via new_nch->ncp->nc_vp.
462 	 */
463 	info.old_nch = *old_nch;
464 	info.new_nch = *new_nch;
465 	info.new_vp = newdp;
466 	allproc_scan(checkdirs_callback, &info);
467 	vput(newdp);
468 }
469 
470 /*
471  * NOTE: callback is not MP safe because the scanned process's filedesc
472  * structure can be ripped out from under us, among other things.
473  */
474 static int
475 checkdirs_callback(struct proc *p, void *data)
476 {
477 	struct checkdirs_info *info = data;
478 	struct filedesc *fdp;
479 	struct nchandle ncdrop1;
480 	struct nchandle ncdrop2;
481 	struct vnode *vprele1;
482 	struct vnode *vprele2;
483 
484 	if ((fdp = p->p_fd) != NULL) {
485 		cache_zero(&ncdrop1);
486 		cache_zero(&ncdrop2);
487 		vprele1 = NULL;
488 		vprele2 = NULL;
489 
490 		/*
491 		 * MPUNSAFE - XXX fdp can be pulled out from under a
492 		 * foreign process.
493 		 *
494 		 * A shared filedesc is ok, we don't have to copy it
495 		 * because we are making this change globally.
496 		 */
497 		spin_lock_wr(&fdp->fd_spin);
498 		if (fdp->fd_ncdir.mount == info->old_nch.mount &&
499 		    fdp->fd_ncdir.ncp == info->old_nch.ncp) {
500 			vprele1 = fdp->fd_cdir;
501 			vref(info->new_vp);
502 			fdp->fd_cdir = info->new_vp;
503 			ncdrop1 = fdp->fd_ncdir;
504 			cache_copy(&info->new_nch, &fdp->fd_ncdir);
505 		}
506 		if (fdp->fd_nrdir.mount == info->old_nch.mount &&
507 		    fdp->fd_nrdir.ncp == info->old_nch.ncp) {
508 			vprele2 = fdp->fd_rdir;
509 			vref(info->new_vp);
510 			fdp->fd_rdir = info->new_vp;
511 			ncdrop2 = fdp->fd_nrdir;
512 			cache_copy(&info->new_nch, &fdp->fd_nrdir);
513 		}
514 		spin_unlock_wr(&fdp->fd_spin);
515 		if (ncdrop1.ncp)
516 			cache_drop(&ncdrop1);
517 		if (ncdrop2.ncp)
518 			cache_drop(&ncdrop2);
519 		if (vprele1)
520 			vrele(vprele1);
521 		if (vprele2)
522 			vrele(vprele2);
523 	}
524 	return(0);
525 }
526 
527 /*
528  * Unmount a file system.
529  *
530  * Note: unmount takes a path to the vnode mounted on as its argument,
531  * not the special file (as before).
532  */
533 /*
534  * umount_args(char *path, int flags)
535  */
536 /* ARGSUSED */
537 int
538 sys_unmount(struct unmount_args *uap)
539 {
540 	struct thread *td = curthread;
541 	struct proc *p = td->td_proc;
542 	struct mount *mp = NULL;
543 	int error;
544 	struct nlookupdata nd;
545 
546 	KKASSERT(p);
547 	if (p->p_ucred->cr_prison != NULL)
548 		return (EPERM);
549 	if (usermount == 0 && (error = suser(td)))
550 		return (error);
551 
552 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
553 	if (error == 0)
554 		error = nlookup(&nd);
555 	if (error)
556 		goto out;
557 
558 	mp = nd.nl_nch.mount;
559 
560 	/*
561 	 * Only root, or the user that did the original mount, is
562 	 * permitted to unmount this filesystem.
563 	 */
564 	if ((mp->mnt_stat.f_owner != p->p_ucred->cr_uid) &&
565 	    (error = suser(td)))
566 		goto out;
567 
568 	/*
569 	 * Don't allow unmounting the root file system.
570 	 */
571 	if (mp->mnt_flag & MNT_ROOTFS) {
572 		error = EINVAL;
573 		goto out;
574 	}
575 
576 	/*
577 	 * Must be the root of the filesystem
578 	 */
579 	if (nd.nl_nch.ncp != mp->mnt_ncmountpt.ncp) {
580 		error = EINVAL;
581 		goto out;
582 	}
583 
584 out:
585 	nlookup_done(&nd);
586 	if (error)
587 		return (error);
588 	return (dounmount(mp, uap->flags));
589 }
590 
591 /*
592  * Do the actual file system unmount.
593  */
594 static int
595 dounmount_interlock(struct mount *mp)
596 {
597 	if (mp->mnt_kern_flag & MNTK_UNMOUNT)
598 		return (EBUSY);
599 	mp->mnt_kern_flag |= MNTK_UNMOUNT;
600 	return(0);
601 }
602 
603 int
604 dounmount(struct mount *mp, int flags)
605 {
606 	struct namecache *ncp;
607 	struct nchandle nch;
608 	int error;
609 	int async_flag;
610 	int lflags;
611 	int freeok = 1;
612 
613 	/*
614 	 * Exclusive access for unmounting purposes
615 	 */
616 	if ((error = mountlist_interlock(dounmount_interlock, mp)) != 0)
617 		return (error);
618 
619 	/*
620 	 * Allow filesystems to detect that a forced unmount is in progress.
621 	 */
622 	if (flags & MNT_FORCE)
623 		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
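	/*
	 * A non-forced unmount uses LK_NOWAIT below, so a contended mount
	 * lock fails immediately instead of blocking; MNT_FORCE waits for
	 * the lock.
	 */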
624 	lflags = LK_EXCLUSIVE | ((flags & MNT_FORCE) ? 0 : LK_NOWAIT);
625 	error = lockmgr(&mp->mnt_lock, lflags);
626 	if (error) {
627 		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
628 		if (mp->mnt_kern_flag & MNTK_MWAIT)
629 			wakeup(mp);
630 		return (error);
631 	}
632 
633 	if (mp->mnt_flag & MNT_EXPUBLIC)
634 		vfs_setpublicfs(NULL, NULL, NULL);
635 
636 	vfs_msync(mp, MNT_WAIT);
637 	async_flag = mp->mnt_flag & MNT_ASYNC;
638 	mp->mnt_flag &=~ MNT_ASYNC;
639 
640 	/*
641 	 * If this filesystem isn't aliasing other filesystems,
642 	 * try to invalidate any remaining namecache entries and
643 	 * check the count afterwards.
644 	 */
645 	if ((mp->mnt_kern_flag & MNTK_NCALIASED) == 0) {
646 		cache_lock(&mp->mnt_ncmountpt);
647 		cache_inval(&mp->mnt_ncmountpt, CINV_DESTROY|CINV_CHILDREN);
648 		cache_unlock(&mp->mnt_ncmountpt);
649 
650 		if ((ncp = mp->mnt_ncmountpt.ncp) != NULL &&
651 		    (ncp->nc_refs != 1 || TAILQ_FIRST(&ncp->nc_list))) {
652 
653 			if ((flags & MNT_FORCE) == 0) {
654 				error = EBUSY;
655 				mount_warning(mp, "Cannot unmount: "
656 						  "%d namecache "
657 						  "references still "
658 						  "present",
659 						  ncp->nc_refs - 1);
660 			} else {
661 				mount_warning(mp, "Forced unmount: "
662 						  "%d namecache "
663 						  "references still "
664 						  "present",
665 						  ncp->nc_refs - 1);
666 				freeok = 0;
667 			}
668 		}
669 	}
670 
671 	/*
672 	 * nchandle records ref the mount structure.  Expect a count of 1
673 	 * (our mount->mnt_ncmountpt).
674 	 */
675 	if (mp->mnt_refs != 1) {
676 		if ((flags & MNT_FORCE) == 0) {
677 			mount_warning(mp, "Cannot unmount: "
678 					  "%d process references still "
679 					  "present", mp->mnt_refs);
680 			error = EBUSY;
681 		} else {
682 			mount_warning(mp, "Forced unmount: "
683 					  "%d process references still "
684 					  "present", mp->mnt_refs);
685 			freeok = 0;
686 		}
687 	}
688 
689 	if (error == 0) {
690 		if (mp->mnt_syncer != NULL)
691 			vrele(mp->mnt_syncer);
692 		if (((mp->mnt_flag & MNT_RDONLY) ||
693 		     (error = VFS_SYNC(mp, MNT_WAIT)) == 0) ||
694 		    (flags & MNT_FORCE)) {
695 			error = VFS_UNMOUNT(mp, flags);
696 		}
697 	}
698 	if (error) {
699 		if (mp->mnt_syncer == NULL)
700 			vfs_allocate_syncvnode(mp);
701 		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
702 		mp->mnt_flag |= async_flag;
703 		lockmgr(&mp->mnt_lock, LK_RELEASE);
704 		if (mp->mnt_kern_flag & MNTK_MWAIT)
705 			wakeup(mp);
706 		return (error);
707 	}
708 	/*
709 	 * Clean up any journals still associated with the mount after
710 	 * filesystem activity has ceased.
711 	 */
712 	journal_remove_all_journals(mp,
713 	    ((flags & MNT_FORCE) ? MC_JOURNAL_STOP_IMM : 0));
714 
715 	mountlist_remove(mp);
716 
717 	/*
718 	 * Remove any installed vnode ops here so the individual VFSs don't
719 	 * have to.
720 	 */
721 	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
722 	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
723 	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
724 	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
725 	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
726 
727 	if (mp->mnt_ncmountpt.ncp != NULL) {
728 		nch = mp->mnt_ncmountpt;
729 		cache_zero(&mp->mnt_ncmountpt);
730 		cache_clrmountpt(&nch);
731 		cache_drop(&nch);
732 	}
733 	if (mp->mnt_ncmounton.ncp != NULL) {
734 		nch = mp->mnt_ncmounton;
735 		cache_zero(&mp->mnt_ncmounton);
736 		cache_clrmountpt(&nch);
737 		cache_drop(&nch);
738 	}
739 
740 	mp->mnt_vfc->vfc_refcount--;
741 	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist))
742 		panic("unmount: dangling vnode");
743 	lockmgr(&mp->mnt_lock, LK_RELEASE);
744 	if (mp->mnt_kern_flag & MNTK_MWAIT)
745 		wakeup(mp);
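	/*
	 * If a forced unmount left namecache or process references behind
	 * (freeok == 0), the mount structure is intentionally leaked rather
	 * than freed so those stale references do not dangle.
	 */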
746 	if (freeok)
747 		kfree(mp, M_MOUNT);
748 	return (0);
749 }
750 
751 static
752 void
753 mount_warning(struct mount *mp, const char *ctl, ...)
754 {
755 	char *ptr;
756 	char *buf;
757 	__va_list va;
758 
759 	__va_start(va, ctl);
760 	if (cache_fullpath(NULL, &mp->mnt_ncmounton, &ptr, &buf) == 0) {
761 		kprintf("unmount(%s): ", ptr);
762 		kvprintf(ctl, va);
763 		kprintf("\n");
764 		kfree(buf, M_TEMP);
765 	} else {
766 		kprintf("unmount(%p", mp);
767 		if (mp->mnt_ncmounton.ncp && mp->mnt_ncmounton.ncp->nc_name)
768 			kprintf(",%s", mp->mnt_ncmounton.ncp->nc_name);
769 		kprintf("): ");
770 		kvprintf(ctl, va);
771 		kprintf("\n");
772 	}
773 	__va_end(va);
774 }
775 
776 /*
777  * Shim cache_fullpath() to handle the case where a process is chrooted into
778  * a subdirectory of a mount.  In this case, if the root mount matches the
779  * process root directory's mount, we have to specify the process's root
780  * directory instead of the mount point, because the mount point might
781  * be above the root directory.
782  */
783 static
784 int
785 mount_path(struct proc *p, struct mount *mp, char **rb, char **fb)
786 {
787 	struct nchandle *nch;
788 
789 	if (p && p->p_fd->fd_nrdir.mount == mp)
790 		nch = &p->p_fd->fd_nrdir;
791 	else
792 		nch = &mp->mnt_ncmountpt;
793 	return(cache_fullpath(p, nch, rb, fb));
794 }
795 
796 /*
797  * Sync each mounted filesystem.
798  */
799 
800 #ifdef DEBUG
801 static int syncprt = 0;
802 SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
803 #endif /* DEBUG */
804 
805 static int sync_callback(struct mount *mp, void *data);
806 
807 /* ARGSUSED */
808 int
809 sys_sync(struct sync_args *uap)
810 {
811 	mountlist_scan(sync_callback, NULL, MNTSCAN_FORWARD);
812 #ifdef DEBUG
813 	/*
814 	 * print out buffer pool stat information on each sync() call.
815 	 */
816 	if (syncprt)
817 		vfs_bufstats();
818 #endif /* DEBUG */
819 	return (0);
820 }
821 
822 static
823 int
824 sync_callback(struct mount *mp, void *data __unused)
825 {
826 	int asyncflag;
827 
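	/*
	 * Temporarily clear MNT_ASYNC while syncing so the filesystem does
	 * not simply defer the writes, then restore the previous setting.
	 */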
828 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
829 		asyncflag = mp->mnt_flag & MNT_ASYNC;
830 		mp->mnt_flag &= ~MNT_ASYNC;
831 		vfs_msync(mp, MNT_NOWAIT);
832 		VFS_SYNC(mp, MNT_NOWAIT);
833 		mp->mnt_flag |= asyncflag;
834 	}
835 	return(0);
836 }
837 
838 /* XXX PRISON: could be per prison flag */
839 static int prison_quotas;
840 #if 0
841 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
842 #endif
843 
844 /*
845  * quotactl_args(char *path, int cmd, int uid, caddr_t arg)
846  *
847  * Change filesystem quotas.
848  */
849 /* ARGSUSED */
850 int
851 sys_quotactl(struct quotactl_args *uap)
852 {
853 	struct nlookupdata nd;
854 	struct thread *td;
855 	struct proc *p;
856 	struct mount *mp;
857 	int error;
858 
859 	td = curthread;
860 	p = td->td_proc;
861 	if (p->p_ucred->cr_prison && !prison_quotas)
862 		return (EPERM);
863 
864 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
865 	if (error == 0)
866 		error = nlookup(&nd);
867 	if (error == 0) {
868 		mp = nd.nl_nch.mount;
869 		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid,
870 				    uap->arg, nd.nl_cred);
871 	}
872 	nlookup_done(&nd);
873 	return (error);
874 }
875 
876 /*
877  * mountctl(char *path, int op, int fd, const void *ctl, int ctllen,
878  *		void *buf, int buflen)
879  *
880  * This function operates on a mount point and executes the specified
881  * operation using the specified control data, and possibly returns data.
882  *
883  * On success, the actual number of bytes stored in the result buffer is
884  * returned (0 if none); otherwise an error is returned.
885  */
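/*
 * Illustrative only: a userland call reaching this handler might look
 * roughly like
 *
 *	mountctl("/home", MOUNTCTL_INSTALL_VFS_JOURNAL, fd,
 *		 &info, sizeof(info), NULL, 0);
 *
 * with the op values and control structures taken from <sys/mountctl.h>.
 * A negative fd means no descriptor is associated with the operation.
 */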
886 /* ARGSUSED */
887 int
888 sys_mountctl(struct mountctl_args *uap)
889 {
890 	struct thread *td = curthread;
891 	struct proc *p = td->td_proc;
892 	struct file *fp;
893 	void *ctl = NULL;
894 	void *buf = NULL;
895 	char *path = NULL;
896 	int error;
897 
898 	/*
899 	 * Sanity and permissions checks.  We must be root.
900 	 */
901 	KKASSERT(p);
902 	if (p->p_ucred->cr_prison != NULL)
903 		return (EPERM);
904 	if ((error = suser(td)) != 0)
905 		return (error);
906 
907 	/*
908 	 * Argument length checks
909 	 */
910 	if (uap->ctllen < 0 || uap->ctllen > 1024)
911 		return (EINVAL);
912 	if (uap->buflen < 0 || uap->buflen > 16 * 1024)
913 		return (EINVAL);
914 	if (uap->path == NULL)
915 		return (EINVAL);
916 
917 	/*
918 	 * Allocate the necessary buffers and copyin data
919 	 */
920 	path = objcache_get(namei_oc, M_WAITOK);
921 	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
922 	if (error)
923 		goto done;
924 
925 	if (uap->ctllen) {
926 		ctl = kmalloc(uap->ctllen + 1, M_TEMP, M_WAITOK|M_ZERO);
927 		error = copyin(uap->ctl, ctl, uap->ctllen);
928 		if (error)
929 			goto done;
930 	}
931 	if (uap->buflen)
932 		buf = kmalloc(uap->buflen + 1, M_TEMP, M_WAITOK|M_ZERO);
933 
934 	/*
935 	 * Validate the descriptor
936 	 */
937 	if (uap->fd >= 0) {
938 		fp = holdfp(p->p_fd, uap->fd, -1);
939 		if (fp == NULL) {
940 			error = EBADF;
941 			goto done;
942 		}
943 	} else {
944 		fp = NULL;
945 	}
946 
947 	/*
948 	 * Execute the internal kernel function and clean up.
949 	 */
950 	error = kern_mountctl(path, uap->op, fp, ctl, uap->ctllen, buf, uap->buflen, &uap->sysmsg_result);
951 	if (fp)
952 		fdrop(fp);
953 	if (error == 0 && uap->sysmsg_result > 0)
954 		error = copyout(buf, uap->buf, uap->sysmsg_result);
955 done:
956 	if (path)
957 		objcache_put(namei_oc, path);
958 	if (ctl)
959 		kfree(ctl, M_TEMP);
960 	if (buf)
961 		kfree(buf, M_TEMP);
962 	return (error);
963 }
964 
965 /*
966  * Execute a mount control operation by resolving the path to a mount point
967  * and calling vop_mountctl().
968  */
969 int
970 kern_mountctl(const char *path, int op, struct file *fp,
971 		const void *ctl, int ctllen,
972 		void *buf, int buflen, int *res)
973 {
974 	struct vnode *vp;
975 	struct mount *mp;
976 	struct nlookupdata nd;
977 	int error;
978 
979 	*res = 0;
980 	vp = NULL;
981 	error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_FOLLOW);
982 	if (error == 0)
983 		error = nlookup(&nd);
984 	if (error == 0)
985 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
986 	nlookup_done(&nd);
987 	if (error)
988 		return (error);
989 
990 	mp = vp->v_mount;
991 
992 	/*
993 	 * Must be the root of the filesystem
994 	 */
995 	if ((vp->v_flag & VROOT) == 0) {
996 		vput(vp);
997 		return (EINVAL);
998 	}
999 	error = vop_mountctl(mp->mnt_vn_use_ops, op, fp, ctl, ctllen,
1000 				buf, buflen, res);
1001 	vput(vp);
1002 	return (error);
1003 }
1004 
1005 int
1006 kern_statfs(struct nlookupdata *nd, struct statfs *buf)
1007 {
1008 	struct thread *td = curthread;
1009 	struct proc *p = td->td_proc;
1010 	struct mount *mp;
1011 	struct statfs *sp;
1012 	char *fullpath, *freepath;
1013 	int error;
1014 
1015 	if ((error = nlookup(nd)) != 0)
1016 		return (error);
1017 	mp = nd->nl_nch.mount;
1018 	sp = &mp->mnt_stat;
1019 	if ((error = VFS_STATFS(mp, sp, nd->nl_cred)) != 0)
1020 		return (error);
1021 
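	/*
	 * Rewrite f_mntonname relative to the caller's root directory so a
	 * chrooted process sees a path it can actually reach.
	 */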
1022 	error = mount_path(p, mp, &fullpath, &freepath);
1023 	if (error)
1024 		return(error);
1025 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1026 	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1027 	kfree(freepath, M_TEMP);
1028 
1029 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1030 	bcopy(sp, buf, sizeof(*buf));
1031 	/* Only root should have access to the fsid's. */
1032 	if (suser(td))
1033 		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
1034 	return (0);
1035 }
1036 
1037 /*
1038  * statfs_args(char *path, struct statfs *buf)
1039  *
1040  * Get filesystem statistics.
1041  */
1042 int
1043 sys_statfs(struct statfs_args *uap)
1044 {
1045 	struct nlookupdata nd;
1046 	struct statfs buf;
1047 	int error;
1048 
1049 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1050 	if (error == 0)
1051 		error = kern_statfs(&nd, &buf);
1052 	nlookup_done(&nd);
1053 	if (error == 0)
1054 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1055 	return (error);
1056 }
1057 
1058 int
1059 kern_fstatfs(int fd, struct statfs *buf)
1060 {
1061 	struct thread *td = curthread;
1062 	struct proc *p = td->td_proc;
1063 	struct file *fp;
1064 	struct mount *mp;
1065 	struct statfs *sp;
1066 	char *fullpath, *freepath;
1067 	int error;
1068 
1069 	KKASSERT(p);
1070 	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
1071 		return (error);
1072 	mp = ((struct vnode *)fp->f_data)->v_mount;
1073 	if (mp == NULL) {
1074 		error = EBADF;
1075 		goto done;
1076 	}
1077 	if (fp->f_cred == NULL) {
1078 		error = EINVAL;
1079 		goto done;
1080 	}
1081 	sp = &mp->mnt_stat;
1082 	if ((error = VFS_STATFS(mp, sp, fp->f_cred)) != 0)
1083 		goto done;
1084 
1085 	if ((error = mount_path(p, mp, &fullpath, &freepath)) != 0)
1086 		goto done;
1087 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1088 	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1089 	kfree(freepath, M_TEMP);
1090 
1091 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1092 	bcopy(sp, buf, sizeof(*buf));
1093 
1094 	/* Only root should have access to the fsid's. */
1095 	if (suser(td))
1096 		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
1097 	error = 0;
1098 done:
1099 	fdrop(fp);
1100 	return (error);
1101 }
1102 
1103 /*
1104  * fstatfs_args(int fd, struct statfs *buf)
1105  *
1106  * Get filesystem statistics.
1107  */
1108 int
1109 sys_fstatfs(struct fstatfs_args *uap)
1110 {
1111 	struct statfs buf;
1112 	int error;
1113 
1114 	error = kern_fstatfs(uap->fd, &buf);
1115 
1116 	if (error == 0)
1117 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1118 	return (error);
1119 }
1120 
1121 /*
1122  * getfsstat_args(struct statfs *buf, long bufsize, int flags)
1123  *
1124  * Get statistics on all filesystems.
1125  */
1126 
1127 struct getfsstat_info {
1128 	struct statfs *sfsp;
1129 	long count;
1130 	long maxcount;
1131 	int error;
1132 	int flags;
1133 	struct proc *p;
1134 };
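
/*
 * Note: the scan keeps incrementing info.count even after the user
 * buffer (if any) is full; sys_getfsstat() caps the returned count at
 * maxcount when a buffer was supplied.
 */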
1135 
1136 static int getfsstat_callback(struct mount *, void *);
1137 
1138 /* ARGSUSED */
1139 int
1140 sys_getfsstat(struct getfsstat_args *uap)
1141 {
1142 	struct thread *td = curthread;
1143 	struct proc *p = td->td_proc;
1144 	struct getfsstat_info info;
1145 
1146 	bzero(&info, sizeof(info));
1147 
1148 	info.maxcount = uap->bufsize / sizeof(struct statfs);
1149 	info.sfsp = uap->buf;
1150 	info.count = 0;
1151 	info.flags = uap->flags;
1152 	info.p = p;
1153 
1154 	mountlist_scan(getfsstat_callback, &info, MNTSCAN_FORWARD);
1155 	if (info.sfsp && info.count > info.maxcount)
1156 		uap->sysmsg_result = info.maxcount;
1157 	else
1158 		uap->sysmsg_result = info.count;
1159 	return (info.error);
1160 }
1161 
1162 static int
1163 getfsstat_callback(struct mount *mp, void *data)
1164 {
1165 	struct getfsstat_info *info = data;
1166 	struct statfs *sp;
1167 	char *freepath;
1168 	char *fullpath;
1169 	int error;
1170 
1171 	if (info->sfsp && info->count < info->maxcount) {
1172 		if (info->p && !chroot_visible_mnt(mp, info->p))
1173 			return(0);
1174 		sp = &mp->mnt_stat;
1175 
1176 		/*
1177 		 * If MNT_NOWAIT or MNT_LAZY is specified, use the cached
1178 		 * statistics rather than calling VFS_STATFS(), unless
1179 		 * MNT_WAIT is also set (MNT_WAIT takes precedence below).
1180 		 */
1181 		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
1182 		    (info->flags & MNT_WAIT)) &&
1183 		    (error = VFS_STATFS(mp, sp, info->p->p_ucred))) {
1184 			return(0);
1185 		}
1186 		sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1187 
1188 		error = mount_path(info->p, mp, &fullpath, &freepath);
1189 		if (error) {
1190 			info->error = error;
1191 			return(-1);
1192 		}
1193 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1194 		strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1195 		kfree(freepath, M_TEMP);
1196 
1197 		error = copyout(sp, info->sfsp, sizeof(*sp));
1198 		if (error) {
1199 			info->error = error;
1200 			return (-1);
1201 		}
1202 		++info->sfsp;
1203 	}
1204 	info->count++;
1205 	return(0);
1206 }
1207 
1208 /*
1209  * fchdir_args(int fd)
1210  *
1211  * Change current working directory to a given file descriptor.
1212  */
1213 /* ARGSUSED */
1214 int
1215 sys_fchdir(struct fchdir_args *uap)
1216 {
1217 	struct thread *td = curthread;
1218 	struct proc *p = td->td_proc;
1219 	struct filedesc *fdp = p->p_fd;
1220 	struct vnode *vp, *ovp;
1221 	struct mount *mp;
1222 	struct file *fp;
1223 	struct nchandle nch, onch, tnch;
1224 	int error;
1225 
1226 	if ((error = holdvnode(fdp, uap->fd, &fp)) != 0)
1227 		return (error);
1228 	vp = (struct vnode *)fp->f_data;
1229 	vref(vp);
1230 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1231 	if (vp->v_type != VDIR || fp->f_nchandle.ncp == NULL)
1232 		error = ENOTDIR;
1233 	else
1234 		error = VOP_ACCESS(vp, VEXEC, p->p_ucred);
1235 	if (error) {
1236 		vput(vp);
1237 		fdrop(fp);
1238 		return (error);
1239 	}
1240 	cache_copy(&fp->f_nchandle, &nch);
1241 
1242 	/*
1243 	 * If the ncp has become a mount point, traverse through
1244 	 * the mount point.
1245 	 */
1246 
1247 	while (!error && (nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
1248 	       (mp = cache_findmount(&nch)) != NULL
1249 	) {
1250 		error = nlookup_mp(mp, &tnch);
1251 		if (error == 0) {
1252 			cache_unlock(&tnch);	/* leave ref intact */
1253 			vput(vp);
1254 			vp = tnch.ncp->nc_vp;
1255 			error = vget(vp, LK_SHARED);
1256 			KKASSERT(error == 0);
1257 			cache_drop(&nch);
1258 			nch = tnch;
1259 		}
1260 	}
1261 	if (error == 0) {
1262 		ovp = fdp->fd_cdir;
1263 		onch = fdp->fd_ncdir;
1264 		vn_unlock(vp);		/* leave ref intact */
1265 		fdp->fd_cdir = vp;
1266 		fdp->fd_ncdir = nch;
1267 		cache_drop(&onch);
1268 		vrele(ovp);
1269 	} else {
1270 		cache_drop(&nch);
1271 		vput(vp);
1272 	}
1273 	fdrop(fp);
1274 	return (error);
1275 }
1276 
1277 int
1278 kern_chdir(struct nlookupdata *nd)
1279 {
1280 	struct thread *td = curthread;
1281 	struct proc *p = td->td_proc;
1282 	struct filedesc *fdp = p->p_fd;
1283 	struct vnode *vp, *ovp;
1284 	struct nchandle onch;
1285 	int error;
1286 
1287 	if ((error = nlookup(nd)) != 0)
1288 		return (error);
1289 	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
1290 		return (ENOENT);
1291 	if ((error = vget(vp, LK_SHARED)) != 0)
1292 		return (error);
1293 
1294 	error = checkvp_chdir(vp, td);
1295 	vn_unlock(vp);
1296 	if (error == 0) {
1297 		ovp = fdp->fd_cdir;
1298 		onch = fdp->fd_ncdir;
1299 		cache_unlock(&nd->nl_nch);	/* leave reference intact */
1300 		fdp->fd_ncdir = nd->nl_nch;
1301 		fdp->fd_cdir = vp;
1302 		cache_drop(&onch);
1303 		vrele(ovp);
1304 		cache_zero(&nd->nl_nch);
1305 	} else {
1306 		vrele(vp);
1307 	}
1308 	return (error);
1309 }
1310 
1311 /*
1312  * chdir_args(char *path)
1313  *
1314  * Change current working directory (``.'').
1315  */
1316 int
1317 sys_chdir(struct chdir_args *uap)
1318 {
1319 	struct nlookupdata nd;
1320 	int error;
1321 
1322 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1323 	if (error == 0)
1324 		error = kern_chdir(&nd);
1325 	nlookup_done(&nd);
1326 	return (error);
1327 }
1328 
1329 /*
1330  * Helper function for the stricter chroot(2) security check: refuse the
1331  * operation if any file descriptors are open directories.
1332  */
1333 static int
1334 chroot_refuse_vdir_fds(struct filedesc *fdp)
1335 {
1336 	struct vnode *vp;
1337 	struct file *fp;
1338 	int error;
1339 	int fd;
1340 
1341 	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
1342 		if ((error = holdvnode(fdp, fd, &fp)) != 0)
1343 			continue;
1344 		vp = (struct vnode *)fp->f_data;
1345 		if (vp->v_type != VDIR) {
1346 			fdrop(fp);
1347 			continue;
1348 		}
1349 		fdrop(fp);
1350 		return(EPERM);
1351 	}
1352 	return (0);
1353 }
1354 
1355 /*
1356  * This sysctl determines if we will allow a process to chroot(2) if it
1357  * has a directory open:
1358  *	0: disallowed for all processes.
1359  *	1: allowed for processes that were not already chroot(2)'ed.
1360  *	2: allowed for all processes.
1361  */
1362 
1363 static int chroot_allow_open_directories = 1;
1364 
1365 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
1366      &chroot_allow_open_directories, 0, "");
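
/*
 * Illustrative: "sysctl kern.chroot_allow_open_directories=0" selects
 * the strictest of the behaviors described above.
 */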
1367 
1368 /*
1369  * chroot to the specified namecache entry.  We obtain the vp from the
1370  * namecache data.  The passed ncp must be locked and referenced and will
1371  * remain locked and referenced on return.
1372  */
1373 int
1374 kern_chroot(struct nchandle *nch)
1375 {
1376 	struct thread *td = curthread;
1377 	struct proc *p = td->td_proc;
1378 	struct filedesc *fdp = p->p_fd;
1379 	struct vnode *vp;
1380 	int error;
1381 
1382 	/*
1383 	 * Only root can chroot
1384 	 */
1385 	if ((error = suser_cred(p->p_ucred, PRISON_ROOT)) != 0)
1386 		return (error);
1387 
1388 	/*
1389 	 * Disallow open directory descriptors (fchdir() breakouts).
1390 	 */
1391 	if (chroot_allow_open_directories == 0 ||
1392 	   (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
1393 		if ((error = chroot_refuse_vdir_fds(fdp)) != 0)
1394 			return (error);
1395 	}
1396 	if ((vp = nch->ncp->nc_vp) == NULL)
1397 		return (ENOENT);
1398 
1399 	if ((error = vget(vp, LK_SHARED)) != 0)
1400 		return (error);
1401 
1402 	/*
1403 	 * Check the validity of vp as a directory to change to and
1404 	 * associate it with rdir/jdir.
1405 	 */
1406 	error = checkvp_chdir(vp, td);
1407 	vn_unlock(vp);			/* leave reference intact */
1408 	if (error == 0) {
1409 		vrele(fdp->fd_rdir);
1410 		fdp->fd_rdir = vp;	/* reference inherited by fd_rdir */
1411 		cache_drop(&fdp->fd_nrdir);
1412 		cache_copy(nch, &fdp->fd_nrdir);
1413 		if (fdp->fd_jdir == NULL) {
1414 			fdp->fd_jdir = vp;
1415 			vref(fdp->fd_jdir);
1416 			cache_copy(nch, &fdp->fd_njdir);
1417 		}
1418 	} else {
1419 		vrele(vp);
1420 	}
1421 	return (error);
1422 }
1423 
1424 /*
1425  * chroot_args(char *path)
1426  *
1427  * Change notion of root (``/'') directory.
1428  */
1429 /* ARGSUSED */
1430 int
1431 sys_chroot(struct chroot_args *uap)
1432 {
1433 	struct thread *td = curthread;
1434 	struct nlookupdata nd;
1435 	int error;
1436 
1437 	KKASSERT(td->td_proc);
1438 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1439 	if (error) {
1440 		nlookup_done(&nd);
1441 		return(error);
1442 	}
1443 	error = nlookup(&nd);
1444 	if (error == 0)
1445 		error = kern_chroot(&nd.nl_nch);
1446 	nlookup_done(&nd);
1447 	return(error);
1448 }
1449 
1450 /*
1451  * Common routine for chroot and chdir.  Given a locked, referenced vnode,
1452  * determine whether it is legal to chdir to the vnode.  The vnode's state
1453  * is not changed by this call.
1454  */
1455 int
1456 checkvp_chdir(struct vnode *vp, struct thread *td)
1457 {
1458 	int error;
1459 
1460 	if (vp->v_type != VDIR)
1461 		error = ENOTDIR;
1462 	else
1463 		error = VOP_ACCESS(vp, VEXEC, td->td_proc->p_ucred);
1464 	return (error);
1465 }
1466 
1467 int
1468 kern_open(struct nlookupdata *nd, int oflags, int mode, int *res)
1469 {
1470 	struct thread *td = curthread;
1471 	struct proc *p = td->td_proc;
1472 	struct lwp *lp = td->td_lwp;
1473 	struct filedesc *fdp = p->p_fd;
1474 	int cmode, flags;
1475 	struct file *nfp;
1476 	struct file *fp;
1477 	struct vnode *vp;
1478 	int type, indx, error;
1479 	struct flock lf;
1480 
1481 	if ((oflags & O_ACCMODE) == O_ACCMODE)
1482 		return (EINVAL);
1483 	flags = FFLAGS(oflags);
1484 	error = falloc(p, &nfp, NULL);
1485 	if (error)
1486 		return (error);
1487 	fp = nfp;
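	/*
	 * The creation mode is the requested mode masked by the process
	 * umask; the sticky bit is never set on a newly created file here.
	 */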
1488 	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
1489 
1490 	/*
1491 	 * XXX p_dupfd is a real mess.  It allows a device to return a
1492 	 * file descriptor to be duplicated rather than doing the open
1493 	 * itself.
1494 	 */
1495 	lp->lwp_dupfd = -1;
1496 
1497 	/*
1498 	 * Call vn_open() to do the lookup and assign the vnode to the
1499 	 * file pointer.  vn_open() does not change the ref count on fp,
1500 	 * and the vnode, on success, will be inherited by the file pointer
1501 	 * and unlocked.
1502 	 */
1503 	nd->nl_flags |= NLC_LOCKVP;
1504 	error = vn_open(nd, fp, flags, cmode);
1505 	nlookup_done(nd);
1506 	if (error) {
1507 		/*
1508 		 * handle special fdopen() case.  bleh.  dupfdopen() is
1509 		 * responsible for dropping the old contents of ofiles[indx]
1510 		 * if it succeeds.
1511 		 *
1512 		 * Note that fsetfd() will add a ref to fp which represents
1513 		 * the fd_files[] assignment.  We must still drop our
1514 		 * reference.
1515 		 */
1516 		if ((error == ENODEV || error == ENXIO) && lp->lwp_dupfd >= 0) {
1517 			if (fdalloc(p, 0, &indx) == 0) {
1518 				error = dupfdopen(p, indx, lp->lwp_dupfd, flags, error);
1519 				if (error == 0) {
1520 					*res = indx;
1521 					fdrop(fp);	/* our ref */
1522 					return (0);
1523 				}
1524 				fsetfd(p, NULL, indx);
1525 			}
1526 		}
1527 		fdrop(fp);	/* our ref */
1528 		if (error == ERESTART)
1529 			error = EINTR;
1530 		return (error);
1531 	}
1532 
1533 	/*
1534 	 * ref the vnode for ourselves so it can't be ripped out from under
1535 	 * is.  XXX need an ND flag to request that the vnode be returned
1536 	 * us.  XXX need an ND flag to request that the vnode be returned
1537 	 *
1538 	 * Reserve a file descriptor but do not assign it until the open
1539 	 * succeeds.
1540 	 */
1541 	vp = (struct vnode *)fp->f_data;
1542 	vref(vp);
1543 	if ((error = fdalloc(p, 0, &indx)) != 0) {
1544 		fdrop(fp);
1545 		vrele(vp);
1546 		return (error);
1547 	}
1548 
1549 	/*
1550 	 * If no error occurs the vp will have been assigned to the file
1551 	 * pointer.
1552 	 */
1553 	lp->lwp_dupfd = 0;
1554 
1555 	if (flags & (O_EXLOCK | O_SHLOCK)) {
1556 		lf.l_whence = SEEK_SET;
1557 		lf.l_start = 0;
1558 		lf.l_len = 0;
1559 		if (flags & O_EXLOCK)
1560 			lf.l_type = F_WRLCK;
1561 		else
1562 			lf.l_type = F_RDLCK;
1563 		if (flags & FNONBLOCK)
1564 			type = 0;
1565 		else
1566 			type = F_WAIT;
1567 
1568 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) {
1569 			/*
1570 			 * lock request failed.  Clean up the reserved
1571 			 * descriptor.
1572 			 */
1573 			vrele(vp);
1574 			fsetfd(p, NULL, indx);
1575 			fdrop(fp);
1576 			return (error);
1577 		}
1578 		fp->f_flag |= FHASLOCK;
1579 	}
1580 #if 0
1581 	/*
1582 	 * Assert that all regular file vnodes were created with an object.
1583 	 */
1584 	KASSERT(vp->v_type != VREG || vp->v_object != NULL,
1585 		("open: regular file has no backing object after vn_open"));
1586 #endif
1587 
1588 	vrele(vp);
1589 
1590 	/*
1591 	 * release our private reference, leaving the one associated with the
1592 	 * descriptor table intact.
1593 	 */
1594 	fsetfd(p, fp, indx);
1595 	fdrop(fp);
1596 	*res = indx;
1597 	return (0);
1598 }
1599 
1600 /*
1601  * open_args(char *path, int flags, int mode)
1602  *
1603  * Check permissions, allocate an open file structure,
1604  * and call the device open routine if any.
1605  */
1606 int
1607 sys_open(struct open_args *uap)
1608 {
1609 	struct nlookupdata nd;
1610 	int error;
1611 
1612 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1613 	if (error == 0) {
1614 		error = kern_open(&nd, uap->flags,
1615 				    uap->mode, &uap->sysmsg_result);
1616 	}
1617 	nlookup_done(&nd);
1618 	return (error);
1619 }
1620 
1621 int
1622 kern_mknod(struct nlookupdata *nd, int mode, int rmajor, int rminor)
1623 {
1624 	struct thread *td = curthread;
1625 	struct proc *p = td->td_proc;
1626 	struct vnode *vp;
1627 	struct vnode *dvp;
1628 	struct vattr vattr;
1629 	int error;
1630 	int whiteout = 0;
1631 
1632 	KKASSERT(p);
1633 
1634 	switch (mode & S_IFMT) {
1635 	case S_IFCHR:
1636 	case S_IFBLK:
1637 		error = suser(td);
1638 		break;
1639 	default:
1640 		error = suser_cred(p->p_ucred, PRISON_ROOT);
1641 		break;
1642 	}
1643 	if (error)
1644 		return (error);
1645 
1646 	bwillwrite();
1647 	nd->nl_flags |= NLC_CREATE;
1648 	if ((error = nlookup(nd)) != 0)
1649 		return (error);
1650 	if (nd->nl_nch.ncp->nc_vp)
1651 		return (EEXIST);
1652 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
1653 		return (error);
1654 	if ((dvp = nd->nl_nch.ncp->nc_parent->nc_vp) == NULL)
1655 		return (EPERM);
1656 	/* vhold(dvp); - DVP can't go away */
1657 
1658 	VATTR_NULL(&vattr);
1659 	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
1660 	vattr.va_rmajor = rmajor;
1661 	vattr.va_rminor = rminor;
1662 	whiteout = 0;
1663 
1664 	switch (mode & S_IFMT) {
1665 	case S_IFMT:	/* used by badsect to flag bad sectors */
1666 		vattr.va_type = VBAD;
1667 		break;
1668 	case S_IFCHR:
1669 		vattr.va_type = VCHR;
1670 		break;
1671 	case S_IFBLK:
1672 		vattr.va_type = VBLK;
1673 		break;
1674 	case S_IFWHT:
1675 		whiteout = 1;
1676 		break;
1677 	default:
1678 		error = EINVAL;
1679 		break;
1680 	}
1681 	if (error == 0) {
1682 		if (whiteout) {
1683 			error = VOP_NWHITEOUT(&nd->nl_nch, dvp, nd->nl_cred, NAMEI_CREATE);
1684 		} else {
1685 			vp = NULL;
1686 			error = VOP_NMKNOD(&nd->nl_nch, dvp, &vp, nd->nl_cred, &vattr);
1687 			if (error == 0)
1688 				vput(vp);
1689 		}
1690 	}
1691 	/* vdrop(dvp); */
1692 	return (error);
1693 }
1694 
1695 /*
1696  * mknod_args(char *path, int mode, int dev)
1697  *
1698  * Create a special file.
1699  */
1700 int
1701 sys_mknod(struct mknod_args *uap)
1702 {
1703 	struct nlookupdata nd;
1704 	int error;
1705 
1706 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1707 	if (error == 0) {
1708 		error = kern_mknod(&nd, uap->mode,
1709 				   umajor(uap->dev), uminor(uap->dev));
1710 	}
1711 	nlookup_done(&nd);
1712 	return (error);
1713 }
1714 
1715 int
1716 kern_mkfifo(struct nlookupdata *nd, int mode)
1717 {
1718 	struct thread *td = curthread;
1719 	struct proc *p = td->td_proc;
1720 	struct vattr vattr;
1721 	struct vnode *vp;
1722 	struct vnode *dvp;
1723 	int error;
1724 
1725 	bwillwrite();
1726 
1727 	nd->nl_flags |= NLC_CREATE;
1728 	if ((error = nlookup(nd)) != 0)
1729 		return (error);
1730 	if (nd->nl_nch.ncp->nc_vp)
1731 		return (EEXIST);
1732 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
1733 		return (error);
1734 	if ((dvp = nd->nl_nch.ncp->nc_parent->nc_vp) == NULL)
1735 		return (EPERM);
1736 	/* vhold(dvp); - DVP can't go away */
1737 
1738 	VATTR_NULL(&vattr);
1739 	vattr.va_type = VFIFO;
1740 	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
1741 	vp = NULL;
1742 	error = VOP_NMKNOD(&nd->nl_nch, dvp, &vp, nd->nl_cred, &vattr);
1743 	/* vdrop(dvp); */
1744 	if (error == 0)
1745 		vput(vp);
1746 	return (error);
1747 }
1748 
1749 /*
1750  * mkfifo_args(char *path, int mode)
1751  *
1752  * Create a named pipe.
1753  */
1754 int
1755 sys_mkfifo(struct mkfifo_args *uap)
1756 {
1757 	struct nlookupdata nd;
1758 	int error;
1759 
1760 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1761 	if (error == 0)
1762 		error = kern_mkfifo(&nd, uap->mode);
1763 	nlookup_done(&nd);
1764 	return (error);
1765 }
1766 
1767 static int hardlink_check_uid = 0;
1768 SYSCTL_INT(_kern, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
1769     &hardlink_check_uid, 0,
1770     "Unprivileged processes cannot create hard links to files owned by other "
1771     "users");
1772 static int hardlink_check_gid = 0;
1773 SYSCTL_INT(_kern, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
1774     &hardlink_check_gid, 0,
1775     "Unprivileged processes cannot create hard links to files owned by other "
1776     "groups");
1777 
1778 static int
1779 can_hardlink(struct vnode *vp, struct thread *td, struct ucred *cred)
1780 {
1781 	struct vattr va;
1782 	int error;
1783 
1784 	/*
1785 	 * Shortcut if disabled
1786 	 */
1787 	if (hardlink_check_uid == 0 && hardlink_check_gid == 0)
1788 		return (0);
1789 
1790 	/*
1791 	 * root cred can always hardlink
1792 	 */
1793 	if (suser_cred(cred, PRISON_ROOT) == 0)
1794 		return (0);
1795 
1796 	/*
1797 	 * Otherwise the link is only allowed if the originating file is
1798 	 * owned by the same user or group.  Note that any group is
1799 	 * allowed if the file is owned by the caller.
1800 	 */
1801 	error = VOP_GETATTR(vp, &va);
1802 	if (error != 0)
1803 		return (error);
1804 
1805 	if (hardlink_check_uid) {
1806 		if (cred->cr_uid != va.va_uid)
1807 			return (EPERM);
1808 	}
1809 
1810 	if (hardlink_check_gid) {
1811 		if (cred->cr_uid != va.va_uid && !groupmember(va.va_gid, cred))
1812 			return (EPERM);
1813 	}
1814 
1815 	return (0);
1816 }
1817 
1818 int
1819 kern_link(struct nlookupdata *nd, struct nlookupdata *linknd)
1820 {
1821 	struct thread *td = curthread;
1822 	struct vnode *vp;
1823 	struct vnode *dvp;
1824 	int error;
1825 
1826 	/*
1827 	 * Lookup the source and obtain a locked vnode.
1828 	 *
1829 	 * XXX relookup on vget failure / race ?
1830 	 */
1831 	bwillwrite();
1832 	if ((error = nlookup(nd)) != 0)
1833 		return (error);
1834 	vp = nd->nl_nch.ncp->nc_vp;
1835 	KKASSERT(vp != NULL);
1836 	if (vp->v_type == VDIR)
1837 		return (EPERM);		/* POSIX */
1838 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
1839 		return (error);
1840 	if ((error = vget(vp, LK_EXCLUSIVE)) != 0)
1841 		return (error);
1842 
1843 	/*
1844 	 * Unlock the source so we can lookup the target without deadlocking
1845 	 * (XXX vp is locked already, possible other deadlock?).  The target
1846 	 * must not exist.
1847 	 */
1848 	KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);
1849 	nd->nl_flags &= ~NLC_NCPISLOCKED;
1850 	cache_unlock(&nd->nl_nch);
1851 
1852 	linknd->nl_flags |= NLC_CREATE;
1853 	if ((error = nlookup(linknd)) != 0) {
1854 		vput(vp);
1855 		return (error);
1856 	}
1857 	if (linknd->nl_nch.ncp->nc_vp) {
1858 		vput(vp);
1859 		return (EEXIST);
1860 	}
1861 	if ((dvp = linknd->nl_nch.ncp->nc_parent->nc_vp) == NULL) {
1862 		vput(vp);
1863 		return (EPERM);
1864 	}
1865 	/* vhold(dvp); - dvp can't go away */
1866 
1867 	/*
1868 	 * Finally run the new API VOP.
1869 	 */
1870 	error = can_hardlink(vp, td, td->td_proc->p_ucred);
1871 	if (error == 0)
1872 		error = VOP_NLINK(&linknd->nl_nch, dvp, vp, linknd->nl_cred);
1873 	/* vdrop(dvp); */
1874 	vput(vp);
1875 	return (error);
1876 }
1877 
1878 /*
1879  * link_args(char *path, char *link)
1880  *
1881  * Make a hard file link.
1882  */
1883 int
1884 sys_link(struct link_args *uap)
1885 {
1886 	struct nlookupdata nd, linknd;
1887 	int error;
1888 
1889 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1890 	if (error == 0) {
1891 		error = nlookup_init(&linknd, uap->link, UIO_USERSPACE, 0);
1892 		if (error == 0)
1893 			error = kern_link(&nd, &linknd);
1894 		nlookup_done(&linknd);
1895 	}
1896 	nlookup_done(&nd);
1897 	return (error);
1898 }
1899 
1900 int
1901 kern_symlink(struct nlookupdata *nd, char *path, int mode)
1902 {
1903 	struct vattr vattr;
1904 	struct vnode *vp;
1905 	struct vnode *dvp;
1906 	int error;
1907 
1908 	bwillwrite();
1909 	nd->nl_flags |= NLC_CREATE;
1910 	if ((error = nlookup(nd)) != 0)
1911 		return (error);
1912 	if (nd->nl_nch.ncp->nc_vp)
1913 		return (EEXIST);
1914 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
1915 		return (error);
1916 	if ((dvp = nd->nl_nch.ncp->nc_parent->nc_vp) == NULL)
1917 		return (EPERM);
1918 	/* vhold(dvp); - dvp can't go away */
1919 	VATTR_NULL(&vattr);
1920 	vattr.va_mode = mode;
1921 	error = VOP_NSYMLINK(&nd->nl_nch, dvp, &vp, nd->nl_cred, &vattr, path);
1922 	/* vdrop(dvp); */
1923 	if (error == 0)
1924 		vput(vp);
1925 	return (error);
1926 }
1927 
1928 /*
1929  * symlink(char *path, char *link)
1930  *
1931  * Make a symbolic link.
1932  */
1933 int
1934 sys_symlink(struct symlink_args *uap)
1935 {
1936 	struct thread *td = curthread;
1937 	struct nlookupdata nd;
1938 	char *path;
1939 	int error;
1940 	int mode;
1941 
1942 	path = objcache_get(namei_oc, M_WAITOK);
1943 	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
1944 	if (error == 0) {
1945 		error = nlookup_init(&nd, uap->link, UIO_USERSPACE, 0);
1946 		if (error == 0) {
1947 			mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
1948 			error = kern_symlink(&nd, path, mode);
1949 		}
1950 		nlookup_done(&nd);
1951 	}
1952 	objcache_put(namei_oc, path);
1953 	return (error);
1954 }
1955 
1956 /*
1957  * undelete_args(char *path)
1958  *
1959  * Delete a whiteout from the filesystem.
1960  */
1961 /* ARGSUSED */
1962 int
1963 sys_undelete(struct undelete_args *uap)
1964 {
1965 	struct nlookupdata nd;
1966 	struct vnode *dvp;
1967 	int error;
1968 
1969 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1970 	bwillwrite();
1971 	nd.nl_flags |= NLC_DELETE;
1972 	if (error == 0)
1973 		error = nlookup(&nd);
1974 	if (error == 0)
1975 		error = ncp_writechk(&nd.nl_nch);
1976 	dvp = NULL;
1977 	if (error == 0) {
1978 		if ((dvp = nd.nl_nch.ncp->nc_parent->nc_vp) == NULL)
1979 			error = EPERM;
1980 	}
1981 	if (error == 0) {
1982 		/* vhold(dvp); - dvp can't go away */
1983 		error = VOP_NWHITEOUT(&nd.nl_nch, dvp, nd.nl_cred, NAMEI_DELETE);
1984 		/* vdrop(dvp); */
1985 	}
1986 	nlookup_done(&nd);
1987 	return (error);
1988 }
1989 
1990 int
1991 kern_unlink(struct nlookupdata *nd)
1992 {
1993 	struct vnode *dvp;
1994 	int error;
1995 
1996 	bwillwrite();
1997 	nd->nl_flags |= NLC_DELETE;
1998 	if ((error = nlookup(nd)) != 0)
1999 		return (error);
2000 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2001 		return (error);
2002 	if ((dvp = nd->nl_nch.ncp->nc_parent->nc_vp) == NULL)
2003 		return (EPERM);
2004 	/* vhold(dvp); - dvp can't go away */
2005 	error = VOP_NREMOVE(&nd->nl_nch, dvp, nd->nl_cred);
2006 	/* vdrop(dvp); */
2007 	return (error);
2008 }
2009 
2010 /*
2011  * unlink_args(char *path)
2012  *
2013  * Delete a name from the filesystem.
2014  */
2015 int
2016 sys_unlink(struct unlink_args *uap)
2017 {
2018 	struct nlookupdata nd;
2019 	int error;
2020 
2021 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2022 	if (error == 0)
2023 		error = kern_unlink(&nd);
2024 	nlookup_done(&nd);
2025 	return (error);
2026 }
2027 
2028 int
2029 kern_lseek(int fd, off_t offset, int whence, off_t *res)
2030 {
2031 	struct thread *td = curthread;
2032 	struct proc *p = td->td_proc;
2033 	struct file *fp;
2034 	struct vattr vattr;
2035 	int error;
2036 
2037 	fp = holdfp(p->p_fd, fd, -1);
2038 	if (fp == NULL)
2039 		return (EBADF);
2040 	if (fp->f_type != DTYPE_VNODE) {
2041 		error = ESPIPE;
2042 		goto done;
2043 	}
2044 
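	/*
	 * L_SET, L_INCR and L_XTND are the traditional spellings of
	 * SEEK_SET, SEEK_CUR and SEEK_END.
	 */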
2045 	switch (whence) {
2046 	case L_INCR:
2047 		fp->f_offset += offset;
2048 		error = 0;
2049 		break;
2050 	case L_XTND:
2051 		error = VOP_GETATTR((struct vnode *)fp->f_data, &vattr);
2052 		if (error == 0)
2053 			fp->f_offset = offset + vattr.va_size;
2054 		break;
2055 	case L_SET:
2056 		fp->f_offset = offset;
2057 		error = 0;
2058 		break;
2059 	default:
2060 		error = EINVAL;
2061 		break;
2062 	}
2063 	*res = fp->f_offset;
2064 done:
2065 	fdrop(fp);
2066 	return (error);
2067 }
2068 
2069 /*
2070  * lseek_args(int fd, int pad, off_t offset, int whence)
2071  *
2072  * Reposition read/write file offset.
2073  */
2074 int
2075 sys_lseek(struct lseek_args *uap)
2076 {
2077 	int error;
2078 
2079 	error = kern_lseek(uap->fd, uap->offset, uap->whence,
2080 	    &uap->sysmsg_offset);
2081 
2082 	return (error);
2083 }
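
/*
 * Illustrative userland sketch (guarded by #if 0): the common use of the
 * L_XTND/SEEK_END case above, obtaining a file's size by seeking to an
 * offset of 0 from the end.  The file name is hypothetical.
 */
#if 0
#include <sys/types.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	off_t size;
	int fd;

	if ((fd = open("file", O_RDONLY)) == -1) {
		perror("open");
		return (1);
	}
	/* SEEK_END corresponds to L_XTND: the offset is added to va_size. */
	size = lseek(fd, 0, SEEK_END);
	printf("size: %jd\n", (intmax_t)size);
	close(fd);
	return (0);
}
#endif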
2084 
2085 int
2086 kern_access(struct nlookupdata *nd, int aflags)
2087 {
2088 	struct vnode *vp;
2089 	int error, flags;
2090 
2091 	if ((error = nlookup(nd)) != 0)
2092 		return (error);
2093 retry:
2094 	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_EXCLUSIVE, &vp);
2095 	if (error)
2096 		return (error);
2097 
2098 	/* Flags == 0 means only check for existence. */
2099 	if (aflags) {
2100 		flags = 0;
2101 		if (aflags & R_OK)
2102 			flags |= VREAD;
2103 		if (aflags & W_OK)
2104 			flags |= VWRITE;
2105 		if (aflags & X_OK)
2106 			flags |= VEXEC;
2107 		if ((flags & VWRITE) == 0 ||
2108 		    (error = vn_writechk(vp, &nd->nl_nch)) == 0)
2109 			error = VOP_ACCESS(vp, flags, nd->nl_cred);
2110 
2111 		/*
2112 		 * If the file handle is stale we have to re-resolve the
2113 		 * entry.  This is a hack at the moment.
2114 		 */
2115 		if (error == ESTALE) {
2116 			vput(vp);
2117 			cache_setunresolved(&nd->nl_nch);
2118 			error = cache_resolve(&nd->nl_nch, nd->nl_cred);
2119 			if (error == 0) {
2120 				vp = NULL;
2121 				goto retry;
2122 			}
2123 			return(error);
2124 		}
2125 	}
2126 	vput(vp);
2127 	return (error);
2128 }
2129 
2130 /*
2131  * access_args(char *path, int flags)
2132  *
2133  * Check access permissions.
2134  */
2135 int
2136 sys_access(struct access_args *uap)
2137 {
2138 	struct nlookupdata nd;
2139 	int error;
2140 
2141 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2142 	if (error == 0)
2143 		error = kern_access(&nd, uap->flags);
2144 	nlookup_done(&nd);
2145 	return (error);
2146 }
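
/*
 * Illustrative userland sketch (guarded by #if 0): access(2) flags map
 * onto the VREAD/VWRITE/VEXEC checks in kern_access() above; a flags
 * value of 0 (F_OK) only tests for existence.
 */
#if 0
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	if (argc < 2)
		return (1);
	/* R_OK | W_OK becomes VREAD | VWRITE in kern_access(). */
	if (access(argv[1], R_OK | W_OK) == 0)
		printf("%s is readable and writable\n", argv[1]);
	else
		perror("access");
	return (0);
}
#endif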
2147 
2148 int
2149 kern_stat(struct nlookupdata *nd, struct stat *st)
2150 {
2151 	int error;
2152 	struct vnode *vp;
2153 	thread_t td;
2154 
2155 	if ((error = nlookup(nd)) != 0)
2156 		return (error);
2157 again:
2158 	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
2159 		return (ENOENT);
2160 
2161 	td = curthread;
2162 	if ((error = vget(vp, LK_SHARED)) != 0)
2163 		return (error);
2164 	error = vn_stat(vp, st, nd->nl_cred);
2165 
2166 	/*
2167 	 * If the file handle is stale we have to re-resolve the entry.  This
2168 	 * is a hack at the moment.
2169 	 */
2170 	if (error == ESTALE) {
2171 		vput(vp);
2172 		cache_setunresolved(&nd->nl_nch);
2173 		error = cache_resolve(&nd->nl_nch, nd->nl_cred);
2174 		if (error == 0)
2175 			goto again;
2176 	} else {
2177 		vput(vp);
2178 	}
2179 	return (error);
2180 }
2181 
2182 /*
2183  * stat_args(char *path, struct stat *ub)
2184  *
2185  * Get file status; this version follows links.
2186  */
2187 int
2188 sys_stat(struct stat_args *uap)
2189 {
2190 	struct nlookupdata nd;
2191 	struct stat st;
2192 	int error;
2193 
2194 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2195 	if (error == 0) {
2196 		error = kern_stat(&nd, &st);
2197 		if (error == 0)
2198 			error = copyout(&st, uap->ub, sizeof(*uap->ub));
2199 	}
2200 	nlookup_done(&nd);
2201 	return (error);
2202 }
2203 
2204 /*
2205  * lstat_args(char *path, struct stat *ub)
2206  *
2207  * Get file status; this version does not follow links.
2208  */
2209 int
2210 sys_lstat(struct lstat_args *uap)
2211 {
2212 	struct nlookupdata nd;
2213 	struct stat st;
2214 	int error;
2215 
2216 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2217 	if (error == 0) {
2218 		error = kern_stat(&nd, &st);
2219 		if (error == 0)
2220 			error = copyout(&st, uap->ub, sizeof(*uap->ub));
2221 	}
2222 	nlookup_done(&nd);
2223 	return (error);
2224 }
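
/*
 * Illustrative userland sketch (guarded by #if 0): the only difference
 * between the two syscalls above is the NLC_FOLLOW flag, so lstat(2) can
 * report a trailing symlink itself while stat(2) reports its target.
 */
#if 0
#include <sys/stat.h>
#include <stdio.h>

int
main(int argc, char **argv)
{
	struct stat st;

	if (argc < 2)
		return (1);
	/* lstat() does not follow a trailing symlink, so S_ISLNK() can hit. */
	if (lstat(argv[1], &st) == -1) {
		perror("lstat");
		return (1);
	}
	printf("%s %s a symlink\n", argv[1],
	    S_ISLNK(st.st_mode) ? "is" : "is not");
	return (0);
}
#endif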
2225 
2226 /*
2227  * pathconf_args(char *path, int name)
2228  *
2229  * Get configurable pathname variables.
2230  */
2231 /* ARGSUSED */
2232 int
2233 sys_pathconf(struct pathconf_args *uap)
2234 {
2235 	struct nlookupdata nd;
2236 	struct vnode *vp;
2237 	int error;
2238 
2239 	vp = NULL;
2240 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2241 	if (error == 0)
2242 		error = nlookup(&nd);
2243 	if (error == 0)
2244 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
2245 	nlookup_done(&nd);
2246 	if (error == 0) {
2247 		error = VOP_PATHCONF(vp, uap->name, uap->sysmsg_fds);
2248 		vput(vp);
2249 	}
2250 	return (error);
2251 }
2252 
2253 /*
2254  * XXX: daver
2255  * kern_readlink isn't properly split yet.  There is a copyin buried
2256  * in VOP_READLINK().
2257  */
2258 int
2259 kern_readlink(struct nlookupdata *nd, char *buf, int count, int *res)
2260 {
2261 	struct thread *td = curthread;
2262 	struct proc *p = td->td_proc;
2263 	struct vnode *vp;
2264 	struct iovec aiov;
2265 	struct uio auio;
2266 	int error;
2267 
2268 	if ((error = nlookup(nd)) != 0)
2269 		return (error);
2270 	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_EXCLUSIVE, &vp);
2271 	if (error)
2272 		return (error);
2273 	if (vp->v_type != VLNK) {
2274 		error = EINVAL;
2275 	} else {
2276 		aiov.iov_base = buf;
2277 		aiov.iov_len = count;
2278 		auio.uio_iov = &aiov;
2279 		auio.uio_iovcnt = 1;
2280 		auio.uio_offset = 0;
2281 		auio.uio_rw = UIO_READ;
2282 		auio.uio_segflg = UIO_USERSPACE;
2283 		auio.uio_td = td;
2284 		auio.uio_resid = count;
2285 		error = VOP_READLINK(vp, &auio, p->p_ucred);
2286 	}
2287 	vput(vp);
2288 	*res = count - auio.uio_resid;
2289 	return (error);
2290 }
2291 
2292 /*
2293  * readlink_args(char *path, char *buf, int count)
2294  *
2295  * Return target name of a symbolic link.
2296  */
2297 int
2298 sys_readlink(struct readlink_args *uap)
2299 {
2300 	struct nlookupdata nd;
2301 	int error;
2302 
2303 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2304 	if (error == 0) {
2305 		error = kern_readlink(&nd, uap->buf, uap->count,
2306 					&uap->sysmsg_result);
2307 	}
2308 	nlookup_done(&nd);
2309 	return (error);
2310 }
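
/*
 * Illustrative userland sketch (guarded by #if 0): readlink(2) returns the
 * number of bytes placed in the buffer and does not NUL-terminate it,
 * matching the "count - auio.uio_resid" result computed above.
 */
#if 0
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	char buf[1024];
	ssize_t len;

	if (argc < 2)
		return (1);
	len = readlink(argv[1], buf, sizeof(buf) - 1);
	if (len == -1) {
		perror("readlink");
		return (1);
	}
	buf[len] = '\0';	/* the kernel does not terminate the string */
	printf("%s -> %s\n", argv[1], buf);
	return (0);
}
#endif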
2311 
2312 static int
2313 setfflags(struct vnode *vp, int flags)
2314 {
2315 	struct thread *td = curthread;
2316 	struct proc *p = td->td_proc;
2317 	int error;
2318 	struct vattr vattr;
2319 
2320 	/*
2321 	 * Prevent non-root users from setting flags on devices.  When
2322 	 * a device is reused, users can retain ownership of the device
2323 	 * if they are allowed to set flags and programs assume that
2324 	 * chown can't fail when done as root.
2325 	 */
2326 	if ((vp->v_type == VCHR || vp->v_type == VBLK) &&
2327 	    ((error = suser_cred(p->p_ucred, PRISON_ROOT)) != 0))
2328 		return (error);
2329 
2330 	/*
2331 	 * note: vget is required for any operation that might mod the vnode
2332 	 * so VINACTIVE is properly cleared.
2333 	 */
2334 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
2335 		VATTR_NULL(&vattr);
2336 		vattr.va_flags = flags;
2337 		error = VOP_SETATTR(vp, &vattr, p->p_ucred);
2338 		vput(vp);
2339 	}
2340 	return (error);
2341 }
2342 
2343 /*
2344  * chflags(char *path, int flags)
2345  *
2346  * Change flags of a file given a path name.
2347  */
2348 /* ARGSUSED */
2349 int
2350 sys_chflags(struct chflags_args *uap)
2351 {
2352 	struct nlookupdata nd;
2353 	struct vnode *vp;
2354 	int error;
2355 
2356 	vp = NULL;
2357 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2358 	/* XXX Add NLC flag indicating modifying operation? */
2359 	if (error == 0)
2360 		error = nlookup(&nd);
2361 	if (error == 0)
2362 		error = ncp_writechk(&nd.nl_nch);
2363 	if (error == 0)
2364 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
2365 	nlookup_done(&nd);
2366 	if (error == 0) {
2367 		error = setfflags(vp, uap->flags);
2368 		vrele(vp);
2369 	}
2370 	return (error);
2371 }
2372 
2373 /*
2374  * fchflags_args(int fd, int flags)
2375  *
2376  * Change flags of a file given a file descriptor.
2377  */
2378 /* ARGSUSED */
2379 int
2380 sys_fchflags(struct fchflags_args *uap)
2381 {
2382 	struct thread *td = curthread;
2383 	struct proc *p = td->td_proc;
2384 	struct file *fp;
2385 	int error;
2386 
2387 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
2388 		return (error);
2389 	if (fp->f_nchandle.ncp)
2390 		error = ncp_writechk(&fp->f_nchandle);
2391 	if (error == 0)
2392 		error = setfflags((struct vnode *) fp->f_data, uap->flags);
2393 	fdrop(fp);
2394 	return (error);
2395 }
2396 
2397 static int
2398 setfmode(struct vnode *vp, int mode)
2399 {
2400 	struct thread *td = curthread;
2401 	struct proc *p = td->td_proc;
2402 	int error;
2403 	struct vattr vattr;
2404 
2405 	/*
2406 	 * note: vget is required for any operation that might mod the vnode
2407 	 * so VINACTIVE is properly cleared.
2408 	 */
2409 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
2410 		VATTR_NULL(&vattr);
2411 		vattr.va_mode = mode & ALLPERMS;
2412 		error = VOP_SETATTR(vp, &vattr, p->p_ucred);
2413 		vput(vp);
2414 	}
2415 	return error;
2416 }
2417 
2418 int
2419 kern_chmod(struct nlookupdata *nd, int mode)
2420 {
2421 	struct vnode *vp;
2422 	int error;
2423 
2424 	/* XXX Add NLC flag indicating modifying operation? */
2425 	if ((error = nlookup(nd)) != 0)
2426 		return (error);
2427 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
2428 		return (error);
2429 	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
2430 		error = setfmode(vp, mode);
2431 	vrele(vp);
2432 	return (error);
2433 }
2434 
2435 /*
2436  * chmod_args(char *path, int mode)
2437  *
2438  * Change mode of a file given a path name.
2439  */
2440 /* ARGSUSED */
2441 int
2442 sys_chmod(struct chmod_args *uap)
2443 {
2444 	struct nlookupdata nd;
2445 	int error;
2446 
2447 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2448 	if (error == 0)
2449 		error = kern_chmod(&nd, uap->mode);
2450 	nlookup_done(&nd);
2451 	return (error);
2452 }
2453 
2454 /*
2455  * lchmod_args(char *path, int mode)
2456  *
2457  * Change mode of a file given a path name (don't follow links).
2458  */
2459 /* ARGSUSED */
2460 int
2461 sys_lchmod(struct lchmod_args *uap)
2462 {
2463 	struct nlookupdata nd;
2464 	int error;
2465 
2466 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2467 	if (error == 0)
2468 		error = kern_chmod(&nd, uap->mode);
2469 	nlookup_done(&nd);
2470 	return (error);
2471 }
2472 
2473 /*
2474  * fchmod_args(int fd, int mode)
2475  *
2476  * Change mode of a file given a file descriptor.
2477  */
2478 /* ARGSUSED */
2479 int
2480 sys_fchmod(struct fchmod_args *uap)
2481 {
2482 	struct thread *td = curthread;
2483 	struct proc *p = td->td_proc;
2484 	struct file *fp;
2485 	int error;
2486 
2487 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
2488 		return (error);
2489 	if (fp->f_nchandle.ncp)
2490 		error = ncp_writechk(&fp->f_nchandle);
2491 	if (error == 0)
2492 		error = setfmode((struct vnode *)fp->f_data, uap->mode);
2493 	fdrop(fp);
2494 	return (error);
2495 }
2496 
2497 static int
2498 setfown(struct vnode *vp, uid_t uid, gid_t gid)
2499 {
2500 	struct thread *td = curthread;
2501 	struct proc *p = td->td_proc;
2502 	int error;
2503 	struct vattr vattr;
2504 
2505 	/*
2506 	 * note: vget is required for any operation that might mod the vnode
2507 	 * so VINACTIVE is properly cleared.
2508 	 */
2509 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
2510 		VATTR_NULL(&vattr);
2511 		vattr.va_uid = uid;
2512 		vattr.va_gid = gid;
2513 		error = VOP_SETATTR(vp, &vattr, p->p_ucred);
2514 		vput(vp);
2515 	}
2516 	return error;
2517 }
2518 
2519 int
2520 kern_chown(struct nlookupdata *nd, int uid, int gid)
2521 {
2522 	struct vnode *vp;
2523 	int error;
2524 
2525 	/* XXX Add NLC flag indicating modifying operation? */
2526 	if ((error = nlookup(nd)) != 0)
2527 		return (error);
2528 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
2529 		return (error);
2530 	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
2531 		error = setfown(vp, uid, gid);
2532 	vrele(vp);
2533 	return (error);
2534 }
2535 
2536 /*
2537  * chown(char *path, int uid, int gid)
2538  *
2539  * Set ownership given a path name.
2540  */
2541 int
2542 sys_chown(struct chown_args *uap)
2543 {
2544 	struct nlookupdata nd;
2545 	int error;
2546 
2547 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2548 	if (error == 0)
2549 		error = kern_chown(&nd, uap->uid, uap->gid);
2550 	nlookup_done(&nd);
2551 	return (error);
2552 }
2553 
2554 /*
2555  * lchown_args(char *path, int uid, int gid)
2556  *
2557  * Set ownership given a path name, do not cross symlinks.
2558  */
2559 int
2560 sys_lchown(struct lchown_args *uap)
2561 {
2562 	struct nlookupdata nd;
2563 	int error;
2564 
2565 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2566 	if (error == 0)
2567 		error = kern_chown(&nd, uap->uid, uap->gid);
2568 	nlookup_done(&nd);
2569 	return (error);
2570 }
2571 
2572 /*
2573  * fchown_args(int fd, int uid, int gid)
2574  *
2575  * Set ownership given a file descriptor.
2576  */
2577 /* ARGSUSED */
2578 int
2579 sys_fchown(struct fchown_args *uap)
2580 {
2581 	struct thread *td = curthread;
2582 	struct proc *p = td->td_proc;
2583 	struct file *fp;
2584 	int error;
2585 
2586 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
2587 		return (error);
2588 	if (fp->f_nchandle.ncp)
2589 		error = ncp_writechk(&fp->f_nchandle);
2590 	if (error == 0)
2591 		error = setfown((struct vnode *)fp->f_data, uap->uid, uap->gid);
2592 	fdrop(fp);
2593 	return (error);
2594 }
2595 
2596 static int
2597 getutimes(const struct timeval *tvp, struct timespec *tsp)
2598 {
2599 	struct timeval tv[2];
2600 
2601 	if (tvp == NULL) {
2602 		microtime(&tv[0]);
2603 		TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
2604 		tsp[1] = tsp[0];
2605 	} else {
2606 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
2607 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
2608 	}
2609 	return 0;
2610 }
2611 
2612 static int
2613 setutimes(struct vnode *vp, const struct timespec *ts, int nullflag)
2614 {
2615 	struct thread *td = curthread;
2616 	struct proc *p = td->td_proc;
2617 	int error;
2618 	struct vattr vattr;
2619 
2620 	/*
2621 	 * note: vget is required for any operation that might mod the vnode
2622 	 * so VINACTIVE is properly cleared.
2623 	 */
2624 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
2625 		VATTR_NULL(&vattr);
2626 		vattr.va_atime = ts[0];
2627 		vattr.va_mtime = ts[1];
2628 		if (nullflag)
2629 			vattr.va_vaflags |= VA_UTIMES_NULL;
2630 		error = VOP_SETATTR(vp, &vattr, p->p_ucred);
2631 		vput(vp);
2632 	}
2633 	return error;
2634 }
2635 
2636 int
2637 kern_utimes(struct nlookupdata *nd, struct timeval *tptr)
2638 {
2639 	struct timespec ts[2];
2640 	struct vnode *vp;
2641 	int error;
2642 
2643 	if ((error = getutimes(tptr, ts)) != 0)
2644 		return (error);
2645 	/* XXX Add NLC flag indicating modifying operation? */
2646 	if ((error = nlookup(nd)) != 0)
2647 		return (error);
2648 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2649 		return (error);
2650 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
2651 		return (error);
2652 	error = setutimes(vp, ts, tptr == NULL);
2653 	vrele(vp);
2654 	return (error);
2655 }
2656 
2657 /*
2658  * utimes_args(char *path, struct timeval *tptr)
2659  *
2660  * Set the access and modification times of a file.
2661  */
2662 int
2663 sys_utimes(struct utimes_args *uap)
2664 {
2665 	struct timeval tv[2];
2666 	struct nlookupdata nd;
2667 	int error;
2668 
2669 	if (uap->tptr) {
2670 		error = copyin(uap->tptr, tv, sizeof(tv));
2671 		if (error)
2672 			return (error);
2673 	}
2674 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2675 	if (error == 0)
2676 		error = kern_utimes(&nd, uap->tptr ? tv : NULL);
2677 	nlookup_done(&nd);
2678 	return (error);
2679 }
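
/*
 * Illustrative userland sketch (guarded by #if 0): passing a NULL timeval
 * pointer makes getutimes() above use the current time for both the
 * access and modification timestamps (the VA_UTIMES_NULL case).
 */
#if 0
#include <sys/time.h>
#include <stdio.h>

int
main(int argc, char **argv)
{
	if (argc < 2)
		return (1);
	if (utimes(argv[1], NULL) == -1) {	/* "touch" the file */
		perror("utimes");
		return (1);
	}
	return (0);
}
#endif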
2680 
2681 /*
2682  * lutimes_args(char *path, struct timeval *tptr)
2683  *
2684  * Set the access and modification times of a file.
2685  */
2686 int
2687 sys_lutimes(struct lutimes_args *uap)
2688 {
2689 	struct timeval tv[2];
2690 	struct nlookupdata nd;
2691 	int error;
2692 
2693 	if (uap->tptr) {
2694 		error = copyin(uap->tptr, tv, sizeof(tv));
2695 		if (error)
2696 			return (error);
2697 	}
2698 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2699 	if (error == 0)
2700 		error = kern_utimes(&nd, uap->tptr ? tv : NULL);
2701 	nlookup_done(&nd);
2702 	return (error);
2703 }
2704 
2705 int
2706 kern_futimes(int fd, struct timeval *tptr)
2707 {
2708 	struct thread *td = curthread;
2709 	struct proc *p = td->td_proc;
2710 	struct timespec ts[2];
2711 	struct file *fp;
2712 	int error;
2713 
2714 	error = getutimes(tptr, ts);
2715 	if (error)
2716 		return (error);
2717 	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
2718 		return (error);
2719 	if (fp->f_nchandle.ncp)
2720 		error = ncp_writechk(&fp->f_nchandle);
2721 	if (error == 0)
2722 		error = setutimes((struct vnode *)fp->f_data, ts, tptr == NULL);
2723 	fdrop(fp);
2724 	return (error);
2725 }
2726 
2727 /*
2728  * futimes_args(int fd, struct timeval *tptr)
2729  *
2730  * Set the access and modification times of a file.
2731  */
2732 int
2733 sys_futimes(struct futimes_args *uap)
2734 {
2735 	struct timeval tv[2];
2736 	int error;
2737 
2738 	if (uap->tptr) {
2739 		error = copyin(uap->tptr, tv, sizeof(tv));
2740 		if (error)
2741 			return (error);
2742 	}
2743 
2744 	error = kern_futimes(uap->fd, uap->tptr ? tv : NULL);
2745 
2746 	return (error);
2747 }
2748 
2749 int
2750 kern_truncate(struct nlookupdata *nd, off_t length)
2751 {
2752 	struct vnode *vp;
2753 	struct vattr vattr;
2754 	int error;
2755 
2756 	if (length < 0)
2757 		return(EINVAL);
2758 	/* XXX Add NLC flag indicating modifying operation? */
2759 	if ((error = nlookup(nd)) != 0)
2760 		return (error);
2761 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2762 		return (error);
2763 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
2764 		return (error);
2765 	if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY)) != 0) {
2766 		vrele(vp);
2767 		return (error);
2768 	}
2769 	if (vp->v_type == VDIR) {
2770 		error = EISDIR;
2771 	} else if ((error = vn_writechk(vp, &nd->nl_nch)) == 0 &&
2772 	    (error = VOP_ACCESS(vp, VWRITE, nd->nl_cred)) == 0) {
2773 		VATTR_NULL(&vattr);
2774 		vattr.va_size = length;
2775 		error = VOP_SETATTR(vp, &vattr, nd->nl_cred);
2776 	}
2777 	vput(vp);
2778 	return (error);
2779 }
2780 
2781 /*
2782  * truncate(char *path, int pad, off_t length)
2783  *
2784  * Truncate a file given its path name.
2785  */
2786 int
2787 sys_truncate(struct truncate_args *uap)
2788 {
2789 	struct nlookupdata nd;
2790 	int error;
2791 
2792 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2793 	if (error == 0)
2794 		error = kern_truncate(&nd, uap->length);
2795 	nlookup_done(&nd);
2796 	return error;
2797 }
2798 
2799 int
2800 kern_ftruncate(int fd, off_t length)
2801 {
2802 	struct thread *td = curthread;
2803 	struct proc *p = td->td_proc;
2804 	struct vattr vattr;
2805 	struct vnode *vp;
2806 	struct file *fp;
2807 	int error;
2808 
2809 	if (length < 0)
2810 		return(EINVAL);
2811 	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
2812 		return (error);
2813 	if (fp->f_nchandle.ncp) {
2814 		error = ncp_writechk(&fp->f_nchandle);
2815 		if (error)
2816 			goto done;
2817 	}
2818 	if ((fp->f_flag & FWRITE) == 0) {
2819 		error = EINVAL;
2820 		goto done;
2821 	}
2822 	vp = (struct vnode *)fp->f_data;
2823 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2824 	if (vp->v_type == VDIR) {
2825 		error = EISDIR;
2826 	} else if ((error = vn_writechk(vp, NULL)) == 0) {
2827 		VATTR_NULL(&vattr);
2828 		vattr.va_size = length;
2829 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
2830 	}
2831 	vn_unlock(vp);
2832 done:
2833 	fdrop(fp);
2834 	return (error);
2835 }
2836 
2837 /*
2838  * ftruncate_args(int fd, int pad, off_t length)
2839  *
2840  * Truncate a file given a file descriptor.
2841  */
2842 int
2843 sys_ftruncate(struct ftruncate_args *uap)
2844 {
2845 	int error;
2846 
2847 	error = kern_ftruncate(uap->fd, uap->length);
2848 
2849 	return (error);
2850 }
2851 
2852 /*
2853  * fsync(int fd)
2854  *
2855  * Sync an open file.
2856  */
2857 /* ARGSUSED */
2858 int
2859 sys_fsync(struct fsync_args *uap)
2860 {
2861 	struct thread *td = curthread;
2862 	struct proc *p = td->td_proc;
2863 	struct vnode *vp;
2864 	struct file *fp;
2865 	vm_object_t obj;
2866 	int error;
2867 
2868 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
2869 		return (error);
2870 	vp = (struct vnode *)fp->f_data;
2871 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2872 	if ((obj = vp->v_object) != NULL)
2873 		vm_object_page_clean(obj, 0, 0, 0);
2874 	if ((error = VOP_FSYNC(vp, MNT_WAIT)) == 0 &&
2875 	    vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP) &&
2876 	    bioops.io_fsync) {
2877 		error = (*bioops.io_fsync)(vp);
2878 	}
2879 	vn_unlock(vp);
2880 	fdrop(fp);
2881 	return (error);
2882 }
2883 
2884 int
2885 kern_rename(struct nlookupdata *fromnd, struct nlookupdata *tond)
2886 {
2887 	struct nchandle fnchd;
2888 	struct nchandle tnchd;
2889 	struct namecache *ncp;
2890 	struct vnode *fdvp;
2891 	struct vnode *tdvp;
2892 	struct mount *mp;
2893 	int error;
2894 
2895 	bwillwrite();
2896 	if ((error = nlookup(fromnd)) != 0)
2897 		return (error);
2898 	if ((fnchd.ncp = fromnd->nl_nch.ncp->nc_parent) == NULL)
2899 		return (ENOENT);
2900 	fnchd.mount = fromnd->nl_nch.mount;
2901 	cache_hold(&fnchd);
2902 
2903 	/*
2904 	 * unlock the source nch so we can lookup the target nch without
2905 	 * deadlocking.  The target may or may not exist so we do not check
2906 	 * for a target vp like kern_mkdir() and other creation functions do.
2907 	 *
2908 	 * The source and target directories are ref'd and rechecked after
2909 	 * everything is relocked to determine if the source or target file
2910 	 * has been renamed.
2911 	 */
2912 	KKASSERT(fromnd->nl_flags & NLC_NCPISLOCKED);
2913 	fromnd->nl_flags &= ~NLC_NCPISLOCKED;
2914 	cache_unlock(&fromnd->nl_nch);
2915 
2916 	tond->nl_flags |= NLC_CREATE;
2917 	if ((error = nlookup(tond)) != 0) {
2918 		cache_drop(&fnchd);
2919 		return (error);
2920 	}
2921 	if ((tnchd.ncp = tond->nl_nch.ncp->nc_parent) == NULL) {
2922 		cache_drop(&fnchd);
2923 		return (ENOENT);
2924 	}
2925 	tnchd.mount = tond->nl_nch.mount;
2926 	cache_hold(&tnchd);
2927 
2928 	/*
2929 	 * If the source and target are the same there is nothing to do
2930 	 */
2931 	if (fromnd->nl_nch.ncp == tond->nl_nch.ncp) {
2932 		cache_drop(&fnchd);
2933 		cache_drop(&tnchd);
2934 		return (0);
2935 	}
2936 
2937 	/*
2938 	 * Mount points cannot be renamed or overwritten
2939 	 */
2940 	if ((fromnd->nl_nch.ncp->nc_flag | tond->nl_nch.ncp->nc_flag) &
2941 	    NCF_ISMOUNTPT
2942 	) {
2943 		cache_drop(&fnchd);
2944 		cache_drop(&tnchd);
2945 		return (EINVAL);
2946 	}
2947 
2948 	/*
2949 	 * relock the source ncp.  NOTE AFTER RELOCKING: the source ncp
2950 	 * may have become invalid while it was unlocked, nc_vp and nc_mount
2951 	 * could be NULL.
2952 	 */
2953 	if (cache_lock_nonblock(&fromnd->nl_nch) == 0) {
2954 		cache_resolve(&fromnd->nl_nch, fromnd->nl_cred);
2955 	} else if (fromnd->nl_nch.ncp > tond->nl_nch.ncp) {
2956 		cache_lock(&fromnd->nl_nch);
2957 		cache_resolve(&fromnd->nl_nch, fromnd->nl_cred);
2958 	} else {
2959 		cache_unlock(&tond->nl_nch);
2960 		cache_lock(&fromnd->nl_nch);
2961 		cache_resolve(&fromnd->nl_nch, fromnd->nl_cred);
2962 		cache_lock(&tond->nl_nch);
2963 		cache_resolve(&tond->nl_nch, tond->nl_cred);
2964 	}
2965 	fromnd->nl_flags |= NLC_NCPISLOCKED;
2966 
2967 	/*
2968 	 * make sure the parent directories' linkages are the same
2969 	 */
2970 	if (fnchd.ncp != fromnd->nl_nch.ncp->nc_parent ||
2971 	    tnchd.ncp != tond->nl_nch.ncp->nc_parent) {
2972 		cache_drop(&fnchd);
2973 		cache_drop(&tnchd);
2974 		return (ENOENT);
2975 	}
2976 
2977 	/*
2978 	 * Both the source and target must be within the same filesystem and
2979 	 * in the same filesystem as their parent directories within the
2980 	 * namecache topology.
2981 	 *
2982 	 * NOTE: fromnd's nc_mount or nc_vp could be NULL.
2983 	 */
2984 	mp = fnchd.mount;
2985 	if (mp != tnchd.mount || mp != fromnd->nl_nch.mount ||
2986 	    mp != tond->nl_nch.mount) {
2987 		cache_drop(&fnchd);
2988 		cache_drop(&tnchd);
2989 		return (EXDEV);
2990 	}
2991 
2992 	/*
2993 	 * Make sure the mount point is writable
2994 	 */
2995 	if ((error = ncp_writechk(&tond->nl_nch)) != 0) {
2996 		cache_drop(&fnchd);
2997 		cache_drop(&tnchd);
2998 		return (error);
2999 	}
3000 
3001 	/*
3002 	 * If the target exists and either the source or target is a directory,
3003 	 * then both must be directories.
3004 	 *
3005 	 * Due to relocking of the source, fromnd->nl_nch.ncp->nc_vp might
3006 	 * have become NULL.
3007 	 */
3008 	if (tond->nl_nch.ncp->nc_vp) {
3009 		if (fromnd->nl_nch.ncp->nc_vp == NULL) {
3010 			error = ENOENT;
3011 		} else if (fromnd->nl_nch.ncp->nc_vp->v_type == VDIR) {
3012 			if (tond->nl_nch.ncp->nc_vp->v_type != VDIR)
3013 				error = ENOTDIR;
3014 		} else if (tond->nl_nch.ncp->nc_vp->v_type == VDIR) {
3015 			error = EISDIR;
3016 		}
3017 	}
3018 
3019 	/*
3020 	 * You cannot rename a source into itself or a subdirectory of itself.
3021 	 * We check this by traversing the target directory upwards looking
3022 	 * for a match against the source.
3023 	 */
3024 	if (error == 0) {
3025 		for (ncp = tnchd.ncp; ncp; ncp = ncp->nc_parent) {
3026 			if (fromnd->nl_nch.ncp == ncp) {
3027 				error = EINVAL;
3028 				break;
3029 			}
3030 		}
3031 	}
3032 
3033 	cache_drop(&fnchd);
3034 	cache_drop(&tnchd);
3035 
3036 	/*
3037 	 * Even though the namespaces are different, they may still represent
3038 	 * hardlinks to the same file.  The filesystem might have a hard time
3039 	 * with this so we issue a NREMOVE of the source instead of a NRENAME
3040 	 * with this so we issue an NREMOVE of the source instead of an NRENAME
3041 	 */
3042 	if (error == 0) {
3043 		fdvp = fromnd->nl_nch.ncp->nc_parent->nc_vp;
3044 		tdvp = tond->nl_nch.ncp->nc_parent->nc_vp;
3045 		if (fdvp == NULL || tdvp == NULL) {
3046 			error = EPERM;
3047 		} else if (fromnd->nl_nch.ncp->nc_vp == tond->nl_nch.ncp->nc_vp) {
3048 			/* vhold(fdvp); - dvp can't go away */
3049 			error = VOP_NREMOVE(&fromnd->nl_nch, fdvp,
3050 					    fromnd->nl_cred);
3051 			/* vdrop(fdvp); */
3052 		} else {
3053 			/* vhold(fdvp); - dvp can't go away */
3054 			/* vhold(tdvp); - dvp can't go away */
3055 			error = VOP_NRENAME(&fromnd->nl_nch, &tond->nl_nch,
3056 					    fdvp, tdvp, tond->nl_cred);
3057 			/* vdrop(fdvp); */
3058 			/* vdrop(tdvp); */
3059 		}
3060 	}
3061 	return (error);
3062 }
3063 
3064 /*
3065  * rename_args(char *from, char *to)
3066  *
3067  * Rename files.  Source and destination must either both be directories,
3068  * or both not be directories.  If target is a directory, it must be empty.
3069  */
3070 int
3071 sys_rename(struct rename_args *uap)
3072 {
3073 	struct nlookupdata fromnd, tond;
3074 	int error;
3075 
3076 	error = nlookup_init(&fromnd, uap->from, UIO_USERSPACE, 0);
3077 	if (error == 0) {
3078 		error = nlookup_init(&tond, uap->to, UIO_USERSPACE, 0);
3079 		if (error == 0)
3080 			error = kern_rename(&fromnd, &tond);
3081 		nlookup_done(&tond);
3082 	}
3083 	nlookup_done(&fromnd);
3084 	return (error);
3085 }
3086 
3087 int
3088 kern_mkdir(struct nlookupdata *nd, int mode)
3089 {
3090 	struct thread *td = curthread;
3091 	struct proc *p = td->td_proc;
3092 	struct vnode *vp;
3093 	struct vnode *dvp;
3094 	struct vattr vattr;
3095 	int error;
3096 
3097 	bwillwrite();
3098 	nd->nl_flags |= NLC_WILLBEDIR | NLC_CREATE;
3099 	if ((error = nlookup(nd)) != 0)
3100 		return (error);
3101 
3102 	if (nd->nl_nch.ncp->nc_vp)
3103 		return (EEXIST);
3104 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3105 		return (error);
3106 	if ((dvp = nd->nl_nch.ncp->nc_parent->nc_vp) == NULL)
3107 		return (EPERM);
3108 	/* vhold(dvp); - dvp can't go away */
3109 	VATTR_NULL(&vattr);
3110 	vattr.va_type = VDIR;
3111 	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_fd->fd_cmask;
3112 
3113 	vp = NULL;
3114 	error = VOP_NMKDIR(&nd->nl_nch, dvp, &vp, p->p_ucred, &vattr);
3115 	/* vdrop(dvp); */
3116 	if (error == 0)
3117 		vput(vp);
3118 	return (error);
3119 }
3120 
3121 /*
3122  * mkdir_args(char *path, int mode)
3123  *
3124  * Make a directory file.
3125  */
3126 /* ARGSUSED */
3127 int
3128 sys_mkdir(struct mkdir_args *uap)
3129 {
3130 	struct nlookupdata nd;
3131 	int error;
3132 
3133 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3134 	if (error == 0)
3135 		error = kern_mkdir(&nd, uap->mode);
3136 	nlookup_done(&nd);
3137 	return (error);
3138 }
3139 
3140 int
3141 kern_rmdir(struct nlookupdata *nd)
3142 {
3143 	struct vnode *dvp;
3144 	int error;
3145 
3146 	bwillwrite();
3147 	nd->nl_flags |= NLC_DELETE;
3148 	if ((error = nlookup(nd)) != 0)
3149 		return (error);
3150 
3151 	/*
3152 	 * Do not allow directories representing mount points to be
3153 	 * deleted, even if empty.  Check write perms on mount point
3154 	 * in case the vnode is aliased (aka nullfs).
3155 	 */
3156 	if (nd->nl_nch.ncp->nc_flag & (NCF_ISMOUNTPT))
3157 		return (EINVAL);
3158 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3159 		return (error);
3160 	if ((dvp = nd->nl_nch.ncp->nc_parent->nc_vp) == NULL)
3161 		return (EPERM);
3162 	/* vhold(dvp); - dvp can't go away */
3163 	error = VOP_NRMDIR(&nd->nl_nch, dvp, nd->nl_cred);
3164 	/* vdrop(dvp); */
3165 	return (error);
3166 }
3167 
3168 /*
3169  * rmdir_args(char *path)
3170  *
3171  * Remove a directory file.
3172  */
3173 /* ARGSUSED */
3174 int
3175 sys_rmdir(struct rmdir_args *uap)
3176 {
3177 	struct nlookupdata nd;
3178 	int error;
3179 
3180 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3181 	if (error == 0)
3182 		error = kern_rmdir(&nd);
3183 	nlookup_done(&nd);
3184 	return (error);
3185 }
3186 
3187 int
3188 kern_getdirentries(int fd, char *buf, u_int count, long *basep, int *res,
3189     enum uio_seg direction)
3190 {
3191 	struct thread *td = curthread;
3192 	struct proc *p = td->td_proc;
3193 	struct vnode *vp;
3194 	struct file *fp;
3195 	struct uio auio;
3196 	struct iovec aiov;
3197 	long loff;
3198 	int error, eofflag;
3199 
3200 	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
3201 		return (error);
3202 	if ((fp->f_flag & FREAD) == 0) {
3203 		error = EBADF;
3204 		goto done;
3205 	}
3206 	vp = (struct vnode *)fp->f_data;
3207 unionread:
3208 	if (vp->v_type != VDIR) {
3209 		error = EINVAL;
3210 		goto done;
3211 	}
3212 	aiov.iov_base = buf;
3213 	aiov.iov_len = count;
3214 	auio.uio_iov = &aiov;
3215 	auio.uio_iovcnt = 1;
3216 	auio.uio_rw = UIO_READ;
3217 	auio.uio_segflg = direction;
3218 	auio.uio_td = td;
3219 	auio.uio_resid = count;
3220 	loff = auio.uio_offset = fp->f_offset;
3221 	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL);
3222 	fp->f_offset = auio.uio_offset;
3223 	if (error)
3224 		goto done;
3225 	if (count == auio.uio_resid) {
3226 		if (union_dircheckp) {
3227 			error = union_dircheckp(td, &vp, fp);
3228 			if (error == -1)
3229 				goto unionread;
3230 			if (error)
3231 				goto done;
3232 		}
3233 #if 0
3234 		if ((vp->v_flag & VROOT) &&
3235 		    (vp->v_mount->mnt_flag & MNT_UNION)) {
3236 			struct vnode *tvp = vp;
3237 			vp = vp->v_mount->mnt_vnodecovered;
3238 			vref(vp);
3239 			fp->f_data = vp;
3240 			fp->f_offset = 0;
3241 			vrele(tvp);
3242 			goto unionread;
3243 		}
3244 #endif
3245 	}
3246 	if (basep) {
3247 		*basep = loff;
3248 	}
3249 	*res = count - auio.uio_resid;
3250 done:
3251 	fdrop(fp);
3252 	return (error);
3253 }
3254 
3255 /*
3256  * getdirentries_args(int fd, char *buf, u_int count, long *basep)
3257  *
3258  * Read a block of directory entries in a file system independent format.
3259  */
3260 int
3261 sys_getdirentries(struct getdirentries_args *uap)
3262 {
3263 	long base;
3264 	int error;
3265 
3266 	error = kern_getdirentries(uap->fd, uap->buf, uap->count, &base,
3267 	    &uap->sysmsg_result, UIO_USERSPACE);
3268 
3269 	if (error == 0)
3270 		error = copyout(&base, uap->basep, sizeof(*uap->basep));
3271 	return (error);
3272 }
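
/*
 * Illustrative userland sketch (guarded by #if 0): the directory-reading
 * libc routines are layered on top of this syscall; a plain readdir(3)
 * loop is the typical consumer of the entries returned above.
 */
#if 0
#include <dirent.h>
#include <stdio.h>

int
main(int argc, char **argv)
{
	DIR *dp;
	struct dirent *de;

	if (argc < 2)
		return (1);
	if ((dp = opendir(argv[1])) == NULL) {
		perror("opendir");
		return (1);
	}
	while ((de = readdir(dp)) != NULL)
		printf("%s\n", de->d_name);
	closedir(dp);
	return (0);
}
#endif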
3273 
3274 /*
3275  * getdents_args(int fd, char *buf, size_t count)
3276  */
3277 int
3278 sys_getdents(struct getdents_args *uap)
3279 {
3280 	int error;
3281 
3282 	error = kern_getdirentries(uap->fd, uap->buf, uap->count, NULL,
3283 	    &uap->sysmsg_result, UIO_USERSPACE);
3284 
3285 	return (error);
3286 }
3287 
3288 /*
3289  * umask(int newmask)
3290  *
3291  * Set the mode mask for creation of filesystem nodes.
3292  *
3293  * MP SAFE
3294  */
3295 int
3296 sys_umask(struct umask_args *uap)
3297 {
3298 	struct thread *td = curthread;
3299 	struct proc *p = td->td_proc;
3300 	struct filedesc *fdp;
3301 
3302 	fdp = p->p_fd;
3303 	uap->sysmsg_result = fdp->fd_cmask;
3304 	fdp->fd_cmask = uap->newmask & ALLPERMS;
3305 	return (0);
3306 }
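
/*
 * Illustrative userland sketch (guarded by #if 0): bits set in fd_cmask
 * are cleared from the mode supplied at create time, so with a umask of
 * 022 a file created with mode 0666 ends up 0644 on disk.  The file name
 * is hypothetical.
 */
#if 0
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd;

	umask(022);
	fd = open("newfile", O_CREAT | O_WRONLY, 0666);	/* becomes 0644 */
	if (fd == -1) {
		perror("open");
		return (1);
	}
	close(fd);
	return (0);
}
#endif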
3307 
3308 /*
3309  * revoke(char *path)
3310  *
3311  * Void all references to the file by ripping the underlying filesystem
3312  * away from the vnode.
3313  */
3314 /* ARGSUSED */
3315 int
3316 sys_revoke(struct revoke_args *uap)
3317 {
3318 	struct nlookupdata nd;
3319 	struct vattr vattr;
3320 	struct vnode *vp;
3321 	struct ucred *cred;
3322 	int error;
3323 
3324 	vp = NULL;
3325 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3326 	if (error == 0)
3327 		error = nlookup(&nd);
3328 	if (error == 0)
3329 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3330 	cred = crhold(nd.nl_cred);
3331 	nlookup_done(&nd);
3332 	if (error == 0) {
3333 		if (vp->v_type != VCHR && vp->v_type != VBLK)
3334 			error = EINVAL;
3335 		if (error == 0)
3336 			error = VOP_GETATTR(vp, &vattr);
3337 		if (error == 0 && cred->cr_uid != vattr.va_uid)
3338 			error = suser_cred(cred, PRISON_ROOT);
3339 		if (error == 0 && count_udev(vp->v_umajor, vp->v_uminor) > 0) {
3340 			error = 0;
3341 			vx_lock(vp);
3342 			VOP_REVOKE(vp, REVOKEALL);
3343 			vx_unlock(vp);
3344 		}
3345 		vrele(vp);
3346 	}
3347 	if (cred)
3348 		crfree(cred);
3349 	return (error);
3350 }
3351 
3352 /*
3353  * getfh_args(char *fname, fhandle_t *fhp)
3354  *
3355  * Get (NFS) file handle
3356  */
3357 int
3358 sys_getfh(struct getfh_args *uap)
3359 {
3360 	struct thread *td = curthread;
3361 	struct nlookupdata nd;
3362 	fhandle_t fh;
3363 	struct vnode *vp;
3364 	int error;
3365 
3366 	/*
3367 	 * Must be super user
3368 	 */
3369 	if ((error = suser(td)) != 0)
3370 		return (error);
3371 
3372 	vp = NULL;
3373 	error = nlookup_init(&nd, uap->fname, UIO_USERSPACE, NLC_FOLLOW);
3374 	if (error == 0)
3375 		error = nlookup(&nd);
3376 	if (error == 0)
3377 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
3378 	nlookup_done(&nd);
3379 	if (error == 0) {
3380 		bzero(&fh, sizeof(fh));
3381 		fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
3382 		error = VFS_VPTOFH(vp, &fh.fh_fid);
3383 		vput(vp);
3384 		if (error == 0)
3385 			error = copyout(&fh, uap->fhp, sizeof(fh));
3386 	}
3387 	return (error);
3388 }
3389 
3390 /*
3391  * fhopen_args(const struct fhandle *u_fhp, int flags)
3392  *
3393  * Syscall for rpc.lockd to use to translate an NFS file handle into
3394  * an open descriptor.
3395  *
3396  * Warning: do not remove the suser() call or this becomes one giant
3397  * security hole.
3398  */
3399 int
3400 sys_fhopen(struct fhopen_args *uap)
3401 {
3402 	struct thread *td = curthread;
3403 	struct proc *p = td->td_proc;
3404 	struct mount *mp;
3405 	struct vnode *vp;
3406 	struct fhandle fhp;
3407 	struct vattr vat;
3408 	struct vattr *vap = &vat;
3409 	struct flock lf;
3410 	int fmode, mode, error, type;
3411 	struct file *nfp;
3412 	struct file *fp;
3413 	int indx;
3414 
3415 	/*
3416 	 * Must be super user
3417 	 */
3418 	error = suser(td);
3419 	if (error)
3420 		return (error);
3421 
3422 	fmode = FFLAGS(uap->flags);
3423 	/* why not allow a non-read/write open for our lockd? */
3424 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
3425 		return (EINVAL);
3426 	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
3427 	if (error)
3428 		return(error);
3429 	/* find the mount point */
3430 	mp = vfs_getvfs(&fhp.fh_fsid);
3431 	if (mp == NULL)
3432 		return (ESTALE);
3433 	/* now give me my vnode, it gets returned to me locked */
3434 	error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp);
3435 	if (error)
3436 		return (error);
3437 	/*
3438 	 * From now on we have to make sure not to forget
3439 	 * about the vnode.  Any error that causes an abort
3440 	 * must vput(vp); just set error = err and
3441 	 * 'goto bad;'.
3442 	 */
3443 
3444 	/*
3445 	 * from vn_open
3446 	 */
3447 	if (vp->v_type == VLNK) {
3448 		error = EMLINK;
3449 		goto bad;
3450 	}
3451 	if (vp->v_type == VSOCK) {
3452 		error = EOPNOTSUPP;
3453 		goto bad;
3454 	}
3455 	mode = 0;
3456 	if (fmode & (FWRITE | O_TRUNC)) {
3457 		if (vp->v_type == VDIR) {
3458 			error = EISDIR;
3459 			goto bad;
3460 		}
3461 		error = vn_writechk(vp, NULL);
3462 		if (error)
3463 			goto bad;
3464 		mode |= VWRITE;
3465 	}
3466 	if (fmode & FREAD)
3467 		mode |= VREAD;
3468 	if (mode) {
3469 		error = VOP_ACCESS(vp, mode, p->p_ucred);
3470 		if (error)
3471 			goto bad;
3472 	}
3473 	if (fmode & O_TRUNC) {
3474 		vn_unlock(vp);				/* XXX */
3475 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
3476 		VATTR_NULL(vap);
3477 		vap->va_size = 0;
3478 		error = VOP_SETATTR(vp, vap, p->p_ucred);
3479 		if (error)
3480 			goto bad;
3481 	}
3482 
3483 	/*
3484 	 * VOP_OPEN needs the file pointer so it can potentially override
3485 	 * it.
3486 	 *
3487 	 * WARNING! no f_nchandle will be associated when fhopen()ing a
3488 	 * directory.  XXX
3489 	 */
3490 	if ((error = falloc(p, &nfp, &indx)) != 0)
3491 		goto bad;
3492 	fp = nfp;
3493 
3494 	error = VOP_OPEN(vp, fmode, p->p_ucred, fp);
3495 	if (error) {
3496 		/*
3497 		 * setting f_ops this way prevents VOP_CLOSE from being
3498 		 * called or fdrop() releasing the vp from v_data.   Since
3499 		 * the VOP_OPEN failed we don't want to VOP_CLOSE.
3500 		 */
3501 		fp->f_ops = &badfileops;
3502 		fp->f_data = NULL;
3503 		goto bad_drop;
3504 	}
3505 
3506 	/*
3507 	 * The fp is given its own reference, we still have our ref and lock.
3508 	 *
3509 	 * Assert that all regular files must be created with a VM object.
3510 	 */
3511 	if (vp->v_type == VREG && vp->v_object == NULL) {
3512 		kprintf("fhopen: regular file did not have VM object: %p\n", vp);
3513 		goto bad_drop;
3514 	}
3515 
3516 	/*
3517 	 * The open was successful.  Handle any locking requirements.
3518 	 */
3519 	if (fmode & (O_EXLOCK | O_SHLOCK)) {
3520 		lf.l_whence = SEEK_SET;
3521 		lf.l_start = 0;
3522 		lf.l_len = 0;
3523 		if (fmode & O_EXLOCK)
3524 			lf.l_type = F_WRLCK;
3525 		else
3526 			lf.l_type = F_RDLCK;
3527 		if (fmode & FNONBLOCK)
3528 			type = 0;
3529 		else
3530 			type = F_WAIT;
3531 		vn_unlock(vp);
3532 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) {
3533 			/*
3534 			 * release our private reference.
3535 			 */
3536 			fsetfd(p, NULL, indx);
3537 			fdrop(fp);
3538 			vrele(vp);
3539 			return (error);
3540 		}
3541 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3542 		fp->f_flag |= FHASLOCK;
3543 	}
3544 
3545 	/*
3546 	 * Clean up.  Associate the file pointer with the previously
3547 	 * reserved descriptor and return it.
3548 	 */
3549 	vput(vp);
3550 	fsetfd(p, fp, indx);
3551 	fdrop(fp);
3552 	uap->sysmsg_result = indx;
3553 	return (0);
3554 
3555 bad_drop:
3556 	fsetfd(p, NULL, indx);
3557 	fdrop(fp);
3558 bad:
3559 	vput(vp);
3560 	return (error);
3561 }
3562 
3563 /*
3564  * fhstat_args(struct fhandle *u_fhp, struct stat *sb)
3565  */
3566 int
3567 sys_fhstat(struct fhstat_args *uap)
3568 {
3569 	struct thread *td = curthread;
3570 	struct stat sb;
3571 	fhandle_t fh;
3572 	struct mount *mp;
3573 	struct vnode *vp;
3574 	int error;
3575 
3576 	/*
3577 	 * Must be super user
3578 	 */
3579 	error = suser(td);
3580 	if (error)
3581 		return (error);
3582 
3583 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
3584 	if (error)
3585 		return (error);
3586 
3587 	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
3588 		return (ESTALE);
3589 	if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp)))
3590 		return (error);
3591 	error = vn_stat(vp, &sb, td->td_proc->p_ucred);
3592 	vput(vp);
3593 	if (error)
3594 		return (error);
3595 	error = copyout(&sb, uap->sb, sizeof(sb));
3596 	return (error);
3597 }
3598 
3599 /*
3600  * fhstatfs_args(struct fhandle *u_fhp, struct statfs *buf)
3601  */
3602 int
3603 sys_fhstatfs(struct fhstatfs_args *uap)
3604 {
3605 	struct thread *td = curthread;
3606 	struct proc *p = td->td_proc;
3607 	struct statfs *sp;
3608 	struct mount *mp;
3609 	struct vnode *vp;
3610 	struct statfs sb;
3611 	char *fullpath, *freepath;
3612 	fhandle_t fh;
3613 	int error;
3614 
3615 	/*
3616 	 * Must be super user
3617 	 */
3618 	if ((error = suser(td)))
3619 		return (error);
3620 
3621 	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
3622 		return (error);
3623 
3624 	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
3625 		return (ESTALE);
3626 
3627 	if (p != NULL && !chroot_visible_mnt(mp, p))
3628 		return (ESTALE);
3629 
3630 	if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp)))
3631 		return (error);
3632 	mp = vp->v_mount;
3633 	sp = &mp->mnt_stat;
3634 	vput(vp);
3635 	if ((error = VFS_STATFS(mp, sp, p->p_ucred)) != 0)
3636 		return (error);
3637 
3638 	error = mount_path(p, mp, &fullpath, &freepath);
3639 	if (error)
3640 		return(error);
3641 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3642 	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
3643 	kfree(freepath, M_TEMP);
3644 
3645 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3646 	if (suser(td)) {
3647 		bcopy(sp, &sb, sizeof(sb));
3648 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
3649 		sp = &sb;
3650 	}
3651 	return (copyout(sp, uap->buf, sizeof(*sp)));
3652 }
3653 
3654 /*
3655  * Syscall to push extended attribute configuration information into the
3656  * VFS.  Accepts a path, which it converts to a mountpoint, as well as
3657  * a command (int cmd), an attribute name, and misc data.  For now, the
3658  * attribute name is left in userspace for consumption by the VFS_op.
3659  * It will probably be changed to be copied into kernel space by the
3660  * syscall in the future, once the various consumers of the
3661  * attribute code have weighed in.
3662  *
3663  * Currently this is used only by UFS Extended Attributes.
3664  */
3665 int
3666 sys_extattrctl(struct extattrctl_args *uap)
3667 {
3668 	struct nlookupdata nd;
3669 	struct mount *mp;
3670 	struct vnode *vp;
3671 	int error;
3672 
3673 	vp = NULL;
3674 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3675 	if (error == 0)
3676 		error = nlookup(&nd);
3677 	if (error == 0) {
3678 		mp = nd.nl_nch.mount;
3679 		error = VFS_EXTATTRCTL(mp, uap->cmd,
3680 				uap->attrname, uap->arg,
3681 				nd.nl_cred);
3682 	}
3683 	nlookup_done(&nd);
3684 	return (error);
3685 }
3686 
3687 /*
3688  * Syscall to set a named extended attribute on a file or directory.
3689  * Accepts attribute name, and a uio structure pointing to the data to set.
3690  * The uio is consumed in the style of writev().  The real work happens
3691  * in VOP_SETEXTATTR().
3692  */
3693 int
3694 sys_extattr_set_file(struct extattr_set_file_args *uap)
3695 {
3696 	char attrname[EXTATTR_MAXNAMELEN];
3697 	struct iovec aiov[UIO_SMALLIOV];
3698 	struct iovec *needfree;
3699 	struct nlookupdata nd;
3700 	struct iovec *iov;
3701 	struct vnode *vp;
3702 	struct uio auio;
3703 	u_int iovlen;
3704 	u_int cnt;
3705 	int error;
3706 	int i;
3707 
3708 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
3709 	if (error)
3710 		return (error);
3711 
3712 	vp = NULL;
3713 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3714 	if (error == 0)
3715 		error = nlookup(&nd);
3716 	if (error == 0)
3717 		error = ncp_writechk(&nd.nl_nch);
3718 	if (error == 0)
3719 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
3720 	if (error) {
3721 		nlookup_done(&nd);
3722 		return (error);
3723 	}
3724 
3725 	needfree = NULL;
3726 	iovlen = uap->iovcnt * sizeof(struct iovec);
3727 	if (uap->iovcnt > UIO_SMALLIOV) {
3728 		if (uap->iovcnt > UIO_MAXIOV) {
3729 			error = EINVAL;
3730 			goto done;
3731 		}
3732 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
3733 		needfree = iov;
3734 	} else {
3735 		iov = aiov;
3736 	}
3737 	auio.uio_iov = iov;
3738 	auio.uio_iovcnt = uap->iovcnt;
3739 	auio.uio_rw = UIO_WRITE;
3740 	auio.uio_segflg = UIO_USERSPACE;
3741 	auio.uio_td = nd.nl_td;
3742 	auio.uio_offset = 0;
3743 	if ((error = copyin(uap->iovp, iov, iovlen)))
3744 		goto done;
3745 	auio.uio_resid = 0;
3746 	for (i = 0; i < uap->iovcnt; i++) {
3747 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
3748 			error = EINVAL;
3749 			goto done;
3750 		}
3751 		auio.uio_resid += iov->iov_len;
3752 		iov++;
3753 	}
3754 	cnt = auio.uio_resid;
3755 	error = VOP_SETEXTATTR(vp, attrname, &auio, nd.nl_cred);
3756 	cnt -= auio.uio_resid;
3757 	uap->sysmsg_result = cnt;
3758 done:
3759 	vput(vp);
3760 	nlookup_done(&nd);
3761 	if (needfree)
3762 		FREE(needfree, M_IOV);
3763 	return (error);
3764 }
3765 
3766 /*
3767  * Syscall to get a named extended attribute on a file or directory.
3768  * Accepts attribute name, and a uio structure pointing to a buffer for the
3769  * data.  The uio is consumed in the style of readv().  The real work
3770  * happens in VOP_GETEXTATTR();
3771  */
3772 int
3773 sys_extattr_get_file(struct extattr_get_file_args *uap)
3774 {
3775 	char attrname[EXTATTR_MAXNAMELEN];
3776 	struct iovec aiov[UIO_SMALLIOV];
3777 	struct iovec *needfree;
3778 	struct nlookupdata nd;
3779 	struct iovec *iov;
3780 	struct vnode *vp;
3781 	struct uio auio;
3782 	u_int iovlen;
3783 	u_int cnt;
3784 	int error;
3785 	int i;
3786 
3787 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
3788 	if (error)
3789 		return (error);
3790 
3791 	vp = NULL;
3792 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3793 	if (error == 0)
3794 		error = nlookup(&nd);
3795 	if (error == 0)
3796 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
3797 	if (error) {
3798 		nlookup_done(&nd);
3799 		return (error);
3800 	}
3801 
3802 	iovlen = uap->iovcnt * sizeof (struct iovec);
3803 	needfree = NULL;
3804 	if (uap->iovcnt > UIO_SMALLIOV) {
3805 		if (uap->iovcnt > UIO_MAXIOV) {
3806 			error = EINVAL;
3807 			goto done;
3808 		}
3809 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
3810 		needfree = iov;
3811 	} else {
3812 		iov = aiov;
3813 	}
3814 	auio.uio_iov = iov;
3815 	auio.uio_iovcnt = uap->iovcnt;
3816 	auio.uio_rw = UIO_READ;
3817 	auio.uio_segflg = UIO_USERSPACE;
3818 	auio.uio_td = nd.nl_td;
3819 	auio.uio_offset = 0;
3820 	if ((error = copyin(uap->iovp, iov, iovlen)))
3821 		goto done;
3822 	auio.uio_resid = 0;
3823 	for (i = 0; i < uap->iovcnt; i++) {
3824 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
3825 			error = EINVAL;
3826 			goto done;
3827 		}
3828 		auio.uio_resid += iov->iov_len;
3829 		iov++;
3830 	}
3831 	cnt = auio.uio_resid;
3832 	error = VOP_GETEXTATTR(vp, attrname, &auio, nd.nl_cred);
3833 	cnt -= auio.uio_resid;
3834 	uap->sysmsg_result = cnt;
3835 done:
3836 	vput(vp);
3837 	nlookup_done(&nd);
3838 	if (needfree)
3839 		FREE(needfree, M_IOV);
3840 	return(error);
3841 }
3842 
3843 /*
3844  * Syscall to delete a named extended attribute from a file or directory.
3845  * Accepts attribute name.  The real work happens in VOP_SETEXTATTR().
3846  */
3847 int
3848 sys_extattr_delete_file(struct extattr_delete_file_args *uap)
3849 {
3850 	char attrname[EXTATTR_MAXNAMELEN];
3851 	struct nlookupdata nd;
3852 	struct vnode *vp;
3853 	int error;
3854 
3855 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
3856 	if (error)
3857 		return(error);
3858 
3859 	vp = NULL;
3860 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3861 	if (error == 0)
3862 		error = nlookup(&nd);
3863 	if (error == 0)
3864 		error = ncp_writechk(&nd.nl_nch);
3865 	if (error == 0)
3866 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
3867 	if (error) {
3868 		nlookup_done(&nd);
3869 		return (error);
3870 	}
3871 
3872 	error = VOP_SETEXTATTR(vp, attrname, NULL, nd.nl_cred);
3873 	vput(vp);
3874 	nlookup_done(&nd);
3875 	return(error);
3876 }
3877 
3878 /*
3879  * Determine if the mount is visible to the process.
3880  */
3881 static int
3882 chroot_visible_mnt(struct mount *mp, struct proc *p)
3883 {
3884 	struct nchandle nch;
3885 
3886 	/*
3887 	 * Traverse from the mount point upwards.  If we hit the process
3888 	 * root then the mount point is visible to the process.
3889 	 */
3890 	nch = mp->mnt_ncmountpt;
3891 	while (nch.ncp) {
3892 		if (nch.mount == p->p_fd->fd_nrdir.mount &&
3893 		    nch.ncp == p->p_fd->fd_nrdir.ncp) {
3894 			return(1);
3895 		}
3896 		if (nch.ncp == nch.mount->mnt_ncmountpt.ncp) {
3897 			nch = nch.mount->mnt_ncmounton;
3898 		} else {
3899 			nch.ncp = nch.ncp->nc_parent;
3900 		}
3901 	}
3902 
3903 	/*
3904 	 * If the mount point is not visible to the process, but the
3905 	 * process root is in a subdirectory of the mount, return
3906 	 * TRUE anyway.
3907 	 */
3908 	if (p->p_fd->fd_nrdir.mount == mp)
3909 		return(1);
3910 
3911 	return(0);
3912 }
3913 
3914