xref: /dflybsd-src/sys/kern/vfs_syscalls.c (revision ab4c55c707dde5384d8a233485cc87b3ea249687)
1  /*
2   * Copyright (c) 1989, 1993
3   *	The Regents of the University of California.  All rights reserved.
4   * (c) UNIX System Laboratories, Inc.
5   * All or some portions of this file are derived from material licensed
6   * to the University of California by American Telephone and Telegraph
7   * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8   * the permission of UNIX System Laboratories, Inc.
9   *
10   * Redistribution and use in source and binary forms, with or without
11   * modification, are permitted provided that the following conditions
12   * are met:
13   * 1. Redistributions of source code must retain the above copyright
14   *    notice, this list of conditions and the following disclaimer.
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in the
17   *    documentation and/or other materials provided with the distribution.
18   * 3. Neither the name of the University nor the names of its contributors
19   *    may be used to endorse or promote products derived from this software
20   *    without specific prior written permission.
21   *
22   * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25   * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26   * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27   * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28   * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30   * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31   * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32   * SUCH DAMAGE.
33   *
34   *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
35   * $FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.151.2.18 2003/04/04 20:35:58 tegge Exp $
36   */
37  
38  #include <sys/param.h>
39  #include <sys/systm.h>
40  #include <sys/buf.h>
41  #include <sys/conf.h>
42  #include <sys/sysent.h>
43  #include <sys/malloc.h>
44  #include <sys/mount.h>
45  #include <sys/mountctl.h>
46  #include <sys/sysmsg.h>
47  #include <sys/filedesc.h>
48  #include <sys/kernel.h>
49  #include <sys/fcntl.h>
50  #include <sys/file.h>
51  #include <sys/linker.h>
52  #include <sys/stat.h>
53  #include <sys/unistd.h>
54  #include <sys/vnode.h>
55  #include <sys/proc.h>
56  #include <sys/caps.h>
57  #include <sys/jail.h>
58  #include <sys/namei.h>
59  #include <sys/nlookup.h>
60  #include <sys/dirent.h>
61  #include <sys/extattr.h>
62  #include <sys/spinlock.h>
63  #include <sys/kern_syscall.h>
64  #include <sys/objcache.h>
65  #include <sys/sysctl.h>
66  
67  #include <sys/buf2.h>
68  #include <sys/file2.h>
69  #include <sys/spinlock2.h>
70  
71  #include <vm/vm.h>
72  #include <vm/vm_object.h>
73  #include <vm/vm_page.h>
74  
75  #include <machine/limits.h>
76  #include <machine/stdarg.h>
77  
78  #define UMOUNTF_RETRIES		50	/* 0.25 seconds per retry */
79  
80  static void mount_warning(struct mount *mp, const char *ctl, ...)
81  		__printflike(2, 3);
82  static int mount_path(struct proc *p, struct mount *mp, char **rb, char **fb);
83  static int checkvp_chdir (struct vnode *vn, struct thread *td);
84  static void checkdirs (struct nchandle *old_nch, struct nchandle *new_nch);
85  static int get_fscap(const char *);
86  static int chroot_refuse_vdir_fds (thread_t td, struct filedesc *fdp);
87  static int chroot_visible_mnt(struct mount *mp, struct proc *p);
88  static int getutimes (struct timeval *, struct timespec *);
89  static int getutimens (const struct timespec *, struct timespec *, int *);
90  static int setfown (struct mount *, struct vnode *, uid_t, gid_t);
91  static int setfmode (struct vnode *, int);
92  static int setfflags (struct vnode *, u_long);
93  static int setutimes (struct vnode *, struct vattr *,
94  			const struct timespec *, int);
95  
96  static int	usermount = 0;	/* if 1, non-root can mount fs. */
97  SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
98      "Allow non-root users to mount filesystems");
99  
100  static int	debug_unmount = 0; /* if 1 loop until unmount success */
101  SYSCTL_INT(_vfs, OID_AUTO, debug_unmount, CTLFLAG_RW, &debug_unmount, 0,
102      "Stall failed unmounts in loop");
103  
104  static struct krate krate_rename = { 1 };
105  
106  /*
107   * Virtual File System System Calls
108   */
109  
110  /*
111   * Mount a file system.
112   *
113   * mount_args(char *type, char *path, int flags, caddr_t data)
114   *
115   * MPALMOSTSAFE
116   */
117  int
118  sys_mount(struct sysmsg *sysmsg, const struct mount_args *uap)
119  {
120  	struct thread *td = curthread;
121  	struct vnode *vp;
122  	struct nchandle nch;
123  	struct mount *mp, *nullmp;
124  	struct vfsconf *vfsp;
125  	int error, flag = 0, flag2 = 0;
126  	int hasmount;
127  	int priv = 0;
128  	int flags = uap->flags;
129  	struct vattr va;
130  	struct nlookupdata nd;
131  	char fstypename[MFSNAMELEN];
132  	struct ucred *cred;
133  
134  	cred = td->td_ucred;
135  
136  	/* We do not allow user mounts inside a jail for now */
137  	if (usermount && jailed(cred)) {
138  		error = EPERM;
139  		goto done;
140  	}
141  
142  	/*
143  	 * Extract the file system type. We need to know this early, to take
144  	 * appropriate actions for jails and the filesystems to mount.
145  	 */
146          if ((error = copyinstr(uap->type, fstypename, MFSNAMELEN, NULL)) != 0)
147  		goto done;
148  
149  	/*
150  	 * Select the correct cap according to the file system type.
151  	 */
152  	priv = get_fscap(fstypename);
153  
154  	if (usermount == 0 && (error = caps_priv_check_td(td, priv)))
155  		goto done;
156  
157  	/*
158  	 * Do not allow NFS export by non-root users.
159  	 */
160  	if (flags & MNT_EXPORTED) {
161  		error = caps_priv_check_td(td, priv);
162  		if (error)
163  			goto done;
164  	}
165  	/*
166  	 * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users
167  	 */
168  	if (caps_priv_check_td(td, priv))
169  		flags |= MNT_NOSUID | MNT_NODEV;
170  
171  	/*
172  	 * Lookup the requested path and extract the nch and vnode.
173  	 */
174  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
175  	if (error == 0) {
176  		if ((error = nlookup(&nd)) == 0) {
177  			if (nd.nl_nch.ncp->nc_vp == NULL)
178  				error = ENOENT;
179  		}
180  	}
181  	if (error) {
182  		nlookup_done(&nd);
183  		goto done;
184  	}
185  
186  	/*
187  	 * If the target filesystem is resolved via a nullfs mount, then
188  	 * nd.nl_nch.mount will be pointing to the nullfs mount structure
189  	 * instead of the target file system. We need it in case we are
190  	 * doing an update.
191  	 */
192  	nullmp = nd.nl_nch.mount;
193  
194  	/*
195  	 * Extract the locked+refd ncp and cleanup the nd structure
196  	 */
197  	nch = nd.nl_nch;
198  	cache_zero(&nd.nl_nch);
199  	nlookup_done(&nd);
200  
201  	if ((nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
202  	    (mp = cache_findmount(&nch)) != NULL) {
203  		cache_dropmount(mp);
204  		hasmount = 1;
205  	} else {
206  		hasmount = 0;
207  	}
208  
209  
210  	/*
211  	 * now we have the locked ref'd nch and unreferenced vnode.
212  	 */
213  	vp = nch.ncp->nc_vp;
214  	if ((error = vget(vp, LK_EXCLUSIVE)) != 0) {
215  		cache_put(&nch);
216  		goto done;
217  	}
218  	cache_unlock(&nch);
219  
220  	/*
221  	 * Now we have an unlocked ref'd nch and a locked ref'd vp
222  	 */
223  	if (flags & MNT_UPDATE) {
224  		if ((vp->v_flag & (VROOT|VPFSROOT)) == 0) {
225  			cache_drop(&nch);
226  			vput(vp);
227  			error = EINVAL;
228  			goto done;
229  		}
230  
231  		if (strncmp(fstypename, "null", 5) == 0) {
232  			KKASSERT(nullmp);
233  			mp = nullmp;
234  		} else {
235  			mp = vp->v_mount;
236  		}
237  
238  		flag = mp->mnt_flag;
239  		flag2 = mp->mnt_kern_flag;
240  		/*
241  		 * We only allow the filesystem to be reloaded if it
242  		 * is currently mounted read-only.
243  		 */
244  		if ((flags & MNT_RELOAD) &&
245  		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
246  			cache_drop(&nch);
247  			vput(vp);
248  			error = EOPNOTSUPP;	/* Needs translation */
249  			goto done;
250  		}
251  		/*
252  		 * Only root, or the user that did the original mount is
253  		 * permitted to update it.
254  		 */
255  		if (mp->mnt_stat.f_owner != cred->cr_uid &&
256  		    (error = caps_priv_check_td(td, priv))) {
257  			cache_drop(&nch);
258  			vput(vp);
259  			goto done;
260  		}
261  		if (vfs_busy(mp, LK_NOWAIT)) {
262  			cache_drop(&nch);
263  			vput(vp);
264  			error = EBUSY;
265  			goto done;
266  		}
267  		if (hasmount) {
268  			cache_drop(&nch);
269  			vfs_unbusy(mp);
270  			vput(vp);
271  			error = EBUSY;
272  			goto done;
273  		}
274  		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
275  		lwkt_gettoken(&mp->mnt_token);
276  		vn_unlock(vp);
277  		vfsp = mp->mnt_vfc;
278  		goto update;
279  	}
280  
281  	/*
282  	 * If the user is not root, ensure that they own the directory
283  	 * onto which we are attempting to mount.
284  	 */
285  	if ((error = VOP_GETATTR(vp, &va)) ||
286  	    (va.va_uid != cred->cr_uid &&
287  	     (error = caps_priv_check_td(td, priv)))) {
288  		cache_drop(&nch);
289  		vput(vp);
290  		goto done;
291  	}
292  	if ((error = vinvalbuf(vp, V_SAVE, 0, 0)) != 0) {
293  		cache_drop(&nch);
294  		vput(vp);
295  		goto done;
296  	}
297  	if (vp->v_type != VDIR) {
298  		cache_drop(&nch);
299  		vput(vp);
300  		error = ENOTDIR;
301  		goto done;
302  	}
303  	if (vp->v_mount->mnt_kern_flag & MNTK_NOSTKMNT) {
304  		cache_drop(&nch);
305  		vput(vp);
306  		error = EPERM;
307  		goto done;
308  	}
309  	vfsp = vfsconf_find_by_name(fstypename);
310  	if (vfsp == NULL) {
311  		linker_file_t lf;
312  
313  		/* Only load modules for root (very important!) */
314  		error = caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT);
315  		if (error) {
316  			cache_drop(&nch);
317  			vput(vp);
318  			goto done;
319  		}
320  		error = linker_load_file(fstypename, &lf);
321  		if (error || lf == NULL) {
322  			cache_drop(&nch);
323  			vput(vp);
324  			if (lf == NULL)
325  				error = ENODEV;
326  			goto done;
327  		}
328  		lf->userrefs++;
329  		/* lookup again, see if the VFS was loaded */
330  		vfsp = vfsconf_find_by_name(fstypename);
331  		if (vfsp == NULL) {
332  			lf->userrefs--;
333  			linker_file_unload(lf);
334  			cache_drop(&nch);
335  			vput(vp);
336  			error = ENODEV;
337  			goto done;
338  		}
339  	}
340  	if (hasmount) {
341  		cache_drop(&nch);
342  		vput(vp);
343  		error = EBUSY;
344  		goto done;
345  	}
346  
347  	/*
348  	 * Allocate and initialize the filesystem.
349  	 */
350  	mp = kmalloc(sizeof(struct mount), M_MOUNT, M_ZERO|M_WAITOK);
351  	mount_init(mp, vfsp->vfc_vfsops);
352  	vfs_busy(mp, LK_NOWAIT);
353  	mp->mnt_vfc = vfsp;
354  	mp->mnt_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;
355  	vfsp->vfc_refcount++;
356  	mp->mnt_stat.f_type = vfsp->vfc_typenum;
357  	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
358  	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
359  	mp->mnt_stat.f_owner = cred->cr_uid;
360  	lwkt_gettoken(&mp->mnt_token);
361  	vn_unlock(vp);
362  update:
363  	/*
364  	 * (per-mount token acquired at this point)
365  	 *
366  	 * Set the mount level flags.
367  	 */
368  	if (flags & MNT_RDONLY)
369  		mp->mnt_flag |= MNT_RDONLY;
370  	else if (mp->mnt_flag & MNT_RDONLY)
371  		mp->mnt_kern_flag |= MNTK_WANTRDWR;
372  	mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
373  	    MNT_SYNCHRONOUS | MNT_ASYNC | MNT_NOATIME |
374  	    MNT_NOSYMFOLLOW | MNT_IGNORE | MNT_TRIM |
375  	    MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR |
376  	    MNT_AUTOMOUNTED);
377  	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC |
378  	    MNT_NODEV | MNT_SYNCHRONOUS | MNT_ASYNC | MNT_FORCE |
379  	    MNT_NOSYMFOLLOW | MNT_IGNORE | MNT_TRIM |
380  	    MNT_NOATIME | MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR |
381  	    MNT_AUTOMOUNTED);
382  
383  	/*
384  	 * Pre-set the mount's ALL_MPSAFE flags if specified in the vfsconf.
385  	 * This way the initial VFS_MOUNT() call will also be MPSAFE.
386  	 */
387  	if (vfsp->vfc_flags & VFCF_MPSAFE)
388  		mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;
389  
390  	/*
391  	 * Mount the filesystem.
392  	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
393  	 * get.
394  	 */
395  	if (mp->mnt_flag & MNT_UPDATE) {
396  		error = VFS_MOUNT(mp, uap->path, uap->data, cred);
397  		if (mp->mnt_kern_flag & MNTK_WANTRDWR)
398  			mp->mnt_flag &= ~MNT_RDONLY;
399  		mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
400  		mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
401  		if (error) {
402  			mp->mnt_flag = flag;
403  			mp->mnt_kern_flag = flag2;
404  		}
405  		lwkt_reltoken(&mp->mnt_token);
406  		vfs_unbusy(mp);
407  		vrele(vp);
408  		cache_drop(&nch);
409  		goto done;
410  	}
411  	mp->mnt_ncmounton = nch;
412  	error = VFS_MOUNT(mp, uap->path, uap->data, cred);
413  	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
414  
415  	/*
416  	 * Put the new filesystem on the mount list after root.  The mount
417  	 * point gets its own mnt_ncmountpt (unless the VFS already set one
418  	 * up) which represents the root of the mount.  The lookup code
419  	 * detects the mount point going forward and checks the root of
420  	 * the mount going backwards.
421  	 *
422  	 * It is not necessary to invalidate or purge the vnode underneath
423  	 * because elements under the mount will be given their own glue
424  	 * namecache record.
425  	 */
426  	if (!error) {
427  		if (mp->mnt_ncmountpt.ncp == NULL) {
428  			/*
429  			 * Allocate, then unlock, but leave the ref intact.
430  			 * This is the mnt_refs (1) that we will retain
431  			 * through to the unmount.
432  			 */
433  			cache_allocroot(&mp->mnt_ncmountpt, mp, NULL);
434  			cache_unlock(&mp->mnt_ncmountpt);
435  		}
436  		vn_unlock(vp);
437  		cache_lock(&nch);
438  		nch.ncp->nc_flag |= NCF_ISMOUNTPT;
439  		cache_unlock(&nch);
440  		cache_ismounting(mp);
441  		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
442  
443  		mountlist_insert(mp, MNTINS_LAST);
444  		vn_unlock(vp);
445  		checkdirs(&mp->mnt_ncmounton, &mp->mnt_ncmountpt);
446  		error = vfs_allocate_syncvnode(mp);
447  		lwkt_reltoken(&mp->mnt_token);
448  		vfs_unbusy(mp);
449  		error = VFS_START(mp, 0);
450  		vrele(vp);
451  		KNOTE(&fs_klist, VQ_MOUNT);
452  	} else {
453  		bzero(&mp->mnt_ncmounton, sizeof(mp->mnt_ncmounton));
454  		vn_syncer_thr_stop(mp);
455  		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
456  		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
457  		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
458  		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
459  		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
460  		if (mp->mnt_cred) {
461  			crfree(mp->mnt_cred);
462  			mp->mnt_cred = NULL;
463  		}
464  		mp->mnt_vfc->vfc_refcount--;
465  		lwkt_reltoken(&mp->mnt_token);
466  		vfs_unbusy(mp);
467  		kfree(mp, M_MOUNT);
468  		cache_drop(&nch);
469  		vput(vp);
470  	}
471  done:
472  	return (error);
473  }
474  
475  /*
476   * Scan all active processes to see if any of them have a current
477   * or root directory onto which the new filesystem has just been
478   * mounted. If so, replace them with the new mount point.
479   *
480   * Both old_nch and new_nch are ref'd on call but not locked.
481   * new_nch must be temporarily locked so it can be associated with the
482   * vnode representing the root of the mount point.
483   */
484  struct checkdirs_info {
485  	struct nchandle old_nch;
486  	struct nchandle new_nch;
487  	struct vnode *old_vp;
488  	struct vnode *new_vp;
489  };
490  
491  static int checkdirs_callback(struct proc *p, void *data);
492  
493  static void
494  checkdirs(struct nchandle *old_nch, struct nchandle *new_nch)
495  {
496  	struct checkdirs_info info;
497  	struct vnode *olddp;
498  	struct vnode *newdp;
499  	struct mount *mp;
500  
501  	/*
502  	 * If the old mount point's vnode has a usecount of 1, it is not
503  	 * being held as a descriptor anywhere.
504  	 */
505  	olddp = old_nch->ncp->nc_vp;
506  	if (olddp == NULL || VREFCNT(olddp) == 1)
507  		return;
508  
509  	/*
510  	 * Force the root vnode of the new mount point to be resolved
511  	 * so we can update any matching processes.
512  	 */
513  	mp = new_nch->mount;
514  	if (VFS_ROOT(mp, &newdp))
515  		panic("mount: lost mount");
516  	vn_unlock(newdp);
517  	cache_lock(new_nch);
518  	vn_lock(newdp, LK_EXCLUSIVE | LK_RETRY);
519  	cache_setunresolved(new_nch);
520  	cache_setvp(new_nch, newdp);
521  	cache_unlock(new_nch);
522  
523  	/*
524  	 * Special handling of the root node
525  	 */
526  	if (rootvnode == olddp) {
527  		vref(newdp);
528  		vfs_cache_setroot(newdp, cache_hold(new_nch));
529  	}
530  
531  	/*
532  	 * Pass newdp separately so the callback does not have to access
533  	 * it via new_nch->ncp->nc_vp.
534  	 */
535  	info.old_nch = *old_nch;
536  	info.new_nch = *new_nch;
537  	info.new_vp = newdp;
538  	allproc_scan(checkdirs_callback, &info, 0);
539  	vput(newdp);
540  }
541  
542  /*
543   * NOTE: callback is not MP safe because the scanned process's filedesc
544   * structure can be ripped out from under us, amoung other things.
545   */
546  static int
547  checkdirs_callback(struct proc *p, void *data)
548  {
549  	struct checkdirs_info *info = data;
550  	struct filedesc *fdp;
551  	struct nchandle ncdrop1;
552  	struct nchandle ncdrop2;
553  	struct vnode *vprele1;
554  	struct vnode *vprele2;
555  
556  	if ((fdp = p->p_fd) != NULL) {
557  		cache_zero(&ncdrop1);
558  		cache_zero(&ncdrop2);
559  		vprele1 = NULL;
560  		vprele2 = NULL;
561  
562  		/*
563  		 * MPUNSAFE - XXX fdp can be pulled out from under a
564  		 * foreign process.
565  		 *
566  		 * A shared filedesc is ok, we don't have to copy it
567  		 * because we are making this change globally.
568  		 */
569  		spin_lock(&fdp->fd_spin);
570  		if (fdp->fd_ncdir.mount == info->old_nch.mount &&
571  		    fdp->fd_ncdir.ncp == info->old_nch.ncp) {
572  			vprele1 = fdp->fd_cdir;
573  			vref(info->new_vp);
574  			fdp->fd_cdir = info->new_vp;
575  			ncdrop1 = fdp->fd_ncdir;
576  			cache_copy(&info->new_nch, &fdp->fd_ncdir);
577  		}
578  		if (fdp->fd_nrdir.mount == info->old_nch.mount &&
579  		    fdp->fd_nrdir.ncp == info->old_nch.ncp) {
580  			vprele2 = fdp->fd_rdir;
581  			vref(info->new_vp);
582  			fdp->fd_rdir = info->new_vp;
583  			ncdrop2 = fdp->fd_nrdir;
584  			cache_copy(&info->new_nch, &fdp->fd_nrdir);
585  		}
586  		spin_unlock(&fdp->fd_spin);
587  		if (ncdrop1.ncp)
588  			cache_drop(&ncdrop1);
589  		if (ncdrop2.ncp)
590  			cache_drop(&ncdrop2);
591  		if (vprele1)
592  			vrele(vprele1);
593  		if (vprele2)
594  			vrele(vprele2);
595  	}
596  	return(0);
597  }
598  
599  /*
600   * Unmount a file system.
601   *
602   * Note: unmount takes a path to the vnode mounted on as argument,
603   * not special file (as before).
604   *
605   * umount_args(char *path, int flags)
606   *
607   * MPALMOSTSAFE
608   */
609  int
610  sys_unmount(struct sysmsg *sysmsg, const struct unmount_args *uap)
611  {
612  	struct thread *td = curthread;
613  	struct proc *p __debugvar = td->td_proc;
614  	struct mount *mp = NULL;
615  	struct nlookupdata nd;
616  	char fstypename[MFSNAMELEN];
617  	int priv = 0;
618  	int error;
619  	struct ucred *cred;
620  
621  	cred = td->td_ucred;
622  
623  	KKASSERT(p);
624  
625  	/* We do not allow user umounts inside a jail for now */
626  	if (usermount && jailed(cred)) {
627  		error = EPERM;
628  		goto done;
629  	}
630  
631  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE,
632  			     NLC_FOLLOW | NLC_IGNBADDIR);
633  	if (error == 0)
634  		error = nlookup(&nd);
635  	if (error)
636  		goto out;
637  
638  	mp = nd.nl_nch.mount;
639  
640  	/* Figure out the fsname in order to select proper privs */
641  	ksnprintf(fstypename, MFSNAMELEN, "%s", mp->mnt_vfc->vfc_name);
642  	priv = get_fscap(fstypename);
643  
644  	if (usermount == 0 && (error = caps_priv_check_td(td, priv))) {
645  		nlookup_done(&nd);
646  		goto done;
647  	}
648  
649  	/*
650  	 * Only root, or the user that did the original mount is
651  	 * permitted to unmount this filesystem.
652  	 */
653  	if ((mp->mnt_stat.f_owner != td->td_ucred->cr_uid) &&
654  	    (error = caps_priv_check_td(td, priv)))
655  	{
656  		goto out;
657  	}
658  
659  	/*
660  	 * Don't allow unmounting the root file system.
661  	 */
662  	if (mp->mnt_flag & MNT_ROOTFS) {
663  		error = EINVAL;
664  		goto out;
665  	}
666  
667  	/*
668  	 * Must be the root of the filesystem
669  	 */
670  	if (nd.nl_nch.ncp != mp->mnt_ncmountpt.ncp) {
671  		error = EINVAL;
672  		goto out;
673  	}
674  
675  	/* Check if this mount belongs to this prison */
676  	if (jailed(cred) && mp->mnt_cred && (!mp->mnt_cred->cr_prison ||
677  		mp->mnt_cred->cr_prison != cred->cr_prison)) {
678  		kprintf("mountpoint %s does not belong to this jail\n",
679  		    uap->path);
680  		error = EPERM;
681  		goto out;
682  	}
683  
684  	/*
685  	 * If no error try to issue the unmount.  We lose our cache
686  	 * ref when we call nlookup_done so we must hold the mount point
687  	 * to prevent use-after-free races.
688  	 */
689  out:
690  	if (error == 0) {
691  		mount_hold(mp);
692  		nlookup_done(&nd);
693  		error = dounmount(mp, uap->flags, 0);
694  		mount_drop(mp);
695  	} else {
696  		nlookup_done(&nd);
697  	}
698  done:
699  	return (error);
700  }
701  
702  /*
703   * Do the actual file system unmount (interlocked against the mountlist
704   * token and mp->mnt_token).
705   */
706  static int
707  dounmount_interlock(struct mount *mp)
708  {
709  	if (mp->mnt_kern_flag & MNTK_UNMOUNT)
710  		return (EBUSY);
711  	mp->mnt_kern_flag |= MNTK_UNMOUNT;
712  	return(0);
713  }
714  
715  /*
716   * Returns non-zero if the specified process uses the specified
717   * mount point.
718   */
719  static int
720  process_uses_mount(struct proc *p, struct mount *mp)
721  {
722  	struct filedesc *fdp;
723  	struct file *fp;
724  	int found;
725  	int n;
726  
727  	fdp = p->p_fd;
728  	if (fdp == NULL)
729  		return 0;
730  	if (fdp->fd_ncdir.mount == mp ||
731  	    fdp->fd_nrdir.mount == mp ||
732  	    fdp->fd_njdir.mount == mp)
733  	{
734  		return 1;
735  	}
736  
737  	found = 0;
738  	spin_lock_shared(&fdp->fd_spin);
739  	for (n = 0; n < fdp->fd_nfiles; ++n) {
740  		fp = fdp->fd_files[n].fp;
741  		if (fp && fp->f_nchandle.mount == mp) {
742  			found = 1;
743  			break;
744  		}
745  	}
746  	spin_unlock_shared(&fdp->fd_spin);
747  
748  	return found;
749  }
750  
751  /*
752   * Cleanup processes that have references to the mount point
753   * being force-unmounted.
754   */
755  struct unmount_allproc_info {
756  	struct mount *mp;
757  	int sig;
758  };
759  
760  static int
761  unmount_allproc_cb(struct proc *p, void *arg)
762  {
763  	struct unmount_allproc_info *info;
764  	struct mount *mp;
765  
766  	info = arg;
767  	mp = info->mp;
768  
769  	if (p->p_textnch.mount == mp)
770  		cache_drop(&p->p_textnch);
771  	if (info->sig && process_uses_mount(p, mp)) {
772  		lwkt_gettoken(&p->p_token);
773  		p->p_flags |= P_MUSTKILL;
774  		lwkt_reltoken(&p->p_token);
775  		ksignal(p, info->sig);
776  	}
777  
778  	return 0;
779  }
780  
781  /*
782   * The guts of the unmount code.  The mount owns one ref and one hold
783   * count.  If we successfully interlock the unmount, those refs are ours.
784   * (The ref is from mnt_ncmountpt).
785   *
786   * When halting we shortcut certain mount types such as devfs by not actually
787   * issuing the VFS_SYNC() or VFS_UNMOUNT().  They are still disconnected
788   * from the mountlist so higher-level filesystems can unmount cleanly.
789   *
790   * The mount types that allow QUICKHALT are: devfs, tmpfs, procfs.
791   */
792  int
793  dounmount(struct mount *mp, int flags, int halting)
794  {
795  	struct namecache *ncp;
796  	struct nchandle nch;
797  	struct vnode *vp;
798  	int error;
799  	int async_flag;
800  	int lflags;
801  	int freeok = 1;
802  	int hadsyncer = 0;
803  	int retry;
804  	int quickhalt;
805  
806  	lwkt_gettoken(&mp->mnt_token);
807  
808  	/*
809  	 * When halting, certain mount points can essentially just
810  	 * be unhooked and otherwise ignored.
811  	 */
812  	if (halting && (mp->mnt_kern_flag & MNTK_QUICKHALT)) {
813  		quickhalt = 1;
814  		freeok = 0;
815  	} else {
816  		quickhalt = 0;
817  	}
818  
819  
820  	/*
821  	 * Exclusive access for unmounting purposes.
822  	 */
823  	if ((error = mountlist_interlock(dounmount_interlock, mp)) != 0)
824  		goto out;
825  
826  	/*
827  	 * We now 'own' the last mp->mnt_refs
828  	 *
829  	 * Allow filesystems to detect that a forced unmount is in progress.
830  	 */
831  	if (flags & MNT_FORCE)
832  		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
833  	lflags = LK_EXCLUSIVE | ((flags & MNT_FORCE) ? 0 : LK_TIMELOCK);
834  	error = lockmgr(&mp->mnt_lock, lflags);
835  	if (error) {
836  		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
837  		if (mp->mnt_kern_flag & MNTK_MWAIT) {
838  			mp->mnt_kern_flag &= ~MNTK_MWAIT;
839  			wakeup(mp);
840  		}
841  		goto out;
842  	}
843  
844  	if (mp->mnt_flag & MNT_EXPUBLIC)
845  		vfs_setpublicfs(NULL, NULL, NULL);
846  
847  	vfs_msync(mp, MNT_WAIT);
848  	async_flag = mp->mnt_flag & MNT_ASYNC;
849  	mp->mnt_flag &=~ MNT_ASYNC;
850  
851  	/*
852  	 * Decomission our special mnt_syncer vnode.  This also stops
853  	 * the vnlru code.  If we are unable to unmount we recommission
854  	 * the vnode.
855  	 *
856  	 * Then sync the filesystem.
857  	 */
858  	if ((vp = mp->mnt_syncer) != NULL) {
859  		mp->mnt_syncer = NULL;
860  		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
861  		vrele(vp);
862  		hadsyncer = 1;
863  	}
864  
865  	/*
866  	 * Sync normally-mounted filesystem.
867  	 */
868  	if (quickhalt == 0) {
869  		if ((mp->mnt_flag & MNT_RDONLY) == 0)
870  			VFS_SYNC(mp, MNT_WAIT);
871  	}
872  
873  	/*
874  	 * nchandle records ref the mount structure.  Expect a count of 1
875  	 * (our mount->mnt_ncmountpt).
876  	 *
877  	 * Scans can get temporary refs on a mountpoint (thought really
878  	 * heavy duty stuff like cache_findmount() do not).
879  	 */
880  	for (retry = 0; (retry < UMOUNTF_RETRIES || debug_unmount); ++retry) {
881  		int dummy = 0;
882  
883  		/*
884  		 * Invalidate the namecache topology under the mount.
885  		 * nullfs mounts alias a real mount's namecache topology
886  		 * and it should not be invalidated in that case.
887  		 */
888  		if ((mp->mnt_kern_flag & MNTK_NCALIASED) == 0) {
889  			cache_lock(&mp->mnt_ncmountpt);
890  			cache_inval(&mp->mnt_ncmountpt,
891  				    CINV_DESTROY|CINV_CHILDREN);
892  			cache_unlock(&mp->mnt_ncmountpt);
893  		}
894  
895  		/*
896  		 * Clear pcpu caches
897  		 */
898  		cache_unmounting(mp);
899  		if (mp->mnt_refs != 1)
900  			cache_clearmntcache(mp);
901  
902  		/*
903  		 * Break out if we are good.  Don't count ncp refs if the
904  		 * mount is aliased.
905  		 */
906  		ncp = (mp->mnt_kern_flag & MNTK_NCALIASED) ?
907  		      NULL : mp->mnt_ncmountpt.ncp;
908  		if (mp->mnt_refs == 1 &&
909  		    (ncp == NULL || (ncp->nc_refs == 1 &&
910  				     TAILQ_FIRST(&ncp->nc_list) == NULL))) {
911  			break;
912  		}
913  
914  		/*
915  		 * If forcing the unmount, clean out any p->p_textnch
916  		 * nchandles that match this mount.
917  		 *
918  		 * In addition any process which has a current, root, or
919  		 * jail directory matching the mount, or which has an open
920  		 * descriptor matching the mount, will be killed.  We first
921  		 * try SIGINT, and if that doesn't work we issue SIGKILL.
922  		 */
923  		if (flags & MNT_FORCE) {
924  			struct unmount_allproc_info info;
925  
926  			info.mp = mp;
927  			switch(retry) {
928  			case 3:
929  				info.sig = SIGINT;
930  				break;
931  			case 7:
932  				info.sig = SIGKILL;
933  				break;
934  			default:
935  				info.sig = 0;
936  				break;
937  			}
938  			allproc_scan(&unmount_allproc_cb, &info, 0);
939  		}
940  
941  		/*
942  		 * Sleep and retry.
943  		 */
944  		error = lockmgr(&mp->mnt_lock, LK_RELEASE);
945  		tsleep(&dummy, 0, "mntbsy", hz / 4 + 1);
946  		error = lockmgr(&mp->mnt_lock, LK_EXCLUSIVE);
947  		if (debug_unmount && (retry & 15) == 15) {
948  			mount_warning(mp,
949  				      "(%p) debug - retry %d, "
950  				      "%d namecache refs, %d mount refs",
951  				      mp, retry,
952  				      (ncp ? ncp->nc_refs - 1 : 0),
953  				      mp->mnt_refs - 1);
954  		}
955  	}
956  	if (retry == UMOUNTF_RETRIES) {
957  		mount_warning(mp,
958  			      "forced umount of \"%s\" - "
959  			      "%d namecache refs, %d mount refs",
960  			      (mp->mnt_ncmountpt.ncp ?
961  				mp->mnt_ncmountpt.ncp->nc_name : "?"),
962  			      (ncp ? ncp->nc_refs - 1 : 0),
963  			      mp->mnt_refs - 1);
964  	}
965  
966  	error = 0;
967  	ncp = (mp->mnt_kern_flag & MNTK_NCALIASED) ?
968  	      NULL : mp->mnt_ncmountpt.ncp;
969  	if (mp->mnt_refs != 1 ||
970  	    (ncp != NULL && (ncp->nc_refs != 1 ||
971  			     TAILQ_FIRST(&ncp->nc_list)))) {
972  		mount_warning(mp,
973  			      "(%p): %d namecache refs, %d mount refs "
974  			      "still present",
975  			      mp,
976  			      (ncp ? ncp->nc_refs - 1 : 0),
977  			      mp->mnt_refs - 1);
978  		if (flags & MNT_FORCE) {
979  			freeok = 0;
980  			mount_warning(mp, "forcing unmount\n");
981  		} else {
982  			error = EBUSY;
983  		}
984  	}
985  
986  	/*
987  	 * So far so good, sync the filesystem once more and
988  	 * call the VFS unmount code if the sync succeeds.
989  	 */
990  	if (error == 0 && quickhalt == 0) {
991  		if (mp->mnt_flag & MNT_RDONLY) {
992  			error = VFS_UNMOUNT(mp, flags);
993  		} else {
994  			error = VFS_SYNC(mp, MNT_WAIT);
995  			if (error == 0 ||		/* no error */
996  			    error == EOPNOTSUPP ||	/* no sync avail */
997  			    (flags & MNT_FORCE)) {	/* force anyway */
998  				error = VFS_UNMOUNT(mp, flags);
999  			}
1000  		}
1001  		if (error) {
1002  			mount_warning(mp,
1003  				      "(%p) unmount: vfs refused to unmount, "
1004  				      "error %d",
1005  				      mp, error);
1006  		}
1007  	}
1008  
1009  	/*
1010  	 * If an error occurred we can still recover, restoring the
1011  	 * syncer vnode and misc flags.
1012  	 */
1013  	if (error) {
1014  		if (mp->mnt_syncer == NULL && hadsyncer)
1015  			vfs_allocate_syncvnode(mp);
1016  		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
1017  		mp->mnt_flag |= async_flag;
1018  		lockmgr(&mp->mnt_lock, LK_RELEASE);
1019  		if (mp->mnt_kern_flag & MNTK_MWAIT) {
1020  			mp->mnt_kern_flag &= ~MNTK_MWAIT;
1021  			wakeup(mp);
1022  		}
1023  		goto out;
1024  	}
1025  	/*
1026  	 * Clean up any journals still associated with the mount after
1027  	 * filesystem activity has ceased.
1028  	 */
1029  	journal_remove_all_journals(mp,
1030  	    ((flags & MNT_FORCE) ? MC_JOURNAL_STOP_IMM : 0));
1031  
1032  	mountlist_remove(mp);
1033  
1034  	/*
1035  	 * Remove any installed vnode ops here so the individual VFSs don't
1036  	 * have to.
1037  	 *
1038  	 * mnt_refs should go to zero when we scrap mnt_ncmountpt.
1039  	 *
1040  	 * When quickhalting we have to keep these intact because the
1041  	 * underlying vnodes have not been destroyed, and some might be
1042  	 * dirty.
1043  	 */
1044  	if (quickhalt == 0) {
1045  		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
1046  		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
1047  		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
1048  		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
1049  		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
1050  	}
1051  
1052  	if (mp->mnt_ncmountpt.ncp != NULL) {
1053  		nch = mp->mnt_ncmountpt;
1054  		cache_zero(&mp->mnt_ncmountpt);
1055  		cache_clrmountpt(&nch);
1056  		cache_drop(&nch);
1057  	}
1058  	if (mp->mnt_ncmounton.ncp != NULL) {
1059  		cache_unmounting(mp);
1060  		nch = mp->mnt_ncmounton;
1061  		cache_zero(&mp->mnt_ncmounton);
1062  		cache_clrmountpt(&nch);
1063  		cache_drop(&nch);
1064  	}
1065  
1066  	if (mp->mnt_cred) {
1067  		crfree(mp->mnt_cred);
1068  		mp->mnt_cred = NULL;
1069  	}
1070  
1071  	mp->mnt_vfc->vfc_refcount--;
1072  
1073  	/*
1074  	 * If not quickhalting the mount, we expect there to be no
1075  	 * vnodes left.
1076  	 */
1077  	if (quickhalt == 0 && !TAILQ_EMPTY(&mp->mnt_nvnodelist))
1078  		panic("unmount: dangling vnode");
1079  
1080  	/*
1081  	 * Release the lock
1082  	 */
1083  	lockmgr(&mp->mnt_lock, LK_RELEASE);
1084  	if (mp->mnt_kern_flag & MNTK_MWAIT) {
1085  		mp->mnt_kern_flag &= ~MNTK_MWAIT;
1086  		wakeup(mp);
1087  	}
1088  
1089  	/*
1090  	 * If we reach here and freeok != 0 we must free the mount.
1091  	 * mnt_refs should already have dropped to 0, so if it is not
1092  	 * zero we must cycle the caches and wait.
1093  	 *
1094  	 * When we are satisfied that the mount has disconnected we can
1095  	 * drop the hold on the mp that represented the mount (though the
1096  	 * caller might actually have another, so the caller's drop may
1097  	 * do the actual free).
1098  	 */
1099  	if (freeok) {
1100  		if (mp->mnt_refs > 0)
1101  			cache_clearmntcache(mp);
1102  		while (mp->mnt_refs > 0) {
1103  			cache_unmounting(mp);
1104  			wakeup(mp);
1105  			tsleep(&mp->mnt_refs, 0, "umntrwait", hz / 10 + 1);
1106  			cache_clearmntcache(mp);
1107  		}
1108  		lwkt_reltoken(&mp->mnt_token);
1109  		mount_drop(mp);
1110  		mp = NULL;
1111  	} else {
1112  		cache_clearmntcache(mp);
1113  	}
1114  	error = 0;
1115  	KNOTE(&fs_klist, VQ_UNMOUNT);
1116  out:
1117  	if (mp)
1118  		lwkt_reltoken(&mp->mnt_token);
1119  	return (error);
1120  }
1121  
1122  static
1123  void
1124  mount_warning(struct mount *mp, const char *ctl, ...)
1125  {
1126  	char *ptr;
1127  	char *buf;
1128  	__va_list va;
1129  
1130  	__va_start(va, ctl);
1131  	if (cache_fullpath(NULL, &mp->mnt_ncmounton, NULL,
1132  			   &ptr, &buf, 0) == 0) {
1133  		kprintf("unmount(%s): ", ptr);
1134  		kvprintf(ctl, va);
1135  		kprintf("\n");
1136  		kfree(buf, M_TEMP);
1137  	} else {
1138  		kprintf("unmount(%p", mp);
1139  		if (mp->mnt_ncmounton.ncp && mp->mnt_ncmounton.ncp->nc_name)
1140  			kprintf(",%s", mp->mnt_ncmounton.ncp->nc_name);
1141  		kprintf("): ");
1142  		kvprintf(ctl, va);
1143  		kprintf("\n");
1144  	}
1145  	__va_end(va);
1146  }
1147  
1148  /*
1149   * Shim cache_fullpath() to handle the case where a process is chrooted into
1150   * a subdirectory of a mount.  In this case if the root mount matches the
1151   * process root directory's mount we have to specify the process's root
1152   * directory instead of the mount point, because the mount point might
1153   * be above the root directory.
1154   */
1155  static
1156  int
1157  mount_path(struct proc *p, struct mount *mp, char **rb, char **fb)
1158  {
1159  	struct nchandle *nch;
1160  
1161  	if (p && p->p_fd->fd_nrdir.mount == mp)
1162  		nch = &p->p_fd->fd_nrdir;
1163  	else
1164  		nch = &mp->mnt_ncmountpt;
1165  	return(cache_fullpath(p, nch, NULL, rb, fb, 0));
1166  }
1167  
1168  /*
1169   * Sync each mounted filesystem.
1170   */
1171  
#ifdef DEBUG
/*
 * Debug-only integer registered read/write under the _debug sysctl node.
 * It is not referenced by the code visible around sys_sync().
 */
static int syncprt = 0;
SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
#endif /* DEBUG */

/* Per-mount worker that sys_sync() hands to mountlist_scan() */
static int sync_callback(struct mount *mp, void *data);
1178  
1179  int
1180  sys_sync(struct sysmsg *sysmsg, const struct sync_args *uap)
1181  {
1182  	mountlist_scan(sync_callback, NULL, MNTSCAN_FORWARD);
1183  	return (0);
1184  }
1185  
1186  static
1187  int
1188  sync_callback(struct mount *mp, void *data __unused)
1189  {
1190  	int asyncflag;
1191  
1192  	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1193  		lwkt_gettoken(&mp->mnt_token);
1194  		asyncflag = mp->mnt_flag & MNT_ASYNC;
1195  		mp->mnt_flag &= ~MNT_ASYNC;
1196  		lwkt_reltoken(&mp->mnt_token);
1197  		vfs_msync(mp, MNT_NOWAIT);
1198  		VFS_SYNC(mp, MNT_NOWAIT);
1199  		lwkt_gettoken(&mp->mnt_token);
1200  		mp->mnt_flag |= asyncflag;
1201  		lwkt_reltoken(&mp->mnt_token);
1202  	}
1203  	return(0);
1204  }
1205  
/*
 * XXX PRISON: could be per prison flag
 *
 * While this is zero, sys_quotactl() returns EPERM to any caller whose
 * credential has cr_prison set.  The sysctl that would expose the value
 * is compiled out below.
 */
static int prison_quotas;
#if 0
SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
#endif
1211  
1212  /*
 * quotactl_args(char *path, int cmd, int uid, caddr_t arg)
1214   *
1215   * Change filesystem quotas.
1216   *
1217   * MPALMOSTSAFE
1218   */
1219  int
1220  sys_quotactl(struct sysmsg *sysmsg, const struct quotactl_args *uap)
1221  {
1222  	struct nlookupdata nd;
1223  	struct thread *td;
1224  	struct mount *mp;
1225  	int error;
1226  
1227  	td = curthread;
1228  	if (td->td_ucred->cr_prison && !prison_quotas) {
1229  		error = EPERM;
1230  		goto done;
1231  	}
1232  
1233  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1234  	if (error == 0)
1235  		error = nlookup(&nd);
1236  	if (error == 0) {
1237  		mp = nd.nl_nch.mount;
1238  		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid,
1239  				    uap->arg, nd.nl_cred);
1240  	}
1241  	nlookup_done(&nd);
1242  done:
1243  	return (error);
1244  }
1245  
1246  /*
1247   * mountctl(char *path, int op, int fd, const void *ctl, int ctllen,
1248   *		void *buf, int buflen)
1249   *
1250   * This function operates on a mount point and executes the specified
1251   * operation using the specified control data, and possibly returns data.
1252   *
1253   * The actual number of bytes stored in the result buffer is returned, 0
1254   * if none, otherwise an error is returned.
1255   *
1256   * MPALMOSTSAFE
1257   */
1258  int
1259  sys_mountctl(struct sysmsg *sysmsg, const struct mountctl_args *uap)
1260  {
1261  	struct thread *td = curthread;
1262  	struct file *fp;
1263  	void *ctl = NULL;
1264  	void *buf = NULL;
1265  	char *path = NULL;
1266  	int error;
1267  
1268  	/*
1269  	 * Sanity and permissions checks.  We must be root.
1270  	 */
1271  	if (td->td_ucred->cr_prison != NULL)
1272  		return (EPERM);
1273  	if ((uap->op != MOUNTCTL_MOUNTFLAGS) &&
1274  	    (error = caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT)) != 0)
1275  	{
1276  		return (error);
1277  	}
1278  
1279  	/*
1280  	 * Argument length checks
1281  	 */
1282  	if (uap->ctllen < 0 || uap->ctllen > 1024)
1283  		return (EINVAL);
1284  	if (uap->buflen < 0 || uap->buflen > 16 * 1024)
1285  		return (EINVAL);
1286  	if (uap->path == NULL)
1287  		return (EINVAL);
1288  
1289  	/*
1290  	 * Allocate the necessary buffers and copyin data
1291  	 */
1292  	path = objcache_get(namei_oc, M_WAITOK);
1293  	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
1294  	if (error)
1295  		goto done;
1296  
1297  	if (uap->ctllen) {
1298  		ctl = kmalloc(uap->ctllen + 1, M_TEMP, M_WAITOK|M_ZERO);
1299  		error = copyin(uap->ctl, ctl, uap->ctllen);
1300  		if (error)
1301  			goto done;
1302  	}
1303  	if (uap->buflen)
1304  		buf = kmalloc(uap->buflen + 1, M_TEMP, M_WAITOK|M_ZERO);
1305  
1306  	/*
1307  	 * Validate the descriptor
1308  	 */
1309  	if (uap->fd >= 0) {
1310  		fp = holdfp(td, uap->fd, -1);
1311  		if (fp == NULL) {
1312  			error = EBADF;
1313  			goto done;
1314  		}
1315  	} else {
1316  		fp = NULL;
1317  	}
1318  
1319  	/*
1320  	 * Execute the internal kernel function and clean up.
1321  	 */
1322  	error = kern_mountctl(path, uap->op, fp, ctl, uap->ctllen,
1323  			      buf, uap->buflen, &sysmsg->sysmsg_result);
1324  	if (fp)
1325  		dropfp(td, uap->fd, fp);
1326  	if (error == 0 && sysmsg->sysmsg_result > 0)
1327  		error = copyout(buf, uap->buf, sysmsg->sysmsg_result);
1328  done:
1329  	if (path)
1330  		objcache_put(namei_oc, path);
1331  	if (ctl)
1332  		kfree(ctl, M_TEMP);
1333  	if (buf)
1334  		kfree(buf, M_TEMP);
1335  	return (error);
1336  }
1337  
1338  /*
1339   * Execute a mount control operation by resolving the path to a mount point
1340   * and calling vop_mountctl().
1341   *
1342   * Use the mount point from the nch instead of the vnode so nullfs mounts
1343   * can properly spike the VOP.
1344   */
1345  int
1346  kern_mountctl(const char *path, int op, struct file *fp,
1347  		const void *ctl, int ctllen,
1348  		void *buf, int buflen, int *res)
1349  {
1350  	struct vnode *vp;
1351  	struct nlookupdata nd;
1352  	struct nchandle nch;
1353  	struct mount *mp;
1354  	int error;
1355  
1356  	*res = 0;
1357  	vp = NULL;
1358  	error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_FOLLOW);
1359  	if (error)
1360  		return (error);
1361  	error = nlookup(&nd);
1362  	if (error) {
1363  		nlookup_done(&nd);
1364  		return (error);
1365  	}
1366  	error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
1367  	if (error) {
1368  		nlookup_done(&nd);
1369  		return (error);
1370  	}
1371  
1372  	/*
1373  	 * Yes, all this is needed to use the nch.mount below, because
1374  	 * we must maintain a ref on the mount to avoid ripouts (e.g.
1375  	 * due to heavy mount/unmount use by synth or poudriere).
1376  	 */
1377  	nch = nd.nl_nch;
1378  	cache_zero(&nd.nl_nch);
1379  	cache_unlock(&nch);
1380  	nlookup_done(&nd);
1381  	vn_unlock(vp);
1382  
1383  	mp = nch.mount;
1384  
1385  	/*
1386  	 * Must be the root of the filesystem
1387  	 */
1388  	if ((vp->v_flag & (VROOT|VPFSROOT)) == 0) {
1389  		cache_drop(&nch);
1390  		vrele(vp);
1391  		return (EINVAL);
1392  	}
1393  	if (mp == NULL || mp->mnt_kern_flag & MNTK_UNMOUNT) {
1394  		kprintf("kern_mountctl: Warning, \"%s\" racing unmount\n",
1395  			path);
1396  		cache_drop(&nch);
1397  		vrele(vp);
1398  		return (EINVAL);
1399  	}
1400  	error = vop_mountctl(mp->mnt_vn_use_ops, vp, op, fp, ctl, ctllen,
1401  			     buf, buflen, res);
1402  	vrele(vp);
1403  	cache_drop(&nch);
1404  
1405  	return (error);
1406  }
1407  
1408  int
1409  kern_statfs(struct nlookupdata *nd, struct statfs *buf)
1410  {
1411  	struct thread *td = curthread;
1412  	struct proc *p = td->td_proc;
1413  	struct mount *mp;
1414  	struct statfs *sp;
1415  	char *fullpath, *freepath;
1416  	int error;
1417  
1418  	if ((error = nlookup(nd)) != 0)
1419  		return (error);
1420  	mp = nd->nl_nch.mount;
1421  	sp = &mp->mnt_stat;
1422  
1423  	/*
1424  	 * Ignore refresh error, user should have visibility.
1425  	 * This can happen if a NFS mount goes bad (e.g. server
1426  	 * revokes perms or goes down).
1427  	 */
1428  	error = VFS_STATFS(mp, sp, nd->nl_cred);
1429  	/* ignore error */
1430  
1431  	error = mount_path(p, mp, &fullpath, &freepath);
1432  	if (error)
1433  		return(error);
1434  	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1435  	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1436  	kfree(freepath, M_TEMP);
1437  
1438  	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1439  	bcopy(sp, buf, sizeof(*buf));
1440  	/* Only root should have access to the fsid's. */
1441  	if (caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT))
1442  		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
1443  	return (0);
1444  }
1445  
1446  /*
1447   * statfs_args(char *path, struct statfs *buf)
1448   *
1449   * Get filesystem statistics.
1450   */
1451  int
1452  sys_statfs(struct sysmsg *sysmsg, const struct statfs_args *uap)
1453  {
1454  	struct nlookupdata nd;
1455  	struct statfs buf;
1456  	int error;
1457  
1458  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1459  	if (error == 0)
1460  		error = kern_statfs(&nd, &buf);
1461  	nlookup_done(&nd);
1462  	if (error == 0)
1463  		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1464  	return (error);
1465  }
1466  
1467  int
1468  kern_fstatfs(int fd, struct statfs *buf)
1469  {
1470  	struct thread *td = curthread;
1471  	struct proc *p = td->td_proc;
1472  	struct file *fp;
1473  	struct mount *mp;
1474  	struct statfs *sp;
1475  	char *fullpath, *freepath;
1476  	int error;
1477  
1478  	KKASSERT(p);
1479  	if ((error = holdvnode(td, fd, &fp)) != 0)
1480  		return (error);
1481  
1482  	/*
1483  	 * Try to use mount info from any overlays rather than the
1484  	 * mount info for the underlying vnode, otherwise we will
1485  	 * fail when operating on null-mounted paths inside a chroot.
1486  	 */
1487  	if ((mp = fp->f_nchandle.mount) == NULL)
1488  		mp = ((struct vnode *)fp->f_data)->v_mount;
1489  	if (mp == NULL) {
1490  		error = EBADF;
1491  		goto done;
1492  	}
1493  	if (fp->f_cred == NULL) {
1494  		error = EINVAL;
1495  		goto done;
1496  	}
1497  
1498  	/*
1499  	 * Ignore refresh error, user should have visibility.
1500  	 * This can happen if a NFS mount goes bad (e.g. server
1501  	 * revokes perms or goes down).
1502  	 */
1503  	sp = &mp->mnt_stat;
1504  	error = VFS_STATFS(mp, sp, fp->f_cred);
1505  
1506  	if ((error = mount_path(p, mp, &fullpath, &freepath)) != 0)
1507  		goto done;
1508  	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1509  	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1510  	kfree(freepath, M_TEMP);
1511  
1512  	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1513  	bcopy(sp, buf, sizeof(*buf));
1514  
1515  	/* Only root should have access to the fsid's. */
1516  	if (caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT))
1517  		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
1518  	error = 0;
1519  done:
1520  	fdrop(fp);
1521  	return (error);
1522  }
1523  
1524  /*
1525   * fstatfs_args(int fd, struct statfs *buf)
1526   *
1527   * Get filesystem statistics.
1528   */
1529  int
1530  sys_fstatfs(struct sysmsg *sysmsg, const struct fstatfs_args *uap)
1531  {
1532  	struct statfs buf;
1533  	int error;
1534  
1535  	error = kern_fstatfs(uap->fd, &buf);
1536  
1537  	if (error == 0)
1538  		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1539  	return (error);
1540  }
1541  
1542  int
1543  kern_statvfs(struct nlookupdata *nd, struct statvfs *buf)
1544  {
1545  	struct mount *mp;
1546  	struct statvfs *sp;
1547  	int error;
1548  
1549  	if ((error = nlookup(nd)) != 0)
1550  		return (error);
1551  	mp = nd->nl_nch.mount;
1552  	sp = &mp->mnt_vstat;
1553  	if ((error = VFS_STATVFS(mp, sp, nd->nl_cred)) != 0)
1554  		return (error);
1555  
1556  	sp->f_flag = 0;
1557  	if (mp->mnt_flag & MNT_RDONLY)
1558  		sp->f_flag |= ST_RDONLY;
1559  	if (mp->mnt_flag & MNT_NOSUID)
1560  		sp->f_flag |= ST_NOSUID;
1561  	bcopy(sp, buf, sizeof(*buf));
1562  	return (0);
1563  }
1564  
1565  /*
 * statvfs_args(char *path, struct statvfs *buf)
1567   *
1568   * Get filesystem statistics.
1569   */
1570  int
1571  sys_statvfs(struct sysmsg *sysmsg, const struct statvfs_args *uap)
1572  {
1573  	struct nlookupdata nd;
1574  	struct statvfs buf;
1575  	int error;
1576  
1577  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1578  	if (error == 0)
1579  		error = kern_statvfs(&nd, &buf);
1580  	nlookup_done(&nd);
1581  	if (error == 0)
1582  		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1583  	return (error);
1584  }
1585  
1586  int
1587  kern_fstatvfs(int fd, struct statvfs *buf)
1588  {
1589  	struct thread *td = curthread;
1590  	struct file *fp;
1591  	struct mount *mp;
1592  	struct statvfs *sp;
1593  	int error;
1594  
1595  	if ((error = holdvnode(td, fd, &fp)) != 0)
1596  		return (error);
1597  	if ((mp = fp->f_nchandle.mount) == NULL)
1598  		mp = ((struct vnode *)fp->f_data)->v_mount;
1599  	if (mp == NULL) {
1600  		error = EBADF;
1601  		goto done;
1602  	}
1603  	if (fp->f_cred == NULL) {
1604  		error = EINVAL;
1605  		goto done;
1606  	}
1607  	sp = &mp->mnt_vstat;
1608  	if ((error = VFS_STATVFS(mp, sp, fp->f_cred)) != 0)
1609  		goto done;
1610  
1611  	sp->f_flag = 0;
1612  	if (mp->mnt_flag & MNT_RDONLY)
1613  		sp->f_flag |= ST_RDONLY;
1614  	if (mp->mnt_flag & MNT_NOSUID)
1615  		sp->f_flag |= ST_NOSUID;
1616  
1617  	bcopy(sp, buf, sizeof(*buf));
1618  	error = 0;
1619  done:
1620  	fdrop(fp);
1621  	return (error);
1622  }
1623  
1624  /*
 * fstatvfs_args(int fd, struct statvfs *buf)
1626   *
1627   * Get filesystem statistics.
1628   */
1629  int
1630  sys_fstatvfs(struct sysmsg *sysmsg, const struct fstatvfs_args *uap)
1631  {
1632  	struct statvfs buf;
1633  	int error;
1634  
1635  	error = kern_fstatvfs(uap->fd, &buf);
1636  
1637  	if (error == 0)
1638  		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1639  	return (error);
1640  }
1641  
1642  /*
1643   * getfsstat_args(struct statfs *buf, long bufsize, int flags)
1644   *
1645   * Get statistics on all filesystems.
1646   */
1647  
/* State shared between sys_getfsstat() and getfsstat_callback() */
struct getfsstat_info {
	struct statfs *sfsp;	/* next user slot to fill (from uap->buf) */
	long count;		/* visible mounts counted so far */
	long maxcount;		/* number of slots in the user buffer */
	int error;		/* error recorded by the callback, else 0 */
	int flags;		/* uap->flags (MNT_WAIT/MNT_NOWAIT/MNT_LAZY) */
	struct thread *td;	/* calling thread */
};

/* Per-mount worker handed to mountlist_scan() by sys_getfsstat() */
static int getfsstat_callback(struct mount *, void *);
1658  
1659  int
1660  sys_getfsstat(struct sysmsg *sysmsg, const struct getfsstat_args *uap)
1661  {
1662  	struct thread *td = curthread;
1663  	struct getfsstat_info info;
1664  
1665  	bzero(&info, sizeof(info));
1666  
1667  	info.maxcount = uap->bufsize / sizeof(struct statfs);
1668  	info.sfsp = uap->buf;
1669  	info.count = 0;
1670  	info.flags = uap->flags;
1671  	info.td = td;
1672  
1673  	mountlist_scan(getfsstat_callback, &info, MNTSCAN_FORWARD);
1674  	if (info.sfsp && info.count > info.maxcount)
1675  		sysmsg->sysmsg_result = info.maxcount;
1676  	else
1677  		sysmsg->sysmsg_result = info.count;
1678  	return (info.error);
1679  }
1680  
1681  static int
1682  getfsstat_callback(struct mount *mp, void *data)
1683  {
1684  	struct getfsstat_info *info = data;
1685  	struct statfs *sp;
1686  	char *freepath;
1687  	char *fullpath;
1688  	int error;
1689  
1690  	if (info->td->td_proc && !chroot_visible_mnt(mp, info->td->td_proc))
1691  		return(0);
1692  
1693  	if (info->sfsp && info->count < info->maxcount) {
1694  		sp = &mp->mnt_stat;
1695  
1696  		/*
1697  		 * If MNT_NOWAIT or MNT_LAZY is specified, do not
1698  		 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
1699  		 * overrides MNT_WAIT.
1700  		 *
1701  		 * Ignore refresh error, user should have visibility.
1702  		 * This can happen if a NFS mount goes bad (e.g. server
1703  		 * revokes perms or goes down).
1704  		 */
1705  		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
1706  		    (info->flags & MNT_WAIT)) &&
1707  		    (error = VFS_STATFS(mp, sp, info->td->td_ucred))) {
1708  			/* ignore error */
1709  		}
1710  		sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1711  
1712  		error = mount_path(info->td->td_proc, mp, &fullpath, &freepath);
1713  		if (error) {
1714  			info->error = error;
1715  			return(-1);
1716  		}
1717  		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1718  		strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1719  		kfree(freepath, M_TEMP);
1720  
1721  		error = copyout(sp, info->sfsp, sizeof(*sp));
1722  		if (error) {
1723  			info->error = error;
1724  			return (-1);
1725  		}
1726  		++info->sfsp;
1727  	}
1728  	info->count++;
1729  	return(0);
1730  }
1731  
1732  /*
 * getvfsstat_args(struct statfs *buf, struct statvfs *vbuf,
 *		   long vbufsize, int flags)
1735   *
1736   * Get statistics on all filesystems.
1737   */
1738  
/* State shared between sys_getvfsstat() and getvfsstat_callback() */
struct getvfsstat_info {
	struct statfs *sfsp;	/* next user statfs slot (from uap->buf) */
	struct statvfs *vsfsp;	/* next user statvfs slot (from uap->vbuf) */
	long count;		/* visible mounts counted so far */
	long maxcount;		/* slots, computed from uap->vbufsize */
	int error;		/* error recorded by the callback, else 0 */
	int flags;		/* uap->flags (MNT_WAIT/MNT_NOWAIT/MNT_LAZY) */
	struct thread *td;	/* calling thread */
};

/* Per-mount worker handed to mountlist_scan() by sys_getvfsstat() */
static int getvfsstat_callback(struct mount *, void *);
1750  
1751  int
1752  sys_getvfsstat(struct sysmsg *sysmsg, const struct getvfsstat_args *uap)
1753  {
1754  	struct thread *td = curthread;
1755  	struct getvfsstat_info info;
1756  
1757  	bzero(&info, sizeof(info));
1758  
1759  	info.maxcount = uap->vbufsize / sizeof(struct statvfs);
1760  	info.sfsp = uap->buf;
1761  	info.vsfsp = uap->vbuf;
1762  	info.count = 0;
1763  	info.flags = uap->flags;
1764  	info.td = td;
1765  
1766  	mountlist_scan(getvfsstat_callback, &info, MNTSCAN_FORWARD);
1767  	if (info.vsfsp && info.count > info.maxcount)
1768  		sysmsg->sysmsg_result = info.maxcount;
1769  	else
1770  		sysmsg->sysmsg_result = info.count;
1771  	return (info.error);
1772  }
1773  
1774  static int
1775  getvfsstat_callback(struct mount *mp, void *data)
1776  {
1777  	struct getvfsstat_info *info = data;
1778  	struct statfs *sp;
1779  	struct statvfs *vsp;
1780  	char *freepath;
1781  	char *fullpath;
1782  	int error;
1783  
1784  	if (info->td->td_proc && !chroot_visible_mnt(mp, info->td->td_proc))
1785  		return(0);
1786  
1787  	if (info->vsfsp && info->count < info->maxcount) {
1788  		sp = &mp->mnt_stat;
1789  		vsp = &mp->mnt_vstat;
1790  
1791  		/*
1792  		 * If MNT_NOWAIT or MNT_LAZY is specified, do not
1793  		 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
1794  		 * overrides MNT_WAIT.
1795  		 *
1796  		 * Ignore refresh error, user should have visibility.
1797  		 * This can happen if a NFS mount goes bad (e.g. server
1798  		 * revokes perms or goes down).
1799  		 */
1800  		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
1801  		    (info->flags & MNT_WAIT)) &&
1802  		    (error = VFS_STATFS(mp, sp, info->td->td_ucred))) {
1803  			/* ignore error */
1804  		}
1805  		sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1806  
1807  		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
1808  		    (info->flags & MNT_WAIT)) &&
1809  		    (error = VFS_STATVFS(mp, vsp, info->td->td_ucred))) {
1810  			/* ignore error */
1811  		}
1812  		vsp->f_flag = 0;
1813  		if (mp->mnt_flag & MNT_RDONLY)
1814  			vsp->f_flag |= ST_RDONLY;
1815  		if (mp->mnt_flag & MNT_NOSUID)
1816  			vsp->f_flag |= ST_NOSUID;
1817  
1818  		error = mount_path(info->td->td_proc, mp, &fullpath, &freepath);
1819  		if (error) {
1820  			info->error = error;
1821  			return(-1);
1822  		}
1823  		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1824  		strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1825  		kfree(freepath, M_TEMP);
1826  
1827  		error = copyout(sp, info->sfsp, sizeof(*sp));
1828  		if (error == 0)
1829  			error = copyout(vsp, info->vsfsp, sizeof(*vsp));
1830  		if (error) {
1831  			info->error = error;
1832  			return (-1);
1833  		}
1834  		++info->sfsp;
1835  		++info->vsfsp;
1836  	}
1837  	info->count++;
1838  	return(0);
1839  }
1840  
1841  
1842  /*
1843   * fchdir_args(int fd)
1844   *
1845   * Change current working directory to a given file descriptor.
1846   */
1847  int
1848  sys_fchdir(struct sysmsg *sysmsg, const struct fchdir_args *uap)
1849  {
1850  	struct thread *td = curthread;
1851  	struct proc *p = td->td_proc;
1852  	struct filedesc *fdp = p->p_fd;
1853  	struct vnode *vp, *ovp;
1854  	struct mount *mp;
1855  	struct file *fp;
1856  	struct nchandle nch, onch, tnch;
1857  	int error;
1858  
1859  	if ((error = holdvnode(td, uap->fd, &fp)) != 0)
1860  		return (error);
1861  	lwkt_gettoken(&p->p_token);
1862  	vp = (struct vnode *)fp->f_data;
1863  	vref(vp);
1864  	vn_lock(vp, LK_SHARED | LK_RETRY);
1865  	if (fp->f_nchandle.ncp == NULL)
1866  		error = ENOTDIR;
1867  	else
1868  		error = checkvp_chdir(vp, td);
1869  	if (error) {
1870  		vput(vp);
1871  		goto done;
1872  	}
1873  	cache_copy(&fp->f_nchandle, &nch);
1874  
1875  	/*
1876  	 * If the ncp has become a mount point, traverse through
1877  	 * the mount point.
1878  	 */
1879  
1880  	while (!error && (nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
1881  	       (mp = cache_findmount(&nch)) != NULL
1882  	) {
1883  		error = nlookup_mp(mp, &tnch);
1884  		if (error == 0) {
1885  			cache_unlock(&tnch);	/* leave ref intact */
1886  			vput(vp);
1887  			vp = tnch.ncp->nc_vp;
1888  			error = vget(vp, LK_SHARED);
1889  			KKASSERT(error == 0);
1890  			cache_drop(&nch);
1891  			nch = tnch;
1892  		}
1893  		cache_dropmount(mp);
1894  	}
1895  	if (error == 0) {
1896  		spin_lock(&fdp->fd_spin);
1897  		ovp = fdp->fd_cdir;
1898  		onch = fdp->fd_ncdir;
1899  		fdp->fd_cdir = vp;
1900  		fdp->fd_ncdir = nch;
1901  		spin_unlock(&fdp->fd_spin);
1902  		vn_unlock(vp);		/* leave ref intact */
1903  		cache_drop(&onch);
1904  		vrele(ovp);
1905  	} else {
1906  		cache_drop(&nch);
1907  		vput(vp);
1908  	}
1909  	fdrop(fp);
1910  done:
1911  	lwkt_reltoken(&p->p_token);
1912  	return (error);
1913  }
1914  
1915  int
1916  kern_chdir(struct nlookupdata *nd)
1917  {
1918  	struct thread *td = curthread;
1919  	struct proc *p = td->td_proc;
1920  	struct filedesc *fdp = p->p_fd;
1921  	struct vnode *vp, *ovp;
1922  	struct nchandle onch;
1923  	int error;
1924  
1925  	nd->nl_flags |= NLC_SHAREDLOCK;
1926  	if ((error = nlookup(nd)) != 0)
1927  		return (error);
1928  	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
1929  		return (ENOENT);
1930  	if ((error = vget(vp, LK_SHARED)) != 0)
1931  		return (error);
1932  
1933  	lwkt_gettoken(&p->p_token);
1934  	error = checkvp_chdir(vp, td);
1935  	vn_unlock(vp);
1936  	if (error == 0) {
1937  		spin_lock(&fdp->fd_spin);
1938  		ovp = fdp->fd_cdir;
1939  		onch = fdp->fd_ncdir;
1940  		fdp->fd_ncdir = nd->nl_nch;
1941  		fdp->fd_cdir = vp;
1942  		spin_unlock(&fdp->fd_spin);
1943  		cache_unlock(&nd->nl_nch);	/* leave reference intact */
1944  		cache_drop(&onch);
1945  		vrele(ovp);
1946  		cache_zero(&nd->nl_nch);
1947  	} else {
1948  		vrele(vp);
1949  	}
1950  	lwkt_reltoken(&p->p_token);
1951  	return (error);
1952  }
1953  
1954  /*
1955   * chdir_args(char *path)
1956   *
1957   * Change current working directory (``.'').
1958   */
1959  int
1960  sys_chdir(struct sysmsg *sysmsg, const struct chdir_args *uap)
1961  {
1962  	struct nlookupdata nd;
1963  	int error;
1964  
1965  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1966  	if (error == 0)
1967  		error = kern_chdir(&nd);
1968  	nlookup_done(&nd);
1969  	return (error);
1970  }
1971  
1972  /*
1973   * Helper function for raised chroot(2) security function:  Refuse if
1974   * any filedescriptors are open directories.
1975   */
1976  static int
1977  chroot_refuse_vdir_fds(thread_t td, struct filedesc *fdp)
1978  {
1979  	struct vnode *vp;
1980  	struct file *fp;
1981  	int error;
1982  	int fd;
1983  
1984  	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
1985  		if ((error = holdvnode(td, fd, &fp)) != 0)
1986  			continue;
1987  		vp = (struct vnode *)fp->f_data;
1988  		if (vp->v_type != VDIR) {
1989  			fdrop(fp);
1990  			continue;
1991  		}
1992  		fdrop(fp);
1993  		return(EPERM);
1994  	}
1995  	return (0);
1996  }
1997  
1998  /*
1999   * This sysctl determines if we will allow a process to chroot(2) if it
2000   * has a directory open:
2001   *	0: disallowed for all processes.
2002   *	1: allowed for processes that were not already chroot(2)'ed.
2003   *	2: allowed for all processes.
2004   */
2005  
/* Policy value consulted by kern_chroot(); initialized to 1 */
static int chroot_allow_open_directories = 1;

/* Registered read/write under the _kern sysctl node */
SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
     &chroot_allow_open_directories, 0, "");
2010  
2011  /*
2012   * chroot to the specified namecache entry.  We obtain the vp from the
2013   * namecache data.  The passed ncp must be locked and referenced and will
2014   * remain locked and referenced on return.
2015   */
2016  int
2017  kern_chroot(struct nchandle *nch)
2018  {
2019  	struct thread *td = curthread;
2020  	struct proc *p = td->td_proc;
2021  	struct filedesc *fdp = p->p_fd;
2022  	struct vnode *vp;
2023  	int error;
2024  
2025  	/*
2026  	 * Only privileged user can chroot
2027  	 */
2028  	error = caps_priv_check(td->td_ucred, SYSCAP_NOVFS_CHROOT);
2029  	if (error)
2030  		return (error);
2031  
2032  	/*
2033  	 * Disallow open directory descriptors (fchdir() breakouts).
2034  	 */
2035  	if (chroot_allow_open_directories == 0 ||
2036  	   (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
2037  		if ((error = chroot_refuse_vdir_fds(td, fdp)) != 0)
2038  			return (error);
2039  	}
2040  	if ((vp = nch->ncp->nc_vp) == NULL)
2041  		return (ENOENT);
2042  
2043  	if ((error = vget(vp, LK_SHARED)) != 0)
2044  		return (error);
2045  
2046  	/*
2047  	 * Check the validity of vp as a directory to change to and
2048  	 * associate it with rdir/jdir.
2049  	 */
2050  	error = checkvp_chdir(vp, td);
2051  	vn_unlock(vp);			/* leave reference intact */
2052  	if (error == 0) {
2053  		lwkt_gettoken(&p->p_token);
2054  		vrele(fdp->fd_rdir);
2055  		fdp->fd_rdir = vp;	/* reference inherited by fd_rdir */
2056  		cache_drop(&fdp->fd_nrdir);
2057  		cache_copy(nch, &fdp->fd_nrdir);
2058  		if (fdp->fd_jdir == NULL) {
2059  			fdp->fd_jdir = vp;
2060  			vref(fdp->fd_jdir);
2061  			cache_copy(nch, &fdp->fd_njdir);
2062  		}
2063  		if ((p->p_flags & P_DIDCHROOT) == 0) {
2064  			p->p_flags |= P_DIDCHROOT;
2065  			if (p->p_depth <= 65535 - 32)
2066  				p->p_depth += 32;
2067  		}
2068  		lwkt_reltoken(&p->p_token);
2069  	} else {
2070  		vrele(vp);
2071  	}
2072  	return (error);
2073  }
2074  
2075  /*
2076   * chroot_args(char *path)
2077   *
2078   * Change notion of root (``/'') directory.
2079   */
2080  int
2081  sys_chroot(struct sysmsg *sysmsg, const struct chroot_args *uap)
2082  {
2083  	struct thread *td __debugvar = curthread;
2084  	struct nlookupdata nd;
2085  	int error;
2086  
2087  	KKASSERT(td->td_proc);
2088  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2089  	if (error == 0) {
2090  		nd.nl_flags |= NLC_EXEC;
2091  		error = nlookup(&nd);
2092  		if (error == 0)
2093  			error = kern_chroot(&nd.nl_nch);
2094  	}
2095  	nlookup_done(&nd);
2096  	return(error);
2097  }
2098  
2099  int
2100  sys_chroot_kernel(struct sysmsg *sysmsg, const struct chroot_kernel_args *uap)
2101  {
2102  	struct thread *td = curthread;
2103  	struct nlookupdata nd;
2104  	struct nchandle *nch;
2105  	struct vnode *vp;
2106  	int error;
2107  
2108  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2109  	if (error)
2110  		goto error_nond;
2111  
2112  	error = nlookup(&nd);
2113  	if (error)
2114  		goto error_out;
2115  
2116  	nch = &nd.nl_nch;
2117  
2118  	error = caps_priv_check(td->td_ucred, SYSCAP_NOVFS_CHROOT);
2119  	if (error)
2120  		goto error_out;
2121  
2122  	if ((vp = nch->ncp->nc_vp) == NULL) {
2123  		error = ENOENT;
2124  		goto error_out;
2125  	}
2126  
2127  	if ((error = cache_vref(nch, nd.nl_cred, &vp)) != 0)
2128  		goto error_out;
2129  
2130  	vfs_cache_setroot(vp, cache_hold(nch));
2131  
2132  error_out:
2133  	nlookup_done(&nd);
2134  error_nond:
2135  	return(error);
2136  }
2137  
2138  /*
2139   * Common routine for chroot and chdir.  Given a locked, referenced vnode,
2140   * determine whether it is legal to chdir to the vnode.  The vnode's state
2141   * is not changed by this call.
2142   */
2143  static int
2144  checkvp_chdir(struct vnode *vp, struct thread *td)
2145  {
2146  	int error;
2147  
2148  	if (vp->v_type != VDIR)
2149  		error = ENOTDIR;
2150  	else
2151  		error = VOP_EACCESS(vp, VEXEC, td->td_ucred);
2152  	return (error);
2153  }
2154  
2155  int
2156  kern_open(struct nlookupdata *nd, int oflags, int mode, int *res)
2157  {
2158  	struct thread *td = curthread;
2159  	struct proc *p = td->td_proc;
2160  	struct lwp *lp = td->td_lwp;
2161  	struct filedesc *fdp = p->p_fd;
2162  	int cmode, flags;
2163  	struct file *nfp;
2164  	struct file *fp;
2165  	int type, indx, error = 0;
2166  	struct flock lf;
2167  
2168  	if ((oflags & O_ACCMODE) == O_ACCMODE)
2169  		return (EINVAL);
2170  	flags = FFLAGS(oflags);
2171  	error = falloc(lp, &nfp, NULL);
2172  	if (error)
2173  		return (error);
2174  	fp = nfp;
2175  	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
2176  
2177  	/*
2178  	 * Call vn_open() to do the lookup and assign the vnode to the
2179  	 * file pointer.  vn_open() does not change the ref count on fp
2180  	 * and the vnode, on success, will be inherited by the file pointer
2181  	 * and unlocked.
2182  	 *
2183  	 * Request a shared lock on the vnode if possible.
2184  	 *
2185  	 * When NLC_SHAREDLOCK is set we may still need an exclusive vnode
2186  	 * lock for O_RDWR opens on executables in order to avoid a VTEXT
2187  	 * detection race.  The NLC_EXCLLOCK_IFEXEC handles this case.
2188  	 *
2189  	 * NOTE: We need a flag to separate terminal vnode locking from
2190  	 *	 parent locking.  O_CREAT needs parent locking, but O_TRUNC
2191  	 *	 and O_RDWR only need to lock the terminal vnode exclusively.
2192  	 */
2193  	nd->nl_flags |= NLC_LOCKVP;
2194  	if ((flags & (O_CREAT|O_TRUNC)) == 0) {
2195  		nd->nl_flags |= NLC_SHAREDLOCK;
2196  		if (flags & O_RDWR)
2197  			nd->nl_flags |= NLC_EXCLLOCK_IFEXEC;
2198  	}
2199  
2200  	/*
2201  	 * Issue the vn_open, passing in the referenced fp.  the vn_open()
2202  	 * is allowed to replace fp by fdrop()ing it and returning its own
2203  	 * referenced fp.
2204  	 */
2205  	nfp = fp;
2206  	error = vn_open(nd, &nfp, flags, cmode);
2207  	fp = nfp;
2208  	nlookup_done(nd);
2209  
2210  	/*
2211  	 * Deal with any error condition
2212  	 */
2213  	if (error) {
2214  		fdrop(fp);	/* our ref */
2215  		if (error == ERESTART)
2216  			error = EINTR;
2217  		return (error);
2218  	}
2219  
2220  	/*
2221  	 * Reserve a file descriptor.
2222  	 */
2223  	if ((error = fdalloc(p, 0, &indx)) != 0) {
2224  		fdrop(fp);
2225  		return (error);
2226  	}
2227  
2228  	/*
2229  	 * Handle advisory lock flags.  This is only supported with vnodes.
2230  	 * For things like /dev/fd/N we might not actually get a vnode.
2231  	 */
2232  	if ((flags & (O_EXLOCK | O_SHLOCK)) && fp->f_type == DTYPE_VNODE) {
2233  		struct vnode *vp;
2234  
2235  		vp = (struct vnode *)fp->f_data;
2236  		vref(vp);
2237  
2238  		lf.l_whence = SEEK_SET;
2239  		lf.l_start = 0;
2240  		lf.l_len = 0;
2241  		if (flags & O_EXLOCK)
2242  			lf.l_type = F_WRLCK;
2243  		else
2244  			lf.l_type = F_RDLCK;
2245  		if (flags & FNONBLOCK)
2246  			type = 0;
2247  		else
2248  			type = F_WAIT;
2249  
2250  		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
2251  		if (error) {
2252  			/*
2253  			 * lock request failed.  Clean up the reserved
2254  			 * descriptor.
2255  			 */
2256  			vrele(vp);
2257  			fsetfd(fdp, NULL, indx);
2258  			fdrop(fp);
2259  			return (error);
2260  		}
2261  		atomic_set_int(&fp->f_flag, FHASLOCK); /* race ok */
2262  		vrele(vp);
2263  	}
2264  
2265  	/*
2266  	 * release our private reference, leaving the one associated with the
2267  	 * descriptor table intact.
2268  	 */
2269  	if (oflags & O_CLOEXEC)
2270  		fdp->fd_files[indx].fileflags |= UF_EXCLOSE;
2271  	fsetfd(fdp, fp, indx);
2272  	fdrop(fp);
2273  	*res = indx;
2274  
2275  	return (error);
2276  }
2277  
2278  /*
2279   * open_args(char *path, int flags, int mode)
2280   *
2281   * Check permissions, allocate an open file structure,
2282   * and call the device open routine if any.
2283   */
2284  int
2285  sys_open(struct sysmsg *sysmsg, const struct open_args *uap)
2286  {
2287  	struct nlookupdata nd;
2288  	int error;
2289  
2290  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2291  	if (error == 0) {
2292  		error = kern_open(&nd, uap->flags,
2293  				    uap->mode, &sysmsg->sysmsg_result);
2294  	}
2295  	nlookup_done(&nd);
2296  	return (error);
2297  }
2298  
2299  /*
2300   * openat_args(int fd, char *path, int flags, int mode)
2301   */
2302  int
2303  sys_openat(struct sysmsg *sysmsg, const struct openat_args *uap)
2304  {
2305  	struct nlookupdata nd;
2306  	int error;
2307  	struct file *fp;
2308  
2309  	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2310  	if (error == 0) {
2311  		error = kern_open(&nd, uap->flags, uap->mode,
2312  					&sysmsg->sysmsg_result);
2313  	}
2314  	nlookup_done_at(&nd, fp);
2315  	return (error);
2316  }
2317  
2318  int
2319  kern_mknod(struct nlookupdata *nd, int mode, int rmajor, int rminor)
2320  {
2321  	struct thread *td = curthread;
2322  	struct proc *p = td->td_proc;
2323  	struct vnode *vp;
2324  	struct vattr vattr;
2325  	int error;
2326  	int whiteout = 0;
2327  
2328  	KKASSERT(p);
2329  
2330  	VATTR_NULL(&vattr);
2331  	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
2332  	vattr.va_rmajor = rmajor;
2333  	vattr.va_rminor = rminor;
2334  
2335  	switch (mode & S_IFMT) {
2336  	case S_IFMT:	/* used by badsect to flag bad sectors */
2337  		error = caps_priv_check(td->td_ucred, SYSCAP_NOVFS_MKNOD_BAD);
2338  		vattr.va_type = VBAD;
2339  		break;
2340  	case S_IFCHR:
2341  		error = caps_priv_check_td(td, SYSCAP_NOVFS_MKNOD_DEV);
2342  		vattr.va_type = VCHR;
2343  		break;
2344  	case S_IFBLK:
2345  		error = caps_priv_check_td(td, SYSCAP_NOVFS_MKNOD_DEV);
2346  		vattr.va_type = VBLK;
2347  		break;
2348  	case S_IFWHT:
2349  		error = caps_priv_check(td->td_ucred, SYSCAP_NOVFS_MKNOD_WHT);
2350  		whiteout = 1;
2351  		break;
2352  	case S_IFDIR:	/* special directories support for HAMMER */
2353  		error = caps_priv_check(td->td_ucred, SYSCAP_NOVFS_MKNOD_DIR);
2354  		vattr.va_type = VDIR;
2355  		break;
2356  	case S_IFIFO:
2357  		return (kern_mkfifo(nd, mode));
2358  		break;
2359  	default:
2360  		error = EINVAL;
2361  		break;
2362  	}
2363  
2364  	if (error)
2365  		return (error);
2366  
2367  	bwillinode(1);
2368  	nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2369  	if ((error = nlookup(nd)) != 0)
2370  		return (error);
2371  	if (nd->nl_nch.ncp->nc_vp)
2372  		return (EEXIST);
2373  	if (nd->nl_dvp == NULL)
2374  		return (EINVAL);
2375  	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2376  		return (error);
2377  
2378  	if (whiteout) {
2379  		error = VOP_NWHITEOUT(&nd->nl_nch, nd->nl_dvp,
2380  				      nd->nl_cred, NAMEI_CREATE);
2381  	} else {
2382  		vp = NULL;
2383  		error = VOP_NMKNOD(&nd->nl_nch, nd->nl_dvp,
2384  				   &vp, nd->nl_cred, &vattr);
2385  		if (error == 0)
2386  			vput(vp);
2387  	}
2388  	return (error);
2389  }
2390  
2391  /*
2392   * mknod_args(char *path, int mode, int dev)
2393   *
2394   * Create a special file.
2395   */
2396  int
2397  sys_mknod(struct sysmsg *sysmsg, const struct mknod_args *uap)
2398  {
2399  	struct nlookupdata nd;
2400  	int error;
2401  
2402  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2403  	if (error == 0) {
2404  		error = kern_mknod(&nd, uap->mode,
2405  				   umajor(uap->dev), uminor(uap->dev));
2406  	}
2407  	nlookup_done(&nd);
2408  	return (error);
2409  }
2410  
2411  /*
2412   * mknodat_args(int fd, char *path, mode_t mode, dev_t dev)
2413   *
2414   * Create a special file.  The path is relative to the directory associated
2415   * with fd.
2416   */
2417  int
2418  sys_mknodat(struct sysmsg *sysmsg, const struct mknodat_args *uap)
2419  {
2420  	struct nlookupdata nd;
2421  	struct file *fp;
2422  	int error;
2423  
2424  	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2425  	if (error == 0) {
2426  		error = kern_mknod(&nd, uap->mode,
2427  				   umajor(uap->dev), uminor(uap->dev));
2428  	}
2429  	nlookup_done_at(&nd, fp);
2430  	return (error);
2431  }
2432  
2433  int
2434  kern_mkfifo(struct nlookupdata *nd, int mode)
2435  {
2436  	struct thread *td = curthread;
2437  	struct proc *p = td->td_proc;
2438  	struct vattr vattr;
2439  	struct vnode *vp;
2440  	int error;
2441  
2442  	bwillinode(1);
2443  
2444  	nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2445  	if ((error = nlookup(nd)) != 0)
2446  		return (error);
2447  	if (nd->nl_nch.ncp->nc_vp)
2448  		return (EEXIST);
2449  	if (nd->nl_dvp == NULL)
2450  		return (EINVAL);
2451  	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2452  		return (error);
2453  
2454  	VATTR_NULL(&vattr);
2455  	vattr.va_type = VFIFO;
2456  	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
2457  	vp = NULL;
2458  	error = VOP_NMKNOD(&nd->nl_nch, nd->nl_dvp, &vp, nd->nl_cred, &vattr);
2459  	if (error == 0)
2460  		vput(vp);
2461  	return (error);
2462  }
2463  
2464  /*
2465   * mkfifo_args(char *path, int mode)
2466   *
2467   * Create a named pipe.
2468   */
2469  int
2470  sys_mkfifo(struct sysmsg *sysmsg, const struct mkfifo_args *uap)
2471  {
2472  	struct nlookupdata nd;
2473  	int error;
2474  
2475  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2476  	if (error == 0)
2477  		error = kern_mkfifo(&nd, uap->mode);
2478  	nlookup_done(&nd);
2479  	return (error);
2480  }
2481  
2482  /*
2483   * mkfifoat_args(int fd, char *path, mode_t mode)
2484   *
2485   * Create a named pipe.  The path is relative to the directory associated
2486   * with fd.
2487   */
2488  int
2489  sys_mkfifoat(struct sysmsg *sysmsg, const struct mkfifoat_args *uap)
2490  {
2491  	struct nlookupdata nd;
2492  	struct file *fp;
2493  	int error;
2494  
2495  	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2496  	if (error == 0)
2497  		error = kern_mkfifo(&nd, uap->mode);
2498  	nlookup_done_at(&nd, fp);
2499  	return (error);
2500  }
2501  
2502  static int hardlink_check_uid = 0;
2503  SYSCTL_INT(_security, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
2504      &hardlink_check_uid, 0,
2505      "Unprivileged processes cannot create hard links to files owned by other "
2506      "users");
2507  static int hardlink_check_gid = 0;
2508  SYSCTL_INT(_security, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
2509      &hardlink_check_gid, 0,
2510      "Unprivileged processes cannot create hard links to files owned by other "
2511      "groups");
2512  
2513  static int
2514  can_hardlink(struct vnode *vp, struct thread *td, struct ucred *cred)
2515  {
2516  	struct vattr va;
2517  	int error;
2518  
2519  	/*
2520  	 * Shortcut if disabled
2521  	 */
2522  	if (hardlink_check_uid == 0 && hardlink_check_gid == 0)
2523  		return (0);
2524  
2525  	/*
2526  	 * Privileged user can always hardlink
2527  	 */
2528  	if (caps_priv_check(cred, SYSCAP_NOVFS_LINK) == 0)
2529  		return (0);
2530  
2531  	/*
2532  	 * Otherwise only if the originating file is owned by the
2533  	 * same user or group.  Note that any group is allowed if
2534  	 * the file is owned by the caller.
2535  	 */
2536  	error = VOP_GETATTR(vp, &va);
2537  	if (error != 0)
2538  		return (error);
2539  
2540  	if (hardlink_check_uid) {
2541  		if (cred->cr_uid != va.va_uid)
2542  			return (EPERM);
2543  	}
2544  
2545  	if (hardlink_check_gid) {
2546  		if (cred->cr_uid != va.va_uid && !groupmember(va.va_gid, cred))
2547  			return (EPERM);
2548  	}
2549  
2550  	return (0);
2551  }
2552  
2553  int
2554  kern_link(struct nlookupdata *nd, struct nlookupdata *linknd)
2555  {
2556  	struct thread *td = curthread;
2557  	struct vnode *vp;
2558  	int error;
2559  
2560  	/*
2561  	 * Lookup the source and obtained a locked vnode.
2562  	 *
2563  	 * You may only hardlink a file which you have write permission
2564  	 * on or which you own.
2565  	 *
2566  	 * XXX relookup on vget failure / race ?
2567  	 */
2568  	bwillinode(1);
2569  	nd->nl_flags |= NLC_WRITE | NLC_OWN | NLC_HLINK;
2570  	if ((error = nlookup(nd)) != 0)
2571  		return (error);
2572  	vp = nd->nl_nch.ncp->nc_vp;
2573  	KKASSERT(vp != NULL);
2574  	if (vp->v_type == VDIR)
2575  		return (EPERM);		/* POSIX */
2576  	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2577  		return (error);
2578  	if ((error = vget(vp, LK_EXCLUSIVE)) != 0)
2579  		return (error);
2580  
2581  	/*
2582  	 * Unlock the source so we can lookup the target without deadlocking
2583  	 * (XXX vp is locked already, possible other deadlock?).  The target
2584  	 * must not exist.
2585  	 */
2586  	KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);
2587  	nd->nl_flags &= ~NLC_NCPISLOCKED;
2588  	cache_unlock(&nd->nl_nch);
2589  	vn_unlock(vp);
2590  
2591  	linknd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2592  	if ((error = nlookup(linknd)) != 0) {
2593  		vrele(vp);
2594  		return (error);
2595  	}
2596  	if (linknd->nl_nch.ncp->nc_vp) {
2597  		vrele(vp);
2598  		return (EEXIST);
2599  	}
2600  	if (linknd->nl_dvp == NULL) {
2601  		vrele(vp);
2602  		return (EINVAL);
2603  	}
2604  	VFS_MODIFYING(vp->v_mount);
2605  	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_FAILRECLAIM);
2606  	if (error) {
2607  		vrele(vp);
2608  		return (error);
2609  	}
2610  
2611  	/*
2612  	 * Finally run the new API VOP.
2613  	 */
2614  	error = can_hardlink(vp, td, td->td_ucred);
2615  	if (error == 0) {
2616  		error = VOP_NLINK(&linknd->nl_nch, linknd->nl_dvp,
2617  				  vp, linknd->nl_cred);
2618  	}
2619  	vput(vp);
2620  	return (error);
2621  }
2622  
2623  /*
2624   * link_args(char *path, char *link)
2625   *
2626   * Make a hard file link.
2627   */
2628  int
2629  sys_link(struct sysmsg *sysmsg, const struct link_args *uap)
2630  {
2631  	struct nlookupdata nd, linknd;
2632  	int error;
2633  
2634  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2635  	if (error == 0) {
2636  		error = nlookup_init(&linknd, uap->link, UIO_USERSPACE, 0);
2637  		if (error == 0)
2638  			error = kern_link(&nd, &linknd);
2639  		nlookup_done(&linknd);
2640  	}
2641  	nlookup_done(&nd);
2642  	return (error);
2643  }
2644  
2645  /*
2646   * linkat_args(int fd1, char *path1, int fd2, char *path2, int flags)
2647   *
2648   * Make a hard file link. The path1 argument is relative to the directory
2649   * associated with fd1, and similarly the path2 argument is relative to
2650   * the directory associated with fd2.
2651   */
2652  int
2653  sys_linkat(struct sysmsg *sysmsg, const struct linkat_args *uap)
2654  {
2655  	struct nlookupdata nd, linknd;
2656  	struct file *fp1, *fp2;
2657  	int error;
2658  
2659  	error = nlookup_init_at(&nd, &fp1, uap->fd1, uap->path1, UIO_USERSPACE,
2660  	    (uap->flags & AT_SYMLINK_FOLLOW) ? NLC_FOLLOW : 0);
2661  	if (error == 0) {
2662  		error = nlookup_init_at(&linknd, &fp2, uap->fd2,
2663  		    uap->path2, UIO_USERSPACE, 0);
2664  		if (error == 0)
2665  			error = kern_link(&nd, &linknd);
2666  		nlookup_done_at(&linknd, fp2);
2667  	}
2668  	nlookup_done_at(&nd, fp1);
2669  	return (error);
2670  }
2671  
2672  int
2673  kern_symlink(struct nlookupdata *nd, char *path, int mode)
2674  {
2675  	struct vattr vattr;
2676  	struct vnode *vp;
2677  	struct vnode *dvp;
2678  	int error;
2679  
2680  	bwillinode(1);
2681  	nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2682  	if ((error = nlookup(nd)) != 0)
2683  		return (error);
2684  	if (nd->nl_nch.ncp->nc_vp)
2685  		return (EEXIST);
2686  	if (nd->nl_dvp == NULL)
2687  		return (EINVAL);
2688  	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2689  		return (error);
2690  	dvp = nd->nl_dvp;
2691  	VATTR_NULL(&vattr);
2692  	vattr.va_mode = mode;
2693  	error = VOP_NSYMLINK(&nd->nl_nch, dvp, &vp, nd->nl_cred, &vattr, path);
2694  	if (error == 0)
2695  		vput(vp);
2696  	return (error);
2697  }
2698  
2699  /*
2700   * symlink(char *path, char *link)
2701   *
2702   * Make a symbolic link.
2703   */
2704  int
2705  sys_symlink(struct sysmsg *sysmsg, const struct symlink_args *uap)
2706  {
2707  	struct thread *td = curthread;
2708  	struct nlookupdata nd;
2709  	char *path;
2710  	int error;
2711  	int mode;
2712  
2713  	path = objcache_get(namei_oc, M_WAITOK);
2714  	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
2715  	if (error == 0) {
2716  		error = nlookup_init(&nd, uap->link, UIO_USERSPACE, 0);
2717  		if (error == 0) {
2718  			mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
2719  			error = kern_symlink(&nd, path, mode);
2720  		}
2721  		nlookup_done(&nd);
2722  	}
2723  	objcache_put(namei_oc, path);
2724  	return (error);
2725  }
2726  
2727  /*
2728   * symlinkat_args(char *path1, int fd, char *path2)
2729   *
2730   * Make a symbolic link.  The path2 argument is relative to the directory
2731   * associated with fd.
2732   */
2733  int
2734  sys_symlinkat(struct sysmsg *sysmsg, const struct symlinkat_args *uap)
2735  {
2736  	struct thread *td = curthread;
2737  	struct nlookupdata nd;
2738  	struct file *fp;
2739  	char *path1;
2740  	int error;
2741  	int mode;
2742  
2743  	path1 = objcache_get(namei_oc, M_WAITOK);
2744  	error = copyinstr(uap->path1, path1, MAXPATHLEN, NULL);
2745  	if (error == 0) {
2746  		error = nlookup_init_at(&nd, &fp, uap->fd, uap->path2,
2747  		    UIO_USERSPACE, 0);
2748  		if (error == 0) {
2749  			mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
2750  			error = kern_symlink(&nd, path1, mode);
2751  		}
2752  		nlookup_done_at(&nd, fp);
2753  	}
2754  	objcache_put(namei_oc, path1);
2755  	return (error);
2756  }
2757  
2758  /*
2759   * undelete_args(char *path)
2760   *
2761   * Delete a whiteout from the filesystem.
2762   */
2763  int
2764  sys_undelete(struct sysmsg *sysmsg, const struct undelete_args *uap)
2765  {
2766  	struct nlookupdata nd;
2767  	int error;
2768  
2769  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2770  	bwillinode(1);
2771  	nd.nl_flags |= NLC_DELETE | NLC_REFDVP;
2772  	if (error == 0)
2773  		error = nlookup(&nd);
2774  	if (error == 0 && nd.nl_dvp == NULL)
2775  		error = EINVAL;
2776  	if (error == 0)
2777  		error = ncp_writechk(&nd.nl_nch);
2778  	if (error == 0) {
2779  		error = VOP_NWHITEOUT(&nd.nl_nch, nd.nl_dvp, nd.nl_cred,
2780  				      NAMEI_DELETE);
2781  	}
2782  	nlookup_done(&nd);
2783  	return (error);
2784  }
2785  
2786  int
2787  kern_unlink(struct nlookupdata *nd)
2788  {
2789  	int error;
2790  
2791  	bwillinode(1);
2792  	nd->nl_flags |= NLC_DELETE | NLC_REFDVP;
2793  	if ((error = nlookup(nd)) != 0)
2794  		return (error);
2795  	if (nd->nl_dvp == NULL)
2796  		return EINVAL;
2797  	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2798  		return (error);
2799  	error = VOP_NREMOVE(&nd->nl_nch, nd->nl_dvp, nd->nl_cred);
2800  	return (error);
2801  }
2802  
2803  /*
2804   * unlink_args(char *path)
2805   *
2806   * Delete a name from the filesystem.
2807   */
2808  int
2809  sys_unlink(struct sysmsg *sysmsg, const struct unlink_args *uap)
2810  {
2811  	struct nlookupdata nd;
2812  	int error;
2813  
2814  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2815  	if (error == 0)
2816  		error = kern_unlink(&nd);
2817  	nlookup_done(&nd);
2818  	return (error);
2819  }
2820  
2821  
2822  /*
2823   * unlinkat_args(int fd, char *path, int flags)
2824   *
2825   * Delete the file or directory entry pointed to by fd/path.
2826   */
2827  int
2828  sys_unlinkat(struct sysmsg *sysmsg, const struct unlinkat_args *uap)
2829  {
2830  	struct nlookupdata nd;
2831  	struct file *fp;
2832  	int error;
2833  
2834  	if (uap->flags & ~AT_REMOVEDIR)
2835  		return (EINVAL);
2836  
2837  	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2838  	if (error == 0) {
2839  		if (uap->flags & AT_REMOVEDIR)
2840  			error = kern_rmdir(&nd);
2841  		else
2842  			error = kern_unlink(&nd);
2843  	}
2844  	nlookup_done_at(&nd, fp);
2845  	return (error);
2846  }
2847  
2848  int
2849  kern_lseek(int fd, off_t offset, int whence, off_t *res)
2850  {
2851  	struct thread *td = curthread;
2852  	struct file *fp;
2853  	int error;
2854  
2855  	fp = holdfp(td, fd, -1);
2856  	if (fp == NULL)
2857  		return (EBADF);
2858  
2859  	error = fo_seek(fp, offset, whence, res);
2860  	dropfp(td, fd, fp);
2861  
2862  	return (error);
2863  }
2864  
2865  /*
2866   * lseek_args(int fd, int pad, off_t offset, int whence)
2867   *
2868   * Reposition read/write file offset.
2869   */
2870  int
2871  sys_lseek(struct sysmsg *sysmsg, const struct lseek_args *uap)
2872  {
2873  	int error;
2874  
2875  	error = kern_lseek(uap->fd, uap->offset, uap->whence,
2876  			   &sysmsg->sysmsg_offset);
2877  
2878  	return (error);
2879  }
2880  
2881  /*
2882   * Check if current process can access given file.  amode is a bitmask of *_OK
2883   * access bits.  flags is a bitmask of AT_* flags.
2884   */
2885  int
2886  kern_access(struct nlookupdata *nd, int amode, int flags)
2887  {
2888  	struct vnode *vp;
2889  	int error, mode;
2890  
2891  	if (flags & ~AT_EACCESS)
2892  		return (EINVAL);
2893  	nd->nl_flags |= NLC_SHAREDLOCK;
2894  	if ((error = nlookup(nd)) != 0)
2895  		return (error);
2896  	if ((amode & W_OK) && (error = ncp_writechk(&nd->nl_nch)) != 0)
2897  		return (error);
2898  retry:
2899  	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_SHARED, &vp);
2900  	if (error)
2901  		return (error);
2902  
2903  	/* Flags == 0 means only check for existence. */
2904  	if (amode) {
2905  		mode = 0;
2906  		if (amode & R_OK)
2907  			mode |= VREAD;
2908  		if (amode & W_OK)
2909  			mode |= VWRITE;
2910  		if (amode & X_OK)
2911  			mode |= VEXEC;
2912  		if ((mode & VWRITE) == 0 ||
2913  		    (error = vn_writechk(vp)) == 0) {
2914  			error = VOP_ACCESS_FLAGS(vp, mode, flags, nd->nl_cred);
2915  		}
2916  
2917  		/*
2918  		 * If the file handle is stale we have to re-resolve the
2919  		 * entry with the ncp held exclusively.  This is a hack
2920  		 * at the moment.
2921  		 */
2922  		if (error == ESTALE) {
2923  			u_int dummy_gen;
2924  
2925  			vput(vp);
2926  			cache_unlock(&nd->nl_nch);
2927  			cache_lock(&nd->nl_nch);
2928  			dummy_gen = nd->nl_nch.ncp->nc_generation;
2929  			cache_setunresolved(&nd->nl_nch);
2930  			error = cache_resolve(&nd->nl_nch, &dummy_gen,
2931  					      nd->nl_cred);
2932  			if (error == 0) {
2933  				vp = NULL;
2934  				goto retry;
2935  			}
2936  			return(error);
2937  		}
2938  	}
2939  	vput(vp);
2940  	return (error);
2941  }
2942  
2943  /*
2944   * access_args(char *path, int flags)
2945   *
2946   * Check access permissions.
2947   */
2948  int
2949  sys_access(struct sysmsg *sysmsg, const struct access_args *uap)
2950  {
2951  	struct nlookupdata nd;
2952  	int error;
2953  
2954  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2955  	if (error == 0)
2956  		error = kern_access(&nd, uap->flags, 0);
2957  	nlookup_done(&nd);
2958  	return (error);
2959  }
2960  
2961  
2962  /*
2963   * eaccess_args(char *path, int flags)
2964   *
2965   * Check access permissions.
2966   */
2967  int
2968  sys_eaccess(struct sysmsg *sysmsg, const struct eaccess_args *uap)
2969  {
2970  	struct nlookupdata nd;
2971  	int error;
2972  
2973  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2974  	if (error == 0)
2975  		error = kern_access(&nd, uap->flags, AT_EACCESS);
2976  	nlookup_done(&nd);
2977  	return (error);
2978  }
2979  
2980  
2981  /*
2982   * faccessat_args(int fd, char *path, int amode, int flags)
2983   *
2984   * Check access permissions.
2985   */
2986  int
2987  sys_faccessat(struct sysmsg *sysmsg, const struct faccessat_args *uap)
2988  {
2989  	struct nlookupdata nd;
2990  	struct file *fp;
2991  	int error;
2992  
2993  	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE,
2994  				NLC_FOLLOW);
2995  	if (error == 0)
2996  		error = kern_access(&nd, uap->amode, uap->flags);
2997  	nlookup_done_at(&nd, fp);
2998  	return (error);
2999  }
3000  
3001  int
3002  kern_stat(struct nlookupdata *nd, struct stat *st)
3003  {
3004  	int error;
3005  	struct vnode *vp;
3006  
3007  	nd->nl_flags |= NLC_SHAREDLOCK;
3008  	if ((error = nlookup(nd)) != 0)
3009  		return (error);
3010  again:
3011  	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
3012  		return (ENOENT);
3013  
3014  #if 1
3015  	error = cache_vref(&nd->nl_nch, NULL, &vp);
3016  #else
3017  	error = vget(vp, LK_SHARED);
3018  #endif
3019  	if (error)
3020  		return (error);
3021  	error = vn_stat(vp, st, nd->nl_cred);
3022  
3023  	/*
3024  	 * If the file handle is stale we have to re-resolve the
3025  	 * entry with the ncp held exclusively.  This is a hack
3026  	 * at the moment.
3027  	 */
3028  	if (error == ESTALE) {
3029  		u_int dummy_gen;
3030  #if 1
3031  		vrele(vp);
3032  #else
3033  		vput(vp);
3034  #endif
3035  		cache_unlock(&nd->nl_nch);
3036  		cache_lock(&nd->nl_nch);
3037  		dummy_gen = nd->nl_nch.ncp->nc_generation;
3038  		cache_setunresolved(&nd->nl_nch);
3039  		error = cache_resolve(&nd->nl_nch, &dummy_gen, nd->nl_cred);
3040  		if (error == 0)
3041  			goto again;
3042  	} else {
3043  #if 1
3044  		vrele(vp);
3045  #else
3046  		vput(vp);
3047  #endif
3048  	}
3049  	return (error);
3050  }
3051  
3052  /*
3053   * stat_args(char *path, struct stat *ub)
3054   *
3055   * Get file status; this version follows links.
3056   */
3057  int
3058  sys_stat(struct sysmsg *sysmsg, const struct stat_args *uap)
3059  {
3060  	struct nlookupdata nd;
3061  	struct stat st;
3062  	int error;
3063  
3064  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3065  	if (error == 0) {
3066  		error = kern_stat(&nd, &st);
3067  		if (error == 0)
3068  			error = copyout(&st, uap->ub, sizeof(*uap->ub));
3069  	}
3070  	nlookup_done(&nd);
3071  	return (error);
3072  }
3073  
3074  /*
3075   * lstat_args(char *path, struct stat *ub)
3076   *
3077   * Get file status; this version does not follow links.
3078   */
3079  int
3080  sys_lstat(struct sysmsg *sysmsg, const struct lstat_args *uap)
3081  {
3082  	struct nlookupdata nd;
3083  	struct stat st;
3084  	int error;
3085  
3086  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3087  	if (error == 0) {
3088  		error = kern_stat(&nd, &st);
3089  		if (error == 0)
3090  			error = copyout(&st, uap->ub, sizeof(*uap->ub));
3091  	}
3092  	nlookup_done(&nd);
3093  	return (error);
3094  }
3095  
3096  /*
3097   * fstatat_args(int fd, char *path, struct stat *sb, int flags)
3098   *
3099   * Get status of file pointed to by fd/path.
3100   */
3101  int
3102  sys_fstatat(struct sysmsg *sysmsg, const struct fstatat_args *uap)
3103  {
3104  	struct nlookupdata nd;
3105  	struct stat st;
3106  	int error;
3107  	int flags;
3108  	struct file *fp;
3109  
3110  	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
3111  		return (EINVAL);
3112  
3113  	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3114  
3115  	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3116  				UIO_USERSPACE, flags);
3117  	if (error == 0) {
3118  		error = kern_stat(&nd, &st);
3119  		if (error == 0)
3120  			error = copyout(&st, uap->sb, sizeof(*uap->sb));
3121  	}
3122  	nlookup_done_at(&nd, fp);
3123  	return (error);
3124  }
3125  
3126  static int
3127  kern_pathconf(char *path, int name, int flags, register_t *sysmsg_regp)
3128  {
3129  	struct nlookupdata nd;
3130  	struct vnode *vp;
3131  	int error;
3132  
3133  	vp = NULL;
3134  	error = nlookup_init(&nd, path, UIO_USERSPACE, flags);
3135  	if (error == 0)
3136  		error = nlookup(&nd);
3137  	if (error == 0)
3138  		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
3139  	nlookup_done(&nd);
3140  	if (error == 0) {
3141  		error = VOP_PATHCONF(vp, name, sysmsg_regp);
3142  		vput(vp);
3143  	}
3144  	return (error);
3145  }
3146  
3147  /*
3148   * pathconf_Args(char *path, int name)
3149   *
3150   * Get configurable pathname variables.
3151   */
3152  int
3153  sys_pathconf(struct sysmsg *sysmsg, const struct pathconf_args *uap)
3154  {
3155  	return (kern_pathconf(uap->path, uap->name, NLC_FOLLOW,
3156  		&sysmsg->sysmsg_reg));
3157  }
3158  
3159  /*
3160   * lpathconf_Args(char *path, int name)
3161   *
3162   * Get configurable pathname variables, but don't follow symlinks.
3163   */
3164  int
3165  sys_lpathconf(struct sysmsg *sysmsg, const struct lpathconf_args *uap)
3166  {
3167  	return (kern_pathconf(uap->path, uap->name, 0, &sysmsg->sysmsg_reg));
3168  }
3169  
3170  /*
3171   * XXX: daver
3172   * kern_readlink isn't properly split yet.  There is a copyin burried
3173   * in VOP_READLINK().
3174   */
3175  int
3176  kern_readlink(struct nlookupdata *nd, char *buf, int count, int *res)
3177  {
3178  	struct thread *td = curthread;
3179  	struct vnode *vp;
3180  	struct iovec aiov;
3181  	struct uio auio;
3182  	int error;
3183  
3184  	nd->nl_flags |= NLC_SHAREDLOCK;
3185  	if ((error = nlookup(nd)) != 0)
3186  		return (error);
3187  	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_SHARED, &vp);
3188  	if (error)
3189  		return (error);
3190  	if (vp->v_type != VLNK) {
3191  		error = EINVAL;
3192  	} else {
3193  		aiov.iov_base = buf;
3194  		aiov.iov_len = count;
3195  		auio.uio_iov = &aiov;
3196  		auio.uio_iovcnt = 1;
3197  		auio.uio_offset = 0;
3198  		auio.uio_rw = UIO_READ;
3199  		auio.uio_segflg = UIO_USERSPACE;
3200  		auio.uio_td = td;
3201  		auio.uio_resid = count;
3202  		error = VOP_READLINK(vp, &auio, td->td_ucred);
3203  	}
3204  	vput(vp);
3205  	*res = count - auio.uio_resid;
3206  	return (error);
3207  }
3208  
3209  /*
3210   * readlink_args(char *path, char *buf, int count)
3211   *
3212   * Return target name of a symbolic link.
3213   */
3214  int
3215  sys_readlink(struct sysmsg *sysmsg, const struct readlink_args *uap)
3216  {
3217  	struct nlookupdata nd;
3218  	int error;
3219  
3220  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3221  	if (error == 0) {
3222  		error = kern_readlink(&nd, uap->buf, uap->count,
3223  					&sysmsg->sysmsg_result);
3224  	}
3225  	nlookup_done(&nd);
3226  	return (error);
3227  }
3228  
3229  /*
3230   * readlinkat_args(int fd, char *path, char *buf, size_t bufsize)
3231   *
3232   * Return target name of a symbolic link.  The path is relative to the
3233   * directory associated with fd.
3234   */
3235  int
3236  sys_readlinkat(struct sysmsg *sysmsg, const struct readlinkat_args *uap)
3237  {
3238  	struct nlookupdata nd;
3239  	struct file *fp;
3240  	int error;
3241  
3242  	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
3243  	if (error == 0) {
3244  		error = kern_readlink(&nd, uap->buf, uap->bufsize,
3245  					&sysmsg->sysmsg_result);
3246  	}
3247  	nlookup_done_at(&nd, fp);
3248  	return (error);
3249  }
3250  
3251  static int
3252  setfflags(struct vnode *vp, u_long flags)
3253  {
3254  	struct thread *td = curthread;
3255  	int error;
3256  	struct vattr vattr;
3257  
3258  	/*
3259  	 * Prevent non-root users from setting flags on devices.  When
3260  	 * a device is reused, users can retain ownership of the device
3261  	 * if they are allowed to set flags and programs assume that
3262  	 * chown can't fail when done as root.
3263  	 */
3264  	if ((vp->v_type == VCHR || vp->v_type == VBLK) &&
3265  	    ((error =
3266  		caps_priv_check(td->td_ucred, SYSCAP_NOVFS_CHFLAGS_DEV)) != 0))
3267  	{
3268  		return (error);
3269  	}
3270  
3271  	/*
3272  	 * note: vget is required for any operation that might mod the vnode
3273  	 * so VINACTIVE is properly cleared.
3274  	 */
3275  	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
3276  		VATTR_NULL(&vattr);
3277  		vattr.va_flags = flags;
3278  		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3279  		vput(vp);
3280  	}
3281  	return (error);
3282  }
3283  
3284  /*
3285   * chflags(const char *path, u_long flags)
3286   *
3287   * Change flags of a file given a path name.
3288   */
3289  int
3290  sys_chflags(struct sysmsg *sysmsg, const struct chflags_args *uap)
3291  {
3292  	struct nlookupdata nd;
3293  	struct vnode *vp;
3294  	int error;
3295  
3296  	vp = NULL;
3297  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3298  	if (error == 0)
3299  		error = nlookup(&nd);
3300  	if (error == 0)
3301  		error = ncp_writechk(&nd.nl_nch);
3302  	if (error == 0)
3303  		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3304  	nlookup_done(&nd);
3305  	if (error == 0) {
3306  		error = setfflags(vp, uap->flags);
3307  		vrele(vp);
3308  	}
3309  	return (error);
3310  }
3311  
3312  /*
3313   * lchflags(const char *path, u_long flags)
3314   *
3315   * Change flags of a file given a path name, but don't follow symlinks.
3316   */
3317  int
3318  sys_lchflags(struct sysmsg *sysmsg, const struct lchflags_args *uap)
3319  {
3320  	struct nlookupdata nd;
3321  	struct vnode *vp;
3322  	int error;
3323  
3324  	vp = NULL;
3325  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3326  	if (error == 0)
3327  		error = nlookup(&nd);
3328  	if (error == 0)
3329  		error = ncp_writechk(&nd.nl_nch);
3330  	if (error == 0)
3331  		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3332  	nlookup_done(&nd);
3333  	if (error == 0) {
3334  		error = setfflags(vp, uap->flags);
3335  		vrele(vp);
3336  	}
3337  	return (error);
3338  }
3339  
3340  /*
3341   * fchflags_args(int fd, u_flags flags)
3342   *
3343   * Change flags of a file given a file descriptor.
3344   */
3345  int
3346  sys_fchflags(struct sysmsg *sysmsg, const struct fchflags_args *uap)
3347  {
3348  	struct thread *td = curthread;
3349  	struct file *fp;
3350  	int error;
3351  
3352  	if ((error = holdvnode(td, uap->fd, &fp)) != 0)
3353  		return (error);
3354  	if (fp->f_nchandle.ncp)
3355  		error = ncp_writechk(&fp->f_nchandle);
3356  	if (error == 0)
3357  		error = setfflags((struct vnode *) fp->f_data, uap->flags);
3358  	fdrop(fp);
3359  	return (error);
3360  }
3361  
3362  /*
3363   * chflagsat_args(int fd, const char *path, u_long flags, int atflags)
3364   * change flags given a pathname relative to a filedescriptor
3365   */
3366  int
3367  sys_chflagsat(struct sysmsg *sysmsg, const struct chflagsat_args *uap)
3368  {
3369  	struct nlookupdata nd;
3370  	struct vnode *vp;
3371  	struct file *fp;
3372  	int error;
3373  	int lookupflags;
3374  
3375  	if (uap->atflags & ~AT_SYMLINK_NOFOLLOW)
3376  		return (EINVAL);
3377  
3378  	lookupflags = (uap->atflags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3379  
3380  	vp = NULL;
3381  	error = nlookup_init_at(&nd, &fp, uap->fd,  uap->path, UIO_USERSPACE, lookupflags);
3382  	if (error == 0)
3383  		error = nlookup(&nd);
3384  	if (error == 0)
3385  		error = ncp_writechk(&nd.nl_nch);
3386  	if (error == 0)
3387  		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3388  	nlookup_done_at(&nd, fp);
3389  	if (error == 0) {
3390  		error = setfflags(vp, uap->flags);
3391  		vrele(vp);
3392  	}
3393  	return (error);
3394  }
3395  
3396  
3397  static int
3398  setfmode(struct vnode *vp, int mode)
3399  {
3400  	struct thread *td = curthread;
3401  	int error;
3402  	struct vattr vattr;
3403  
3404  	/*
3405  	 * note: vget is required for any operation that might mod the vnode
3406  	 * so VINACTIVE is properly cleared.
3407  	 */
3408  	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
3409  		VATTR_NULL(&vattr);
3410  		vattr.va_mode = mode & ALLPERMS;
3411  		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3412  		cache_inval_wxok(vp);
3413  		vput(vp);
3414  	}
3415  	return error;
3416  }
3417  
3418  int
3419  kern_chmod(struct nlookupdata *nd, int mode)
3420  {
3421  	struct vnode *vp;
3422  	int error;
3423  
3424  	if ((error = nlookup(nd)) != 0)
3425  		return (error);
3426  	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3427  		return (error);
3428  	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
3429  		error = setfmode(vp, mode);
3430  	vrele(vp);
3431  	return (error);
3432  }
3433  
3434  /*
3435   * chmod_args(char *path, int mode)
3436   *
3437   * Change mode of a file given path name.
3438   */
3439  int
3440  sys_chmod(struct sysmsg *sysmsg, const struct chmod_args *uap)
3441  {
3442  	struct nlookupdata nd;
3443  	int error;
3444  
3445  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3446  	if (error == 0)
3447  		error = kern_chmod(&nd, uap->mode);
3448  	nlookup_done(&nd);
3449  	return (error);
3450  }
3451  
3452  /*
3453   * lchmod_args(char *path, int mode)
3454   *
3455   * Change mode of a file given path name (don't follow links.)
3456   */
3457  int
3458  sys_lchmod(struct sysmsg *sysmsg, const struct lchmod_args *uap)
3459  {
3460  	struct nlookupdata nd;
3461  	int error;
3462  
3463  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3464  	if (error == 0)
3465  		error = kern_chmod(&nd, uap->mode);
3466  	nlookup_done(&nd);
3467  	return (error);
3468  }
3469  
3470  /*
3471   * fchmod_args(int fd, int mode)
3472   *
3473   * Change mode of a file given a file descriptor.
3474   */
3475  int
3476  sys_fchmod(struct sysmsg *sysmsg, const struct fchmod_args *uap)
3477  {
3478  	struct thread *td = curthread;
3479  	struct file *fp;
3480  	int error;
3481  
3482  	if ((error = holdvnode(td, uap->fd, &fp)) != 0)
3483  		return (error);
3484  	if (fp->f_nchandle.ncp)
3485  		error = ncp_writechk(&fp->f_nchandle);
3486  	if (error == 0)
3487  		error = setfmode((struct vnode *)fp->f_data, uap->mode);
3488  	fdrop(fp);
3489  	return (error);
3490  }
3491  
3492  /*
3493   * fchmodat_args(char *path, int mode)
3494   *
3495   * Change mode of a file pointed to by fd/path.
3496   */
3497  int
3498  sys_fchmodat(struct sysmsg *sysmsg, const struct fchmodat_args *uap)
3499  {
3500  	struct nlookupdata nd;
3501  	struct file *fp;
3502  	int error;
3503  	int flags;
3504  
3505  	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
3506  		return (EINVAL);
3507  	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3508  
3509  	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3510  				UIO_USERSPACE, flags);
3511  	if (error == 0)
3512  		error = kern_chmod(&nd, uap->mode);
3513  	nlookup_done_at(&nd, fp);
3514  	return (error);
3515  }
3516  
3517  static int
3518  setfown(struct mount *mp, struct vnode *vp, uid_t uid, gid_t gid)
3519  {
3520  	struct thread *td = curthread;
3521  	int error;
3522  	struct vattr vattr;
3523  	uid_t o_uid;
3524  	gid_t o_gid;
3525  	uint64_t size;
3526  
3527  	/*
3528  	 * note: vget is required for any operation that might mod the vnode
3529  	 * so VINACTIVE is properly cleared.
3530  	 */
3531  	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
3532  		if ((error = VOP_GETATTR(vp, &vattr)) != 0)
3533  			return error;
3534  		o_uid = vattr.va_uid;
3535  		o_gid = vattr.va_gid;
3536  		size = vattr.va_size;
3537  
3538  		VATTR_NULL(&vattr);
3539  		vattr.va_uid = uid;
3540  		vattr.va_gid = gid;
3541  		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3542  		vput(vp);
3543  	}
3544  
3545  	if (error == 0) {
3546  		if (uid == -1)
3547  			uid = o_uid;
3548  		if (gid == -1)
3549  			gid = o_gid;
3550  		VFS_ACCOUNT(mp, o_uid, o_gid, -size);
3551  		VFS_ACCOUNT(mp,   uid,   gid,  size);
3552  	}
3553  
3554  	return error;
3555  }
3556  
3557  int
3558  kern_chown(struct nlookupdata *nd, int uid, int gid)
3559  {
3560  	struct vnode *vp;
3561  	int error;
3562  
3563  	if ((error = nlookup(nd)) != 0)
3564  		return (error);
3565  	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3566  		return (error);
3567  	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
3568  		error = setfown(nd->nl_nch.mount, vp, uid, gid);
3569  	vrele(vp);
3570  	return (error);
3571  }
3572  
3573  /*
3574   * chown(char *path, int uid, int gid)
3575   *
3576   * Set ownership given a path name.
3577   */
3578  int
3579  sys_chown(struct sysmsg *sysmsg, const struct chown_args *uap)
3580  {
3581  	struct nlookupdata nd;
3582  	int error;
3583  
3584  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3585  	if (error == 0)
3586  		error = kern_chown(&nd, uap->uid, uap->gid);
3587  	nlookup_done(&nd);
3588  	return (error);
3589  }
3590  
3591  /*
3592   * lchown_args(char *path, int uid, int gid)
3593   *
3594   * Set ownership given a path name, do not cross symlinks.
3595   */
3596  int
3597  sys_lchown(struct sysmsg *sysmsg, const struct lchown_args *uap)
3598  {
3599  	struct nlookupdata nd;
3600  	int error;
3601  
3602  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3603  	if (error == 0)
3604  		error = kern_chown(&nd, uap->uid, uap->gid);
3605  	nlookup_done(&nd);
3606  	return (error);
3607  }
3608  
3609  /*
3610   * fchown_args(int fd, int uid, int gid)
3611   *
3612   * Set ownership given a file descriptor.
3613   */
3614  int
3615  sys_fchown(struct sysmsg *sysmsg, const struct fchown_args *uap)
3616  {
3617  	struct thread *td = curthread;
3618  	struct proc *p = td->td_proc;
3619  	struct file *fp;
3620  	int error;
3621  
3622  	if ((error = holdvnode(td, uap->fd, &fp)) != 0)
3623  		return (error);
3624  	if (fp->f_nchandle.ncp)
3625  		error = ncp_writechk(&fp->f_nchandle);
3626  	if (error == 0)
3627  		error = setfown(p->p_fd->fd_ncdir.mount,
3628  			(struct vnode *)fp->f_data, uap->uid, uap->gid);
3629  	fdrop(fp);
3630  	return (error);
3631  }
3632  
3633  /*
3634   * fchownat(int fd, char *path, int uid, int gid, int flags)
3635   *
3636   * Set ownership of file pointed to by fd/path.
3637   */
3638  int
3639  sys_fchownat(struct sysmsg *sysmsg, const struct fchownat_args *uap)
3640  {
3641  	struct nlookupdata nd;
3642  	struct file *fp;
3643  	int error;
3644  	int flags;
3645  
3646  	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
3647  		return (EINVAL);
3648  	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3649  
3650  	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3651  				UIO_USERSPACE, flags);
3652  	if (error == 0)
3653  		error = kern_chown(&nd, uap->uid, uap->gid);
3654  	nlookup_done_at(&nd, fp);
3655  	return (error);
3656  }
3657  
3658  
3659  static int
3660  getutimes(struct timeval *tvp, struct timespec *tsp)
3661  {
3662  	struct timeval tv[2];
3663  	int error;
3664  
3665  	if (tvp == NULL) {
3666  		microtime(&tv[0]);
3667  		TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
3668  		tsp[1] = tsp[0];
3669  	} else {
3670  		if ((error = itimerfix(tvp)) != 0)
3671  			return (error);
3672  		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3673  		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3674  	}
3675  	return 0;
3676  }
3677  
3678  static int
3679  getutimens(const struct timespec *ts, struct timespec *newts, int *nullflag)
3680  {
3681  	struct timespec tsnow;
3682  	int error;
3683  
3684  	*nullflag = 0;
3685  	nanotime(&tsnow);
3686  	if (ts == NULL) {
3687  		newts[0] = tsnow;
3688  		newts[1] = tsnow;
3689  		*nullflag = 1;
3690  		return (0);
3691  	}
3692  
3693  	newts[0] = ts[0];
3694  	newts[1] = ts[1];
3695  	if (newts[0].tv_nsec == UTIME_OMIT && newts[1].tv_nsec == UTIME_OMIT) {
3696  		newts[0].tv_sec = VNOVAL;
3697  		newts[1].tv_sec = VNOVAL;
3698  		return (0);
3699  	}
3700  	if (newts[0].tv_nsec == UTIME_NOW && newts[1].tv_nsec == UTIME_NOW)
3701  		*nullflag = 1;
3702  
3703  	if (newts[0].tv_nsec == UTIME_OMIT)
3704  		newts[0].tv_sec = VNOVAL;
3705  	else if (newts[0].tv_nsec == UTIME_NOW)
3706  		newts[0] = tsnow;
3707  	else if ((error = itimespecfix(&newts[0])) != 0)
3708  		return (error);
3709  
3710  	if (newts[1].tv_nsec == UTIME_OMIT)
3711  		newts[1].tv_sec = VNOVAL;
3712  	else if (newts[1].tv_nsec == UTIME_NOW)
3713  		newts[1] = tsnow;
3714  	else if ((error = itimespecfix(&newts[1])) != 0)
3715  		return (error);
3716  
3717  	return (0);
3718  }
3719  
3720  static int
3721  setutimes(struct vnode *vp, struct vattr *vattr,
3722  	  const struct timespec *ts, int nullflag)
3723  {
3724  	struct thread *td = curthread;
3725  	int error;
3726  
3727  	VATTR_NULL(vattr);
3728  	vattr->va_atime = ts[0];
3729  	vattr->va_mtime = ts[1];
3730  	if (nullflag)
3731  		vattr->va_vaflags |= VA_UTIMES_NULL;
3732  	error = VOP_SETATTR(vp, vattr, td->td_ucred);
3733  
3734  	return error;
3735  }
3736  
3737  int
3738  kern_utimes(struct nlookupdata *nd, struct timeval *tptr)
3739  {
3740  	struct timespec ts[2];
3741  	int error;
3742  
3743  	if (tptr) {
3744  		if ((error = getutimes(tptr, ts)) != 0)
3745  			return (error);
3746  	}
3747  	error = kern_utimensat(nd, tptr ? ts : NULL, 0);
3748  	return (error);
3749  }
3750  
3751  /*
3752   * utimes_args(char *path, struct timeval *tptr)
3753   *
3754   * Set the access and modification times of a file.
3755   */
3756  int
3757  sys_utimes(struct sysmsg *sysmsg, const struct utimes_args *uap)
3758  {
3759  	struct timeval tv[2];
3760  	struct nlookupdata nd;
3761  	int error;
3762  
3763  	if (uap->tptr) {
3764   		error = copyin(uap->tptr, tv, sizeof(tv));
3765  		if (error)
3766  			return (error);
3767  	}
3768  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3769  	if (error == 0)
3770  		error = kern_utimes(&nd, uap->tptr ? tv : NULL);
3771  	nlookup_done(&nd);
3772  	return (error);
3773  }
3774  
3775  /*
3776   * lutimes_args(char *path, struct timeval *tptr)
3777   *
3778   * Set the access and modification times of a file.
3779   */
3780  int
3781  sys_lutimes(struct sysmsg *sysmsg, const struct lutimes_args *uap)
3782  {
3783  	struct timeval tv[2];
3784  	struct nlookupdata nd;
3785  	int error;
3786  
3787  	if (uap->tptr) {
3788  		error = copyin(uap->tptr, tv, sizeof(tv));
3789  		if (error)
3790  			return (error);
3791  	}
3792  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3793  	if (error == 0)
3794  		error = kern_utimes(&nd, uap->tptr ? tv : NULL);
3795  	nlookup_done(&nd);
3796  	return (error);
3797  }
3798  
3799  /*
3800   * Set utimes on a file descriptor.  The creds used to open the
3801   * file are used to determine whether the operation is allowed
3802   * or not.
3803   */
3804  int
3805  kern_futimens(int fd, struct timespec *ts)
3806  {
3807  	struct thread *td = curthread;
3808  	struct timespec newts[2];
3809  	struct file *fp;
3810  	struct vnode *vp;
3811  	struct vattr vattr;
3812  	struct vattr_lite lva;
3813  	int nullflag;
3814  	int error;
3815  
3816  	error = getutimens(ts, newts, &nullflag);
3817  	if (error)
3818  		return (error);
3819  	if ((error = holdvnode(td, fd, &fp)) != 0)
3820  		return (error);
3821  	if (fp->f_nchandle.ncp)
3822  		error = ncp_writechk(&fp->f_nchandle);
3823  	if (error == 0) {
3824  		vp = fp->f_data;
3825  		error = vget(vp, LK_EXCLUSIVE);
3826  		if (error == 0) {
3827  			error = VOP_GETATTR_FP(vp, &vattr, fp);
3828  			if (error == 0) {
3829  				lva.va_type = vattr.va_type;
3830  				lva.va_nlink = vattr.va_nlink;
3831  				lva.va_mode = vattr.va_mode;
3832  				lva.va_uid = vattr.va_uid;
3833  				lva.va_gid = vattr.va_gid;
3834  				lva.va_size = vattr.va_size;
3835  				lva.va_flags = vattr.va_flags;
3836  
3837  				error = naccess_lva(&lva, NLC_OWN | NLC_WRITE,
3838  						   fp->f_cred);
3839  			}
3840  			if (error == 0) {
3841  				error = setutimes(vp, &vattr, newts, nullflag);
3842  			}
3843  			vput(vp);
3844  		}
3845  	}
3846  	fdrop(fp);
3847  	return (error);
3848  }
3849  
3850  /*
3851   * futimens_args(int fd, struct timespec *ts)
3852   *
3853   * Set the access and modification times of a file.
3854   */
3855  int
3856  sys_futimens(struct sysmsg *sysmsg, const struct futimens_args *uap)
3857  {
3858  	struct timespec ts[2];
3859  	int error;
3860  
3861  	if (uap->ts) {
3862  		error = copyin(uap->ts, ts, sizeof(ts));
3863  		if (error)
3864  			return (error);
3865  	}
3866  	error = kern_futimens(uap->fd, uap->ts ? ts : NULL);
3867  	return (error);
3868  }
3869  
3870  int
3871  kern_futimes(int fd, struct timeval *tptr)
3872  {
3873  	struct timespec ts[2];
3874  	int error;
3875  
3876  	if (tptr) {
3877  		if ((error = getutimes(tptr, ts)) != 0)
3878  			return (error);
3879  	}
3880  	error = kern_futimens(fd, tptr ? ts : NULL);
3881  	return (error);
3882  }
3883  
3884  /*
3885   * futimes_args(int fd, struct timeval *tptr)
3886   *
3887   * Set the access and modification times of a file.
3888   */
3889  int
3890  sys_futimes(struct sysmsg *sysmsg, const struct futimes_args *uap)
3891  {
3892  	struct timeval tv[2];
3893  	int error;
3894  
3895  	if (uap->tptr) {
3896  		error = copyin(uap->tptr, tv, sizeof(tv));
3897  		if (error)
3898  			return (error);
3899  	}
3900  	error = kern_futimes(uap->fd, uap->tptr ? tv : NULL);
3901  	return (error);
3902  }
3903  
3904  /*
3905   * futimesat_args(int fd, const char *path, struct timeval *tptr)
3906   *
3907   * Set the access and modification times of a file.
3908   */
3909  int
3910  sys_futimesat(struct sysmsg *sysmsg, const struct futimesat_args *uap)
3911  {
3912  	struct timespec ts[2];
3913  	struct nlookupdata nd;
3914  	struct file *fp;
3915  	int error;
3916  
3917  	if (uap->tptr) {
3918  		struct timeval tv[2];
3919  
3920  		if ((error = copyin(uap->tptr, tv, sizeof(tv))) != 0)
3921  			return error;
3922  		if ((error = getutimes(tv, ts)) != 0)
3923  			return error;
3924  	}
3925  
3926  	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3927  	                        UIO_USERSPACE, 0);
3928  	if (error == 0)
3929  		error = kern_utimensat(&nd, uap->tptr ? ts : NULL, 0);
3930  	nlookup_done_at(&nd, fp);
3931  
3932  	return (error);
3933  }
3934  
3935  int
3936  kern_utimensat(struct nlookupdata *nd, const struct timespec *ts, int flags)
3937  {
3938  	struct timespec newts[2];
3939  	struct vnode *vp;
3940  	struct vattr vattr;
3941  	int nullflag;
3942  	int error;
3943  
3944  	if (flags & ~AT_SYMLINK_NOFOLLOW)
3945  		return (EINVAL);
3946  
3947  	error = getutimens(ts, newts, &nullflag);
3948  	if (error)
3949  		return (error);
3950  
3951  	nd->nl_flags |= NLC_OWN | NLC_WRITE;
3952  	if ((error = nlookup(nd)) != 0)
3953  		return (error);
3954  	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3955  		return (error);
3956  	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3957  		return (error);
3958  	if ((error = vn_writechk(vp)) == 0) {
3959  		error = vget(vp, LK_EXCLUSIVE);
3960  		if (error == 0) {
3961  			error = setutimes(vp, &vattr, newts, nullflag);
3962  			vput(vp);
3963  		}
3964  	}
3965  	vrele(vp);
3966  	return (error);
3967  }
3968  
3969  /*
3970   * utimensat_args(int fd, const char *path, const struct timespec *ts, int flags);
3971   *
3972   * Set file access and modification times of a file.
3973   */
3974  int
3975  sys_utimensat(struct sysmsg *sysmsg, const struct utimensat_args *uap)
3976  {
3977  	struct timespec ts[2];
3978  	struct nlookupdata nd;
3979  	struct file *fp;
3980  	int error;
3981  	int flags;
3982  
3983  	if (uap->ts) {
3984  		error = copyin(uap->ts, ts, sizeof(ts));
3985  		if (error)
3986  			return (error);
3987  	}
3988  
3989  	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3990  	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3991  	                        UIO_USERSPACE, flags);
3992  	if (error == 0)
3993  		error = kern_utimensat(&nd, uap->ts ? ts : NULL, uap->flags);
3994  	nlookup_done_at(&nd, fp);
3995  	return (error);
3996  }
3997  
3998  int
3999  kern_truncate(struct nlookupdata *nd, off_t length)
4000  {
4001  	struct vnode *vp;
4002  	struct vattr vattr;
4003  	int error;
4004  	uid_t uid = 0;
4005  	gid_t gid = 0;
4006  	uint64_t old_size = 0;
4007  
4008  	if (length < 0)
4009  		return(EINVAL);
4010  	nd->nl_flags |= NLC_WRITE | NLC_TRUNCATE;
4011  	if ((error = nlookup(nd)) != 0)
4012  		return (error);
4013  	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
4014  		return (error);
4015  	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
4016  		return (error);
4017  	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_FAILRECLAIM);
4018  	if (error) {
4019  		vrele(vp);
4020  		return (error);
4021  	}
4022  	if (vp->v_type == VDIR) {
4023  		error = EISDIR;
4024  		goto done;
4025  	}
4026  	if (vfs_quota_enabled) {
4027  		error = VOP_GETATTR(vp, &vattr);
4028  		KASSERT(error == 0, ("kern_truncate(): VOP_GETATTR didn't return 0"));
4029  		uid = vattr.va_uid;
4030  		gid = vattr.va_gid;
4031  		old_size = vattr.va_size;
4032  	}
4033  
4034  	if ((error = vn_writechk(vp)) == 0) {
4035  		VATTR_NULL(&vattr);
4036  		vattr.va_size = length;
4037  		error = VOP_SETATTR(vp, &vattr, nd->nl_cred);
4038  		VFS_ACCOUNT(nd->nl_nch.mount, uid, gid, length - old_size);
4039  	}
4040  done:
4041  	vput(vp);
4042  	return (error);
4043  }
4044  
4045  /*
4046   * truncate(char *path, int pad, off_t length)
4047   *
4048   * Truncate a file given its path name.
4049   */
4050  int
4051  sys_truncate(struct sysmsg *sysmsg, const struct truncate_args *uap)
4052  {
4053  	struct nlookupdata nd;
4054  	int error;
4055  
4056  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4057  	if (error == 0)
4058  		error = kern_truncate(&nd, uap->length);
4059  	nlookup_done(&nd);
4060  	return error;
4061  }
4062  
4063  int
4064  kern_ftruncate(int fd, off_t length)
4065  {
4066  	struct thread *td = curthread;
4067  	struct vattr vattr;
4068  	struct vnode *vp;
4069  	struct file *fp;
4070  	int error;
4071  	uid_t uid = 0;
4072  	gid_t gid = 0;
4073  	uint64_t old_size = 0;
4074  	struct mount *mp;
4075  
4076  	if (length < 0)
4077  		return(EINVAL);
4078  	if ((error = holdvnode(td, fd, &fp)) != 0)
4079  		return (error);
4080  	if (fp->f_nchandle.ncp) {
4081  		error = ncp_writechk(&fp->f_nchandle);
4082  		if (error)
4083  			goto done;
4084  	}
4085  	if ((fp->f_flag & FWRITE) == 0) {
4086  		error = EINVAL;
4087  		goto done;
4088  	}
4089  	if (fp->f_flag & FAPPENDONLY) {	/* inode was set s/uapnd */
4090  		error = EINVAL;
4091  		goto done;
4092  	}
4093  	vp = (struct vnode *)fp->f_data;
4094  	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4095  	if (vp->v_type == VDIR) {
4096  		error = EISDIR;
4097  		vn_unlock(vp);
4098  		goto done;
4099  	}
4100  
4101  	if (vfs_quota_enabled) {
4102  		error = VOP_GETATTR_FP(vp, &vattr, fp);
4103  		KASSERT(error == 0, ("kern_ftruncate(): VOP_GETATTR didn't return 0"));
4104  		uid = vattr.va_uid;
4105  		gid = vattr.va_gid;
4106  		old_size = vattr.va_size;
4107  	}
4108  
4109  	if ((error = vn_writechk(vp)) == 0) {
4110  		VATTR_NULL(&vattr);
4111  		vattr.va_size = length;
4112  		error = VOP_SETATTR_FP(vp, &vattr, fp->f_cred, fp);
4113  		mp = vq_vptomp(vp);
4114  		VFS_ACCOUNT(mp, uid, gid, length - old_size);
4115  	}
4116  	vn_unlock(vp);
4117  done:
4118  	fdrop(fp);
4119  	return (error);
4120  }
4121  
4122  /*
4123   * ftruncate_args(int fd, int pad, off_t length)
4124   *
4125   * Truncate a file given a file descriptor.
4126   */
4127  int
4128  sys_ftruncate(struct sysmsg *sysmsg, const struct ftruncate_args *uap)
4129  {
4130  	int error;
4131  
4132  	error = kern_ftruncate(uap->fd, uap->length);
4133  
4134  	return (error);
4135  }
4136  
4137  int
4138  kern_fsync(int fd, bool fullsync)
4139  {
4140  	struct thread *td = curthread;
4141  	struct vnode *vp;
4142  	struct file *fp;
4143  	vm_object_t obj;
4144  	int error;
4145  
4146  	if ((error = holdvnode(td, fd, &fp)) != 0)
4147  		return (error);
4148  	vp = (struct vnode *)fp->f_data;
4149  	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4150  	if ((obj = vp->v_object) != NULL) {
4151  		if (vp->v_mount == NULL ||
4152  		    (vp->v_mount->mnt_kern_flag & MNTK_NOMSYNC) == 0) {
4153  			vm_object_page_clean(obj, 0, 0, 0);
4154  		}
4155  	}
4156  	error = fullsync ?
4157  		VOP_FSYNC_FP(vp, MNT_WAIT, VOP_FSYNC_SYSCALL, fp) :
4158  		VOP_FDATASYNC_FP(vp, MNT_WAIT, VOP_FSYNC_SYSCALL, fp);
4159  	if (error == 0 && vp->v_mount)
4160  		error = buf_fsync(vp);
4161  	vn_unlock(vp);
4162  	fdrop(fp);
4163  
4164  	return (error);
4165  }
4166  
4167  /*
4168   * fsync(int fd)
4169   *
4170   * Sync an open file.
4171   */
4172  int
4173  sys_fsync(struct sysmsg *sysmsg, const struct fsync_args *uap)
4174  {
4175  	return (kern_fsync(uap->fd, true));
4176  }
4177  
4178  /*
4179   * fdatasync(int fd)
4180   *
4181   * Data-sync an open file.
4182   */
4183  int
4184  sys_fdatasync(struct sysmsg *sysmsg, const struct fdatasync_args *uap)
4185  {
4186  	return (kern_fsync(uap->fd, false));
4187  }
4188  
/*
 * rename op.
 *
 * Resolves the source (fromnd) and the target (tond), locks both
 * entries and both parent directory entries, re-validates everything
 * after relocking, and then issues VOP_NRENAME() (or VOP_NREMOVE() of
 * the source when both names refer to the same vnode).
 *
 * Both nlookupdata structures are initialised by the caller, which
 * also tears them down afterwards.
 *
 * Returns 0 on success or an errno.  EAGAIN is returned when a race was
 * detected after relocking; the callers in this file retry the whole
 * operation in that case.  Other errors produced here are EINVAL,
 * ENOENT, EXDEV, ENOTDIR, EISDIR and EPERM, plus anything returned by
 * the lookups, the write check or the VOP.
 *
 * NOTE: error == 0 and nl_dvp is NULL indicates a mount point, operation
 *	 disallowed.  e.g. /var/cache where /var/cache is a null-mount, for
 *	 example.
 */
int
kern_rename(struct nlookupdata *fromnd, struct nlookupdata *tond)
{
	struct nchandle fnchd;
	struct nchandle tnchd;
	struct namecache *ncp;
	struct vnode *fdvp;
	struct vnode *tdvp;
	struct mount *mp;
	struct mount *userenlk;
	int error;
	u_int fncp_gen;
	u_int tncp_gen;

	bwillinode(1);
	fromnd->nl_flags |= NLC_REFDVP | NLC_RENAME_SRC;
	if ((error = nlookup(fromnd)) != 0)
		return (error);

	/*
	 * Attempt to rename a mount point (from or to)
	 */
	if (error == 0 && fromnd->nl_dvp == NULL)
		return (EINVAL);

	/*
	 * Hold the source's parent directory entry so it can be rechecked
	 * after relocking.
	 */
	if ((fnchd.ncp = fromnd->nl_nch.ncp->nc_parent) == NULL)
		return (ENOENT);
	fnchd.mount = fromnd->nl_nch.mount;
	cache_hold(&fnchd);

	/*
	 * unlock the source nch so we can lookup the target nch without
	 * deadlocking.  The target may or may not exist so we do not check
	 * for a target vp like kern_mkdir() and other creation functions do.
	 *
	 * The source and target directories are ref'd and rechecked after
	 * everything is relocked to determine if the source or target file
	 * has been renamed.
	 */
	KKASSERT(fromnd->nl_flags & NLC_NCPISLOCKED);
	fromnd->nl_flags &= ~NLC_NCPISLOCKED;
	fncp_gen = fromnd->nl_nch.ncp->nc_generation;

	/*
	 * When the source is a directory the mount's rename lock is taken
	 * exclusively; userenlk remembers the mount so the lock can be
	 * released at 'done'.  It stays NULL for non-directories.
	 */
	if (fromnd->nl_nch.ncp->nc_vp &&
	    fromnd->nl_nch.ncp->nc_vp->v_type == VDIR) {
		userenlk = fnchd.mount;
		cache_unlock(&fromnd->nl_nch);
		lockmgr(&userenlk->mnt_renlock, LK_EXCLUSIVE);
	} else {
		userenlk = NULL;
		cache_unlock(&fromnd->nl_nch);
	}

	/*
	 * Lookup target
	 */
	tond->nl_flags |= NLC_RENAME_DST | NLC_REFDVP;
	if ((error = nlookup(tond)) != 0) {
		cache_drop(&fnchd);
		goto done;
	}
	tncp_gen = tond->nl_nch.ncp->nc_generation;

	/*
	 * Attempt to rename a mount point (from or to)
	 */
	if (error == 0 && tond->nl_dvp == NULL) {
		cache_drop(&fnchd);
		error = ENOENT;
		goto done;
	}

	/*
	 * Hold the target's parent directory entry as well.
	 */
	if ((tnchd.ncp = tond->nl_nch.ncp->nc_parent) == NULL) {
		cache_drop(&fnchd);
		error = ENOENT;
		goto done;
	}
	tnchd.mount = tond->nl_nch.mount;
	cache_hold(&tnchd);

	/*
	 * If the source and target are the same there is nothing to do
	 */
	if (fromnd->nl_nch.ncp == tond->nl_nch.ncp) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		error = 0;
		goto done;
	}

	/*
	 * Mount points cannot be renamed or overwritten
	 */
	if ((fromnd->nl_nch.ncp->nc_flag | tond->nl_nch.ncp->nc_flag) &
	    NCF_ISMOUNTPT
	) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		error = EINVAL;
		goto done;
	}

	/*
	 * Lock all four namecache entries.  tond is already locked.
	 *
	 * From here on error paths use 'finish', which cache_put()s both
	 * parent handles, instead of dropping them individually.
	 */
	cache_lock4_tondlocked(&fnchd, &fromnd->nl_nch,
			       &tnchd, &tond->nl_nch,
			       fromnd->nl_cred, tond->nl_cred);
	fromnd->nl_flags |= NLC_NCPISLOCKED;

	/*
	 * If the namecache generation changed for either fromnd or tond,
	 * we must retry.  The low bit of the difference is masked off
	 * before the comparison.
	 */
	if (((fromnd->nl_nch.ncp->nc_generation - fncp_gen) & ~1) ||
	    ((tond->nl_nch.ncp->nc_generation - tncp_gen) & ~1))
	{
		krateprintf(&krate_rename,
			"kern_rename: retry due to race on: "
			"\"%s\" -> \"%s\" (%d,%d)\n",
			fromnd->nl_nch.ncp->nc_name,
			tond->nl_nch.ncp->nc_name,
			fromnd->nl_nch.ncp->nc_generation - fncp_gen,
			tond->nl_nch.ncp->nc_generation - tncp_gen);
		error = EAGAIN;
		goto finish;
	}

	/*
	 * If either fromnd or tond are marked destroyed a ripout occurred
	 * out from under us and we must retry.
	 */
	if ((fromnd->nl_nch.ncp->nc_flag & (NCF_DESTROYED | NCF_UNRESOLVED)) ||
	    fromnd->nl_nch.ncp->nc_vp == NULL ||
	    (tond->nl_nch.ncp->nc_flag & (NCF_DESTROYED | NCF_UNRESOLVED))) {
		krateprintf(&krate_rename,
			"kern_rename: retry due to ripout on: "
			"\"%s\" -> \"%s\"\n",
			fromnd->nl_nch.ncp->nc_name,
			tond->nl_nch.ncp->nc_name);
		error = EAGAIN;
		goto finish;
	}

	/*
	 * Make sure the parent directories linkages are the same.  We have
	 * already checked that fromnd and tond are not mount points so this
	 * should not loop forever on a cross-mount.
	 */
	if (fnchd.ncp != fromnd->nl_nch.ncp->nc_parent ||
	    tnchd.ncp != tond->nl_nch.ncp->nc_parent) {
		error = EAGAIN;
		goto finish;
	}

	/*
	 * Both the source and target must be within the same filesystem and
	 * in the same filesystem as their parent directories within the
	 * namecache topology.
	 *
	 * NOTE: fromnd's nc_mount or nc_vp could be NULL.
	 */
	mp = fnchd.mount;
	if (mp != tnchd.mount || mp != fromnd->nl_nch.mount ||
	    mp != tond->nl_nch.mount) {
		error = EXDEV;
		goto finish;
	}

	/*
	 * Make sure the mount point is writable
	 */
	if ((error = ncp_writechk(&tond->nl_nch)) != 0) {
		goto finish;
	}

	/*
	 * If the target exists and either the source or target is a directory,
	 * then both must be directories.
	 *
	 * Due to relocking of the source, fromnd->nl_nch.ncp->nc_vp might
	 * have become NULL.
	 */
	if (tond->nl_nch.ncp->nc_vp) {
		if (fromnd->nl_nch.ncp->nc_vp == NULL) {
			error = ENOENT;
		} else if (fromnd->nl_nch.ncp->nc_vp->v_type == VDIR) {
			if (tond->nl_nch.ncp->nc_vp->v_type != VDIR)
				error = ENOTDIR;
		} else if (tond->nl_nch.ncp->nc_vp->v_type == VDIR) {
			error = EISDIR;
		}
	}

	/*
	 * You cannot rename a source into itself or a subdirectory of itself.
	 * We check this by traversing the target directory upwards looking
	 * for a match against the source.
	 *
	 * Only required when renaming a directory, in which case userenlk is
	 * non-NULL.
	 */
	if (__predict_false(userenlk && error == 0)) {
		for (ncp = tnchd.ncp; ncp; ncp = ncp->nc_parent) {
			if (fromnd->nl_nch.ncp == ncp) {
				error = EINVAL;
				break;
			}
		}
	}

	/*
	 * Even though the namespaces are different, they may still represent
	 * hardlinks to the same file.  The filesystem might have a hard time
	 * with this so we issue a NREMOVE of the source instead of a NRENAME
	 * when we detect the situation.
	 */
	if (error == 0) {
		fdvp = fromnd->nl_dvp;
		tdvp = tond->nl_dvp;
		if (fdvp == NULL || tdvp == NULL) {
			error = EPERM;
		} else if (fromnd->nl_nch.ncp->nc_vp == tond->nl_nch.ncp->nc_vp) {
			error = VOP_NREMOVE(&fromnd->nl_nch, fdvp,
					    fromnd->nl_cred);
		} else {
			error = VOP_NRENAME(&fromnd->nl_nch, &tond->nl_nch,
					    fdvp, tdvp, tond->nl_cred);
		}
	}
finish:
	/* Release both parent directory handles taken above. */
	cache_put(&tnchd);
	cache_put(&fnchd);
done:
	/* Release the per-mount rename lock if it was taken above. */
	if (userenlk)
		lockmgr(&userenlk->mnt_renlock, LK_RELEASE);
	return (error);
}
4434  
4435  /*
4436   * rename_args(char *from, char *to)
4437   *
4438   * Rename files.  Source and destination must either both be directories,
4439   * or both not be directories.  If target is a directory, it must be empty.
4440   */
4441  int
4442  sys_rename(struct sysmsg *sysmsg, const struct rename_args *uap)
4443  {
4444  	struct nlookupdata fromnd, tond;
4445  	int error;
4446  
4447  	do {
4448  		error = nlookup_init(&fromnd, uap->from, UIO_USERSPACE, 0);
4449  		if (error == 0) {
4450  			error = nlookup_init(&tond, uap->to, UIO_USERSPACE, 0);
4451  			if (error == 0)
4452  				error = kern_rename(&fromnd, &tond);
4453  			nlookup_done(&tond);
4454  		}
4455  		nlookup_done(&fromnd);
4456  	} while (error == EAGAIN);
4457  	return (error);
4458  }
4459  
4460  /*
4461   * renameat_args(int oldfd, char *old, int newfd, char *new)
4462   *
4463   * Rename files using paths relative to the directories associated with
4464   * oldfd and newfd.  Source and destination must either both be directories,
4465   * or both not be directories.  If target is a directory, it must be empty.
4466   */
4467  int
4468  sys_renameat(struct sysmsg *sysmsg, const struct renameat_args *uap)
4469  {
4470  	struct nlookupdata oldnd, newnd;
4471  	struct file *oldfp, *newfp;
4472  	int error;
4473  
4474  	do {
4475  		error = nlookup_init_at(&oldnd, &oldfp,
4476  					uap->oldfd, uap->old,
4477  					UIO_USERSPACE, 0);
4478  		if (error == 0) {
4479  			error = nlookup_init_at(&newnd, &newfp,
4480  						uap->newfd, uap->new,
4481  						UIO_USERSPACE, 0);
4482  			if (error == 0)
4483  				error = kern_rename(&oldnd, &newnd);
4484  			nlookup_done_at(&newnd, newfp);
4485  		}
4486  		nlookup_done_at(&oldnd, oldfp);
4487  	} while (error == EAGAIN);
4488  	return (error);
4489  }
4490  
4491  int
4492  kern_mkdir(struct nlookupdata *nd, int mode)
4493  {
4494  	struct thread *td = curthread;
4495  	struct proc *p = td->td_proc;
4496  	struct vnode *vp;
4497  	struct vattr vattr;
4498  	int error;
4499  
4500  	bwillinode(1);
4501  	nd->nl_flags |= NLC_WILLBEDIR | NLC_CREATE | NLC_REFDVP;
4502  	if ((error = nlookup(nd)) != 0)
4503  		return (error);
4504  
4505  	if (nd->nl_nch.ncp->nc_vp)
4506  		return (EEXIST);
4507  	if (nd->nl_dvp == NULL)
4508  		return (EINVAL);
4509  	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
4510  		return (error);
4511  	VATTR_NULL(&vattr);
4512  	vattr.va_type = VDIR;
4513  	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_fd->fd_cmask;
4514  
4515  	vp = NULL;
4516  	error = VOP_NMKDIR(&nd->nl_nch, nd->nl_dvp, &vp, td->td_ucred, &vattr);
4517  	if (error == 0)
4518  		vput(vp);
4519  	return (error);
4520  }
4521  
4522  /*
4523   * mkdir_args(char *path, int mode)
4524   *
4525   * Make a directory file.
4526   */
4527  int
4528  sys_mkdir(struct sysmsg *sysmsg, const struct mkdir_args *uap)
4529  {
4530  	struct nlookupdata nd;
4531  	int error;
4532  
4533  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
4534  	if (error == 0)
4535  		error = kern_mkdir(&nd, uap->mode);
4536  	nlookup_done(&nd);
4537  	return (error);
4538  }
4539  
4540  /*
4541   * mkdirat_args(int fd, char *path, mode_t mode)
4542   *
4543   * Make a directory file.  The path is relative to the directory associated
4544   * with fd.
4545   */
4546  int
4547  sys_mkdirat(struct sysmsg *sysmsg, const struct mkdirat_args *uap)
4548  {
4549  	struct nlookupdata nd;
4550  	struct file *fp;
4551  	int error;
4552  
4553  	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
4554  	if (error == 0)
4555  		error = kern_mkdir(&nd, uap->mode);
4556  	nlookup_done_at(&nd, fp);
4557  	return (error);
4558  }
4559  
4560  int
4561  kern_rmdir(struct nlookupdata *nd)
4562  {
4563  	int error;
4564  
4565  	bwillinode(1);
4566  	nd->nl_flags |= NLC_DELETE | NLC_REFDVP;
4567  	if ((error = nlookup(nd)) != 0)
4568  		return (error);
4569  
4570  	/*
4571  	 * Do not allow directories representing mount points to be
4572  	 * deleted, even if empty.  Check write perms on mount point
4573  	 * in case the vnode is aliased (aka nullfs).
4574  	 */
4575  	if (nd->nl_nch.ncp->nc_flag & (NCF_ISMOUNTPT))
4576  		return (EBUSY);
4577  	if (nd->nl_dvp == NULL)
4578  		return (EINVAL);
4579  	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
4580  		return (error);
4581  	error = VOP_NRMDIR(&nd->nl_nch, nd->nl_dvp, nd->nl_cred);
4582  	return (error);
4583  }
4584  
4585  /*
4586   * rmdir_args(char *path)
4587   *
4588   * Remove a directory file.
4589   */
4590  int
4591  sys_rmdir(struct sysmsg *sysmsg, const struct rmdir_args *uap)
4592  {
4593  	struct nlookupdata nd;
4594  	int error;
4595  
4596  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
4597  	if (error == 0)
4598  		error = kern_rmdir(&nd);
4599  	nlookup_done(&nd);
4600  	return (error);
4601  }
4602  
4603  int
4604  kern_getdirentries(int fd, char *buf, u_int count, long *basep, int *res,
4605  		   enum uio_seg direction)
4606  {
4607  	struct thread *td = curthread;
4608  	struct vnode *vp;
4609  	struct file *fp;
4610  	struct uio auio;
4611  	struct iovec aiov;
4612  	off_t loff;
4613  	int error, eofflag;
4614  
4615  	if ((error = holdvnode(td, fd, &fp)) != 0)
4616  		return (error);
4617  	if ((fp->f_flag & FREAD) == 0) {
4618  		error = EBADF;
4619  		goto done;
4620  	}
4621  	vp = (struct vnode *)fp->f_data;
4622  	if (vp->v_type != VDIR) {
4623  		error = EINVAL;
4624  		goto done;
4625  	}
4626  	aiov.iov_base = buf;
4627  	aiov.iov_len = count;
4628  	auio.uio_iov = &aiov;
4629  	auio.uio_iovcnt = 1;
4630  	auio.uio_rw = UIO_READ;
4631  	auio.uio_segflg = direction;
4632  	auio.uio_td = td;
4633  	auio.uio_resid = count;
4634  	loff = auio.uio_offset = fp->f_offset;
4635  	error = VOP_READDIR_FP(vp, &auio, fp->f_cred, &eofflag, NULL, NULL, fp);
4636  	fp->f_offset = auio.uio_offset;
4637  	if (error)
4638  		goto done;
4639  
4640  	/*
4641  	 * WARNING!  *basep may not be wide enough to accomodate the
4642  	 * seek offset.   XXX should we hack this to return the upper 32 bits
4643  	 * for offsets greater then 4G?
4644  	 */
4645  	if (basep) {
4646  		*basep = (long)loff;
4647  	}
4648  	*res = count - auio.uio_resid;
4649  done:
4650  	fdrop(fp);
4651  	return (error);
4652  }
4653  
4654  /*
4655   * getdirentries_args(int fd, char *buf, u_int conut, long *basep)
4656   *
4657   * Read a block of directory entries in a file system independent format.
4658   */
4659  int
4660  sys_getdirentries(struct sysmsg *sysmsg, const struct getdirentries_args *uap)
4661  {
4662  	long base;
4663  	int error;
4664  
4665  	error = kern_getdirentries(uap->fd, uap->buf, uap->count, &base,
4666  				   &sysmsg->sysmsg_result, UIO_USERSPACE);
4667  
4668  	if (error == 0 && uap->basep)
4669  		error = copyout(&base, uap->basep, sizeof(*uap->basep));
4670  	return (error);
4671  }
4672  
4673  /*
4674   * getdents_args(int fd, char *buf, size_t count)
4675   */
4676  int
4677  sys_getdents(struct sysmsg *sysmsg, const struct getdents_args *uap)
4678  {
4679  	int error;
4680  
4681  	error = kern_getdirentries(uap->fd, uap->buf, uap->count, NULL,
4682  				   &sysmsg->sysmsg_result, UIO_USERSPACE);
4683  
4684  	return (error);
4685  }
4686  
4687  /*
4688   * Set the mode mask for creation of filesystem nodes.
4689   *
4690   * umask(int newmask)
4691   */
4692  int
4693  sys_umask(struct sysmsg *sysmsg, const struct umask_args *uap)
4694  {
4695  	struct thread *td = curthread;
4696  	struct proc *p = td->td_proc;
4697  	struct filedesc *fdp;
4698  
4699  	fdp = p->p_fd;
4700  	sysmsg->sysmsg_result = fdp->fd_cmask;
4701  	fdp->fd_cmask = uap->newmask & ALLPERMS;
4702  	return (0);
4703  }
4704  
4705  /*
4706   * revoke(char *path)
4707   *
4708   * Void all references to file by ripping underlying filesystem
4709   * away from vnode.
4710   */
4711  int
4712  sys_revoke(struct sysmsg *sysmsg, const struct revoke_args *uap)
4713  {
4714  	struct nlookupdata nd;
4715  	struct vattr vattr;
4716  	struct vnode *vp;
4717  	struct ucred *cred;
4718  	int error;
4719  
4720  	vp = NULL;
4721  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4722  	if (error == 0)
4723  		error = nlookup(&nd);
4724  	if (error == 0)
4725  		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
4726  	cred = crhold(nd.nl_cred);
4727  	nlookup_done(&nd);
4728  	if (error == 0) {
4729  		if (error == 0)
4730  			error = VOP_GETATTR(vp, &vattr);
4731  		if (error == 0 && cred->cr_uid != vattr.va_uid)
4732  			error = caps_priv_check(cred, SYSCAP_NOVFS_REVOKE);
4733  		if (error == 0 && (vp->v_type == VCHR || vp->v_type == VBLK)) {
4734  			if (vcount(vp) > 0)
4735  				error = vrevoke(vp, cred);
4736  		} else if (error == 0) {
4737  			error = vrevoke(vp, cred);
4738  		}
4739  		vrele(vp);
4740  	}
4741  	if (cred)
4742  		crfree(cred);
4743  	return (error);
4744  }
4745  
4746  /*
4747   * getfh_args(char *fname, fhandle_t *fhp)
4748   *
4749   * Get (NFS) file handle
4750   *
4751   * NOTE: We use the fsid of the covering mount, even if it is a nullfs
4752   * mount.  This allows nullfs mounts to be explicitly exported.
4753   *
4754   * WARNING: nullfs mounts of HAMMER PFS ROOTs are safe.
4755   *
4756   * 	    nullfs mounts of subdirectories are not safe.  That is, it will
4757   *	    work, but you do not really have protection against access to
4758   *	    the related parent directories.
4759   */
4760  int
4761  sys_getfh(struct sysmsg *sysmsg, const struct getfh_args *uap)
4762  {
4763  	struct nlookupdata nd;
4764  	fhandle_t fh;
4765  	struct vnode *vp;
4766  	struct mount *mp;
4767  	int error;
4768  
4769  	/*
4770  	 * Must be super user
4771  	 */
4772  	if ((error = caps_priv_check_self(SYSCAP_RESTRICTEDROOT)) != 0)
4773  		return (error);
4774  
4775  	vp = NULL;
4776  	error = nlookup_init(&nd, uap->fname, UIO_USERSPACE, NLC_FOLLOW);
4777  	if (error == 0)
4778  		error = nlookup(&nd);
4779  	if (error == 0)
4780  		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
4781  	mp = nd.nl_nch.mount;
4782  	nlookup_done(&nd);
4783  	if (error == 0) {
4784  		bzero(&fh, sizeof(fh));
4785  		fh.fh_fsid = mp->mnt_stat.f_fsid;
4786  		error = VFS_VPTOFH(vp, &fh.fh_fid);
4787  		vput(vp);
4788  		if (error == 0)
4789  			error = copyout(&fh, uap->fhp, sizeof(fh));
4790  	}
4791  	return (error);
4792  }
4793  
4794  /*
4795   * fhopen_args(const struct fhandle *u_fhp, int flags)
4796   *
4797   * syscall for the rpc.lockd to use to translate a NFS file handle into
4798   * an open descriptor.
4799   *
4800   * WARNING: Do not remove the caps_priv_check() call or this becomes
4801   *	    one giant security hole.
4802   */
4803  int
4804  sys_fhopen(struct sysmsg *sysmsg, const struct fhopen_args *uap)
4805  {
4806  	struct thread *td = curthread;
4807  	struct filedesc *fdp = td->td_proc->p_fd;
4808  	struct mount *mp;
4809  	struct vnode *vp;
4810  	struct fhandle fhp;
4811  	struct vattr vat;
4812  	struct vattr *vap = &vat;
4813  	struct flock lf;
4814  	int fmode, mode, error = 0, type;
4815  	struct file *nfp;
4816  	struct file *fp;
4817  	int indx;
4818  
4819  	/*
4820  	 * Must be super user
4821  	 */
4822  	error = caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT);
4823  	if (error)
4824  		return (error);
4825  
4826  	fmode = FFLAGS(uap->flags);
4827  
4828  	/*
4829  	 * Why not allow a non-read/write open for our lockd?
4830  	 */
4831  	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4832  		return (EINVAL);
4833  	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4834  	if (error)
4835  		return(error);
4836  
4837  	/*
4838  	 * Find the mount point
4839  	 */
4840  	mp = vfs_getvfs(&fhp.fh_fsid);
4841  	if (mp == NULL) {
4842  		error = ESTALE;
4843  		goto done2;
4844  	}
4845  	/* now give me my vnode, it gets returned to me locked */
4846  	error = VFS_FHTOVP(mp, NULL, &fhp.fh_fid, &vp);
4847  	if (error)
4848  		goto done;
4849   	/*
4850  	 * from now on we have to make sure not
4851  	 * to forget about the vnode
4852  	 * any error that causes an abort must vput(vp)
4853  	 * just set error = err and 'goto bad;'.
4854  	 */
4855  
4856  	/*
4857  	 * from vn_open
4858  	 */
4859  	if (vp->v_type == VLNK) {
4860  		error = EMLINK;
4861  		goto bad;
4862  	}
4863  	if (vp->v_type == VSOCK) {
4864  		error = EOPNOTSUPP;
4865  		goto bad;
4866  	}
4867  	mode = 0;
4868  	if (fmode & (FWRITE | O_TRUNC)) {
4869  		if (vp->v_type == VDIR) {
4870  			error = EISDIR;
4871  			goto bad;
4872  		}
4873  		error = vn_writechk(vp);
4874  		if (error)
4875  			goto bad;
4876  		mode |= VWRITE;
4877  	}
4878  	if (fmode & FREAD)
4879  		mode |= VREAD;
4880  	if (mode) {
4881  		error = VOP_ACCESS(vp, mode, td->td_ucred);
4882  		if (error)
4883  			goto bad;
4884  	}
4885  	if (fmode & O_TRUNC) {
4886  		vn_unlock(vp);				/* XXX */
4887  		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
4888  		VATTR_NULL(vap);
4889  		vap->va_size = 0;
4890  		error = VOP_SETATTR(vp, vap, td->td_ucred);
4891  		if (error)
4892  			goto bad;
4893  	}
4894  
4895  	/*
4896  	 * VOP_OPEN needs the file pointer so it can potentially override
4897  	 * it.
4898  	 *
4899  	 * WARNING! no f_nchandle will be associated when fhopen()ing a
4900  	 * directory.  XXX
4901  	 */
4902  	if ((error = falloc(td->td_lwp, &nfp, &indx)) != 0)
4903  		goto bad;
4904  	error = VOP_OPEN(vp, fmode, td->td_ucred, &nfp);
4905  	fp = nfp;
4906  
4907  	if (error) {
4908  		/*
4909  		 * setting f_ops this way prevents VOP_CLOSE from being
4910  		 * called or fdrop() releasing the vp from v_data.   Since
4911  		 * the VOP_OPEN failed we don't want to VOP_CLOSE.
4912  		 */
4913  		fp->f_ops = &badfileops;
4914  		fp->f_data = NULL;
4915  		goto bad_drop;
4916  	}
4917  
4918  	/*
4919  	 * The fp is given its own reference, we still have our ref and lock.
4920  	 *
4921  	 * Assert that all regular files must be created with a VM object.
4922  	 */
4923  	if (vp->v_type == VREG && vp->v_object == NULL) {
4924  		kprintf("fhopen: regular file did not "
4925  			"have VM object: %p\n",
4926  			vp);
4927  		goto bad_drop;
4928  	}
4929  
4930  	/*
4931  	 * The open was successful.  Handle any locking requirements.
4932  	 */
4933  	if (fmode & (O_EXLOCK | O_SHLOCK)) {
4934  		lf.l_whence = SEEK_SET;
4935  		lf.l_start = 0;
4936  		lf.l_len = 0;
4937  		if (fmode & O_EXLOCK)
4938  			lf.l_type = F_WRLCK;
4939  		else
4940  			lf.l_type = F_RDLCK;
4941  		if (fmode & FNONBLOCK)
4942  			type = 0;
4943  		else
4944  			type = F_WAIT;
4945  		vn_unlock(vp);
4946  		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK,
4947  					 &lf, type)) != 0) {
4948  			/*
4949  			 * release our private reference.
4950  			 */
4951  			fsetfd(fdp, NULL, indx);
4952  			fdrop(fp);
4953  			vrele(vp);
4954  			goto done;
4955  		}
4956  		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4957  		atomic_set_int(&fp->f_flag, FHASLOCK);	/* race ok */
4958  	}
4959  
4960  	/*
4961  	 * Clean up.  Associate the file pointer with the previously
4962  	 * reserved descriptor and return it.
4963  	 */
4964  	vput(vp);
4965  	if (uap->flags & O_CLOEXEC)
4966  		fdp->fd_files[indx].fileflags |= UF_EXCLOSE;
4967  	fsetfd(fdp, fp, indx);
4968  	fdrop(fp);
4969  	sysmsg->sysmsg_result = indx;
4970  	mount_drop(mp);
4971  
4972  	return (error);
4973  
4974  bad_drop:
4975  	fsetfd(fdp, NULL, indx);
4976  	fdrop(fp);
4977  bad:
4978  	vput(vp);
4979  done:
4980  	mount_drop(mp);
4981  done2:
4982  	return (error);
4983  }
4984  
4985  /*
4986   * fhstat_args(struct fhandle *u_fhp, struct stat *sb)
4987   */
4988  int
4989  sys_fhstat(struct sysmsg *sysmsg, const struct fhstat_args *uap)
4990  {
4991  	struct thread *td = curthread;
4992  	struct stat sb;
4993  	fhandle_t fh;
4994  	struct mount *mp;
4995  	struct vnode *vp;
4996  	int error;
4997  
4998  	/*
4999  	 * Must be super user
5000  	 */
5001  	error = caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT);
5002  	if (error)
5003  		return (error);
5004  
5005  	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
5006  	if (error)
5007  		return (error);
5008  
5009  	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
5010  		error = ESTALE;
5011  	if (error == 0) {
5012  		if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)) == 0) {
5013  			error = vn_stat(vp, &sb, td->td_ucred);
5014  			vput(vp);
5015  		}
5016  	}
5017  	if (error == 0)
5018  		error = copyout(&sb, uap->sb, sizeof(sb));
5019  	if (mp)
5020  		mount_drop(mp);
5021  
5022  	return (error);
5023  }
5024  
5025  /*
5026   * fhstatfs_args(struct fhandle *u_fhp, struct statfs *buf)
5027   */
5028  int
5029  sys_fhstatfs(struct sysmsg *sysmsg, const struct fhstatfs_args *uap)
5030  {
5031  	struct thread *td = curthread;
5032  	struct proc *p = td->td_proc;
5033  	struct statfs *sp;
5034  	struct mount *mp;
5035  	struct vnode *vp;
5036  	struct statfs sb;
5037  	char *fullpath, *freepath;
5038  	fhandle_t fh;
5039  	int error;
5040  
5041  	/*
5042  	 * Must be super user
5043  	 */
5044  	error = caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT);
5045  	if (error)
5046  		return (error);
5047  
5048  	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
5049  		return (error);
5050  
5051  	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
5052  		error = ESTALE;
5053  		goto done;
5054  	}
5055  	if (p != NULL && !chroot_visible_mnt(mp, p)) {
5056  		error = ESTALE;
5057  		goto done;
5058  	}
5059  
5060  	if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)) != 0)
5061  		goto done;
5062  	mp = vp->v_mount;
5063  	sp = &mp->mnt_stat;
5064  	vput(vp);
5065  	if ((error = VFS_STATFS(mp, sp, td->td_ucred)) != 0)
5066  		goto done;
5067  
5068  	error = mount_path(p, mp, &fullpath, &freepath);
5069  	if (error)
5070  		goto done;
5071  	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
5072  	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
5073  	kfree(freepath, M_TEMP);
5074  
5075  	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
5076  	if (caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT)) {
5077  		bcopy(sp, &sb, sizeof(sb));
5078  		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
5079  		sp = &sb;
5080  	}
5081  	error = copyout(sp, uap->buf, sizeof(*sp));
5082  done:
5083  	if (mp)
5084  		mount_drop(mp);
5085  
5086  	return (error);
5087  }
5088  
5089  /*
5090   * fhstatvfs_args(struct fhandle *u_fhp, struct statvfs *buf)
5091   */
5092  int
5093  sys_fhstatvfs(struct sysmsg *sysmsg, const struct fhstatvfs_args *uap)
5094  {
5095  	struct thread *td = curthread;
5096  	struct proc *p = td->td_proc;
5097  	struct statvfs *sp;
5098  	struct mount *mp;
5099  	struct vnode *vp;
5100  	fhandle_t fh;
5101  	int error;
5102  
5103  	/*
5104  	 * Must be super user
5105  	 */
5106  	if ((error = caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT)))
5107  		return (error);
5108  
5109  	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
5110  		return (error);
5111  
5112  	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
5113  		error = ESTALE;
5114  		goto done;
5115  	}
5116  	if (p != NULL && !chroot_visible_mnt(mp, p)) {
5117  		error = ESTALE;
5118  		goto done;
5119  	}
5120  
5121  	if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)))
5122  		goto done;
5123  	mp = vp->v_mount;
5124  	sp = &mp->mnt_vstat;
5125  	vput(vp);
5126  	if ((error = VFS_STATVFS(mp, sp, td->td_ucred)) != 0)
5127  		goto done;
5128  
5129  	sp->f_flag = 0;
5130  	if (mp->mnt_flag & MNT_RDONLY)
5131  		sp->f_flag |= ST_RDONLY;
5132  	if (mp->mnt_flag & MNT_NOSUID)
5133  		sp->f_flag |= ST_NOSUID;
5134  	error = copyout(sp, uap->buf, sizeof(*sp));
5135  done:
5136  	if (mp)
5137  		mount_drop(mp);
5138  	return (error);
5139  }
5140  
5141  
5142  /*
5143   * Syscall to push extended attribute configuration information into the
5144   * VFS.  Accepts a path, which it converts to a mountpoint, as well as
5145   * a command (int cmd), and attribute name and misc data.  For now, the
5146   * attribute name is left in userspace for consumption by the VFS_op.
5147   * It will probably be changed to be copied into sysspace by the
5148   * syscall in the future, once issues with various consumers of the
5149   * attribute code have raised their hands.
5150   *
5151   * Currently this is used only by UFS Extended Attributes.
5152   */
5153  int
5154  sys_extattrctl(struct sysmsg *sysmsg, const struct extattrctl_args *uap)
5155  {
5156  	struct nlookupdata nd;
5157  	struct vnode *vp;
5158  	char attrname[EXTATTR_MAXNAMELEN];
5159  	int error;
5160  	size_t size;
5161  
5162  	attrname[0] = 0;
5163  	vp = NULL;
5164  	error = 0;
5165  
5166  	if (error == 0 && uap->filename) {
5167  		error = nlookup_init(&nd, uap->filename, UIO_USERSPACE,
5168  				     NLC_FOLLOW);
5169  		if (error == 0)
5170  			error = nlookup(&nd);
5171  		if (error == 0)
5172  			error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
5173  		nlookup_done(&nd);
5174  	}
5175  
5176  	if (error == 0 && uap->attrname) {
5177  		error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
5178  				  &size);
5179  	}
5180  
5181  	if (error == 0) {
5182  		error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
5183  		if (error == 0)
5184  			error = nlookup(&nd);
5185  		if (error == 0)
5186  			error = ncp_writechk(&nd.nl_nch);
5187  		if (error == 0) {
5188  			error = VFS_EXTATTRCTL(nd.nl_nch.mount, uap->cmd, vp,
5189  					       uap->attrnamespace,
5190  					       uap->attrname, nd.nl_cred);
5191  		}
5192  		nlookup_done(&nd);
5193  	}
5194  
5195  	return (error);
5196  }
5197  
5198  /*
5199   * Syscall to get a named extended attribute on a file or directory.
5200   */
5201  int
5202  sys_extattr_set_file(struct sysmsg *sysmsg,
5203  		     const struct extattr_set_file_args *uap)
5204  {
5205  	char attrname[EXTATTR_MAXNAMELEN];
5206  	struct nlookupdata nd;
5207  	struct vnode *vp;
5208  	struct uio auio;
5209  	struct iovec aiov;
5210  	int error;
5211  
5212  	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
5213  	if (error)
5214  		return (error);
5215  
5216  	vp = NULL;
5217  
5218  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
5219  	if (error == 0)
5220  		error = nlookup(&nd);
5221  	if (error == 0)
5222  		error = ncp_writechk(&nd.nl_nch);
5223  	if (error == 0)
5224  		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
5225  	if (error) {
5226  		nlookup_done(&nd);
5227  		return (error);
5228  	}
5229  
5230  	bzero(&auio, sizeof(auio));
5231  	aiov.iov_base = uap->data;
5232  	aiov.iov_len = uap->nbytes;
5233  	auio.uio_iov = &aiov;
5234  	auio.uio_iovcnt = 1;
5235  	auio.uio_offset = 0;
5236  	auio.uio_resid = uap->nbytes;
5237  	auio.uio_rw = UIO_WRITE;
5238  	auio.uio_td = curthread;
5239  
5240  	error = VOP_SETEXTATTR(vp, uap->attrnamespace, attrname,
5241  			       &auio, nd.nl_cred);
5242  
5243  	vput(vp);
5244  	nlookup_done(&nd);
5245  	return (error);
5246  }
5247  
5248  /*
5249   * Syscall to get a named extended attribute on a file or directory.
5250   */
5251  int
5252  sys_extattr_get_file(struct sysmsg *sysmsg,
5253  		     const struct extattr_get_file_args *uap)
5254  {
5255  	char attrname[EXTATTR_MAXNAMELEN];
5256  	struct nlookupdata nd;
5257  	struct uio auio;
5258  	struct iovec aiov;
5259  	struct vnode *vp;
5260  	int error;
5261  
5262  	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
5263  	if (error)
5264  		return (error);
5265  
5266  	vp = NULL;
5267  
5268  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
5269  	if (error == 0)
5270  		error = nlookup(&nd);
5271  	if (error == 0)
5272  		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_SHARED, &vp);
5273  	if (error) {
5274  		nlookup_done(&nd);
5275  		return (error);
5276  	}
5277  
5278  	bzero(&auio, sizeof(auio));
5279  	aiov.iov_base = uap->data;
5280  	aiov.iov_len = uap->nbytes;
5281  	auio.uio_iov = &aiov;
5282  	auio.uio_iovcnt = 1;
5283  	auio.uio_offset = 0;
5284  	auio.uio_resid = uap->nbytes;
5285  	auio.uio_rw = UIO_READ;
5286  	auio.uio_td = curthread;
5287  
5288  	error = VOP_GETEXTATTR(vp, uap->attrnamespace, attrname,
5289  				&auio, nd.nl_cred);
5290  	sysmsg->sysmsg_result = uap->nbytes - auio.uio_resid;
5291  
5292  	vput(vp);
5293  	nlookup_done(&nd);
5294  	return(error);
5295  }
5296  
5297  /*
5298   * Syscall to delete a named extended attribute from a file or directory.
5299   * Accepts attribute name.  The real work happens in VOP_SETEXTATTR().
5300   */
5301  int
5302  sys_extattr_delete_file(struct sysmsg *sysmsg,
5303  			const struct extattr_delete_file_args *uap)
5304  {
5305  	char attrname[EXTATTR_MAXNAMELEN];
5306  	struct nlookupdata nd;
5307  	struct vnode *vp;
5308  	int error;
5309  
5310  	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
5311  	if (error)
5312  		return(error);
5313  
5314  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
5315  	if (error == 0)
5316  		error = nlookup(&nd);
5317  	if (error == 0)
5318  		error = ncp_writechk(&nd.nl_nch);
5319  	if (error == 0) {
5320  		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
5321  		if (error == 0) {
5322  			error = VOP_SETEXTATTR(vp, uap->attrnamespace,
5323  					       attrname, NULL, nd.nl_cred);
5324  			vput(vp);
5325  		}
5326  	}
5327  	nlookup_done(&nd);
5328  	return(error);
5329  }
5330  
5331  /*
5332   * Determine if the mount is visible to the process.
5333   */
5334  static int
5335  chroot_visible_mnt(struct mount *mp, struct proc *p)
5336  {
5337  	struct nchandle nch;
5338  
5339  	/*
5340  	 * Traverse from the mount point upwards.  If we hit the process
5341  	 * root then the mount point is visible to the process.
5342  	 */
5343  	nch = mp->mnt_ncmountpt;
5344  	while (nch.ncp) {
5345  		if (nch.mount == p->p_fd->fd_nrdir.mount &&
5346  		    nch.ncp == p->p_fd->fd_nrdir.ncp) {
5347  			return(1);
5348  		}
5349  		if (nch.ncp == nch.mount->mnt_ncmountpt.ncp) {
5350  			nch = nch.mount->mnt_ncmounton;
5351  		} else {
5352  			nch.ncp = nch.ncp->nc_parent;
5353  		}
5354  	}
5355  
5356  	/*
5357  	 * If the mount point is not visible to the process, but the
5358  	 * process root is in a subdirectory of the mount, return
5359  	 * TRUE anyway.
5360  	 */
5361  	if (p->p_fd->fd_nrdir.mount == mp)
5362  		return(1);
5363  
5364  	return(0);
5365  }
5366  
5367  /*
5368   * Return the appropriate system capability restriction.
5369   */
5370  static int
5371  get_fscap(const char *fsname)
5372  {
5373  
5374  	if (strncmp("null", fsname, 5) == 0) {
5375  		return SYSCAP_NOMOUNT_NULLFS;
5376  	} else if (strncmp(fsname, "devfs", 6) == 0) {
5377  		return SYSCAP_NOMOUNT_DEVFS;
5378  	} else if (strncmp(fsname, "procfs", 7) == 0) {
5379  		return SYSCAP_NOMOUNT_PROCFS;
5380  	} else if (strncmp(fsname, "tmpfs", 6) == 0) {
5381  		return SYSCAP_NOMOUNT_TMPFS;
5382  	} else if (strncmp(fsname, "fusefs", 7) == 0) {
5383  		return SYSCAP_NOMOUNT_FUSE;
5384  	}
5385  	return SYSCAP_RESTRICTEDROOT;
5386  }
5387  
5388  int
5389  sys___realpath(struct sysmsg *sysmsg, const struct __realpath_args *uap)
5390  {
5391  	struct nlookupdata nd;
5392  	char *rbuf;
5393  	char *fbuf;
5394  	ssize_t rlen;
5395  	int error;
5396  
5397  	/*
5398  	 * Invalid length if less than 0.  0 is allowed
5399  	 */
5400  	if ((ssize_t)uap->len < 0)
5401  		return EINVAL;
5402  
5403  	rbuf = NULL;
5404  	fbuf = NULL;
5405  	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
5406  	if (error)
5407  		goto done;
5408  
5409  	nd.nl_flags |= NLC_SHAREDLOCK;
5410  	error = nlookup(&nd);
5411  	if (error)
5412  		goto done;
5413  
5414  	if (nd.nl_nch.ncp->nc_vp == NULL) {
5415  		error = ENOENT;
5416  		goto done;
5417  	}
5418  
5419  	/*
5420  	 * Shortcut test for existence.
5421  	 */
5422  	if (uap->len == 0) {
5423  		error = ENAMETOOLONG;
5424  		goto done;
5425  	}
5426  
5427  	/*
5428  	 * Obtain the path relative to the process root.  The nch must not
5429  	 * be locked for the cache_fullpath() call.
5430  	 */
5431  	if (nd.nl_flags & NLC_NCPISLOCKED) {
5432  		nd.nl_flags &= ~NLC_NCPISLOCKED;
5433  		cache_unlock(&nd.nl_nch);
5434  	}
5435  	error = cache_fullpath(curproc, &nd.nl_nch, NULL, &rbuf, &fbuf, 0);
5436  	if (error)
5437  		goto done;
5438  
5439  	rlen = (ssize_t)strlen(rbuf);
5440  	if (rlen >= uap->len) {
5441  		error = ENAMETOOLONG;
5442  		goto done;
5443  	}
5444  	error = copyout(rbuf, uap->buf, rlen + 1);
5445  	if (error == 0)
5446  		sysmsg->sysmsg_szresult = rlen;
5447  done:
5448  	nlookup_done(&nd);
5449  	if (fbuf)
5450  		kfree(fbuf, M_TEMP);
5451  
5452  	return error;
5453  }
5454  
5455  int
5456  sys_posix_fallocate(struct sysmsg *sysmsg, const struct posix_fallocate_args *uap)
5457  {
5458  	return (kern_posix_fallocate(uap->fd, uap->offset, uap->len));
5459  }
5460  
5461  int
5462  kern_posix_fallocate(int fd, off_t offset, off_t len)
5463  {
5464  	struct thread *td = curthread;
5465  	struct vnode *vp;
5466  	struct file *fp;
5467  	int error;
5468  
5469  	if (offset < 0 || len <= 0)
5470  		return (EINVAL);
5471  	/* Check for wrap. */
5472  	if (offset > OFF_MAX - len)
5473  		return (EFBIG);
5474  
5475  	fp = holdfp(td, fd, -1);
5476  	if (fp == NULL)
5477  		return (EBADF);
5478  
5479  	switch (fp->f_type) {
5480  	case DTYPE_VNODE:
5481  		break;
5482  	case DTYPE_PIPE:
5483  	case DTYPE_FIFO:
5484  		error = ESPIPE;
5485  		goto out;
5486  	default:
5487  		error = ENODEV;
5488  		goto out;
5489  	}
5490  
5491  	if ((fp->f_flag & FWRITE) == 0) {
5492  		error = EBADF;
5493  		goto out;
5494  	}
5495  
5496  	vp = fp->f_data;
5497  	if (vp->v_type != VREG) {
5498  		error = ENODEV;
5499  		goto out;
5500  	}
5501  
5502  	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
5503  	error = VOP_ALLOCATE(vp, offset, len);
5504  	vn_unlock(vp);
5505  out:
5506  	dropfp(td, fd, fp);
5507  	return (error);
5508  }
5509