xref: /netbsd-src/sys/miscfs/procfs/procfs_vnops.c (revision 3f351f34c6d827cf017cdcff3543f6ec0c88b420)
1 /*	$NetBSD: procfs_vnops.c,v 1.230 2024/01/17 10:19:21 hannken Exp $	*/
2 
3 /*-
4  * Copyright (c) 2006, 2007, 2008, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1993, 1995
34  *	The Regents of the University of California.  All rights reserved.
35  *
36  * This code is derived from software contributed to Berkeley by
37  * Jan-Simon Pendry.
38  *
39  * Redistribution and use in source and binary forms, with or without
40  * modification, are permitted provided that the following conditions
41  * are met:
42  * 1. Redistributions of source code must retain the above copyright
43  *    notice, this list of conditions and the following disclaimer.
44  * 2. Redistributions in binary form must reproduce the above copyright
45  *    notice, this list of conditions and the following disclaimer in the
46  *    documentation and/or other materials provided with the distribution.
47  * 3. Neither the name of the University nor the names of its contributors
48  *    may be used to endorse or promote products derived from this software
49  *    without specific prior written permission.
50  *
51  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61  * SUCH DAMAGE.
62  *
63  *	@(#)procfs_vnops.c	8.18 (Berkeley) 5/21/95
64  */
65 
66 /*
67  * Copyright (c) 1993 Jan-Simon Pendry
68  *
69  * This code is derived from software contributed to Berkeley by
70  * Jan-Simon Pendry.
71  *
72  * Redistribution and use in source and binary forms, with or without
73  * modification, are permitted provided that the following conditions
74  * are met:
75  * 1. Redistributions of source code must retain the above copyright
76  *    notice, this list of conditions and the following disclaimer.
77  * 2. Redistributions in binary form must reproduce the above copyright
78  *    notice, this list of conditions and the following disclaimer in the
79  *    documentation and/or other materials provided with the distribution.
80  * 3. All advertising materials mentioning features or use of this software
81  *    must display the following acknowledgement:
82  *	This product includes software developed by the University of
83  *	California, Berkeley and its contributors.
84  * 4. Neither the name of the University nor the names of its contributors
85  *    may be used to endorse or promote products derived from this software
86  *    without specific prior written permission.
87  *
88  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
89  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
90  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
91  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
92  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
93  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
94  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
95  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
96  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
97  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
98  * SUCH DAMAGE.
99  *
100  *	@(#)procfs_vnops.c	8.18 (Berkeley) 5/21/95
101  */
102 
103 /*
104  * procfs vnode interface
105  */
106 
107 #include <sys/cdefs.h>
108 __KERNEL_RCSID(0, "$NetBSD: procfs_vnops.c,v 1.230 2024/01/17 10:19:21 hannken Exp $");
109 
110 #include <sys/param.h>
111 #include <sys/atomic.h>
112 #include <sys/systm.h>
113 #include <sys/time.h>
114 #include <sys/kernel.h>
115 #include <sys/file.h>
116 #include <sys/filedesc.h>
117 #include <sys/proc.h>
118 #include <sys/vnode.h>
119 #include <sys/namei.h>
120 #include <sys/malloc.h>
121 #include <sys/mount.h>
122 #include <sys/dirent.h>
123 #include <sys/resourcevar.h>
124 #include <sys/stat.h>
125 #include <sys/ptrace.h>
126 #include <sys/kauth.h>
127 #include <sys/exec.h>
128 
129 #include <uvm/uvm_extern.h>	/* for PAGE_SIZE */
130 
131 #include <machine/reg.h>
132 
133 #include <miscfs/genfs/genfs.h>
134 #include <miscfs/procfs/procfs.h>
135 
136 /*
137  * Vnode Operations.
138  *
139  */
140 
141 static int procfs_validfile_linux(struct lwp *, struct mount *);
142 static int procfs_root_readdir_callback(struct proc *, void *);
143 static void procfs_dir(pfstype, struct lwp *, struct proc *, char **, char *,
144     size_t);
145 
146 /*
147  * This is a list of the valid names in the
148  * process-specific sub-directories.  It is
149  * used in procfs_lookup and procfs_readdir
150  */
151 static const struct proc_target {
152 	u_char	pt_type;
153 	u_char	pt_namlen;
154 	const char	*pt_name;
155 	pfstype	pt_pfstype;
156 	int	(*pt_valid)(struct lwp *, struct mount *);
157 } proc_targets[] = {
158 #define N(s) sizeof(s)-1, s
159 	/*	  name		type		validp */
160 	{ DT_DIR, N("."),	PFSproc,	NULL },
161 	{ DT_DIR, N(".."),	PFSroot,	NULL },
162 	{ DT_DIR, N("fd"),	PFSfd,		NULL },
163 	{ DT_DIR, N("task"),	PFStask,	procfs_validfile_linux },
164 	{ DT_LNK, N("cwd"),	PFScwd,		NULL },
165 	{ DT_REG, N("emul"),	PFSemul,	NULL },
166 	{ DT_LNK, N("root"),	PFSchroot,	NULL },
167 	{ DT_REG, N("auxv"),	PFSauxv,	procfs_validauxv },
168 	{ DT_REG, N("cmdline"), PFScmdline,	NULL },
169 	{ DT_REG, N("environ"), PFSenviron,	NULL },
170 	{ DT_LNK, N("exe"),	PFSexe,		procfs_validfile },
171 	{ DT_REG, N("file"),	PFSfile,	procfs_validfile },
172 	{ DT_REG, N("fpregs"),	PFSfpregs,	procfs_validfpregs },
173 	{ DT_REG, N("limit"),	PFSlimit,	NULL },
174 	{ DT_REG, N("map"),	PFSmap,		procfs_validmap },
175 	{ DT_REG, N("maps"),	PFSmaps,	procfs_validmap },
176 	{ DT_REG, N("mem"),	PFSmem,		NULL },
177 	{ DT_REG, N("note"),	PFSnote,	NULL },
178 	{ DT_REG, N("notepg"),	PFSnotepg,	NULL },
179 	{ DT_REG, N("regs"),	PFSregs,	procfs_validregs },
180 	{ DT_REG, N("stat"),	PFSstat,	procfs_validfile_linux },
181 	{ DT_REG, N("statm"),	PFSstatm,	procfs_validfile_linux },
182 	{ DT_REG, N("status"),	PFSstatus,	NULL },
183 #ifdef __HAVE_PROCFS_MACHDEP
184 	PROCFS_MACHDEP_NODETYPE_DEFNS
185 #endif
186 #undef N
187 };
188 static const int nproc_targets = sizeof(proc_targets) / sizeof(proc_targets[0]);
189 
190 /*
191  * List of files in the root directory. Note: the validate function will
192  * be called with p == NULL for these ones.
193  */
194 static const struct proc_target proc_root_targets[] = {
195 #define N(s) sizeof(s)-1, s
196 	/*	  name		    type	    validp */
197 	{ DT_REG, N("meminfo"),     PFSmeminfo,        procfs_validfile_linux },
198 	{ DT_REG, N("cpuinfo"),     PFScpuinfo,        procfs_validfile_linux },
199 	{ DT_REG, N("uptime"),      PFSuptime,         procfs_validfile_linux },
200 	{ DT_REG, N("mounts"),	    PFSmounts,	       procfs_validfile_linux },
201 	{ DT_REG, N("devices"),     PFSdevices,        procfs_validfile_linux },
202 	{ DT_REG, N("stat"),	    PFScpustat,        procfs_validfile_linux },
203 	{ DT_REG, N("loadavg"),	    PFSloadavg,        procfs_validfile_linux },
204 	{ DT_REG, N("version"),     PFSversion,        procfs_validfile_linux },
205 #undef N
206 };
207 static const int nproc_root_targets =
208     sizeof(proc_root_targets) / sizeof(proc_root_targets[0]);
209 
210 int	procfs_lookup(void *);
211 int	procfs_open(void *);
212 int	procfs_close(void *);
213 int	procfs_access(void *);
214 int	procfs_getattr(void *);
215 int	procfs_setattr(void *);
216 int	procfs_readdir(void *);
217 int	procfs_readlink(void *);
218 int	procfs_inactive(void *);
219 int	procfs_reclaim(void *);
220 int	procfs_print(void *);
221 int	procfs_pathconf(void *);
222 int	procfs_getpages(void *);
223 
224 static uint8_t fttodt(file_t *);
225 static int atoi(const char *, size_t);
226 
/*
 * procfs vnode operations.
 *
 * procfs_vnodeop_entries[] pairs each vnode operation descriptor with the
 * function that implements it for procfs.  Operations that procfs handles
 * itself point at the procfs_*() routines; the remaining ones are routed
 * to generic genfs_*() handlers.  The first entry supplies the default
 * handler (vn_default_error) and the table is terminated by a
 * { NULL, NULL } entry.
 *
 * procfs_vnodeop_opv_desc ties the table to procfs_vnodeop_p, the pointer
 * to the operations vector.
 */
int (**procfs_vnodeop_p)(void *);
const struct vnodeopv_entry_desc procfs_vnodeop_entries[] = {
	{ &vop_default_desc, vn_default_error },
	{ &vop_parsepath_desc, genfs_parsepath },	/* parsepath */
	{ &vop_lookup_desc, procfs_lookup },		/* lookup */
	{ &vop_create_desc, genfs_eopnotsupp },		/* create */
	{ &vop_mknod_desc, genfs_eopnotsupp },		/* mknod */
	{ &vop_open_desc, procfs_open },		/* open */
	{ &vop_close_desc, procfs_close },		/* close */
	{ &vop_access_desc, procfs_access },		/* access */
	{ &vop_accessx_desc, genfs_accessx },		/* accessx */
	{ &vop_getattr_desc, procfs_getattr },		/* getattr */
	{ &vop_setattr_desc, procfs_setattr },		/* setattr */
	/* read and write share one implementation */
	{ &vop_read_desc, procfs_rw },			/* read */
	{ &vop_write_desc, procfs_rw },			/* write */
	{ &vop_fallocate_desc, genfs_eopnotsupp },	/* fallocate */
	{ &vop_fdiscard_desc, genfs_eopnotsupp },	/* fdiscard */
	{ &vop_fcntl_desc, genfs_fcntl },		/* fcntl */
	{ &vop_ioctl_desc, genfs_enoioctl },		/* ioctl */
	{ &vop_poll_desc, genfs_poll },			/* poll */
	{ &vop_kqfilter_desc, genfs_kqfilter },		/* kqfilter */
	{ &vop_revoke_desc, genfs_revoke },		/* revoke */
	{ &vop_fsync_desc, genfs_nullop },		/* fsync */
	{ &vop_seek_desc, genfs_nullop },		/* seek */
	{ &vop_remove_desc, genfs_eopnotsupp },		/* remove */
	{ &vop_link_desc, genfs_erofs_link },		/* link */
	{ &vop_rename_desc, genfs_eopnotsupp },		/* rename */
	{ &vop_mkdir_desc, genfs_eopnotsupp },		/* mkdir */
	{ &vop_rmdir_desc, genfs_eopnotsupp },		/* rmdir */
	{ &vop_symlink_desc, genfs_erofs_symlink },	/* symlink */
	{ &vop_readdir_desc, procfs_readdir },		/* readdir */
	{ &vop_readlink_desc, procfs_readlink },	/* readlink */
	{ &vop_abortop_desc, genfs_abortop },		/* abortop */
	{ &vop_inactive_desc, procfs_inactive },	/* inactive */
	{ &vop_reclaim_desc, procfs_reclaim },		/* reclaim */
	{ &vop_lock_desc, genfs_lock },			/* lock */
	{ &vop_unlock_desc, genfs_unlock },		/* unlock */
	{ &vop_bmap_desc, genfs_eopnotsupp },		/* bmap */
	{ &vop_strategy_desc, genfs_badop },		/* strategy */
	{ &vop_print_desc, procfs_print },		/* print */
	{ &vop_islocked_desc, genfs_islocked },		/* islocked */
	{ &vop_pathconf_desc, procfs_pathconf },	/* pathconf */
	{ &vop_advlock_desc, genfs_einval },		/* advlock */
	{ &vop_getpages_desc, procfs_getpages },	/* getpages */
	{ &vop_putpages_desc, genfs_null_putpages },	/* putpages */
	{ NULL, NULL }					/* end of table */
};
const struct vnodeopv_desc procfs_vnodeop_opv_desc =
	{ &procfs_vnodeop_p, procfs_vnodeop_entries };
279 /*
280  * set things up for doing i/o on
281  * the pfsnode (vp).  (vp) is locked
282  * on entry, and should be left locked
283  * on exit.
284  *
285  * for procfs we don't need to do anything
286  * in particular for i/o.  all that is done
287  * is to support exclusive open on process
288  * memory images.
289  */
290 int
291 procfs_open(void *v)
292 {
293 	struct vop_open_args /* {
294 		struct vnode *a_vp;
295 		int  a_mode;
296 		kauth_cred_t a_cred;
297 	} */ *ap = v;
298 	struct vnode *vp = ap->a_vp;
299 	struct pfsnode *pfs = VTOPFS(vp);
300 	struct lwp *l1;
301 	struct proc *p2;
302 	int error;
303 
304 	if ((error =
305 	     procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p2, ENOENT)) != 0)
306 		return error;
307 
308 	l1 = curlwp;				/* tracer */
309 
310 #define	M2K(m)	(((m) & FREAD) && ((m) & FWRITE) ? \
311 		 KAUTH_REQ_PROCESS_PROCFS_RW : \
312 		 (m) & FWRITE ? KAUTH_REQ_PROCESS_PROCFS_WRITE : \
313 		 KAUTH_REQ_PROCESS_PROCFS_READ)
314 
315 	mutex_enter(p2->p_lock);
316 	error = kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_PROCFS,
317 	    p2, pfs, KAUTH_ARG(M2K(ap->a_mode)), NULL);
318 	mutex_exit(p2->p_lock);
319 	if (error) {
320 		procfs_proc_unlock(p2);
321 		return (error);
322 	}
323 
324 #undef M2K
325 
326 	switch (pfs->pfs_type) {
327 	case PFSmem:
328 		if (((pfs->pfs_flags & FWRITE) && (ap->a_mode & O_EXCL)) ||
329 		    ((pfs->pfs_flags & O_EXCL) && (ap->a_mode & FWRITE))) {
330 			error = EBUSY;
331 			break;
332 		}
333 
334 		if (!proc_isunder(p2, l1)) {
335 			error = EPERM;
336 			break;
337 		}
338 
339 		if (ap->a_mode & FWRITE)
340 			pfs->pfs_flags = ap->a_mode & (FWRITE|O_EXCL);
341 
342 		break;
343 
344 	case PFSregs:
345 	case PFSfpregs:
346 		if (!proc_isunder(p2, l1)) {
347 			error = EPERM;
348 			break;
349 		}
350 		break;
351 
352 	default:
353 		break;
354 	}
355 
356 	procfs_proc_unlock(p2);
357 	return (error);
358 }
359 
360 /*
361  * close the pfsnode (vp) after doing i/o.
362  * (vp) is not locked on entry or exit.
363  *
364  * nothing to do for procfs other than undo
365  * any exclusive open flag (see _open above).
366  */
367 int
368 procfs_close(void *v)
369 {
370 	struct vop_close_args /* {
371 		struct vnode *a_vp;
372 		int  a_fflag;
373 		kauth_cred_t a_cred;
374 	} */ *ap = v;
375 	struct pfsnode *pfs = VTOPFS(ap->a_vp);
376 
377 	switch (pfs->pfs_type) {
378 	case PFSmem:
379 		if ((ap->a_fflag & FWRITE) && (pfs->pfs_flags & O_EXCL))
380 			pfs->pfs_flags &= ~(FWRITE|O_EXCL);
381 		break;
382 
383 	default:
384 		break;
385 	}
386 
387 	return (0);
388 }
389 
390 /*
391  * _inactive is called when the pfsnode
392  * is vrele'd and the reference count goes
393  * to zero.  (vp) will be on the vnode free
394  * list, so to get it back vget() must be
395  * used.
396  *
397  * (vp) is locked on entry, but must be unlocked on exit.
398  */
399 int
400 procfs_inactive(void *v)
401 {
402 	struct vop_inactive_v2_args /* {
403 		struct vnode *a_vp;
404 		bool *a_recycle;
405 	} */ *ap = v;
406 	struct vnode *vp = ap->a_vp;
407 	struct pfsnode *pfs = VTOPFS(vp);
408 
409 	mutex_enter(&proc_lock);
410 	*ap->a_recycle = (procfs_proc_find(vp->v_mount, pfs->pfs_pid) == NULL);
411 	mutex_exit(&proc_lock);
412 
413 	return (0);
414 }
415 
416 /*
417  * _reclaim is called when getnewvnode()
418  * wants to make use of an entry on the vnode
419  * free list.  at this time the filesystem needs
420  * to free any private data and remove the node
421  * from any private lists.
422  */
423 int
424 procfs_reclaim(void *v)
425 {
426 	struct vop_reclaim_v2_args /* {
427 		struct vnode *a_vp;
428 	} */ *ap = v;
429 	struct vnode *vp = ap->a_vp;
430 	struct pfsnode *pfs = VTOPFS(vp);
431 
432 	VOP_UNLOCK(vp);
433 
434 	/*
435 	 * To interlock with procfs_revoke_vnodes().
436 	 */
437 	mutex_enter(vp->v_interlock);
438 	vp->v_data = NULL;
439 	mutex_exit(vp->v_interlock);
440 	procfs_hashrem(pfs);
441 	kmem_free(pfs, sizeof(*pfs));
442 	return 0;
443 }
444 
445 /*
446  * Return POSIX pathconf information applicable to special devices.
447  */
448 int
449 procfs_pathconf(void *v)
450 {
451 	struct vop_pathconf_args /* {
452 		struct vnode *a_vp;
453 		int a_name;
454 		register_t *a_retval;
455 	} */ *ap = v;
456 
457 	switch (ap->a_name) {
458 	case _PC_LINK_MAX:
459 		*ap->a_retval = LINK_MAX;
460 		return (0);
461 	case _PC_MAX_CANON:
462 		*ap->a_retval = MAX_CANON;
463 		return (0);
464 	case _PC_MAX_INPUT:
465 		*ap->a_retval = MAX_INPUT;
466 		return (0);
467 	case _PC_PIPE_BUF:
468 		*ap->a_retval = PIPE_BUF;
469 		return (0);
470 	case _PC_CHOWN_RESTRICTED:
471 		*ap->a_retval = 1;
472 		return (0);
473 	case _PC_VDISABLE:
474 		*ap->a_retval = _POSIX_VDISABLE;
475 		return (0);
476 	case _PC_SYNC_IO:
477 		*ap->a_retval = 1;
478 		return (0);
479 	default:
480 		return genfs_pathconf(ap);
481 	}
482 	/* NOTREACHED */
483 }
484 
485 /*
486  * _print is used for debugging.
487  * just print a readable description
488  * of (vp).
489  */
490 int
491 procfs_print(void *v)
492 {
493 	struct vop_print_args /* {
494 		struct vnode *a_vp;
495 	} */ *ap = v;
496 	struct pfsnode *pfs = VTOPFS(ap->a_vp);
497 
498 	printf("tag VT_PROCFS, type %d, pid %d, mode %x, flags %lx\n",
499 	    pfs->pfs_type, pfs->pfs_pid, pfs->pfs_mode, pfs->pfs_flags);
500 	return 0;
501 }
502 
503 /*
504  * Works out the path to the target process's current
505  * working directory or chroot.  If the caller is in a chroot and
506  * can't "reach" the target's cwd or root (or some other error
507  * occurs), a "/" is returned for the path.
508  */
509 static void
510 procfs_dir(pfstype t, struct lwp *caller, struct proc *target, char **bpp,
511     char *path, size_t len)
512 {
513 	struct cwdinfo *cwdi;
514 	struct vnode *vp, *rvp;
515 	char *bp;
516 
517 	/*
518 	 * Lock target cwdi and take a reference to the vnode
519 	 * we are interested in to prevent it from disappearing
520 	 * before getcwd_common() below.
521 	 */
522 	rw_enter(&target->p_cwdi->cwdi_lock, RW_READER);
523 	switch (t) {
524 	case PFScwd:
525 		vp = target->p_cwdi->cwdi_cdir;
526 		break;
527 	case PFSchroot:
528 		vp = target->p_cwdi->cwdi_rdir;
529 		break;
530 	default:
531 		rw_exit(&target->p_cwdi->cwdi_lock);
532 		return;
533 	}
534 	if (vp != NULL)
535 		vref(vp);
536 	rw_exit(&target->p_cwdi->cwdi_lock);
537 
538 	cwdi = caller->l_proc->p_cwdi;
539 	rw_enter(&cwdi->cwdi_lock, RW_READER);
540 
541 	rvp = cwdi->cwdi_rdir;
542 	bp = bpp ? *bpp : NULL;
543 
544 	/*
545 	 * XXX: this horrible kludge avoids locking panics when
546 	 * attempting to lookup links that point to within procfs
547 	 */
548 	if (vp != NULL && vp->v_tag == VT_PROCFS) {
549 		if (bpp) {
550 			*--bp = '/';
551 			*bpp = bp;
552 		}
553 		vrele(vp);
554 		rw_exit(&cwdi->cwdi_lock);
555 		return;
556 	}
557 
558 	if (rvp == NULL)
559 		rvp = rootvnode;
560 	if (vp == NULL || getcwd_common(vp, rvp, bp ? &bp : NULL, path,
561 	    len / 2, 0, caller) != 0) {
562 		if (bpp) {
563 			bp = *bpp;
564 			*--bp = '/';
565 		}
566 	}
567 
568 	if (bpp)
569 		*bpp = bp;
570 
571 	if (vp != NULL)
572 		vrele(vp);
573 	rw_exit(&cwdi->cwdi_lock);
574 }
575 
576 /*
577  * Invent attributes for pfsnode (vp) and store
578  * them in (vap).
579  * Directories lengths are returned as zero since
580  * any real length would require the genuine size
581  * to be computed, and nothing cares anyway.
582  *
583  * this is relatively minimal for procfs.
584  */
585 int
586 procfs_getattr(void *v)
587 {
588 	struct vop_getattr_args /* {
589 		struct vnode *a_vp;
590 		struct vattr *a_vap;
591 		kauth_cred_t a_cred;
592 	} */ *ap = v;
593 	struct vnode *vp = ap->a_vp;
594 	struct pfsnode *pfs = VTOPFS(vp);
595 	struct vattr *vap = ap->a_vap;
596 	struct proc *procp;
597 	char *path, *bp, bf[16];
598 	int error;
599 
600 	/* first check the process still exists */
601 	switch (pfs->pfs_type) {
602 	case PFSroot:
603 	case PFScurproc:
604 	case PFSself:
605 		procp = NULL;
606 		break;
607 
608 	default:
609 		error =
610 		    procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &procp, ENOENT);
611 		if (error != 0)
612 			return (error);
613 		break;
614 	}
615 
616 	switch (pfs->pfs_type) {
617 	case PFStask:
618 		if (pfs->pfs_fd == -1) {
619 			path = NULL;
620 			break;
621 		}
622 		/*FALLTHROUGH*/
623 	case PFScwd:
624 	case PFSchroot:
625 		path = malloc(MAXPATHLEN + 4, M_TEMP, M_WAITOK);
626 		if (path == NULL && procp != NULL) {
627 			procfs_proc_unlock(procp);
628 			return (ENOMEM);
629 		}
630 		break;
631 
632 	default:
633 		path = NULL;
634 		break;
635 	}
636 
637 	if (procp != NULL) {
638 		mutex_enter(procp->p_lock);
639 		error = kauth_authorize_process(kauth_cred_get(),
640 		    KAUTH_PROCESS_CANSEE, procp,
641 		    KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
642 		mutex_exit(procp->p_lock);
643 		if (error != 0) {
644 		    	procfs_proc_unlock(procp);
645 		    	if (path != NULL)
646 		    		free(path, M_TEMP);
647 			return (ENOENT);
648 		}
649 	}
650 
651 	error = 0;
652 
653 	/* start by zeroing out the attributes */
654 	vattr_null(vap);
655 
656 	/* next do all the common fields */
657 	vap->va_type = ap->a_vp->v_type;
658 	vap->va_mode = pfs->pfs_mode;
659 	vap->va_fileid = pfs->pfs_fileno;
660 	vap->va_flags = 0;
661 	vap->va_blocksize = PAGE_SIZE;
662 
663 	/*
664 	 * Make all times be current TOD.
665 	 *
666 	 * It would be possible to get the process start
667 	 * time from the p_stats structure, but there's
668 	 * no "file creation" time stamp anyway, and the
669 	 * p_stats structure is not addressable if u. gets
670 	 * swapped out for that process.
671 	 */
672 	getnanotime(&vap->va_ctime);
673 	vap->va_atime = vap->va_mtime = vap->va_ctime;
674 	if (procp)
675 		TIMEVAL_TO_TIMESPEC(&procp->p_stats->p_start,
676 		    &vap->va_birthtime);
677 	else
678 		getnanotime(&vap->va_birthtime);
679 
680 	switch (pfs->pfs_type) {
681 	case PFSmem:
682 	case PFSregs:
683 	case PFSfpregs:
684 #if defined(__HAVE_PROCFS_MACHDEP) && defined(PROCFS_MACHDEP_PROTECT_CASES)
685 	PROCFS_MACHDEP_PROTECT_CASES
686 #endif
687 		/*
688 		 * If the process has exercised some setuid or setgid
689 		 * privilege, then rip away read/write permission so
690 		 * that only root can gain access.
691 		 */
692 		if (procp->p_flag & PK_SUGID)
693 			vap->va_mode &= ~(S_IRUSR|S_IWUSR);
694 		/* FALLTHROUGH */
695 	case PFSstatus:
696 	case PFSstat:
697 	case PFSnote:
698 	case PFSnotepg:
699 	case PFScmdline:
700 	case PFSenviron:
701 	case PFSemul:
702 	case PFSstatm:
703 
704 	case PFSmap:
705 	case PFSmaps:
706 	case PFSlimit:
707 	case PFSauxv:
708 		vap->va_nlink = 1;
709 		vap->va_uid = kauth_cred_geteuid(procp->p_cred);
710 		vap->va_gid = kauth_cred_getegid(procp->p_cred);
711 		break;
712 	case PFScwd:
713 	case PFSchroot:
714 	case PFSmeminfo:
715 	case PFSdevices:
716 	case PFScpuinfo:
717 	case PFSuptime:
718 	case PFSmounts:
719 	case PFScpustat:
720 	case PFSloadavg:
721 	case PFSversion:
722 	case PFSexe:
723 	case PFSself:
724 	case PFScurproc:
725 	case PFSroot:
726 		vap->va_nlink = 1;
727 		vap->va_uid = vap->va_gid = 0;
728 		break;
729 
730 	case PFSproc:
731 	case PFStask:
732 	case PFSfile:
733 	case PFSfd:
734 		break;
735 
736 	default:
737 		panic("%s: %d/1", __func__, pfs->pfs_type);
738 	}
739 
740 	/*
741 	 * now do the object specific fields
742 	 *
743 	 * The size could be set from struct reg, but it's hardly
744 	 * worth the trouble, and it puts some (potentially) machine
745 	 * dependent data into this machine-independent code.  If it
746 	 * becomes important then this function should break out into
747 	 * a per-file stat function in the corresponding .c file.
748 	 */
749 
750 	switch (pfs->pfs_type) {
751 	case PFSroot:
752 		vap->va_bytes = vap->va_size = DEV_BSIZE;
753 		break;
754 
755 	case PFSself:
756 	case PFScurproc:
757 		vap->va_bytes = vap->va_size =
758 		    snprintf(bf, sizeof(bf), "%ld", (long)curproc->p_pid);
759 		break;
760 	case PFStask:
761 		if (pfs->pfs_fd != -1) {
762 			vap->va_nlink = 1;
763 			vap->va_uid = 0;
764 			vap->va_gid = 0;
765 			vap->va_bytes = vap->va_size =
766 			    snprintf(bf, sizeof(bf), "..");
767 			break;
768 		}
769 		/*FALLTHROUGH*/
770 	case PFSfd:
771 		if (pfs->pfs_fd != -1) {
772 			file_t *fp;
773 
774 			fp = fd_getfile2(procp, pfs->pfs_fd);
775 			if (fp == NULL) {
776 				error = EBADF;
777 				break;
778 			}
779 			vap->va_nlink = 1;
780 			vap->va_uid = kauth_cred_geteuid(fp->f_cred);
781 			vap->va_gid = kauth_cred_getegid(fp->f_cred);
782 			switch (fp->f_type) {
783 			case DTYPE_VNODE:
784 				vap->va_bytes = vap->va_size =
785 				    fp->f_vnode->v_size;
786 				break;
787 			default:
788 				vap->va_bytes = vap->va_size = 0;
789 				break;
790 			}
791 			closef(fp);
792 			break;
793 		}
794 		/*FALLTHROUGH*/
795 	case PFSproc:
796 		vap->va_nlink = 2;
797 		vap->va_uid = kauth_cred_geteuid(procp->p_cred);
798 		vap->va_gid = kauth_cred_getegid(procp->p_cred);
799 		vap->va_bytes = vap->va_size = DEV_BSIZE;
800 		break;
801 
802 	case PFSfile:
803 		error = EOPNOTSUPP;
804 		break;
805 
806 	case PFSmem:
807 		vap->va_bytes = vap->va_size =
808 			ctob(procp->p_vmspace->vm_tsize +
809 				    procp->p_vmspace->vm_dsize +
810 				    procp->p_vmspace->vm_ssize);
811 		break;
812 
813 	case PFSauxv:
814 		vap->va_bytes = vap->va_size = procp->p_execsw->es_arglen;
815 		break;
816 
817 #if defined(PT_GETREGS) || defined(PT_SETREGS)
818 	case PFSregs:
819 		vap->va_bytes = vap->va_size = sizeof(struct reg);
820 		break;
821 #endif
822 
823 #if defined(PT_GETFPREGS) || defined(PT_SETFPREGS)
824 	case PFSfpregs:
825 		vap->va_bytes = vap->va_size = sizeof(struct fpreg);
826 		break;
827 #endif
828 
829 	case PFSstatus:
830 	case PFSstat:
831 	case PFSnote:
832 	case PFSnotepg:
833 	case PFScmdline:
834 	case PFSenviron:
835 	case PFSmeminfo:
836 	case PFSdevices:
837 	case PFScpuinfo:
838 	case PFSuptime:
839 	case PFSmounts:
840 	case PFScpustat:
841 	case PFSloadavg:
842 	case PFSstatm:
843 	case PFSversion:
844 		vap->va_bytes = vap->va_size = 0;
845 		break;
846 	case PFSlimit:
847 	case PFSmap:
848 	case PFSmaps:
849 		/*
850 		 * Advise a larger blocksize for the map files, so that
851 		 * they may be read in one pass.
852 		 */
853 		vap->va_blocksize = 4 * PAGE_SIZE;
854 		vap->va_bytes = vap->va_size = 0;
855 		break;
856 
857 	case PFScwd:
858 	case PFSchroot:
859 		bp = path + MAXPATHLEN;
860 		*--bp = '\0';
861 		procfs_dir(pfs->pfs_type, curlwp, procp, &bp, path,
862 		     MAXPATHLEN);
863 		vap->va_bytes = vap->va_size = strlen(bp);
864 		break;
865 
866 	case PFSexe:
867 		vap->va_bytes = vap->va_size = strlen(procp->p_path);
868 		break;
869 
870 	case PFSemul:
871 		vap->va_bytes = vap->va_size = strlen(procp->p_emul->e_name);
872 		break;
873 
874 #ifdef __HAVE_PROCFS_MACHDEP
875 	PROCFS_MACHDEP_NODETYPE_CASES
876 		error = procfs_machdep_getattr(ap->a_vp, vap, procp);
877 		break;
878 #endif
879 
880 	default:
881 		panic("%s: %d/2", __func__, pfs->pfs_type);
882 	}
883 
884 	if (procp != NULL)
885 		procfs_proc_unlock(procp);
886 	if (path != NULL)
887 		free(path, M_TEMP);
888 
889 	return (error);
890 }
891 
/*
 * Attribute setting is faked: the request is ignored and success is
 * reported.
 *
 * It's not good to generate an error return here, otherwise things
 * like creat() will fail when they try to set the file length to 0.
 * Worse, it would mean that echo $note > /proc/$pid/note fails.
 */
/*ARGSUSED*/
int
procfs_setattr(void *v)
{

	return 0;
}
907 
908 /*
909  * implement access checking.
910  *
911  * actually, the check for super-user is slightly
912  * broken since it will allow read access to write-only
913  * objects.  this doesn't cause any particular trouble
914  * but does mean that the i/o entry points need to check
915  * that the operation really does make sense.
916  */
917 int
918 procfs_access(void *v)
919 {
920 	struct vop_access_args /* {
921 		struct vnode *a_vp;
922 		accmode_t a_accmode;
923 		kauth_cred_t a_cred;
924 	} */ *ap = v;
925 	struct vattr va;
926 	int error;
927 
928 	if ((error = VOP_GETATTR(ap->a_vp, &va, ap->a_cred)) != 0)
929 		return (error);
930 
931 	return kauth_authorize_vnode(ap->a_cred,
932 	    KAUTH_ACCESS_ACTION(ap->a_accmode, ap->a_vp->v_type, va.va_mode),
933 	    ap->a_vp, NULL, genfs_can_access(ap->a_vp, ap->a_cred,
934 	    va.va_uid, va.va_gid, va.va_mode, NULL, ap->a_accmode));
935 }
936 
937 /*
938  * lookup.  this is incredibly complicated in the
939  * general case, however for most pseudo-filesystems
940  * very little needs to be done.
941  *
942  * Locking isn't hard here, just poorly documented.
943  *
944  * If we're looking up ".", just vref the parent & return it.
945  *
946  * If we're looking up "..", unlock the parent, and lock "..". If everything
947  * went ok, and we're on the last component and the caller requested the
948  * parent locked, try to re-lock the parent. We do this to prevent lock
949  * races.
950  *
951  * For anything else, get the needed node. Then unlock the parent if not
952  * the last component or not LOCKPARENT (i.e. if we wouldn't re-lock the
953  * parent in the .. case).
954  *
955  * We try to exit with the parent locked in error cases.
956  */
957 int
958 procfs_lookup(void *v)
959 {
960 	struct vop_lookup_v2_args /* {
961 		struct vnode * a_dvp;
962 		struct vnode ** a_vpp;
963 		struct componentname * a_cnp;
964 	} */ *ap = v;
965 	struct componentname *cnp = ap->a_cnp;
966 	struct vnode **vpp = ap->a_vpp;
967 	struct vnode *dvp = ap->a_dvp;
968 	const char *pname = cnp->cn_nameptr;
969 	const struct proc_target *pt = NULL;
970 	struct vnode *fvp;
971 	pid_t pid, vnpid;
972 	struct pfsnode *pfs;
973 	struct proc *p = NULL;
974 	struct lwp *plwp;
975 	int i, error;
976 	pfstype type;
977 
978 	*vpp = NULL;
979 
980 	if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred)) != 0)
981 		return (error);
982 
983 	if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
984 		return (EROFS);
985 
986 	if (cnp->cn_namelen == 1 && *pname == '.') {
987 		*vpp = dvp;
988 		vref(dvp);
989 		return (0);
990 	}
991 
992 	pfs = VTOPFS(dvp);
993 	switch (pfs->pfs_type) {
994 	case PFSroot:
995 		/*
996 		 * Shouldn't get here with .. in the root node.
997 		 */
998 		if (cnp->cn_flags & ISDOTDOT)
999 			return (EIO);
1000 
1001 		for (i = 0; i < nproc_root_targets; i++) {
1002 			pt = &proc_root_targets[i];
1003 			/*
1004 			 * check for node match.  proc is always NULL here,
1005 			 * so call pt_valid with constant NULL lwp.
1006 			 */
1007 			if (cnp->cn_namelen == pt->pt_namlen &&
1008 			    memcmp(pt->pt_name, pname, cnp->cn_namelen) == 0 &&
1009 			    (pt->pt_valid == NULL ||
1010 			     (*pt->pt_valid)(NULL, dvp->v_mount)))
1011 				break;
1012 		}
1013 
1014 		if (i != nproc_root_targets) {
1015 			error = procfs_allocvp(dvp->v_mount, vpp, 0,
1016 			    pt->pt_pfstype, -1);
1017 			return (error);
1018 		}
1019 
1020 		if (CNEQ(cnp, "curproc", 7)) {
1021 			pid = curproc->p_pid;
1022 			vnpid = 0;
1023 			type = PFScurproc;
1024 		} else if (CNEQ(cnp, "self", 4)) {
1025 			pid = curproc->p_pid;
1026 			vnpid = 0;
1027 			type = PFSself;
1028 		} else {
1029 			pid = (pid_t)atoi(pname, cnp->cn_namelen);
1030 			vnpid = pid;
1031 			type = PFSproc;
1032 		}
1033 
1034 		if (procfs_proc_lock(dvp->v_mount, pid, &p, ESRCH) != 0)
1035 			break;
1036 		error = procfs_allocvp(dvp->v_mount, vpp, vnpid, type, -1);
1037 		procfs_proc_unlock(p);
1038 		return (error);
1039 
1040 	case PFSproc:
1041 		if (cnp->cn_flags & ISDOTDOT) {
1042 			error = procfs_allocvp(dvp->v_mount, vpp, 0, PFSroot,
1043 			    -1);
1044 			return (error);
1045 		}
1046 
1047 		if (procfs_proc_lock(dvp->v_mount, pfs->pfs_pid, &p,
1048 				     ESRCH) != 0)
1049 			break;
1050 
1051 		mutex_enter(p->p_lock);
1052 		LIST_FOREACH(plwp, &p->p_lwps, l_sibling) {
1053 			if (plwp->l_stat != LSZOMB)
1054 				break;
1055 		}
1056 		/* Process is exiting if no-LWPS or all LWPs are LSZOMB */
1057 		if (plwp == NULL) {
1058 			mutex_exit(p->p_lock);
1059 			procfs_proc_unlock(p);
1060 			return ESRCH;
1061 		}
1062 
1063 		lwp_addref(plwp);
1064 		mutex_exit(p->p_lock);
1065 
1066 		for (pt = proc_targets, i = 0; i < nproc_targets; pt++, i++) {
1067 			int found;
1068 
1069 			found = cnp->cn_namelen == pt->pt_namlen &&
1070 			    memcmp(pt->pt_name, pname, cnp->cn_namelen) == 0 &&
1071 			    (pt->pt_valid == NULL
1072 			      || (*pt->pt_valid)(plwp, dvp->v_mount));
1073 			if (found)
1074 				break;
1075 		}
1076 		lwp_delref(plwp);
1077 
1078 		if (i == nproc_targets) {
1079 			procfs_proc_unlock(p);
1080 			break;
1081 		}
1082 		if (pt->pt_pfstype == PFSfile) {
1083 			fvp = p->p_textvp;
1084 			/* We already checked that it exists. */
1085 			vref(fvp);
1086 			procfs_proc_unlock(p);
1087 			*vpp = fvp;
1088 			return (0);
1089 		}
1090 
1091 		error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
1092 		    pt->pt_pfstype, -1);
1093 		procfs_proc_unlock(p);
1094 		return (error);
1095 
1096 	case PFSfd: {
1097 		int fd;
1098 		file_t *fp;
1099 
1100 		if ((error = procfs_proc_lock(dvp->v_mount, pfs->pfs_pid, &p,
1101 					      ENOENT)) != 0)
1102 			return error;
1103 
1104 		if (cnp->cn_flags & ISDOTDOT) {
1105 			error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
1106 			    PFSproc, -1);
1107 			procfs_proc_unlock(p);
1108 			return (error);
1109 		}
1110 		fd = atoi(pname, cnp->cn_namelen);
1111 
1112 		fp = fd_getfile2(p, fd);
1113 		if (fp == NULL) {
1114 			procfs_proc_unlock(p);
1115 			return ENOENT;
1116 		}
1117 		fvp = fp->f_vnode;
1118 
1119 		/* Don't show directories */
1120 		if (fp->f_type == DTYPE_VNODE && fvp->v_type != VDIR &&
1121 		    !procfs_proc_is_linux_compat()) {
1122 			vref(fvp);
1123 			closef(fp);
1124 			procfs_proc_unlock(p);
1125 			*vpp = fvp;
1126 			return 0;
1127 		}
1128 
1129 		closef(fp);
1130 		error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
1131 		    PFSfd, fd);
1132 		procfs_proc_unlock(p);
1133 		return error;
1134 	}
1135 	case PFStask: {
1136 		int xpid;
1137 
1138 		if ((error = procfs_proc_lock(dvp->v_mount, pfs->pfs_pid, &p,
1139 					      ENOENT)) != 0)
1140 			return error;
1141 
1142 		if (cnp->cn_flags & ISDOTDOT) {
1143 			error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
1144 			    PFSproc, -1);
1145 			procfs_proc_unlock(p);
1146 			return (error);
1147 		}
1148 		xpid = atoi(pname, cnp->cn_namelen);
1149 
1150 		if (xpid != pfs->pfs_pid) {
1151 			procfs_proc_unlock(p);
1152 			return ENOENT;
1153 		}
1154 		error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
1155 		    PFStask, 0);
1156 		procfs_proc_unlock(p);
1157 		return error;
1158 	}
1159 	default:
1160 		return (ENOTDIR);
1161 	}
1162 
1163 	return (cnp->cn_nameiop == LOOKUP ? ENOENT : EROFS);
1164 }
1165 
1166 int
1167 procfs_validfile(struct lwp *l, struct mount *mp)
1168 {
1169 	return l != NULL && l->l_proc != NULL && l->l_proc->p_textvp != NULL;
1170 }
1171 
1172 static int
1173 procfs_validfile_linux(struct lwp *l, struct mount *mp)
1174 {
1175 	return procfs_use_linux_compat(mp) &&
1176 	    (l == NULL || l->l_proc == NULL || procfs_validfile(l, mp));
1177 }
1178 
/*
 * State handed to procfs_root_readdir_callback() for each process while
 * the per-process entries of the procfs root directory are generated.
 */
struct procfs_root_readdir_ctx {
	struct uio *uiop;	/* where the directory entries are copied */
	off_t *cookies;		/* next cookie slot to fill, NULL if unused */
	int ncookies;		/* count of entries copied out so far */
	off_t off;		/* directory offset of the current entry */
	off_t startoff;		/* entries below this offset are skipped */
	int error;		/* error from uiomove(), 0 if none */
};
1187 
1188 static int
1189 procfs_root_readdir_callback(struct proc *p, void *arg)
1190 {
1191 	struct procfs_root_readdir_ctx *ctxp = arg;
1192 	struct dirent d;
1193 	struct uio *uiop;
1194 	int error;
1195 
1196 	uiop = ctxp->uiop;
1197 	if (uiop->uio_resid < UIO_MX)
1198 		return -1; /* no space */
1199 
1200 	if (kauth_authorize_process(kauth_cred_get(),
1201 	    KAUTH_PROCESS_CANSEE, p,
1202 	    KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL) != 0)
1203 		return 0;
1204 
1205 	if (ctxp->off < ctxp->startoff) {
1206 		ctxp->off++;
1207 		return 0;
1208 	}
1209 
1210 	memset(&d, 0, UIO_MX);
1211 	d.d_reclen = UIO_MX;
1212 	d.d_fileno = PROCFS_FILENO(p->p_pid, PFSproc, -1);
1213 	d.d_namlen = snprintf(d.d_name,
1214 	    UIO_MX - offsetof(struct dirent, d_name), "%ld", (long)p->p_pid);
1215 	d.d_type = DT_DIR;
1216 
1217 	mutex_exit(&proc_lock);
1218 	error = uiomove(&d, UIO_MX, uiop);
1219 	mutex_enter(&proc_lock);
1220 	if (error) {
1221 		ctxp->error = error;
1222 		return -1;
1223 	}
1224 
1225 	ctxp->ncookies++;
1226 	if (ctxp->cookies)
1227 		*(ctxp->cookies)++ = ctxp->off + 1;
1228 	ctxp->off++;
1229 
1230 	return 0;
1231 }
1232 
1233 /*
1234  * readdir returns directory entries from pfsnode (vp).
1235  *
1236  * the strategy here with procfs is to generate a single
1237  * directory entry at a time (struct dirent) and then
1238  * copy that out to userland using uiomove.  a more efficient
1239  * though more complex implementation, would try to minimize
1240  * the number of calls to uiomove().  for procfs, this is
1241  * hardly worth the added code complexity.
1242  *
1243  * this should just be done through read()
1244  */
1245 int
1246 procfs_readdir(void *v)
1247 {
1248 	struct vop_readdir_args /* {
1249 		struct vnode *a_vp;
1250 		struct uio *a_uio;
1251 		kauth_cred_t a_cred;
1252 		int *a_eofflag;
1253 		off_t **a_cookies;
1254 		int *a_ncookies;
1255 	} */ *ap = v;
1256 	struct uio *uio = ap->a_uio;
1257 	struct dirent d;
1258 	struct pfsnode *pfs;
1259 	off_t i;
1260 	int error;
1261 	off_t *cookies = NULL;
1262 	int ncookies;
1263 	struct vnode *vp;
1264 	const struct proc_target *pt;
1265 	struct procfs_root_readdir_ctx ctx;
1266 	struct proc *p = NULL;
1267 	struct lwp *l;
1268 	int nfd;
1269 	int nc = 0;
1270 
1271 	vp = ap->a_vp;
1272 	pfs = VTOPFS(vp);
1273 
1274 	if (uio->uio_resid < UIO_MX)
1275 		return (EINVAL);
1276 	if (uio->uio_offset < 0)
1277 		return (EINVAL);
1278 
1279 	error = 0;
1280 	i = uio->uio_offset;
1281 	memset(&d, 0, UIO_MX);
1282 	d.d_reclen = UIO_MX;
1283 	ncookies = uio->uio_resid / UIO_MX;
1284 
1285 	switch (pfs->pfs_type) {
1286 	/*
1287 	 * this is for the process-specific sub-directories.
1288 	 * all that is needed to is copy out all the entries
1289 	 * from the procent[] table (top of this file).
1290 	 */
1291 	case PFSproc: {
1292 
1293 		if (i >= nproc_targets)
1294 			return 0;
1295 
1296 		if (procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH) != 0)
1297 			break;
1298 
1299 		if (ap->a_ncookies) {
1300 			ncookies = uimin(ncookies, (nproc_targets - i));
1301 			cookies = malloc(ncookies * sizeof (off_t),
1302 			    M_TEMP, M_WAITOK);
1303 			*ap->a_cookies = cookies;
1304 		}
1305 
1306 		for (pt = &proc_targets[i];
1307 		     uio->uio_resid >= UIO_MX && i < nproc_targets; pt++, i++) {
1308 			if (pt->pt_valid) {
1309 				/* XXXSMP LWP can disappear */
1310 				mutex_enter(p->p_lock);
1311 				l = LIST_FIRST(&p->p_lwps);
1312 				KASSERT(l != NULL);
1313 				mutex_exit(p->p_lock);
1314 				if ((*pt->pt_valid)(l, vp->v_mount) == 0)
1315 					continue;
1316 			}
1317 
1318 			d.d_fileno = PROCFS_FILENO(pfs->pfs_pid,
1319 			    pt->pt_pfstype, -1);
1320 			d.d_namlen = pt->pt_namlen;
1321 			memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
1322 			d.d_type = pt->pt_type;
1323 
1324 			if ((error = uiomove(&d, UIO_MX, uio)) != 0)
1325 				break;
1326 			if (cookies)
1327 				*cookies++ = i + 1;
1328 		}
1329 
1330 		procfs_proc_unlock(p);
1331 	    	break;
1332 	}
1333 	case PFSfd: {
1334 		file_t *fp;
1335 		int lim;
1336 
1337 		if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p,
1338 					      ESRCH)) != 0)
1339 			return error;
1340 
1341 		/* XXX Should this be by file as well? */
1342 		if (kauth_authorize_process(kauth_cred_get(),
1343 		    KAUTH_PROCESS_CANSEE, p,
1344 		    KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES), NULL,
1345 		    NULL) != 0) {
1346 		    	procfs_proc_unlock(p);
1347 			return ESRCH;
1348 		}
1349 
1350 		nfd = atomic_load_consume(&p->p_fd->fd_dt)->dt_nfiles;
1351 
1352 		lim = uimin((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
1353 		if (i >= lim) {
1354 		    	procfs_proc_unlock(p);
1355 			return 0;
1356 		}
1357 
1358 		if (ap->a_ncookies) {
1359 			ncookies = uimin(ncookies, (nfd + 2 - i));
1360 			cookies = malloc(ncookies * sizeof (off_t),
1361 			    M_TEMP, M_WAITOK);
1362 			*ap->a_cookies = cookies;
1363 		}
1364 
1365 		for (; i < 2 && uio->uio_resid >= UIO_MX; i++) {
1366 			pt = &proc_targets[i];
1367 			d.d_namlen = pt->pt_namlen;
1368 			d.d_fileno = PROCFS_FILENO(pfs->pfs_pid,
1369 			    pt->pt_pfstype, -1);
1370 			(void)memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
1371 			d.d_type = pt->pt_type;
1372 			if ((error = uiomove(&d, UIO_MX, uio)) != 0)
1373 				break;
1374 			if (cookies)
1375 				*cookies++ = i + 1;
1376 			nc++;
1377 		}
1378 		if (error)
1379 			goto out;
1380 		for (; uio->uio_resid >= UIO_MX && i < nfd; i++) {
1381 			/* check the descriptor exists */
1382 			if ((fp = fd_getfile2(p, i - 2)) == NULL)
1383 				continue;
1384 			closef(fp);
1385 
1386 			d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, PFSfd, i - 2);
1387 			d.d_namlen = snprintf(d.d_name, sizeof(d.d_name),
1388 			    "%lld", (long long)(i - 2));
1389 			d.d_type = fttodt(fp);
1390 			if ((error = uiomove(&d, UIO_MX, uio)) != 0)
1391 				break;
1392 			if (cookies)
1393 				*cookies++ = i + 1;
1394 			nc++;
1395 		}
1396 		goto out;
1397 	}
1398 	case PFStask: {
1399 
1400 		if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p,
1401 					      ESRCH)) != 0)
1402 			return error;
1403 
1404 		nfd = 3;	/* ., .., pid */
1405 
1406 		if (ap->a_ncookies) {
1407 			ncookies = uimin(ncookies, (nfd + 2 - i));
1408 			cookies = malloc(ncookies * sizeof (off_t),
1409 			    M_TEMP, M_WAITOK);
1410 			*ap->a_cookies = cookies;
1411 		}
1412 
1413 		for (; i < 2 && uio->uio_resid >= UIO_MX; i++) {
1414 			pt = &proc_targets[i];
1415 			d.d_namlen = pt->pt_namlen;
1416 			d.d_fileno = PROCFS_FILENO(pfs->pfs_pid,
1417 			    pt->pt_pfstype, -1);
1418 			(void)memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
1419 			d.d_type = pt->pt_type;
1420 			if ((error = uiomove(&d, UIO_MX, uio)) != 0)
1421 				break;
1422 			if (cookies)
1423 				*cookies++ = i + 1;
1424 			nc++;
1425 		}
1426 		if (error)
1427 			goto out;
1428 		for (; uio->uio_resid >= UIO_MX && i < nfd; i++) {
1429 			/* check the descriptor exists */
1430 			d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, PFStask,
1431 			    i - 2);
1432 			d.d_namlen = snprintf(d.d_name, sizeof(d.d_name),
1433 			    "%ld", (long)pfs->pfs_pid);
1434 			d.d_type = DT_LNK;
1435 			if ((error = uiomove(&d, UIO_MX, uio)) != 0)
1436 				break;
1437 			if (cookies)
1438 				*cookies++ = i + 1;
1439 			nc++;
1440 		}
1441 		goto out;
1442 	}
1443 
1444 	/*
1445 	 * this is for the root of the procfs filesystem
1446 	 * what is needed are special entries for "curproc"
1447 	 * and "self" followed by an entry for each process
1448 	 * on allproc.
1449 	 */
1450 
1451 	case PFSroot: {
1452 
1453 		if (ap->a_ncookies) {
1454 			/*
1455 			 * XXX Potentially allocating too much space here,
1456 			 * but I'm lazy. This loop needs some work.
1457 			 */
1458 			cookies = malloc(ncookies * sizeof (off_t),
1459 			    M_TEMP, M_WAITOK);
1460 			*ap->a_cookies = cookies;
1461 		}
1462 
1463 		/* 0 ... 3 are static entries. */
1464 		for (; i <= 3 && uio->uio_resid >= UIO_MX; i++) {
1465 			switch (i) {
1466 			case 0:		/* `.' */
1467 			case 1:		/* `..' */
1468 				d.d_fileno = PROCFS_FILENO(0, PFSroot, -1);
1469 				d.d_namlen = i + 1;
1470 				memcpy(d.d_name, "..", d.d_namlen);
1471 				d.d_name[i + 1] = '\0';
1472 				d.d_type = DT_DIR;
1473 				break;
1474 
1475 			case 2:
1476 				d.d_fileno = PROCFS_FILENO(0, PFScurproc, -1);
1477 				d.d_namlen = sizeof("curproc") - 1;
1478 				memcpy(d.d_name, "curproc", sizeof("curproc"));
1479 				d.d_type = DT_LNK;
1480 				break;
1481 
1482 			case 3:
1483 				d.d_fileno = PROCFS_FILENO(0, PFSself, -1);
1484 				d.d_namlen = sizeof("self") - 1;
1485 				memcpy(d.d_name, "self", sizeof("self"));
1486 				d.d_type = DT_LNK;
1487 				break;
1488 			}
1489 
1490 			if ((error = uiomove(&d, UIO_MX, uio)) != 0)
1491 				break;
1492 			nc++;
1493 			if (cookies)
1494 				*cookies++ = i + 1;
1495 		}
1496 		if (error)
1497 			break;
1498 		/* 4 ... are process entries. */
1499 		ctx.uiop = uio;
1500 		ctx.error = 0;
1501 		ctx.off = 4;
1502 		ctx.startoff = i;
1503 		ctx.cookies = cookies;
1504 		ctx.ncookies = nc;
1505 		proclist_foreach_call(&allproc,
1506 		    procfs_root_readdir_callback, &ctx);
1507 		cookies = ctx.cookies;
1508 		nc = ctx.ncookies;
1509 		error = ctx.error;
1510 		if (error)
1511 			break;
1512 
1513 		/* misc entries. */
1514 		if (i < ctx.off)
1515 			i = ctx.off;
1516 		if (i >= ctx.off + nproc_root_targets)
1517 			break;
1518 		error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH);
1519 		if (error)
1520 			break;
1521 		for (pt = &proc_root_targets[i - ctx.off];
1522 		    uio->uio_resid >= UIO_MX &&
1523 		    pt < &proc_root_targets[nproc_root_targets];
1524 		    pt++, i++) {
1525 			if (pt->pt_valid &&
1526 			    (*pt->pt_valid)(NULL, vp->v_mount) == 0)
1527 				continue;
1528 			if (kauth_authorize_process(kauth_cred_get(),
1529 			    KAUTH_PROCESS_CANSEE, p,
1530 			    KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY),
1531 			    NULL, NULL) != 0)
1532 				continue;
1533 			d.d_fileno = PROCFS_FILENO(0, pt->pt_pfstype, -1);
1534 			d.d_namlen = pt->pt_namlen;
1535 			memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
1536 			d.d_type = pt->pt_type;
1537 
1538 			if ((error = uiomove(&d, UIO_MX, uio)) != 0)
1539 				break;
1540 			nc++;
1541 			if (cookies)
1542 				*cookies++ = i + 1;
1543 		}
1544 out:
1545 		KASSERT(p != NULL);
1546 		ncookies = nc;
1547 		procfs_proc_unlock(p);
1548 		break;
1549 	}
1550 
1551 	default:
1552 		error = ENOTDIR;
1553 		break;
1554 	}
1555 
1556 	if (ap->a_ncookies) {
1557 		if (error) {
1558 			if (cookies)
1559 				free(*ap->a_cookies, M_TEMP);
1560 			*ap->a_ncookies = 0;
1561 			*ap->a_cookies = NULL;
1562 		} else
1563 			*ap->a_ncookies = ncookies;
1564 	}
1565 	uio->uio_offset = i;
1566 	return (error);
1567 }
1568 
1569 /*
1570  * readlink reads the link of `curproc' and others
1571  */
1572 int
1573 procfs_readlink(void *v)
1574 {
1575 	struct vop_readlink_args *ap = v;
1576 	char bf[16];		/* should be enough */
1577 	char *bp = bf;
1578 	char *path = NULL;
1579 	int len = 0;
1580 	int error = 0;
1581 	struct vnode *vp = ap->a_vp;
1582 	struct pfsnode *pfs = VTOPFS(vp);
1583 	struct proc *pown = NULL;
1584 
1585 	if (pfs->pfs_fileno == PROCFS_FILENO(0, PFScurproc, -1))
1586 		len = snprintf(bf, sizeof(bf), "%ld", (long)curproc->p_pid);
1587 	else if (pfs->pfs_fileno == PROCFS_FILENO(0, PFSself, -1))
1588 		len = snprintf(bf, sizeof(bf), "%s", "curproc");
1589 	else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFStask, 0))
1590 		len = snprintf(bf, sizeof(bf), "..");
1591 	else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFSexe, -1)) {
1592 		if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &pown,
1593 					      ESRCH)) != 0)
1594 			return error;
1595 		bp = pown->p_path;
1596 		len = strlen(bp);
1597 	} else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFScwd, -1) ||
1598 	    pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFSchroot, -1)) {
1599 		if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &pown,
1600 					      ESRCH)) != 0)
1601 			return error;
1602 		path = malloc(MAXPATHLEN + 4, M_TEMP, M_WAITOK);
1603 		if (path == NULL) {
1604 			procfs_proc_unlock(pown);
1605 			return (ENOMEM);
1606 		}
1607 		bp = path + MAXPATHLEN;
1608 		*--bp = '\0';
1609 		procfs_dir(PROCFS_TYPE(pfs->pfs_fileno), curlwp, pown,
1610 		    &bp, path, MAXPATHLEN);
1611 		len = strlen(bp);
1612 	} else {
1613 		file_t *fp;
1614 		struct vnode *vxp;
1615 
1616 		if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &pown,
1617 					      ESRCH)) != 0)
1618 			return error;
1619 
1620 		fp = fd_getfile2(pown, pfs->pfs_fd);
1621 		if (fp == NULL) {
1622 			procfs_proc_unlock(pown);
1623 			return EBADF;
1624 		}
1625 
1626 		switch (fp->f_type) {
1627 		case DTYPE_VNODE:
1628 			vxp = fp->f_vnode;
1629 			if (vxp->v_type != VDIR &&
1630 			    !procfs_proc_is_linux_compat()) {
1631 				error = EINVAL;
1632 				break;
1633 			}
1634 			if ((path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK))
1635 			    == NULL) {
1636 				error = ENOMEM;
1637 				break;
1638 			}
1639 			bp = path + MAXPATHLEN;
1640 			*--bp = '\0';
1641 
1642 			/*
1643 			 * XXX: kludge to avoid locking against ourselves
1644 			 * in getcwd()
1645 			 */
1646 			if (vxp->v_tag == VT_PROCFS) {
1647 				*--bp = '/';
1648 			} else {
1649 				rw_enter(&curproc->p_cwdi->cwdi_lock,
1650 				    RW_READER);
1651 				vp = curproc->p_cwdi->cwdi_rdir;
1652 				if (vp == NULL)
1653 					vp = rootvnode;
1654 				error = getcwd_common(vxp, vp, &bp, path,
1655 				    MAXPATHLEN / 2, 0, curlwp);
1656 				rw_exit(&curproc->p_cwdi->cwdi_lock);
1657 			}
1658 			if (error)
1659 				break;
1660 			len = strlen(bp);
1661 			break;
1662 
1663 		case DTYPE_MISC:
1664 			len = snprintf(bf, sizeof(bf), "%s", "[misc]");
1665 			break;
1666 
1667 		case DTYPE_KQUEUE:
1668 			len = snprintf(bf, sizeof(bf), "%s", "[kqueue]");
1669 			break;
1670 
1671 		case DTYPE_SEM:
1672 			len = snprintf(bf, sizeof(bf), "%s", "[ksem]");
1673 			break;
1674 
1675 		default:
1676 			error = EINVAL;
1677 			break;
1678 		}
1679 		closef(fp);
1680 	}
1681 
1682 	if (error == 0)
1683 		error = uiomove(bp, len, ap->a_uio);
1684 	if (pown)
1685 		procfs_proc_unlock(pown);
1686 	if (path)
1687 		free(path, M_TEMP);
1688 	return error;
1689 }
1690 
1691 int
1692 procfs_getpages(void *v)
1693 {
1694 	struct vop_getpages_args /* {
1695 		struct vnode *a_vp;
1696 		voff_t a_offset;
1697 		struct vm_page **a_m;
1698 		int *a_count;
1699 		int a_centeridx;
1700 		vm_prot_t a_access_type;
1701 		int a_advice;
1702 		int a_flags;
1703 	} */ *ap = v;
1704 
1705 	if ((ap->a_flags & PGO_LOCKED) == 0)
1706 		rw_exit(ap->a_vp->v_uobj.vmobjlock);
1707 
1708 	return (EFAULT);
1709 }
1710 
/*
 * convert decimal ascii to int
 *
 * Parses exactly `len' characters starting at `b'; the string does not
 * have to be NUL-terminated.  Returns the value, or -1 if a character
 * is not a decimal digit or if the value does not fit in an int.  An
 * empty string (len == 0) yields 0.
 */
static int
atoi(const char *b, size_t len)
{
	/* Largest int, spelled so that no limits header is required. */
	const int int_max = (int)(~0U >> 1);
	int p = 0;

	while (len--) {
		const char c = *b++;
		int digit;

		if (c < '0' || c > '9')
			return -1;
		digit = c - '0';

		/*
		 * Reject instead of overflowing: signed overflow is
		 * undefined, and a wrapped result could turn an
		 * over-long name into some other valid number.
		 */
		if (p > (int_max - digit) / 10)
			return -1;
		p = 10 * p + digit;
	}

	return p;
}
1728 
1729 /**
1730  * convert DTYPE_XXX to corresponding DT_XXX
1731  * matching what procfs_loadvnode() does.
1732  */
1733 static uint8_t
1734 fttodt(file_t *fp)
1735 {
1736 	switch (fp->f_type) {
1737 	case DTYPE_VNODE:
1738 		switch (fp->f_vnode->v_type) {
1739 		case VREG:	return DT_REG;
1740 		case VDIR:	return DT_LNK;	/* symlink */
1741 		case VBLK:	return DT_BLK;
1742 		case VCHR:	return DT_CHR;
1743 		case VLNK:	return DT_LNK;
1744 		case VSOCK:	return DT_SOCK;
1745 		case VFIFO:	return DT_FIFO;
1746 		default:	return DT_UNKNOWN;
1747 		}
1748 	case DTYPE_PIPE:	return DT_FIFO;
1749 	case DTYPE_SOCKET:	return DT_SOCK;
1750 	case DTYPE_KQUEUE:	/*FALLTHROUGH*/
1751 	case DTYPE_MISC:	/*FALLTHROUGH*/
1752 	case DTYPE_SEM:		return DT_LNK;	/* symlinks */
1753 	default:		return DT_UNKNOWN;
1754 	}
1755 }
1756