xref: /dflybsd-src/sys/vfs/procfs/procfs_vnops.c (revision 2b0cb81791f7cbae1a99aff6d697787b0372d28e)
1 /*
2  * Copyright (c) 1993, 1995 Jan-Simon Pendry
3  * Copyright (c) 1993, 1995
4  *	The Regents of the University of California.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * Jan-Simon Pendry.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)procfs_vnops.c	8.18 (Berkeley) 5/21/95
34  *
35  * $FreeBSD: src/sys/miscfs/procfs/procfs_vnops.c,v 1.76.2.7 2002/01/22 17:22:59 nectar Exp $
36  */
37 
38 /*
39  * procfs vnode interface
40  */
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/time.h>
45 #include <sys/kernel.h>
46 #include <sys/lock.h>
47 #include <sys/fcntl.h>
48 #include <sys/proc.h>
49 #include <sys/caps.h>
50 #include <sys/signalvar.h>
51 #include <sys/vnode.h>
52 #include <sys/uio.h>
53 #include <sys/mount.h>
54 #include <sys/namei.h>
55 #include <sys/dirent.h>
56 #include <sys/malloc.h>
57 #include <sys/reg.h>
58 #include <vm/vm_zone.h>
59 #include <vfs/procfs/procfs.h>
60 #include <sys/pioctl.h>
61 
62 #include <sys/spinlock2.h>
63 
64 #include <machine/limits.h>
65 
/* vnode operation entry points implemented in this file */
static int	procfs_access(struct vop_access_args *);
static int	procfs_badop(struct vop_generic_args *);
static int	procfs_bmap(struct vop_bmap_args *);
static int	procfs_close(struct vop_close_args *);
static int	procfs_getattr(struct vop_getattr_args *);
static int	procfs_inactive(struct vop_inactive_args *);
static int	procfs_ioctl(struct vop_ioctl_args *);
static int	procfs_kqfilter(struct vop_kqfilter_args *);
static int	procfs_lookup(struct vop_old_lookup_args *);
static int	procfs_open(struct vop_open_args *);
static int	procfs_print(struct vop_print_args *);
static int	procfs_readdir(struct vop_readdir_args *);
static int	procfs_readlink(struct vop_readlink_args *);
static int	procfs_reclaim(struct vop_reclaim_args *);
static int	procfs_setattr(struct vop_setattr_args *);

/* per-directory-type workers called from procfs_readdir() */
static int	procfs_readdir_proc(struct vop_readdir_args *);
static int	procfs_readdir_root(struct vop_readdir_args *);
84 
85 /*
86  * procfs vnode operations.
87  */
88 struct vop_ops procfs_vnode_vops = {
89 	.vop_default =		vop_defaultop,
90 	.vop_access =		procfs_access,
91 	.vop_advlock =		(void *)procfs_badop,
92 	.vop_bmap =		procfs_bmap,
93 	.vop_close =		procfs_close,
94 	.vop_old_create =	(void *)procfs_badop,
95 	.vop_getattr =		procfs_getattr,
96 	.vop_inactive =		procfs_inactive,
97 	.vop_old_link =		(void *)procfs_badop,
98 	.vop_old_lookup =	procfs_lookup,
99 	.vop_old_mkdir =	(void *)procfs_badop,
100 	.vop_old_mknod =	(void *)procfs_badop,
101 	.vop_open =		procfs_open,
102 	.vop_pathconf =		vop_stdpathconf,
103 	.vop_print =		procfs_print,
104 	.vop_read =		procfs_rw,
105 	.vop_readdir =		procfs_readdir,
106 	.vop_readlink =		procfs_readlink,
107 	.vop_reclaim =		procfs_reclaim,
108 	.vop_old_remove =	(void *)procfs_badop,
109 	.vop_old_rename =	(void *)procfs_badop,
110 	.vop_old_rmdir =	(void *)procfs_badop,
111 	.vop_setattr =		procfs_setattr,
112 	.vop_old_symlink =	(void *)procfs_badop,
113 	.vop_write =		(void *)procfs_rw,
114 	.vop_ioctl =		procfs_ioctl,
115 	.vop_kqfilter =		procfs_kqfilter,
116 };
117 
118 
119 /*
120  * This is a list of the valid names in the
121  * process-specific sub-directories.  It is
122  * used in procfs_lookup and procfs_readdir
123  */
124 static struct proc_target {
125 	u_char	pt_type;
126 	u_char	pt_namlen;
127 	char	*pt_name;
128 	pfstype	pt_pfstype;
129 	int	(*pt_valid) (struct lwp *p);
130 } proc_targets[] = {
131 #define N(s) sizeof(s)-1, s
132 	/*	  name		type		validp */
133 	{ DT_DIR, N("."),	Pproc,		NULL },
134 	{ DT_DIR, N(".."),	Proot,		NULL },
135 	{ DT_REG, N("mem"),	Pmem,		NULL },
136 	{ DT_REG, N("regs"),	Pregs,		procfs_validregs },
137 	{ DT_REG, N("fpregs"),	Pfpregs,	procfs_validfpregs },
138 	{ DT_REG, N("dbregs"),	Pdbregs,	procfs_validdbregs },
139 	{ DT_REG, N("ctl"),	Pctl,		NULL },
140 	{ DT_REG, N("status"),	Pstatus,	NULL },
141 	{ DT_REG, N("note"),	Pnote,		NULL },
142 	{ DT_REG, N("notepg"),	Pnotepg,	NULL },
143 	{ DT_REG, N("map"), 	Pmap,		procfs_validmap },
144 	{ DT_REG, N("etype"),	Ptype,		procfs_validtype },
145 	{ DT_REG, N("cmdline"),	Pcmdline,	NULL },
146 	{ DT_REG, N("rlimit"),	Prlimit,	NULL },
147 	{ DT_LNK, N("file"),	Pfile,		NULL },
148 	{ DT_LNK, N("exe"),	Pfile,		NULL },
149 #undef N
150 };
151 static const int nproc_targets = NELEM(proc_targets);
152 
153 static pid_t atopid (const char *, u_int);
154 
155 /*
156  * set things up for doing i/o on
157  * the pfsnode (vp).  (vp) is locked
158  * on entry, and should be left locked
159  * on exit.
160  *
161  * for procfs we don't need to do anything
162  * in particular for i/o.  all that is done
163  * is to support exclusive open on process
164  * memory images.
165  *
166  * procfs_open(struct vnode *a_vp, int a_mode, struct ucred *a_cred,
167  *	       struct file *a_fp)
168  */
169 static int
170 procfs_open(struct vop_open_args *ap)
171 {
172 	struct pfsnode *pfs = VTOPFS(ap->a_vp);
173 	struct proc *p1, *p2;
174 	int error;
175 
176 	p2 = pfs_pfind(pfs->pfs_pid);
177 	if (p2 == NULL)
178 		return (ENOENT);
179 	if (pfs->pfs_pid && !PRISON_CHECK(ap->a_cred, p2->p_ucred)) {
180 		error = ENOENT;
181 		goto done;
182 	}
183 
184 	switch (pfs->pfs_type) {
185 	case Pmem:
186 		if (((pfs->pfs_flags & FWRITE) && (ap->a_mode & O_EXCL)) ||
187 		    ((pfs->pfs_flags & O_EXCL) && (ap->a_mode & FWRITE))) {
188 			error = EBUSY;
189 			goto done;
190 		}
191 
192 		p1 = curproc;
193 		KKASSERT(p1);
194 		/* Can't trace a process that's currently exec'ing. */
195 		if ((p2->p_flags & P_INEXEC) != 0) {
196 			error = EAGAIN;
197 			goto done;
198 		}
199 		if (!CHECKIO(p1, p2) || p_trespass(ap->a_cred, p2->p_ucred)) {
200 			error = EPERM;
201 			goto done;
202 		}
203 
204 		if (ap->a_mode & FWRITE)
205 			pfs->pfs_flags = ap->a_mode & (FWRITE|O_EXCL);
206 
207 		break;
208 
209 	default:
210 		break;
211 	}
212 	error = vop_stdopen(ap);
213 done:
214 	pfs_pdone(p2);
215 	return error;
216 }
217 
218 /*
219  * close the pfsnode (vp) after doing i/o.
220  * (vp) is not locked on entry or exit.
221  *
222  * nothing to do for procfs other than undo
223  * any exclusive open flag (see _open above).
224  *
225  * procfs_close(struct vnode *a_vp, int a_fflag, struct ucred *a_cred)
226  */
227 static int
228 procfs_close(struct vop_close_args *ap)
229 {
230 	struct pfsnode *pfs = VTOPFS(ap->a_vp);
231 	struct proc *p;
232 
233 	/*
234 	 * Make sure the lock is exclusive for opencount tests
235 	 */
236 	vn_lock(ap->a_vp, LK_UPGRADE | LK_RETRY);
237 
238 	switch (pfs->pfs_type) {
239 	case Pmem:
240 		if ((ap->a_fflag & FWRITE) && (pfs->pfs_flags & O_EXCL))
241 			pfs->pfs_flags &= ~(FWRITE|O_EXCL);
242 		/*
243 		 * v_opencount determines the last real close on the vnode.
244 		 *
245 		 * If this is the last close, then it checks to see if
246 		 * the target process has PF_LINGER set in p_pfsflags,
247 		 * if this is *not* the case, then the process' stop flags
248 		 * are cleared, and the process is woken up.  This is
249 		 * to help prevent the case where a process has been
250 		 * told to stop on an event, but then the requesting process
251 		 * has gone away or forgotten about it.
252 		 */
253 		p = NULL;
254 		if ((ap->a_vp->v_opencount < 2)
255 		    && ((p = pfs_pfind(pfs->pfs_pid)) != NULL ||
256 		        (p = pfs_zpfind(pfs->pfs_pid)) != NULL)
257 		    && !(p->p_pfsflags & PF_LINGER)) {
258 			spin_lock(&p->p_spin);
259 			p->p_stops = 0;
260 			p->p_step = 0;
261 			spin_unlock(&p->p_spin);
262 			wakeup(&p->p_stype);
263 			wakeup(&p->p_step);
264 		}
265 		pfs_pdone(p);
266 		break;
267 	default:
268 		break;
269 	}
270 
271 	return (vop_stdclose(ap));
272 }
273 
274 /*
275  * do an ioctl operation on a pfsnode (vp).
276  * (vp) is not locked on entry or exit.
277  */
278 static int
279 procfs_ioctl(struct vop_ioctl_args *ap)
280 {
281 	struct pfsnode *pfs = VTOPFS(ap->a_vp);
282 	struct proc *procp;
283 	struct proc *p;
284 	int error;
285 	int signo;
286 	struct procfs_status *psp;
287 	unsigned char flags;
288 
289 	procp = pfs_pfind(pfs->pfs_pid);
290 	if (procp == NULL)
291 		return ENOTTY;
292 	p = curproc;
293 	if (p == NULL) {
294 		error = EINVAL;
295 		goto done;
296 	}
297 
298 	/* Can't trace a process that's currently exec'ing. */
299 	if ((procp->p_flags & P_INEXEC) != 0) {
300 		error = EAGAIN;
301 		goto done;
302 	}
303 	if (!CHECKIO(p, procp) || p_trespass(ap->a_cred, procp->p_ucred)) {
304 		error = EPERM;
305 		goto done;
306 	}
307 
308 	switch (ap->a_command) {
309 	case PIOCBIS:
310 		spin_lock(&procp->p_spin);
311 		procp->p_stops |= *(unsigned int*)ap->a_data;
312 		spin_unlock(&procp->p_spin);
313 		break;
314 	case PIOCBIC:
315 		spin_lock(&procp->p_spin);
316 		procp->p_stops &= ~*(unsigned int*)ap->a_data;
317 		spin_unlock(&procp->p_spin);
318 		break;
319 	case PIOCSFL:
320 		/*
321 		* NFLAGS is "non-suser_xxx flags" -- currently, only
322 		* PFS_ISUGID ("ignore set u/g id");
323 		*/
324 #define NFLAGS	(PF_ISUGID)
325 		flags = (unsigned char)*(unsigned int*)ap->a_data;
326 		if (flags & NFLAGS &&
327 		    (error = caps_priv_check(ap->a_cred,
328 					     SYSCAP_RESTRICTEDROOT))) {
329 			goto done;
330 		}
331 		procp->p_pfsflags = flags;
332 		break;
333 	case PIOCGFL:
334 		*(unsigned int*)ap->a_data = (unsigned int)procp->p_pfsflags;
335 		break;
336 	case PIOCSTATUS:
337 		/*
338 		* NOTE: syscall entry deals with stopevents and may run
339 		*	without the MP lock.
340 		*/
341 		psp = (struct procfs_status *)ap->a_data;
342 		psp->flags = procp->p_pfsflags;
343 		psp->events = procp->p_stops;
344 		spin_lock(&procp->p_spin);
345 		if (procp->p_step) {
346 			psp->state = 0;
347 			psp->why = procp->p_stype;
348 			psp->val = procp->p_xstat;
349 			spin_unlock(&procp->p_spin);
350 		} else {
351 			psp->state = 1;
352 			spin_unlock(&procp->p_spin);
353 			psp->why = 0;	/* Not defined values */
354 			psp->val = 0;	/* Not defined values */
355 		}
356 		break;
357 	case PIOCWAIT:
358 		/*
359 		* NOTE: syscall entry deals with stopevents and may run
360 		*	without the MP lock.
361 		*/
362 		psp = (struct procfs_status *)ap->a_data;
363 		spin_lock(&procp->p_spin);
364 		while (procp->p_step == 0) {
365 			tsleep_interlock(&procp->p_stype, PCATCH);
366 			spin_unlock(&procp->p_spin);
367 			if (procp->p_stops == 0) {
368 				error = 0;
369 				goto done;
370 			}
371 			if (procp->p_flags & P_POSTEXIT) {
372 				error = EINVAL;
373 				goto done;
374 			}
375 			if (procp->p_flags & P_INEXEC) {
376 				error = EAGAIN;
377 				goto done;
378 			}
379 			error = tsleep(&procp->p_stype, PCATCH | PINTERLOCKED,
380 				       "piocwait", 0);
381 			if (error)
382 				goto done;
383 			spin_lock(&procp->p_spin);
384 		}
385 		spin_unlock(&procp->p_spin);
386 		psp->state = 1;	/* It stopped */
387 		psp->flags = procp->p_pfsflags;
388 		psp->events = procp->p_stops;
389 		psp->why = procp->p_stype;	/* why it stopped */
390 		psp->val = procp->p_xstat;	/* any extra info */
391 		break;
392 	case PIOCCONT:	/* Restart a proc */
393 		/*
394 		* NOTE: syscall entry deals with stopevents and may run
395 		*	without the MP lock.  However, the caller is
396 		*	presumably interlocked by having waited.
397 		*/
398 		if (procp->p_step == 0) {
399 			error = EINVAL;	/* Can only start a stopped process */
400 			goto done;
401 		}
402 		if ((signo = *(int*)ap->a_data) != 0) {
403 			if (signo >= NSIG || signo <= 0) {
404 				error = EINVAL;
405 				goto done;
406 			}
407 			ksignal(procp, signo);
408 		}
409 		procp->p_step = 0;
410 		wakeup(&procp->p_step);
411 		break;
412 	default:
413 		error = ENOTTY;
414 		goto done;
415 	}
416 	error = 0;
417 done:
418 	pfs_pdone(procp);
419 	return error;
420 }
421 
422 /*
423  * do block mapping for pfsnode (vp).
424  * since we don't use the buffer cache
425  * for procfs this function should never
426  * be called.  in any case, it's not clear
427  * what part of the kernel ever makes use
428  * of this function.  for sanity, this is the
429  * usual no-op bmap, although returning
430  * (EIO) would be a reasonable alternative.
431  *
432  * XXX mmap assumes buffer cache operation
433  *
434  * procfs_bmap(struct vnode *a_vp, off_t a_loffset,
435  *		off_t *a_doffsetp, int *a_runp, int *a_runb)
436  */
437 static int
438 procfs_bmap(struct vop_bmap_args *ap)
439 {
440 	if (ap->a_doffsetp != NULL)
441 		*ap->a_doffsetp = ap->a_loffset;
442 	if (ap->a_runp != NULL)
443 		*ap->a_runp = 0;
444 	if (ap->a_runb != NULL)
445 		*ap->a_runb = 0;
446 	return (0);
447 }
448 
449 /*
450  * procfs_inactive is called when the pfsnode
451  * is vrele'd and the reference count goes
452  * to zero.  (vp) will be on the vnode free
453  * list, so to get it back vget() must be
454  * used.
455  *
456  * (vp) is locked on entry, but must be unlocked on exit.
457  *
458  * procfs_inactive(struct vnode *a_vp)
459  */
460 static int
461 procfs_inactive(struct vop_inactive_args *ap)
462 {
463 	struct pfsnode *pfs = VTOPFS(ap->a_vp);
464 
465 	if (pfs->pfs_pid & PFS_DEAD)
466 		vrecycle(ap->a_vp);
467 	return (0);
468 }
469 
470 /*
471  * _reclaim is called when getnewvnode()
472  * wants to make use of an entry on the vnode
473  * free list.  at this time the filesystem needs
474  * to free any private data and remove the node
475  * from any private lists.
476  *
477  * procfs_reclaim(struct vnode *a_vp)
478  */
479 static int
480 procfs_reclaim(struct vop_reclaim_args *ap)
481 {
482 	return (procfs_freevp(ap->a_vp));
483 }
484 
485 /*
486  * _print is used for debugging.
487  * just print a readable description
488  * of (vp).
489  *
490  * procfs_print(struct vnode *a_vp)
491  */
492 static int
493 procfs_print(struct vop_print_args *ap)
494 {
495 	struct pfsnode *pfs = VTOPFS(ap->a_vp);
496 
497 	kprintf("tag VT_PROCFS, type %d, pid %ld, mode %x, flags %lx\n",
498 	    pfs->pfs_type, (long)pfs->pfs_pid, pfs->pfs_mode, pfs->pfs_flags);
499 	return (0);
500 }
501 
502 /*
503  * generic entry point for unsupported operations
504  */
505 static int
506 procfs_badop(struct vop_generic_args *ap)
507 {
508 	return (EIO);
509 }
510 
511 /*
512  * Invent attributes for pfsnode (vp) and store
513  * them in (vap).
514  * Directories lengths are returned as zero since
515  * any real length would require the genuine size
516  * to be computed, and nothing cares anyway.
517  *
518  * this is relatively minimal for procfs.
519  *
520  * procfs_getattr(struct vnode *a_vp, struct vattr *a_vap)
521  */
522 static int
523 procfs_getattr(struct vop_getattr_args *ap)
524 {
525 	struct pfsnode *pfs = VTOPFS(ap->a_vp);
526 	struct vattr *vap = ap->a_vap;
527 	struct proc *procp;
528 	int error;
529 
530 	/*
531 	 * First make sure that the process and its credentials
532 	 * still exist.
533 	 */
534 	switch (pfs->pfs_type) {
535 	case Proot:
536 	case Pcurproc:
537 		procp = NULL;
538 		break;
539 	default:
540 		procp = pfs_pfind(pfs->pfs_pid);
541 		if (procp == NULL || procp->p_ucred == NULL) {
542 			error = ENOENT;
543 			goto done;
544 		}
545 		break;
546 	}
547 
548 	error = 0;
549 
550 	/* start by zeroing out the attributes */
551 	VATTR_NULL(vap);
552 
553 	/* next do all the common fields */
554 	vap->va_type = ap->a_vp->v_type;
555 	vap->va_mode = pfs->pfs_mode;
556 	vap->va_fileid = pfs->pfs_fileno;
557 	vap->va_flags = 0;
558 	vap->va_blocksize = PAGE_SIZE;
559 	vap->va_bytes = vap->va_size = 0;
560 	vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0];
561 
562 	/*
563 	 * Make all times be current TOD.
564 	 * It would be possible to get the process start
565 	 * time from the p_stat structure, but there's
566 	 * no "file creation" time stamp anyway, and the
567 	 * p_stat structure is not addressible if u. gets
568 	 * swapped out for that process.
569 	 */
570 	vfs_timestamp(&vap->va_ctime);
571 	vap->va_atime = vap->va_mtime = vap->va_ctime;
572 
573 	/*
574 	 * If the process has exercised some setuid or setgid
575 	 * privilege, then rip away read/write permission so
576 	 * that only root can gain access.
577 	 */
578 	switch (pfs->pfs_type) {
579 	case Pctl:
580 	case Pregs:
581 	case Pfpregs:
582 	case Pdbregs:
583 	case Pmem:
584 		if (procp->p_flags & P_SUGID) {
585 			vap->va_mode &= ~((VREAD|VWRITE)|
586 					  ((VREAD|VWRITE)>>3)|
587 					  ((VREAD|VWRITE)>>6));
588 		}
589 		break;
590 	default:
591 		break;
592 	}
593 
594 	/*
595 	 * now do the object specific fields
596 	 *
597 	 * The size could be set from struct reg, but it's hardly
598 	 * worth the trouble, and it puts some (potentially) machine
599 	 * dependent data into this machine-independent code.  If it
600 	 * becomes important then this function should break out into
601 	 * a per-file stat function in the corresponding .c file.
602 	 */
603 
604 	vap->va_nlink = 1;
605 	if (procp) {
606 		if (procp->p_ucred) {
607 			vap->va_uid = procp->p_ucred->cr_uid;
608 			vap->va_gid = procp->p_ucred->cr_gid;
609 		} else {
610 			vap->va_uid = -1;
611 			vap->va_gid = -1;
612 		}
613 	}
614 
615 	switch (pfs->pfs_type) {
616 	case Proot:
617 		/*
618 		 * Set nlink to 1 to tell fts(3) we don't actually know.
619 		 */
620 		vap->va_nlink = 1;
621 		vap->va_uid = 0;
622 		vap->va_gid = 0;
623 		vap->va_size = vap->va_bytes = DEV_BSIZE;
624 		break;
625 
626 	case Pcurproc: {
627 		char buf[16];		/* should be enough */
628 
629 		vap->va_uid = 0;
630 		vap->va_gid = 0;
631 		vap->va_size = ksnprintf(buf, sizeof(buf),
632 					 "%ld", (long)curproc->p_pid);
633 		vap->va_bytes = vap->va_size;
634 		break;
635 	}
636 
637 	case Pproc:
638 		vap->va_nlink = nproc_targets;
639 		vap->va_size = vap->va_bytes = DEV_BSIZE;
640 		break;
641 
642 	case Pfile: {
643 		char *fullpath, *freepath;
644 
645 		if (procp->p_textnch.ncp) {
646 			struct nchandle nch;
647 
648 			cache_copy(&procp->p_textnch, &nch);
649 			error = cache_fullpath(procp, &nch, NULL,
650 					       &fullpath, &freepath, 0);
651 			cache_drop(&nch);
652 		} else {
653 			error = EINVAL;
654 		}
655 
656 		if (error == 0) {
657 			vap->va_size = strlen(fullpath);
658 			kfree(freepath, M_TEMP);
659 		} else {
660 			vap->va_size = sizeof("unknown") - 1;
661 			error = 0;
662 		}
663 		vap->va_bytes = vap->va_size;
664 		break;
665 	}
666 
667 	case Pmem:
668 		/*
669 		 * If we denied owner access earlier, then we have to
670 		 * change the owner to root - otherwise 'ps' and friends
671 		 * will break even though they are setgid kmem. *SIGH*
672 		 */
673 		if (procp->p_flags & P_SUGID)
674 			vap->va_uid = 0;
675 		else if (procp->p_ucred)
676 			vap->va_uid = procp->p_ucred->cr_uid;
677 		else
678 			vap->va_uid = -1;
679 		break;
680 
681 	case Pregs:
682 		vap->va_bytes = vap->va_size = sizeof(struct reg);
683 		break;
684 
685 	case Pfpregs:
686 		vap->va_bytes = vap->va_size = sizeof(struct fpreg);
687 		break;
688 
689 	case Pdbregs:
690 		vap->va_bytes = vap->va_size = sizeof(struct dbreg);
691 		break;
692 
693 	case Ptype:
694 	case Pmap:
695 	case Pctl:
696 	case Pstatus:
697 	case Pnote:
698 	case Pnotepg:
699 	case Pcmdline:
700 	case Prlimit:
701 		break;
702 
703 	default:
704 		panic("procfs_getattr");
705 	}
706 done:
707 	pfs_pdone(procp);
708 	return (error);
709 }
710 
711 /*
712  * procfs_setattr(struct vnode *a_vp, struct vattr *a_vap,
713  *		  struct ucred *a_cred)
714  */
715 static int
716 procfs_setattr(struct vop_setattr_args *ap)
717 {
718 	if (ap->a_vap->va_flags != VNOVAL)
719 		return (EOPNOTSUPP);
720 
721 	/*
722 	 * just fake out attribute setting
723 	 * it's not good to generate an error
724 	 * return, otherwise things like creat()
725 	 * will fail when they try to set the
726 	 * file length to 0.  worse, this means
727 	 * that echo $note > /proc/$pid/note will fail.
728 	 */
729 
730 	return (0);
731 }
732 
733 /*
734  * implement access checking.
735  *
736  * procfs_access(struct vnode *a_vp, int a_mode, struct ucred *a_cred)
737  */
738 static int
739 procfs_access(struct vop_access_args *ap)
740 {
741 	struct vattr vattr;
742 	int error;
743 
744 	error = VOP_GETATTR(ap->a_vp, &vattr);
745 	if (!error)
746 		error = vop_helper_access(ap, vattr.va_uid, vattr.va_gid,
747 					  vattr.va_mode, 0);
748 	return (error);
749 }
750 
751 /*
752  * lookup.  this is incredibly complicated in the general case, however
753  * for most pseudo-filesystems very little needs to be done.
754  *
755  * procfs_lookup(struct vnode *a_dvp, struct vnode **a_vpp,
756  *		 struct componentname *a_cnp)
757  */
758 static int
759 procfs_lookup(struct vop_old_lookup_args *ap)
760 {
761 	struct componentname *cnp = ap->a_cnp;
762 	struct vnode **vpp = ap->a_vpp;
763 	struct vnode *dvp = ap->a_dvp;
764 	char *pname = cnp->cn_nameptr;
765 	/* struct proc *curp = cnp->cn_proc; */
766 	struct proc_target *pt;
767 	pid_t pid;
768 	struct pfsnode *pfs;
769 	struct proc *p;
770 	struct lwp *lp;
771 	int i;
772 	int error;
773 
774 	*vpp = NULL;
775 
776 	if (cnp->cn_nameiop == NAMEI_DELETE || cnp->cn_nameiop == NAMEI_RENAME)
777 		return (EROFS);
778 
779 	p = NULL;
780 	error = 0;
781 	if (cnp->cn_namelen == 1 && *pname == '.') {
782 		*vpp = dvp;
783 		vref(*vpp);
784 		goto out;
785 	}
786 
787 	pfs = VTOPFS(dvp);
788 	switch (pfs->pfs_type) {
789 	case Proot:
790 		if (cnp->cn_flags & CNP_ISDOTDOT)
791 			return (EIO);
792 
793 		if (CNEQ(cnp, "curproc", 7) || CNEQ(cnp, "self", 4)) {
794 			error = procfs_allocvp(dvp->v_mount, vpp, 0, Pcurproc);
795 			goto out;
796 		}
797 
798 		pid = atopid(pname, cnp->cn_namelen);
799 		if (pid == NO_PID)
800 			break;
801 
802 		p = pfs_pfind(pid);
803 		if (p == NULL)
804 			break;
805 
806 		if (!PRISON_CHECK(ap->a_cnp->cn_cred, p->p_ucred))
807 			break;
808 
809 		if (ps_showallprocs == 0 && ap->a_cnp->cn_cred->cr_uid != 0 &&
810 		    ap->a_cnp->cn_cred->cr_uid != p->p_ucred->cr_uid)
811 			break;
812 
813 		error = procfs_allocvp(dvp->v_mount, vpp, pid, Pproc);
814 		goto out;
815 
816 	case Pproc:
817 		if (cnp->cn_flags & CNP_ISDOTDOT) {
818 			error = procfs_root(dvp->v_mount, vpp);
819 			goto out;
820 		}
821 
822 		p = pfs_pfind(pfs->pfs_pid);
823 		if (p == NULL)
824 			break;
825 		/* XXX lwp */
826 		lp = FIRST_LWP_IN_PROC(p);
827 		if (lp == NULL)
828 			break;
829 
830 		if (!PRISON_CHECK(ap->a_cnp->cn_cred, p->p_ucred))
831 			break;
832 
833 		if (ps_showallprocs == 0 && ap->a_cnp->cn_cred->cr_uid != 0 &&
834 		    ap->a_cnp->cn_cred->cr_uid != p->p_ucred->cr_uid)
835 			break;
836 
837 		for (pt = proc_targets, i = 0; i < nproc_targets; pt++, i++) {
838 			if (cnp->cn_namelen == pt->pt_namlen &&
839 			    bcmp(pt->pt_name, pname, cnp->cn_namelen) == 0 &&
840 			    (pt->pt_valid == NULL || (*pt->pt_valid)(lp)))
841 				goto found;
842 		}
843 		break;
844 	found:
845 		error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
846 				       pt->pt_pfstype);
847 		goto out;
848 
849 	default:
850 		error = ENOTDIR;
851 		goto out;
852 	}
853 	if (cnp->cn_nameiop == NAMEI_LOOKUP)
854 		error = ENOENT;
855 	else
856 		error = EROFS;
857 	/*
858 	 * If no error occured *vpp will hold a referenced locked vnode.
859 	 * dvp was passed to us locked and *vpp must be returned locked.
860 	 * If *vpp != dvp then we should unlock dvp if (1) this is not the
861 	 * last component or (2) CNP_LOCKPARENT is not set.
862 	 */
863 out:
864 	if (error == 0 && *vpp != dvp) {
865 		if ((cnp->cn_flags & CNP_LOCKPARENT) == 0) {
866 			cnp->cn_flags |= CNP_PDIRUNLOCK;
867 			vn_unlock(dvp);
868 		}
869 	}
870 	pfs_pdone(p);
871 	return (error);
872 }
873 
874 /*
875  * Does this process have a text file?
876  */
877 int
878 procfs_validfile(struct lwp *lp)
879 {
880 	return (procfs_findtextvp(lp->lwp_proc) != NULLVP);
881 }
882 
883 /*
884  * readdir() returns directory entries from pfsnode (vp).
885  *
886  * We generate just one directory entry at a time, as it would probably
887  * not pay off to buffer several entries locally to save uiomove calls.
888  *
889  * procfs_readdir(struct vnode *a_vp, struct uio *a_uio, struct ucred *a_cred,
890  *		  int *a_eofflag, int *a_ncookies, off_t **a_cookies)
891  */
892 static int
893 procfs_readdir(struct vop_readdir_args *ap)
894 {
895 	struct pfsnode *pfs;
896 	int error;
897 
898 	if (ap->a_uio->uio_offset < 0 || ap->a_uio->uio_offset > INT_MAX)
899 		return (EINVAL);
900 	error = vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY | LK_FAILRECLAIM);
901 	if (error)
902 		return (error);
903 
904 	pfs = VTOPFS(ap->a_vp);
905 	switch (pfs->pfs_type) {
906 	case Pproc:
907 		/*
908 		 * this is for the process-specific sub-directories.
909 		 * all that is needed to is copy out all the entries
910 		 * from the procent[] table (top of this file).
911 		 */
912 		error = procfs_readdir_proc(ap);
913 		break;
914 	case Proot:
915 		/*
916 		 * this is for the root of the procfs filesystem
917 		 * what is needed is a special entry for "curproc"
918 		 * followed by an entry for each process on allproc
919 		 */
920 		error = procfs_readdir_root(ap);
921 		break;
922 	default:
923 		error = ENOTDIR;
924 		break;
925 	}
926 
927 	vn_unlock(ap->a_vp);
928 	return (error);
929 }
930 
931 static int
932 procfs_readdir_proc(struct vop_readdir_args *ap)
933 {
934 	struct pfsnode *pfs;
935 	int error, i, retval;
936 	struct proc *p;
937 	struct lwp *lp;
938 	struct proc_target *pt;
939 	struct uio *uio = ap->a_uio;
940 
941 	pfs = VTOPFS(ap->a_vp);
942 	p = pfs_pfind(pfs->pfs_pid);
943 	if (p == NULL)
944 		return(0);
945 	if (!PRISON_CHECK(ap->a_cred, p->p_ucred)) {
946 		error = 0;
947 		goto done;
948 	}
949 	/* XXX lwp, not MPSAFE */
950 	lp = FIRST_LWP_IN_PROC(p);
951 	if (lp == NULL) {
952 		error = EINVAL;
953 		goto done;
954 	}
955 
956 	error = 0;
957 	i = (int)uio->uio_offset;
958 	if (i < 0) {
959 		error = EINVAL;
960 		goto done;
961 	}
962 
963 	for (pt = &proc_targets[i];
964 	     !error && uio->uio_resid > 0 && i < nproc_targets; pt++, i++) {
965 		if (pt->pt_valid && (*pt->pt_valid)(lp) == 0)
966 			continue;
967 
968 		retval = vop_write_dirent(&error, uio,
969 		    PROCFS_FILENO(pfs->pfs_pid, pt->pt_pfstype), pt->pt_type,
970 		    pt->pt_namlen, pt->pt_name);
971 		if (retval)
972 			break;
973 	}
974 
975 	uio->uio_offset = (off_t)i;
976 	error = 0;
977 done:
978 	pfs_pdone(p);
979 	return error;
980 }
981 
/*
 * State shared between procfs_readdir_root() and its per-entry
 * callback while the root directory is being produced.
 */
struct procfs_readdir_root_info {
	struct uio *uio;	/* destination of the directory entries */
	struct ucred *cred;	/* credentials of the reader */
	int error;		/* set by vop_write_dirent() */
	int i;			/* directory offset, as an entry index */
	int pcnt;		/* entries considered so far in this pass */
};

static int procfs_readdir_root_callback(struct proc *p, void *data);
991 
992 static int
993 procfs_readdir_root(struct vop_readdir_args *ap)
994 {
995 	struct procfs_readdir_root_info info;
996 	struct uio *uio = ap->a_uio;
997 	int res;
998 
999 	res = 0;
1000 	info.error = 0;
1001 	info.i = (int)uio->uio_offset;
1002 
1003 	if (info.i < 0)
1004 		return (EINVAL);
1005 
1006 	info.pcnt = 0;
1007 	info.uio = uio;
1008 	info.cred = ap->a_cred;
1009 	while (info.pcnt < 4) {
1010 		res = procfs_readdir_root_callback(NULL, &info);
1011 		if (res < 0)
1012 			break;
1013 	}
1014 	if (res >= 0)
1015 		allproc_scan(procfs_readdir_root_callback, &info, 0);
1016 	uio->uio_offset = (off_t)info.i;
1017 
1018 	return (info.error);
1019 }
1020 
1021 static int
1022 procfs_readdir_root_callback(struct proc *p, void *data)
1023 {
1024 	struct procfs_readdir_root_info *info = data;
1025 	struct uio *uio;
1026 	int retval;
1027 	ino_t d_ino;
1028 	const char *d_name;
1029 	char d_name_pid[20];
1030 	size_t d_namlen;
1031 	uint8_t d_type;
1032 
1033 	uio = info->uio;
1034 
1035 	if (uio->uio_resid <= 0 || info->error)
1036 		return(-1);
1037 
1038 	switch (info->pcnt) {
1039 	case 0:		/* `.' */
1040 		d_ino = PROCFS_FILENO(0, Proot);
1041 		d_name = ".";
1042 		d_namlen = 1;
1043 		d_type = DT_DIR;
1044 		break;
1045 	case 1:		/* `..' */
1046 		d_ino = PROCFS_FILENO(0, Proot);
1047 		d_name = "..";
1048 		d_namlen = 2;
1049 		d_type = DT_DIR;
1050 		break;
1051 
1052 	case 2:
1053 		d_ino = PROCFS_FILENO(0, Pcurproc);
1054 		d_namlen = 7;
1055 		d_name = "curproc";
1056 		d_type = DT_LNK;
1057 		break;
1058 
1059 	case 3:
1060 		d_ino = PROCFS_FILENO(0, Pcurproc);
1061 		d_namlen = 4;
1062 		d_name = "self";
1063 		d_type = DT_LNK;
1064 		break;
1065 
1066 	default:
1067 		if (!PRISON_CHECK(info->cred, p->p_ucred))
1068 			return(0);
1069 		if (ps_showallprocs == 0 &&
1070 		    info->cred->cr_uid != 0 &&
1071 		    info->cred->cr_uid != p->p_ucred->cr_uid) {
1072 			return(0);
1073 		}
1074 
1075 		/*
1076 		 * Skip entries we have already returned (optimization)
1077 		 */
1078 		if (info->pcnt < info->i) {
1079 			++info->pcnt;
1080 			return(0);
1081 		}
1082 
1083 		d_ino = PROCFS_FILENO(p->p_pid, Pproc);
1084 		d_namlen = ksnprintf(d_name_pid, sizeof(d_name_pid),
1085 				     "%ld", (long)p->p_pid);
1086 		d_name = d_name_pid;
1087 		d_type = DT_DIR;
1088 		break;
1089 	}
1090 
1091 	/*
1092 	 * Skip entries we have already returned (optimization)
1093 	 */
1094 	if (info->pcnt < info->i) {
1095 		++info->pcnt;
1096 		return(0);
1097 	}
1098 
1099 	retval = vop_write_dirent(&info->error, uio,
1100 				  d_ino, d_type, d_namlen, d_name);
1101 	if (retval)
1102 		return(-1);
1103 	++info->pcnt;
1104 	++info->i;
1105 	return(0);
1106 }
1107 
1108 /*
1109  * readlink reads the link of `curproc' or `file'
1110  */
1111 static int
1112 procfs_readlink(struct vop_readlink_args *ap)
1113 {
1114 	char buf[16];		/* should be enough */
1115 	struct proc *procp;
1116 	struct vnode *vp = ap->a_vp;
1117 	struct pfsnode *pfs = VTOPFS(vp);
1118 	char *fullpath, *freepath;
1119 	int error, len;
1120 
1121 	switch (pfs->pfs_type) {
1122 	case Pcurproc:
1123 		if (pfs->pfs_fileno != PROCFS_FILENO(0, Pcurproc))
1124 			return (EINVAL);
1125 
1126 		len = ksnprintf(buf, sizeof(buf), "%ld", (long)curproc->p_pid);
1127 		return (uiomove(buf, len, ap->a_uio));
1128 	case Pfile:
1129 		/*
1130 		 * procfs's directory topology is somewhat asynchronous from
1131 		 * reality so it is possible for pid requests to race exiting
1132 		 * processes.  In this situation, bit 31 is set in
1133 		 * pfs->pfs_pid which guarantees that pfs_pfind() will return
1134 		 * NULL.
1135 		 *
1136 		 * It is also possible to catch a process in the middle of
1137 		 * an exit sequence so various fields might wind up being
1138 		 * NULL that are not normally NULL.
1139 		 */
1140 		procp = pfs_pfind(pfs->pfs_pid);
1141 		if (procp == NULL || procp->p_ucred == NULL) {
1142 			pfs_pdone(procp);
1143 			return (uiomove("unknown", sizeof("unknown") - 1,
1144 					ap->a_uio));
1145 		}
1146 		if (procp->p_textnch.ncp) {
1147 			struct nchandle nch;
1148 
1149 			cache_copy(&procp->p_textnch, &nch);
1150 			error = cache_fullpath(procp, &nch, NULL,
1151 					       &fullpath, &freepath, 0);
1152 			cache_drop(&nch);
1153 		} else {
1154 			error = EINVAL;
1155 		}
1156 
1157 		if (error != 0) {
1158 			pfs_pdone(procp);
1159 			return (uiomove("unknown", sizeof("unknown") - 1,
1160 					ap->a_uio));
1161 		}
1162 		error = uiomove(fullpath, strlen(fullpath), ap->a_uio);
1163 		kfree(freepath, M_TEMP);
1164 		pfs_pdone(procp);
1165 		return (error);
1166 	default:
1167 		return (EINVAL);
1168 	}
1169 }
1170 
1171 /*
1172  * convert decimal ascii to pid_t
1173  */
1174 static pid_t
1175 atopid(const char *b, u_int len)
1176 {
1177 	pid_t p = 0;
1178 
1179 	while (len--) {
1180 		char c = *b++;
1181 		if (c < '0' || c > '9')
1182 			return (NO_PID);
1183 		p = 10 * p + (c - '0');
1184 		if (p > PID_MAX)
1185 			return (NO_PID);
1186 	}
1187 
1188 	return (p);
1189 }
1190 
1191 /*
1192  * kqfilter operations
1193  */
1194 static void
1195 procfs_filt_detach(struct knote *kn)
1196 {
1197 	struct vnode *vp = (void *)kn->kn_hook;
1198 
1199 	knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
1200 }
1201 
1202 static int
1203 procfs_filt_read(struct knote *kn, long hint)
1204 {
1205 	if (hint == NOTE_REVOKE) {
1206 		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
1207 		return (1);
1208 	}
1209 
1210 	/* Files on procfs have a size of 0. */
1211 	kn->kn_data = 0;
1212 	if (kn->kn_sfflags & NOTE_OLDAPI)
1213 		return (1);
1214 	return (kn->kn_data != 0);
1215 }
1216 
1217 static int
1218 procfs_filt_write(struct knote *kn, long hint)
1219 {
1220 	if (hint == NOTE_REVOKE)
1221 		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
1222 	kn->kn_data = 0;
1223 	return (1);
1224 }
1225 
1226 static int
1227 procfs_filt_vnode(struct knote *kn, long hint)
1228 {
1229 	if (kn->kn_sfflags & hint)
1230 		kn->kn_fflags |= hint;
1231 	if (hint == NOTE_REVOKE) {
1232 		kn->kn_flags |= (EV_EOF | EV_NODATA);
1233 		return (1);
1234 	}
1235 	return (kn->kn_fflags != 0);
1236 }
1237 
1238 static struct filterops procfs_read_filtops = {
1239 	FILTEROP_ISFD | FILTEROP_MPSAFE, NULL,
1240 	procfs_filt_detach, procfs_filt_read,
1241 };
1242 static struct filterops procfs_write_filtops = {
1243 	FILTEROP_ISFD | FILTEROP_MPSAFE, NULL,
1244 	procfs_filt_detach, procfs_filt_write,
1245 };
1246 static struct filterops procfs_vnode_filtops = {
1247 	FILTEROP_ISFD | FILTEROP_MPSAFE, NULL,
1248 	procfs_filt_detach, procfs_filt_vnode,
1249 };
1250 
1251 static int
1252 procfs_kqfilter(struct vop_kqfilter_args *ap)
1253 {
1254 	struct vnode *vp = ap->a_vp;
1255 	struct knote *kn = ap->a_kn;
1256 
1257 	switch (kn->kn_filter) {
1258 	case EVFILT_READ:
1259 		kn->kn_fop = &procfs_read_filtops;
1260 		break;
1261 	case EVFILT_WRITE:
1262 		kn->kn_fop = &procfs_write_filtops;
1263 		break;
1264 	case EVFILT_VNODE:
1265 		kn->kn_fop = &procfs_vnode_filtops;
1266 		break;
1267 	default:
1268 		return (EOPNOTSUPP);
1269 	}
1270 
1271 	kn->kn_hook = (caddr_t)vp;
1272 	knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
1273 
1274 	return (0);
1275 }
1276