xref: /minix3/minix/servers/mib/proc.c (revision d2532d3d42d764c9ef9816851cdb17eda7e08d36)
1 /* MIB service - proc.c - functionality based on service process tables */
2 /* Eventually, the CTL_PROC subtree might end up here as well. */
3 
4 #include "mib.h"
5 
6 #include <sys/exec.h>
7 #include <minix/sysinfo.h>
8 
9 #include <machine/archtypes.h>
10 #include "kernel/proc.h"
11 #include "servers/pm/mproc.h"
12 #include "servers/vfs/const.h"
13 #include "servers/vfs/fproc.h"
14 
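/*
 * The ixfer_ prefix marks these types for identity state transfer during
 * live update: the instrumentation copies such data to the new instance
 * as-is, without attempting any pointer adjustment.
 */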
15 typedef struct proc ixfer_proc_t;
16 typedef struct mproc ixfer_mproc_t;
17 typedef struct fproc ixfer_fproc_t;
18 
19 static ixfer_proc_t proc_tab[NR_TASKS + NR_PROCS];
20 static ixfer_mproc_t mproc_tab[NR_PROCS];
21 static ixfer_fproc_t fproc_tab[NR_PROCS];
22 
23 /*
24  * The number of processes added to the current number of processes when doing
25  * a size estimation, so that the actual data retrieval does not end up with
26  * too little space if new processes have forked between the two calls.  We do
27  * a process table update only once per clock tick, which means that typically
28  * no update will take place between the user process's size estimation request
29  * and its subsequent data retrieval request.  On the other hand, if we do
30  * update process tables in between, quite a bit might have changed.
31  */
32 #define EXTRA_PROCS	8
33 
34 #define HASH_SLOTS 	(NR_PROCS / 4)	/* expected nr. of processes in use */
35 #define NO_SLOT		(-1)
36 static int hash_tab[HASH_SLOTS];	/* hash table mapping from PID.. */
37 static int hnext_tab[NR_PROCS];		/* ..to PM process slot */
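/*
 * Example: with NR_PROCS = 256 there are 64 hash slots, so PIDs 1 and 65
 * both map to slot 1.  hash_tab[pid % HASH_SLOTS] yields the first PM slot
 * in a bucket, and hnext_tab[] chains any further slots in the same bucket,
 * ending in NO_SLOT (see get_mslot below).
 */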
38 
39 static clock_t tabs_updated = 0;	/* when the tables were last updated */
40 static int tabs_valid = TRUE;		/* FALSE if obtaining tables failed */
41 
42 /*
43  * Update the process tables by pulling in new copies from the kernel, PM, and
44  * VFS, but only every so often and only if it has not failed before.  Return
45  * TRUE iff the tables are now valid.
46  */
47 static int
48 update_tables(void)
49 {
50 	clock_t now;
51 	pid_t pid;
52 	int r, kslot, mslot, hslot;
53 
54 	/*
55 	 * If retrieving the tables failed at some point, do not keep trying
56 	 * all the time.  Such a failure is very unlikely to be transient.
57 	 */
58 	if (tabs_valid == FALSE)
59 		return FALSE;
60 
61 	/*
62 	 * Update the tables once per clock tick at most.  The update operation
63 	 * is rather heavy, transferring several hundreds of kilobytes between
64 	 * servers.  Userland should be able to live with information that is
65 	 * outdated by at most one clock tick.
66 	 */
67 	now = getticks();
68 
69 	if (tabs_updated != 0 && tabs_updated == now)
70 		return TRUE;
71 
72 	/* Perform an actual update now. */
73 	tabs_valid = FALSE;
74 
75 	/* Retrieve and check the kernel process table. */
76 	if ((r = sys_getproctab(proc_tab)) != OK) {
77 		printf("MIB: unable to obtain kernel process table (%d)\n", r);
78 
79 		return FALSE;
80 	}
81 
82 	for (kslot = 0; kslot < NR_TASKS + NR_PROCS; kslot++) {
83 		if (proc_tab[kslot].p_magic != PMAGIC) {
84 			printf("MIB: kernel process table mismatch\n");
85 
86 			return FALSE;
87 		}
88 	}
89 
90 	/* Retrieve and check the PM process table. */
91 	r = getsysinfo(PM_PROC_NR, SI_PROC_TAB, mproc_tab, sizeof(mproc_tab));
92 	if (r != OK) {
93 		printf("MIB: unable to obtain PM process table (%d)\n", r);
94 
95 		return FALSE;
96 	}
97 
98 	for (mslot = 0; mslot < NR_PROCS; mslot++) {
99 		if (mproc_tab[mslot].mp_magic != MP_MAGIC) {
100 			printf("MIB: PM process table mismatch\n");
101 
102 			return FALSE;
103 		}
104 	}
105 
106 	/* Retrieve the VFS process table, which has no magic number. */
107 	r = getsysinfo(VFS_PROC_NR, SI_PROC_TAB, fproc_tab, sizeof(fproc_tab));
108 	if (r != OK) {
109 		printf("MIB: unable to obtain VFS process table (%d)\n", r);
110 
111 		return FALSE;
112 	}
113 
114 	tabs_valid = TRUE;
115 	tabs_updated = now;
116 
117 	/*
118 	 * Build a hash table mapping from process IDs to slot numbers, for
119 	 * fast access.  TODO: decide if this is better done on demand only.
120 	 */
121 	for (hslot = 0; hslot < HASH_SLOTS; hslot++)
122 		hash_tab[hslot] = NO_SLOT;
123 
124 	for (mslot = 0; mslot < NR_PROCS; mslot++) {
125 		if (mproc_tab[mslot].mp_flags & IN_USE) {
126 			if ((pid = mproc_tab[mslot].mp_pid) <= 0)
127 				continue;
128 
129 			hslot = pid % HASH_SLOTS;
130 
131 			hnext_tab[mslot] = hash_tab[hslot];
132 			hash_tab[hslot] = mslot;
133 		}
134 	}
135 
136 	return TRUE;
137 }
138 
139 /*
140  * Return the PM slot number for the given PID, or NO_SLOT if the PID is not in
141  * use by a process.
142  */
143 static int
144 get_mslot(pid_t pid)
145 {
146 	int mslot;
147 
148 	/* PID 0 identifies the kernel; checking this is up to the caller. */
149 	if (pid <= 0)
150 		return NO_SLOT;
151 
152 	for (mslot = hash_tab[pid % HASH_SLOTS]; mslot != NO_SLOT;
153 	    mslot = hnext_tab[mslot])
154 		if (mproc_tab[mslot].mp_pid == pid)
155 			break;
156 
157 	return mslot;
158 }
159 
160 /*
161  * Store the given number of clock ticks as a timeval structure.
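 * For example, with hz = 100, 250 ticks yield tv_sec = 2, tv_usec = 500000.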
162  */
163 static void
164 ticks_to_timeval(struct timeval * tv, clock_t ticks)
165 {
166 	clock_t hz;
167 
168 	hz = sys_hz();
169 
170 	tv->tv_sec = ticks / hz;
171 	tv->tv_usec = (long)((ticks % hz) * 1000000ULL / hz);
172 }
173 
174 /*
175  * Generate a wchan message text for the cases that the process is blocked on
176  * IPC with another process, of which the endpoint is given as 'endpt' here.
177  * The name of the other process is to be stored in 'wmesg', which is a buffer
178  * of size 'wmsz'.  The result should be null terminated.  If 'ipc' is set, the
179  * process is blocked on a direct IPC call, in which case the name of the other
180  * process is enclosed in parentheses.  If 'ipc' is not set, the call is made
181  * indirectly through VFS, and the name of the other process should not be
182  * enclosed in parentheses.  If no name can be obtained, we use the endpoint of
183  * the other process instead.
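 * For example, a process blocked directly on a call to VFS gets "(vfs)",
 * while a process blocked within VFS on a driver named tty gets "tty".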
184  */
185 static void
186 fill_wmesg(char * wmesg, size_t wmsz, endpoint_t endpt, int ipc)
187 {
188 	const char *name;
189 	int mslot;
190 
191 	switch (endpt) {
192 	case ANY:
193 		name = "any";
194 		break;
195 	case SELF:
196 		name = "self";
197 		break;
198 	case NONE:
199 		name = "none";
200 		break;
201 	default:
202 		mslot = _ENDPOINT_P(endpt);
203 		if (mslot >= -NR_TASKS && mslot < NR_PROCS &&
204 		    (mslot < 0 || (mproc_tab[mslot].mp_flags & IN_USE)))
205 			name = proc_tab[NR_TASKS + mslot].p_name;
206 		else
207 			name = NULL;
208 	}
209 
210 	if (name != NULL)
211 		snprintf(wmesg, wmsz, "%s%s%s",
212 		    ipc ? "(" : "", name, ipc ? ")" : "");
213 	else
214 		snprintf(wmesg, wmsz, "%s%d%s",
215 		    ipc ? "(" : "", endpt, ipc ? ")" : "");
216 }
217 
218 /*
219  * Return the LWP status of a process, along with additional information in
220  * case the process is sleeping (LSSLEEP): a wchan value and text to indicate
221  * what the process is sleeping on, and possibly a flag field modification to
222  * indicate that the sleep is interruptible.
223  */
224 static int
225 get_lwp_stat(int mslot, uint64_t * wcptr, char * wmptr, size_t wmsz,
226 	int32_t * flag)
227 {
228 	struct mproc *mp;
229 	struct fproc *fp;
230 	struct proc *kp;
231 	const char *wmesg;
232 	uint64_t wchan;
233 	endpoint_t endpt;
234 
235 	mp = &mproc_tab[mslot];
236 	fp = &fproc_tab[mslot];
237 	kp = &proc_tab[NR_TASKS + mslot];
238 
239 	/*
240 	 * First cover all the cases that the process is not sleeping.  In
241 	 * those cases, we need not return additional sleep information either.
242 	 */
243 	if (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE))
244 		return LSZOMB;
245 
246 	if (mp->mp_flags & EXITING)
247 		return LSDEAD;
248 
249 	if ((mp->mp_flags & TRACE_STOPPED) || RTS_ISSET(kp, RTS_P_STOP))
250 		return LSSTOP;
251 
252 	if (proc_is_runnable(kp))
253 		return LSRUN;
254 
255 	/*
256 	 * The process is sleeping.  In that case, we must also figure out why,
257 	 * and return an appropriate wchan value and human-readable wmesg text.
258 	 *
259 	 * The process can be blocked on either a known sleep state in PM or
260 	 * VFS, or otherwise on IPC communication with another process, or
261 	 * otherwise on a kernel RTS flag.  In each case, decide what to use as
262 	 * wchan value and wmesg text, and whether the sleep is interruptible.
263 	 *
264 	 * The wchan value should be unique for the sleep reason.  We use its
265 	 * lower eight bits to indicate a class:
266 	 *   0x00 = kernel task
267 	 *   0x01 = kernel RTS block
268 	 *   0x02 = PM call
269 	 *   0x03 = VFS call
270 	 *   0x04 = MIB call
271 	 *   0xff = blocked on process
272 	 * The upper bits are used for class-specific information.  The actual
273 	 * value does not really matter, as long as it is nonzero and there is
274 	 * no overlap between the different values.
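	 *
	 * For example, the PM sleep states below produce the values 0x102
	 * ("wait") and 0x202 ("pause"): class 0x02 in the low eight bits and
	 * a PM-specific state number in the upper bits.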
275 	 */
276 	wchan = 0;
277 	wmesg = NULL;
278 
279 	/*
280 	 * First see if the process is marked as blocked in the tables of PM or
281 	 * VFS.  Such a block reason is always an interruptible sleep.  Note
282 	 * that we do not use the kernel table at all in this case: each of the
283 	 * three tables is consistent within itself, but not necessarily
284 	 * consistent with any of the other tables, so we avoid internal
285 	 * mismatches if we can.
286 	 */
287 	if (mp->mp_flags & WAITING) {
288 		wchan = 0x102;
289 		wmesg = "wait";
290 	} else if (mp->mp_flags & SIGSUSPENDED) {
291 		wchan = 0x202;
292 		wmesg = "pause";
293 	} else if (fp->fp_blocked_on != FP_BLOCKED_ON_NONE) {
294 		wchan = (fp->fp_blocked_on << 8) | 0x03;
295 		switch (fp->fp_blocked_on) {
296 		case FP_BLOCKED_ON_PIPE:
297 			wmesg = "pipe";
298 			break;
299 		case FP_BLOCKED_ON_LOCK:
300 			wmesg = "lock";
301 			break;
302 		case FP_BLOCKED_ON_POPEN:
303 			wmesg = "popen";
304 			break;
305 		case FP_BLOCKED_ON_SELECT:
306 			wmesg = "select";
307 			break;
308 		case FP_BLOCKED_ON_OTHER:
309 			/*
310 			 * Add the task (= character driver) endpoint to the
311 			 * wchan value, and use the driver's process name,
312 			 * without parentheses, as wmesg text.
313 			 */
314 			wchan |= (uint64_t)fp->fp_task << 16;
315 			fill_wmesg(wmptr, wmsz, fp->fp_task, FALSE /*ipc*/);
316 			break;
317 		default:
318 			/* A newly added flag we don't yet know about? */
319 			wmesg = "???";
320 			break;
321 		}
322 	}
323 	if (wchan != 0) {
324 		*wcptr = wchan;
325 		if (wmesg != NULL) /* NULL means "already set" here */
326 			strlcpy(wmptr, wmesg, wmsz);
327 		*flag |= L_SINTR;

		return LSSLEEP;
328 	}
329 
330 	/*
331 	 * See if the process is blocked on sending or receiving.  If not, then
332 	 * use one of the kernel RTS flags as reason.
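	 * P_BLOCKEDON yields NONE if the process is not blocked on IPC, and
	 * otherwise the endpoint that the process is blocked sending to or
	 * receiving from, which may be ANY in the case of an open receive.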
333 	 */
334 	endpt = P_BLOCKEDON(kp);
335 
336 	switch (endpt) {
337 	case MIB_PROC_NR:
338 		/* This is really just aesthetics. */
339 		wchan = 0x04;
340 		wmesg = "sysctl";
341 		break;
342 	case NONE:
343 		/*
344 		 * The process is not running, but also not blocked on IPC with
345 		 * another process.  This means it must be stopped on a kernel
346 		 * RTS flag.
347 		 */
348 		wchan = ((uint64_t)kp->p_rts_flags << 8) | 0x01;
349 		if (RTS_ISSET(kp, RTS_PROC_STOP))
350 			wmesg = "kstop";
351 		else if (RTS_ISSET(kp, RTS_SIGNALED) ||
352 		    RTS_ISSET(kp, RTS_SIG_PENDING))
353 			wmesg = "ksignal";
354 		else if (RTS_ISSET(kp, RTS_NO_PRIV))
355 			wmesg = "knopriv";
356 		else if (RTS_ISSET(kp, RTS_PAGEFAULT) ||
357 		    RTS_ISSET(kp, RTS_VMREQTARGET))
358 			wmesg = "fault";
359 		else if (RTS_ISSET(kp, RTS_NO_QUANTUM))
360 			wmesg = "sched";
361 		else
362 			wmesg = "kflag";
363 		break;
364 	case ANY:
365 		/*
366 		 * If the process is blocked receiving from ANY, mark it as
367 		 * being in an interruptible sleep.  This looks nicer, even
368 		 * though "interruptible" is not applicable to services at all.
369 		 */
370 		*flag |= L_SINTR;
371 		break;
372 	}
373 
374 	/*
375 	 * If at this point wchan is still zero, the process is blocked sending
376 	 * or receiving.  Use a wchan value based on the target endpoint, and
377 	 * use "(procname)" as wmesg text.
378 	 */
379 	if (wchan == 0) {
380 		*wcptr = ((uint64_t)endpt << 8) | 0xff;
381 		fill_wmesg(wmptr, wmsz, endpt, TRUE /*ipc*/);
382 	} else {
383 		*wcptr = wchan;
384 		if (wmesg != NULL) /* NULL means "already set" here */
385 			strlcpy(wmptr, wmesg, wmsz);
386 	}
387 
388 	return LSSLEEP;
389 }
390 
391 
392 /*
393  * Fill the part of a LWP structure that is common between kernel tasks and
394  * user processes.  Also return a CPU estimate in 'estcpu', because we generate
395  * the value as a side effect here, and the LWP structure has no estcpu field.
396  */
397 static void
398 fill_lwp_common(struct kinfo_lwp * l, int kslot, uint32_t * estcpu)
399 {
400 	struct proc *kp;
401 	struct timeval tv;
402 	clock_t uptime;
403 	uint32_t hz;
404 
405 	kp = &proc_tab[kslot];
406 
407 	uptime = getticks();
408 	hz = sys_hz();
409 
410 	/*
411 	 * We use the process endpoint as the LWP ID.  Not only does this allow
412 	 * users to obtain process endpoints with "ps -s" (thus replacing the
413 	 * MINIX3 ps(1)'s "ps -E"), but if we ever do implement kernel threads,
414 	 * this is probably still going to be accurate.
415 	 */
416 	l->l_lid = kp->p_endpoint;
417 
418 	/*
419 	 * The time during which the process has not been swapped in or out is
420 	 * not applicable for us, and thus, we set it to the time the process
421 	 * has been running (in seconds).  This value is relevant mostly for
422 	 * ps(1)'s CPU usage correction for processes that have just started.
423 	 */
424 	if (kslot >= NR_TASKS)
425 		l->l_swtime = uptime - mproc_tab[kslot - NR_TASKS].mp_started;
426 	else
427 		l->l_swtime = uptime;
428 	l->l_swtime /= hz;
429 
430 	/*
431 	 * Sleep (dequeue) times are not maintained for kernel tasks, so
432 	 * pretend they are never asleep (which is pretty accurate).
433 	 */
434 	if (kslot < NR_TASKS)
435 		l->l_slptime = 0;
436 	else
437 		l->l_slptime = (uptime - kp->p_dequeued) / hz;
438 
439 	l->l_priority = kp->p_priority;
440 	l->l_usrpri = kp->p_priority;
441 	l->l_cpuid = kp->p_cpu;
442 	ticks_to_timeval(&tv, kp->p_user_time + kp->p_sys_time);
443 	l->l_rtime_sec = tv.tv_sec;
444 	l->l_rtime_usec = tv.tv_usec;
445 
446 	/*
447 	 * Obtain CPU usage percentages and estimates through library code
448 	 * shared between the kernel and this service; see its source for
449 	 * details.  We note that the produced estcpu value is rather different
450 	 * from the one produced by NetBSD, but this should not be a problem.
451 	 */
452 	l->l_pctcpu = cpuavg_getstats(&kp->p_cpuavg, &l->l_cpticks, estcpu,
453 	    uptime, hz);
454 }
455 
456 /*
457  * Fill a LWP structure for a kernel task.  Each kernel task has its own LWP,
458  * and all of them have negative PIDs.
459  */
460 static void
461 fill_lwp_kern(struct kinfo_lwp * l, int kslot)
462 {
463 	uint32_t estcpu;
464 
465 	memset(l, 0, sizeof(*l));
466 
467 	l->l_flag = L_INMEM | L_SINTR | L_SYSTEM;
468 	l->l_stat = LSSLEEP;
469 	l->l_pid = kslot - NR_TASKS;
470 
471 	/*
472 	 * When showing LWP entries, ps(1) uses the process name rather than
473 	 * the LWP name.  All kernel tasks are therefore shown as "[kernel]"
474 	 * anyway.  We use the wmesg field to show the actual kernel task name.
475 	 */
476 	l->l_wchan = ((uint64_t)(l->l_pid) << 8) | 0x00;
477 	strlcpy(l->l_wmesg, proc_tab[kslot].p_name, sizeof(l->l_wmesg));
478 	strlcpy(l->l_name, "kernel", sizeof(l->l_name));
479 
480 	fill_lwp_common(l, kslot, &estcpu);
481 }
482 
483 /*
484  * Fill a LWP structure for a user process.
485  */
486 static void
487 fill_lwp_user(struct kinfo_lwp * l, int mslot)
488 {
489 	struct mproc *mp;
490 	uint32_t estcpu;
491 
492 	memset(l, 0, sizeof(*l));
493 
494 	mp = &mproc_tab[mslot];
495 
496 	l->l_flag = L_INMEM;
497 	l->l_stat = get_lwp_stat(mslot, &l->l_wchan, l->l_wmesg,
498 	    sizeof(l->l_wmesg), &l->l_flag);
499 	l->l_pid = mp->mp_pid;
500 	strlcpy(l->l_name, mp->mp_name, sizeof(l->l_name));
501 
502 	fill_lwp_common(l, NR_TASKS + mslot, &estcpu);
503 }
504 
505 /*
506  * Implementation of CTL_KERN KERN_LWP.
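 *
 * A rough sketch of a matching userland query, using the NetBSD sysctl(3)
 * interface (with caller-provided 'pid', 'buf', and element count 'n'):
 *
 *	int mib[5] = { CTL_KERN, KERN_LWP, pid,
 *	    sizeof(struct kinfo_lwp), n };
 *	size_t len = n * sizeof(struct kinfo_lwp);
 *	r = sysctl(mib, 5, buf, &len, NULL, 0);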
507  */
508 ssize_t
509 mib_kern_lwp(struct mib_call * call, struct mib_node * node __unused,
510 	struct mib_oldp * oldp, struct mib_newp * newp __unused)
511 {
512 	struct kinfo_lwp lwp;
513 	struct mproc *mp;
514 	size_t copysz;
515 	ssize_t off;
516 	pid_t pid;
517 	int r, elsz, elmax, kslot, mslot, last_mslot;
518 
519 	if (call->call_namelen != 3)
520 		return EINVAL;
521 
522 	pid = (pid_t)call->call_name[0];
523 	elsz = call->call_name[1];
524 	elmax = call->call_name[2]; /* redundant with the given oldlen.. */
525 
526 	if (pid < -1 || elsz <= 0 || elmax < 0)
527 		return EINVAL;
528 
529 	if (!update_tables())
530 		return EINVAL;
531 
532 	off = 0;
533 	copysz = MIN((size_t)elsz, sizeof(lwp));
534 
535 	/*
536 	 * We model kernel tasks as LWP threads of the kernel (with PID 0).
537 	 * Modeling the kernel tasks as processes with negative PIDs, like
538 	 * ProcFS does, conflicts with the KERN_LWP API here: a PID of -1
539 	 * indicates that the caller wants a full listing of LWPs.
540 	 */
541 	if (pid <= 0) {
542 		for (kslot = 0; kslot < NR_TASKS; kslot++) {
543 			if (mib_inrange(oldp, off) && elmax > 0) {
544 				fill_lwp_kern(&lwp, kslot);
545 				if ((r = mib_copyout(oldp, off, &lwp,
546 				    copysz)) < 0)
547 					return r;
548 				elmax--;
549 			}
550 			off += elsz;
551 		}
552 
553 		/* No need to add extra space here: NR_TASKS is static. */
554 		if (pid == 0)
555 			return off;
556 	}
557 
558 	/*
559 	 * With PID 0 out of the way: the user requested the LWP for either a
560 	 * specific user process (pid > 0), or for all processes (pid < 0).
561 	 */
562 	if (pid > 0) {
563 		if ((mslot = get_mslot(pid)) == NO_SLOT ||
564 		    (mproc_tab[mslot].mp_flags & (TRACE_ZOMBIE | ZOMBIE)))
565 			return ESRCH;
566 		last_mslot = mslot;
567 	} else {
568 		mslot = 0;
569 		last_mslot = NR_PROCS - 1;
570 	}
571 
572 	for (; mslot <= last_mslot; mslot++) {
573 		mp = &mproc_tab[mslot];
574 
575 		if ((mp->mp_flags & (IN_USE | TRACE_ZOMBIE | ZOMBIE)) !=
576 		    IN_USE)
577 			continue;
578 
579 		if (mib_inrange(oldp, off) && elmax > 0) {
580 			fill_lwp_user(&lwp, mslot);
581 			if ((r = mib_copyout(oldp, off, &lwp, copysz)) < 0)
582 				return r;
583 			elmax--;
584 		}
585 		off += elsz;
586 	}
587 
588 	if (oldp == NULL && pid < 0)
589 		off += EXTRA_PROCS * elsz;
590 
591 	return off;
592 }
593 
594 
595 /*
596  * Fill the part of a process structure that is common between kernel tasks and
597  * user processes.
598  */
599 static void
600 fill_proc2_common(struct kinfo_proc2 * p, int kslot)
601 {
602 	struct vm_usage_info vui;
603 	struct timeval tv;
604 	struct proc *kp;
605 	struct kinfo_lwp l;
606 
607 	kp = &proc_tab[kslot];
608 
609 	/*
610 	 * Much of the information in the LWP structure also ends up in the
611 	 * process structure.  In order to avoid duplication of some important
612 	 * code, first generate LWP values and then copy it them into the
613 	 * process structure.
614 	 */
615 	memset(&l, 0, sizeof(l));
616 	fill_lwp_common(&l, kslot, &p->p_estcpu);
617 
618 	/* Obtain memory usage information from VM.  Ignore failures. */
619 	memset(&vui, 0, sizeof(vui));
620 	(void)vm_info_usage(kp->p_endpoint, &vui);
621 
622 	ticks_to_timeval(&tv, kp->p_user_time + kp->p_sys_time);
623 	p->p_rtime_sec = tv.tv_sec;
624 	p->p_rtime_usec = tv.tv_usec;
625 	p->p_cpticks = l.l_cpticks;
626 	p->p_pctcpu = l.l_pctcpu;
627 	p->p_swtime = l.l_swtime;
628 	p->p_slptime = l.l_slptime;
629 	p->p_uticks = kp->p_user_time;
630 	p->p_sticks = kp->p_sys_time;
631 	/* TODO: p->p_iticks */
632 	ticks_to_timeval(&tv, kp->p_user_time);
633 	p->p_uutime_sec = tv.tv_sec;
634 	p->p_uutime_usec = tv.tv_usec;
635 	ticks_to_timeval(&tv, kp->p_sys_time);
636 	p->p_ustime_sec = tv.tv_sec;
637 	p->p_ustime_usec = tv.tv_usec;
638 
639 	p->p_priority = l.l_priority;
640 	p->p_usrpri = l.l_usrpri;
641 
642 	p->p_vm_rssize = howmany(vui.vui_total, PAGE_SIZE);
643 	p->p_vm_vsize = howmany(vui.vui_virtual, PAGE_SIZE);
644 	p->p_vm_msize = howmany(vui.vui_mvirtual, PAGE_SIZE);
645 
646 	p->p_uru_maxrss = vui.vui_maxrss;
647 	p->p_uru_minflt = vui.vui_minflt;
648 	p->p_uru_majflt = vui.vui_majflt;
649 
650 	p->p_cpuid = l.l_cpuid;
651 }
652 
653 /*
654  * Fill a process structure for the kernel pseudo-process (with PID 0).
655  */
656 static void
657 fill_proc2_kern(struct kinfo_proc2 * p)
658 {
659 
660 	memset(p, 0, sizeof(*p));
661 
662 	p->p_flag = L_INMEM | L_SYSTEM | L_SINTR;
663 	p->p_pid = 0;
664 	p->p_stat = LSSLEEP;
665 	p->p_nice = NZERO;
666 
667 	/* Use the KERNEL task wchan, for consistency between ps and top. */
668 	p->p_wchan = ((uint64_t)KERNEL << 8) | 0x00;
669 	strlcpy(p->p_wmesg, "kernel", sizeof(p->p_wmesg));
670 
671 	strlcpy(p->p_comm, "kernel", sizeof(p->p_comm));
672 	p->p_realflag = P_INMEM | P_SYSTEM | P_SINTR;
673 	p->p_realstat = SACTIVE;
674 	p->p_nlwps = NR_TASKS;
675 
676 	/*
677 	 * By using the KERNEL slot here, the kernel process will get a proper
678 	 * CPU usage average.
679 	 */
680 	fill_proc2_common(p, KERNEL + NR_TASKS);
681 }
682 
683 /*
684  * Fill a process structure for a user process.
685  */
686 static void
687 fill_proc2_user(struct kinfo_proc2 * p, int mslot)
688 {
689 	struct mproc *mp;
690 	struct fproc *fp;
691 	time_t boottime;
692 	dev_t tty;
693 	struct timeval tv;
694 	int i, r, kslot, zombie;
695 
696 	memset(p, 0, sizeof(*p));
697 
698 	if ((r = getuptime(NULL, NULL, &boottime)) != OK)
699 		panic("getuptime failed: %d", r);
700 
701 	kslot = NR_TASKS + mslot;
702 	mp = &mproc_tab[mslot];
703 	fp = &fproc_tab[mslot];
704 
705 	zombie = (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE));
706 	tty = (!zombie) ? fp->fp_tty : NO_DEV;
707 
708 	p->p_eflag = 0;
709 	if (tty != NO_DEV)
710 		p->p_eflag |= EPROC_CTTY;
711 	if (mp->mp_pid == mp->mp_procgrp) /* TODO: job control support */
712 		p->p_eflag |= EPROC_SLEADER;
713 
714 	p->p_exitsig = SIGCHLD; /* TODO */
715 
716 	p->p_flag = P_INMEM;
717 	if (mp->mp_flags & TAINTED)
718 		p->p_flag |= P_SUGID;
719 	if (mp->mp_tracer != NO_TRACER)
720 		p->p_flag |= P_TRACED;
721 	if (tty != NO_DEV)
722 		p->p_flag |= P_CONTROLT;
723 	p->p_pid = mp->mp_pid;
724 	if (mp->mp_parent >= 0 && mp->mp_parent < NR_PROCS)
725 		p->p_ppid = mproc_tab[mp->mp_parent].mp_pid;
726 	p->p_sid = mp->mp_procgrp; /* TODO: job control support */
727 	p->p__pgid = mp->mp_procgrp;
728 	p->p_tpgid = (tty != NO_DEV) ? mp->mp_procgrp : 0;
729 	p->p_uid = mp->mp_effuid;
730 	p->p_ruid = mp->mp_realuid;
731 	p->p_gid = mp->mp_effgid;
732 	p->p_rgid = mp->mp_realgid;
733 	p->p_ngroups = MIN(mp->mp_ngroups, KI_NGROUPS);
734 	for (i = 0; i < p->p_ngroups; i++)
735 		p->p_groups[i] = mp->mp_sgroups[i];
736 	p->p_tdev = tty;
737 	memcpy(&p->p_siglist, &mp->mp_sigpending, sizeof(p->p_siglist));
738 	memcpy(&p->p_sigmask, &mp->mp_sigmask, sizeof(p->p_sigmask));
739 	memcpy(&p->p_sigcatch, &mp->mp_catch, sizeof(p->p_sigcatch));
740 	memcpy(&p->p_sigignore, &mp->mp_ignore, sizeof(p->p_sigignore));
741 	p->p_nice = mp->mp_nice + NZERO;
742 	strlcpy(p->p_comm, mp->mp_name, sizeof(p->p_comm));
743 	p->p_uvalid = 1;
744 	ticks_to_timeval(&tv, mp->mp_started);
745 	p->p_ustart_sec = boottime + tv.tv_sec;
746 	p->p_ustart_usec = tv.tv_usec;
747 	/* TODO: other rusage fields */
748 	ticks_to_timeval(&tv, mp->mp_child_utime + mp->mp_child_stime);
749 	p->p_uctime_sec = tv.tv_sec;
750 	p->p_uctime_usec = tv.tv_usec;
751 	p->p_realflag = p->p_flag;
752 	p->p_nlwps = (zombie) ? 0 : 1;
753 
754 	p->p_stat = get_lwp_stat(mslot, &p->p_wchan, p->p_wmesg,
755 	    sizeof(p->p_wmesg), &p->p_flag);
756 
757 	switch (p->p_stat) {
758 	case LSRUN:
759 		p->p_realstat = SACTIVE;
760 		p->p_nrlwps = 1;
761 		break;
762 	case LSSLEEP:
763 		p->p_realstat = SACTIVE;
764 		if (p->p_flag & L_SINTR)
765 			p->p_realflag |= P_SINTR;
766 		break;
767 	case LSSTOP:
768 		p->p_realstat = SSTOP;
769 		break;
770 	case LSZOMB:
771 		p->p_realstat = SZOMB;
772 		break;
773 	case LSDEAD:
774 		p->p_stat = LSZOMB; /* ps(1) STAT does not know LSDEAD */
775 		p->p_realstat = SDEAD;
776 		break;
777 	default:
778 		assert(0);
779 	}
780 
781 	if (!zombie)
782 		fill_proc2_common(p, kslot);
783 }
784 
785 /*
786  * Implementation of CTL_KERN KERN_PROC2.
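 *
 * A rough sketch of a matching userland query for all processes, using the
 * NetBSD sysctl(3) interface (with caller-provided 'buf' and count 'n'):
 *
 *	int mib[6] = { CTL_KERN, KERN_PROC2, KERN_PROC_ALL, 0,
 *	    sizeof(struct kinfo_proc2), n };
 *	size_t len = n * sizeof(struct kinfo_proc2);
 *	r = sysctl(mib, 6, buf, &len, NULL, 0);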
787  */
788 ssize_t
789 mib_kern_proc2(struct mib_call * call, struct mib_node * node __unused,
790 	struct mib_oldp * oldp, struct mib_newp * newp __unused)
791 {
792 	struct kinfo_proc2 proc2;
793 	struct mproc *mp;
794 	size_t copysz;
795 	ssize_t off;
796 	dev_t tty;
797 	int r, req, arg, elsz, elmax, kmatch, zombie, mslot;
798 
799 	if (call->call_namelen != 4)
800 		return EINVAL;
801 
802 	req = call->call_name[0];
803 	arg = call->call_name[1];
804 	elsz = call->call_name[2];
805 	elmax = call->call_name[3]; /* redundant with the given oldlen.. */
806 
807 	/*
808 	 * The kernel is special, in that it does not have a slot in the PM or
809 	 * VFS tables.  As such, it is dealt with separately.  While checking
810 	 * arguments, we might as well check whether the kernel is matched.
811 	 */
812 	switch (req) {
813 	case KERN_PROC_ALL:
814 		kmatch = TRUE;
815 		break;
816 	case KERN_PROC_PID:
817 	case KERN_PROC_SESSION:
818 	case KERN_PROC_PGRP:
819 	case KERN_PROC_UID:
820 	case KERN_PROC_RUID:
821 	case KERN_PROC_GID:
822 	case KERN_PROC_RGID:
823 		kmatch = (arg == 0);
824 		break;
825 	case KERN_PROC_TTY:
826 		kmatch = ((dev_t)arg == KERN_PROC_TTY_NODEV);
827 		break;
828 	default:
829 		return EINVAL;
830 	}
831 
832 	if (elsz <= 0 || elmax < 0)
833 		return EINVAL;
834 
835 	if (!update_tables())
836 		return EINVAL;
837 
838 	off = 0;
839 	copysz = MIN((size_t)elsz, sizeof(proc2));
840 
841 	if (kmatch) {
842 		if (mib_inrange(oldp, off) && elmax > 0) {
843 			fill_proc2_kern(&proc2);
844 			if ((r = mib_copyout(oldp, off, &proc2, copysz)) < 0)
845 				return r;
846 			elmax--;
847 		}
848 		off += elsz;
849 	}
850 
851 	for (mslot = 0; mslot < NR_PROCS; mslot++) {
852 		mp = &mproc_tab[mslot];
853 
854 		if (!(mp->mp_flags & IN_USE))
855 			continue;
856 
857 		switch (req) {
858 		case KERN_PROC_PID:
859 			if ((pid_t)arg != mp->mp_pid)
860 				continue;
861 			break;
862 		case KERN_PROC_SESSION: /* TODO: job control support */
863 		case KERN_PROC_PGRP:
864 			if ((pid_t)arg != mp->mp_procgrp)
865 				continue;
866 			break;
867 		case KERN_PROC_TTY:
868 			if ((dev_t)arg == KERN_PROC_TTY_REVOKE)
869 				continue; /* TODO: revoke(2) support */
870 			/* Do not access the fproc_tab slot of zombies. */
871 			zombie = (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE));
872 			tty = (!zombie) ? fproc_tab[mslot].fp_tty : NO_DEV;
873 			if ((dev_t)arg == KERN_PROC_TTY_NODEV) {
874 				if (tty != NO_DEV)
875 					continue;
876 			} else if ((dev_t)arg == NO_DEV || (dev_t)arg != tty)
877 				continue;
878 			break;
879 		case KERN_PROC_UID:
880 			if ((uid_t)arg != mp->mp_effuid)
881 				continue;
882 			break;
883 		case KERN_PROC_RUID:
884 			if ((uid_t)arg != mp->mp_realuid)
885 				continue;
886 			break;
887 		case KERN_PROC_GID:
888 			if ((gid_t)arg != mp->mp_effgid)
889 				continue;
890 			break;
891 		case KERN_PROC_RGID:
892 			if ((gid_t)arg != mp->mp_realgid)
893 				continue;
894 			break;
895 		}
896 
897 		if (mib_inrange(oldp, off) && elmax > 0) {
898 			fill_proc2_user(&proc2, mslot);
899 			if ((r = mib_copyout(oldp, off, &proc2, copysz)) < 0)
900 				return r;
901 			elmax--;
902 		}
903 		off += elsz;
904 	}
905 
906 	if (oldp == NULL && req != KERN_PROC_PID)
907 		off += EXTRA_PROCS * elsz;
908 
909 	return off;
910 }
911 
912 /*
913  * Implementation of CTL_KERN KERN_PROC_ARGS.
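 *
 * A rough sketch of a matching userland query for the argument vector of a
 * process 'pid', using the NetBSD sysctl(3) interface:
 *
 *	int mib[4] = { CTL_KERN, KERN_PROC_ARGS, pid, KERN_PROC_ARGV };
 *	size_t len = sizeof(buf);
 *	r = sysctl(mib, 4, buf, &len, NULL, 0);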
914  */
915 ssize_t
916 mib_kern_proc_args(struct mib_call * call, struct mib_node * node __unused,
917 	struct mib_oldp * oldp, struct mib_newp * newp __unused)
918 {
919 	char vbuf[PAGE_SIZE], sbuf[PAGE_SIZE], obuf[PAGE_SIZE];
920 	struct ps_strings pss;
921 	struct mproc *mp;
922 	char *buf, *p, *q, *pptr;
923 	vir_bytes vaddr, vpage, spage, paddr, ppage;
924 	size_t max, off, olen, oleft, oldlen, bytes, pleft;
925 	unsigned int copybudget;
926 	pid_t pid;
927 	int req, mslot, count, aborted, ended;
928 	ssize_t r;
929 
930 	if (call->call_namelen != 2)
931 		return EINVAL;
932 
933 	pid = call->call_name[0];
934 	req = call->call_name[1];
935 
936 	switch (req) {
937 	case KERN_PROC_ARGV:
938 	case KERN_PROC_ENV:
939 	case KERN_PROC_NARGV:
940 	case KERN_PROC_NENV:
941 		break;
942 	default:
943 		return EOPNOTSUPP;
944 	}
945 
946 	if (!update_tables())
947 		return EINVAL;
948 
949 	if ((mslot = get_mslot(pid)) == NO_SLOT)
950 		return ESRCH;
951 	mp = &mproc_tab[mslot];
952 	if (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE))
953 		return ESRCH;
954 
955 	/* We can return the count field size without copying in any data. */
956 	if (oldp == NULL && (req == KERN_PROC_NARGV || req == KERN_PROC_NENV))
957 		return sizeof(count);
958 
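	/*
	 * The ps_strings structure sits at the very end of the exec frame,
	 * whose address and length PM recorded at exec time.  Copy it in to
	 * obtain the argument and environment vectors and their counts.
	 */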
959 	if (sys_datacopy(mp->mp_endpoint,
960 	    mp->mp_frame_addr + mp->mp_frame_len - sizeof(pss),
961 	    SELF, (vir_bytes)&pss, sizeof(pss)) != OK)
962 		return EINVAL;
963 
964 	/*
965 	 * Determine the upper size limit of the requested data.  Not only may
966 	 * the size never exceed ARG_MAX, it may also not exceed the frame
967 	 * length as given in its original exec call.  In fact, the frame
968 	 * length should be substantially larger: all strings for both the
969 	 * arguments and the environment are in there, along with other stuff,
970 	 * and there must be no overlap between strings.  It is possible that
971 	 * the application called setproctitle(3), in which case the ps_strings
972 	 * pointers refer to data outside the frame altogether.  However, this
973 	 * data should not exceed 2048 bytes, and we cover this by rounding up
974 	 * the frame length to a multiple of the page size.  Anyhow, NetBSD
975 	 * blindly returns ARG_MAX when asked for a size estimate, so with this
976 	 * maximum we are already quite a bit more accurate.
977 	 */
978 	max = roundup(MIN(mp->mp_frame_len, ARG_MAX), PAGE_SIZE);
979 
980 	switch (req) {
981 	case KERN_PROC_NARGV:
982 		count = pss.ps_nargvstr;
983 		return mib_copyout(oldp, 0, &count, sizeof(count));
984 	case KERN_PROC_NENV:
985 		count = pss.ps_nenvstr;
986 		return mib_copyout(oldp, 0, &count, sizeof(count));
987 	case KERN_PROC_ARGV:
988 		if (oldp == NULL)
989 			return max;
990 		vaddr = (vir_bytes)pss.ps_argvstr;
991 		count = pss.ps_nargvstr;
992 		break;
993 	case KERN_PROC_ENV:
994 		if (oldp == NULL)
995 			return max;
996 		vaddr = (vir_bytes)pss.ps_envstr;
997 		count = pss.ps_nenvstr;
998 		break;
999 	}
1000 
1001 	/*
1002 	 * Go through the strings.  Copy in entire, machine-aligned pages at
1003 	 * once, in the hope that all data is stored consecutively, which it
1004 	 * should be: we expect that the vector is followed by the strings, and
1005 	 * that the strings are stored in order of vector reference.  We keep
1006 	 * up to two pages with copied-in data: one for the vector, and
1007 	 * optionally one for string data.  In addition, we keep one page with
1008 	 * data to be copied out, so that we do not cause a lot of copy
1009 	 * overhead for short strings.
1010 	 *
1011 	 * We stop whenever any of the following conditions are met:
1012 	 * - copying in data from the target process fails for any reason;
1013 	 * - we have processed the last index ('count') into the vector;
1014 	 * - the current vector element is a NULL pointer;
1015 	 * - the requested number of output bytes ('oldlen') has been reached;
1016 	 * - the maximum number of output bytes ('max') has been reached;
1017 	 * - the number of page copy-ins exceeds an estimated threshold;
1018 	 * - copying out data fails for any reason (we then return the error).
1019 	 *
1020 	 * We limit the number of page copy-ins because otherwise a rogue
1021 	 * process could create an argument vector consisting of only two-byte
1022 	 * strings that all span two pages, causing us to copy up to 1GB of
1023 	 * data with the current ARG_MAX value of 256K.  No reasonable vector
1024 	 * should cause more than (ARG_MAX / PAGE_SIZE) page copies for
1025 	 * strings; we are nice enough to allow twice that.  Vector copies do
1026 	 * not count, as they are linear anyway.
1027 	 *
1028 	 * Unlike every other sysctl(2) call, we are supposed to truncate the
1029 	 * resulting size (the returned 'oldlen') to the requested size (the
1030 	 * given 'oldlen') *and* return the resulting size, rather than ENOMEM
1031 	 * and the real size.  Unfortunately, libkvm actually relies on this.
1032 	 *
1033 	 * Generally speaking, upon failure we just return a truncated result.
1034 	 * In case of truncation, the data we copy out need not be null
1035 	 * terminated.  It is up to userland to process the data correctly.
1036 	 */
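	/* Reject a vector pointer in the NULL page or one that is misaligned. */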
1037 	if (trunc_page(vaddr) == 0 || vaddr % sizeof(char *) != 0)
1038 		return 0;
1039 
1040 	off = 0;
1041 	olen = 0;
1042 	aborted = FALSE;
1043 
1044 	oldlen = mib_getoldlen(oldp);
1045 	if (oldlen > max)
1046 		oldlen = max;
1047 
1048 	copybudget = (ARG_MAX / PAGE_SIZE) * 2;
1049 
1050 	vpage = 0;
1051 	spage = 0;
1052 
1053 	while (count > 0 && off + olen < oldlen && !aborted) {
1054 		/*
1055 		 * Start by fetching the page containing the current vector
1056 		 * element, if needed.  We could limit the fetch to the vector
1057 		 * size, but our hope is that for the simple cases, the strings
1058 		 * are on the remainder of the same page, so we save a copy
1059 		 * call.  TODO: since the strings should follow the vector, we
1060 		 * could start the copy at the base of the vector.
1061 		 */
1062 		if (trunc_page(vaddr) != vpage) {
1063 			vpage = trunc_page(vaddr);
1064 			if (sys_datacopy(mp->mp_endpoint, vpage, SELF,
1065 			    (vir_bytes)vbuf, PAGE_SIZE) != OK)
1066 				break;
1067 		}
1068 
1069 		/* Get the current vector element, pointing to a string. */
1070 		memcpy(&pptr, &vbuf[vaddr - vpage], sizeof(pptr));
1071 		paddr = (vir_bytes)pptr;
1072 		ppage = trunc_page(paddr);
1073 		if (ppage == 0)
1074 			break;
1075 
1076 		/* Fetch the string itself, one page at a time at most. */
1077 		do {
1078 			/*
1079 			 * See if the string pointer falls inside either the
1080 			 * vector page or the previously fetched string page
1081 			 * (if any).  If not, fetch a string page.
1082 			 */
1083 			if (ppage == vpage) {
1084 				buf = vbuf;
1085 			} else if (ppage == spage) {
1086 				buf = sbuf;
1087 			} else {
1088 				if (--copybudget == 0) {
1089 					aborted = TRUE;
1090 					break;
1091 				}
1092 				spage = ppage;
1093 				if (sys_datacopy(mp->mp_endpoint, spage, SELF,
1094 				    (vir_bytes)sbuf, PAGE_SIZE) != OK) {
1095 					aborted = TRUE;
1096 					break;
1097 				}
1098 				buf = sbuf;
1099 			}
1100 
1101 			/*
1102 			 * We now have a string fragment in a buffer.  See if
1103 			 * the string is null terminated.  If not, all the data
1104 			 * up to the buffer end is part of the string, and the
1105 			 * string continues on the next page.
1106 			 */
1107 			p = &buf[paddr - ppage];
1108 			pleft = PAGE_SIZE - (paddr - ppage);
1109 			assert(pleft > 0);
1110 
1111 			if ((q = memchr(p, '\0', pleft)) != NULL) {
1112 				bytes = (size_t)(q - p + 1);
1113 				assert(bytes <= pleft);
1114 				ended = TRUE;
1115 			} else {
1116 				bytes = pleft;
1117 				ended = FALSE;
1118 			}
1119 
1120 			/* Limit the result to the requested length. */
1121 			if (off + olen + bytes > oldlen)
1122 				bytes = oldlen - off - olen;
1123 
1124 			/*
1125 			 * Add 'bytes' bytes from string pointer 'p' to the
1126 			 * output buffer, copying out its contents to userland
1127 			 * if it has filled up.
1128 			 */
1129 			if (olen + bytes > sizeof(obuf)) {
1130 				oleft = sizeof(obuf) - olen;
1131 				memcpy(&obuf[olen], p, oleft);
1132 
1133 				if ((r = mib_copyout(oldp, off, obuf,
1134 				    sizeof(obuf))) < 0)
1135 					return r;
1136 				off += sizeof(obuf);
1137 				olen = 0;
1138 
1139 				p += oleft;
1140 				bytes -= oleft;
1141 			}
1142 			if (bytes > 0) {
1143 				memcpy(&obuf[olen], p, bytes);
1144 				olen += bytes;
1145 			}
1146 
1147 			/*
1148 			 * Continue as long as we have not yet found the string
1149 			 * end, and we have not yet filled the output buffer.
1150 			 */
1151 			paddr += pleft;
1152 			assert(trunc_page(paddr) == paddr);
1153 			ppage = paddr;
1154 		} while (!ended && off + olen < oldlen);
1155 
1156 		vaddr += sizeof(char *);
1157 		count--;
1158 	}
1159 
1160 	/* Copy out any remainder of the output buffer. */
1161 	if (olen > 0) {
1162 		if ((r = mib_copyout(oldp, off, obuf, olen)) < 0)
1163 			return r;
1164 		off += olen;
1165 	}
1166 
1167 	assert(off <= oldlen);
1168 	return off;
1169 }
1170