xref: /onnv-gate/usr/src/cmd/rcap/rcapd/rcapd_main.c (revision 442:d1b9362cc59a)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
50Sstevel@tonic-gate  * Common Development and Distribution License, Version 1.0 only
60Sstevel@tonic-gate  * (the "License").  You may not use this file except in compliance
70Sstevel@tonic-gate  * with the License.
80Sstevel@tonic-gate  *
90Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
100Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
110Sstevel@tonic-gate  * See the License for the specific language governing permissions
120Sstevel@tonic-gate  * and limitations under the License.
130Sstevel@tonic-gate  *
140Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
150Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
160Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
170Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
180Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
190Sstevel@tonic-gate  *
200Sstevel@tonic-gate  * CDDL HEADER END
210Sstevel@tonic-gate  */
220Sstevel@tonic-gate /*
23*442Sgm149974  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
240Sstevel@tonic-gate  * Use is subject to license terms.
250Sstevel@tonic-gate  */
260Sstevel@tonic-gate 
270Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
280Sstevel@tonic-gate 
290Sstevel@tonic-gate /*
300Sstevel@tonic-gate  * rcapd is a long-running daemon enforcing project-based resource caps (see
310Sstevel@tonic-gate  * rcapd(1M)).  Each instance of a process aggregate (project or, generically,
320Sstevel@tonic-gate  * "collection") may have a memory cap.  A single thread monitors the resource
330Sstevel@tonic-gate  * utilization of capped collections, enforces caps when they are exceeded (and
340Sstevel@tonic-gate  * other conditions are met), and incorporates changes in configuration or
350Sstevel@tonic-gate  * caps.  Each of these actions occurs not more frequently than the rate
360Sstevel@tonic-gate  * specified with rcapadm(1M).
370Sstevel@tonic-gate  */
380Sstevel@tonic-gate 
390Sstevel@tonic-gate #include <sys/priocntl.h>
400Sstevel@tonic-gate #include <sys/proc.h>
410Sstevel@tonic-gate #include <sys/resource.h>
420Sstevel@tonic-gate #include <sys/sysinfo.h>
430Sstevel@tonic-gate #include <sys/stat.h>
440Sstevel@tonic-gate #include <sys/sysmacros.h>
450Sstevel@tonic-gate #include <sys/time.h>
460Sstevel@tonic-gate #include <sys/types.h>
470Sstevel@tonic-gate #include <dirent.h>
480Sstevel@tonic-gate #include <errno.h>
490Sstevel@tonic-gate #include <fcntl.h>
500Sstevel@tonic-gate #include <kstat.h>
510Sstevel@tonic-gate #include <libintl.h>
520Sstevel@tonic-gate #include <limits.h>
530Sstevel@tonic-gate #include <locale.h>
540Sstevel@tonic-gate #include <priv.h>
550Sstevel@tonic-gate #include <signal.h>
560Sstevel@tonic-gate #include <stdarg.h>
570Sstevel@tonic-gate #include <stdio.h>
580Sstevel@tonic-gate #include <stdlib.h>
590Sstevel@tonic-gate #include <strings.h>
600Sstevel@tonic-gate #include <time.h>
610Sstevel@tonic-gate #include <unistd.h>
620Sstevel@tonic-gate #include <zone.h>
630Sstevel@tonic-gate #include <assert.h>
640Sstevel@tonic-gate #include "rcapd.h"
650Sstevel@tonic-gate #include "rcapd_mapping.h"
660Sstevel@tonic-gate #include "rcapd_rfd.h"
670Sstevel@tonic-gate #include "rcapd_stat.h"
680Sstevel@tonic-gate #include "utils.h"
690Sstevel@tonic-gate 
/*
 * Smaller of two values where a non-positive value means "unset": if either
 * argument is <= 0 the other one is returned unchanged.
 */
#define	POSITIVE_MIN(x, y) \
	(((x) <= 0) ? (y) : ((y) <= 0) ? (x) : MIN(x, y))

/*
 * Time (hrtime_t, nanoseconds) of an event due the given number of seconds
 * after base, or zero when seconds is not positive.  Both arguments are
 * parenthesized so that compound expressions may be passed safely.
 */
#define	NEXT_EVENT_TIME(base, seconds) \
	(((int)(seconds) > 0) ? \
	((base) + (hrtime_t)(seconds) * (hrtime_t)NANOSEC) : (hrtime_t)0)

/*
 * Time of the next report, or zero when no statistics file is configured.
 * Note that the base argument is not used: the interval is always measured
 * from the current gethrtime().
 */
#define	NEXT_REPORT_EVENT_TIME(base, seconds) \
	((rcfg.rcfg_stat_file[0] != 0) ?  \
	    NEXT_EVENT_TIME(gethrtime(), seconds) : (hrtime_t)0)

/*
 * True when the given time is past eventtime.  An eventtime of zero is never
 * due.
 */
#define	EVENT_TIME(time, eventtime) \
	(((time) > (eventtime)) && (eventtime) != 0)

#define	STAT_TEMPLATE_SUFFIX	".XXXXXX"	/* suffix of mkstemp() arg */
#define	DAEMON_UID		1		/* uid to use */
820Sstevel@tonic-gate 
/*
 * Argument passed to soft_scan_cb().  Each collection over its cap is scanned
 * for (its excess * ssa_scan_goal / ssa_sum_excess).
 */
struct soft_scan_arg {
	uint64_t ssa_sum_excess;	/* divisor of each collection's share */
	int64_t ssa_scan_goal;		/* amount shared out by excess */
};
typedef struct soft_scan_arg soft_scan_arg_t;
870Sstevel@tonic-gate 
880Sstevel@tonic-gate static int debug_mode = 0;		/* debug mode flag */
890Sstevel@tonic-gate static pid_t rcapd_pid;			/* rcapd's pid to ensure it's not */
900Sstevel@tonic-gate 					/* scanned */
910Sstevel@tonic-gate static kstat_ctl_t *kctl;		/* kstat chain */
920Sstevel@tonic-gate static uint64_t new_sp = 0, old_sp = 0;	/* measure delta in page scan count */
930Sstevel@tonic-gate static int enforce_caps = 0;		/* cap enforcement flag, dependent on */
940Sstevel@tonic-gate 					/* enforce_soft_caps and */
950Sstevel@tonic-gate 					/* global_scanner_running */
960Sstevel@tonic-gate static int enforce_soft_caps = 0;	/* soft cap enforcement flag, */
970Sstevel@tonic-gate 					/* depending on memory pressure */
980Sstevel@tonic-gate static int memory_pressure = 0;		/* physical memory utilization (%) */
990Sstevel@tonic-gate static int memory_pressure_sample = 0;	/* count of samples */
1000Sstevel@tonic-gate static int global_scanner_running = 0;	/* global scanning flag, to avoid */
1010Sstevel@tonic-gate 					/* interference with kernel's page */
1020Sstevel@tonic-gate 					/* scanner */
1030Sstevel@tonic-gate static hrtime_t next_report;		/* time of next report */
1040Sstevel@tonic-gate static int termination_signal = 0;	/* terminating signal */
1050Sstevel@tonic-gate 
1060Sstevel@tonic-gate rcfg_t rcfg;
1070Sstevel@tonic-gate 
1080Sstevel@tonic-gate /*
1090Sstevel@tonic-gate  * Flags.
1100Sstevel@tonic-gate  */
1110Sstevel@tonic-gate static int ever_ran;
1120Sstevel@tonic-gate int should_run;
1130Sstevel@tonic-gate static int should_reconfigure;
1140Sstevel@tonic-gate 
1150Sstevel@tonic-gate static int verify_statistics(void);
1160Sstevel@tonic-gate static int update_statistics(void);
1170Sstevel@tonic-gate 
1180Sstevel@tonic-gate /*
1190Sstevel@tonic-gate  * Checks if a process is marked 'system'.  Returns zero only when it is not.
1200Sstevel@tonic-gate  */
1210Sstevel@tonic-gate static int
1220Sstevel@tonic-gate proc_issystem(pid_t pid)
1230Sstevel@tonic-gate {
1240Sstevel@tonic-gate 	char pc_clname[PC_CLNMSZ];
1250Sstevel@tonic-gate 
1260Sstevel@tonic-gate 	if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
1270Sstevel@tonic-gate 	    PC_KY_NULL) != -1) {
1280Sstevel@tonic-gate 		return (strcmp(pc_clname, "SYS") == 0);
1290Sstevel@tonic-gate 	} else {
1300Sstevel@tonic-gate 		debug("cannot get class-specific scheduling parameters; "
1310Sstevel@tonic-gate 		    "assuming system process");
1320Sstevel@tonic-gate 		return (-1);
1330Sstevel@tonic-gate 	}
1340Sstevel@tonic-gate }
1350Sstevel@tonic-gate 
1360Sstevel@tonic-gate /*
1370Sstevel@tonic-gate  * fname is the process name, for debugging messages, and unscannable is a flag
1380Sstevel@tonic-gate  * indicating whether the process should be scanned.
1390Sstevel@tonic-gate  */
1400Sstevel@tonic-gate static void
1410Sstevel@tonic-gate lprocess_insert_mark(pid_t pid, id_t colid, char *fname, int unscannable)
1420Sstevel@tonic-gate {
1430Sstevel@tonic-gate 	lcollection_t *lcol;
1440Sstevel@tonic-gate 	lprocess_t *lproc;
1450Sstevel@tonic-gate 
1460Sstevel@tonic-gate 	if ((lcol = lcollection_find(colid)) == NULL)
1470Sstevel@tonic-gate 		return;
1480Sstevel@tonic-gate 
1490Sstevel@tonic-gate 	/*
1500Sstevel@tonic-gate 	 * If the process is already being tracked, update the unscannable flag,
1510Sstevel@tonic-gate 	 * as determined by the caller, from the process's psinfo.
1520Sstevel@tonic-gate 	 */
1530Sstevel@tonic-gate 	lproc = lcol->lcol_lprocess;
1540Sstevel@tonic-gate 	while (lproc != NULL) {
1550Sstevel@tonic-gate 		if (lproc->lpc_pid == pid) {
1560Sstevel@tonic-gate 			lproc->lpc_mark = 1;
1570Sstevel@tonic-gate 			if (unscannable != 0 && lproc->lpc_unscannable == 0) {
1580Sstevel@tonic-gate 				debug("process %d: became unscannable\n",
1590Sstevel@tonic-gate 				    (int)lproc->lpc_pid);
1600Sstevel@tonic-gate 				lproc->lpc_unscannable = 1;
1610Sstevel@tonic-gate 			}
1620Sstevel@tonic-gate 			return;
1630Sstevel@tonic-gate 		}
1640Sstevel@tonic-gate 		lproc = lproc->lpc_next;
1650Sstevel@tonic-gate 	}
1660Sstevel@tonic-gate 
1670Sstevel@tonic-gate 	/*
1680Sstevel@tonic-gate 	 * We've fallen off the list without finding our current process;
1690Sstevel@tonic-gate 	 * insert it at the list head.
1700Sstevel@tonic-gate 	 */
1710Sstevel@tonic-gate 	if ((lproc = malloc(sizeof (*lproc))) == NULL)
1720Sstevel@tonic-gate 		debug("insufficient memory to track new process %d", (int)pid);
1730Sstevel@tonic-gate 	else {
1740Sstevel@tonic-gate 		(void) bzero(lproc, sizeof (*lproc));
1750Sstevel@tonic-gate 		lproc->lpc_pid = pid;
1760Sstevel@tonic-gate 		lproc->lpc_mark = 1;
1770Sstevel@tonic-gate 		lproc->lpc_collection = lcol;
1780Sstevel@tonic-gate 		lproc->lpc_psinfo_fd = -1;
1790Sstevel@tonic-gate 		lproc->lpc_pgdata_fd = -1;
1800Sstevel@tonic-gate 		lproc->lpc_xmap_fd = -1;
1810Sstevel@tonic-gate 
1820Sstevel@tonic-gate 		/*
1830Sstevel@tonic-gate 		 * If the caller didn't flag this process as unscannable
1840Sstevel@tonic-gate 		 * already, do some more checking.
1850Sstevel@tonic-gate 		 */
1860Sstevel@tonic-gate 		lproc->lpc_unscannable = unscannable || proc_issystem(pid);
1870Sstevel@tonic-gate 
1880Sstevel@tonic-gate #ifdef DEBUG
1890Sstevel@tonic-gate 		/*
1900Sstevel@tonic-gate 		 * Verify the sanity of lprocess.  It should not contain the
1910Sstevel@tonic-gate 		 * process we are about to prepend.
1920Sstevel@tonic-gate 		 */
1930Sstevel@tonic-gate 		if (lcollection_member(lcol, lproc)) {
1940Sstevel@tonic-gate 			lprocess_t *cur = lcol->lcol_lprocess;
1950Sstevel@tonic-gate 			debug("The collection %lld already has these members, "
1960Sstevel@tonic-gate 			    "including me, %d!\n", (long long)lcol->lcol_id,
1970Sstevel@tonic-gate 			    (int)lproc->lpc_pid);
1980Sstevel@tonic-gate 			while (cur != NULL) {
1990Sstevel@tonic-gate 				debug("\t%d\n", (int)cur->lpc_pid);
2000Sstevel@tonic-gate 				cur = cur->lpc_next;
2010Sstevel@tonic-gate 			}
2020Sstevel@tonic-gate 			info(gettext("process already on lprocess\n"));
2030Sstevel@tonic-gate 			abort();
2040Sstevel@tonic-gate 		}
2050Sstevel@tonic-gate #endif /* DEBUG */
2060Sstevel@tonic-gate 		lproc->lpc_next = lcol->lcol_lprocess;
2070Sstevel@tonic-gate 		if (lproc->lpc_next != NULL)
2080Sstevel@tonic-gate 			lproc->lpc_next->lpc_prev = lproc;
2090Sstevel@tonic-gate 		lproc->lpc_prev = NULL;
2100Sstevel@tonic-gate 		lcol->lcol_lprocess = lproc;
2110Sstevel@tonic-gate 
2120Sstevel@tonic-gate 		debug("tracking %d %d %s%s\n", (int)colid, (int)pid, fname,
2130Sstevel@tonic-gate 		    (lproc->lpc_unscannable != 0) ? " (not scannable)" : "");
2140Sstevel@tonic-gate 		lcol->lcol_stat.lcols_proc_in++;
2150Sstevel@tonic-gate 	}
2160Sstevel@tonic-gate }
2170Sstevel@tonic-gate 
2180Sstevel@tonic-gate static int
2190Sstevel@tonic-gate list_walk_process_cb(lcollection_t *lcol, void *arg)
2200Sstevel@tonic-gate {
2210Sstevel@tonic-gate 	int (*cb)(lcollection_t *, lprocess_t *) =
2220Sstevel@tonic-gate 	    (int(*)(lcollection_t *, lprocess_t *))arg;
2230Sstevel@tonic-gate 	lprocess_t *member;
2240Sstevel@tonic-gate 	lprocess_t *next;
2250Sstevel@tonic-gate 
2260Sstevel@tonic-gate 	member = lcol->lcol_lprocess;
2270Sstevel@tonic-gate 	while (member != NULL) {
2280Sstevel@tonic-gate 		pid_t pid = member->lpc_pid;
2290Sstevel@tonic-gate 		next = member->lpc_next;
2300Sstevel@tonic-gate 
2310Sstevel@tonic-gate 		debug_high("list_walk_all lpc %d\n", (int)pid);
2320Sstevel@tonic-gate 		if (cb(lcol, member) != 0) {
2330Sstevel@tonic-gate 			debug_high("list_walk_all aborted at lpc %d\n",
2340Sstevel@tonic-gate 			    (int)pid);
2350Sstevel@tonic-gate 			return (1);
2360Sstevel@tonic-gate 		}
2370Sstevel@tonic-gate 		member = next;
2380Sstevel@tonic-gate 	}
2390Sstevel@tonic-gate 
2400Sstevel@tonic-gate 	return (0);
2410Sstevel@tonic-gate }
2420Sstevel@tonic-gate 
2430Sstevel@tonic-gate /*
2440Sstevel@tonic-gate  * Invoke the given callback for each process in each collection.  Callbacks
2450Sstevel@tonic-gate  * are allowed to change the linkage of the process on which they act.
2460Sstevel@tonic-gate  */
2470Sstevel@tonic-gate static void
2480Sstevel@tonic-gate list_walk_all(int (*cb)(lcollection_t *, lprocess_t *))
2490Sstevel@tonic-gate {
2500Sstevel@tonic-gate 	list_walk_collection(list_walk_process_cb, (void *)cb);
2510Sstevel@tonic-gate }
2520Sstevel@tonic-gate 
2530Sstevel@tonic-gate static void
2540Sstevel@tonic-gate revoke_psinfo(rfd_t *rfd)
2550Sstevel@tonic-gate {
2560Sstevel@tonic-gate 	lprocess_t *lpc = (lprocess_t *)rfd->rfd_data;
2570Sstevel@tonic-gate 
2580Sstevel@tonic-gate 	if (lpc != NULL) {
2590Sstevel@tonic-gate 		debug("revoking psinfo fd for process %d\n", (int)lpc->lpc_pid);
2600Sstevel@tonic-gate 		ASSERT(lpc->lpc_psinfo_fd != -1);
2610Sstevel@tonic-gate 		lpc->lpc_psinfo_fd = -1;
2620Sstevel@tonic-gate 	} else
2630Sstevel@tonic-gate 		debug("revoking psinfo fd for unknown process\n");
2640Sstevel@tonic-gate }
2650Sstevel@tonic-gate 
2660Sstevel@tonic-gate /*
2670Sstevel@tonic-gate  * Retrieve a process's psinfo via an already-opened or new file descriptor.
2680Sstevel@tonic-gate  * The supplied descriptor will be closed on failure.  An optional callback
2690Sstevel@tonic-gate  * will be invoked with the last descriptor tried, and a supplied callback
2700Sstevel@tonic-gate  * argument, as its arguments, such that the new descriptor may be cached, or
2710Sstevel@tonic-gate  * an old one may be invalidated.  If the result of the callback is zero, the
2720Sstevel@tonic-gate  * the caller is to assume responsibility for the file descriptor, to close it
2730Sstevel@tonic-gate  * with rfd_close().
2740Sstevel@tonic-gate  *
2750Sstevel@tonic-gate  * On failure, a nonzero value is returned.
2760Sstevel@tonic-gate  */
2770Sstevel@tonic-gate int
2780Sstevel@tonic-gate get_psinfo(pid_t pid, psinfo_t *psinfo, int cached_fd,
2790Sstevel@tonic-gate     int(*fd_update_cb)(void *, int), void *arg, lprocess_t *lpc)
2800Sstevel@tonic-gate {
2810Sstevel@tonic-gate 	int fd;
2820Sstevel@tonic-gate 	int can_try_uncached;
2830Sstevel@tonic-gate 
2840Sstevel@tonic-gate 	ASSERT(!(cached_fd > 0 && fd_update_cb == NULL));
2850Sstevel@tonic-gate 
2860Sstevel@tonic-gate 	do {
2870Sstevel@tonic-gate 		if (cached_fd >= 0) {
2880Sstevel@tonic-gate 			fd = cached_fd;
2890Sstevel@tonic-gate 			can_try_uncached = 1;
2900Sstevel@tonic-gate 			debug_high("%d/psinfo, trying cached fd %d\n",
2910Sstevel@tonic-gate 			    (int)pid, fd);
2920Sstevel@tonic-gate 		} else {
2930Sstevel@tonic-gate 			char pathbuf[PROC_PATH_MAX];
2940Sstevel@tonic-gate 
2950Sstevel@tonic-gate 			can_try_uncached = 0;
2960Sstevel@tonic-gate 			(void) snprintf(pathbuf, sizeof (pathbuf),
2970Sstevel@tonic-gate 			    "/proc/%d/psinfo", (int)pid);
2980Sstevel@tonic-gate 			if ((fd = rfd_open(pathbuf, 1, RFD_PSINFO,
2990Sstevel@tonic-gate 			    revoke_psinfo, lpc, O_RDONLY, 0000)) < 0) {
3000Sstevel@tonic-gate 				debug("cannot open %s", pathbuf);
3010Sstevel@tonic-gate 				break;
3020Sstevel@tonic-gate 			} else
3030Sstevel@tonic-gate 				debug_high("opened %s, fd %d\n", pathbuf, fd);
3040Sstevel@tonic-gate 		}
3050Sstevel@tonic-gate 
3060Sstevel@tonic-gate 		if (pread(fd, psinfo, sizeof (*psinfo), 0) ==
3070Sstevel@tonic-gate 		    sizeof (*psinfo) && psinfo->pr_pid == pid)
3080Sstevel@tonic-gate 			break;
3090Sstevel@tonic-gate 		else {
3100Sstevel@tonic-gate 			debug_high("closed fd %d\n", fd);
3110Sstevel@tonic-gate 			if (rfd_close(fd) != 0)
3120Sstevel@tonic-gate 				debug("could not close fd %d", fd);
3130Sstevel@tonic-gate 			fd = cached_fd = -1;
3140Sstevel@tonic-gate 		}
3150Sstevel@tonic-gate 	} while (can_try_uncached == 1);
3160Sstevel@tonic-gate 
3170Sstevel@tonic-gate 	if (fd_update_cb == NULL || fd_update_cb(arg, fd) != 0)
3180Sstevel@tonic-gate 		if (fd >= 0) {
3190Sstevel@tonic-gate 			debug_high("closed %s fd %d\n", fd_update_cb == NULL ?
3200Sstevel@tonic-gate 			    "uncached" : "cached", fd);
3210Sstevel@tonic-gate 			if (rfd_close(fd) != 0)
3220Sstevel@tonic-gate 				debug("could not close fd %d", fd);
3230Sstevel@tonic-gate 		}
3240Sstevel@tonic-gate 
3250Sstevel@tonic-gate 	debug_high("get_psinfo ret %d, fd %d, %s\n", ((fd >= 0) ? 0 : -1), fd,
3260Sstevel@tonic-gate 	    fd_update_cb != NULL ? "cached" : "uncached");
3270Sstevel@tonic-gate 	return ((fd >= 0) ? 0 : -1);
3280Sstevel@tonic-gate }
3290Sstevel@tonic-gate 
3300Sstevel@tonic-gate /*
3310Sstevel@tonic-gate  * Retrieve the collection membership of all processes in our zone, and update
3320Sstevel@tonic-gate  * the psinfo of those non-system, non-zombie ones in collections.
3330Sstevel@tonic-gate  */
3340Sstevel@tonic-gate static void
3350Sstevel@tonic-gate proc_cb(const pid_t pid)
3360Sstevel@tonic-gate {
3370Sstevel@tonic-gate 	static zoneid_t ours = (zoneid_t)-1;
3380Sstevel@tonic-gate 	psinfo_t psinfo;
3390Sstevel@tonic-gate 
3400Sstevel@tonic-gate 	if (ours == (zoneid_t)-1)
3410Sstevel@tonic-gate 		ours = getzoneid();
3420Sstevel@tonic-gate 
3430Sstevel@tonic-gate 	if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0 &&
3440Sstevel@tonic-gate 	    psinfo.pr_zoneid == ours)
3450Sstevel@tonic-gate 		lprocess_insert_mark(psinfo.pr_pid, rc_getidbypsinfo(&psinfo),
3460Sstevel@tonic-gate 		    psinfo.pr_psargs, psinfo.pr_nlwp == 0);
3470Sstevel@tonic-gate }
3480Sstevel@tonic-gate 
3490Sstevel@tonic-gate /*
3500Sstevel@tonic-gate  * Cache the process' psinfo fd, taking responsibility for freeing it.
3510Sstevel@tonic-gate  */
3520Sstevel@tonic-gate int
3530Sstevel@tonic-gate lprocess_update_psinfo_fd_cb(void *arg, int fd)
3540Sstevel@tonic-gate {
3550Sstevel@tonic-gate 	lprocess_t *lpc = arg;
3560Sstevel@tonic-gate 
3570Sstevel@tonic-gate 	lpc->lpc_psinfo_fd = fd;
3580Sstevel@tonic-gate 	return (0);
3590Sstevel@tonic-gate }
3600Sstevel@tonic-gate 
3610Sstevel@tonic-gate /*
3620Sstevel@tonic-gate  * Update the RSS of processes in monitored collections.
3630Sstevel@tonic-gate  */
3640Sstevel@tonic-gate /*ARGSUSED*/
3650Sstevel@tonic-gate static int
3660Sstevel@tonic-gate mem_sample_cb(lcollection_t *lcol, lprocess_t *lpc)
3670Sstevel@tonic-gate {
3680Sstevel@tonic-gate 	psinfo_t psinfo;
3690Sstevel@tonic-gate 
3700Sstevel@tonic-gate 	if (get_psinfo(lpc->lpc_pid, &psinfo, lpc->lpc_psinfo_fd,
3710Sstevel@tonic-gate 	    lprocess_update_psinfo_fd_cb, lpc, lpc) == 0) {
3720Sstevel@tonic-gate 		lpc->lpc_rss = psinfo.pr_rssize;
3730Sstevel@tonic-gate 		lpc->lpc_size = psinfo.pr_size;
3740Sstevel@tonic-gate 	} else {
3750Sstevel@tonic-gate 		if (errno == ENOENT)
3760Sstevel@tonic-gate 			debug("process %d finished\n", (int)lpc->lpc_pid);
3770Sstevel@tonic-gate 		else
3780Sstevel@tonic-gate 			debug("process %d: cannot read psinfo",
3790Sstevel@tonic-gate 			    (int)lpc->lpc_pid);
3800Sstevel@tonic-gate 		lprocess_free(lpc);
3810Sstevel@tonic-gate 	}
3820Sstevel@tonic-gate 
3830Sstevel@tonic-gate 	return (0);
3840Sstevel@tonic-gate }
3850Sstevel@tonic-gate 
3860Sstevel@tonic-gate /*
3870Sstevel@tonic-gate  * Sample the collection RSS, updating the collection's statistics with the
3880Sstevel@tonic-gate  * results.
3890Sstevel@tonic-gate  */
3900Sstevel@tonic-gate /*ARGSUSED*/
3910Sstevel@tonic-gate static int
3920Sstevel@tonic-gate rss_sample_col_cb(lcollection_t *lcol, void *arg)
3930Sstevel@tonic-gate {
3940Sstevel@tonic-gate 	int64_t excess;
3950Sstevel@tonic-gate 	uint64_t rss;
3960Sstevel@tonic-gate 
3970Sstevel@tonic-gate 	/*
3980Sstevel@tonic-gate 	 * If updating statistics for a new interval, reset the affected
3990Sstevel@tonic-gate 	 * counters.
4000Sstevel@tonic-gate 	 */
4010Sstevel@tonic-gate 	if (lcol->lcol_stat_invalidate != 0) {
4020Sstevel@tonic-gate 		lcol->lcol_stat_old = lcol->lcol_stat;
4030Sstevel@tonic-gate 		lcol->lcol_stat.lcols_min_rss = (int64_t)-1;
4040Sstevel@tonic-gate 		lcol->lcol_stat.lcols_max_rss = 0;
4050Sstevel@tonic-gate 		lcol->lcol_stat_invalidate = 0;
4060Sstevel@tonic-gate 	}
4070Sstevel@tonic-gate 
4080Sstevel@tonic-gate 	lcol->lcol_stat.lcols_rss_sample++;
4090Sstevel@tonic-gate 	excess = lcol->lcol_rss - lcol->lcol_rss_cap;
4100Sstevel@tonic-gate 	rss = lcol->lcol_rss;
4110Sstevel@tonic-gate 	if (excess > 0)
4120Sstevel@tonic-gate 		lcol->lcol_stat.lcols_rss_act_sum += rss;
4130Sstevel@tonic-gate 	lcol->lcol_stat.lcols_rss_sum += rss;
4140Sstevel@tonic-gate 
4150Sstevel@tonic-gate 	if (lcol->lcol_stat.lcols_min_rss > rss)
4160Sstevel@tonic-gate 		lcol->lcol_stat.lcols_min_rss = rss;
4170Sstevel@tonic-gate 	if (lcol->lcol_stat.lcols_max_rss < rss)
4180Sstevel@tonic-gate 		lcol->lcol_stat.lcols_max_rss = rss;
4190Sstevel@tonic-gate 
4200Sstevel@tonic-gate 	return (0);
4210Sstevel@tonic-gate }
4220Sstevel@tonic-gate 
4230Sstevel@tonic-gate /*
4240Sstevel@tonic-gate  * Open /proc and walk entries.
4250Sstevel@tonic-gate  */
4260Sstevel@tonic-gate static void
4270Sstevel@tonic-gate proc_walk_all(void (*cb)(const pid_t))
4280Sstevel@tonic-gate {
4290Sstevel@tonic-gate 	DIR *pdir;
4300Sstevel@tonic-gate 	struct dirent *dirent;
4310Sstevel@tonic-gate 	pid_t pid;
4320Sstevel@tonic-gate 
4330Sstevel@tonic-gate 	(void) rfd_reserve(1);
4340Sstevel@tonic-gate 	if ((pdir = opendir("/proc")) == NULL)
4350Sstevel@tonic-gate 		die(gettext("couldn't open /proc!"));
4360Sstevel@tonic-gate 
4370Sstevel@tonic-gate 	while ((dirent = readdir(pdir)) != NULL) {
4380Sstevel@tonic-gate 		if (strcmp(".", dirent->d_name) == 0 ||
4390Sstevel@tonic-gate 		    strcmp("..", dirent->d_name) == 0)
4400Sstevel@tonic-gate 			continue;
4410Sstevel@tonic-gate 		pid = atoi(dirent->d_name);
4420Sstevel@tonic-gate 		ASSERT(pid != 0 || strcmp(dirent->d_name, "0") == 0);
4430Sstevel@tonic-gate 		if (pid == rcapd_pid)
4440Sstevel@tonic-gate 			continue;
4450Sstevel@tonic-gate 		else
4460Sstevel@tonic-gate 			cb(pid);
4470Sstevel@tonic-gate 	}
4480Sstevel@tonic-gate 	(void) closedir(pdir);
4490Sstevel@tonic-gate }
4500Sstevel@tonic-gate 
4510Sstevel@tonic-gate /*
4520Sstevel@tonic-gate  * Memory update callback.
4530Sstevel@tonic-gate  */
4540Sstevel@tonic-gate static int
4550Sstevel@tonic-gate memory_all_cb(lcollection_t *lcol, lprocess_t *lpc)
4560Sstevel@tonic-gate {
4570Sstevel@tonic-gate 	debug_high("%s %s, pid %d: rss += %llu/%llu\n", rcfg.rcfg_mode_name,
4580Sstevel@tonic-gate 	    lcol->lcol_name, (int)lpc->lpc_pid,
4590Sstevel@tonic-gate 	    (unsigned long long)lpc->lpc_rss,
4600Sstevel@tonic-gate 	    (unsigned long long)lpc->lpc_size);
4610Sstevel@tonic-gate 	ASSERT(lpc->lpc_rss <= lpc->lpc_size);
4620Sstevel@tonic-gate 	lcol->lcol_rss += lpc->lpc_rss;
4630Sstevel@tonic-gate 	lcol->lcol_image_size += lpc->lpc_size;
4640Sstevel@tonic-gate 
4650Sstevel@tonic-gate 	return (0);
4660Sstevel@tonic-gate }
4670Sstevel@tonic-gate 
4680Sstevel@tonic-gate /*
4690Sstevel@tonic-gate  * Clear unmarked callback.
4700Sstevel@tonic-gate  */
4710Sstevel@tonic-gate /*ARGSUSED*/
4720Sstevel@tonic-gate static int
4730Sstevel@tonic-gate sweep_process_cb(lcollection_t *lcol, lprocess_t *lpc)
4740Sstevel@tonic-gate {
4750Sstevel@tonic-gate 	if (lpc->lpc_mark) {
4760Sstevel@tonic-gate 		lpc->lpc_mark = 0;
4770Sstevel@tonic-gate 	} else {
4780Sstevel@tonic-gate 		debug("process %d finished\n", (int)lpc->lpc_pid);
4790Sstevel@tonic-gate 		lprocess_free(lpc);
4800Sstevel@tonic-gate 	}
4810Sstevel@tonic-gate 
4820Sstevel@tonic-gate 	return (0);
4830Sstevel@tonic-gate }
4840Sstevel@tonic-gate 
4850Sstevel@tonic-gate /*
4860Sstevel@tonic-gate  * Memory clear callback.
4870Sstevel@tonic-gate  */
4880Sstevel@tonic-gate /*ARGSUSED*/
4890Sstevel@tonic-gate static int
4900Sstevel@tonic-gate collection_zero_mem_cb(lcollection_t *lcol, void *arg)
4910Sstevel@tonic-gate {
4920Sstevel@tonic-gate 	lcol->lcol_rss = 0;
4930Sstevel@tonic-gate 	lcol->lcol_image_size = 0;
4940Sstevel@tonic-gate 
4950Sstevel@tonic-gate 	return (0);
4960Sstevel@tonic-gate }
4970Sstevel@tonic-gate 
4980Sstevel@tonic-gate /*
4990Sstevel@tonic-gate  * Print, for debugging purposes, a collection's recently-sampled RSS and
5000Sstevel@tonic-gate  * excess.
5010Sstevel@tonic-gate  */
5020Sstevel@tonic-gate /*ARGSUSED*/
5030Sstevel@tonic-gate static int
5040Sstevel@tonic-gate excess_print_cb(lcollection_t *lcol, void *arg)
5050Sstevel@tonic-gate {
5060Sstevel@tonic-gate 	int64_t excess = lcol->lcol_rss - lcol->lcol_rss_cap;
5070Sstevel@tonic-gate 
5080Sstevel@tonic-gate 	debug("%s %s rss/cap: %llu/%llu, excess = %lld kB\n",
5090Sstevel@tonic-gate 	    rcfg.rcfg_mode_name, lcol->lcol_name,
5100Sstevel@tonic-gate 	    (unsigned long long)lcol->lcol_rss,
5110Sstevel@tonic-gate 	    (unsigned long long)lcol->lcol_rss_cap,
5120Sstevel@tonic-gate 	    (long long)excess);
5130Sstevel@tonic-gate 
5140Sstevel@tonic-gate 	return (0);
5150Sstevel@tonic-gate }
5160Sstevel@tonic-gate 
5170Sstevel@tonic-gate /*
5180Sstevel@tonic-gate  * Scan those collections which have exceeded their caps.
5190Sstevel@tonic-gate  */
5200Sstevel@tonic-gate /*ARGSUSED*/
5210Sstevel@tonic-gate static int
5220Sstevel@tonic-gate scan_cb(lcollection_t *lcol, void *arg)
5230Sstevel@tonic-gate {
5240Sstevel@tonic-gate 	int64_t excess;
5250Sstevel@tonic-gate 
5260Sstevel@tonic-gate 	if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
5270Sstevel@tonic-gate 		scan(lcol, excess);
5280Sstevel@tonic-gate 		lcol->lcol_stat.lcols_scan++;
5290Sstevel@tonic-gate 	}
5300Sstevel@tonic-gate 
5310Sstevel@tonic-gate 	return (0);
5320Sstevel@tonic-gate }
5330Sstevel@tonic-gate 
5340Sstevel@tonic-gate /*
5350Sstevel@tonic-gate  * Do a soft scan of those collections which have excesses.  A soft scan is one
5360Sstevel@tonic-gate  * in which the cap enforcement pressure is taken into account.  The difference
5370Sstevel@tonic-gate  * between the utilized physical memory and the cap enforcement pressure will
5380Sstevel@tonic-gate  * be scanned-for, and each collection will be scanned proportionally by their
5390Sstevel@tonic-gate  * present excesses.
5400Sstevel@tonic-gate  */
5410Sstevel@tonic-gate static int
5420Sstevel@tonic-gate soft_scan_cb(lcollection_t *lcol, void *a)
5430Sstevel@tonic-gate {
5440Sstevel@tonic-gate 	int64_t excess;
5450Sstevel@tonic-gate 	soft_scan_arg_t *arg = a;
5460Sstevel@tonic-gate 
5470Sstevel@tonic-gate 	if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
5480Sstevel@tonic-gate 		debug("col %lld excess %lld scan_goal %lld sum_excess %llu, "
5490Sstevel@tonic-gate 		    "scanning %lld\n", (long long)lcol->lcol_id,
5500Sstevel@tonic-gate 		    (long long)excess, (long long)arg->ssa_scan_goal,
5510Sstevel@tonic-gate 		    (unsigned long long)arg->ssa_sum_excess,
5520Sstevel@tonic-gate 		    (long long)(excess * arg->ssa_scan_goal /
5530Sstevel@tonic-gate 		    arg->ssa_sum_excess));
5540Sstevel@tonic-gate 
5550Sstevel@tonic-gate 		scan(lcol, (int64_t)(excess * arg->ssa_scan_goal /
5560Sstevel@tonic-gate 		    arg->ssa_sum_excess));
5570Sstevel@tonic-gate 		lcol->lcol_stat.lcols_scan++;
5580Sstevel@tonic-gate 	}
5590Sstevel@tonic-gate 
5600Sstevel@tonic-gate 	return (0);
5610Sstevel@tonic-gate }
5620Sstevel@tonic-gate 
5630Sstevel@tonic-gate /*
5640Sstevel@tonic-gate  * When a scan could happen, but caps aren't enforced tick the
5650Sstevel@tonic-gate  * lcols_unenforced_cap counter.
5660Sstevel@tonic-gate  */
5670Sstevel@tonic-gate /*ARGSUSED*/
5680Sstevel@tonic-gate static int
5690Sstevel@tonic-gate unenforced_cap_cb(lcollection_t *lcol, void *arg)
5700Sstevel@tonic-gate {
5710Sstevel@tonic-gate 	lcol->lcol_stat.lcols_unenforced_cap++;
5720Sstevel@tonic-gate 
5730Sstevel@tonic-gate 	return (0);
5740Sstevel@tonic-gate }
5750Sstevel@tonic-gate 
5760Sstevel@tonic-gate /*
5770Sstevel@tonic-gate  * Update the count of physically installed memory.
5780Sstevel@tonic-gate  */
5790Sstevel@tonic-gate static void
5800Sstevel@tonic-gate update_phys_total(void)
5810Sstevel@tonic-gate {
5820Sstevel@tonic-gate 	uint64_t old_phys_total;
5830Sstevel@tonic-gate 
5840Sstevel@tonic-gate 	old_phys_total = phys_total;
5850Sstevel@tonic-gate 	phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE)
5860Sstevel@tonic-gate 	    / 1024;
5870Sstevel@tonic-gate 	if (phys_total != old_phys_total)
5880Sstevel@tonic-gate 		debug("physical memory%s: %lluM\n", (old_phys_total == 0 ?
5890Sstevel@tonic-gate 		    "" : " adjusted"), (unsigned long long)(phys_total / 1024));
5900Sstevel@tonic-gate }
5910Sstevel@tonic-gate 
5920Sstevel@tonic-gate /*
5930Sstevel@tonic-gate  * Unlink a process from its collection, updating relevant statistics, and
5940Sstevel@tonic-gate  * freeing its associated memory.
5950Sstevel@tonic-gate  */
5960Sstevel@tonic-gate void
5970Sstevel@tonic-gate lprocess_free(lprocess_t *lpc)
5980Sstevel@tonic-gate {
5990Sstevel@tonic-gate 	pid_t pid;
6000Sstevel@tonic-gate 
6010Sstevel@tonic-gate 	lpc->lpc_collection->lcol_stat.lcols_proc_out++;
6020Sstevel@tonic-gate 
6030Sstevel@tonic-gate 	if (lpc->lpc_prev != NULL)
6040Sstevel@tonic-gate 		lpc->lpc_prev->lpc_next = lpc->lpc_next;
6050Sstevel@tonic-gate 	if (lpc->lpc_next != NULL)
6060Sstevel@tonic-gate 		lpc->lpc_next->lpc_prev = lpc->lpc_prev;
6070Sstevel@tonic-gate 	if (lpc->lpc_collection->lcol_lprocess == lpc)
6080Sstevel@tonic-gate 		lpc->lpc_collection->lcol_lprocess = (lpc->lpc_next !=
6090Sstevel@tonic-gate 		    lpc ? lpc->lpc_next : NULL);
6100Sstevel@tonic-gate 	lpc->lpc_next = lpc->lpc_prev = NULL;
6110Sstevel@tonic-gate 
6120Sstevel@tonic-gate 	if (lpc->lpc_prpageheader != NULL)
6130Sstevel@tonic-gate 		free(lpc->lpc_prpageheader);
6140Sstevel@tonic-gate 	if (lpc->lpc_xmap != NULL)
6150Sstevel@tonic-gate 		free(lpc->lpc_xmap);
6160Sstevel@tonic-gate 	if (lpc->lpc_psinfo_fd >= 0) {
6170Sstevel@tonic-gate 		if (rfd_close(lpc->lpc_psinfo_fd) != 0)
6180Sstevel@tonic-gate 			debug("could not close %d lpc_psinfo_fd %d",
6190Sstevel@tonic-gate 			    (int)lpc->lpc_pid, lpc->lpc_psinfo_fd);
6200Sstevel@tonic-gate 		lpc->lpc_psinfo_fd = -1;
6210Sstevel@tonic-gate 	}
6220Sstevel@tonic-gate 	if (lpc->lpc_pgdata_fd >= 0) {
6230Sstevel@tonic-gate 		if (rfd_close(lpc->lpc_pgdata_fd) != 0)
6240Sstevel@tonic-gate 			debug("could not close %d lpc_pgdata_fd %d",
6250Sstevel@tonic-gate 			    (int)lpc->lpc_pid, lpc->lpc_pgdata_fd);
6260Sstevel@tonic-gate 		lpc->lpc_pgdata_fd = -1;
6270Sstevel@tonic-gate 	}
6280Sstevel@tonic-gate 	if (lpc->lpc_xmap_fd >= 0) {
6290Sstevel@tonic-gate 		if (rfd_close(lpc->lpc_xmap_fd) != 0)
6300Sstevel@tonic-gate 			debug("could not close %d lpc_xmap_fd %d",
6310Sstevel@tonic-gate 			    (int)lpc->lpc_pid, lpc->lpc_xmap_fd);
6320Sstevel@tonic-gate 		lpc->lpc_xmap_fd = -1;
6330Sstevel@tonic-gate 	}
6340Sstevel@tonic-gate 	if (lpc->lpc_ignore != NULL)
6350Sstevel@tonic-gate 		lmapping_free(&lpc->lpc_ignore);
6360Sstevel@tonic-gate 	pid = lpc->lpc_pid;
6370Sstevel@tonic-gate 	free(lpc);
6380Sstevel@tonic-gate 	debug_high("process %d freed\n", (int)pid);
6390Sstevel@tonic-gate }
6400Sstevel@tonic-gate 
6410Sstevel@tonic-gate /*
6420Sstevel@tonic-gate  * Collection clear callback.
6430Sstevel@tonic-gate  */
6440Sstevel@tonic-gate /*ARGSUSED*/
6450Sstevel@tonic-gate static int
6460Sstevel@tonic-gate collection_clear_cb(lcollection_t *lcol, void *arg)
6470Sstevel@tonic-gate {
6480Sstevel@tonic-gate 	lcol->lcol_mark = 0;
6490Sstevel@tonic-gate 
6500Sstevel@tonic-gate 	return (0);
6510Sstevel@tonic-gate }
6520Sstevel@tonic-gate 
6530Sstevel@tonic-gate /*
6540Sstevel@tonic-gate  * Respond to a terminating signal by setting a termination flag.
6550Sstevel@tonic-gate  */
6560Sstevel@tonic-gate /*ARGSUSED*/
6570Sstevel@tonic-gate static void
6580Sstevel@tonic-gate terminate_signal(int signal)
6590Sstevel@tonic-gate {
6600Sstevel@tonic-gate 	if (termination_signal == 0)
6610Sstevel@tonic-gate 		termination_signal = signal;
6620Sstevel@tonic-gate 	should_run = 0;
6630Sstevel@tonic-gate }
6640Sstevel@tonic-gate 
/*
 * Handler for synchronous or asynchronous signals which would ordinarily
 * cause the process to abort.
 */
/*ARGSUSED*/
static void
abort_signal(int signal)
{
	/*
	 * Before aborting, give the scanner one final opportunity to resume
	 * any processes it has stopped.
	 */
	scan_abort();

	abort();
}
6800Sstevel@tonic-gate 
6810Sstevel@tonic-gate /*
6820Sstevel@tonic-gate  * Clean up collections which have been removed due to configuration.  Unlink
6830Sstevel@tonic-gate  * the collection from lcollection and free it.
6840Sstevel@tonic-gate  */
6850Sstevel@tonic-gate /*ARGSUSED*/
6860Sstevel@tonic-gate static int
6870Sstevel@tonic-gate collection_sweep_cb(lcollection_t *lcol, void *arg)
6880Sstevel@tonic-gate {
6890Sstevel@tonic-gate 	if (lcol->lcol_mark == 0) {
6900Sstevel@tonic-gate 		debug("freeing %s %s\n", rcfg.rcfg_mode_name, lcol->lcol_name);
6910Sstevel@tonic-gate 		lcollection_free(lcol);
6920Sstevel@tonic-gate 	}
6930Sstevel@tonic-gate 
6940Sstevel@tonic-gate 	return (0);
6950Sstevel@tonic-gate }
6960Sstevel@tonic-gate 
6970Sstevel@tonic-gate /*
6980Sstevel@tonic-gate  * Set those variables which depend on the global configuration.
6990Sstevel@tonic-gate  */
7000Sstevel@tonic-gate static void
7010Sstevel@tonic-gate finish_configuration(void)
7020Sstevel@tonic-gate {
7030Sstevel@tonic-gate 	/*
7040Sstevel@tonic-gate 	 * Warn that any lnode (or non-project) mode specification (by an SRM
7050Sstevel@tonic-gate 	 * 1.3 configuration file, for example) is ignored.
7060Sstevel@tonic-gate 	 */
7070Sstevel@tonic-gate 	if (strcmp(rcfg.rcfg_mode_name, "project") != 0) {
7080Sstevel@tonic-gate 		warn(gettext("%s mode specification ignored -- using project"
7090Sstevel@tonic-gate 		    " mode\n"), rcfg.rcfg_mode_name);
7100Sstevel@tonic-gate 		rcfg.rcfg_mode_name = "project";
7110Sstevel@tonic-gate 		rcfg.rcfg_mode = rctype_project;
7120Sstevel@tonic-gate 	}
7130Sstevel@tonic-gate 
7140Sstevel@tonic-gate 	lcollection_set_type(rcfg.rcfg_mode);
7150Sstevel@tonic-gate }
7160Sstevel@tonic-gate 
7170Sstevel@tonic-gate /*
7180Sstevel@tonic-gate  * Cause the configuration file to be reread and applied.
7190Sstevel@tonic-gate  */
7200Sstevel@tonic-gate static void
7210Sstevel@tonic-gate reread_configuration_file(void)
7220Sstevel@tonic-gate {
7230Sstevel@tonic-gate 	rcfg_t rcfg_new;
7240Sstevel@tonic-gate 	struct stat st;
7250Sstevel@tonic-gate 
7260Sstevel@tonic-gate 	if (stat(rcfg.rcfg_filename, &st) == 0 && st.st_mtime ==
7270Sstevel@tonic-gate 	    rcfg.rcfg_last_modification)
7280Sstevel@tonic-gate 		return;
7290Sstevel@tonic-gate 
7300Sstevel@tonic-gate 	if (rcfg_read(rcfg.rcfg_filename, rcfg.rcfg_fd, &rcfg_new,
7310Sstevel@tonic-gate 	    update_statistics) != 0)
7320Sstevel@tonic-gate 		warn(gettext("can't reread configuration"));
7330Sstevel@tonic-gate 	else {
7340Sstevel@tonic-gate 		/*
7350Sstevel@tonic-gate 		 * The configuration file has been read.  Remove existing
7360Sstevel@tonic-gate 		 * collections in case there is a change in collection type.
7370Sstevel@tonic-gate 		 */
7380Sstevel@tonic-gate 		if (rcfg.rcfg_mode != rcfg_new.rcfg_mode) {
7390Sstevel@tonic-gate 			list_walk_collection(collection_clear_cb, NULL);
7400Sstevel@tonic-gate 			list_walk_collection(collection_sweep_cb, NULL);
7410Sstevel@tonic-gate 		}
7420Sstevel@tonic-gate 
7430Sstevel@tonic-gate 		/*
7440Sstevel@tonic-gate 		 * Make the newly-read configuration the global one, and update
7450Sstevel@tonic-gate 		 * any variables that depend on it.
7460Sstevel@tonic-gate 		 */
7470Sstevel@tonic-gate 		rcfg = rcfg_new;
7480Sstevel@tonic-gate 		finish_configuration();
7490Sstevel@tonic-gate 	}
7500Sstevel@tonic-gate }
7510Sstevel@tonic-gate 
7520Sstevel@tonic-gate /*
7530Sstevel@tonic-gate  * Reread the configuration filex, then examine changes, additions, and
7540Sstevel@tonic-gate  * deletions to cap definitions.
7550Sstevel@tonic-gate  */
7560Sstevel@tonic-gate static void
7570Sstevel@tonic-gate reconfigure(void)
7580Sstevel@tonic-gate {
7590Sstevel@tonic-gate 	debug("reconfigure...\n");
7600Sstevel@tonic-gate 
7610Sstevel@tonic-gate 	/*
7620Sstevel@tonic-gate 	 * Reread the configuration data.
7630Sstevel@tonic-gate 	 */
7640Sstevel@tonic-gate 	reread_configuration_file();
7650Sstevel@tonic-gate 
7660Sstevel@tonic-gate 	/*
7670Sstevel@tonic-gate 	 * Walk the lcollection, marking active collections so inactive ones
7680Sstevel@tonic-gate 	 * can be freed.
7690Sstevel@tonic-gate 	 */
7700Sstevel@tonic-gate 	list_walk_collection(collection_clear_cb, NULL);
7710Sstevel@tonic-gate 	lcollection_update(LCU_ACTIVE_ONLY); /* mark */
7720Sstevel@tonic-gate 	list_walk_collection(collection_sweep_cb, NULL);
7730Sstevel@tonic-gate }
7740Sstevel@tonic-gate 
7750Sstevel@tonic-gate /*
7760Sstevel@tonic-gate  * Respond to SIGHUP by triggering the rereading the configuration file and cap
7770Sstevel@tonic-gate  * definitions.
7780Sstevel@tonic-gate  */
7790Sstevel@tonic-gate /*ARGSUSED*/
7800Sstevel@tonic-gate static void
7810Sstevel@tonic-gate sighup(int signal)
7820Sstevel@tonic-gate {
7830Sstevel@tonic-gate 	should_reconfigure = 1;
7840Sstevel@tonic-gate }
7850Sstevel@tonic-gate 
7860Sstevel@tonic-gate /*
7870Sstevel@tonic-gate  * Print, for debugging purposes, each collection's interval statistics.
7880Sstevel@tonic-gate  */
7890Sstevel@tonic-gate /*ARGSUSED*/
7900Sstevel@tonic-gate static int
7910Sstevel@tonic-gate simple_report_collection_cb(lcollection_t *lcol, void *arg)
7920Sstevel@tonic-gate {
7930Sstevel@tonic-gate #define	DELTA(field) \
7940Sstevel@tonic-gate 	(unsigned long long)(lcol->lcol_stat_invalidate ? 0 : \
7950Sstevel@tonic-gate 	    (lcol->lcol_stat.field - lcol->lcol_stat_old.field))
7960Sstevel@tonic-gate #define	VALID(field) \
7970Sstevel@tonic-gate 	(unsigned long long)(lcol->lcol_stat_invalidate ? 0 : \
7980Sstevel@tonic-gate 	    lcol->lcol_stat.field)
7990Sstevel@tonic-gate 
8000Sstevel@tonic-gate 	debug("%s %s status: succeeded/attempted (k): %llu/%llu, "
8010Sstevel@tonic-gate 	    "ineffective/scans/unenforced/samplings:  %llu/%llu/%llu/%llu, RSS "
8020Sstevel@tonic-gate 	    "min/max (k): %llu/%llu, cap %llu kB, processes/thpt: %llu/%llu, "
8030Sstevel@tonic-gate 	    "%llu scans over %llu ms\n", rcfg.rcfg_mode_name, lcol->lcol_name,
8040Sstevel@tonic-gate 	    DELTA(lcols_pg_eff), DELTA(lcols_pg_att),
8050Sstevel@tonic-gate 	    DELTA(lcols_scan_ineffective), DELTA(lcols_scan),
8060Sstevel@tonic-gate 	    DELTA(lcols_unenforced_cap), DELTA(lcols_rss_sample),
8070Sstevel@tonic-gate 	    VALID(lcols_min_rss), VALID(lcols_max_rss),
8080Sstevel@tonic-gate 	    (unsigned long long)lcol->lcol_rss_cap,
8090Sstevel@tonic-gate 	    (unsigned long long)(lcol->lcol_stat.lcols_proc_in -
8100Sstevel@tonic-gate 	    lcol->lcol_stat.lcols_proc_out), DELTA(lcols_proc_out),
8110Sstevel@tonic-gate 	    DELTA(lcols_scan_count), DELTA(lcols_scan_time_complete) / (NANOSEC
8120Sstevel@tonic-gate 	    / MILLISEC));
8130Sstevel@tonic-gate 
8140Sstevel@tonic-gate #undef DELTA
8150Sstevel@tonic-gate #undef VALID
8160Sstevel@tonic-gate 
8170Sstevel@tonic-gate 	return (0);
8180Sstevel@tonic-gate }
8190Sstevel@tonic-gate 
8200Sstevel@tonic-gate /*
8210Sstevel@tonic-gate  * Record each collection's interval statistics in the statistics file.
8220Sstevel@tonic-gate  */
8230Sstevel@tonic-gate static int
8240Sstevel@tonic-gate report_collection_cb(lcollection_t *lcol, void *arg)
8250Sstevel@tonic-gate {
8260Sstevel@tonic-gate 	lcollection_report_t dc;
8270Sstevel@tonic-gate 	int fd = (intptr_t)arg;
8280Sstevel@tonic-gate 
8290Sstevel@tonic-gate 	/*
8300Sstevel@tonic-gate 	 * Copy the relevant fields to the collection's record.
8310Sstevel@tonic-gate 	 */
8320Sstevel@tonic-gate 	bzero(&dc, sizeof (dc));
8330Sstevel@tonic-gate 	dc.lcol_id = lcol->lcol_id;
8340Sstevel@tonic-gate 	(void) strcpy(dc.lcol_name, lcol->lcol_name);
8350Sstevel@tonic-gate 	dc.lcol_rss = lcol->lcol_rss;
8360Sstevel@tonic-gate 	dc.lcol_image_size = lcol->lcol_image_size;
8370Sstevel@tonic-gate 	dc.lcol_rss_cap = lcol->lcol_rss_cap;
8380Sstevel@tonic-gate 	dc.lcol_stat = lcol->lcol_stat;
8390Sstevel@tonic-gate 
8400Sstevel@tonic-gate 	if (write(fd, &dc, sizeof (dc)) == sizeof (dc)) {
8410Sstevel@tonic-gate 		/*
8420Sstevel@tonic-gate 		 * Set a flag to indicate that the exported interval snapshot
8430Sstevel@tonic-gate 		 * values should be reset at the next sample.
8440Sstevel@tonic-gate 		 */
8450Sstevel@tonic-gate 		lcol->lcol_stat_invalidate = 1;
8460Sstevel@tonic-gate 	} else {
8470Sstevel@tonic-gate 		debug("can't write %s %s statistics", rcfg.rcfg_mode_name,
8480Sstevel@tonic-gate 		    lcol->lcol_name);
8490Sstevel@tonic-gate 	}
8500Sstevel@tonic-gate 
8510Sstevel@tonic-gate 	return (0);
8520Sstevel@tonic-gate }
8530Sstevel@tonic-gate 
8540Sstevel@tonic-gate /*
8550Sstevel@tonic-gate  * Determine the count of pages scanned by the global page scanner, obtained
8560Sstevel@tonic-gate  * from the cpu_stat:*::scan kstats.  Return zero on success.
8570Sstevel@tonic-gate  */
8580Sstevel@tonic-gate static int
8590Sstevel@tonic-gate get_globally_scanned_pages(uint64_t *scannedp)
8600Sstevel@tonic-gate {
8610Sstevel@tonic-gate 	kstat_t *ksp;
8620Sstevel@tonic-gate 	uint64_t scanned = 0;
8630Sstevel@tonic-gate 
8640Sstevel@tonic-gate 	if (kstat_chain_update(kctl) == -1) {
8650Sstevel@tonic-gate 		warn(gettext("can't update kstat chain"));
8660Sstevel@tonic-gate 		return (0);
8670Sstevel@tonic-gate 	}
8680Sstevel@tonic-gate 
8690Sstevel@tonic-gate 	for (ksp = kctl->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
8700Sstevel@tonic-gate 		if (strcmp(ksp->ks_module, "cpu_stat") == 0) {
8710Sstevel@tonic-gate 			if (kstat_read(kctl, ksp, NULL) != -1) {
8720Sstevel@tonic-gate 				scanned += ((cpu_stat_t *)
8730Sstevel@tonic-gate 				    ksp->ks_data)->cpu_vminfo.scan;
8740Sstevel@tonic-gate 			} else
8750Sstevel@tonic-gate 				return (-1);
8760Sstevel@tonic-gate 		}
8770Sstevel@tonic-gate 	}
8780Sstevel@tonic-gate 
8790Sstevel@tonic-gate 	*scannedp = scanned;
8800Sstevel@tonic-gate 	return (0);
8810Sstevel@tonic-gate }
8820Sstevel@tonic-gate 
8830Sstevel@tonic-gate /*
8840Sstevel@tonic-gate  * Update the shared statistics file with each collection's current statistics.
8850Sstevel@tonic-gate  * Return zero on success.
8860Sstevel@tonic-gate  */
8870Sstevel@tonic-gate static int
8880Sstevel@tonic-gate update_statistics(void)
8890Sstevel@tonic-gate {
8900Sstevel@tonic-gate 	int fd, res;
8910Sstevel@tonic-gate 	static char template[LINELEN];
8920Sstevel@tonic-gate 
8930Sstevel@tonic-gate 	/*
894*442Sgm149974 	 * Try to create a directory irrespective of whether it is existing
895*442Sgm149974 	 * or not. If it is not there then it will create. Otherwise any way
896*442Sgm149974 	 * it will fail at mkstemp call below.
897*442Sgm149974 	 */
898*442Sgm149974 	(void) mkdir(STAT_FILE_DIR, 0755);
899*442Sgm149974 
900*442Sgm149974 	/*
9010Sstevel@tonic-gate 	 * Create a temporary file.
9020Sstevel@tonic-gate 	 */
9030Sstevel@tonic-gate 	if (sizeof (template) < (strlen(rcfg.rcfg_stat_file) +
9040Sstevel@tonic-gate 	    strlen(STAT_TEMPLATE_SUFFIX) + 1)) {
9050Sstevel@tonic-gate 		debug("temporary file template size too small\n");
9060Sstevel@tonic-gate 		return (-1);
9070Sstevel@tonic-gate 	}
9080Sstevel@tonic-gate 	(void) strcpy(template, rcfg.rcfg_stat_file);
9090Sstevel@tonic-gate 	(void) strcat(template, STAT_TEMPLATE_SUFFIX);
9100Sstevel@tonic-gate 	(void) rfd_reserve(1);
9110Sstevel@tonic-gate 	fd = mkstemp(template);
9120Sstevel@tonic-gate 
9130Sstevel@tonic-gate 	/*
9140Sstevel@tonic-gate 	 * Write the header and per-collection statistics.
9150Sstevel@tonic-gate 	 */
9160Sstevel@tonic-gate 	if (fd >= 0) {
9170Sstevel@tonic-gate 		rcapd_stat_hdr_t rs;
9180Sstevel@tonic-gate 
9190Sstevel@tonic-gate 		rs.rs_pid = rcapd_pid;
9200Sstevel@tonic-gate 		rs.rs_time = gethrtime();
9210Sstevel@tonic-gate 		ASSERT(sizeof (rs.rs_mode) > strlen(rcfg.rcfg_mode_name));
9220Sstevel@tonic-gate 		(void) strcpy(rs.rs_mode, rcfg.rcfg_mode_name);
9230Sstevel@tonic-gate 		rs.rs_pressure_cur = memory_pressure;
9240Sstevel@tonic-gate 		rs.rs_pressure_cap = rcfg.rcfg_memory_cap_enforcement_pressure;
9250Sstevel@tonic-gate 		rs.rs_pressure_sample = memory_pressure_sample;
9260Sstevel@tonic-gate 
9270Sstevel@tonic-gate 		if (fchmod(fd, 0644) == 0 && write(fd, &rs, sizeof (rs)) ==
9280Sstevel@tonic-gate 		    sizeof (rs)) {
9290Sstevel@tonic-gate 			list_walk_collection(report_collection_cb,
9300Sstevel@tonic-gate 				(void *)(intptr_t)fd);
9310Sstevel@tonic-gate 			/*
9320Sstevel@tonic-gate 			 * Replace the existing statistics file with this new
9330Sstevel@tonic-gate 			 * one.
9340Sstevel@tonic-gate 			 */
9350Sstevel@tonic-gate 			res = rename(template, rcfg.rcfg_stat_file);
9360Sstevel@tonic-gate 		} else
9370Sstevel@tonic-gate 			res = -1;
9380Sstevel@tonic-gate 		(void) close(fd);
9390Sstevel@tonic-gate 	} else
9400Sstevel@tonic-gate 		res = -1;
9410Sstevel@tonic-gate 
9420Sstevel@tonic-gate 	return (res);
9430Sstevel@tonic-gate }
9440Sstevel@tonic-gate 
9450Sstevel@tonic-gate /*
9460Sstevel@tonic-gate  * Verify the statistics file can be created and written to, and die if an
9470Sstevel@tonic-gate  * existing file may be in use by another rcapd.
9480Sstevel@tonic-gate  */
9490Sstevel@tonic-gate static int
9500Sstevel@tonic-gate verify_statistics(void)
9510Sstevel@tonic-gate {
9520Sstevel@tonic-gate 	pid_t pid;
9530Sstevel@tonic-gate 
9540Sstevel@tonic-gate 	/*
9550Sstevel@tonic-gate 	 * Warn if another instance of rcapd might be active.
9560Sstevel@tonic-gate 	 */
9570Sstevel@tonic-gate 	(void) rfd_reserve(1);
9580Sstevel@tonic-gate 	pid = stat_get_rcapd_pid(rcfg.rcfg_stat_file);
9590Sstevel@tonic-gate 	if (pid != rcapd_pid && pid != -1)
9600Sstevel@tonic-gate 		die(gettext("%s exists; rcapd may already be active\n"),
9610Sstevel@tonic-gate 		    rcfg.rcfg_stat_file);
9620Sstevel@tonic-gate 
9630Sstevel@tonic-gate 	return (update_statistics());
9640Sstevel@tonic-gate }
9650Sstevel@tonic-gate 
9660Sstevel@tonic-gate static int
9670Sstevel@tonic-gate sum_excess_cb(lcollection_t *lcol, void *arg)
9680Sstevel@tonic-gate {
9690Sstevel@tonic-gate 	uint64_t *sum_excess = arg;
9700Sstevel@tonic-gate 
9710Sstevel@tonic-gate 	*sum_excess += MAX((int64_t)0, (int64_t)(lcol->lcol_rss -
9720Sstevel@tonic-gate 	    lcol->lcol_rss_cap));
9730Sstevel@tonic-gate 	return (0);
9740Sstevel@tonic-gate }
9750Sstevel@tonic-gate 
/*
 * Emit the command-line usage message.
 */
static void
rcapd_usage(void)
{
	info(gettext("usage: rcapd [-d]\n"));
}
9810Sstevel@tonic-gate 
9820Sstevel@tonic-gate void
9830Sstevel@tonic-gate check_update_statistics(void)
9840Sstevel@tonic-gate {
9850Sstevel@tonic-gate 	hrtime_t now = gethrtime();
9860Sstevel@tonic-gate 
9870Sstevel@tonic-gate 	if (EVENT_TIME(now, next_report)) {
9880Sstevel@tonic-gate 		debug("updating statistics...\n");
9890Sstevel@tonic-gate 		list_walk_collection(simple_report_collection_cb, NULL);
9900Sstevel@tonic-gate 		if (update_statistics() != 0)
9910Sstevel@tonic-gate 			debug("couldn't update statistics");
9920Sstevel@tonic-gate 		next_report = NEXT_REPORT_EVENT_TIME(now,
9930Sstevel@tonic-gate 		    rcfg.rcfg_report_interval);
9940Sstevel@tonic-gate 	}
9950Sstevel@tonic-gate }
9960Sstevel@tonic-gate 
9970Sstevel@tonic-gate static void
9980Sstevel@tonic-gate verify_and_set_privileges(void)
9990Sstevel@tonic-gate {
10000Sstevel@tonic-gate 	priv_set_t *required =
10010Sstevel@tonic-gate 	    priv_str_to_set("zone,sys_resource,proc_owner", ",", NULL);
10020Sstevel@tonic-gate 
10030Sstevel@tonic-gate 	/*
10040Sstevel@tonic-gate 	 * Ensure the required privileges, suitable for controlling processes,
10050Sstevel@tonic-gate 	 * are possessed.
10060Sstevel@tonic-gate 	 */
10070Sstevel@tonic-gate 	if (setppriv(PRIV_SET, PRIV_PERMITTED, required) != 0 || setppriv(
10080Sstevel@tonic-gate 	    PRIV_SET, PRIV_EFFECTIVE, required) != 0)
10090Sstevel@tonic-gate 		die(gettext("can't set requisite privileges"));
10100Sstevel@tonic-gate 
10110Sstevel@tonic-gate 	/*
10120Sstevel@tonic-gate 	 * Ensure access to /var/run/daemon.
10130Sstevel@tonic-gate 	 */
10140Sstevel@tonic-gate 	if (setreuid(DAEMON_UID, DAEMON_UID) != 0)
10150Sstevel@tonic-gate 		die(gettext("cannot become user daemon"));
10160Sstevel@tonic-gate 
10170Sstevel@tonic-gate 	priv_freeset(required);
10180Sstevel@tonic-gate }
10190Sstevel@tonic-gate 
10200Sstevel@tonic-gate int
10210Sstevel@tonic-gate main(int argc, char *argv[])
10220Sstevel@tonic-gate {
10230Sstevel@tonic-gate 	int res;
10240Sstevel@tonic-gate 	int should_fork = 1;	/* fork flag */
10250Sstevel@tonic-gate 	hrtime_t now;		/* current time */
10260Sstevel@tonic-gate 	hrtime_t next;		/* time of next event */
10270Sstevel@tonic-gate 	int sig;		/* signal iteration */
10280Sstevel@tonic-gate 	struct rlimit rl;
10290Sstevel@tonic-gate 	hrtime_t next_proc_walk;	/* time of next /proc scan */
10300Sstevel@tonic-gate 	hrtime_t next_configuration;	/* time of next configuration */
10310Sstevel@tonic-gate 	hrtime_t next_rss_sample;	/* (latest) time of next RSS sample */
10320Sstevel@tonic-gate 	int old_enforce_caps;		/* track changes in enforcement */
10330Sstevel@tonic-gate 					/* conditions */
10340Sstevel@tonic-gate 	soft_scan_arg_t arg;
10350Sstevel@tonic-gate 
10360Sstevel@tonic-gate 	(void) set_message_priority(RCM_INFO);
10370Sstevel@tonic-gate 	(void) setprogname("rcapd");
10380Sstevel@tonic-gate 	rcapd_pid = getpid();
10390Sstevel@tonic-gate 	(void) chdir("/");
10400Sstevel@tonic-gate 	should_run = 1;
10410Sstevel@tonic-gate 	ever_ran = 0;
10420Sstevel@tonic-gate 
10430Sstevel@tonic-gate 	(void) setlocale(LC_ALL, "");
10440Sstevel@tonic-gate 	(void) textdomain(TEXT_DOMAIN);
10450Sstevel@tonic-gate 
10460Sstevel@tonic-gate 	/*
10470Sstevel@tonic-gate 	 * Parse command-line options.
10480Sstevel@tonic-gate 	 */
10490Sstevel@tonic-gate 	while ((res = getopt(argc, argv, "dF")) > 0)
10500Sstevel@tonic-gate 		switch (res) {
10510Sstevel@tonic-gate 		case 'd':
10520Sstevel@tonic-gate 			should_fork = 0;
10530Sstevel@tonic-gate 			if (debug_mode == 0) {
10540Sstevel@tonic-gate 				debug_mode = 1;
10550Sstevel@tonic-gate 				(void) set_message_priority(RCM_DEBUG);
10560Sstevel@tonic-gate 			} else
10570Sstevel@tonic-gate 				(void) set_message_priority(RCM_DEBUG_HIGH);
10580Sstevel@tonic-gate 			break;
10590Sstevel@tonic-gate 		case 'F':
10600Sstevel@tonic-gate 			should_fork = 0;
10610Sstevel@tonic-gate 			break;
10620Sstevel@tonic-gate 		default:
10630Sstevel@tonic-gate 			rcapd_usage();
10640Sstevel@tonic-gate 			return (E_USAGE);
10650Sstevel@tonic-gate 			/*NOTREACHED*/
10660Sstevel@tonic-gate 		}
10670Sstevel@tonic-gate 
10680Sstevel@tonic-gate 	/*
10690Sstevel@tonic-gate 	 * If not debugging, fork and continue operating, changing the
10700Sstevel@tonic-gate 	 * destination of messages to syslog().
10710Sstevel@tonic-gate 	 */
10720Sstevel@tonic-gate 	if (should_fork == 1) {
10730Sstevel@tonic-gate 		pid_t child;
10740Sstevel@tonic-gate 		debug("forking\n");
10750Sstevel@tonic-gate 		child = fork();
10760Sstevel@tonic-gate 		if (child == -1)
10770Sstevel@tonic-gate 			die(gettext("cannot fork"));
10780Sstevel@tonic-gate 		if (child > 0)
10790Sstevel@tonic-gate 			return (0);
10800Sstevel@tonic-gate 		else {
10810Sstevel@tonic-gate 			rcapd_pid = getpid();
10820Sstevel@tonic-gate 			(void) set_message_destination(RCD_SYSLOG);
10830Sstevel@tonic-gate 			(void) fclose(stdin);
10840Sstevel@tonic-gate 			(void) fclose(stdout);
10850Sstevel@tonic-gate 			(void) fclose(stderr);
10860Sstevel@tonic-gate 		}
10870Sstevel@tonic-gate 		/*
10880Sstevel@tonic-gate 		 * Start a new session and detatch from the controlling tty.
10890Sstevel@tonic-gate 		 */
10900Sstevel@tonic-gate 		if (setsid() == (pid_t)-1)
10910Sstevel@tonic-gate 			debug(gettext("setsid() failed; cannot detach from "
10920Sstevel@tonic-gate 			    "terminal"));
10930Sstevel@tonic-gate 	}
10940Sstevel@tonic-gate 
10950Sstevel@tonic-gate 	/*
10960Sstevel@tonic-gate 	 * Read the configuration file.
10970Sstevel@tonic-gate 	 */
10980Sstevel@tonic-gate 	if (rcfg_read(RCAPD_DEFAULT_CONF_FILE, -1, &rcfg, verify_statistics)
10990Sstevel@tonic-gate 	    != 0)
11000Sstevel@tonic-gate 		die(gettext("invalid configuration: %s"),
11010Sstevel@tonic-gate 		    RCAPD_DEFAULT_CONF_FILE);
11020Sstevel@tonic-gate 	finish_configuration();
11030Sstevel@tonic-gate 	should_reconfigure = 0;
11040Sstevel@tonic-gate 
11050Sstevel@tonic-gate 	/*
11060Sstevel@tonic-gate 	 * Check that required privileges are possessed.
11070Sstevel@tonic-gate 	 */
11080Sstevel@tonic-gate 	verify_and_set_privileges();
11090Sstevel@tonic-gate 
11100Sstevel@tonic-gate 	now = next_report = next_proc_walk = next_rss_sample = gethrtime();
11110Sstevel@tonic-gate 	next_configuration = NEXT_EVENT_TIME(gethrtime(),
11120Sstevel@tonic-gate 	    rcfg.rcfg_reconfiguration_interval);
11130Sstevel@tonic-gate 
11140Sstevel@tonic-gate 	if (rcfg.rcfg_memory_cap_enforcement_pressure == 0) {
11150Sstevel@tonic-gate 		/*
11160Sstevel@tonic-gate 		 * Always enforce caps when strict caps are used.
11170Sstevel@tonic-gate 		 */
11180Sstevel@tonic-gate 		enforce_caps = 1;
11190Sstevel@tonic-gate 	}
11200Sstevel@tonic-gate 
11210Sstevel@tonic-gate 	/*
11220Sstevel@tonic-gate 	 * Open the kstat chain.
11230Sstevel@tonic-gate 	 */
11240Sstevel@tonic-gate 	kctl = kstat_open();
11250Sstevel@tonic-gate 	if (kctl == NULL)
11260Sstevel@tonic-gate 		die(gettext("can't open kstats"));
11270Sstevel@tonic-gate 
11280Sstevel@tonic-gate 	/*
11290Sstevel@tonic-gate 	 * Set RLIMIT_NOFILE as high as practical, so roughly 10K processes can
11300Sstevel@tonic-gate 	 * be effectively managed without revoking descriptors (at 3 per
11310Sstevel@tonic-gate 	 * process).
11320Sstevel@tonic-gate 	 */
11330Sstevel@tonic-gate 	rl.rlim_cur = 32 * 1024;
11340Sstevel@tonic-gate 	rl.rlim_max = 32 * 1024;
11350Sstevel@tonic-gate 	if (setrlimit(RLIMIT_NOFILE, &rl) != 0 &&
11360Sstevel@tonic-gate 	    getrlimit(RLIMIT_NOFILE, &rl) == 0) {
11370Sstevel@tonic-gate 		rl.rlim_cur = rl.rlim_max;
11380Sstevel@tonic-gate 		(void) setrlimit(RLIMIT_NOFILE, &rl);
11390Sstevel@tonic-gate 	}
11400Sstevel@tonic-gate 	if (getrlimit(RLIMIT_NOFILE, &rl) == 0)
11410Sstevel@tonic-gate 		debug("fd limit: %lu\n", rl.rlim_cur);
11420Sstevel@tonic-gate 	else
11430Sstevel@tonic-gate 		debug("fd limit: unknown\n");
11440Sstevel@tonic-gate 
11450Sstevel@tonic-gate 	/*
11460Sstevel@tonic-gate 	 * Handle those signals whose (default) exit disposition
11470Sstevel@tonic-gate 	 * prevents rcapd from finishing scanning before terminating.
11480Sstevel@tonic-gate 	 */
11490Sstevel@tonic-gate 	(void) sigset(SIGINT, terminate_signal);
11500Sstevel@tonic-gate 	(void) sigset(SIGQUIT, abort_signal);
11510Sstevel@tonic-gate 	(void) sigset(SIGILL, abort_signal);
11520Sstevel@tonic-gate 	(void) sigset(SIGEMT, abort_signal);
11530Sstevel@tonic-gate 	(void) sigset(SIGFPE, abort_signal);
11540Sstevel@tonic-gate 	(void) sigset(SIGBUS, abort_signal);
11550Sstevel@tonic-gate 	(void) sigset(SIGSEGV, abort_signal);
11560Sstevel@tonic-gate 	(void) sigset(SIGSYS, abort_signal);
11570Sstevel@tonic-gate 	(void) sigset(SIGPIPE, terminate_signal);
11580Sstevel@tonic-gate 	(void) sigset(SIGALRM, terminate_signal);
11590Sstevel@tonic-gate 	(void) sigset(SIGTERM, terminate_signal);
11600Sstevel@tonic-gate 	(void) sigset(SIGUSR1, terminate_signal);
11610Sstevel@tonic-gate 	(void) sigset(SIGUSR2, terminate_signal);
11620Sstevel@tonic-gate 	(void) sigset(SIGPOLL, terminate_signal);
11630Sstevel@tonic-gate 	(void) sigset(SIGVTALRM, terminate_signal);
11640Sstevel@tonic-gate 	(void) sigset(SIGXCPU, abort_signal);
11650Sstevel@tonic-gate 	(void) sigset(SIGXFSZ, abort_signal);
11660Sstevel@tonic-gate 	for (sig = SIGRTMIN; sig <= SIGRTMAX; sig++)
11670Sstevel@tonic-gate 		(void) sigset(sig, terminate_signal);
11680Sstevel@tonic-gate 
11690Sstevel@tonic-gate 	/*
11700Sstevel@tonic-gate 	 * Install a signal handler for reconfiguration processing.
11710Sstevel@tonic-gate 	 */
11720Sstevel@tonic-gate 	(void) sigset(SIGHUP, sighup);
11730Sstevel@tonic-gate 
11740Sstevel@tonic-gate 	/*
11750Sstevel@tonic-gate 	 * Determine which process collections to cap.
11760Sstevel@tonic-gate 	 */
11770Sstevel@tonic-gate 	lcollection_update(LCU_COMPLETE);
11780Sstevel@tonic-gate 
11790Sstevel@tonic-gate 	/*
11800Sstevel@tonic-gate 	 * Loop forever, monitoring collections' resident set sizes and
11810Sstevel@tonic-gate 	 * enforcing their caps.  Look for changes in caps and process
11820Sstevel@tonic-gate 	 * membership, as well as responding to requests to reread the
11830Sstevel@tonic-gate 	 * configuration.  Update per-collection statistics periodically.
11840Sstevel@tonic-gate 	 */
11850Sstevel@tonic-gate 	while (should_run != 0) {
11860Sstevel@tonic-gate 		struct timespec ts;
11870Sstevel@tonic-gate 
11880Sstevel@tonic-gate 		/*
11890Sstevel@tonic-gate 		 * Announce that rcapd is starting.
11900Sstevel@tonic-gate 		 */
11910Sstevel@tonic-gate 		if (ever_ran == 0) {
11920Sstevel@tonic-gate 			info(gettext("starting\n"));
11930Sstevel@tonic-gate 			ever_ran = 1;
11940Sstevel@tonic-gate 		}
11950Sstevel@tonic-gate 
11960Sstevel@tonic-gate 		/*
11970Sstevel@tonic-gate 		 * Update the process list once every proc_walk_interval.  The
11980Sstevel@tonic-gate 		 * condition of global memory pressure is also checked at the
11990Sstevel@tonic-gate 		 * same frequency, if strict caps are in use.
12000Sstevel@tonic-gate 		 */
12010Sstevel@tonic-gate 		now = gethrtime();
12020Sstevel@tonic-gate 
12030Sstevel@tonic-gate 		/*
12040Sstevel@tonic-gate 		 * Detect configuration and cap changes at every
12050Sstevel@tonic-gate 		 * reconfiguration_interval, or when SIGHUP has been received.
12060Sstevel@tonic-gate 		 */
12070Sstevel@tonic-gate 		if (EVENT_TIME(now, next_configuration) ||
12080Sstevel@tonic-gate 		    should_reconfigure == 1) {
12090Sstevel@tonic-gate 			reconfigure();
12100Sstevel@tonic-gate 			next_configuration = NEXT_EVENT_TIME(now,
12110Sstevel@tonic-gate 			    rcfg.rcfg_reconfiguration_interval);
12120Sstevel@tonic-gate 
12130Sstevel@tonic-gate 			/*
12140Sstevel@tonic-gate 			 * Reset each event time to the shorter of the
12150Sstevel@tonic-gate 			 * previous and new intervals.
12160Sstevel@tonic-gate 			 */
12170Sstevel@tonic-gate 			if (next_report == 0 &&
12180Sstevel@tonic-gate 			    rcfg.rcfg_report_interval > 0)
12190Sstevel@tonic-gate 				next_report = now;
12200Sstevel@tonic-gate 			else
12210Sstevel@tonic-gate 				next_report = POSITIVE_MIN(next_report,
12220Sstevel@tonic-gate 				    NEXT_REPORT_EVENT_TIME(now,
12230Sstevel@tonic-gate 				    rcfg.rcfg_report_interval));
12240Sstevel@tonic-gate 			if (next_proc_walk == 0 &&
12250Sstevel@tonic-gate 			    rcfg.rcfg_proc_walk_interval > 0)
12260Sstevel@tonic-gate 				next_proc_walk = now;
12270Sstevel@tonic-gate 			else
12280Sstevel@tonic-gate 				next_proc_walk = POSITIVE_MIN(next_proc_walk,
12290Sstevel@tonic-gate 				    NEXT_EVENT_TIME(now,
12300Sstevel@tonic-gate 				    rcfg.rcfg_proc_walk_interval));
12310Sstevel@tonic-gate 			if (next_rss_sample == 0 &&
12320Sstevel@tonic-gate 			    rcfg.rcfg_rss_sample_interval > 0)
12330Sstevel@tonic-gate 				next_rss_sample = now;
12340Sstevel@tonic-gate 			else
12350Sstevel@tonic-gate 				next_rss_sample = POSITIVE_MIN(next_rss_sample,
12360Sstevel@tonic-gate 				    NEXT_EVENT_TIME(now,
12370Sstevel@tonic-gate 				    rcfg.rcfg_rss_sample_interval));
12380Sstevel@tonic-gate 
12390Sstevel@tonic-gate 			should_reconfigure = 0;
12400Sstevel@tonic-gate 			continue;
12410Sstevel@tonic-gate 		}
12420Sstevel@tonic-gate 
12430Sstevel@tonic-gate 		if (EVENT_TIME(now, next_proc_walk)) {
12440Sstevel@tonic-gate 			debug("scanning process list...\n");
12450Sstevel@tonic-gate 			proc_walk_all(proc_cb); /* mark */
12460Sstevel@tonic-gate 			list_walk_all(sweep_process_cb);
12470Sstevel@tonic-gate 			next_proc_walk = NEXT_EVENT_TIME(now,
12480Sstevel@tonic-gate 			    rcfg.rcfg_proc_walk_interval);
12490Sstevel@tonic-gate 		}
12500Sstevel@tonic-gate 
12510Sstevel@tonic-gate 		if (EVENT_TIME(now, next_rss_sample)) {
12520Sstevel@tonic-gate 			/*
12530Sstevel@tonic-gate 			 * Check for changes to the amount of installed
12540Sstevel@tonic-gate 			 * physical memory, to compute the current memory
12550Sstevel@tonic-gate 			 * pressure.
12560Sstevel@tonic-gate 			 */
12570Sstevel@tonic-gate 			update_phys_total();
12580Sstevel@tonic-gate 
12590Sstevel@tonic-gate 			/*
12600Sstevel@tonic-gate 			 * If soft caps are in use, determine if global memory
12610Sstevel@tonic-gate 			 * pressure exceeds the configured maximum above which
12620Sstevel@tonic-gate 			 * soft caps are enforced.
12630Sstevel@tonic-gate 			 */
12640Sstevel@tonic-gate 			memory_pressure = 100 -
12650Sstevel@tonic-gate 			    (int)((sysconf(_SC_AVPHYS_PAGES) *
12660Sstevel@tonic-gate 			    (sysconf(_SC_PAGESIZE) / 1024)) * 100.0 /
12670Sstevel@tonic-gate 			    phys_total);
12680Sstevel@tonic-gate 			memory_pressure_sample++;
12690Sstevel@tonic-gate 			if (rcfg.rcfg_memory_cap_enforcement_pressure > 0) {
12700Sstevel@tonic-gate 				if (memory_pressure >
12710Sstevel@tonic-gate 				    rcfg.rcfg_memory_cap_enforcement_pressure) {
12720Sstevel@tonic-gate 					if (enforce_soft_caps == 0) {
12730Sstevel@tonic-gate 						debug("memory pressure %d%%\n",
12740Sstevel@tonic-gate 						    memory_pressure);
12750Sstevel@tonic-gate 						enforce_soft_caps = 1;
12760Sstevel@tonic-gate 					}
12770Sstevel@tonic-gate 				} else {
12780Sstevel@tonic-gate 					if (enforce_soft_caps == 1)
12790Sstevel@tonic-gate 						enforce_soft_caps = 0;
12800Sstevel@tonic-gate 				}
12810Sstevel@tonic-gate 			}
12820Sstevel@tonic-gate 
12830Sstevel@tonic-gate 			/*
12840Sstevel@tonic-gate 			 * Determine if the global page scanner is running,
12850Sstevel@tonic-gate 			 * while which no memory caps should be enforced, to
12860Sstevel@tonic-gate 			 * prevent interference with the global page scanner.
12870Sstevel@tonic-gate 			 */
12880Sstevel@tonic-gate 			if (get_globally_scanned_pages(&new_sp) == 0) {
12890Sstevel@tonic-gate 				if (old_sp == 0)
12900Sstevel@tonic-gate 					/*EMPTY*/
12910Sstevel@tonic-gate 					;
12920Sstevel@tonic-gate 				else if ((new_sp - old_sp) > 0) {
12930Sstevel@tonic-gate 					if (global_scanner_running == 0) {
12940Sstevel@tonic-gate 						debug("global memory pressure "
12950Sstevel@tonic-gate 						    "detected (%llu pages "
12960Sstevel@tonic-gate 						    "scanned since last "
12970Sstevel@tonic-gate 						    "interval)\n",
12980Sstevel@tonic-gate 						    (unsigned long long)
12990Sstevel@tonic-gate 						    (new_sp - old_sp));
13000Sstevel@tonic-gate 						global_scanner_running = 1;
13010Sstevel@tonic-gate 					}
13020Sstevel@tonic-gate 				} else if (global_scanner_running == 1) {
13030Sstevel@tonic-gate 					debug("global memory pressure "
13040Sstevel@tonic-gate 					    "relieved\n");
13050Sstevel@tonic-gate 					global_scanner_running = 0;
13060Sstevel@tonic-gate 				}
13070Sstevel@tonic-gate 				old_sp = new_sp;
13080Sstevel@tonic-gate 			} else {
13090Sstevel@tonic-gate 				warn(gettext("kstat_read() failed"));
13100Sstevel@tonic-gate 				new_sp = old_sp;
13110Sstevel@tonic-gate 			}
13120Sstevel@tonic-gate 
13130Sstevel@tonic-gate 			/*
13140Sstevel@tonic-gate 			 * Cap enforcement is determined by the previous two
13150Sstevel@tonic-gate 			 * conditions.
13160Sstevel@tonic-gate 			 */
13170Sstevel@tonic-gate 			old_enforce_caps = enforce_caps;
13180Sstevel@tonic-gate 			enforce_caps =
13190Sstevel@tonic-gate 			    (rcfg.rcfg_memory_cap_enforcement_pressure ==
13200Sstevel@tonic-gate 			    0 || enforce_soft_caps == 1) &&
13210Sstevel@tonic-gate 			    !global_scanner_running;
13220Sstevel@tonic-gate 			if (old_enforce_caps != enforce_caps)
13230Sstevel@tonic-gate 				debug("%senforcing caps\n", enforce_caps == 0 ?
13240Sstevel@tonic-gate 				    "not " : "");
13250Sstevel@tonic-gate 
13260Sstevel@tonic-gate 			/*
13270Sstevel@tonic-gate 			 * Sample collections' member processes' RSSes and
13280Sstevel@tonic-gate 			 * recompute collections' excess.
13290Sstevel@tonic-gate 			 */
13300Sstevel@tonic-gate 			list_walk_all(mem_sample_cb);
13310Sstevel@tonic-gate 			list_walk_collection(collection_zero_mem_cb, NULL);
13320Sstevel@tonic-gate 			list_walk_all(memory_all_cb);
13330Sstevel@tonic-gate 			list_walk_collection(rss_sample_col_cb, NULL);
13340Sstevel@tonic-gate 			if (rcfg.rcfg_memory_cap_enforcement_pressure > 0)
13350Sstevel@tonic-gate 				debug("memory pressure %d%%\n",
13360Sstevel@tonic-gate 				    memory_pressure);
13370Sstevel@tonic-gate 			list_walk_collection(excess_print_cb, NULL);
13380Sstevel@tonic-gate 
13390Sstevel@tonic-gate 			/*
13400Sstevel@tonic-gate 			 * If soft caps are in use, determine the size of the
13410Sstevel@tonic-gate 			 * portion from each collection to scan for.
13420Sstevel@tonic-gate 			 */
13430Sstevel@tonic-gate 			if (enforce_soft_caps == 1) {
13440Sstevel@tonic-gate 				/*
13450Sstevel@tonic-gate 				 * Compute the sum of the collections'
13460Sstevel@tonic-gate 				 * excesses, which will be the denominator.
13470Sstevel@tonic-gate 				 */
13480Sstevel@tonic-gate 				arg.ssa_sum_excess = 0;
13490Sstevel@tonic-gate 				list_walk_collection(sum_excess_cb,
13500Sstevel@tonic-gate 				    &arg.ssa_sum_excess);
13510Sstevel@tonic-gate 
13520Sstevel@tonic-gate 				/*
13530Sstevel@tonic-gate 				 * Compute the quantity of memory (in
13540Sstevel@tonic-gate 				 * kilobytes) above the cap enforcement
13550Sstevel@tonic-gate 				 * pressure.  Set the scan goal to that
13560Sstevel@tonic-gate 				 * quantity (or at most the excess).
13570Sstevel@tonic-gate 				 */
13580Sstevel@tonic-gate 				arg.ssa_scan_goal = MIN((
13590Sstevel@tonic-gate 				    sysconf(_SC_PHYS_PAGES) * (100 -
13600Sstevel@tonic-gate 				    rcfg.rcfg_memory_cap_enforcement_pressure)
13610Sstevel@tonic-gate 				    / 100 - sysconf(_SC_AVPHYS_PAGES)) *
13620Sstevel@tonic-gate 				    (sysconf(_SC_PAGESIZE) / 1024),
13630Sstevel@tonic-gate 				    arg.ssa_sum_excess);
13640Sstevel@tonic-gate 			}
13650Sstevel@tonic-gate 
13660Sstevel@tonic-gate 			/*
13670Sstevel@tonic-gate 			 * Victimize offending collections.
13680Sstevel@tonic-gate 			 */
13690Sstevel@tonic-gate 			if (enforce_caps == 1 && ((enforce_soft_caps == 1 &&
13700Sstevel@tonic-gate 			    arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0) ||
13710Sstevel@tonic-gate 			    (enforce_soft_caps == 0)))
13720Sstevel@tonic-gate 				if (enforce_soft_caps == 1) {
13730Sstevel@tonic-gate 					debug("scan goal is %lldKB\n",
13740Sstevel@tonic-gate 					    (long long)arg.ssa_scan_goal);
13750Sstevel@tonic-gate 					list_walk_collection(soft_scan_cb,
13760Sstevel@tonic-gate 					    &arg);
13770Sstevel@tonic-gate 				} else
13780Sstevel@tonic-gate 					list_walk_collection(scan_cb, NULL);
13790Sstevel@tonic-gate 			else
13800Sstevel@tonic-gate 				list_walk_collection(unenforced_cap_cb, NULL);
13810Sstevel@tonic-gate 
13820Sstevel@tonic-gate 			next_rss_sample = NEXT_EVENT_TIME(now,
13830Sstevel@tonic-gate 			    rcfg.rcfg_rss_sample_interval);
13840Sstevel@tonic-gate 		}
13850Sstevel@tonic-gate 
13860Sstevel@tonic-gate 		/*
13870Sstevel@tonic-gate 		 * Update the statistics file, if it's time.
13880Sstevel@tonic-gate 		 */
13890Sstevel@tonic-gate 		check_update_statistics();
13900Sstevel@tonic-gate 
13910Sstevel@tonic-gate 		/*
13920Sstevel@tonic-gate 		 * Sleep for some time before repeating.
13930Sstevel@tonic-gate 		 */
13940Sstevel@tonic-gate 		now = gethrtime();
13950Sstevel@tonic-gate 		next = next_configuration;
13960Sstevel@tonic-gate 		next = POSITIVE_MIN(next, next_proc_walk);
13970Sstevel@tonic-gate 		next = POSITIVE_MIN(next, next_report);
13980Sstevel@tonic-gate 		next = POSITIVE_MIN(next, next_rss_sample);
13990Sstevel@tonic-gate 		if (next > now && should_run != 0) {
14000Sstevel@tonic-gate 			debug("sleeping %-4.2f seconds\n", (float)(next -
14010Sstevel@tonic-gate 			    now) / (float)NANOSEC);
14020Sstevel@tonic-gate 			hrt2ts(next - now, &ts);
14030Sstevel@tonic-gate 			(void) nanosleep(&ts, NULL);
14040Sstevel@tonic-gate 		}
14050Sstevel@tonic-gate 	}
14060Sstevel@tonic-gate 	if (termination_signal != 0)
14070Sstevel@tonic-gate 		debug("exiting due to signal %d\n", termination_signal);
14080Sstevel@tonic-gate 	if (ever_ran != 0)
14090Sstevel@tonic-gate 		info(gettext("exiting\n"));
14100Sstevel@tonic-gate 
14110Sstevel@tonic-gate 	/*
14120Sstevel@tonic-gate 	 * Unlink the statistics file before exiting.
14130Sstevel@tonic-gate 	 */
14140Sstevel@tonic-gate 	if (rcfg.rcfg_stat_file[0] != 0)
14150Sstevel@tonic-gate 		(void) unlink(rcfg.rcfg_stat_file);
14160Sstevel@tonic-gate 
14170Sstevel@tonic-gate 	return (E_SUCCESS);
14180Sstevel@tonic-gate }
1419