xref: /onnv-gate/usr/src/cmd/rcap/rcapd/rcapd_main.c (revision 13093:48f2dbca79a2)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
51914Scasper  * Common Development and Distribution License (the "License").
61914Scasper  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
21*13093SRoger.Faulkner@Oracle.COM 
220Sstevel@tonic-gate /*
23*13093SRoger.Faulkner@Oracle.COM  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate /*
270Sstevel@tonic-gate  * rcapd is a long-running daemon enforcing project-based resource caps (see
280Sstevel@tonic-gate  * rcapd(1M)).  Each instance of a process aggregate (project or, generically,
290Sstevel@tonic-gate  * "collection") may have a memory cap.  A single thread monitors the resource
300Sstevel@tonic-gate  * utilization of capped collections, enforces caps when they are exceeded (and
310Sstevel@tonic-gate  * other conditions are met), and incorporates changes in configuration or
320Sstevel@tonic-gate  * caps.  Each of these actions occurs not more frequently than the rate
330Sstevel@tonic-gate  * specified with rcapadm(1M).
340Sstevel@tonic-gate  */
350Sstevel@tonic-gate 
360Sstevel@tonic-gate #include <sys/priocntl.h>
370Sstevel@tonic-gate #include <sys/proc.h>
380Sstevel@tonic-gate #include <sys/resource.h>
390Sstevel@tonic-gate #include <sys/sysinfo.h>
400Sstevel@tonic-gate #include <sys/stat.h>
410Sstevel@tonic-gate #include <sys/sysmacros.h>
420Sstevel@tonic-gate #include <sys/time.h>
430Sstevel@tonic-gate #include <sys/types.h>
440Sstevel@tonic-gate #include <dirent.h>
450Sstevel@tonic-gate #include <errno.h>
460Sstevel@tonic-gate #include <fcntl.h>
470Sstevel@tonic-gate #include <kstat.h>
480Sstevel@tonic-gate #include <libintl.h>
490Sstevel@tonic-gate #include <limits.h>
500Sstevel@tonic-gate #include <locale.h>
510Sstevel@tonic-gate #include <priv.h>
520Sstevel@tonic-gate #include <signal.h>
530Sstevel@tonic-gate #include <stdarg.h>
540Sstevel@tonic-gate #include <stdio.h>
551914Scasper #include <stdio_ext.h>
560Sstevel@tonic-gate #include <stdlib.h>
574119Stn143363 #include <libscf.h>
580Sstevel@tonic-gate #include <strings.h>
590Sstevel@tonic-gate #include <time.h>
600Sstevel@tonic-gate #include <unistd.h>
610Sstevel@tonic-gate #include <zone.h>
620Sstevel@tonic-gate #include <assert.h>
633247Sgjelinek #include <sys/vm_usage.h>
640Sstevel@tonic-gate #include "rcapd.h"
650Sstevel@tonic-gate #include "rcapd_mapping.h"
660Sstevel@tonic-gate #include "rcapd_rfd.h"
670Sstevel@tonic-gate #include "rcapd_stat.h"
680Sstevel@tonic-gate #include "utils.h"
690Sstevel@tonic-gate 
/*
 * Smaller of x and y, disregarding non-positive values: if one argument is
 * <= 0 the other one is the result.  Arguments may be evaluated more than
 * once.
 */
#define	POSITIVE_MIN(x, y) \
	(((x) <= 0) ? (y) : ((y) <= 0) ? (x) : MIN(x, y))

/*
 * Time of an event occurring `seconds' after `base', or 0 if the interval is
 * not positive.  Both arguments are parenthesized so that compound
 * expressions (e.g. "a + b") expand as intended.
 */
#define	NEXT_EVENT_TIME(base, seconds) \
	(((int)(seconds) > 0) ? \
	    ((base) + (hrtime_t)(seconds) * (hrtime_t)NANOSEC) : (hrtime_t)0)

/*
 * Time of the next report, or 0 when no statistics file is configured.
 * NOTE: `base' is not used; the interval is always counted from the current
 * gethrtime().
 */
#define	NEXT_REPORT_EVENT_TIME(base, seconds) \
	((rcfg.rcfg_stat_file[0] != 0) ?  \
	    NEXT_EVENT_TIME(gethrtime(), seconds) : (hrtime_t)0)

/*
 * True when `time' is past `eventtime'; an eventtime of 0 never fires.
 */
#define	EVENT_TIME(time, eventtime) \
	(((time) > (eventtime)) && (eventtime) != 0)

#define	STAT_TEMPLATE_SUFFIX	".XXXXXX"	/* suffix of mkstemp() arg */
#define	DAEMON_UID		1		/* uid to use */

/* Bit flags recording which kinds of capped collections exist. */
#define	CAPPED_PROJECT	0x01
#define	CAPPED_ZONE	0x02
853247Sgjelinek 
860Sstevel@tonic-gate typedef struct soft_scan_arg {
870Sstevel@tonic-gate 	uint64_t ssa_sum_excess;
880Sstevel@tonic-gate 	int64_t ssa_scan_goal;
893247Sgjelinek 	boolean_t ssa_project_over_cap;
900Sstevel@tonic-gate } soft_scan_arg_t;
910Sstevel@tonic-gate 
923247Sgjelinek typedef struct sample_col_arg {
933247Sgjelinek 	boolean_t sca_any_over_cap;
943247Sgjelinek 	boolean_t sca_project_over_cap;
953247Sgjelinek } sample_col_arg_t;
963247Sgjelinek 
973247Sgjelinek 
980Sstevel@tonic-gate static int debug_mode = 0;		/* debug mode flag */
990Sstevel@tonic-gate static pid_t rcapd_pid;			/* rcapd's pid to ensure it's not */
1000Sstevel@tonic-gate 					/* scanned */
1010Sstevel@tonic-gate static kstat_ctl_t *kctl;		/* kstat chain */
1020Sstevel@tonic-gate static int memory_pressure = 0;		/* physical memory utilization (%) */
1030Sstevel@tonic-gate static int memory_pressure_sample = 0;	/* count of samples */
1043247Sgjelinek static long page_size_kb = 0;		/* system page size in KB */
1053247Sgjelinek static size_t nvmu_vals = 0;		/* # of kernel RSS/swap vals in array */
1063247Sgjelinek static size_t vmu_vals_len = 0;		/* size of RSS/swap vals array */
1073247Sgjelinek static vmusage_t *vmu_vals = NULL;	/* snapshot of kernel RSS/swap values */
1080Sstevel@tonic-gate static hrtime_t next_report;		/* time of next report */
1090Sstevel@tonic-gate static int termination_signal = 0;	/* terminating signal */
1103247Sgjelinek static zoneid_t my_zoneid = (zoneid_t)-1;
1113247Sgjelinek static lcollection_t *gz_col;		/* global zone collection */
1120Sstevel@tonic-gate 
1130Sstevel@tonic-gate rcfg_t rcfg;
1143247Sgjelinek /*
1153247Sgjelinek  * Updated when we re-read the collection configurations if this rcapd instance
1163247Sgjelinek  * is running in the global zone and the global zone is capped.
1173247Sgjelinek  */
1183247Sgjelinek boolean_t gz_capped = B_FALSE;
1190Sstevel@tonic-gate 
1200Sstevel@tonic-gate /*
1210Sstevel@tonic-gate  * Flags.
1220Sstevel@tonic-gate  */
1230Sstevel@tonic-gate static int ever_ran;
1240Sstevel@tonic-gate int should_run;
1250Sstevel@tonic-gate static int should_reconfigure;
1260Sstevel@tonic-gate 
1270Sstevel@tonic-gate static int verify_statistics(void);
1280Sstevel@tonic-gate static int update_statistics(void);
1290Sstevel@tonic-gate 
1300Sstevel@tonic-gate /*
1313247Sgjelinek  * Checks if a process is marked 'system'.  Returns FALSE only when it is not.
1320Sstevel@tonic-gate  */
1333247Sgjelinek static boolean_t
proc_issystem(pid_t pid)1340Sstevel@tonic-gate proc_issystem(pid_t pid)
1350Sstevel@tonic-gate {
1360Sstevel@tonic-gate 	char pc_clname[PC_CLNMSZ];
1370Sstevel@tonic-gate 
1380Sstevel@tonic-gate 	if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
1390Sstevel@tonic-gate 	    PC_KY_NULL) != -1) {
1400Sstevel@tonic-gate 		return (strcmp(pc_clname, "SYS") == 0);
1410Sstevel@tonic-gate 	} else {
1420Sstevel@tonic-gate 		debug("cannot get class-specific scheduling parameters; "
1433247Sgjelinek 		    "assuming system process\n");
1443247Sgjelinek 		return (B_TRUE);
1450Sstevel@tonic-gate 	}
1460Sstevel@tonic-gate }
1470Sstevel@tonic-gate 
1480Sstevel@tonic-gate static void
lprocess_insert_mark(psinfo_t * psinfop)1493247Sgjelinek lprocess_insert_mark(psinfo_t *psinfop)
1500Sstevel@tonic-gate {
1513247Sgjelinek 	pid_t pid = psinfop->pr_pid;
1523247Sgjelinek 	/* flag indicating whether the process should be scanned. */
1533247Sgjelinek 	int unscannable = psinfop->pr_nlwp == 0;
1543247Sgjelinek 	rcid_t colid;
1550Sstevel@tonic-gate 	lcollection_t *lcol;
1560Sstevel@tonic-gate 	lprocess_t *lproc;
1570Sstevel@tonic-gate 
1583247Sgjelinek 	/*
1593247Sgjelinek 	 * Determine which collection to put this process into.  We only have
1603247Sgjelinek 	 * to worry about tracking both zone and project capped processes if
1613247Sgjelinek 	 * this rcapd instance is running in the global zone, since we'll only
1623247Sgjelinek 	 * see processes in our own projects in a non-global zone.  In the
1633247Sgjelinek 	 * global zone, if the process belongs to a non-global zone, we only
1643247Sgjelinek 	 * need to track it for the capped non-global zone collection.  For
1653247Sgjelinek 	 * global zone processes, we first attempt to put the process into a
1663247Sgjelinek 	 * capped project collection.  On the second pass into this function
1673247Sgjelinek 	 * the projid will be cleared so we will just track the process for the
1683247Sgjelinek 	 * global zone collection as a whole.
1693247Sgjelinek 	 */
1703247Sgjelinek 	if (psinfop->pr_zoneid == my_zoneid && psinfop->pr_projid != -1) {
1713247Sgjelinek 		colid.rcid_type = RCIDT_PROJECT;
1723247Sgjelinek 		colid.rcid_val = psinfop->pr_projid;
1733247Sgjelinek 	} else {
1743247Sgjelinek 		/* try to add to zone collection */
1753247Sgjelinek 		colid.rcid_type = RCIDT_ZONE;
1763247Sgjelinek 		colid.rcid_val = psinfop->pr_zoneid;
1773247Sgjelinek 	}
1783247Sgjelinek 
1793247Sgjelinek 	if ((lcol = lcollection_find(&colid)) == NULL)
1800Sstevel@tonic-gate 		return;
1810Sstevel@tonic-gate 
1820Sstevel@tonic-gate 	/*
1830Sstevel@tonic-gate 	 * If the process is already being tracked, update the unscannable flag,
1840Sstevel@tonic-gate 	 * as determined by the caller, from the process's psinfo.
1850Sstevel@tonic-gate 	 */
1860Sstevel@tonic-gate 	lproc = lcol->lcol_lprocess;
1870Sstevel@tonic-gate 	while (lproc != NULL) {
1880Sstevel@tonic-gate 		if (lproc->lpc_pid == pid) {
1890Sstevel@tonic-gate 			lproc->lpc_mark = 1;
1900Sstevel@tonic-gate 			if (unscannable != 0 && lproc->lpc_unscannable == 0) {
1910Sstevel@tonic-gate 				debug("process %d: became unscannable\n",
1920Sstevel@tonic-gate 				    (int)lproc->lpc_pid);
1930Sstevel@tonic-gate 				lproc->lpc_unscannable = 1;
1940Sstevel@tonic-gate 			}
1950Sstevel@tonic-gate 			return;
1960Sstevel@tonic-gate 		}
1970Sstevel@tonic-gate 		lproc = lproc->lpc_next;
1980Sstevel@tonic-gate 	}
1990Sstevel@tonic-gate 
2000Sstevel@tonic-gate 	/*
2010Sstevel@tonic-gate 	 * We've fallen off the list without finding our current process;
2020Sstevel@tonic-gate 	 * insert it at the list head.
2030Sstevel@tonic-gate 	 */
2040Sstevel@tonic-gate 	if ((lproc = malloc(sizeof (*lproc))) == NULL)
2050Sstevel@tonic-gate 		debug("insufficient memory to track new process %d", (int)pid);
2060Sstevel@tonic-gate 	else {
2070Sstevel@tonic-gate 		(void) bzero(lproc, sizeof (*lproc));
2080Sstevel@tonic-gate 		lproc->lpc_pid = pid;
2090Sstevel@tonic-gate 		lproc->lpc_mark = 1;
2100Sstevel@tonic-gate 		lproc->lpc_collection = lcol;
2110Sstevel@tonic-gate 		lproc->lpc_psinfo_fd = -1;
2120Sstevel@tonic-gate 		lproc->lpc_pgdata_fd = -1;
2130Sstevel@tonic-gate 		lproc->lpc_xmap_fd = -1;
2140Sstevel@tonic-gate 
2150Sstevel@tonic-gate 		/*
2160Sstevel@tonic-gate 		 * If the caller didn't flag this process as unscannable
2170Sstevel@tonic-gate 		 * already, do some more checking.
2180Sstevel@tonic-gate 		 */
2190Sstevel@tonic-gate 		lproc->lpc_unscannable = unscannable || proc_issystem(pid);
2200Sstevel@tonic-gate 
2210Sstevel@tonic-gate #ifdef DEBUG
2220Sstevel@tonic-gate 		/*
2230Sstevel@tonic-gate 		 * Verify the sanity of lprocess.  It should not contain the
2240Sstevel@tonic-gate 		 * process we are about to prepend.
2250Sstevel@tonic-gate 		 */
2260Sstevel@tonic-gate 		if (lcollection_member(lcol, lproc)) {
2270Sstevel@tonic-gate 			lprocess_t *cur = lcol->lcol_lprocess;
2280Sstevel@tonic-gate 			debug("The collection %lld already has these members, "
2293247Sgjelinek 			    "including me, %d!\n",
2303247Sgjelinek 			    (long long)lcol->lcol_id.rcid_val,
2310Sstevel@tonic-gate 			    (int)lproc->lpc_pid);
2320Sstevel@tonic-gate 			while (cur != NULL) {
2330Sstevel@tonic-gate 				debug("\t%d\n", (int)cur->lpc_pid);
2340Sstevel@tonic-gate 				cur = cur->lpc_next;
2350Sstevel@tonic-gate 			}
2360Sstevel@tonic-gate 			info(gettext("process already on lprocess\n"));
2370Sstevel@tonic-gate 			abort();
2380Sstevel@tonic-gate 		}
2390Sstevel@tonic-gate #endif /* DEBUG */
2400Sstevel@tonic-gate 		lproc->lpc_next = lcol->lcol_lprocess;
2410Sstevel@tonic-gate 		if (lproc->lpc_next != NULL)
2420Sstevel@tonic-gate 			lproc->lpc_next->lpc_prev = lproc;
2430Sstevel@tonic-gate 		lproc->lpc_prev = NULL;
2440Sstevel@tonic-gate 		lcol->lcol_lprocess = lproc;
2450Sstevel@tonic-gate 
2463247Sgjelinek 		debug("tracking %s %ld %d %s%s\n",
2473247Sgjelinek 		    (colid.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
2483247Sgjelinek 		    (long)colid.rcid_val,
2493247Sgjelinek 		    (int)pid, psinfop->pr_psargs,
2500Sstevel@tonic-gate 		    (lproc->lpc_unscannable != 0) ? " (not scannable)" : "");
2510Sstevel@tonic-gate 		lcol->lcol_stat.lcols_proc_in++;
2520Sstevel@tonic-gate 	}
2530Sstevel@tonic-gate }
2540Sstevel@tonic-gate 
2550Sstevel@tonic-gate static int
list_walk_process_cb(lcollection_t * lcol,void * arg)2560Sstevel@tonic-gate list_walk_process_cb(lcollection_t *lcol, void *arg)
2570Sstevel@tonic-gate {
2580Sstevel@tonic-gate 	int (*cb)(lcollection_t *, lprocess_t *) =
2590Sstevel@tonic-gate 	    (int(*)(lcollection_t *, lprocess_t *))arg;
2600Sstevel@tonic-gate 	lprocess_t *member;
2610Sstevel@tonic-gate 	lprocess_t *next;
2620Sstevel@tonic-gate 
2630Sstevel@tonic-gate 	member = lcol->lcol_lprocess;
2640Sstevel@tonic-gate 	while (member != NULL) {
2650Sstevel@tonic-gate 		pid_t pid = member->lpc_pid;
2660Sstevel@tonic-gate 		next = member->lpc_next;
2670Sstevel@tonic-gate 
2680Sstevel@tonic-gate 		debug_high("list_walk_all lpc %d\n", (int)pid);
2690Sstevel@tonic-gate 		if (cb(lcol, member) != 0) {
2700Sstevel@tonic-gate 			debug_high("list_walk_all aborted at lpc %d\n",
2710Sstevel@tonic-gate 			    (int)pid);
2720Sstevel@tonic-gate 			return (1);
2730Sstevel@tonic-gate 		}
2740Sstevel@tonic-gate 		member = next;
2750Sstevel@tonic-gate 	}
2760Sstevel@tonic-gate 
2770Sstevel@tonic-gate 	return (0);
2780Sstevel@tonic-gate }
2790Sstevel@tonic-gate 
2800Sstevel@tonic-gate /*
2810Sstevel@tonic-gate  * Invoke the given callback for each process in each collection.  Callbacks
2820Sstevel@tonic-gate  * are allowed to change the linkage of the process on which they act.
2830Sstevel@tonic-gate  */
2840Sstevel@tonic-gate static void
list_walk_all(int (* cb)(lcollection_t *,lprocess_t *))2850Sstevel@tonic-gate list_walk_all(int (*cb)(lcollection_t *, lprocess_t *))
2860Sstevel@tonic-gate {
2870Sstevel@tonic-gate 	list_walk_collection(list_walk_process_cb, (void *)cb);
2880Sstevel@tonic-gate }
2890Sstevel@tonic-gate 
2900Sstevel@tonic-gate static void
revoke_psinfo(rfd_t * rfd)2910Sstevel@tonic-gate revoke_psinfo(rfd_t *rfd)
2920Sstevel@tonic-gate {
2930Sstevel@tonic-gate 	lprocess_t *lpc = (lprocess_t *)rfd->rfd_data;
2940Sstevel@tonic-gate 
2950Sstevel@tonic-gate 	if (lpc != NULL) {
2960Sstevel@tonic-gate 		debug("revoking psinfo fd for process %d\n", (int)lpc->lpc_pid);
2970Sstevel@tonic-gate 		ASSERT(lpc->lpc_psinfo_fd != -1);
2980Sstevel@tonic-gate 		lpc->lpc_psinfo_fd = -1;
2990Sstevel@tonic-gate 	} else
3000Sstevel@tonic-gate 		debug("revoking psinfo fd for unknown process\n");
3010Sstevel@tonic-gate }
3020Sstevel@tonic-gate 
3030Sstevel@tonic-gate /*
3040Sstevel@tonic-gate  * Retrieve a process's psinfo via an already-opened or new file descriptor.
3050Sstevel@tonic-gate  * The supplied descriptor will be closed on failure.  An optional callback
3060Sstevel@tonic-gate  * will be invoked with the last descriptor tried, and a supplied callback
3070Sstevel@tonic-gate  * argument, as its arguments, such that the new descriptor may be cached, or
3080Sstevel@tonic-gate  * an old one may be invalidated.  If the result of the callback is zero, the
3090Sstevel@tonic-gate  * the caller is to assume responsibility for the file descriptor, to close it
3100Sstevel@tonic-gate  * with rfd_close().
3110Sstevel@tonic-gate  *
3120Sstevel@tonic-gate  * On failure, a nonzero value is returned.
3130Sstevel@tonic-gate  */
3140Sstevel@tonic-gate int
get_psinfo(pid_t pid,psinfo_t * psinfo,int cached_fd,int (* fd_update_cb)(void *,int),void * arg,lprocess_t * lpc)3150Sstevel@tonic-gate get_psinfo(pid_t pid, psinfo_t *psinfo, int cached_fd,
3160Sstevel@tonic-gate     int(*fd_update_cb)(void *, int), void *arg, lprocess_t *lpc)
3170Sstevel@tonic-gate {
3180Sstevel@tonic-gate 	int fd;
3190Sstevel@tonic-gate 	int can_try_uncached;
3200Sstevel@tonic-gate 
3210Sstevel@tonic-gate 	ASSERT(!(cached_fd > 0 && fd_update_cb == NULL));
3220Sstevel@tonic-gate 
3230Sstevel@tonic-gate 	do {
3240Sstevel@tonic-gate 		if (cached_fd >= 0) {
3250Sstevel@tonic-gate 			fd = cached_fd;
3260Sstevel@tonic-gate 			can_try_uncached = 1;
3270Sstevel@tonic-gate 			debug_high("%d/psinfo, trying cached fd %d\n",
3280Sstevel@tonic-gate 			    (int)pid, fd);
3290Sstevel@tonic-gate 		} else {
3300Sstevel@tonic-gate 			char pathbuf[PROC_PATH_MAX];
3310Sstevel@tonic-gate 
3320Sstevel@tonic-gate 			can_try_uncached = 0;
3330Sstevel@tonic-gate 			(void) snprintf(pathbuf, sizeof (pathbuf),
3340Sstevel@tonic-gate 			    "/proc/%d/psinfo", (int)pid);
3350Sstevel@tonic-gate 			if ((fd = rfd_open(pathbuf, 1, RFD_PSINFO,
3360Sstevel@tonic-gate 			    revoke_psinfo, lpc, O_RDONLY, 0000)) < 0) {
3370Sstevel@tonic-gate 				debug("cannot open %s", pathbuf);
3380Sstevel@tonic-gate 				break;
3390Sstevel@tonic-gate 			} else
3400Sstevel@tonic-gate 				debug_high("opened %s, fd %d\n", pathbuf, fd);
3410Sstevel@tonic-gate 		}
3420Sstevel@tonic-gate 
3430Sstevel@tonic-gate 		if (pread(fd, psinfo, sizeof (*psinfo), 0) ==
3440Sstevel@tonic-gate 		    sizeof (*psinfo) && psinfo->pr_pid == pid)
3450Sstevel@tonic-gate 			break;
3460Sstevel@tonic-gate 		else {
3470Sstevel@tonic-gate 			debug_high("closed fd %d\n", fd);
3480Sstevel@tonic-gate 			if (rfd_close(fd) != 0)
3490Sstevel@tonic-gate 				debug("could not close fd %d", fd);
3500Sstevel@tonic-gate 			fd = cached_fd = -1;
3510Sstevel@tonic-gate 		}
3520Sstevel@tonic-gate 	} while (can_try_uncached == 1);
3530Sstevel@tonic-gate 
3540Sstevel@tonic-gate 	if (fd_update_cb == NULL || fd_update_cb(arg, fd) != 0)
3550Sstevel@tonic-gate 		if (fd >= 0) {
3560Sstevel@tonic-gate 			debug_high("closed %s fd %d\n", fd_update_cb == NULL ?
3570Sstevel@tonic-gate 			    "uncached" : "cached", fd);
3580Sstevel@tonic-gate 			if (rfd_close(fd) != 0)
3590Sstevel@tonic-gate 				debug("could not close fd %d", fd);
3600Sstevel@tonic-gate 		}
3610Sstevel@tonic-gate 
3620Sstevel@tonic-gate 	debug_high("get_psinfo ret %d, fd %d, %s\n", ((fd >= 0) ? 0 : -1), fd,
3630Sstevel@tonic-gate 	    fd_update_cb != NULL ? "cached" : "uncached");
3640Sstevel@tonic-gate 	return ((fd >= 0) ? 0 : -1);
3650Sstevel@tonic-gate }
3660Sstevel@tonic-gate 
3670Sstevel@tonic-gate /*
3683247Sgjelinek  * Retrieve the collection membership of all processes and update the psinfo of
3693247Sgjelinek  * those non-system, non-zombie ones in collections.  For global zone processes,
3703247Sgjelinek  * we first attempt to put the process into a capped project collection.  We
3713247Sgjelinek  * also want to track the process for the global zone collection as a whole.
3720Sstevel@tonic-gate  */
3730Sstevel@tonic-gate static void
proc_cb(const pid_t pid)3740Sstevel@tonic-gate proc_cb(const pid_t pid)
3750Sstevel@tonic-gate {
3760Sstevel@tonic-gate 	psinfo_t psinfo;
3770Sstevel@tonic-gate 
3783247Sgjelinek 	if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0) {
3793247Sgjelinek 		lprocess_insert_mark(&psinfo);
3803247Sgjelinek 		if (gz_capped && psinfo.pr_zoneid == GLOBAL_ZONEID) {
3813247Sgjelinek 			/*
3823247Sgjelinek 			 * We also want to track this process for the global
3833247Sgjelinek 			 * zone as a whole so add it to the global zone
3843247Sgjelinek 			 * collection as well.
3853247Sgjelinek 			 */
3863247Sgjelinek 			psinfo.pr_projid = -1;
3873247Sgjelinek 			lprocess_insert_mark(&psinfo);
3883247Sgjelinek 		}
3893247Sgjelinek 	}
3900Sstevel@tonic-gate }
3910Sstevel@tonic-gate 
3920Sstevel@tonic-gate /*
3930Sstevel@tonic-gate  * Cache the process' psinfo fd, taking responsibility for freeing it.
3940Sstevel@tonic-gate  */
3950Sstevel@tonic-gate int
lprocess_update_psinfo_fd_cb(void * arg,int fd)3960Sstevel@tonic-gate lprocess_update_psinfo_fd_cb(void *arg, int fd)
3970Sstevel@tonic-gate {
3980Sstevel@tonic-gate 	lprocess_t *lpc = arg;
3990Sstevel@tonic-gate 
4000Sstevel@tonic-gate 	lpc->lpc_psinfo_fd = fd;
4010Sstevel@tonic-gate 	return (0);
4020Sstevel@tonic-gate }
4030Sstevel@tonic-gate 
4040Sstevel@tonic-gate /*
4053247Sgjelinek  * Get the system pagesize.
4060Sstevel@tonic-gate  */
4073247Sgjelinek static void
get_page_size(void)4083247Sgjelinek get_page_size(void)
4090Sstevel@tonic-gate {
4103247Sgjelinek 	page_size_kb = sysconf(_SC_PAGESIZE) / 1024;
4113247Sgjelinek 	debug("physical page size: %luKB\n", page_size_kb);
4123247Sgjelinek }
4133247Sgjelinek 
4143247Sgjelinek static void
tm_fmt(char * msg,hrtime_t t1,hrtime_t t2)4153247Sgjelinek tm_fmt(char *msg, hrtime_t t1, hrtime_t t2)
4163247Sgjelinek {
4173247Sgjelinek 	hrtime_t diff = t2 - t1;
4180Sstevel@tonic-gate 
4193247Sgjelinek 	if (diff < MILLISEC)
4203247Sgjelinek 		debug("%s: %lld nanoseconds\n", msg, diff);
4213247Sgjelinek 	else if (diff < MICROSEC)
4223247Sgjelinek 		debug("%s: %.2f microseconds\n", msg, (float)diff / MILLISEC);
4233247Sgjelinek 	else if (diff < NANOSEC)
4243247Sgjelinek 		debug("%s: %.2f milliseconds\n", msg, (float)diff / MICROSEC);
4253247Sgjelinek 	else
4263247Sgjelinek 		debug("%s: %.2f seconds\n", msg, (float)diff / NANOSEC);
4273247Sgjelinek }
4283247Sgjelinek 
4293247Sgjelinek /*
4303247Sgjelinek  * Get the zone's & project's RSS from the kernel.
4313247Sgjelinek  */
4323247Sgjelinek static void
rss_sample(boolean_t my_zone_only,uint_t col_types)4333247Sgjelinek rss_sample(boolean_t my_zone_only, uint_t col_types)
4343247Sgjelinek {
4353247Sgjelinek 	size_t nres;
4363247Sgjelinek 	size_t i;
4373247Sgjelinek 	uint_t flags;
4383247Sgjelinek 	hrtime_t t1, t2;
4393247Sgjelinek 
4403247Sgjelinek 	if (my_zone_only) {
4413247Sgjelinek 		flags = VMUSAGE_ZONE;
4420Sstevel@tonic-gate 	} else {
4433247Sgjelinek 		flags = 0;
4443247Sgjelinek 		if (col_types & CAPPED_PROJECT)
4453247Sgjelinek 			flags |= VMUSAGE_PROJECTS;
4463247Sgjelinek 		if (col_types & CAPPED_ZONE && my_zoneid == GLOBAL_ZONEID)
4473247Sgjelinek 			flags |= VMUSAGE_ALL_ZONES;
4480Sstevel@tonic-gate 	}
4490Sstevel@tonic-gate 
4503247Sgjelinek 	debug("vmusage sample flags 0x%x\n", flags);
4513247Sgjelinek 	if (flags == 0)
4523247Sgjelinek 		return;
4533247Sgjelinek 
4543247Sgjelinek again:
4553247Sgjelinek 	/* try the current buffer to see if the list will fit */
4563247Sgjelinek 	nres = vmu_vals_len;
4573247Sgjelinek 	t1 = gethrtime();
4583247Sgjelinek 	if (getvmusage(flags, my_zone_only ? 0 : rcfg.rcfg_rss_sample_interval,
4593247Sgjelinek 	    vmu_vals, &nres) != 0) {
4603247Sgjelinek 		if (errno != EOVERFLOW) {
4613247Sgjelinek 			warn(gettext("can't read RSS from kernel\n"));
4623247Sgjelinek 			return;
4633247Sgjelinek 		}
4643247Sgjelinek 	}
4653247Sgjelinek 	t2 = gethrtime();
4663247Sgjelinek 	tm_fmt("getvmusage time", t1, t2);
4673247Sgjelinek 
4683247Sgjelinek 	debug("kernel nres %lu\n", (ulong_t)nres);
4693247Sgjelinek 
4703247Sgjelinek 	if (nres > vmu_vals_len) {
4713247Sgjelinek 		/* array size is now too small, increase it and try again */
4723247Sgjelinek 		free(vmu_vals);
4733247Sgjelinek 
4743247Sgjelinek 		if ((vmu_vals = (vmusage_t *)calloc(nres,
4753247Sgjelinek 		    sizeof (vmusage_t))) == NULL) {
4763247Sgjelinek 			warn(gettext("out of memory: could not read RSS from "
4773247Sgjelinek 			    "kernel\n"));
4783247Sgjelinek 			vmu_vals_len = nvmu_vals = 0;
4793247Sgjelinek 			return;
4803247Sgjelinek 		}
4813247Sgjelinek 		vmu_vals_len = nres;
4823247Sgjelinek 		goto again;
4833247Sgjelinek 	}
4843247Sgjelinek 
4853247Sgjelinek 	nvmu_vals = nres;
4863247Sgjelinek 
4873247Sgjelinek 	debug("vmusage_sample\n");
4883247Sgjelinek 	for (i = 0; i < nvmu_vals; i++) {
4893247Sgjelinek 		debug("%d: id: %d, type: 0x%x, rss_all: %llu (%lluKB), "
4903247Sgjelinek 		    "swap: %llu\n", (int)i, (int)vmu_vals[i].vmu_id,
4913247Sgjelinek 		    vmu_vals[i].vmu_type,
4923247Sgjelinek 		    (unsigned long long)vmu_vals[i].vmu_rss_all,
4933247Sgjelinek 		    (unsigned long long)vmu_vals[i].vmu_rss_all / 1024,
4943247Sgjelinek 		    (unsigned long long)vmu_vals[i].vmu_swap_all);
4953247Sgjelinek 	}
4963247Sgjelinek }
4973247Sgjelinek 
4983247Sgjelinek static void
update_col_rss(lcollection_t * lcol)4993247Sgjelinek update_col_rss(lcollection_t *lcol)
5003247Sgjelinek {
5013247Sgjelinek 	int i;
5023247Sgjelinek 
5033247Sgjelinek 	lcol->lcol_rss = 0;
5043247Sgjelinek 	lcol->lcol_image_size = 0;
5053247Sgjelinek 
5063247Sgjelinek 	for (i = 0; i < nvmu_vals; i++) {
5073247Sgjelinek 		if (vmu_vals[i].vmu_id != lcol->lcol_id.rcid_val)
5083247Sgjelinek 			continue;
5093247Sgjelinek 
5103247Sgjelinek 		if (vmu_vals[i].vmu_type == VMUSAGE_ZONE &&
5113247Sgjelinek 		    lcol->lcol_id.rcid_type != RCIDT_ZONE)
5123247Sgjelinek 			continue;
5133247Sgjelinek 
5143247Sgjelinek 		if (vmu_vals[i].vmu_type == VMUSAGE_PROJECTS &&
5153247Sgjelinek 		    lcol->lcol_id.rcid_type != RCIDT_PROJECT)
5163247Sgjelinek 			continue;
5173247Sgjelinek 
5183247Sgjelinek 		/* we found the right RSS entry, update the collection vals */
5193247Sgjelinek 		lcol->lcol_rss = vmu_vals[i].vmu_rss_all / 1024;
5203247Sgjelinek 		lcol->lcol_image_size = vmu_vals[i].vmu_swap_all / 1024;
5213247Sgjelinek 		break;
5223247Sgjelinek 	}
5230Sstevel@tonic-gate }
5240Sstevel@tonic-gate 
5250Sstevel@tonic-gate /*
5260Sstevel@tonic-gate  * Sample the collection RSS, updating the collection's statistics with the
5273247Sgjelinek  * results.  Also, sum the rss of all capped projects & return true if
5283247Sgjelinek  * the collection is over cap.
5290Sstevel@tonic-gate  */
5300Sstevel@tonic-gate static int
rss_sample_col_cb(lcollection_t * lcol,void * arg)5310Sstevel@tonic-gate rss_sample_col_cb(lcollection_t *lcol, void *arg)
5320Sstevel@tonic-gate {
5330Sstevel@tonic-gate 	int64_t excess;
5340Sstevel@tonic-gate 	uint64_t rss;
5353247Sgjelinek 	sample_col_arg_t *col_argp = (sample_col_arg_t *)arg;
5360Sstevel@tonic-gate 
5373247Sgjelinek 	update_col_rss(lcol);
5380Sstevel@tonic-gate 
5390Sstevel@tonic-gate 	lcol->lcol_stat.lcols_rss_sample++;
5400Sstevel@tonic-gate 	rss = lcol->lcol_rss;
5413247Sgjelinek 	excess = rss - lcol->lcol_rss_cap;
5423247Sgjelinek 	if (excess > 0) {
5430Sstevel@tonic-gate 		lcol->lcol_stat.lcols_rss_act_sum += rss;
5443247Sgjelinek 		col_argp->sca_any_over_cap = B_TRUE;
5453247Sgjelinek 		if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
5463247Sgjelinek 			col_argp->sca_project_over_cap = B_TRUE;
5473247Sgjelinek 	}
5480Sstevel@tonic-gate 	lcol->lcol_stat.lcols_rss_sum += rss;
5490Sstevel@tonic-gate 
5500Sstevel@tonic-gate 	if (lcol->lcol_stat.lcols_min_rss > rss)
5510Sstevel@tonic-gate 		lcol->lcol_stat.lcols_min_rss = rss;
5520Sstevel@tonic-gate 	if (lcol->lcol_stat.lcols_max_rss < rss)
5530Sstevel@tonic-gate 		lcol->lcol_stat.lcols_max_rss = rss;
5540Sstevel@tonic-gate 
5550Sstevel@tonic-gate 	return (0);
5560Sstevel@tonic-gate }
5570Sstevel@tonic-gate 
5580Sstevel@tonic-gate /*
5593247Sgjelinek  * Determine if we have capped projects, capped zones or both.
5603247Sgjelinek  */
5613247Sgjelinek static int
col_type_cb(lcollection_t * lcol,void * arg)5623247Sgjelinek col_type_cb(lcollection_t *lcol, void *arg)
5633247Sgjelinek {
5643247Sgjelinek 	uint_t *col_type = (uint_t *)arg;
5653247Sgjelinek 
5663247Sgjelinek 	/* skip uncapped collections */
5673247Sgjelinek 	if (lcol->lcol_rss_cap == 0)
5683247Sgjelinek 		return (1);
5693247Sgjelinek 
5703247Sgjelinek 	if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
5713247Sgjelinek 		*col_type |= CAPPED_PROJECT;
5723247Sgjelinek 	else
5733247Sgjelinek 		*col_type |= CAPPED_ZONE;
5743247Sgjelinek 
5753247Sgjelinek 	/* once we know everything is capped, we can stop looking */
5763247Sgjelinek 	if ((*col_type & CAPPED_ZONE) && (*col_type & CAPPED_PROJECT))
5773247Sgjelinek 		return (1);
5783247Sgjelinek 
5793247Sgjelinek 	return (0);
5803247Sgjelinek }
5813247Sgjelinek 
5823247Sgjelinek /*
5830Sstevel@tonic-gate  * Open /proc and walk entries.
5840Sstevel@tonic-gate  */
5850Sstevel@tonic-gate static void
proc_walk_all(void (* cb)(const pid_t))5860Sstevel@tonic-gate proc_walk_all(void (*cb)(const pid_t))
5870Sstevel@tonic-gate {
5880Sstevel@tonic-gate 	DIR *pdir;
5890Sstevel@tonic-gate 	struct dirent *dirent;
5900Sstevel@tonic-gate 	pid_t pid;
5910Sstevel@tonic-gate 
5920Sstevel@tonic-gate 	(void) rfd_reserve(1);
5930Sstevel@tonic-gate 	if ((pdir = opendir("/proc")) == NULL)
5940Sstevel@tonic-gate 		die(gettext("couldn't open /proc!"));
5950Sstevel@tonic-gate 
5960Sstevel@tonic-gate 	while ((dirent = readdir(pdir)) != NULL) {
5970Sstevel@tonic-gate 		if (strcmp(".", dirent->d_name) == 0 ||
5980Sstevel@tonic-gate 		    strcmp("..", dirent->d_name) == 0)
5990Sstevel@tonic-gate 			continue;
6000Sstevel@tonic-gate 		pid = atoi(dirent->d_name);
6010Sstevel@tonic-gate 		ASSERT(pid != 0 || strcmp(dirent->d_name, "0") == 0);
6020Sstevel@tonic-gate 		if (pid == rcapd_pid)
6030Sstevel@tonic-gate 			continue;
6040Sstevel@tonic-gate 		else
6050Sstevel@tonic-gate 			cb(pid);
6060Sstevel@tonic-gate 	}
6070Sstevel@tonic-gate 	(void) closedir(pdir);
6080Sstevel@tonic-gate }
6090Sstevel@tonic-gate 
6100Sstevel@tonic-gate /*
6110Sstevel@tonic-gate  * Clear unmarked callback.
6120Sstevel@tonic-gate  */
6130Sstevel@tonic-gate /*ARGSUSED*/
6140Sstevel@tonic-gate static int
sweep_process_cb(lcollection_t * lcol,lprocess_t * lpc)6150Sstevel@tonic-gate sweep_process_cb(lcollection_t *lcol, lprocess_t *lpc)
6160Sstevel@tonic-gate {
6170Sstevel@tonic-gate 	if (lpc->lpc_mark) {
6180Sstevel@tonic-gate 		lpc->lpc_mark = 0;
6190Sstevel@tonic-gate 	} else {
6200Sstevel@tonic-gate 		debug("process %d finished\n", (int)lpc->lpc_pid);
6210Sstevel@tonic-gate 		lprocess_free(lpc);
6220Sstevel@tonic-gate 	}
6230Sstevel@tonic-gate 
6240Sstevel@tonic-gate 	return (0);
6250Sstevel@tonic-gate }
6260Sstevel@tonic-gate 
6270Sstevel@tonic-gate /*
6280Sstevel@tonic-gate  * Print, for debugging purposes, a collection's recently-sampled RSS and
6290Sstevel@tonic-gate  * excess.
6300Sstevel@tonic-gate  */
6310Sstevel@tonic-gate /*ARGSUSED*/
6320Sstevel@tonic-gate static int
excess_print_cb(lcollection_t * lcol,void * arg)6330Sstevel@tonic-gate excess_print_cb(lcollection_t *lcol, void *arg)
6340Sstevel@tonic-gate {
6350Sstevel@tonic-gate 	int64_t excess = lcol->lcol_rss - lcol->lcol_rss_cap;
6360Sstevel@tonic-gate 
6370Sstevel@tonic-gate 	debug("%s %s rss/cap: %llu/%llu, excess = %lld kB\n",
6383247Sgjelinek 	    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
6393247Sgjelinek 	    lcol->lcol_name,
6400Sstevel@tonic-gate 	    (unsigned long long)lcol->lcol_rss,
6410Sstevel@tonic-gate 	    (unsigned long long)lcol->lcol_rss_cap,
6420Sstevel@tonic-gate 	    (long long)excess);
6430Sstevel@tonic-gate 
6440Sstevel@tonic-gate 	return (0);
6450Sstevel@tonic-gate }
6460Sstevel@tonic-gate 
6470Sstevel@tonic-gate /*
6480Sstevel@tonic-gate  * Scan those collections which have exceeded their caps.
6493247Sgjelinek  *
6503247Sgjelinek  * If we're running in the global zone it might have a cap.  We don't want to
6513247Sgjelinek  * do any capping for the global zone yet since we might get under the cap by
6523247Sgjelinek  * just capping the projects in the global zone.
6530Sstevel@tonic-gate  */
6540Sstevel@tonic-gate /*ARGSUSED*/
6550Sstevel@tonic-gate static int
scan_cb(lcollection_t * lcol,void * arg)6560Sstevel@tonic-gate scan_cb(lcollection_t *lcol, void *arg)
6570Sstevel@tonic-gate {
6580Sstevel@tonic-gate 	int64_t excess;
6590Sstevel@tonic-gate 
6603247Sgjelinek 	/* skip over global zone collection for now but keep track for later */
6613247Sgjelinek 	if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
6623247Sgjelinek 	    lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
6633247Sgjelinek 		gz_col = lcol;
6643247Sgjelinek 		return (0);
6653247Sgjelinek 	}
6663247Sgjelinek 
6670Sstevel@tonic-gate 	if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
6680Sstevel@tonic-gate 		scan(lcol, excess);
6690Sstevel@tonic-gate 		lcol->lcol_stat.lcols_scan++;
6700Sstevel@tonic-gate 	}
6710Sstevel@tonic-gate 
6720Sstevel@tonic-gate 	return (0);
6730Sstevel@tonic-gate }
6740Sstevel@tonic-gate 
6750Sstevel@tonic-gate /*
6763247Sgjelinek  * Scan the global zone collection and see if it still exceeds its cap.
6773247Sgjelinek  * We take into account the effects of capping any global zone projects here.
6783247Sgjelinek  */
6793247Sgjelinek static void
scan_gz(lcollection_t * lcol,boolean_t project_over_cap)6803247Sgjelinek scan_gz(lcollection_t *lcol, boolean_t project_over_cap)
6813247Sgjelinek {
6823247Sgjelinek 	int64_t excess;
6833247Sgjelinek 
6843247Sgjelinek 	/*
6853247Sgjelinek 	 * If we had projects over their cap and the global zone was also over
6863247Sgjelinek 	 * its cap then we need to get the up-to-date global zone rss to
6873247Sgjelinek 	 * determine if we are still over the global zone cap.  We might have
6883247Sgjelinek 	 * gone under while we scanned the capped projects.  If there were no
6893247Sgjelinek 	 * projects over cap then we can use the rss value we already have for
6903247Sgjelinek 	 * the global zone.
6913247Sgjelinek 	 */
6923247Sgjelinek 	excess = lcol->lcol_rss - lcol->lcol_rss_cap;
6933247Sgjelinek 	if (project_over_cap && excess > 0) {
6943247Sgjelinek 		rss_sample(B_TRUE, CAPPED_ZONE);
6953247Sgjelinek 		update_col_rss(lcol);
6963247Sgjelinek 		excess = lcol->lcol_rss - lcol->lcol_rss_cap;
6973247Sgjelinek 	}
6983247Sgjelinek 
6993247Sgjelinek 	if (excess > 0) {
7003247Sgjelinek 		debug("global zone excess %lldKB\n", (long long)excess);
7013247Sgjelinek 		scan(lcol, excess);
7023247Sgjelinek 		lcol->lcol_stat.lcols_scan++;
7033247Sgjelinek 	}
7043247Sgjelinek }
7053247Sgjelinek 
7063247Sgjelinek /*
7070Sstevel@tonic-gate  * Do a soft scan of those collections which have excesses.  A soft scan is one
7080Sstevel@tonic-gate  * in which the cap enforcement pressure is taken into account.  The difference
7090Sstevel@tonic-gate  * between the utilized physical memory and the cap enforcement pressure will
7100Sstevel@tonic-gate  * be scanned-for, and each collection will be scanned proportionally by their
7110Sstevel@tonic-gate  * present excesses.
7120Sstevel@tonic-gate  */
7130Sstevel@tonic-gate static int
soft_scan_cb(lcollection_t * lcol,void * a)7140Sstevel@tonic-gate soft_scan_cb(lcollection_t *lcol, void *a)
7150Sstevel@tonic-gate {
7160Sstevel@tonic-gate 	int64_t excess;
7170Sstevel@tonic-gate 	soft_scan_arg_t *arg = a;
7180Sstevel@tonic-gate 
7193247Sgjelinek 	/* skip over global zone collection for now but keep track for later */
7203247Sgjelinek 	if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
7213247Sgjelinek 	    lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
7223247Sgjelinek 		gz_col = lcol;
7233247Sgjelinek 		return (0);
7243247Sgjelinek 	}
7253247Sgjelinek 
7260Sstevel@tonic-gate 	if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
7273247Sgjelinek 		int64_t adjusted_excess =
7283247Sgjelinek 		    excess * arg->ssa_scan_goal / arg->ssa_sum_excess;
7293247Sgjelinek 
7303247Sgjelinek 		debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
7313247Sgjelinek 		    "scanning %lld\n",
7323247Sgjelinek 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
7333247Sgjelinek 		    "project" : "zone"),
7343247Sgjelinek 		    (long)lcol->lcol_id.rcid_val,
7350Sstevel@tonic-gate 		    (long long)excess, (long long)arg->ssa_scan_goal,
7360Sstevel@tonic-gate 		    (unsigned long long)arg->ssa_sum_excess,
7373247Sgjelinek 		    (long long)adjusted_excess);
7380Sstevel@tonic-gate 
7393247Sgjelinek 		scan(lcol, adjusted_excess);
7400Sstevel@tonic-gate 		lcol->lcol_stat.lcols_scan++;
7410Sstevel@tonic-gate 	}
7420Sstevel@tonic-gate 
7430Sstevel@tonic-gate 	return (0);
7440Sstevel@tonic-gate }
7450Sstevel@tonic-gate 
7463247Sgjelinek static void
soft_scan_gz(lcollection_t * lcol,void * a)7473247Sgjelinek soft_scan_gz(lcollection_t *lcol, void *a)
7483247Sgjelinek {
7493247Sgjelinek 	int64_t excess;
7503247Sgjelinek 	soft_scan_arg_t *arg = a;
7513247Sgjelinek 
7523247Sgjelinek 	/*
7533247Sgjelinek 	 * If we had projects over their cap and the global zone was also over
7543247Sgjelinek 	 * its cap then we need to get the up-to-date global zone rss to
7553247Sgjelinek 	 * determine if we are still over the global zone cap.  We might have
7563247Sgjelinek 	 * gone under while we scanned the capped projects.  If there were no
7573247Sgjelinek 	 * projects over cap then we can use the rss value we already have for
7583247Sgjelinek 	 * the global zone.
7593247Sgjelinek 	 */
7603247Sgjelinek 	excess = lcol->lcol_rss - lcol->lcol_rss_cap;
7613247Sgjelinek 	if (arg->ssa_project_over_cap && excess > 0) {
7623247Sgjelinek 		rss_sample(B_TRUE, CAPPED_ZONE);
7633247Sgjelinek 		update_col_rss(lcol);
7643247Sgjelinek 		excess = lcol->lcol_rss - lcol->lcol_rss_cap;
7653247Sgjelinek 	}
7663247Sgjelinek 
7673247Sgjelinek 	if (excess > 0) {
7683247Sgjelinek 		int64_t adjusted_excess =
7693247Sgjelinek 		    excess * arg->ssa_scan_goal / arg->ssa_sum_excess;
7703247Sgjelinek 
7713247Sgjelinek 		debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
7723247Sgjelinek 		    "scanning %lld\n",
7733247Sgjelinek 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
7743247Sgjelinek 		    "project" : "zone"),
7753247Sgjelinek 		    (long)lcol->lcol_id.rcid_val,
7763247Sgjelinek 		    (long long)excess, (long long)arg->ssa_scan_goal,
7773247Sgjelinek 		    (unsigned long long)arg->ssa_sum_excess,
7783247Sgjelinek 		    (long long)adjusted_excess);
7793247Sgjelinek 
7803247Sgjelinek 		scan(lcol, adjusted_excess);
7813247Sgjelinek 		lcol->lcol_stat.lcols_scan++;
7823247Sgjelinek 	}
7833247Sgjelinek }
7843247Sgjelinek 
7850Sstevel@tonic-gate /*
7860Sstevel@tonic-gate  * When a scan could happen, but caps aren't enforced tick the
7870Sstevel@tonic-gate  * lcols_unenforced_cap counter.
7880Sstevel@tonic-gate  */
7890Sstevel@tonic-gate /*ARGSUSED*/
7900Sstevel@tonic-gate static int
unenforced_cap_cb(lcollection_t * lcol,void * arg)7910Sstevel@tonic-gate unenforced_cap_cb(lcollection_t *lcol, void *arg)
7920Sstevel@tonic-gate {
7930Sstevel@tonic-gate 	lcol->lcol_stat.lcols_unenforced_cap++;
7940Sstevel@tonic-gate 
7950Sstevel@tonic-gate 	return (0);
7960Sstevel@tonic-gate }
7970Sstevel@tonic-gate 
7980Sstevel@tonic-gate /*
7990Sstevel@tonic-gate  * Update the count of physically installed memory.
8000Sstevel@tonic-gate  */
8010Sstevel@tonic-gate static void
update_phys_total(void)8020Sstevel@tonic-gate update_phys_total(void)
8030Sstevel@tonic-gate {
8040Sstevel@tonic-gate 	uint64_t old_phys_total;
8050Sstevel@tonic-gate 
8060Sstevel@tonic-gate 	old_phys_total = phys_total;
8073247Sgjelinek 	phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * page_size_kb;
8080Sstevel@tonic-gate 	if (phys_total != old_phys_total)
8090Sstevel@tonic-gate 		debug("physical memory%s: %lluM\n", (old_phys_total == 0 ?
8100Sstevel@tonic-gate 		    "" : " adjusted"), (unsigned long long)(phys_total / 1024));
8110Sstevel@tonic-gate }
8120Sstevel@tonic-gate 
8130Sstevel@tonic-gate /*
8140Sstevel@tonic-gate  * Unlink a process from its collection, updating relevant statistics, and
8150Sstevel@tonic-gate  * freeing its associated memory.
8160Sstevel@tonic-gate  */
8170Sstevel@tonic-gate void
lprocess_free(lprocess_t * lpc)8180Sstevel@tonic-gate lprocess_free(lprocess_t *lpc)
8190Sstevel@tonic-gate {
8200Sstevel@tonic-gate 	pid_t pid;
8210Sstevel@tonic-gate 
8220Sstevel@tonic-gate 	lpc->lpc_collection->lcol_stat.lcols_proc_out++;
8230Sstevel@tonic-gate 
8240Sstevel@tonic-gate 	if (lpc->lpc_prev != NULL)
8250Sstevel@tonic-gate 		lpc->lpc_prev->lpc_next = lpc->lpc_next;
8260Sstevel@tonic-gate 	if (lpc->lpc_next != NULL)
8270Sstevel@tonic-gate 		lpc->lpc_next->lpc_prev = lpc->lpc_prev;
8280Sstevel@tonic-gate 	if (lpc->lpc_collection->lcol_lprocess == lpc)
8290Sstevel@tonic-gate 		lpc->lpc_collection->lcol_lprocess = (lpc->lpc_next !=
8300Sstevel@tonic-gate 		    lpc ? lpc->lpc_next : NULL);
8310Sstevel@tonic-gate 	lpc->lpc_next = lpc->lpc_prev = NULL;
8320Sstevel@tonic-gate 
8330Sstevel@tonic-gate 	if (lpc->lpc_prpageheader != NULL)
8340Sstevel@tonic-gate 		free(lpc->lpc_prpageheader);
8350Sstevel@tonic-gate 	if (lpc->lpc_xmap != NULL)
8360Sstevel@tonic-gate 		free(lpc->lpc_xmap);
8370Sstevel@tonic-gate 	if (lpc->lpc_psinfo_fd >= 0) {
8380Sstevel@tonic-gate 		if (rfd_close(lpc->lpc_psinfo_fd) != 0)
8390Sstevel@tonic-gate 			debug("could not close %d lpc_psinfo_fd %d",
8400Sstevel@tonic-gate 			    (int)lpc->lpc_pid, lpc->lpc_psinfo_fd);
8410Sstevel@tonic-gate 		lpc->lpc_psinfo_fd = -1;
8420Sstevel@tonic-gate 	}
8430Sstevel@tonic-gate 	if (lpc->lpc_pgdata_fd >= 0) {
8440Sstevel@tonic-gate 		if (rfd_close(lpc->lpc_pgdata_fd) != 0)
8450Sstevel@tonic-gate 			debug("could not close %d lpc_pgdata_fd %d",
8460Sstevel@tonic-gate 			    (int)lpc->lpc_pid, lpc->lpc_pgdata_fd);
8470Sstevel@tonic-gate 		lpc->lpc_pgdata_fd = -1;
8480Sstevel@tonic-gate 	}
8490Sstevel@tonic-gate 	if (lpc->lpc_xmap_fd >= 0) {
8500Sstevel@tonic-gate 		if (rfd_close(lpc->lpc_xmap_fd) != 0)
8510Sstevel@tonic-gate 			debug("could not close %d lpc_xmap_fd %d",
8520Sstevel@tonic-gate 			    (int)lpc->lpc_pid, lpc->lpc_xmap_fd);
8530Sstevel@tonic-gate 		lpc->lpc_xmap_fd = -1;
8540Sstevel@tonic-gate 	}
8550Sstevel@tonic-gate 	if (lpc->lpc_ignore != NULL)
8560Sstevel@tonic-gate 		lmapping_free(&lpc->lpc_ignore);
8570Sstevel@tonic-gate 	pid = lpc->lpc_pid;
8580Sstevel@tonic-gate 	free(lpc);
8590Sstevel@tonic-gate 	debug_high("process %d freed\n", (int)pid);
8600Sstevel@tonic-gate }
8610Sstevel@tonic-gate 
8620Sstevel@tonic-gate /*
8630Sstevel@tonic-gate  * Collection clear callback.
8640Sstevel@tonic-gate  */
8650Sstevel@tonic-gate /*ARGSUSED*/
8660Sstevel@tonic-gate static int
collection_clear_cb(lcollection_t * lcol,void * arg)8670Sstevel@tonic-gate collection_clear_cb(lcollection_t *lcol, void *arg)
8680Sstevel@tonic-gate {
8690Sstevel@tonic-gate 	lcol->lcol_mark = 0;
8700Sstevel@tonic-gate 
8710Sstevel@tonic-gate 	return (0);
8720Sstevel@tonic-gate }
8730Sstevel@tonic-gate 
8740Sstevel@tonic-gate /*
8750Sstevel@tonic-gate  * Respond to a terminating signal by setting a termination flag.
8760Sstevel@tonic-gate  */
8770Sstevel@tonic-gate /*ARGSUSED*/
8780Sstevel@tonic-gate static void
terminate_signal(int signal)8790Sstevel@tonic-gate terminate_signal(int signal)
8800Sstevel@tonic-gate {
8810Sstevel@tonic-gate 	if (termination_signal == 0)
8820Sstevel@tonic-gate 		termination_signal = signal;
8830Sstevel@tonic-gate 	should_run = 0;
8840Sstevel@tonic-gate }
8850Sstevel@tonic-gate 
/*
 * Handler for synchronous or asynchronous signals that would ordinarily make
 * the process abort.
 */
/*ARGSUSED*/
static void
abort_signal(int signal)
{
	/*
	 * Before aborting, give the scanner one last chance to resume any
	 * processes it has stopped.
	 */
	scan_abort();

	abort();
}
9010Sstevel@tonic-gate 
9020Sstevel@tonic-gate /*
9030Sstevel@tonic-gate  * Clean up collections which have been removed due to configuration.  Unlink
9040Sstevel@tonic-gate  * the collection from lcollection and free it.
9050Sstevel@tonic-gate  */
9060Sstevel@tonic-gate /*ARGSUSED*/
9070Sstevel@tonic-gate static int
collection_sweep_cb(lcollection_t * lcol,void * arg)9080Sstevel@tonic-gate collection_sweep_cb(lcollection_t *lcol, void *arg)
9090Sstevel@tonic-gate {
9100Sstevel@tonic-gate 	if (lcol->lcol_mark == 0) {
9113247Sgjelinek 		debug("freeing %s %s\n",
9123247Sgjelinek 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
9133247Sgjelinek 		    "project" : "zone"), lcol->lcol_name);
9140Sstevel@tonic-gate 		lcollection_free(lcol);
9150Sstevel@tonic-gate 	}
9160Sstevel@tonic-gate 
9170Sstevel@tonic-gate 	return (0);
9180Sstevel@tonic-gate }
9190Sstevel@tonic-gate 
9200Sstevel@tonic-gate /*
9210Sstevel@tonic-gate  * Set those variables which depend on the global configuration.
9220Sstevel@tonic-gate  */
9230Sstevel@tonic-gate static void
finish_configuration(void)9240Sstevel@tonic-gate finish_configuration(void)
9250Sstevel@tonic-gate {
9260Sstevel@tonic-gate 	/*
9270Sstevel@tonic-gate 	 * Warn that any lnode (or non-project) mode specification (by an SRM
9280Sstevel@tonic-gate 	 * 1.3 configuration file, for example) is ignored.
9290Sstevel@tonic-gate 	 */
9300Sstevel@tonic-gate 	if (strcmp(rcfg.rcfg_mode_name, "project") != 0) {
9310Sstevel@tonic-gate 		warn(gettext("%s mode specification ignored -- using project"
9320Sstevel@tonic-gate 		    " mode\n"), rcfg.rcfg_mode_name);
9330Sstevel@tonic-gate 		rcfg.rcfg_mode_name = "project";
9340Sstevel@tonic-gate 		rcfg.rcfg_mode = rctype_project;
9350Sstevel@tonic-gate 	}
9360Sstevel@tonic-gate }
9370Sstevel@tonic-gate 
9380Sstevel@tonic-gate /*
9394119Stn143363  * Cause the configuration to be reread and applied.
9400Sstevel@tonic-gate  */
9410Sstevel@tonic-gate static void
reread_configuration(void)9424119Stn143363 reread_configuration(void)
9430Sstevel@tonic-gate {
9440Sstevel@tonic-gate 	rcfg_t rcfg_new;
9450Sstevel@tonic-gate 
9464119Stn143363 	if (rcfg_read(&rcfg_new, update_statistics) != E_SUCCESS) {
9474119Stn143363 		warn(gettext("can't reread configuration \n"));
9484119Stn143363 		exit(SMF_EXIT_ERR_CONFIG);
9494119Stn143363 	} else {
9500Sstevel@tonic-gate 		/*
9514119Stn143363 		 * Done reading configuration.  Remove existing
9520Sstevel@tonic-gate 		 * collections in case there is a change in collection type.
9530Sstevel@tonic-gate 		 */
9540Sstevel@tonic-gate 		if (rcfg.rcfg_mode != rcfg_new.rcfg_mode) {
9550Sstevel@tonic-gate 			list_walk_collection(collection_clear_cb, NULL);
9560Sstevel@tonic-gate 			list_walk_collection(collection_sweep_cb, NULL);
9570Sstevel@tonic-gate 		}
9580Sstevel@tonic-gate 
9590Sstevel@tonic-gate 		/*
9600Sstevel@tonic-gate 		 * Make the newly-read configuration the global one, and update
9610Sstevel@tonic-gate 		 * any variables that depend on it.
9620Sstevel@tonic-gate 		 */
9630Sstevel@tonic-gate 		rcfg = rcfg_new;
9640Sstevel@tonic-gate 		finish_configuration();
9650Sstevel@tonic-gate 	}
9660Sstevel@tonic-gate }
9670Sstevel@tonic-gate 
9680Sstevel@tonic-gate /*
9694119Stn143363  * First, examine changes, additions, and deletions to cap definitions.
9704119Stn143363  * Then, set the next event time.
9710Sstevel@tonic-gate  */
9720Sstevel@tonic-gate static void
reconfigure(hrtime_t now,hrtime_t * next_configuration,hrtime_t * next_proc_walk,hrtime_t * next_rss_sample)9733247Sgjelinek reconfigure(hrtime_t now, hrtime_t *next_configuration,
9743247Sgjelinek     hrtime_t *next_proc_walk, hrtime_t *next_rss_sample)
9750Sstevel@tonic-gate {
9760Sstevel@tonic-gate 	debug("reconfigure...\n");
9770Sstevel@tonic-gate 
9780Sstevel@tonic-gate 	/*
9790Sstevel@tonic-gate 	 * Walk the lcollection, marking active collections so inactive ones
9800Sstevel@tonic-gate 	 * can be freed.
9810Sstevel@tonic-gate 	 */
9820Sstevel@tonic-gate 	list_walk_collection(collection_clear_cb, NULL);
9830Sstevel@tonic-gate 	lcollection_update(LCU_ACTIVE_ONLY); /* mark */
9840Sstevel@tonic-gate 	list_walk_collection(collection_sweep_cb, NULL);
9853247Sgjelinek 
9863247Sgjelinek 	*next_configuration = NEXT_EVENT_TIME(now,
9873247Sgjelinek 	    rcfg.rcfg_reconfiguration_interval);
9883247Sgjelinek 
9893247Sgjelinek 	/*
9903247Sgjelinek 	 * Reset each event time to the shorter of the previous and new
9913247Sgjelinek 	 * intervals.
9923247Sgjelinek 	 */
9933247Sgjelinek 	if (next_report == 0 && rcfg.rcfg_report_interval > 0)
9943247Sgjelinek 		next_report = now;
9953247Sgjelinek 	else
9963247Sgjelinek 		next_report = POSITIVE_MIN(next_report,
9973247Sgjelinek 		    NEXT_REPORT_EVENT_TIME(now, rcfg.rcfg_report_interval));
9983247Sgjelinek 
9993247Sgjelinek 	if (*next_proc_walk == 0 && rcfg.rcfg_proc_walk_interval > 0)
10003247Sgjelinek 		*next_proc_walk = now;
10013247Sgjelinek 	else
10023247Sgjelinek 		*next_proc_walk = POSITIVE_MIN(*next_proc_walk,
10033247Sgjelinek 		    NEXT_EVENT_TIME(now, rcfg.rcfg_proc_walk_interval));
10043247Sgjelinek 
10053247Sgjelinek 	if (*next_rss_sample == 0 && rcfg.rcfg_rss_sample_interval > 0)
10063247Sgjelinek 		*next_rss_sample = now;
10073247Sgjelinek 	else
10083247Sgjelinek 		*next_rss_sample = POSITIVE_MIN(*next_rss_sample,
10093247Sgjelinek 		    NEXT_EVENT_TIME(now, rcfg.rcfg_rss_sample_interval));
10100Sstevel@tonic-gate }
10110Sstevel@tonic-gate 
10120Sstevel@tonic-gate /*
10134119Stn143363  * Respond to SIGHUP by triggering the rereading the configuration and cap
10140Sstevel@tonic-gate  * definitions.
10150Sstevel@tonic-gate  */
10160Sstevel@tonic-gate /*ARGSUSED*/
10170Sstevel@tonic-gate static void
sighup(int signal)10180Sstevel@tonic-gate sighup(int signal)
10190Sstevel@tonic-gate {
10200Sstevel@tonic-gate 	should_reconfigure = 1;
10210Sstevel@tonic-gate }
10220Sstevel@tonic-gate 
10230Sstevel@tonic-gate /*
10240Sstevel@tonic-gate  * Print, for debugging purposes, each collection's interval statistics.
10250Sstevel@tonic-gate  */
10260Sstevel@tonic-gate /*ARGSUSED*/
10270Sstevel@tonic-gate static int
simple_report_collection_cb(lcollection_t * lcol,void * arg)10280Sstevel@tonic-gate simple_report_collection_cb(lcollection_t *lcol, void *arg)
10290Sstevel@tonic-gate {
10300Sstevel@tonic-gate #define	DELTA(field) \
10313247Sgjelinek 	(unsigned long long)( \
10320Sstevel@tonic-gate 	    (lcol->lcol_stat.field - lcol->lcol_stat_old.field))
10330Sstevel@tonic-gate 
10340Sstevel@tonic-gate 	debug("%s %s status: succeeded/attempted (k): %llu/%llu, "
10350Sstevel@tonic-gate 	    "ineffective/scans/unenforced/samplings:  %llu/%llu/%llu/%llu, RSS "
10360Sstevel@tonic-gate 	    "min/max (k): %llu/%llu, cap %llu kB, processes/thpt: %llu/%llu, "
10373247Sgjelinek 	    "%llu scans over %llu ms\n",
10383247Sgjelinek 	    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
10393247Sgjelinek 	    lcol->lcol_name,
10400Sstevel@tonic-gate 	    DELTA(lcols_pg_eff), DELTA(lcols_pg_att),
10410Sstevel@tonic-gate 	    DELTA(lcols_scan_ineffective), DELTA(lcols_scan),
10420Sstevel@tonic-gate 	    DELTA(lcols_unenforced_cap), DELTA(lcols_rss_sample),
10433247Sgjelinek 	    (unsigned long long)lcol->lcol_stat.lcols_min_rss,
10443247Sgjelinek 	    (unsigned long long)lcol->lcol_stat.lcols_max_rss,
10450Sstevel@tonic-gate 	    (unsigned long long)lcol->lcol_rss_cap,
10460Sstevel@tonic-gate 	    (unsigned long long)(lcol->lcol_stat.lcols_proc_in -
10470Sstevel@tonic-gate 	    lcol->lcol_stat.lcols_proc_out), DELTA(lcols_proc_out),
10480Sstevel@tonic-gate 	    DELTA(lcols_scan_count), DELTA(lcols_scan_time_complete) / (NANOSEC
10490Sstevel@tonic-gate 	    / MILLISEC));
10500Sstevel@tonic-gate 
10510Sstevel@tonic-gate #undef DELTA
10520Sstevel@tonic-gate 
10530Sstevel@tonic-gate 	return (0);
10540Sstevel@tonic-gate }
10550Sstevel@tonic-gate 
10560Sstevel@tonic-gate /*
10570Sstevel@tonic-gate  * Record each collection's interval statistics in the statistics file.
10580Sstevel@tonic-gate  */
10590Sstevel@tonic-gate static int
report_collection_cb(lcollection_t * lcol,void * arg)10600Sstevel@tonic-gate report_collection_cb(lcollection_t *lcol, void *arg)
10610Sstevel@tonic-gate {
10620Sstevel@tonic-gate 	lcollection_report_t dc;
10630Sstevel@tonic-gate 	int fd = (intptr_t)arg;
10640Sstevel@tonic-gate 
10650Sstevel@tonic-gate 	/*
10660Sstevel@tonic-gate 	 * Copy the relevant fields to the collection's record.
10670Sstevel@tonic-gate 	 */
10680Sstevel@tonic-gate 	bzero(&dc, sizeof (dc));
10690Sstevel@tonic-gate 	dc.lcol_id = lcol->lcol_id;
10700Sstevel@tonic-gate 	(void) strcpy(dc.lcol_name, lcol->lcol_name);
10710Sstevel@tonic-gate 	dc.lcol_rss = lcol->lcol_rss;
10720Sstevel@tonic-gate 	dc.lcol_image_size = lcol->lcol_image_size;
10730Sstevel@tonic-gate 	dc.lcol_rss_cap = lcol->lcol_rss_cap;
10740Sstevel@tonic-gate 	dc.lcol_stat = lcol->lcol_stat;
10750Sstevel@tonic-gate 
10760Sstevel@tonic-gate 	if (write(fd, &dc, sizeof (dc)) == sizeof (dc)) {
10773247Sgjelinek 		lcol->lcol_stat_old = lcol->lcol_stat;
10780Sstevel@tonic-gate 	} else {
10793247Sgjelinek 		debug("can't write %s %s statistics",
10803247Sgjelinek 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
10813247Sgjelinek 		    "project" : "zone"),
10820Sstevel@tonic-gate 		    lcol->lcol_name);
10830Sstevel@tonic-gate 	}
10840Sstevel@tonic-gate 
10850Sstevel@tonic-gate 	return (0);
10860Sstevel@tonic-gate }
10870Sstevel@tonic-gate 
10880Sstevel@tonic-gate /*
10890Sstevel@tonic-gate  * Determine the count of pages scanned by the global page scanner, obtained
10900Sstevel@tonic-gate  * from the cpu_stat:*::scan kstats.  Return zero on success.
10910Sstevel@tonic-gate  */
10920Sstevel@tonic-gate static int
get_globally_scanned_pages(uint64_t * scannedp)10930Sstevel@tonic-gate get_globally_scanned_pages(uint64_t *scannedp)
10940Sstevel@tonic-gate {
10950Sstevel@tonic-gate 	kstat_t *ksp;
10960Sstevel@tonic-gate 	uint64_t scanned = 0;
10970Sstevel@tonic-gate 
10980Sstevel@tonic-gate 	if (kstat_chain_update(kctl) == -1) {
10990Sstevel@tonic-gate 		warn(gettext("can't update kstat chain"));
11000Sstevel@tonic-gate 		return (0);
11010Sstevel@tonic-gate 	}
11020Sstevel@tonic-gate 
11030Sstevel@tonic-gate 	for (ksp = kctl->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
11040Sstevel@tonic-gate 		if (strcmp(ksp->ks_module, "cpu_stat") == 0) {
11050Sstevel@tonic-gate 			if (kstat_read(kctl, ksp, NULL) != -1) {
11060Sstevel@tonic-gate 				scanned += ((cpu_stat_t *)
11070Sstevel@tonic-gate 				    ksp->ks_data)->cpu_vminfo.scan;
11083247Sgjelinek 			} else {
11090Sstevel@tonic-gate 				return (-1);
11103247Sgjelinek 			}
11110Sstevel@tonic-gate 		}
11120Sstevel@tonic-gate 	}
11130Sstevel@tonic-gate 
11140Sstevel@tonic-gate 	*scannedp = scanned;
11150Sstevel@tonic-gate 	return (0);
11160Sstevel@tonic-gate }
11170Sstevel@tonic-gate 
11180Sstevel@tonic-gate /*
11193247Sgjelinek  * Determine if the global page scanner is running, during which no memory
11203247Sgjelinek  * caps should be enforced, to prevent interference with the global page
11213247Sgjelinek  * scanner.
11223247Sgjelinek  */
11233247Sgjelinek static boolean_t
is_global_scanner_running()11243247Sgjelinek is_global_scanner_running()
11253247Sgjelinek {
11263247Sgjelinek 	/* measure delta in page scan count */
11273247Sgjelinek 	static uint64_t new_sp = 0;
11283247Sgjelinek 	static uint64_t old_sp = 0;
11293247Sgjelinek 	boolean_t res = B_FALSE;
11303247Sgjelinek 
11313247Sgjelinek 	if (get_globally_scanned_pages(&new_sp) == 0) {
11323247Sgjelinek 		if (old_sp != 0 && (new_sp - old_sp) > 0) {
11333247Sgjelinek 			debug("global memory pressure detected (%llu "
11343247Sgjelinek 			    "pages scanned since last interval)\n",
11353247Sgjelinek 			    (unsigned long long)(new_sp - old_sp));
11363247Sgjelinek 			res = B_TRUE;
11373247Sgjelinek 		}
11383247Sgjelinek 		old_sp = new_sp;
11393247Sgjelinek 	} else {
11403247Sgjelinek 		warn(gettext("unable to read cpu statistics"));
11413247Sgjelinek 		new_sp = old_sp;
11423247Sgjelinek 	}
11433247Sgjelinek 
11443247Sgjelinek 	return (res);
11453247Sgjelinek }
11463247Sgjelinek 
11473247Sgjelinek /*
11483247Sgjelinek  * If soft caps are in use, determine if global memory pressure exceeds the
11493247Sgjelinek  * configured maximum above which soft caps are enforced.
11503247Sgjelinek  */
11513247Sgjelinek static boolean_t
must_enforce_soft_caps()11523247Sgjelinek must_enforce_soft_caps()
11533247Sgjelinek {
11543247Sgjelinek 	/*
11553247Sgjelinek 	 * Check for changes to the amount of installed physical memory, to
11563247Sgjelinek 	 * compute the current memory pressure.
11573247Sgjelinek 	 */
11583247Sgjelinek 	update_phys_total();
11593247Sgjelinek 
11603247Sgjelinek 	memory_pressure = 100 - (int)((sysconf(_SC_AVPHYS_PAGES) * page_size_kb)
11613247Sgjelinek 	    * 100.0 / phys_total);
11623247Sgjelinek 	memory_pressure_sample++;
11633247Sgjelinek 	if (rcfg.rcfg_memory_cap_enforcement_pressure > 0 &&
11643247Sgjelinek 	    memory_pressure > rcfg.rcfg_memory_cap_enforcement_pressure) {
11653247Sgjelinek 		return (B_TRUE);
11663247Sgjelinek 	}
11673247Sgjelinek 
11683247Sgjelinek 	return (B_FALSE);
11693247Sgjelinek }
11703247Sgjelinek 
11713247Sgjelinek /*
11720Sstevel@tonic-gate  * Update the shared statistics file with each collection's current statistics.
11730Sstevel@tonic-gate  * Return zero on success.
11740Sstevel@tonic-gate  */
11750Sstevel@tonic-gate static int
update_statistics(void)11760Sstevel@tonic-gate update_statistics(void)
11770Sstevel@tonic-gate {
11780Sstevel@tonic-gate 	int fd, res;
11790Sstevel@tonic-gate 	static char template[LINELEN];
11800Sstevel@tonic-gate 
11810Sstevel@tonic-gate 	/*
1182442Sgm149974 	 * Try to create a directory irrespective of whether it is existing
1183442Sgm149974 	 * or not. If it is not there then it will create. Otherwise any way
1184442Sgm149974 	 * it will fail at mkstemp call below.
1185442Sgm149974 	 */
1186442Sgm149974 	(void) mkdir(STAT_FILE_DIR, 0755);
1187442Sgm149974 
1188442Sgm149974 	/*
11890Sstevel@tonic-gate 	 * Create a temporary file.
11900Sstevel@tonic-gate 	 */
11910Sstevel@tonic-gate 	if (sizeof (template) < (strlen(rcfg.rcfg_stat_file) +
11920Sstevel@tonic-gate 	    strlen(STAT_TEMPLATE_SUFFIX) + 1)) {
11930Sstevel@tonic-gate 		debug("temporary file template size too small\n");
11940Sstevel@tonic-gate 		return (-1);
11950Sstevel@tonic-gate 	}
11960Sstevel@tonic-gate 	(void) strcpy(template, rcfg.rcfg_stat_file);
11970Sstevel@tonic-gate 	(void) strcat(template, STAT_TEMPLATE_SUFFIX);
11980Sstevel@tonic-gate 	(void) rfd_reserve(1);
11990Sstevel@tonic-gate 	fd = mkstemp(template);
12000Sstevel@tonic-gate 
12010Sstevel@tonic-gate 	/*
12020Sstevel@tonic-gate 	 * Write the header and per-collection statistics.
12030Sstevel@tonic-gate 	 */
12040Sstevel@tonic-gate 	if (fd >= 0) {
12050Sstevel@tonic-gate 		rcapd_stat_hdr_t rs;
12060Sstevel@tonic-gate 
12070Sstevel@tonic-gate 		rs.rs_pid = rcapd_pid;
12080Sstevel@tonic-gate 		rs.rs_time = gethrtime();
12090Sstevel@tonic-gate 		ASSERT(sizeof (rs.rs_mode) > strlen(rcfg.rcfg_mode_name));
12100Sstevel@tonic-gate 		(void) strcpy(rs.rs_mode, rcfg.rcfg_mode_name);
12110Sstevel@tonic-gate 		rs.rs_pressure_cur = memory_pressure;
12120Sstevel@tonic-gate 		rs.rs_pressure_cap = rcfg.rcfg_memory_cap_enforcement_pressure;
12130Sstevel@tonic-gate 		rs.rs_pressure_sample = memory_pressure_sample;
12140Sstevel@tonic-gate 
12150Sstevel@tonic-gate 		if (fchmod(fd, 0644) == 0 && write(fd, &rs, sizeof (rs)) ==
12160Sstevel@tonic-gate 		    sizeof (rs)) {
12170Sstevel@tonic-gate 			list_walk_collection(report_collection_cb,
1218*13093SRoger.Faulkner@Oracle.COM 			    (void *)(intptr_t)fd);
12190Sstevel@tonic-gate 			/*
12200Sstevel@tonic-gate 			 * Replace the existing statistics file with this new
12210Sstevel@tonic-gate 			 * one.
12220Sstevel@tonic-gate 			 */
12230Sstevel@tonic-gate 			res = rename(template, rcfg.rcfg_stat_file);
12240Sstevel@tonic-gate 		} else
12250Sstevel@tonic-gate 			res = -1;
12260Sstevel@tonic-gate 		(void) close(fd);
12270Sstevel@tonic-gate 	} else
12280Sstevel@tonic-gate 		res = -1;
12290Sstevel@tonic-gate 
12300Sstevel@tonic-gate 	return (res);
12310Sstevel@tonic-gate }
12320Sstevel@tonic-gate 
12330Sstevel@tonic-gate /*
12340Sstevel@tonic-gate  * Verify the statistics file can be created and written to, and die if an
12350Sstevel@tonic-gate  * existing file may be in use by another rcapd.
12360Sstevel@tonic-gate  */
12370Sstevel@tonic-gate static int
verify_statistics(void)12380Sstevel@tonic-gate verify_statistics(void)
12390Sstevel@tonic-gate {
12400Sstevel@tonic-gate 	pid_t pid;
12410Sstevel@tonic-gate 
12420Sstevel@tonic-gate 	/*
12430Sstevel@tonic-gate 	 * Warn if another instance of rcapd might be active.
12440Sstevel@tonic-gate 	 */
12450Sstevel@tonic-gate 	(void) rfd_reserve(1);
12460Sstevel@tonic-gate 	pid = stat_get_rcapd_pid(rcfg.rcfg_stat_file);
12470Sstevel@tonic-gate 	if (pid != rcapd_pid && pid != -1)
12480Sstevel@tonic-gate 		die(gettext("%s exists; rcapd may already be active\n"),
12490Sstevel@tonic-gate 		    rcfg.rcfg_stat_file);
12500Sstevel@tonic-gate 
12510Sstevel@tonic-gate 	return (update_statistics());
12520Sstevel@tonic-gate }
12530Sstevel@tonic-gate 
12540Sstevel@tonic-gate static int
sum_excess_cb(lcollection_t * lcol,void * arg)12550Sstevel@tonic-gate sum_excess_cb(lcollection_t *lcol, void *arg)
12560Sstevel@tonic-gate {
12570Sstevel@tonic-gate 	uint64_t *sum_excess = arg;
12580Sstevel@tonic-gate 
12590Sstevel@tonic-gate 	*sum_excess += MAX((int64_t)0, (int64_t)(lcol->lcol_rss -
12600Sstevel@tonic-gate 	    lcol->lcol_rss_cap));
12610Sstevel@tonic-gate 	return (0);
12620Sstevel@tonic-gate }
12630Sstevel@tonic-gate 
12643247Sgjelinek /*
12653247Sgjelinek  * Compute the quantity of memory (in kilobytes) above the cap enforcement
12663247Sgjelinek  * pressure.  Set the scan goal to that quantity (or at most the excess).
12673247Sgjelinek  */
12683247Sgjelinek static void
compute_soft_scan_goal(soft_scan_arg_t * argp)12693247Sgjelinek compute_soft_scan_goal(soft_scan_arg_t *argp)
12703247Sgjelinek {
12713247Sgjelinek 	/*
12723247Sgjelinek 	 * Compute the sum of the collections' excesses, which will be the
12733247Sgjelinek 	 * denominator.
12743247Sgjelinek 	 */
12753247Sgjelinek 	argp->ssa_sum_excess = 0;
12763247Sgjelinek 	list_walk_collection(sum_excess_cb, &(argp->ssa_sum_excess));
12773247Sgjelinek 
12783247Sgjelinek 	argp->ssa_scan_goal = MIN((sysconf(_SC_PHYS_PAGES) *
12793247Sgjelinek 	    (100 - rcfg.rcfg_memory_cap_enforcement_pressure) / 100 -
12803247Sgjelinek 	    sysconf(_SC_AVPHYS_PAGES)) * page_size_kb,
12813247Sgjelinek 	    argp->ssa_sum_excess);
12823247Sgjelinek }
12833247Sgjelinek 
/*
 * Print the command-line usage summary.
 */
static void
rcapd_usage(void)
{
	info(gettext("usage: rcapd [-d]\n"));
}
12890Sstevel@tonic-gate 
12900Sstevel@tonic-gate void
check_update_statistics(void)12910Sstevel@tonic-gate check_update_statistics(void)
12920Sstevel@tonic-gate {
12930Sstevel@tonic-gate 	hrtime_t now = gethrtime();
12940Sstevel@tonic-gate 
12950Sstevel@tonic-gate 	if (EVENT_TIME(now, next_report)) {
12960Sstevel@tonic-gate 		debug("updating statistics...\n");
12970Sstevel@tonic-gate 		list_walk_collection(simple_report_collection_cb, NULL);
12980Sstevel@tonic-gate 		if (update_statistics() != 0)
12990Sstevel@tonic-gate 			debug("couldn't update statistics");
13000Sstevel@tonic-gate 		next_report = NEXT_REPORT_EVENT_TIME(now,
13010Sstevel@tonic-gate 		    rcfg.rcfg_report_interval);
13020Sstevel@tonic-gate 	}
13030Sstevel@tonic-gate }
13040Sstevel@tonic-gate 
13050Sstevel@tonic-gate static void
verify_and_set_privileges(void)13060Sstevel@tonic-gate verify_and_set_privileges(void)
13070Sstevel@tonic-gate {
13080Sstevel@tonic-gate 	priv_set_t *required =
13090Sstevel@tonic-gate 	    priv_str_to_set("zone,sys_resource,proc_owner", ",", NULL);
13100Sstevel@tonic-gate 
13110Sstevel@tonic-gate 	/*
13120Sstevel@tonic-gate 	 * Ensure the required privileges, suitable for controlling processes,
13130Sstevel@tonic-gate 	 * are possessed.
13140Sstevel@tonic-gate 	 */
13150Sstevel@tonic-gate 	if (setppriv(PRIV_SET, PRIV_PERMITTED, required) != 0 || setppriv(
13160Sstevel@tonic-gate 	    PRIV_SET, PRIV_EFFECTIVE, required) != 0)
13170Sstevel@tonic-gate 		die(gettext("can't set requisite privileges"));
13180Sstevel@tonic-gate 
13190Sstevel@tonic-gate 	/*
13200Sstevel@tonic-gate 	 * Ensure access to /var/run/daemon.
13210Sstevel@tonic-gate 	 */
13220Sstevel@tonic-gate 	if (setreuid(DAEMON_UID, DAEMON_UID) != 0)
13230Sstevel@tonic-gate 		die(gettext("cannot become user daemon"));
13240Sstevel@tonic-gate 
13250Sstevel@tonic-gate 	priv_freeset(required);
13260Sstevel@tonic-gate }
13270Sstevel@tonic-gate 
13283247Sgjelinek /*
13293247Sgjelinek  * This function does the top-level work to determine if we should do any
13303247Sgjelinek  * memory capping, and if so, it invokes the right call-backs to do the work.
13313247Sgjelinek  */
13323247Sgjelinek static void
do_capping(hrtime_t now,hrtime_t * next_proc_walk)13333247Sgjelinek do_capping(hrtime_t now, hrtime_t *next_proc_walk)
13343247Sgjelinek {
13353247Sgjelinek 	boolean_t enforce_caps;
13363247Sgjelinek 	/* soft cap enforcement flag, depending on memory pressure */
13373247Sgjelinek 	boolean_t enforce_soft_caps;
13383247Sgjelinek 	/* avoid interference with kernel's page scanner */
13393247Sgjelinek 	boolean_t global_scanner_running;
13403247Sgjelinek 	sample_col_arg_t col_arg;
13413247Sgjelinek 	soft_scan_arg_t arg;
13423247Sgjelinek 	uint_t col_types = 0;
13433247Sgjelinek 
13443247Sgjelinek 	/* check what kind of collections (project/zone) are capped */
13453247Sgjelinek 	list_walk_collection(col_type_cb, &col_types);
13463247Sgjelinek 	debug("collection types: 0x%x\n", col_types);
13473247Sgjelinek 
13483247Sgjelinek 	/* no capped collections, skip checking rss */
13493247Sgjelinek 	if (col_types == 0)
13503247Sgjelinek 		return;
13513247Sgjelinek 
13523247Sgjelinek 	/* Determine if soft caps are enforced. */
13533247Sgjelinek 	enforce_soft_caps = must_enforce_soft_caps();
13543247Sgjelinek 
13553247Sgjelinek 	/* Determine if the global page scanner is running. */
13563247Sgjelinek 	global_scanner_running = is_global_scanner_running();
13573247Sgjelinek 
13583247Sgjelinek 	/*
13593247Sgjelinek 	 * Sample collections' member processes RSSes and recompute
13603247Sgjelinek 	 * collections' excess.
13613247Sgjelinek 	 */
13623247Sgjelinek 	rss_sample(B_FALSE, col_types);
13633247Sgjelinek 
13643247Sgjelinek 	col_arg.sca_any_over_cap = B_FALSE;
13653247Sgjelinek 	col_arg.sca_project_over_cap = B_FALSE;
13663247Sgjelinek 	list_walk_collection(rss_sample_col_cb, &col_arg);
13673247Sgjelinek 	list_walk_collection(excess_print_cb, NULL);
13683247Sgjelinek 	debug("any collection/project over cap = %d, %d\n",
13693247Sgjelinek 	    col_arg.sca_any_over_cap, col_arg.sca_project_over_cap);
13703247Sgjelinek 
13713247Sgjelinek 	if (enforce_soft_caps)
13723247Sgjelinek 		debug("memory pressure %d%%\n", memory_pressure);
13733247Sgjelinek 
13743247Sgjelinek 	/*
13753247Sgjelinek 	 * Cap enforcement is determined by the previous conditions.
13763247Sgjelinek 	 */
13773247Sgjelinek 	enforce_caps = !global_scanner_running && col_arg.sca_any_over_cap &&
13783247Sgjelinek 	    (rcfg.rcfg_memory_cap_enforcement_pressure == 0 ||
13793247Sgjelinek 	    enforce_soft_caps);
13803247Sgjelinek 
13813247Sgjelinek 	debug("%senforcing caps\n", enforce_caps ? "" : "not ");
13823247Sgjelinek 
13833247Sgjelinek 	/*
13843247Sgjelinek 	 * If soft caps are in use, determine the size of the portion from each
13853247Sgjelinek 	 * collection to scan for.
13863247Sgjelinek 	 */
13873247Sgjelinek 	if (enforce_caps && enforce_soft_caps)
13883247Sgjelinek 		compute_soft_scan_goal(&arg);
13893247Sgjelinek 
13903247Sgjelinek 	/*
13913247Sgjelinek 	 * Victimize offending collections.
13923247Sgjelinek 	 */
13933247Sgjelinek 	if (enforce_caps && (!enforce_soft_caps ||
13943247Sgjelinek 	    (arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0))) {
13953247Sgjelinek 
13963247Sgjelinek 		/*
13973247Sgjelinek 		 * Since at least one collection is over its cap & needs
13983247Sgjelinek 		 * enforcing, check if it is at least time for a process walk
13993247Sgjelinek 		 * (we could be well past time since we only walk /proc when
14003247Sgjelinek 		 * we need to) and if so, update each collections process list
14013247Sgjelinek 		 * in a single pass through /proc.
14023247Sgjelinek 		 */
14033247Sgjelinek 		if (EVENT_TIME(now, *next_proc_walk)) {
14043247Sgjelinek 			debug("scanning process list...\n");
14053247Sgjelinek 			proc_walk_all(proc_cb);		 /* insert & mark */
14063247Sgjelinek 			list_walk_all(sweep_process_cb); /* free dead procs */
14073247Sgjelinek 			*next_proc_walk = NEXT_EVENT_TIME(now,
14083247Sgjelinek 			    rcfg.rcfg_proc_walk_interval);
14093247Sgjelinek 		}
14103247Sgjelinek 
14113247Sgjelinek 		gz_col = NULL;
14123247Sgjelinek 		if (enforce_soft_caps) {
14133247Sgjelinek 			debug("scan goal is %lldKB\n",
14143247Sgjelinek 			    (long long)arg.ssa_scan_goal);
14153247Sgjelinek 			list_walk_collection(soft_scan_cb, &arg);
14163247Sgjelinek 			if (gz_capped && gz_col != NULL) {
14173247Sgjelinek 				/* process global zone */
14183247Sgjelinek 				arg.ssa_project_over_cap =
14193247Sgjelinek 				    col_arg.sca_project_over_cap;
14203247Sgjelinek 				soft_scan_gz(gz_col, &arg);
14213247Sgjelinek 			}
14223247Sgjelinek 		} else {
14233247Sgjelinek 			list_walk_collection(scan_cb, NULL);
14243247Sgjelinek 			if (gz_capped && gz_col != NULL) {
14253247Sgjelinek 				/* process global zone */
14263247Sgjelinek 				scan_gz(gz_col, col_arg.sca_project_over_cap);
14273247Sgjelinek 			}
14283247Sgjelinek 		}
14293247Sgjelinek 	} else if (col_arg.sca_any_over_cap) {
14303247Sgjelinek 		list_walk_collection(unenforced_cap_cb, NULL);
14313247Sgjelinek 	}
14323247Sgjelinek }
14333247Sgjelinek 
14340Sstevel@tonic-gate int
main(int argc,char * argv[])14350Sstevel@tonic-gate main(int argc, char *argv[])
14360Sstevel@tonic-gate {
14370Sstevel@tonic-gate 	int res;
14380Sstevel@tonic-gate 	int should_fork = 1;	/* fork flag */
14390Sstevel@tonic-gate 	hrtime_t now;		/* current time */
14400Sstevel@tonic-gate 	hrtime_t next;		/* time of next event */
14410Sstevel@tonic-gate 	int sig;		/* signal iteration */
14420Sstevel@tonic-gate 	struct rlimit rl;
14430Sstevel@tonic-gate 	hrtime_t next_proc_walk;	/* time of next /proc scan */
14440Sstevel@tonic-gate 	hrtime_t next_configuration;	/* time of next configuration */
14450Sstevel@tonic-gate 	hrtime_t next_rss_sample;	/* (latest) time of next RSS sample */
14460Sstevel@tonic-gate 
14470Sstevel@tonic-gate 	(void) set_message_priority(RCM_INFO);
1448*13093SRoger.Faulkner@Oracle.COM 	(void) setpname("rcapd");
14490Sstevel@tonic-gate 	rcapd_pid = getpid();
14500Sstevel@tonic-gate 	(void) chdir("/");
14510Sstevel@tonic-gate 	should_run = 1;
14520Sstevel@tonic-gate 	ever_ran = 0;
14530Sstevel@tonic-gate 
14540Sstevel@tonic-gate 	(void) setlocale(LC_ALL, "");
14550Sstevel@tonic-gate 	(void) textdomain(TEXT_DOMAIN);
14560Sstevel@tonic-gate 
14570Sstevel@tonic-gate 	/*
14580Sstevel@tonic-gate 	 * Parse command-line options.
14590Sstevel@tonic-gate 	 */
14600Sstevel@tonic-gate 	while ((res = getopt(argc, argv, "dF")) > 0)
14610Sstevel@tonic-gate 		switch (res) {
14620Sstevel@tonic-gate 		case 'd':
14630Sstevel@tonic-gate 			should_fork = 0;
14640Sstevel@tonic-gate 			if (debug_mode == 0) {
14650Sstevel@tonic-gate 				debug_mode = 1;
14660Sstevel@tonic-gate 				(void) set_message_priority(RCM_DEBUG);
14670Sstevel@tonic-gate 			} else
14680Sstevel@tonic-gate 				(void) set_message_priority(RCM_DEBUG_HIGH);
14690Sstevel@tonic-gate 			break;
14700Sstevel@tonic-gate 		case 'F':
14710Sstevel@tonic-gate 			should_fork = 0;
14720Sstevel@tonic-gate 			break;
14730Sstevel@tonic-gate 		default:
14740Sstevel@tonic-gate 			rcapd_usage();
14750Sstevel@tonic-gate 			return (E_USAGE);
14760Sstevel@tonic-gate 			/*NOTREACHED*/
14770Sstevel@tonic-gate 		}
14780Sstevel@tonic-gate 
14790Sstevel@tonic-gate 	/*
14804119Stn143363 	 * Read the configuration.
14814119Stn143363 	 */
14824119Stn143363 	if (rcfg_read(&rcfg, verify_statistics) != E_SUCCESS) {
14834119Stn143363 		warn(gettext("resource caps not configured\n"));
14844119Stn143363 		return (SMF_EXIT_ERR_CONFIG);
14854119Stn143363 	}
14864119Stn143363 
14874119Stn143363 	/*
14880Sstevel@tonic-gate 	 * If not debugging, fork and continue operating, changing the
14890Sstevel@tonic-gate 	 * destination of messages to syslog().
14900Sstevel@tonic-gate 	 */
14910Sstevel@tonic-gate 	if (should_fork == 1) {
14920Sstevel@tonic-gate 		pid_t child;
14930Sstevel@tonic-gate 		debug("forking\n");
14940Sstevel@tonic-gate 		child = fork();
14950Sstevel@tonic-gate 		if (child == -1)
14960Sstevel@tonic-gate 			die(gettext("cannot fork"));
14970Sstevel@tonic-gate 		if (child > 0)
14980Sstevel@tonic-gate 			return (0);
14990Sstevel@tonic-gate 		else {
15000Sstevel@tonic-gate 			rcapd_pid = getpid();
15010Sstevel@tonic-gate 			(void) set_message_destination(RCD_SYSLOG);
15020Sstevel@tonic-gate 			(void) fclose(stdin);
15030Sstevel@tonic-gate 			(void) fclose(stdout);
15040Sstevel@tonic-gate 			(void) fclose(stderr);
15050Sstevel@tonic-gate 		}
15060Sstevel@tonic-gate 		/*
15070Sstevel@tonic-gate 		 * Start a new session and detatch from the controlling tty.
15080Sstevel@tonic-gate 		 */
15090Sstevel@tonic-gate 		if (setsid() == (pid_t)-1)
15100Sstevel@tonic-gate 			debug(gettext("setsid() failed; cannot detach from "
15110Sstevel@tonic-gate 			    "terminal"));
15120Sstevel@tonic-gate 	}
15130Sstevel@tonic-gate 
15140Sstevel@tonic-gate 	finish_configuration();
15150Sstevel@tonic-gate 	should_reconfigure = 0;
15160Sstevel@tonic-gate 
15170Sstevel@tonic-gate 	/*
15180Sstevel@tonic-gate 	 * Check that required privileges are possessed.
15190Sstevel@tonic-gate 	 */
15200Sstevel@tonic-gate 	verify_and_set_privileges();
15210Sstevel@tonic-gate 
15220Sstevel@tonic-gate 	now = next_report = next_proc_walk = next_rss_sample = gethrtime();
15230Sstevel@tonic-gate 	next_configuration = NEXT_EVENT_TIME(gethrtime(),
15240Sstevel@tonic-gate 	    rcfg.rcfg_reconfiguration_interval);
15250Sstevel@tonic-gate 
15260Sstevel@tonic-gate 	/*
15270Sstevel@tonic-gate 	 * Open the kstat chain.
15280Sstevel@tonic-gate 	 */
15290Sstevel@tonic-gate 	kctl = kstat_open();
15300Sstevel@tonic-gate 	if (kctl == NULL)
15310Sstevel@tonic-gate 		die(gettext("can't open kstats"));
15320Sstevel@tonic-gate 
15330Sstevel@tonic-gate 	/*
15340Sstevel@tonic-gate 	 * Set RLIMIT_NOFILE as high as practical, so roughly 10K processes can
15350Sstevel@tonic-gate 	 * be effectively managed without revoking descriptors (at 3 per
15360Sstevel@tonic-gate 	 * process).
15370Sstevel@tonic-gate 	 */
15380Sstevel@tonic-gate 	rl.rlim_cur = 32 * 1024;
15390Sstevel@tonic-gate 	rl.rlim_max = 32 * 1024;
15400Sstevel@tonic-gate 	if (setrlimit(RLIMIT_NOFILE, &rl) != 0 &&
15410Sstevel@tonic-gate 	    getrlimit(RLIMIT_NOFILE, &rl) == 0) {
15420Sstevel@tonic-gate 		rl.rlim_cur = rl.rlim_max;
15430Sstevel@tonic-gate 		(void) setrlimit(RLIMIT_NOFILE, &rl);
15440Sstevel@tonic-gate 	}
15451914Scasper 	(void) enable_extended_FILE_stdio(-1, -1);
15461914Scasper 
15470Sstevel@tonic-gate 	if (getrlimit(RLIMIT_NOFILE, &rl) == 0)
15480Sstevel@tonic-gate 		debug("fd limit: %lu\n", rl.rlim_cur);
15490Sstevel@tonic-gate 	else
15500Sstevel@tonic-gate 		debug("fd limit: unknown\n");
15510Sstevel@tonic-gate 
15523247Sgjelinek 	get_page_size();
15533247Sgjelinek 	my_zoneid = getzoneid();
15543247Sgjelinek 
15550Sstevel@tonic-gate 	/*
15560Sstevel@tonic-gate 	 * Handle those signals whose (default) exit disposition
15570Sstevel@tonic-gate 	 * prevents rcapd from finishing scanning before terminating.
15580Sstevel@tonic-gate 	 */
15590Sstevel@tonic-gate 	(void) sigset(SIGINT, terminate_signal);
15600Sstevel@tonic-gate 	(void) sigset(SIGQUIT, abort_signal);
15610Sstevel@tonic-gate 	(void) sigset(SIGILL, abort_signal);
15620Sstevel@tonic-gate 	(void) sigset(SIGEMT, abort_signal);
15630Sstevel@tonic-gate 	(void) sigset(SIGFPE, abort_signal);
15640Sstevel@tonic-gate 	(void) sigset(SIGBUS, abort_signal);
15650Sstevel@tonic-gate 	(void) sigset(SIGSEGV, abort_signal);
15660Sstevel@tonic-gate 	(void) sigset(SIGSYS, abort_signal);
15670Sstevel@tonic-gate 	(void) sigset(SIGPIPE, terminate_signal);
15680Sstevel@tonic-gate 	(void) sigset(SIGALRM, terminate_signal);
15690Sstevel@tonic-gate 	(void) sigset(SIGTERM, terminate_signal);
15700Sstevel@tonic-gate 	(void) sigset(SIGUSR1, terminate_signal);
15710Sstevel@tonic-gate 	(void) sigset(SIGUSR2, terminate_signal);
15720Sstevel@tonic-gate 	(void) sigset(SIGPOLL, terminate_signal);
15730Sstevel@tonic-gate 	(void) sigset(SIGVTALRM, terminate_signal);
15740Sstevel@tonic-gate 	(void) sigset(SIGXCPU, abort_signal);
15750Sstevel@tonic-gate 	(void) sigset(SIGXFSZ, abort_signal);
15760Sstevel@tonic-gate 	for (sig = SIGRTMIN; sig <= SIGRTMAX; sig++)
15770Sstevel@tonic-gate 		(void) sigset(sig, terminate_signal);
15780Sstevel@tonic-gate 
15790Sstevel@tonic-gate 	/*
15800Sstevel@tonic-gate 	 * Install a signal handler for reconfiguration processing.
15810Sstevel@tonic-gate 	 */
15820Sstevel@tonic-gate 	(void) sigset(SIGHUP, sighup);
15830Sstevel@tonic-gate 
15840Sstevel@tonic-gate 	/*
15850Sstevel@tonic-gate 	 * Determine which process collections to cap.
15860Sstevel@tonic-gate 	 */
15870Sstevel@tonic-gate 	lcollection_update(LCU_COMPLETE);
15880Sstevel@tonic-gate 
15890Sstevel@tonic-gate 	/*
15900Sstevel@tonic-gate 	 * Loop forever, monitoring collections' resident set sizes and
15913247Sgjelinek 	 * enforcing their caps.  Look for changes in caps as well as
15923247Sgjelinek 	 * responding to requests to reread the configuration.  Update
15933247Sgjelinek 	 * per-collection statistics periodically.
15940Sstevel@tonic-gate 	 */
15950Sstevel@tonic-gate 	while (should_run != 0) {
15960Sstevel@tonic-gate 		struct timespec ts;
15970Sstevel@tonic-gate 
15980Sstevel@tonic-gate 		/*
15990Sstevel@tonic-gate 		 * Announce that rcapd is starting.
16000Sstevel@tonic-gate 		 */
16010Sstevel@tonic-gate 		if (ever_ran == 0) {
16020Sstevel@tonic-gate 			info(gettext("starting\n"));
16030Sstevel@tonic-gate 			ever_ran = 1;
16040Sstevel@tonic-gate 		}
16050Sstevel@tonic-gate 
16060Sstevel@tonic-gate 		/*
16073247Sgjelinek 		 * Check the configuration at every next_configuration interval.
16083247Sgjelinek 		 * Update the rss data once every next_rss_sample interval.
16093247Sgjelinek 		 * The condition of global memory pressure is also checked at
16103247Sgjelinek 		 * the same frequency, if strict caps are in use.
16110Sstevel@tonic-gate 		 */
16120Sstevel@tonic-gate 		now = gethrtime();
16130Sstevel@tonic-gate 
16140Sstevel@tonic-gate 		/*
16154119Stn143363 		 * Detect configuration and cap changes only when SIGHUP
16164119Stn143363 		 * is received. Call reconfigure to apply new configuration
16174119Stn143363 		 * parameters.
16180Sstevel@tonic-gate 		 */
16194119Stn143363 		if (should_reconfigure == 1) {
16204119Stn143363 			reread_configuration();
16214119Stn143363 			should_reconfigure = 0;
16223247Sgjelinek 			reconfigure(now, &next_configuration, &next_proc_walk,
16233247Sgjelinek 			    &next_rss_sample);
16244119Stn143363 		}
16254119Stn143363 
16264119Stn143363 		if (EVENT_TIME(now, next_configuration)) {
16274119Stn143363 			reconfigure(now, &next_configuration, &next_proc_walk,
16284119Stn143363 			    &next_rss_sample);
16290Sstevel@tonic-gate 		}
16300Sstevel@tonic-gate 
16313247Sgjelinek 		/*
16323247Sgjelinek 		 * Do the main work for enforcing caps.
16333247Sgjelinek 		 */
16340Sstevel@tonic-gate 		if (EVENT_TIME(now, next_rss_sample)) {
16353247Sgjelinek 			do_capping(now, &next_proc_walk);
16360Sstevel@tonic-gate 
16370Sstevel@tonic-gate 			next_rss_sample = NEXT_EVENT_TIME(now,
16380Sstevel@tonic-gate 			    rcfg.rcfg_rss_sample_interval);
16390Sstevel@tonic-gate 		}
16400Sstevel@tonic-gate 
16410Sstevel@tonic-gate 		/*
16420Sstevel@tonic-gate 		 * Update the statistics file, if it's time.
16430Sstevel@tonic-gate 		 */
16440Sstevel@tonic-gate 		check_update_statistics();
16450Sstevel@tonic-gate 
16460Sstevel@tonic-gate 		/*
16470Sstevel@tonic-gate 		 * Sleep for some time before repeating.
16480Sstevel@tonic-gate 		 */
16490Sstevel@tonic-gate 		now = gethrtime();
16500Sstevel@tonic-gate 		next = next_configuration;
16510Sstevel@tonic-gate 		next = POSITIVE_MIN(next, next_report);
16520Sstevel@tonic-gate 		next = POSITIVE_MIN(next, next_rss_sample);
16530Sstevel@tonic-gate 		if (next > now && should_run != 0) {
16540Sstevel@tonic-gate 			debug("sleeping %-4.2f seconds\n", (float)(next -
16550Sstevel@tonic-gate 			    now) / (float)NANOSEC);
16560Sstevel@tonic-gate 			hrt2ts(next - now, &ts);
16570Sstevel@tonic-gate 			(void) nanosleep(&ts, NULL);
16580Sstevel@tonic-gate 		}
16590Sstevel@tonic-gate 	}
16600Sstevel@tonic-gate 	if (termination_signal != 0)
16610Sstevel@tonic-gate 		debug("exiting due to signal %d\n", termination_signal);
16620Sstevel@tonic-gate 	if (ever_ran != 0)
16630Sstevel@tonic-gate 		info(gettext("exiting\n"));
16640Sstevel@tonic-gate 
16650Sstevel@tonic-gate 	/*
16660Sstevel@tonic-gate 	 * Unlink the statistics file before exiting.
16670Sstevel@tonic-gate 	 */
16680Sstevel@tonic-gate 	if (rcfg.rcfg_stat_file[0] != 0)
16690Sstevel@tonic-gate 		(void) unlink(rcfg.rcfg_stat_file);
16700Sstevel@tonic-gate 
16710Sstevel@tonic-gate 	return (E_SUCCESS);
16720Sstevel@tonic-gate }
1673