xref: /onnv-gate/usr/src/cmd/rcap/rcapd/rcapd_main.c (revision 3247:e05001c14ea2)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
51914Scasper  * Common Development and Distribution License (the "License").
61914Scasper  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
221914Scasper  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
270Sstevel@tonic-gate 
280Sstevel@tonic-gate /*
290Sstevel@tonic-gate  * rcapd is a long-running daemon enforcing project-based resource caps (see
300Sstevel@tonic-gate  * rcapd(1M)).  Each instance of a process aggregate (project or, generically,
310Sstevel@tonic-gate  * "collection") may have a memory cap.  A single thread monitors the resource
320Sstevel@tonic-gate  * utilization of capped collections, enforces caps when they are exceeded (and
330Sstevel@tonic-gate  * other conditions are met), and incorporates changes in configuration or
340Sstevel@tonic-gate  * caps.  Each of these actions occurs not more frequently than the rate
350Sstevel@tonic-gate  * specified with rcapadm(1M).
360Sstevel@tonic-gate  */
370Sstevel@tonic-gate 
380Sstevel@tonic-gate #include <sys/priocntl.h>
390Sstevel@tonic-gate #include <sys/proc.h>
400Sstevel@tonic-gate #include <sys/resource.h>
410Sstevel@tonic-gate #include <sys/sysinfo.h>
420Sstevel@tonic-gate #include <sys/stat.h>
430Sstevel@tonic-gate #include <sys/sysmacros.h>
440Sstevel@tonic-gate #include <sys/time.h>
450Sstevel@tonic-gate #include <sys/types.h>
460Sstevel@tonic-gate #include <dirent.h>
470Sstevel@tonic-gate #include <errno.h>
480Sstevel@tonic-gate #include <fcntl.h>
490Sstevel@tonic-gate #include <kstat.h>
500Sstevel@tonic-gate #include <libintl.h>
510Sstevel@tonic-gate #include <limits.h>
520Sstevel@tonic-gate #include <locale.h>
530Sstevel@tonic-gate #include <priv.h>
540Sstevel@tonic-gate #include <signal.h>
550Sstevel@tonic-gate #include <stdarg.h>
560Sstevel@tonic-gate #include <stdio.h>
571914Scasper #include <stdio_ext.h>
580Sstevel@tonic-gate #include <stdlib.h>
590Sstevel@tonic-gate #include <strings.h>
600Sstevel@tonic-gate #include <time.h>
610Sstevel@tonic-gate #include <unistd.h>
620Sstevel@tonic-gate #include <zone.h>
630Sstevel@tonic-gate #include <assert.h>
64*3247Sgjelinek #include <sys/vm_usage.h>
650Sstevel@tonic-gate #include "rcapd.h"
660Sstevel@tonic-gate #include "rcapd_mapping.h"
670Sstevel@tonic-gate #include "rcapd_rfd.h"
680Sstevel@tonic-gate #include "rcapd_stat.h"
690Sstevel@tonic-gate #include "utils.h"
700Sstevel@tonic-gate 
/*
 * POSITIVE_MIN(x, y): the smaller of x and y, where a value <= 0 is treated
 * as "not set".  If x is not positive the result is y; otherwise, if y is not
 * positive the result is x; otherwise it is MIN(x, y).
 */
#define	POSITIVE_MIN(x, y) \
	(((x) <= 0) ? (y) : ((y) <= 0) ? (x) : MIN(x, y))

/*
 * NEXT_EVENT_TIME(base, seconds): the hrtime_t that lies the given number of
 * seconds after base, or 0 when seconds is not positive.  Both arguments are
 * parenthesized so that expression arguments (e.g. "a + b") expand correctly.
 */
#define	NEXT_EVENT_TIME(base, seconds) \
	(((int)(seconds) > 0) ? \
	    ((base) + (hrtime_t)(seconds) * (hrtime_t)NANOSEC) : (hrtime_t)0)

/*
 * NEXT_REPORT_EVENT_TIME(base, seconds): like NEXT_EVENT_TIME(), but yields 0
 * when no statistics file is configured (rcfg.rcfg_stat_file is empty).
 * NOTE(review): the base argument is not used; the time is always computed
 * relative to gethrtime().  Kept as-is to preserve existing behavior.
 */
#define	NEXT_REPORT_EVENT_TIME(base, seconds) \
	((rcfg.rcfg_stat_file[0] != 0) ?  \
	    NEXT_EVENT_TIME(gethrtime(), seconds) : (hrtime_t)0)

/*
 * EVENT_TIME(time, eventtime): true when eventtime is set (nonzero) and time
 * has passed it.
 */
#define	EVENT_TIME(time, eventtime) \
	(((time) > (eventtime)) && (eventtime) != 0)
#define	STAT_TEMPLATE_SUFFIX	".XXXXXX"	/* suffix of mkstemp() arg */
#define	DAEMON_UID		1		/* uid to use */

/* Bits describing which kinds of capped collections exist. */
#define	CAPPED_PROJECT	0x01
#define	CAPPED_ZONE	0x02
86*3247Sgjelinek 
/*
 * Argument block for the soft-scan callbacks.
 * NOTE(review): the users of this structure are outside this part of the
 * file; the field descriptions below are inferred from the names only.
 */
typedef struct soft_scan_arg {
	uint64_t ssa_sum_excess;	/* presumably total excess over caps */
	int64_t ssa_scan_goal;		/* presumably amount to scan for */
	boolean_t ssa_project_over_cap;	/* presumably: a project is over cap */
} soft_scan_arg_t;
920Sstevel@tonic-gate 
/*
 * Result block filled in by rss_sample_col_cb() while collections are
 * sampled.
 */
typedef struct sample_col_arg {
	boolean_t sca_any_over_cap;	/* some collection exceeds its cap */
	boolean_t sca_project_over_cap;	/* some project exceeds its cap */
} sample_col_arg_t;
97*3247Sgjelinek 
98*3247Sgjelinek 
static int debug_mode = 0;		/* debug mode flag */
static pid_t rcapd_pid;			/* rcapd's pid to ensure it's not */
					/* scanned */
static kstat_ctl_t *kctl;		/* kstat chain */
static int memory_pressure = 0;		/* physical memory utilization (%) */
static int memory_pressure_sample = 0;	/* count of samples */
static long page_size_kb = 0;		/* system page size in KB */
static size_t nvmu_vals = 0;		/* # of kernel RSS/swap vals in array */
static size_t vmu_vals_len = 0;		/* size of RSS/swap vals array */
static vmusage_t *vmu_vals = NULL;	/* snapshot of kernel RSS/swap values */
static hrtime_t next_report;		/* time of next report */
static int termination_signal = 0;	/* terminating signal */
/* zone id compared against each process's pr_zoneid; -1 until assigned */
static zoneid_t my_zoneid = (zoneid_t)-1;
static lcollection_t *gz_col;		/* global zone collection */

/* NOTE(review): presumably the daemon's active configuration; confirm. */
rcfg_t rcfg;
/*
 * Updated when we re-read the collection configurations if this rcapd instance
 * is running in the global zone and the global zone is capped.
 */
boolean_t gz_capped = B_FALSE;

/*
 * Flags.
 * NOTE(review): these are not referenced in this part of the file; their
 * exact meaning should be confirmed against the code that sets them.
 */
static int ever_ran;
int should_run;
static int should_reconfigure;

/* Forward declarations; the definitions are elsewhere in this file. */
static int verify_statistics(void);
static int update_statistics(void);
1300Sstevel@tonic-gate 
1310Sstevel@tonic-gate /*
132*3247Sgjelinek  * Checks if a process is marked 'system'.  Returns FALSE only when it is not.
1330Sstevel@tonic-gate  */
134*3247Sgjelinek static boolean_t
1350Sstevel@tonic-gate proc_issystem(pid_t pid)
1360Sstevel@tonic-gate {
1370Sstevel@tonic-gate 	char pc_clname[PC_CLNMSZ];
1380Sstevel@tonic-gate 
1390Sstevel@tonic-gate 	if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
1400Sstevel@tonic-gate 	    PC_KY_NULL) != -1) {
1410Sstevel@tonic-gate 		return (strcmp(pc_clname, "SYS") == 0);
1420Sstevel@tonic-gate 	} else {
1430Sstevel@tonic-gate 		debug("cannot get class-specific scheduling parameters; "
144*3247Sgjelinek 		    "assuming system process\n");
145*3247Sgjelinek 		return (B_TRUE);
1460Sstevel@tonic-gate 	}
1470Sstevel@tonic-gate }
1480Sstevel@tonic-gate 
1490Sstevel@tonic-gate static void
150*3247Sgjelinek lprocess_insert_mark(psinfo_t *psinfop)
1510Sstevel@tonic-gate {
152*3247Sgjelinek 	pid_t pid = psinfop->pr_pid;
153*3247Sgjelinek 	/* flag indicating whether the process should be scanned. */
154*3247Sgjelinek 	int unscannable = psinfop->pr_nlwp == 0;
155*3247Sgjelinek 	rcid_t colid;
1560Sstevel@tonic-gate 	lcollection_t *lcol;
1570Sstevel@tonic-gate 	lprocess_t *lproc;
1580Sstevel@tonic-gate 
159*3247Sgjelinek 	/*
160*3247Sgjelinek 	 * Determine which collection to put this process into.  We only have
161*3247Sgjelinek 	 * to worry about tracking both zone and project capped processes if
162*3247Sgjelinek 	 * this rcapd instance is running in the global zone, since we'll only
163*3247Sgjelinek 	 * see processes in our own projects in a non-global zone.  In the
164*3247Sgjelinek 	 * global zone, if the process belongs to a non-global zone, we only
165*3247Sgjelinek 	 * need to track it for the capped non-global zone collection.  For
166*3247Sgjelinek 	 * global zone processes, we first attempt to put the process into a
167*3247Sgjelinek 	 * capped project collection.  On the second pass into this function
168*3247Sgjelinek 	 * the projid will be cleared so we will just track the process for the
169*3247Sgjelinek 	 * global zone collection as a whole.
170*3247Sgjelinek 	 */
171*3247Sgjelinek 	if (psinfop->pr_zoneid == my_zoneid && psinfop->pr_projid != -1) {
172*3247Sgjelinek 		colid.rcid_type = RCIDT_PROJECT;
173*3247Sgjelinek 		colid.rcid_val = psinfop->pr_projid;
174*3247Sgjelinek 	} else {
175*3247Sgjelinek 		/* try to add to zone collection */
176*3247Sgjelinek 		colid.rcid_type = RCIDT_ZONE;
177*3247Sgjelinek 		colid.rcid_val = psinfop->pr_zoneid;
178*3247Sgjelinek 	}
179*3247Sgjelinek 
180*3247Sgjelinek 	if ((lcol = lcollection_find(&colid)) == NULL)
1810Sstevel@tonic-gate 		return;
1820Sstevel@tonic-gate 
1830Sstevel@tonic-gate 	/*
1840Sstevel@tonic-gate 	 * If the process is already being tracked, update the unscannable flag,
1850Sstevel@tonic-gate 	 * as determined by the caller, from the process's psinfo.
1860Sstevel@tonic-gate 	 */
1870Sstevel@tonic-gate 	lproc = lcol->lcol_lprocess;
1880Sstevel@tonic-gate 	while (lproc != NULL) {
1890Sstevel@tonic-gate 		if (lproc->lpc_pid == pid) {
1900Sstevel@tonic-gate 			lproc->lpc_mark = 1;
1910Sstevel@tonic-gate 			if (unscannable != 0 && lproc->lpc_unscannable == 0) {
1920Sstevel@tonic-gate 				debug("process %d: became unscannable\n",
1930Sstevel@tonic-gate 				    (int)lproc->lpc_pid);
1940Sstevel@tonic-gate 				lproc->lpc_unscannable = 1;
1950Sstevel@tonic-gate 			}
1960Sstevel@tonic-gate 			return;
1970Sstevel@tonic-gate 		}
1980Sstevel@tonic-gate 		lproc = lproc->lpc_next;
1990Sstevel@tonic-gate 	}
2000Sstevel@tonic-gate 
2010Sstevel@tonic-gate 	/*
2020Sstevel@tonic-gate 	 * We've fallen off the list without finding our current process;
2030Sstevel@tonic-gate 	 * insert it at the list head.
2040Sstevel@tonic-gate 	 */
2050Sstevel@tonic-gate 	if ((lproc = malloc(sizeof (*lproc))) == NULL)
2060Sstevel@tonic-gate 		debug("insufficient memory to track new process %d", (int)pid);
2070Sstevel@tonic-gate 	else {
2080Sstevel@tonic-gate 		(void) bzero(lproc, sizeof (*lproc));
2090Sstevel@tonic-gate 		lproc->lpc_pid = pid;
2100Sstevel@tonic-gate 		lproc->lpc_mark = 1;
2110Sstevel@tonic-gate 		lproc->lpc_collection = lcol;
2120Sstevel@tonic-gate 		lproc->lpc_psinfo_fd = -1;
2130Sstevel@tonic-gate 		lproc->lpc_pgdata_fd = -1;
2140Sstevel@tonic-gate 		lproc->lpc_xmap_fd = -1;
2150Sstevel@tonic-gate 
2160Sstevel@tonic-gate 		/*
2170Sstevel@tonic-gate 		 * If the caller didn't flag this process as unscannable
2180Sstevel@tonic-gate 		 * already, do some more checking.
2190Sstevel@tonic-gate 		 */
2200Sstevel@tonic-gate 		lproc->lpc_unscannable = unscannable || proc_issystem(pid);
2210Sstevel@tonic-gate 
2220Sstevel@tonic-gate #ifdef DEBUG
2230Sstevel@tonic-gate 		/*
2240Sstevel@tonic-gate 		 * Verify the sanity of lprocess.  It should not contain the
2250Sstevel@tonic-gate 		 * process we are about to prepend.
2260Sstevel@tonic-gate 		 */
2270Sstevel@tonic-gate 		if (lcollection_member(lcol, lproc)) {
2280Sstevel@tonic-gate 			lprocess_t *cur = lcol->lcol_lprocess;
2290Sstevel@tonic-gate 			debug("The collection %lld already has these members, "
230*3247Sgjelinek 			    "including me, %d!\n",
231*3247Sgjelinek 			    (long long)lcol->lcol_id.rcid_val,
2320Sstevel@tonic-gate 			    (int)lproc->lpc_pid);
2330Sstevel@tonic-gate 			while (cur != NULL) {
2340Sstevel@tonic-gate 				debug("\t%d\n", (int)cur->lpc_pid);
2350Sstevel@tonic-gate 				cur = cur->lpc_next;
2360Sstevel@tonic-gate 			}
2370Sstevel@tonic-gate 			info(gettext("process already on lprocess\n"));
2380Sstevel@tonic-gate 			abort();
2390Sstevel@tonic-gate 		}
2400Sstevel@tonic-gate #endif /* DEBUG */
2410Sstevel@tonic-gate 		lproc->lpc_next = lcol->lcol_lprocess;
2420Sstevel@tonic-gate 		if (lproc->lpc_next != NULL)
2430Sstevel@tonic-gate 			lproc->lpc_next->lpc_prev = lproc;
2440Sstevel@tonic-gate 		lproc->lpc_prev = NULL;
2450Sstevel@tonic-gate 		lcol->lcol_lprocess = lproc;
2460Sstevel@tonic-gate 
247*3247Sgjelinek 		debug("tracking %s %ld %d %s%s\n",
248*3247Sgjelinek 		    (colid.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
249*3247Sgjelinek 		    (long)colid.rcid_val,
250*3247Sgjelinek 		    (int)pid, psinfop->pr_psargs,
2510Sstevel@tonic-gate 		    (lproc->lpc_unscannable != 0) ? " (not scannable)" : "");
2520Sstevel@tonic-gate 		lcol->lcol_stat.lcols_proc_in++;
2530Sstevel@tonic-gate 	}
2540Sstevel@tonic-gate }
2550Sstevel@tonic-gate 
2560Sstevel@tonic-gate static int
2570Sstevel@tonic-gate list_walk_process_cb(lcollection_t *lcol, void *arg)
2580Sstevel@tonic-gate {
2590Sstevel@tonic-gate 	int (*cb)(lcollection_t *, lprocess_t *) =
2600Sstevel@tonic-gate 	    (int(*)(lcollection_t *, lprocess_t *))arg;
2610Sstevel@tonic-gate 	lprocess_t *member;
2620Sstevel@tonic-gate 	lprocess_t *next;
2630Sstevel@tonic-gate 
2640Sstevel@tonic-gate 	member = lcol->lcol_lprocess;
2650Sstevel@tonic-gate 	while (member != NULL) {
2660Sstevel@tonic-gate 		pid_t pid = member->lpc_pid;
2670Sstevel@tonic-gate 		next = member->lpc_next;
2680Sstevel@tonic-gate 
2690Sstevel@tonic-gate 		debug_high("list_walk_all lpc %d\n", (int)pid);
2700Sstevel@tonic-gate 		if (cb(lcol, member) != 0) {
2710Sstevel@tonic-gate 			debug_high("list_walk_all aborted at lpc %d\n",
2720Sstevel@tonic-gate 			    (int)pid);
2730Sstevel@tonic-gate 			return (1);
2740Sstevel@tonic-gate 		}
2750Sstevel@tonic-gate 		member = next;
2760Sstevel@tonic-gate 	}
2770Sstevel@tonic-gate 
2780Sstevel@tonic-gate 	return (0);
2790Sstevel@tonic-gate }
2800Sstevel@tonic-gate 
2810Sstevel@tonic-gate /*
2820Sstevel@tonic-gate  * Invoke the given callback for each process in each collection.  Callbacks
2830Sstevel@tonic-gate  * are allowed to change the linkage of the process on which they act.
2840Sstevel@tonic-gate  */
2850Sstevel@tonic-gate static void
2860Sstevel@tonic-gate list_walk_all(int (*cb)(lcollection_t *, lprocess_t *))
2870Sstevel@tonic-gate {
2880Sstevel@tonic-gate 	list_walk_collection(list_walk_process_cb, (void *)cb);
2890Sstevel@tonic-gate }
2900Sstevel@tonic-gate 
2910Sstevel@tonic-gate static void
2920Sstevel@tonic-gate revoke_psinfo(rfd_t *rfd)
2930Sstevel@tonic-gate {
2940Sstevel@tonic-gate 	lprocess_t *lpc = (lprocess_t *)rfd->rfd_data;
2950Sstevel@tonic-gate 
2960Sstevel@tonic-gate 	if (lpc != NULL) {
2970Sstevel@tonic-gate 		debug("revoking psinfo fd for process %d\n", (int)lpc->lpc_pid);
2980Sstevel@tonic-gate 		ASSERT(lpc->lpc_psinfo_fd != -1);
2990Sstevel@tonic-gate 		lpc->lpc_psinfo_fd = -1;
3000Sstevel@tonic-gate 	} else
3010Sstevel@tonic-gate 		debug("revoking psinfo fd for unknown process\n");
3020Sstevel@tonic-gate }
3030Sstevel@tonic-gate 
3040Sstevel@tonic-gate /*
3050Sstevel@tonic-gate  * Retrieve a process's psinfo via an already-opened or new file descriptor.
3060Sstevel@tonic-gate  * The supplied descriptor will be closed on failure.  An optional callback
3070Sstevel@tonic-gate  * will be invoked with the last descriptor tried, and a supplied callback
3080Sstevel@tonic-gate  * argument, as its arguments, such that the new descriptor may be cached, or
3090Sstevel@tonic-gate  * an old one may be invalidated.  If the result of the callback is zero, the
3100Sstevel@tonic-gate  * the caller is to assume responsibility for the file descriptor, to close it
3110Sstevel@tonic-gate  * with rfd_close().
3120Sstevel@tonic-gate  *
3130Sstevel@tonic-gate  * On failure, a nonzero value is returned.
3140Sstevel@tonic-gate  */
3150Sstevel@tonic-gate int
3160Sstevel@tonic-gate get_psinfo(pid_t pid, psinfo_t *psinfo, int cached_fd,
3170Sstevel@tonic-gate     int(*fd_update_cb)(void *, int), void *arg, lprocess_t *lpc)
3180Sstevel@tonic-gate {
3190Sstevel@tonic-gate 	int fd;
3200Sstevel@tonic-gate 	int can_try_uncached;
3210Sstevel@tonic-gate 
3220Sstevel@tonic-gate 	ASSERT(!(cached_fd > 0 && fd_update_cb == NULL));
3230Sstevel@tonic-gate 
3240Sstevel@tonic-gate 	do {
3250Sstevel@tonic-gate 		if (cached_fd >= 0) {
3260Sstevel@tonic-gate 			fd = cached_fd;
3270Sstevel@tonic-gate 			can_try_uncached = 1;
3280Sstevel@tonic-gate 			debug_high("%d/psinfo, trying cached fd %d\n",
3290Sstevel@tonic-gate 			    (int)pid, fd);
3300Sstevel@tonic-gate 		} else {
3310Sstevel@tonic-gate 			char pathbuf[PROC_PATH_MAX];
3320Sstevel@tonic-gate 
3330Sstevel@tonic-gate 			can_try_uncached = 0;
3340Sstevel@tonic-gate 			(void) snprintf(pathbuf, sizeof (pathbuf),
3350Sstevel@tonic-gate 			    "/proc/%d/psinfo", (int)pid);
3360Sstevel@tonic-gate 			if ((fd = rfd_open(pathbuf, 1, RFD_PSINFO,
3370Sstevel@tonic-gate 			    revoke_psinfo, lpc, O_RDONLY, 0000)) < 0) {
3380Sstevel@tonic-gate 				debug("cannot open %s", pathbuf);
3390Sstevel@tonic-gate 				break;
3400Sstevel@tonic-gate 			} else
3410Sstevel@tonic-gate 				debug_high("opened %s, fd %d\n", pathbuf, fd);
3420Sstevel@tonic-gate 		}
3430Sstevel@tonic-gate 
3440Sstevel@tonic-gate 		if (pread(fd, psinfo, sizeof (*psinfo), 0) ==
3450Sstevel@tonic-gate 		    sizeof (*psinfo) && psinfo->pr_pid == pid)
3460Sstevel@tonic-gate 			break;
3470Sstevel@tonic-gate 		else {
3480Sstevel@tonic-gate 			debug_high("closed fd %d\n", fd);
3490Sstevel@tonic-gate 			if (rfd_close(fd) != 0)
3500Sstevel@tonic-gate 				debug("could not close fd %d", fd);
3510Sstevel@tonic-gate 			fd = cached_fd = -1;
3520Sstevel@tonic-gate 		}
3530Sstevel@tonic-gate 	} while (can_try_uncached == 1);
3540Sstevel@tonic-gate 
3550Sstevel@tonic-gate 	if (fd_update_cb == NULL || fd_update_cb(arg, fd) != 0)
3560Sstevel@tonic-gate 		if (fd >= 0) {
3570Sstevel@tonic-gate 			debug_high("closed %s fd %d\n", fd_update_cb == NULL ?
3580Sstevel@tonic-gate 			    "uncached" : "cached", fd);
3590Sstevel@tonic-gate 			if (rfd_close(fd) != 0)
3600Sstevel@tonic-gate 				debug("could not close fd %d", fd);
3610Sstevel@tonic-gate 		}
3620Sstevel@tonic-gate 
3630Sstevel@tonic-gate 	debug_high("get_psinfo ret %d, fd %d, %s\n", ((fd >= 0) ? 0 : -1), fd,
3640Sstevel@tonic-gate 	    fd_update_cb != NULL ? "cached" : "uncached");
3650Sstevel@tonic-gate 	return ((fd >= 0) ? 0 : -1);
3660Sstevel@tonic-gate }
3670Sstevel@tonic-gate 
3680Sstevel@tonic-gate /*
369*3247Sgjelinek  * Retrieve the collection membership of all processes and update the psinfo of
370*3247Sgjelinek  * those non-system, non-zombie ones in collections.  For global zone processes,
371*3247Sgjelinek  * we first attempt to put the process into a capped project collection.  We
372*3247Sgjelinek  * also want to track the process for the global zone collection as a whole.
3730Sstevel@tonic-gate  */
3740Sstevel@tonic-gate static void
3750Sstevel@tonic-gate proc_cb(const pid_t pid)
3760Sstevel@tonic-gate {
3770Sstevel@tonic-gate 	psinfo_t psinfo;
3780Sstevel@tonic-gate 
379*3247Sgjelinek 	if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0) {
380*3247Sgjelinek 		lprocess_insert_mark(&psinfo);
381*3247Sgjelinek 		if (gz_capped && psinfo.pr_zoneid == GLOBAL_ZONEID) {
382*3247Sgjelinek 			/*
383*3247Sgjelinek 			 * We also want to track this process for the global
384*3247Sgjelinek 			 * zone as a whole so add it to the global zone
385*3247Sgjelinek 			 * collection as well.
386*3247Sgjelinek 			 */
387*3247Sgjelinek 			psinfo.pr_projid = -1;
388*3247Sgjelinek 			lprocess_insert_mark(&psinfo);
389*3247Sgjelinek 		}
390*3247Sgjelinek 	}
3910Sstevel@tonic-gate }
3920Sstevel@tonic-gate 
3930Sstevel@tonic-gate /*
3940Sstevel@tonic-gate  * Cache the process' psinfo fd, taking responsibility for freeing it.
3950Sstevel@tonic-gate  */
3960Sstevel@tonic-gate int
3970Sstevel@tonic-gate lprocess_update_psinfo_fd_cb(void *arg, int fd)
3980Sstevel@tonic-gate {
3990Sstevel@tonic-gate 	lprocess_t *lpc = arg;
4000Sstevel@tonic-gate 
4010Sstevel@tonic-gate 	lpc->lpc_psinfo_fd = fd;
4020Sstevel@tonic-gate 	return (0);
4030Sstevel@tonic-gate }
4040Sstevel@tonic-gate 
4050Sstevel@tonic-gate /*
406*3247Sgjelinek  * Get the system pagesize.
4070Sstevel@tonic-gate  */
408*3247Sgjelinek static void
409*3247Sgjelinek get_page_size(void)
4100Sstevel@tonic-gate {
411*3247Sgjelinek 	page_size_kb = sysconf(_SC_PAGESIZE) / 1024;
412*3247Sgjelinek 	debug("physical page size: %luKB\n", page_size_kb);
413*3247Sgjelinek }
414*3247Sgjelinek 
415*3247Sgjelinek static void
416*3247Sgjelinek tm_fmt(char *msg, hrtime_t t1, hrtime_t t2)
417*3247Sgjelinek {
418*3247Sgjelinek 	hrtime_t diff = t2 - t1;
4190Sstevel@tonic-gate 
420*3247Sgjelinek 	if (diff < MILLISEC)
421*3247Sgjelinek 		debug("%s: %lld nanoseconds\n", msg, diff);
422*3247Sgjelinek 	else if (diff < MICROSEC)
423*3247Sgjelinek 		debug("%s: %.2f microseconds\n", msg, (float)diff / MILLISEC);
424*3247Sgjelinek 	else if (diff < NANOSEC)
425*3247Sgjelinek 		debug("%s: %.2f milliseconds\n", msg, (float)diff / MICROSEC);
426*3247Sgjelinek 	else
427*3247Sgjelinek 		debug("%s: %.2f seconds\n", msg, (float)diff / NANOSEC);
428*3247Sgjelinek }
429*3247Sgjelinek 
430*3247Sgjelinek /*
431*3247Sgjelinek  * Get the zone's & project's RSS from the kernel.
432*3247Sgjelinek  */
433*3247Sgjelinek static void
434*3247Sgjelinek rss_sample(boolean_t my_zone_only, uint_t col_types)
435*3247Sgjelinek {
436*3247Sgjelinek 	size_t nres;
437*3247Sgjelinek 	size_t i;
438*3247Sgjelinek 	uint_t flags;
439*3247Sgjelinek 	hrtime_t t1, t2;
440*3247Sgjelinek 
441*3247Sgjelinek 	if (my_zone_only) {
442*3247Sgjelinek 		flags = VMUSAGE_ZONE;
4430Sstevel@tonic-gate 	} else {
444*3247Sgjelinek 		flags = 0;
445*3247Sgjelinek 		if (col_types & CAPPED_PROJECT)
446*3247Sgjelinek 			flags |= VMUSAGE_PROJECTS;
447*3247Sgjelinek 		if (col_types & CAPPED_ZONE && my_zoneid == GLOBAL_ZONEID)
448*3247Sgjelinek 			flags |= VMUSAGE_ALL_ZONES;
4490Sstevel@tonic-gate 	}
4500Sstevel@tonic-gate 
451*3247Sgjelinek 	debug("vmusage sample flags 0x%x\n", flags);
452*3247Sgjelinek 	if (flags == 0)
453*3247Sgjelinek 		return;
454*3247Sgjelinek 
455*3247Sgjelinek again:
456*3247Sgjelinek 	/* try the current buffer to see if the list will fit */
457*3247Sgjelinek 	nres = vmu_vals_len;
458*3247Sgjelinek 	t1 = gethrtime();
459*3247Sgjelinek 	if (getvmusage(flags, my_zone_only ? 0 : rcfg.rcfg_rss_sample_interval,
460*3247Sgjelinek 	    vmu_vals, &nres) != 0) {
461*3247Sgjelinek 		if (errno != EOVERFLOW) {
462*3247Sgjelinek 			warn(gettext("can't read RSS from kernel\n"));
463*3247Sgjelinek 			return;
464*3247Sgjelinek 		}
465*3247Sgjelinek 	}
466*3247Sgjelinek 	t2 = gethrtime();
467*3247Sgjelinek 	tm_fmt("getvmusage time", t1, t2);
468*3247Sgjelinek 
469*3247Sgjelinek 	debug("kernel nres %lu\n", (ulong_t)nres);
470*3247Sgjelinek 
471*3247Sgjelinek 	if (nres > vmu_vals_len) {
472*3247Sgjelinek 		/* array size is now too small, increase it and try again */
473*3247Sgjelinek 		free(vmu_vals);
474*3247Sgjelinek 
475*3247Sgjelinek 		if ((vmu_vals = (vmusage_t *)calloc(nres,
476*3247Sgjelinek 		    sizeof (vmusage_t))) == NULL) {
477*3247Sgjelinek 			warn(gettext("out of memory: could not read RSS from "
478*3247Sgjelinek 			    "kernel\n"));
479*3247Sgjelinek 			vmu_vals_len = nvmu_vals = 0;
480*3247Sgjelinek 			return;
481*3247Sgjelinek 		}
482*3247Sgjelinek 		vmu_vals_len = nres;
483*3247Sgjelinek 		goto again;
484*3247Sgjelinek 	}
485*3247Sgjelinek 
486*3247Sgjelinek 	nvmu_vals = nres;
487*3247Sgjelinek 
488*3247Sgjelinek 	debug("vmusage_sample\n");
489*3247Sgjelinek 	for (i = 0; i < nvmu_vals; i++) {
490*3247Sgjelinek 		debug("%d: id: %d, type: 0x%x, rss_all: %llu (%lluKB), "
491*3247Sgjelinek 		    "swap: %llu\n", (int)i, (int)vmu_vals[i].vmu_id,
492*3247Sgjelinek 		    vmu_vals[i].vmu_type,
493*3247Sgjelinek 		    (unsigned long long)vmu_vals[i].vmu_rss_all,
494*3247Sgjelinek 		    (unsigned long long)vmu_vals[i].vmu_rss_all / 1024,
495*3247Sgjelinek 		    (unsigned long long)vmu_vals[i].vmu_swap_all);
496*3247Sgjelinek 	}
497*3247Sgjelinek }
498*3247Sgjelinek 
499*3247Sgjelinek static void
500*3247Sgjelinek update_col_rss(lcollection_t *lcol)
501*3247Sgjelinek {
502*3247Sgjelinek 	int i;
503*3247Sgjelinek 
504*3247Sgjelinek 	lcol->lcol_rss = 0;
505*3247Sgjelinek 	lcol->lcol_image_size = 0;
506*3247Sgjelinek 
507*3247Sgjelinek 	for (i = 0; i < nvmu_vals; i++) {
508*3247Sgjelinek 		if (vmu_vals[i].vmu_id != lcol->lcol_id.rcid_val)
509*3247Sgjelinek 			continue;
510*3247Sgjelinek 
511*3247Sgjelinek 		if (vmu_vals[i].vmu_type == VMUSAGE_ZONE &&
512*3247Sgjelinek 		    lcol->lcol_id.rcid_type != RCIDT_ZONE)
513*3247Sgjelinek 			continue;
514*3247Sgjelinek 
515*3247Sgjelinek 		if (vmu_vals[i].vmu_type == VMUSAGE_PROJECTS &&
516*3247Sgjelinek 		    lcol->lcol_id.rcid_type != RCIDT_PROJECT)
517*3247Sgjelinek 			continue;
518*3247Sgjelinek 
519*3247Sgjelinek 		/* we found the right RSS entry, update the collection vals */
520*3247Sgjelinek 		lcol->lcol_rss = vmu_vals[i].vmu_rss_all / 1024;
521*3247Sgjelinek 		lcol->lcol_image_size = vmu_vals[i].vmu_swap_all / 1024;
522*3247Sgjelinek 		break;
523*3247Sgjelinek 	}
5240Sstevel@tonic-gate }
5250Sstevel@tonic-gate 
5260Sstevel@tonic-gate /*
5270Sstevel@tonic-gate  * Sample the collection RSS, updating the collection's statistics with the
528*3247Sgjelinek  * results.  Also, sum the rss of all capped projects & return true if
529*3247Sgjelinek  * the collection is over cap.
5300Sstevel@tonic-gate  */
5310Sstevel@tonic-gate static int
5320Sstevel@tonic-gate rss_sample_col_cb(lcollection_t *lcol, void *arg)
5330Sstevel@tonic-gate {
5340Sstevel@tonic-gate 	int64_t excess;
5350Sstevel@tonic-gate 	uint64_t rss;
536*3247Sgjelinek 	sample_col_arg_t *col_argp = (sample_col_arg_t *)arg;
5370Sstevel@tonic-gate 
538*3247Sgjelinek 	update_col_rss(lcol);
5390Sstevel@tonic-gate 
5400Sstevel@tonic-gate 	lcol->lcol_stat.lcols_rss_sample++;
5410Sstevel@tonic-gate 	rss = lcol->lcol_rss;
542*3247Sgjelinek 	excess = rss - lcol->lcol_rss_cap;
543*3247Sgjelinek 	if (excess > 0) {
5440Sstevel@tonic-gate 		lcol->lcol_stat.lcols_rss_act_sum += rss;
545*3247Sgjelinek 		col_argp->sca_any_over_cap = B_TRUE;
546*3247Sgjelinek 		if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
547*3247Sgjelinek 			col_argp->sca_project_over_cap = B_TRUE;
548*3247Sgjelinek 	}
5490Sstevel@tonic-gate 	lcol->lcol_stat.lcols_rss_sum += rss;
5500Sstevel@tonic-gate 
5510Sstevel@tonic-gate 	if (lcol->lcol_stat.lcols_min_rss > rss)
5520Sstevel@tonic-gate 		lcol->lcol_stat.lcols_min_rss = rss;
5530Sstevel@tonic-gate 	if (lcol->lcol_stat.lcols_max_rss < rss)
5540Sstevel@tonic-gate 		lcol->lcol_stat.lcols_max_rss = rss;
5550Sstevel@tonic-gate 
5560Sstevel@tonic-gate 	return (0);
5570Sstevel@tonic-gate }
5580Sstevel@tonic-gate 
5590Sstevel@tonic-gate /*
560*3247Sgjelinek  * Determine if we have capped projects, capped zones or both.
561*3247Sgjelinek  */
562*3247Sgjelinek static int
563*3247Sgjelinek col_type_cb(lcollection_t *lcol, void *arg)
564*3247Sgjelinek {
565*3247Sgjelinek 	uint_t *col_type = (uint_t *)arg;
566*3247Sgjelinek 
567*3247Sgjelinek 	/* skip uncapped collections */
568*3247Sgjelinek 	if (lcol->lcol_rss_cap == 0)
569*3247Sgjelinek 		return (1);
570*3247Sgjelinek 
571*3247Sgjelinek 	if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
572*3247Sgjelinek 		*col_type |= CAPPED_PROJECT;
573*3247Sgjelinek 	else
574*3247Sgjelinek 		*col_type |= CAPPED_ZONE;
575*3247Sgjelinek 
576*3247Sgjelinek 	/* once we know everything is capped, we can stop looking */
577*3247Sgjelinek 	if ((*col_type & CAPPED_ZONE) && (*col_type & CAPPED_PROJECT))
578*3247Sgjelinek 		return (1);
579*3247Sgjelinek 
580*3247Sgjelinek 	return (0);
581*3247Sgjelinek }
582*3247Sgjelinek 
583*3247Sgjelinek /*
5840Sstevel@tonic-gate  * Open /proc and walk entries.
5850Sstevel@tonic-gate  */
5860Sstevel@tonic-gate static void
5870Sstevel@tonic-gate proc_walk_all(void (*cb)(const pid_t))
5880Sstevel@tonic-gate {
5890Sstevel@tonic-gate 	DIR *pdir;
5900Sstevel@tonic-gate 	struct dirent *dirent;
5910Sstevel@tonic-gate 	pid_t pid;
5920Sstevel@tonic-gate 
5930Sstevel@tonic-gate 	(void) rfd_reserve(1);
5940Sstevel@tonic-gate 	if ((pdir = opendir("/proc")) == NULL)
5950Sstevel@tonic-gate 		die(gettext("couldn't open /proc!"));
5960Sstevel@tonic-gate 
5970Sstevel@tonic-gate 	while ((dirent = readdir(pdir)) != NULL) {
5980Sstevel@tonic-gate 		if (strcmp(".", dirent->d_name) == 0 ||
5990Sstevel@tonic-gate 		    strcmp("..", dirent->d_name) == 0)
6000Sstevel@tonic-gate 			continue;
6010Sstevel@tonic-gate 		pid = atoi(dirent->d_name);
6020Sstevel@tonic-gate 		ASSERT(pid != 0 || strcmp(dirent->d_name, "0") == 0);
6030Sstevel@tonic-gate 		if (pid == rcapd_pid)
6040Sstevel@tonic-gate 			continue;
6050Sstevel@tonic-gate 		else
6060Sstevel@tonic-gate 			cb(pid);
6070Sstevel@tonic-gate 	}
6080Sstevel@tonic-gate 	(void) closedir(pdir);
6090Sstevel@tonic-gate }
6100Sstevel@tonic-gate 
6110Sstevel@tonic-gate /*
6120Sstevel@tonic-gate  * Clear unmarked callback.
6130Sstevel@tonic-gate  */
6140Sstevel@tonic-gate /*ARGSUSED*/
6150Sstevel@tonic-gate static int
6160Sstevel@tonic-gate sweep_process_cb(lcollection_t *lcol, lprocess_t *lpc)
6170Sstevel@tonic-gate {
6180Sstevel@tonic-gate 	if (lpc->lpc_mark) {
6190Sstevel@tonic-gate 		lpc->lpc_mark = 0;
6200Sstevel@tonic-gate 	} else {
6210Sstevel@tonic-gate 		debug("process %d finished\n", (int)lpc->lpc_pid);
6220Sstevel@tonic-gate 		lprocess_free(lpc);
6230Sstevel@tonic-gate 	}
6240Sstevel@tonic-gate 
6250Sstevel@tonic-gate 	return (0);
6260Sstevel@tonic-gate }
6270Sstevel@tonic-gate 
6280Sstevel@tonic-gate /*
6290Sstevel@tonic-gate  * Print, for debugging purposes, a collection's recently-sampled RSS and
6300Sstevel@tonic-gate  * excess.
6310Sstevel@tonic-gate  */
6320Sstevel@tonic-gate /*ARGSUSED*/
6330Sstevel@tonic-gate static int
6340Sstevel@tonic-gate excess_print_cb(lcollection_t *lcol, void *arg)
6350Sstevel@tonic-gate {
6360Sstevel@tonic-gate 	int64_t excess = lcol->lcol_rss - lcol->lcol_rss_cap;
6370Sstevel@tonic-gate 
6380Sstevel@tonic-gate 	debug("%s %s rss/cap: %llu/%llu, excess = %lld kB\n",
639*3247Sgjelinek 	    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
640*3247Sgjelinek 	    lcol->lcol_name,
6410Sstevel@tonic-gate 	    (unsigned long long)lcol->lcol_rss,
6420Sstevel@tonic-gate 	    (unsigned long long)lcol->lcol_rss_cap,
6430Sstevel@tonic-gate 	    (long long)excess);
6440Sstevel@tonic-gate 
6450Sstevel@tonic-gate 	return (0);
6460Sstevel@tonic-gate }
6470Sstevel@tonic-gate 
6480Sstevel@tonic-gate /*
6490Sstevel@tonic-gate  * Scan those collections which have exceeded their caps.
650*3247Sgjelinek  *
651*3247Sgjelinek  * If we're running in the global zone it might have a cap.  We don't want to
652*3247Sgjelinek  * do any capping for the global zone yet since we might get under the cap by
653*3247Sgjelinek  * just capping the projects in the global zone.
6540Sstevel@tonic-gate  */
6550Sstevel@tonic-gate /*ARGSUSED*/
6560Sstevel@tonic-gate static int
6570Sstevel@tonic-gate scan_cb(lcollection_t *lcol, void *arg)
6580Sstevel@tonic-gate {
6590Sstevel@tonic-gate 	int64_t excess;
6600Sstevel@tonic-gate 
661*3247Sgjelinek 	/* skip over global zone collection for now but keep track for later */
662*3247Sgjelinek 	if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
663*3247Sgjelinek 	    lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
664*3247Sgjelinek 		gz_col = lcol;
665*3247Sgjelinek 		return (0);
666*3247Sgjelinek 	}
667*3247Sgjelinek 
6680Sstevel@tonic-gate 	if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
6690Sstevel@tonic-gate 		scan(lcol, excess);
6700Sstevel@tonic-gate 		lcol->lcol_stat.lcols_scan++;
6710Sstevel@tonic-gate 	}
6720Sstevel@tonic-gate 
6730Sstevel@tonic-gate 	return (0);
6740Sstevel@tonic-gate }
6750Sstevel@tonic-gate 
6760Sstevel@tonic-gate /*
677*3247Sgjelinek  * Scan the global zone collection and see if it still exceeds its cap.
678*3247Sgjelinek  * We take into account the effects of capping any global zone projects here.
679*3247Sgjelinek  */
680*3247Sgjelinek static void
681*3247Sgjelinek scan_gz(lcollection_t *lcol, boolean_t project_over_cap)
682*3247Sgjelinek {
683*3247Sgjelinek 	int64_t excess;
684*3247Sgjelinek 
685*3247Sgjelinek 	/*
686*3247Sgjelinek 	 * If we had projects over their cap and the global zone was also over
687*3247Sgjelinek 	 * its cap then we need to get the up-to-date global zone rss to
688*3247Sgjelinek 	 * determine if we are still over the global zone cap.  We might have
689*3247Sgjelinek 	 * gone under while we scanned the capped projects.  If there were no
690*3247Sgjelinek 	 * projects over cap then we can use the rss value we already have for
691*3247Sgjelinek 	 * the global zone.
692*3247Sgjelinek 	 */
693*3247Sgjelinek 	excess = lcol->lcol_rss - lcol->lcol_rss_cap;
694*3247Sgjelinek 	if (project_over_cap && excess > 0) {
695*3247Sgjelinek 		rss_sample(B_TRUE, CAPPED_ZONE);
696*3247Sgjelinek 		update_col_rss(lcol);
697*3247Sgjelinek 		excess = lcol->lcol_rss - lcol->lcol_rss_cap;
698*3247Sgjelinek 	}
699*3247Sgjelinek 
700*3247Sgjelinek 	if (excess > 0) {
701*3247Sgjelinek 		debug("global zone excess %lldKB\n", (long long)excess);
702*3247Sgjelinek 		scan(lcol, excess);
703*3247Sgjelinek 		lcol->lcol_stat.lcols_scan++;
704*3247Sgjelinek 	}
705*3247Sgjelinek }
706*3247Sgjelinek 
707*3247Sgjelinek /*
7080Sstevel@tonic-gate  * Do a soft scan of those collections which have excesses.  A soft scan is one
7090Sstevel@tonic-gate  * in which the cap enforcement pressure is taken into account.  The difference
7100Sstevel@tonic-gate  * between the utilized physical memory and the cap enforcement pressure will
7110Sstevel@tonic-gate  * be scanned-for, and each collection will be scanned proportionally by their
7120Sstevel@tonic-gate  * present excesses.
7130Sstevel@tonic-gate  */
7140Sstevel@tonic-gate static int
7150Sstevel@tonic-gate soft_scan_cb(lcollection_t *lcol, void *a)
7160Sstevel@tonic-gate {
7170Sstevel@tonic-gate 	int64_t excess;
7180Sstevel@tonic-gate 	soft_scan_arg_t *arg = a;
7190Sstevel@tonic-gate 
720*3247Sgjelinek 	/* skip over global zone collection for now but keep track for later */
721*3247Sgjelinek 	if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
722*3247Sgjelinek 	    lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
723*3247Sgjelinek 		gz_col = lcol;
724*3247Sgjelinek 		return (0);
725*3247Sgjelinek 	}
726*3247Sgjelinek 
7270Sstevel@tonic-gate 	if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
728*3247Sgjelinek 		int64_t adjusted_excess =
729*3247Sgjelinek 		    excess * arg->ssa_scan_goal / arg->ssa_sum_excess;
730*3247Sgjelinek 
731*3247Sgjelinek 		debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
732*3247Sgjelinek 		    "scanning %lld\n",
733*3247Sgjelinek 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
734*3247Sgjelinek 		    "project" : "zone"),
735*3247Sgjelinek 		    (long)lcol->lcol_id.rcid_val,
7360Sstevel@tonic-gate 		    (long long)excess, (long long)arg->ssa_scan_goal,
7370Sstevel@tonic-gate 		    (unsigned long long)arg->ssa_sum_excess,
738*3247Sgjelinek 		    (long long)adjusted_excess);
7390Sstevel@tonic-gate 
740*3247Sgjelinek 		scan(lcol, adjusted_excess);
7410Sstevel@tonic-gate 		lcol->lcol_stat.lcols_scan++;
7420Sstevel@tonic-gate 	}
7430Sstevel@tonic-gate 
7440Sstevel@tonic-gate 	return (0);
7450Sstevel@tonic-gate }
7460Sstevel@tonic-gate 
747*3247Sgjelinek static void
748*3247Sgjelinek soft_scan_gz(lcollection_t *lcol, void *a)
749*3247Sgjelinek {
750*3247Sgjelinek 	int64_t excess;
751*3247Sgjelinek 	soft_scan_arg_t *arg = a;
752*3247Sgjelinek 
753*3247Sgjelinek 	/*
754*3247Sgjelinek 	 * If we had projects over their cap and the global zone was also over
755*3247Sgjelinek 	 * its cap then we need to get the up-to-date global zone rss to
756*3247Sgjelinek 	 * determine if we are still over the global zone cap.  We might have
757*3247Sgjelinek 	 * gone under while we scanned the capped projects.  If there were no
758*3247Sgjelinek 	 * projects over cap then we can use the rss value we already have for
759*3247Sgjelinek 	 * the global zone.
760*3247Sgjelinek 	 */
761*3247Sgjelinek 	excess = lcol->lcol_rss - lcol->lcol_rss_cap;
762*3247Sgjelinek 	if (arg->ssa_project_over_cap && excess > 0) {
763*3247Sgjelinek 		rss_sample(B_TRUE, CAPPED_ZONE);
764*3247Sgjelinek 		update_col_rss(lcol);
765*3247Sgjelinek 		excess = lcol->lcol_rss - lcol->lcol_rss_cap;
766*3247Sgjelinek 	}
767*3247Sgjelinek 
768*3247Sgjelinek 	if (excess > 0) {
769*3247Sgjelinek 		int64_t adjusted_excess =
770*3247Sgjelinek 		    excess * arg->ssa_scan_goal / arg->ssa_sum_excess;
771*3247Sgjelinek 
772*3247Sgjelinek 		debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
773*3247Sgjelinek 		    "scanning %lld\n",
774*3247Sgjelinek 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
775*3247Sgjelinek 		    "project" : "zone"),
776*3247Sgjelinek 		    (long)lcol->lcol_id.rcid_val,
777*3247Sgjelinek 		    (long long)excess, (long long)arg->ssa_scan_goal,
778*3247Sgjelinek 		    (unsigned long long)arg->ssa_sum_excess,
779*3247Sgjelinek 		    (long long)adjusted_excess);
780*3247Sgjelinek 
781*3247Sgjelinek 		scan(lcol, adjusted_excess);
782*3247Sgjelinek 		lcol->lcol_stat.lcols_scan++;
783*3247Sgjelinek 	}
784*3247Sgjelinek }
785*3247Sgjelinek 
7860Sstevel@tonic-gate /*
7870Sstevel@tonic-gate  * When a scan could happen, but caps aren't enforced tick the
7880Sstevel@tonic-gate  * lcols_unenforced_cap counter.
7890Sstevel@tonic-gate  */
7900Sstevel@tonic-gate /*ARGSUSED*/
7910Sstevel@tonic-gate static int
7920Sstevel@tonic-gate unenforced_cap_cb(lcollection_t *lcol, void *arg)
7930Sstevel@tonic-gate {
7940Sstevel@tonic-gate 	lcol->lcol_stat.lcols_unenforced_cap++;
7950Sstevel@tonic-gate 
7960Sstevel@tonic-gate 	return (0);
7970Sstevel@tonic-gate }
7980Sstevel@tonic-gate 
7990Sstevel@tonic-gate /*
8000Sstevel@tonic-gate  * Update the count of physically installed memory.
8010Sstevel@tonic-gate  */
8020Sstevel@tonic-gate static void
8030Sstevel@tonic-gate update_phys_total(void)
8040Sstevel@tonic-gate {
8050Sstevel@tonic-gate 	uint64_t old_phys_total;
8060Sstevel@tonic-gate 
8070Sstevel@tonic-gate 	old_phys_total = phys_total;
808*3247Sgjelinek 	phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * page_size_kb;
8090Sstevel@tonic-gate 	if (phys_total != old_phys_total)
8100Sstevel@tonic-gate 		debug("physical memory%s: %lluM\n", (old_phys_total == 0 ?
8110Sstevel@tonic-gate 		    "" : " adjusted"), (unsigned long long)(phys_total / 1024));
8120Sstevel@tonic-gate }
8130Sstevel@tonic-gate 
8140Sstevel@tonic-gate /*
8150Sstevel@tonic-gate  * Unlink a process from its collection, updating relevant statistics, and
8160Sstevel@tonic-gate  * freeing its associated memory.
8170Sstevel@tonic-gate  */
8180Sstevel@tonic-gate void
8190Sstevel@tonic-gate lprocess_free(lprocess_t *lpc)
8200Sstevel@tonic-gate {
8210Sstevel@tonic-gate 	pid_t pid;
8220Sstevel@tonic-gate 
8230Sstevel@tonic-gate 	lpc->lpc_collection->lcol_stat.lcols_proc_out++;
8240Sstevel@tonic-gate 
8250Sstevel@tonic-gate 	if (lpc->lpc_prev != NULL)
8260Sstevel@tonic-gate 		lpc->lpc_prev->lpc_next = lpc->lpc_next;
8270Sstevel@tonic-gate 	if (lpc->lpc_next != NULL)
8280Sstevel@tonic-gate 		lpc->lpc_next->lpc_prev = lpc->lpc_prev;
8290Sstevel@tonic-gate 	if (lpc->lpc_collection->lcol_lprocess == lpc)
8300Sstevel@tonic-gate 		lpc->lpc_collection->lcol_lprocess = (lpc->lpc_next !=
8310Sstevel@tonic-gate 		    lpc ? lpc->lpc_next : NULL);
8320Sstevel@tonic-gate 	lpc->lpc_next = lpc->lpc_prev = NULL;
8330Sstevel@tonic-gate 
8340Sstevel@tonic-gate 	if (lpc->lpc_prpageheader != NULL)
8350Sstevel@tonic-gate 		free(lpc->lpc_prpageheader);
8360Sstevel@tonic-gate 	if (lpc->lpc_xmap != NULL)
8370Sstevel@tonic-gate 		free(lpc->lpc_xmap);
8380Sstevel@tonic-gate 	if (lpc->lpc_psinfo_fd >= 0) {
8390Sstevel@tonic-gate 		if (rfd_close(lpc->lpc_psinfo_fd) != 0)
8400Sstevel@tonic-gate 			debug("could not close %d lpc_psinfo_fd %d",
8410Sstevel@tonic-gate 			    (int)lpc->lpc_pid, lpc->lpc_psinfo_fd);
8420Sstevel@tonic-gate 		lpc->lpc_psinfo_fd = -1;
8430Sstevel@tonic-gate 	}
8440Sstevel@tonic-gate 	if (lpc->lpc_pgdata_fd >= 0) {
8450Sstevel@tonic-gate 		if (rfd_close(lpc->lpc_pgdata_fd) != 0)
8460Sstevel@tonic-gate 			debug("could not close %d lpc_pgdata_fd %d",
8470Sstevel@tonic-gate 			    (int)lpc->lpc_pid, lpc->lpc_pgdata_fd);
8480Sstevel@tonic-gate 		lpc->lpc_pgdata_fd = -1;
8490Sstevel@tonic-gate 	}
8500Sstevel@tonic-gate 	if (lpc->lpc_xmap_fd >= 0) {
8510Sstevel@tonic-gate 		if (rfd_close(lpc->lpc_xmap_fd) != 0)
8520Sstevel@tonic-gate 			debug("could not close %d lpc_xmap_fd %d",
8530Sstevel@tonic-gate 			    (int)lpc->lpc_pid, lpc->lpc_xmap_fd);
8540Sstevel@tonic-gate 		lpc->lpc_xmap_fd = -1;
8550Sstevel@tonic-gate 	}
8560Sstevel@tonic-gate 	if (lpc->lpc_ignore != NULL)
8570Sstevel@tonic-gate 		lmapping_free(&lpc->lpc_ignore);
8580Sstevel@tonic-gate 	pid = lpc->lpc_pid;
8590Sstevel@tonic-gate 	free(lpc);
8600Sstevel@tonic-gate 	debug_high("process %d freed\n", (int)pid);
8610Sstevel@tonic-gate }
8620Sstevel@tonic-gate 
8630Sstevel@tonic-gate /*
8640Sstevel@tonic-gate  * Collection clear callback.
8650Sstevel@tonic-gate  */
8660Sstevel@tonic-gate /*ARGSUSED*/
8670Sstevel@tonic-gate static int
8680Sstevel@tonic-gate collection_clear_cb(lcollection_t *lcol, void *arg)
8690Sstevel@tonic-gate {
8700Sstevel@tonic-gate 	lcol->lcol_mark = 0;
8710Sstevel@tonic-gate 
8720Sstevel@tonic-gate 	return (0);
8730Sstevel@tonic-gate }
8740Sstevel@tonic-gate 
8750Sstevel@tonic-gate /*
8760Sstevel@tonic-gate  * Respond to a terminating signal by setting a termination flag.
8770Sstevel@tonic-gate  */
8780Sstevel@tonic-gate /*ARGSUSED*/
8790Sstevel@tonic-gate static void
8800Sstevel@tonic-gate terminate_signal(int signal)
8810Sstevel@tonic-gate {
8820Sstevel@tonic-gate 	if (termination_signal == 0)
8830Sstevel@tonic-gate 		termination_signal = signal;
8840Sstevel@tonic-gate 	should_run = 0;
8850Sstevel@tonic-gate }
8860Sstevel@tonic-gate 
/*
 * Handler for synchronous or asynchronous signals that would ordinarily make
 * the process abort.  The scanner is first given a last chance, through
 * scan_abort(), to resume any processes it has stopped; the daemon then
 * aborts.
 */
/*ARGSUSED*/
static void
abort_signal(int signal)
{
	/* Last-ditch effort to resume stopped processes before dying. */
	scan_abort();

	abort();
}
9020Sstevel@tonic-gate 
9030Sstevel@tonic-gate /*
9040Sstevel@tonic-gate  * Clean up collections which have been removed due to configuration.  Unlink
9050Sstevel@tonic-gate  * the collection from lcollection and free it.
9060Sstevel@tonic-gate  */
9070Sstevel@tonic-gate /*ARGSUSED*/
9080Sstevel@tonic-gate static int
9090Sstevel@tonic-gate collection_sweep_cb(lcollection_t *lcol, void *arg)
9100Sstevel@tonic-gate {
9110Sstevel@tonic-gate 	if (lcol->lcol_mark == 0) {
912*3247Sgjelinek 		debug("freeing %s %s\n",
913*3247Sgjelinek 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
914*3247Sgjelinek 		    "project" : "zone"), lcol->lcol_name);
9150Sstevel@tonic-gate 		lcollection_free(lcol);
9160Sstevel@tonic-gate 	}
9170Sstevel@tonic-gate 
9180Sstevel@tonic-gate 	return (0);
9190Sstevel@tonic-gate }
9200Sstevel@tonic-gate 
9210Sstevel@tonic-gate /*
9220Sstevel@tonic-gate  * Set those variables which depend on the global configuration.
9230Sstevel@tonic-gate  */
9240Sstevel@tonic-gate static void
9250Sstevel@tonic-gate finish_configuration(void)
9260Sstevel@tonic-gate {
9270Sstevel@tonic-gate 	/*
9280Sstevel@tonic-gate 	 * Warn that any lnode (or non-project) mode specification (by an SRM
9290Sstevel@tonic-gate 	 * 1.3 configuration file, for example) is ignored.
9300Sstevel@tonic-gate 	 */
9310Sstevel@tonic-gate 	if (strcmp(rcfg.rcfg_mode_name, "project") != 0) {
9320Sstevel@tonic-gate 		warn(gettext("%s mode specification ignored -- using project"
9330Sstevel@tonic-gate 		    " mode\n"), rcfg.rcfg_mode_name);
9340Sstevel@tonic-gate 		rcfg.rcfg_mode_name = "project";
9350Sstevel@tonic-gate 		rcfg.rcfg_mode = rctype_project;
9360Sstevel@tonic-gate 	}
9370Sstevel@tonic-gate }
9380Sstevel@tonic-gate 
9390Sstevel@tonic-gate /*
9400Sstevel@tonic-gate  * Cause the configuration file to be reread and applied.
9410Sstevel@tonic-gate  */
9420Sstevel@tonic-gate static void
9430Sstevel@tonic-gate reread_configuration_file(void)
9440Sstevel@tonic-gate {
9450Sstevel@tonic-gate 	rcfg_t rcfg_new;
9460Sstevel@tonic-gate 	struct stat st;
9470Sstevel@tonic-gate 
9480Sstevel@tonic-gate 	if (stat(rcfg.rcfg_filename, &st) == 0 && st.st_mtime ==
9490Sstevel@tonic-gate 	    rcfg.rcfg_last_modification)
9500Sstevel@tonic-gate 		return;
9510Sstevel@tonic-gate 
9520Sstevel@tonic-gate 	if (rcfg_read(rcfg.rcfg_filename, rcfg.rcfg_fd, &rcfg_new,
9530Sstevel@tonic-gate 	    update_statistics) != 0)
9540Sstevel@tonic-gate 		warn(gettext("can't reread configuration"));
9550Sstevel@tonic-gate 	else {
9560Sstevel@tonic-gate 		/*
9570Sstevel@tonic-gate 		 * The configuration file has been read.  Remove existing
9580Sstevel@tonic-gate 		 * collections in case there is a change in collection type.
9590Sstevel@tonic-gate 		 */
9600Sstevel@tonic-gate 		if (rcfg.rcfg_mode != rcfg_new.rcfg_mode) {
9610Sstevel@tonic-gate 			list_walk_collection(collection_clear_cb, NULL);
9620Sstevel@tonic-gate 			list_walk_collection(collection_sweep_cb, NULL);
9630Sstevel@tonic-gate 		}
9640Sstevel@tonic-gate 
9650Sstevel@tonic-gate 		/*
9660Sstevel@tonic-gate 		 * Make the newly-read configuration the global one, and update
9670Sstevel@tonic-gate 		 * any variables that depend on it.
9680Sstevel@tonic-gate 		 */
9690Sstevel@tonic-gate 		rcfg = rcfg_new;
9700Sstevel@tonic-gate 		finish_configuration();
9710Sstevel@tonic-gate 	}
9720Sstevel@tonic-gate }
9730Sstevel@tonic-gate 
9740Sstevel@tonic-gate /*
9750Sstevel@tonic-gate  * Reread the configuration filex, then examine changes, additions, and
9760Sstevel@tonic-gate  * deletions to cap definitions.
9770Sstevel@tonic-gate  */
9780Sstevel@tonic-gate static void
979*3247Sgjelinek reconfigure(hrtime_t now, hrtime_t *next_configuration,
980*3247Sgjelinek     hrtime_t *next_proc_walk, hrtime_t *next_rss_sample)
9810Sstevel@tonic-gate {
9820Sstevel@tonic-gate 	debug("reconfigure...\n");
9830Sstevel@tonic-gate 
9840Sstevel@tonic-gate 	/*
9850Sstevel@tonic-gate 	 * Reread the configuration data.
9860Sstevel@tonic-gate 	 */
9870Sstevel@tonic-gate 	reread_configuration_file();
9880Sstevel@tonic-gate 
9890Sstevel@tonic-gate 	/*
9900Sstevel@tonic-gate 	 * Walk the lcollection, marking active collections so inactive ones
9910Sstevel@tonic-gate 	 * can be freed.
9920Sstevel@tonic-gate 	 */
9930Sstevel@tonic-gate 	list_walk_collection(collection_clear_cb, NULL);
9940Sstevel@tonic-gate 	lcollection_update(LCU_ACTIVE_ONLY); /* mark */
9950Sstevel@tonic-gate 	list_walk_collection(collection_sweep_cb, NULL);
996*3247Sgjelinek 
997*3247Sgjelinek 	*next_configuration = NEXT_EVENT_TIME(now,
998*3247Sgjelinek 	    rcfg.rcfg_reconfiguration_interval);
999*3247Sgjelinek 
1000*3247Sgjelinek 	/*
1001*3247Sgjelinek 	 * Reset each event time to the shorter of the previous and new
1002*3247Sgjelinek 	 * intervals.
1003*3247Sgjelinek 	 */
1004*3247Sgjelinek 	if (next_report == 0 && rcfg.rcfg_report_interval > 0)
1005*3247Sgjelinek 		next_report = now;
1006*3247Sgjelinek 	else
1007*3247Sgjelinek 		next_report = POSITIVE_MIN(next_report,
1008*3247Sgjelinek 		    NEXT_REPORT_EVENT_TIME(now, rcfg.rcfg_report_interval));
1009*3247Sgjelinek 
1010*3247Sgjelinek 	if (*next_proc_walk == 0 && rcfg.rcfg_proc_walk_interval > 0)
1011*3247Sgjelinek 		*next_proc_walk = now;
1012*3247Sgjelinek 	else
1013*3247Sgjelinek 		*next_proc_walk = POSITIVE_MIN(*next_proc_walk,
1014*3247Sgjelinek 		    NEXT_EVENT_TIME(now, rcfg.rcfg_proc_walk_interval));
1015*3247Sgjelinek 
1016*3247Sgjelinek 	if (*next_rss_sample == 0 && rcfg.rcfg_rss_sample_interval > 0)
1017*3247Sgjelinek 		*next_rss_sample = now;
1018*3247Sgjelinek 	else
1019*3247Sgjelinek 		*next_rss_sample = POSITIVE_MIN(*next_rss_sample,
1020*3247Sgjelinek 		    NEXT_EVENT_TIME(now, rcfg.rcfg_rss_sample_interval));
10210Sstevel@tonic-gate }
10220Sstevel@tonic-gate 
10230Sstevel@tonic-gate /*
10240Sstevel@tonic-gate  * Respond to SIGHUP by triggering the rereading the configuration file and cap
10250Sstevel@tonic-gate  * definitions.
10260Sstevel@tonic-gate  */
10270Sstevel@tonic-gate /*ARGSUSED*/
10280Sstevel@tonic-gate static void
10290Sstevel@tonic-gate sighup(int signal)
10300Sstevel@tonic-gate {
10310Sstevel@tonic-gate 	should_reconfigure = 1;
10320Sstevel@tonic-gate }
10330Sstevel@tonic-gate 
10340Sstevel@tonic-gate /*
10350Sstevel@tonic-gate  * Print, for debugging purposes, each collection's interval statistics.
10360Sstevel@tonic-gate  */
10370Sstevel@tonic-gate /*ARGSUSED*/
10380Sstevel@tonic-gate static int
10390Sstevel@tonic-gate simple_report_collection_cb(lcollection_t *lcol, void *arg)
10400Sstevel@tonic-gate {
10410Sstevel@tonic-gate #define	DELTA(field) \
1042*3247Sgjelinek 	(unsigned long long)( \
10430Sstevel@tonic-gate 	    (lcol->lcol_stat.field - lcol->lcol_stat_old.field))
10440Sstevel@tonic-gate 
10450Sstevel@tonic-gate 	debug("%s %s status: succeeded/attempted (k): %llu/%llu, "
10460Sstevel@tonic-gate 	    "ineffective/scans/unenforced/samplings:  %llu/%llu/%llu/%llu, RSS "
10470Sstevel@tonic-gate 	    "min/max (k): %llu/%llu, cap %llu kB, processes/thpt: %llu/%llu, "
1048*3247Sgjelinek 	    "%llu scans over %llu ms\n",
1049*3247Sgjelinek 	    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
1050*3247Sgjelinek 	    lcol->lcol_name,
10510Sstevel@tonic-gate 	    DELTA(lcols_pg_eff), DELTA(lcols_pg_att),
10520Sstevel@tonic-gate 	    DELTA(lcols_scan_ineffective), DELTA(lcols_scan),
10530Sstevel@tonic-gate 	    DELTA(lcols_unenforced_cap), DELTA(lcols_rss_sample),
1054*3247Sgjelinek 	    (unsigned long long)lcol->lcol_stat.lcols_min_rss,
1055*3247Sgjelinek 	    (unsigned long long)lcol->lcol_stat.lcols_max_rss,
10560Sstevel@tonic-gate 	    (unsigned long long)lcol->lcol_rss_cap,
10570Sstevel@tonic-gate 	    (unsigned long long)(lcol->lcol_stat.lcols_proc_in -
10580Sstevel@tonic-gate 	    lcol->lcol_stat.lcols_proc_out), DELTA(lcols_proc_out),
10590Sstevel@tonic-gate 	    DELTA(lcols_scan_count), DELTA(lcols_scan_time_complete) / (NANOSEC
10600Sstevel@tonic-gate 	    / MILLISEC));
10610Sstevel@tonic-gate 
10620Sstevel@tonic-gate #undef DELTA
10630Sstevel@tonic-gate 
10640Sstevel@tonic-gate 	return (0);
10650Sstevel@tonic-gate }
10660Sstevel@tonic-gate 
10670Sstevel@tonic-gate /*
10680Sstevel@tonic-gate  * Record each collection's interval statistics in the statistics file.
10690Sstevel@tonic-gate  */
10700Sstevel@tonic-gate static int
10710Sstevel@tonic-gate report_collection_cb(lcollection_t *lcol, void *arg)
10720Sstevel@tonic-gate {
10730Sstevel@tonic-gate 	lcollection_report_t dc;
10740Sstevel@tonic-gate 	int fd = (intptr_t)arg;
10750Sstevel@tonic-gate 
10760Sstevel@tonic-gate 	/*
10770Sstevel@tonic-gate 	 * Copy the relevant fields to the collection's record.
10780Sstevel@tonic-gate 	 */
10790Sstevel@tonic-gate 	bzero(&dc, sizeof (dc));
10800Sstevel@tonic-gate 	dc.lcol_id = lcol->lcol_id;
10810Sstevel@tonic-gate 	(void) strcpy(dc.lcol_name, lcol->lcol_name);
10820Sstevel@tonic-gate 	dc.lcol_rss = lcol->lcol_rss;
10830Sstevel@tonic-gate 	dc.lcol_image_size = lcol->lcol_image_size;
10840Sstevel@tonic-gate 	dc.lcol_rss_cap = lcol->lcol_rss_cap;
10850Sstevel@tonic-gate 	dc.lcol_stat = lcol->lcol_stat;
10860Sstevel@tonic-gate 
10870Sstevel@tonic-gate 	if (write(fd, &dc, sizeof (dc)) == sizeof (dc)) {
1088*3247Sgjelinek 		lcol->lcol_stat_old = lcol->lcol_stat;
10890Sstevel@tonic-gate 	} else {
1090*3247Sgjelinek 		debug("can't write %s %s statistics",
1091*3247Sgjelinek 		    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
1092*3247Sgjelinek 		    "project" : "zone"),
10930Sstevel@tonic-gate 		    lcol->lcol_name);
10940Sstevel@tonic-gate 	}
10950Sstevel@tonic-gate 
10960Sstevel@tonic-gate 	return (0);
10970Sstevel@tonic-gate }
10980Sstevel@tonic-gate 
10990Sstevel@tonic-gate /*
11000Sstevel@tonic-gate  * Determine the count of pages scanned by the global page scanner, obtained
11010Sstevel@tonic-gate  * from the cpu_stat:*::scan kstats.  Return zero on success.
11020Sstevel@tonic-gate  */
11030Sstevel@tonic-gate static int
11040Sstevel@tonic-gate get_globally_scanned_pages(uint64_t *scannedp)
11050Sstevel@tonic-gate {
11060Sstevel@tonic-gate 	kstat_t *ksp;
11070Sstevel@tonic-gate 	uint64_t scanned = 0;
11080Sstevel@tonic-gate 
11090Sstevel@tonic-gate 	if (kstat_chain_update(kctl) == -1) {
11100Sstevel@tonic-gate 		warn(gettext("can't update kstat chain"));
11110Sstevel@tonic-gate 		return (0);
11120Sstevel@tonic-gate 	}
11130Sstevel@tonic-gate 
11140Sstevel@tonic-gate 	for (ksp = kctl->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
11150Sstevel@tonic-gate 		if (strcmp(ksp->ks_module, "cpu_stat") == 0) {
11160Sstevel@tonic-gate 			if (kstat_read(kctl, ksp, NULL) != -1) {
11170Sstevel@tonic-gate 				scanned += ((cpu_stat_t *)
11180Sstevel@tonic-gate 				    ksp->ks_data)->cpu_vminfo.scan;
1119*3247Sgjelinek 			} else {
11200Sstevel@tonic-gate 				return (-1);
1121*3247Sgjelinek 			}
11220Sstevel@tonic-gate 		}
11230Sstevel@tonic-gate 	}
11240Sstevel@tonic-gate 
11250Sstevel@tonic-gate 	*scannedp = scanned;
11260Sstevel@tonic-gate 	return (0);
11270Sstevel@tonic-gate }
11280Sstevel@tonic-gate 
11290Sstevel@tonic-gate /*
1130*3247Sgjelinek  * Determine if the global page scanner is running, during which no memory
1131*3247Sgjelinek  * caps should be enforced, to prevent interference with the global page
1132*3247Sgjelinek  * scanner.
1133*3247Sgjelinek  */
1134*3247Sgjelinek static boolean_t
1135*3247Sgjelinek is_global_scanner_running()
1136*3247Sgjelinek {
1137*3247Sgjelinek 	/* measure delta in page scan count */
1138*3247Sgjelinek 	static uint64_t new_sp = 0;
1139*3247Sgjelinek 	static uint64_t old_sp = 0;
1140*3247Sgjelinek 	boolean_t res = B_FALSE;
1141*3247Sgjelinek 
1142*3247Sgjelinek 	if (get_globally_scanned_pages(&new_sp) == 0) {
1143*3247Sgjelinek 		if (old_sp != 0 && (new_sp - old_sp) > 0) {
1144*3247Sgjelinek 			debug("global memory pressure detected (%llu "
1145*3247Sgjelinek 			    "pages scanned since last interval)\n",
1146*3247Sgjelinek 			    (unsigned long long)(new_sp - old_sp));
1147*3247Sgjelinek 			res = B_TRUE;
1148*3247Sgjelinek 		}
1149*3247Sgjelinek 		old_sp = new_sp;
1150*3247Sgjelinek 	} else {
1151*3247Sgjelinek 		warn(gettext("unable to read cpu statistics"));
1152*3247Sgjelinek 		new_sp = old_sp;
1153*3247Sgjelinek 	}
1154*3247Sgjelinek 
1155*3247Sgjelinek 	return (res);
1156*3247Sgjelinek }
1157*3247Sgjelinek 
1158*3247Sgjelinek /*
1159*3247Sgjelinek  * If soft caps are in use, determine if global memory pressure exceeds the
1160*3247Sgjelinek  * configured maximum above which soft caps are enforced.
1161*3247Sgjelinek  */
1162*3247Sgjelinek static boolean_t
1163*3247Sgjelinek must_enforce_soft_caps()
1164*3247Sgjelinek {
1165*3247Sgjelinek 	/*
1166*3247Sgjelinek 	 * Check for changes to the amount of installed physical memory, to
1167*3247Sgjelinek 	 * compute the current memory pressure.
1168*3247Sgjelinek 	 */
1169*3247Sgjelinek 	update_phys_total();
1170*3247Sgjelinek 
1171*3247Sgjelinek 	memory_pressure = 100 - (int)((sysconf(_SC_AVPHYS_PAGES) * page_size_kb)
1172*3247Sgjelinek 	    * 100.0 / phys_total);
1173*3247Sgjelinek 	memory_pressure_sample++;
1174*3247Sgjelinek 	if (rcfg.rcfg_memory_cap_enforcement_pressure > 0 &&
1175*3247Sgjelinek 	    memory_pressure > rcfg.rcfg_memory_cap_enforcement_pressure) {
1176*3247Sgjelinek 		return (B_TRUE);
1177*3247Sgjelinek 	}
1178*3247Sgjelinek 
1179*3247Sgjelinek 	return (B_FALSE);
1180*3247Sgjelinek }
1181*3247Sgjelinek 
1182*3247Sgjelinek /*
11830Sstevel@tonic-gate  * Update the shared statistics file with each collection's current statistics.
11840Sstevel@tonic-gate  * Return zero on success.
11850Sstevel@tonic-gate  */
11860Sstevel@tonic-gate static int
11870Sstevel@tonic-gate update_statistics(void)
11880Sstevel@tonic-gate {
11890Sstevel@tonic-gate 	int fd, res;
11900Sstevel@tonic-gate 	static char template[LINELEN];
11910Sstevel@tonic-gate 
11920Sstevel@tonic-gate 	/*
1193442Sgm149974 	 * Try to create a directory irrespective of whether it is existing
1194442Sgm149974 	 * or not. If it is not there then it will create. Otherwise any way
1195442Sgm149974 	 * it will fail at mkstemp call below.
1196442Sgm149974 	 */
1197442Sgm149974 	(void) mkdir(STAT_FILE_DIR, 0755);
1198442Sgm149974 
1199442Sgm149974 	/*
12000Sstevel@tonic-gate 	 * Create a temporary file.
12010Sstevel@tonic-gate 	 */
12020Sstevel@tonic-gate 	if (sizeof (template) < (strlen(rcfg.rcfg_stat_file) +
12030Sstevel@tonic-gate 	    strlen(STAT_TEMPLATE_SUFFIX) + 1)) {
12040Sstevel@tonic-gate 		debug("temporary file template size too small\n");
12050Sstevel@tonic-gate 		return (-1);
12060Sstevel@tonic-gate 	}
12070Sstevel@tonic-gate 	(void) strcpy(template, rcfg.rcfg_stat_file);
12080Sstevel@tonic-gate 	(void) strcat(template, STAT_TEMPLATE_SUFFIX);
12090Sstevel@tonic-gate 	(void) rfd_reserve(1);
12100Sstevel@tonic-gate 	fd = mkstemp(template);
12110Sstevel@tonic-gate 
12120Sstevel@tonic-gate 	/*
12130Sstevel@tonic-gate 	 * Write the header and per-collection statistics.
12140Sstevel@tonic-gate 	 */
12150Sstevel@tonic-gate 	if (fd >= 0) {
12160Sstevel@tonic-gate 		rcapd_stat_hdr_t rs;
12170Sstevel@tonic-gate 
12180Sstevel@tonic-gate 		rs.rs_pid = rcapd_pid;
12190Sstevel@tonic-gate 		rs.rs_time = gethrtime();
12200Sstevel@tonic-gate 		ASSERT(sizeof (rs.rs_mode) > strlen(rcfg.rcfg_mode_name));
12210Sstevel@tonic-gate 		(void) strcpy(rs.rs_mode, rcfg.rcfg_mode_name);
12220Sstevel@tonic-gate 		rs.rs_pressure_cur = memory_pressure;
12230Sstevel@tonic-gate 		rs.rs_pressure_cap = rcfg.rcfg_memory_cap_enforcement_pressure;
12240Sstevel@tonic-gate 		rs.rs_pressure_sample = memory_pressure_sample;
12250Sstevel@tonic-gate 
12260Sstevel@tonic-gate 		if (fchmod(fd, 0644) == 0 && write(fd, &rs, sizeof (rs)) ==
12270Sstevel@tonic-gate 		    sizeof (rs)) {
12280Sstevel@tonic-gate 			list_walk_collection(report_collection_cb,
12290Sstevel@tonic-gate 				(void *)(intptr_t)fd);
12300Sstevel@tonic-gate 			/*
12310Sstevel@tonic-gate 			 * Replace the existing statistics file with this new
12320Sstevel@tonic-gate 			 * one.
12330Sstevel@tonic-gate 			 */
12340Sstevel@tonic-gate 			res = rename(template, rcfg.rcfg_stat_file);
12350Sstevel@tonic-gate 		} else
12360Sstevel@tonic-gate 			res = -1;
12370Sstevel@tonic-gate 		(void) close(fd);
12380Sstevel@tonic-gate 	} else
12390Sstevel@tonic-gate 		res = -1;
12400Sstevel@tonic-gate 
12410Sstevel@tonic-gate 	return (res);
12420Sstevel@tonic-gate }
12430Sstevel@tonic-gate 
12440Sstevel@tonic-gate /*
12450Sstevel@tonic-gate  * Verify the statistics file can be created and written to, and die if an
12460Sstevel@tonic-gate  * existing file may be in use by another rcapd.
12470Sstevel@tonic-gate  */
12480Sstevel@tonic-gate static int
12490Sstevel@tonic-gate verify_statistics(void)
12500Sstevel@tonic-gate {
12510Sstevel@tonic-gate 	pid_t pid;
12520Sstevel@tonic-gate 
12530Sstevel@tonic-gate 	/*
12540Sstevel@tonic-gate 	 * Warn if another instance of rcapd might be active.
12550Sstevel@tonic-gate 	 */
12560Sstevel@tonic-gate 	(void) rfd_reserve(1);
12570Sstevel@tonic-gate 	pid = stat_get_rcapd_pid(rcfg.rcfg_stat_file);
12580Sstevel@tonic-gate 	if (pid != rcapd_pid && pid != -1)
12590Sstevel@tonic-gate 		die(gettext("%s exists; rcapd may already be active\n"),
12600Sstevel@tonic-gate 		    rcfg.rcfg_stat_file);
12610Sstevel@tonic-gate 
12620Sstevel@tonic-gate 	return (update_statistics());
12630Sstevel@tonic-gate }
12640Sstevel@tonic-gate 
12650Sstevel@tonic-gate static int
12660Sstevel@tonic-gate sum_excess_cb(lcollection_t *lcol, void *arg)
12670Sstevel@tonic-gate {
12680Sstevel@tonic-gate 	uint64_t *sum_excess = arg;
12690Sstevel@tonic-gate 
12700Sstevel@tonic-gate 	*sum_excess += MAX((int64_t)0, (int64_t)(lcol->lcol_rss -
12710Sstevel@tonic-gate 	    lcol->lcol_rss_cap));
12720Sstevel@tonic-gate 	return (0);
12730Sstevel@tonic-gate }
12740Sstevel@tonic-gate 
1275*3247Sgjelinek /*
1276*3247Sgjelinek  * Compute the quantity of memory (in kilobytes) above the cap enforcement
1277*3247Sgjelinek  * pressure.  Set the scan goal to that quantity (or at most the excess).
1278*3247Sgjelinek  */
1279*3247Sgjelinek static void
1280*3247Sgjelinek compute_soft_scan_goal(soft_scan_arg_t *argp)
1281*3247Sgjelinek {
1282*3247Sgjelinek 	/*
1283*3247Sgjelinek 	 * Compute the sum of the collections' excesses, which will be the
1284*3247Sgjelinek 	 * denominator.
1285*3247Sgjelinek 	 */
1286*3247Sgjelinek 	argp->ssa_sum_excess = 0;
1287*3247Sgjelinek 	list_walk_collection(sum_excess_cb, &(argp->ssa_sum_excess));
1288*3247Sgjelinek 
1289*3247Sgjelinek 	argp->ssa_scan_goal = MIN((sysconf(_SC_PHYS_PAGES) *
1290*3247Sgjelinek 	    (100 - rcfg.rcfg_memory_cap_enforcement_pressure) / 100 -
1291*3247Sgjelinek 	    sysconf(_SC_AVPHYS_PAGES)) * page_size_kb,
1292*3247Sgjelinek 	    argp->ssa_sum_excess);
1293*3247Sgjelinek }
1294*3247Sgjelinek 
/*
 * Emit the (localized) command-line usage message through info().
 */
static void
rcapd_usage(void)
{
	info(gettext("usage: rcapd [-d]\n"));
}
13000Sstevel@tonic-gate 
13010Sstevel@tonic-gate void
13020Sstevel@tonic-gate check_update_statistics(void)
13030Sstevel@tonic-gate {
13040Sstevel@tonic-gate 	hrtime_t now = gethrtime();
13050Sstevel@tonic-gate 
13060Sstevel@tonic-gate 	if (EVENT_TIME(now, next_report)) {
13070Sstevel@tonic-gate 		debug("updating statistics...\n");
13080Sstevel@tonic-gate 		list_walk_collection(simple_report_collection_cb, NULL);
13090Sstevel@tonic-gate 		if (update_statistics() != 0)
13100Sstevel@tonic-gate 			debug("couldn't update statistics");
13110Sstevel@tonic-gate 		next_report = NEXT_REPORT_EVENT_TIME(now,
13120Sstevel@tonic-gate 		    rcfg.rcfg_report_interval);
13130Sstevel@tonic-gate 	}
13140Sstevel@tonic-gate }
13150Sstevel@tonic-gate 
13160Sstevel@tonic-gate static void
13170Sstevel@tonic-gate verify_and_set_privileges(void)
13180Sstevel@tonic-gate {
13190Sstevel@tonic-gate 	priv_set_t *required =
13200Sstevel@tonic-gate 	    priv_str_to_set("zone,sys_resource,proc_owner", ",", NULL);
13210Sstevel@tonic-gate 
13220Sstevel@tonic-gate 	/*
13230Sstevel@tonic-gate 	 * Ensure the required privileges, suitable for controlling processes,
13240Sstevel@tonic-gate 	 * are possessed.
13250Sstevel@tonic-gate 	 */
13260Sstevel@tonic-gate 	if (setppriv(PRIV_SET, PRIV_PERMITTED, required) != 0 || setppriv(
13270Sstevel@tonic-gate 	    PRIV_SET, PRIV_EFFECTIVE, required) != 0)
13280Sstevel@tonic-gate 		die(gettext("can't set requisite privileges"));
13290Sstevel@tonic-gate 
13300Sstevel@tonic-gate 	/*
13310Sstevel@tonic-gate 	 * Ensure access to /var/run/daemon.
13320Sstevel@tonic-gate 	 */
13330Sstevel@tonic-gate 	if (setreuid(DAEMON_UID, DAEMON_UID) != 0)
13340Sstevel@tonic-gate 		die(gettext("cannot become user daemon"));
13350Sstevel@tonic-gate 
13360Sstevel@tonic-gate 	priv_freeset(required);
13370Sstevel@tonic-gate }
13380Sstevel@tonic-gate 
1339*3247Sgjelinek /*
1340*3247Sgjelinek  * This function does the top-level work to determine if we should do any
1341*3247Sgjelinek  * memory capping, and if so, it invokes the right call-backs to do the work.
1342*3247Sgjelinek  */
1343*3247Sgjelinek static void
1344*3247Sgjelinek do_capping(hrtime_t now, hrtime_t *next_proc_walk)
1345*3247Sgjelinek {
1346*3247Sgjelinek 	boolean_t enforce_caps;
1347*3247Sgjelinek 	/* soft cap enforcement flag, depending on memory pressure */
1348*3247Sgjelinek 	boolean_t enforce_soft_caps;
1349*3247Sgjelinek 	/* avoid interference with kernel's page scanner */
1350*3247Sgjelinek 	boolean_t global_scanner_running;
1351*3247Sgjelinek 	sample_col_arg_t col_arg;
1352*3247Sgjelinek 	soft_scan_arg_t arg;
1353*3247Sgjelinek 	uint_t col_types = 0;
1354*3247Sgjelinek 
1355*3247Sgjelinek 	/* check what kind of collections (project/zone) are capped */
1356*3247Sgjelinek 	list_walk_collection(col_type_cb, &col_types);
1357*3247Sgjelinek 	debug("collection types: 0x%x\n", col_types);
1358*3247Sgjelinek 
1359*3247Sgjelinek 	/* no capped collections, skip checking rss */
1360*3247Sgjelinek 	if (col_types == 0)
1361*3247Sgjelinek 		return;
1362*3247Sgjelinek 
1363*3247Sgjelinek 	/* Determine if soft caps are enforced. */
1364*3247Sgjelinek 	enforce_soft_caps = must_enforce_soft_caps();
1365*3247Sgjelinek 
1366*3247Sgjelinek 	/* Determine if the global page scanner is running. */
1367*3247Sgjelinek 	global_scanner_running = is_global_scanner_running();
1368*3247Sgjelinek 
1369*3247Sgjelinek 	/*
1370*3247Sgjelinek 	 * Sample collections' member processes RSSes and recompute
1371*3247Sgjelinek 	 * collections' excess.
1372*3247Sgjelinek 	 */
1373*3247Sgjelinek 	rss_sample(B_FALSE, col_types);
1374*3247Sgjelinek 
1375*3247Sgjelinek 	col_arg.sca_any_over_cap = B_FALSE;
1376*3247Sgjelinek 	col_arg.sca_project_over_cap = B_FALSE;
1377*3247Sgjelinek 	list_walk_collection(rss_sample_col_cb, &col_arg);
1378*3247Sgjelinek 	list_walk_collection(excess_print_cb, NULL);
1379*3247Sgjelinek 	debug("any collection/project over cap = %d, %d\n",
1380*3247Sgjelinek 	    col_arg.sca_any_over_cap, col_arg.sca_project_over_cap);
1381*3247Sgjelinek 
1382*3247Sgjelinek 	if (enforce_soft_caps)
1383*3247Sgjelinek 		debug("memory pressure %d%%\n", memory_pressure);
1384*3247Sgjelinek 
1385*3247Sgjelinek 	/*
1386*3247Sgjelinek 	 * Cap enforcement is determined by the previous conditions.
1387*3247Sgjelinek 	 */
1388*3247Sgjelinek 	enforce_caps = !global_scanner_running && col_arg.sca_any_over_cap &&
1389*3247Sgjelinek 	    (rcfg.rcfg_memory_cap_enforcement_pressure == 0 ||
1390*3247Sgjelinek 	    enforce_soft_caps);
1391*3247Sgjelinek 
1392*3247Sgjelinek 	debug("%senforcing caps\n", enforce_caps ? "" : "not ");
1393*3247Sgjelinek 
1394*3247Sgjelinek 	/*
1395*3247Sgjelinek 	 * If soft caps are in use, determine the size of the portion from each
1396*3247Sgjelinek 	 * collection to scan for.
1397*3247Sgjelinek 	 */
1398*3247Sgjelinek 	if (enforce_caps && enforce_soft_caps)
1399*3247Sgjelinek 		compute_soft_scan_goal(&arg);
1400*3247Sgjelinek 
1401*3247Sgjelinek 	/*
1402*3247Sgjelinek 	 * Victimize offending collections.
1403*3247Sgjelinek 	 */
1404*3247Sgjelinek 	if (enforce_caps && (!enforce_soft_caps ||
1405*3247Sgjelinek 	    (arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0))) {
1406*3247Sgjelinek 
1407*3247Sgjelinek 		/*
1408*3247Sgjelinek 		 * Since at least one collection is over its cap & needs
1409*3247Sgjelinek 		 * enforcing, check if it is at least time for a process walk
1410*3247Sgjelinek 		 * (we could be well past time since we only walk /proc when
1411*3247Sgjelinek 		 * we need to) and if so, update each collections process list
1412*3247Sgjelinek 		 * in a single pass through /proc.
1413*3247Sgjelinek 		 */
1414*3247Sgjelinek 		if (EVENT_TIME(now, *next_proc_walk)) {
1415*3247Sgjelinek 			debug("scanning process list...\n");
1416*3247Sgjelinek 			proc_walk_all(proc_cb);		 /* insert & mark */
1417*3247Sgjelinek 			list_walk_all(sweep_process_cb); /* free dead procs */
1418*3247Sgjelinek 			*next_proc_walk = NEXT_EVENT_TIME(now,
1419*3247Sgjelinek 			    rcfg.rcfg_proc_walk_interval);
1420*3247Sgjelinek 		}
1421*3247Sgjelinek 
1422*3247Sgjelinek 		gz_col = NULL;
1423*3247Sgjelinek 		if (enforce_soft_caps) {
1424*3247Sgjelinek 			debug("scan goal is %lldKB\n",
1425*3247Sgjelinek 			    (long long)arg.ssa_scan_goal);
1426*3247Sgjelinek 			list_walk_collection(soft_scan_cb, &arg);
1427*3247Sgjelinek 			if (gz_capped && gz_col != NULL) {
1428*3247Sgjelinek 				/* process global zone */
1429*3247Sgjelinek 				arg.ssa_project_over_cap =
1430*3247Sgjelinek 				    col_arg.sca_project_over_cap;
1431*3247Sgjelinek 				soft_scan_gz(gz_col, &arg);
1432*3247Sgjelinek 			}
1433*3247Sgjelinek 		} else {
1434*3247Sgjelinek 			list_walk_collection(scan_cb, NULL);
1435*3247Sgjelinek 			if (gz_capped && gz_col != NULL) {
1436*3247Sgjelinek 				/* process global zone */
1437*3247Sgjelinek 				scan_gz(gz_col, col_arg.sca_project_over_cap);
1438*3247Sgjelinek 			}
1439*3247Sgjelinek 		}
1440*3247Sgjelinek 	} else if (col_arg.sca_any_over_cap) {
1441*3247Sgjelinek 		list_walk_collection(unenforced_cap_cb, NULL);
1442*3247Sgjelinek 	}
1443*3247Sgjelinek }
1444*3247Sgjelinek 
14450Sstevel@tonic-gate int
14460Sstevel@tonic-gate main(int argc, char *argv[])
14470Sstevel@tonic-gate {
14480Sstevel@tonic-gate 	int res;
14490Sstevel@tonic-gate 	int should_fork = 1;	/* fork flag */
14500Sstevel@tonic-gate 	hrtime_t now;		/* current time */
14510Sstevel@tonic-gate 	hrtime_t next;		/* time of next event */
14520Sstevel@tonic-gate 	int sig;		/* signal iteration */
14530Sstevel@tonic-gate 	struct rlimit rl;
14540Sstevel@tonic-gate 	hrtime_t next_proc_walk;	/* time of next /proc scan */
14550Sstevel@tonic-gate 	hrtime_t next_configuration;	/* time of next configuration */
14560Sstevel@tonic-gate 	hrtime_t next_rss_sample;	/* (latest) time of next RSS sample */
14570Sstevel@tonic-gate 
14580Sstevel@tonic-gate 	(void) set_message_priority(RCM_INFO);
14590Sstevel@tonic-gate 	(void) setprogname("rcapd");
14600Sstevel@tonic-gate 	rcapd_pid = getpid();
14610Sstevel@tonic-gate 	(void) chdir("/");
14620Sstevel@tonic-gate 	should_run = 1;
14630Sstevel@tonic-gate 	ever_ran = 0;
14640Sstevel@tonic-gate 
14650Sstevel@tonic-gate 	(void) setlocale(LC_ALL, "");
14660Sstevel@tonic-gate 	(void) textdomain(TEXT_DOMAIN);
14670Sstevel@tonic-gate 
14680Sstevel@tonic-gate 	/*
14690Sstevel@tonic-gate 	 * Parse command-line options.
14700Sstevel@tonic-gate 	 */
14710Sstevel@tonic-gate 	while ((res = getopt(argc, argv, "dF")) > 0)
14720Sstevel@tonic-gate 		switch (res) {
14730Sstevel@tonic-gate 		case 'd':
14740Sstevel@tonic-gate 			should_fork = 0;
14750Sstevel@tonic-gate 			if (debug_mode == 0) {
14760Sstevel@tonic-gate 				debug_mode = 1;
14770Sstevel@tonic-gate 				(void) set_message_priority(RCM_DEBUG);
14780Sstevel@tonic-gate 			} else
14790Sstevel@tonic-gate 				(void) set_message_priority(RCM_DEBUG_HIGH);
14800Sstevel@tonic-gate 			break;
14810Sstevel@tonic-gate 		case 'F':
14820Sstevel@tonic-gate 			should_fork = 0;
14830Sstevel@tonic-gate 			break;
14840Sstevel@tonic-gate 		default:
14850Sstevel@tonic-gate 			rcapd_usage();
14860Sstevel@tonic-gate 			return (E_USAGE);
14870Sstevel@tonic-gate 			/*NOTREACHED*/
14880Sstevel@tonic-gate 		}
14890Sstevel@tonic-gate 
14900Sstevel@tonic-gate 	/*
14910Sstevel@tonic-gate 	 * If not debugging, fork and continue operating, changing the
14920Sstevel@tonic-gate 	 * destination of messages to syslog().
14930Sstevel@tonic-gate 	 */
14940Sstevel@tonic-gate 	if (should_fork == 1) {
14950Sstevel@tonic-gate 		pid_t child;
14960Sstevel@tonic-gate 		debug("forking\n");
14970Sstevel@tonic-gate 		child = fork();
14980Sstevel@tonic-gate 		if (child == -1)
14990Sstevel@tonic-gate 			die(gettext("cannot fork"));
15000Sstevel@tonic-gate 		if (child > 0)
15010Sstevel@tonic-gate 			return (0);
15020Sstevel@tonic-gate 		else {
15030Sstevel@tonic-gate 			rcapd_pid = getpid();
15040Sstevel@tonic-gate 			(void) set_message_destination(RCD_SYSLOG);
15050Sstevel@tonic-gate 			(void) fclose(stdin);
15060Sstevel@tonic-gate 			(void) fclose(stdout);
15070Sstevel@tonic-gate 			(void) fclose(stderr);
15080Sstevel@tonic-gate 		}
15090Sstevel@tonic-gate 		/*
15100Sstevel@tonic-gate 		 * Start a new session and detatch from the controlling tty.
15110Sstevel@tonic-gate 		 */
15120Sstevel@tonic-gate 		if (setsid() == (pid_t)-1)
15130Sstevel@tonic-gate 			debug(gettext("setsid() failed; cannot detach from "
15140Sstevel@tonic-gate 			    "terminal"));
15150Sstevel@tonic-gate 	}
15160Sstevel@tonic-gate 
15170Sstevel@tonic-gate 	/*
15180Sstevel@tonic-gate 	 * Read the configuration file.
15190Sstevel@tonic-gate 	 */
15200Sstevel@tonic-gate 	if (rcfg_read(RCAPD_DEFAULT_CONF_FILE, -1, &rcfg, verify_statistics)
15212517Stn143363 	    != 0) {
15222517Stn143363 		/*
15232517Stn143363 		 * A configuration file may not exist if rcapd is started
15242517Stn143363 		 * by enabling the smf rcap service, so attempt to create
15252517Stn143363 		 * a default file.
15262517Stn143363 		 */
15272517Stn143363 		create_config_file(NULL);
15282517Stn143363 
15292517Stn143363 		/*
15302517Stn143363 		 * A real failure if still can't read the
15312517Stn143363 		 * configuration file
15322517Stn143363 		 */
15332517Stn143363 		if (rcfg_read(RCAPD_DEFAULT_CONF_FILE, -1, &rcfg,
15342517Stn143363 		    verify_statistics) != 0)
15352517Stn143363 			die(gettext("resource caps not configured %s"),
15362517Stn143363 			    RCAPD_DEFAULT_CONF_FILE);
15372517Stn143363 	}
15380Sstevel@tonic-gate 	finish_configuration();
15390Sstevel@tonic-gate 	should_reconfigure = 0;
15400Sstevel@tonic-gate 
15410Sstevel@tonic-gate 	/*
15420Sstevel@tonic-gate 	 * Check that required privileges are possessed.
15430Sstevel@tonic-gate 	 */
15440Sstevel@tonic-gate 	verify_and_set_privileges();
15450Sstevel@tonic-gate 
15460Sstevel@tonic-gate 	now = next_report = next_proc_walk = next_rss_sample = gethrtime();
15470Sstevel@tonic-gate 	next_configuration = NEXT_EVENT_TIME(gethrtime(),
15480Sstevel@tonic-gate 	    rcfg.rcfg_reconfiguration_interval);
15490Sstevel@tonic-gate 
15500Sstevel@tonic-gate 	/*
15510Sstevel@tonic-gate 	 * Open the kstat chain.
15520Sstevel@tonic-gate 	 */
15530Sstevel@tonic-gate 	kctl = kstat_open();
15540Sstevel@tonic-gate 	if (kctl == NULL)
15550Sstevel@tonic-gate 		die(gettext("can't open kstats"));
15560Sstevel@tonic-gate 
15570Sstevel@tonic-gate 	/*
15580Sstevel@tonic-gate 	 * Set RLIMIT_NOFILE as high as practical, so roughly 10K processes can
15590Sstevel@tonic-gate 	 * be effectively managed without revoking descriptors (at 3 per
15600Sstevel@tonic-gate 	 * process).
15610Sstevel@tonic-gate 	 */
15620Sstevel@tonic-gate 	rl.rlim_cur = 32 * 1024;
15630Sstevel@tonic-gate 	rl.rlim_max = 32 * 1024;
15640Sstevel@tonic-gate 	if (setrlimit(RLIMIT_NOFILE, &rl) != 0 &&
15650Sstevel@tonic-gate 	    getrlimit(RLIMIT_NOFILE, &rl) == 0) {
15660Sstevel@tonic-gate 		rl.rlim_cur = rl.rlim_max;
15670Sstevel@tonic-gate 		(void) setrlimit(RLIMIT_NOFILE, &rl);
15680Sstevel@tonic-gate 	}
15691914Scasper 	(void) enable_extended_FILE_stdio(-1, -1);
15701914Scasper 
15710Sstevel@tonic-gate 	if (getrlimit(RLIMIT_NOFILE, &rl) == 0)
15720Sstevel@tonic-gate 		debug("fd limit: %lu\n", rl.rlim_cur);
15730Sstevel@tonic-gate 	else
15740Sstevel@tonic-gate 		debug("fd limit: unknown\n");
15750Sstevel@tonic-gate 
1576*3247Sgjelinek 	get_page_size();
1577*3247Sgjelinek 	my_zoneid = getzoneid();
1578*3247Sgjelinek 
15790Sstevel@tonic-gate 	/*
15800Sstevel@tonic-gate 	 * Handle those signals whose (default) exit disposition
15810Sstevel@tonic-gate 	 * prevents rcapd from finishing scanning before terminating.
15820Sstevel@tonic-gate 	 */
15830Sstevel@tonic-gate 	(void) sigset(SIGINT, terminate_signal);
15840Sstevel@tonic-gate 	(void) sigset(SIGQUIT, abort_signal);
15850Sstevel@tonic-gate 	(void) sigset(SIGILL, abort_signal);
15860Sstevel@tonic-gate 	(void) sigset(SIGEMT, abort_signal);
15870Sstevel@tonic-gate 	(void) sigset(SIGFPE, abort_signal);
15880Sstevel@tonic-gate 	(void) sigset(SIGBUS, abort_signal);
15890Sstevel@tonic-gate 	(void) sigset(SIGSEGV, abort_signal);
15900Sstevel@tonic-gate 	(void) sigset(SIGSYS, abort_signal);
15910Sstevel@tonic-gate 	(void) sigset(SIGPIPE, terminate_signal);
15920Sstevel@tonic-gate 	(void) sigset(SIGALRM, terminate_signal);
15930Sstevel@tonic-gate 	(void) sigset(SIGTERM, terminate_signal);
15940Sstevel@tonic-gate 	(void) sigset(SIGUSR1, terminate_signal);
15950Sstevel@tonic-gate 	(void) sigset(SIGUSR2, terminate_signal);
15960Sstevel@tonic-gate 	(void) sigset(SIGPOLL, terminate_signal);
15970Sstevel@tonic-gate 	(void) sigset(SIGVTALRM, terminate_signal);
15980Sstevel@tonic-gate 	(void) sigset(SIGXCPU, abort_signal);
15990Sstevel@tonic-gate 	(void) sigset(SIGXFSZ, abort_signal);
16000Sstevel@tonic-gate 	for (sig = SIGRTMIN; sig <= SIGRTMAX; sig++)
16010Sstevel@tonic-gate 		(void) sigset(sig, terminate_signal);
16020Sstevel@tonic-gate 
16030Sstevel@tonic-gate 	/*
16040Sstevel@tonic-gate 	 * Install a signal handler for reconfiguration processing.
16050Sstevel@tonic-gate 	 */
16060Sstevel@tonic-gate 	(void) sigset(SIGHUP, sighup);
16070Sstevel@tonic-gate 
16080Sstevel@tonic-gate 	/*
16090Sstevel@tonic-gate 	 * Determine which process collections to cap.
16100Sstevel@tonic-gate 	 */
16110Sstevel@tonic-gate 	lcollection_update(LCU_COMPLETE);
16120Sstevel@tonic-gate 
16130Sstevel@tonic-gate 	/*
16140Sstevel@tonic-gate 	 * Loop forever, monitoring collections' resident set sizes and
1615*3247Sgjelinek 	 * enforcing their caps.  Look for changes in caps as well as
1616*3247Sgjelinek 	 * responding to requests to reread the configuration.  Update
1617*3247Sgjelinek 	 * per-collection statistics periodically.
16180Sstevel@tonic-gate 	 */
16190Sstevel@tonic-gate 	while (should_run != 0) {
16200Sstevel@tonic-gate 		struct timespec ts;
16210Sstevel@tonic-gate 
16220Sstevel@tonic-gate 		/*
16230Sstevel@tonic-gate 		 * Announce that rcapd is starting.
16240Sstevel@tonic-gate 		 */
16250Sstevel@tonic-gate 		if (ever_ran == 0) {
16260Sstevel@tonic-gate 			info(gettext("starting\n"));
16270Sstevel@tonic-gate 			ever_ran = 1;
16280Sstevel@tonic-gate 		}
16290Sstevel@tonic-gate 
16300Sstevel@tonic-gate 		/*
1631*3247Sgjelinek 		 * Check the configuration at every next_configuration interval.
1632*3247Sgjelinek 		 * Update the rss data once every next_rss_sample interval.
1633*3247Sgjelinek 		 * The condition of global memory pressure is also checked at
1634*3247Sgjelinek 		 * the same frequency, if strict caps are in use.
16350Sstevel@tonic-gate 		 */
16360Sstevel@tonic-gate 		now = gethrtime();
16370Sstevel@tonic-gate 
16380Sstevel@tonic-gate 		/*
16390Sstevel@tonic-gate 		 * Detect configuration and cap changes at every
16400Sstevel@tonic-gate 		 * reconfiguration_interval, or when SIGHUP has been received.
16410Sstevel@tonic-gate 		 */
16420Sstevel@tonic-gate 		if (EVENT_TIME(now, next_configuration) ||
16430Sstevel@tonic-gate 		    should_reconfigure == 1) {
1644*3247Sgjelinek 			reconfigure(now, &next_configuration, &next_proc_walk,
1645*3247Sgjelinek 			    &next_rss_sample);
16460Sstevel@tonic-gate 			should_reconfigure = 0;
16470Sstevel@tonic-gate 		}
16480Sstevel@tonic-gate 
1649*3247Sgjelinek 		/*
1650*3247Sgjelinek 		 * Do the main work for enforcing caps.
1651*3247Sgjelinek 		 */
16520Sstevel@tonic-gate 		if (EVENT_TIME(now, next_rss_sample)) {
1653*3247Sgjelinek 			do_capping(now, &next_proc_walk);
16540Sstevel@tonic-gate 
16550Sstevel@tonic-gate 			next_rss_sample = NEXT_EVENT_TIME(now,
16560Sstevel@tonic-gate 			    rcfg.rcfg_rss_sample_interval);
16570Sstevel@tonic-gate 		}
16580Sstevel@tonic-gate 
16590Sstevel@tonic-gate 		/*
16600Sstevel@tonic-gate 		 * Update the statistics file, if it's time.
16610Sstevel@tonic-gate 		 */
16620Sstevel@tonic-gate 		check_update_statistics();
16630Sstevel@tonic-gate 
16640Sstevel@tonic-gate 		/*
16650Sstevel@tonic-gate 		 * Sleep for some time before repeating.
16660Sstevel@tonic-gate 		 */
16670Sstevel@tonic-gate 		now = gethrtime();
16680Sstevel@tonic-gate 		next = next_configuration;
16690Sstevel@tonic-gate 		next = POSITIVE_MIN(next, next_report);
16700Sstevel@tonic-gate 		next = POSITIVE_MIN(next, next_rss_sample);
16710Sstevel@tonic-gate 		if (next > now && should_run != 0) {
16720Sstevel@tonic-gate 			debug("sleeping %-4.2f seconds\n", (float)(next -
16730Sstevel@tonic-gate 			    now) / (float)NANOSEC);
16740Sstevel@tonic-gate 			hrt2ts(next - now, &ts);
16750Sstevel@tonic-gate 			(void) nanosleep(&ts, NULL);
16760Sstevel@tonic-gate 		}
16770Sstevel@tonic-gate 	}
16780Sstevel@tonic-gate 	if (termination_signal != 0)
16790Sstevel@tonic-gate 		debug("exiting due to signal %d\n", termination_signal);
16800Sstevel@tonic-gate 	if (ever_ran != 0)
16810Sstevel@tonic-gate 		info(gettext("exiting\n"));
16820Sstevel@tonic-gate 
16830Sstevel@tonic-gate 	/*
16840Sstevel@tonic-gate 	 * Unlink the statistics file before exiting.
16850Sstevel@tonic-gate 	 */
16860Sstevel@tonic-gate 	if (rcfg.rcfg_stat_file[0] != 0)
16870Sstevel@tonic-gate 		(void) unlink(rcfg.rcfg_stat_file);
16880Sstevel@tonic-gate 
16890Sstevel@tonic-gate 	return (E_SUCCESS);
16900Sstevel@tonic-gate }
1691