/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/mman.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <libproc.h>
#include <limits.h>
#include <procfs.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <time.h>
#include <unistd.h>
#include "rcapd.h"
#include "rcapd_rfd.h"
#include "rcapd_mapping.h"
#include "utils.h"

static int lpc_xmap_update(lprocess_t *);
#ifdef DEBUG
extern int lmapping_dump_diff(lmapping_t *lm1, lmapping_t *lm2);
#endif /* DEBUG */

/*
 * The number of file descriptors required to grab a process and create an
 * agent in it.
 */
#define	PGRAB_FD_COUNT	10

/*
 * Record a position in an address space as it corresponds to a prpageheader_t
 * and affiliated structures.
 */
typedef struct prpageheader_cur {
	int pr_nmap;		/* number of mappings in address space */
	int pr_map;		/* number of this mapping */
	uint64_t pr_pgoff;	/* page offset into mapping */
	uint64_t pr_npage;	/* number of pages in mapping */
	uint64_t pr_pagesize;	/* page size of mapping */
	uintptr_t pr_addr;	/* base of mapping */
	prpageheader_t *pr_prpageheader;	/* associated page header */
	void *pr_pdaddr;	/* address of page's byte in pagedata */
	prxmap_t *pr_xmap;	/* array containing per-segment information */
	int pr_nxmap;		/* number of xmaps in array */
	int64_t pr_rss;		/* number of resident pages in mapping, */
				/* or -1 if xmap is out of sync */
	int64_t pr_pg_rss;	/* number of pageable pages in mapping, or -1 */
} prpageheader_cur_t;

static struct ps_prochandle *scan_pr;	/* currently-scanned process's handle */

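/*
 * Verbosity levels for scanning-related debug messages; see st_debug().
 */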
typedef enum {
	STDL_NORMAL,
	STDL_HIGH
} st_debug_level_t;

/*
 * Output a scanning-related debug message.
 */
/*PRINTFLIKE3*/ /*ARGSUSED*/
static void
st_debug(st_debug_level_t level, lcollection_t *lcol, char *msg, ...)
{
#ifdef DEBUG_MSG
	va_list alist;
	char *buf;
	size_t len;

	if (get_message_priority() < ((level == STDL_HIGH) ? RCM_DEBUG_HIGH
	    : RCM_DEBUG))
		return;

	len = strlen(msg) + LINELEN;
	buf = malloc(len);
	if (buf == NULL)
		return;
	(void) snprintf(buf, len, "%s %s scanner %s",
	    (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
	    lcol->lcol_name, msg);

	va_start(alist, msg);
	vdprintfe(RCM_DEBUG, buf, alist);
	va_end(alist);

	free(buf);
#endif /* DEBUG_MSG */
}

/*
 * Determine the collection's current victim, based on its last.  The last
 * victim is returned if it is still a valid member of the collection;
 * otherwise, any other valid (scannable) process is returned, if the
 * collection has one.
 */
static lprocess_t *
get_valid_victim(lcollection_t *lcol, lprocess_t *lpc)
{
	if (lpc == NULL || !lcollection_member(lcol, lpc))
		lpc = lcol->lcol_lprocess;

	/*
	 * Find the next scannable process, and make it the victim.
	 */
	while (lpc != NULL && lpc->lpc_unscannable != 0)
		lpc = lpc->lpc_next;

	return (lpc);
}

/*
 * Get a process's combined current pagedata (per-page referenced and modified
 * bits) and set the supplied pointer to it.  The caller is responsible for
 * freeing the data.  If the pagedata is unreadable, a nonzero value is
 * returned, and errno is set.  Otherwise, 0 is returned.
 */
static int
get_pagedata(prpageheader_t **pghpp, int fd)
{
	int res;
	struct stat st;

redo:
	errno = 0;
	if (fstat(fd, &st) != 0) {
		debug("cannot stat pagedata\n");
		return (-1);
	}

	errno = 0;
	*pghpp = malloc(st.st_size);
	if (*pghpp == NULL) {
		debug("cannot malloc() %ld bytes for pagedata", st.st_size);
		return (-1);
	}
	(void) bzero(*pghpp, st.st_size);

	errno = 0;
	if ((res = read(fd, *pghpp, st.st_size)) != st.st_size) {
		free(*pghpp);
		*pghpp = NULL;
		if (res > 0 || errno == E2BIG) {
			debug("pagedata changed size, retrying\n");
			goto redo;
		} else {
			debug("cannot read pagedata");
			return (-1);
		}
	}

	return (0);
}

/*
 * Return the count, in kilobytes, of pages in the given pagedata which are in
 * all of the states specified by mask, and in none of the states in notmask.
 * If the CP_CLEAR flag is set, the pagedata will also be cleared.
 */
#define	CP_CLEAR	1
static uint64_t
count_pages(prpageheader_t *pghp, int flags, int mask, int notmask)
{
	int map;
	caddr_t cur, end;
	prpageheader_t pgh = *pghp;
	prasmap_t *asmapp;
	uint64_t count = 0;

	cur = (caddr_t)pghp + sizeof (*pghp);
	for (map = 0; map < pgh.pr_nmap; map++) {
		asmapp = (prasmap_t *)(uintptr_t)cur;
		cur += sizeof (*asmapp);
		end = cur + asmapp->pr_npage;
		while (cur < end) {
			if ((*cur & mask) == mask && (*cur & notmask) == 0)
				count += asmapp->pr_pagesize / 1024;
			if ((flags & CP_CLEAR) != 0)
				*cur = 0;
			cur++;
		}

		/*
		 * Skip to next 64-bit-aligned address to get the next
		 * prasmap_t.
		 */
		cur = (caddr_t)((intptr_t)(cur + 7) & ~7);
	}

	return (count);
}

/*
 * Return the amount of memory (in kilobytes) that hasn't been referenced or
 * modified, which is the memory that will be paged out first.  Should be
 * written to exclude nonresident pages when sufficient interfaces exist.
 */
static uint64_t
unrm_size(lprocess_t *lpc)
{
	return (count_pages(lpc->lpc_prpageheader, CP_CLEAR,
	    0, PG_MODIFIED | PG_REFERENCED));
}

/*
 * Advance a prpageheader_cur_t to the address space's next mapping, returning
 * its address, or NULL if there is none.  Any known nonpageable or nonresident
 * mappings will be skipped over.
 */
static uintptr_t
advance_prpageheader_cur_nextmapping(prpageheader_cur_t *pcp)
{
	prasmap_t *pap;
	int i;

next:
	ASSERT(pcp->pr_map < pcp->pr_nmap);
	if ((pcp->pr_map + 1) == pcp->pr_nmap)
		return (NULL);
	pcp->pr_map++;
	if (pcp->pr_pgoff < pcp->pr_npage) {
		pcp->pr_pdaddr = (caddr_t)(uintptr_t)
		    ((uintptr_t)pcp->pr_pdaddr +
		    (pcp->pr_npage - pcp->pr_pgoff));
		pcp->pr_pgoff = pcp->pr_npage;
	}
	/*
	 * Skip to next 64-bit-aligned address to get the next prasmap_t.
	 */
	pcp->pr_pdaddr = (caddr_t)(((uintptr_t)pcp->pr_pdaddr + 7) & ~7);
	pap = (prasmap_t *)pcp->pr_pdaddr;
	pcp->pr_pgoff = 0;
	pcp->pr_npage = pap->pr_npage;
	pcp->pr_pagesize = pap->pr_pagesize;
	pcp->pr_addr = pap->pr_vaddr;
	pcp->pr_pdaddr = pap + 1;

	/*
	 * Skip any known nonpageable mappings.  Currently, the only one
	 * detected is the schedctl page.
	 */
	if ((pap->pr_mflags ^ (MA_SHARED | MA_READ | MA_WRITE | MA_EXEC |
	    MA_ANON)) == 0 && pap->pr_npage == 1) {
		debug("identified nonpageable schedctl mapping at %p\n",
		    (void *)pcp->pr_addr);
		goto next;
	}

	/*
	 * Skip mappings with no resident pages.  If the xmap does not
	 * correspond to the pagedata for any reason, it will be ignored.
	 */
	pcp->pr_rss = -1;
	pcp->pr_pg_rss = -1;
	for (i = 0; i < pcp->pr_nxmap; i++) {
		prxmap_t *xmap = &pcp->pr_xmap[i];

		if (pcp->pr_addr == xmap->pr_vaddr && xmap->pr_size ==
		    (pcp->pr_npage * pcp->pr_pagesize)) {
			pcp->pr_rss = xmap->pr_rss;
			/*
			 * Remove COW pages from the pageable RSS count.
			 */
			if ((xmap->pr_mflags & MA_SHARED) == 0)
				pcp->pr_pg_rss = xmap->pr_anon;
			break;
		}
	}
	if (pcp->pr_rss == 0) {
		debug("identified nonresident mapping at 0x%p\n",
		    (void *)pcp->pr_addr);
		goto next;
	} else if (pcp->pr_pg_rss == 0) {
		debug("identified unpageable mapping at 0x%p\n",
		    (void *)pcp->pr_addr);
		goto next;
	}

	return (pcp->pr_addr);
}

/*
 * Advance a prpageheader_cur_t to the mapping's next page, returning its
 * address, or NULL if there is none.
 */
static void *
advance_prpageheader_cur(prpageheader_cur_t *pcp)
{
	ASSERT(pcp->pr_pgoff < pcp->pr_npage);
	if ((pcp->pr_pgoff + 1) == pcp->pr_npage)
		return (NULL);
	pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + 1;
	pcp->pr_pgoff++;

	ASSERT((*(char *)pcp->pr_pdaddr & ~(PG_MODIFIED | PG_REFERENCED)) == 0);
	return ((caddr_t)pcp->pr_addr + pcp->pr_pgoff * pcp->pr_pagesize);
}

/*
 * Initialize a prpageheader_cur_t, positioned at the first page of the first
 * mapping of an address space.
 */
static void *
set_prpageheader_cur(prpageheader_cur_t *pcp, prpageheader_t *php,
    prxmap_t *xmap, int nxmap)
{
	bzero(pcp, sizeof (*pcp));
	pcp->pr_nmap = php->pr_nmap;
	pcp->pr_map = -1;
	pcp->pr_prpageheader = php;
	pcp->pr_xmap = xmap;
	pcp->pr_nxmap = nxmap;
	pcp->pr_pdaddr = (prpageheader_t *)php + 1;

	return ((void *)advance_prpageheader_cur_nextmapping(pcp));
}

/*
 * Position a prpageheader_cur_t at the mapped address greater than or equal to
 * the given value.
 */
static void *
set_prpageheader_cur_addr(prpageheader_cur_t *pcp, prpageheader_t *php,
    prxmap_t *xmap, int nxmap, void *naddr)
{
	void *addr = set_prpageheader_cur(pcp, php, xmap, nxmap);

	while (addr != NULL && addr <= naddr)
		if (naddr < (void *)((caddr_t)pcp->pr_addr +
		    pcp->pr_pagesize * pcp->pr_npage)) {
			uint64_t pgdiff = ((uintptr_t)naddr -
			    (uintptr_t)pcp->pr_addr) / pcp->pr_pagesize;
			pcp->pr_pgoff += pgdiff;
			pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + pgdiff;
			addr = (caddr_t)pcp->pr_addr + pcp->pr_pagesize *
			    pcp->pr_pgoff;
			break;
		} else
			addr =
			    (void *)advance_prpageheader_cur_nextmapping(pcp);

	return (addr);
}

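/*
 * rfd revocation callback for a process's pagedata file descriptor; note that
 * the descriptor has been closed, so that it will be reopened at next use.
 */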
static void
revoke_pagedata(rfd_t *rfd)
{
	lprocess_t *lpc = rfd->rfd_data;

	st_debug(STDL_NORMAL, lpc->lpc_collection, "revoking pagedata for"
	    " process %d\n", (int)lpc->lpc_pid);
	ASSERT(lpc->lpc_pgdata_fd != -1);
	lpc->lpc_pgdata_fd = -1;
}

#ifdef DEBUG
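/*
 * Build an lmapping_t list describing the mappings in the given pagedata
 * snapshot (DEBUG only).
 */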
static void
mklmapping(lmapping_t **lm, prpageheader_t *pgh)
{
	prpageheader_cur_t cur;
	void *addr;

	addr = set_prpageheader_cur(&cur, pgh, NULL, -1);
	ASSERT(*lm == NULL);
	while (addr != NULL) {
		(void) lmapping_insert(lm, cur.pr_addr, cur.pr_npage *
		    cur.pr_pagesize);
		addr = (void *)advance_prpageheader_cur_nextmapping(&cur);
	}
}

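/*
 * Dump the address and size of each mapping in the given list (DEBUG only).
 */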
static void
lmapping_dump(lmapping_t *lm)
{
	debug("lm: %p\n", (void *)lm);
	while (lm != NULL) {
		debug("\t(%p, %llx\n", (void *)lm->lm_addr,
		    (unsigned long long)lm->lm_size);
		lm = lm->lm_next;
	}
}
#endif /* DEBUG */

/*
 * OR two prpageheader_t snapshots which are supposedly of the same address
 * space.  Intersecting mappings with different page sizes are tolerated but
 * not normalized (not accurate).  If the mappings of the two snapshots differ
 * in any regard, the supplied mappings_changed flag will be set.
 */
static void
OR_pagedata(prpageheader_t *src, prpageheader_t *dst, int *mappings_changedp)
{
	prpageheader_cur_t src_cur;
	prpageheader_cur_t dst_cur;
	uintptr_t src_addr;
	uintptr_t dst_addr;
	int mappings_changed = 0;

	/*
	 * OR source pagedata with the destination, for pages of intersecting
	 * mappings.
	 */
	src_addr = (uintptr_t)set_prpageheader_cur(&src_cur, src, NULL, -1);
	dst_addr = (uintptr_t)set_prpageheader_cur(&dst_cur, dst, NULL, -1);
	while (src_addr != NULL && dst_addr != NULL) {
		while (src_addr == dst_addr && src_addr != NULL) {
			*(char *)dst_cur.pr_pdaddr |=
			    *(char *)src_cur.pr_pdaddr;
			src_addr = (uintptr_t)advance_prpageheader_cur(
			    &src_cur);
			dst_addr = (uintptr_t)advance_prpageheader_cur(
			    &dst_cur);
		}
		if (src_addr != dst_addr)
			mappings_changed = 1;
		src_addr = advance_prpageheader_cur_nextmapping(&src_cur);
		dst_addr = advance_prpageheader_cur_nextmapping(&dst_cur);
		while (src_addr != dst_addr && src_addr != NULL && dst_addr !=
		    NULL) {
			mappings_changed = 1;
			if (src_addr < dst_addr)
				src_addr = advance_prpageheader_cur_nextmapping(
				    &src_cur);
			else
				dst_addr = advance_prpageheader_cur_nextmapping(
				    &dst_cur);
		}
	}

	*mappings_changedp = mappings_changed;
}

/*
 * Merge the current pagedata with that on hand.  If the pagedata is
 * unretrievable for any reason, such as the process having exited or being a
 * zombie, a nonzero value is returned, the process should be marked
 * unscannable, and future attempts to scan it should be avoided, since the
 * symptom is probably permanent.  If the mappings of either pagedata
 * differ in any respect, the supplied callback will be invoked once.
 */
static int
merge_current_pagedata(lprocess_t *lpc,
    void(*mappings_changed_cb) (lprocess_t *))
{
	prpageheader_t *pghp;
	int mappings_changed = 0;
	uint64_t cnt;

	if (lpc->lpc_pgdata_fd < 0 || get_pagedata(&pghp, lpc->lpc_pgdata_fd) !=
	    0) {
		char pathbuf[PROC_PATH_MAX];

		(void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/pagedata",
		    (int)lpc->lpc_pid);
		if ((lpc->lpc_pgdata_fd = rfd_open(pathbuf, 1, RFD_PAGEDATA,
		    revoke_pagedata, lpc, O_RDONLY, 0)) < 0 ||
		    get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 0)
			return (-1);
		debug("starting/resuming pagedata collection for %d\n",
		    (int)lpc->lpc_pid);
	}

	cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
	if (cnt != 0 || lpc->lpc_rss != 0)
		debug("process %d: %llu/%llukB rfd/mdfd since last read\n",
		    (int)lpc->lpc_pid, (unsigned long long)cnt,
		    (unsigned long long)lpc->lpc_rss);
	if (lpc->lpc_prpageheader != NULL) {
		/*
		 * OR the two snapshots.
		 */
#ifdef DEBUG
		lmapping_t *old = NULL;
		lmapping_t *new = NULL;

		mklmapping(&new, pghp);
		mklmapping(&old, lpc->lpc_prpageheader);
#endif /* DEBUG */
		OR_pagedata(lpc->lpc_prpageheader, pghp, &mappings_changed);
#ifdef DEBUG
		if (((mappings_changed != 0) ^
		    (lmapping_dump_diff(old, new) != 0))) {
			debug("lmapping_changed inconsistent with lmapping\n");
			debug("old\n");
			lmapping_dump(old);
			debug("new\n");
			lmapping_dump(new);
			debug("ignored\n");
			lmapping_dump(lpc->lpc_ignore);
			ASSERT(0);
		}
		lmapping_free(&new);
		lmapping_free(&old);
#endif /* DEBUG */
		free(lpc->lpc_prpageheader);
	} else
		mappings_changed = 1;
	lpc->lpc_prpageheader = pghp;

	cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
	if (cnt != 0 || lpc->lpc_rss != 0)
		debug("process %d: %llu/%llukB rfd/mdfd since hand swept\n",
		    (int)lpc->lpc_pid, (unsigned long long)cnt,
		    (unsigned long long)lpc->lpc_rss);
	if (mappings_changed != 0) {
		debug("process %d: mappings changed\n", (int)lpc->lpc_pid);
		if (mappings_changed_cb != NULL)
			mappings_changed_cb(lpc);
	}
	return (0);
}

/*
 * Attempt to page out a region of the given process's address space.  May
 * return nonzero if not all of the pages are pageable, for any reason.
 */
static int
pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end)
{
	int res;

	if (end <= start)
		return (0);

	errno = 0;
	res = pr_memcntl(Pr, start, (end - start), MC_SYNC,
	    (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);
	debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res);

	/*
	 * EBUSY indicates none of the pages have backing store allocated, or
	 * some pages were locked, which are less interesting than other
	 * conditions, which are noted.
	 */
	if (res != 0)
		if (errno == EBUSY)
			res = 0;
		else
			debug("%d: can't pageout %p+%llx (errno %d)", (int)pid,
			    (void *)start, (long long)(end - start), errno);

	return (res);
}

/*
 * Compute the delta of the victim process's RSS since the last call.  If the
 * psinfo cannot be obtained, no work is done, and no error is returned; it is
 * up to the caller to detect the process' termination via other means.
 */
static int64_t
rss_delta(psinfo_t *new_psinfo, psinfo_t *old_psinfo, lprocess_t *vic)
{
	int64_t d_rss = 0;

	if (get_psinfo(vic->lpc_pid, new_psinfo, vic->lpc_psinfo_fd,
	    lprocess_update_psinfo_fd_cb, vic, vic) == 0) {
		d_rss = (int64_t)new_psinfo->pr_rssize -
		    (int64_t)old_psinfo->pr_rssize;
		if (d_rss < 0)
			vic->lpc_collection->lcol_stat.lcols_pg_eff +=
			    (- d_rss);
		*old_psinfo = *new_psinfo;
	}

	return (d_rss);
}

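/*
 * Discard a process's entire set of ignored mappings, so that all of its
 * mappings will be considered for scanning again.
 */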
static void
unignore_mappings(lprocess_t *lpc)
{
	lmapping_free(&lpc->lpc_ignore);
}

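/*
 * Remove from a process's ignored set any mappings which contain referenced
 * or modified pages, so that they will be scanned again.
 */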
static void
unignore_referenced_mappings(lprocess_t *lpc)
{
	prpageheader_cur_t cur;
	void *vicaddr;

	vicaddr = set_prpageheader_cur(&cur, lpc->lpc_prpageheader, NULL, -1);
	while (vicaddr != NULL) {
		if (((*(char *)cur.pr_pdaddr) & (PG_REFERENCED | PG_MODIFIED))
		    != 0) {
			if (lmapping_remove(&lpc->lpc_ignore, cur.pr_addr,
			    cur.pr_npage * cur.pr_pagesize) == 0)
				debug("removed mapping 0x%p+0t%llukB from"
				    " ignored set\n", (void *)cur.pr_addr,
				    (unsigned long long)(cur.pr_npage *
				    cur.pr_pagesize / 1024));
			vicaddr = (void *)advance_prpageheader_cur_nextmapping(
			    &cur);
		} else if ((vicaddr = advance_prpageheader_cur(&cur)) == NULL)
			vicaddr = (void *)advance_prpageheader_cur_nextmapping(
			    &cur);
	}
}

/*
 * Resume scanning, starting with the last victim if it is still valid, or with
 * any other valid process otherwise.
 */
void
scan(lcollection_t *lcol, int64_t excess)
{
	lprocess_t *vic, *lpc;
	void *vicaddr, *endaddr, *nvicaddr;
	prpageheader_cur_t cur;
	psinfo_t old_psinfo, new_psinfo;
	hrtime_t scan_start;
	int res, resumed;
	uint64_t col_unrm_size;

	st_debug(STDL_NORMAL, lcol, "starting to scan, excess %lldk\n",
	    (long long)excess);

	/*
	 * Determine the address to start scanning at, depending on whether
	 * scanning can be resumed.
	 */
	endaddr = NULL;
	if ((vic = get_valid_victim(lcol, lcol->lcol_victim)) ==
	    lcol->lcol_victim && lcol->lcol_resaddr != NULL) {
		vicaddr = lcol->lcol_resaddr;
		st_debug(STDL_NORMAL, lcol, "resuming process %d\n",
		    (int)vic->lpc_pid);
		resumed = 1;
	} else {
		vicaddr = NULL;
		resumed = 0;
	}

	scan_start = gethrtime();
	/*
	 * Obtain the most current pagedata for the processes that might be
	 * scanned, and remove from the ignored set any mappings which have
	 * referenced or modified pages (in the hopes that the pageability of
	 * the mapping's pages may have changed).  Determine whether the
	 * unreferenced and unmodified portion is too small to reduce the
	 * excess completely.  If so, ignore these bits so that even the
	 * working set will be paged out.
	 */
	col_unrm_size = 0;
	lpc = vic;
	while (lpc != NULL && should_run) {
		if (merge_current_pagedata(lpc, unignore_mappings) != 0) {
			st_debug(STDL_NORMAL, lcol, "process %d:"
			    " exited/temporarily unscannable",
			    (int)lpc->lpc_pid);
			goto next;
		}
		debug("process %d: %llu/%llukB scannable\n", (int)lpc->lpc_pid,
		    (unsigned long long)(lpc->lpc_unrm = unrm_size(lpc)),
		    (unsigned long long)lpc->lpc_size);
		col_unrm_size += lpc->lpc_unrm = unrm_size(lpc);

		if ((lcol->lcol_stat.lcols_scan_count %
		    RCAPD_IGNORED_SET_FLUSH_IVAL) == 0) {
			/*
			 * Periodically clear the set of ignored mappings.
			 * This will allow processes whose ignored segments'
			 * pageability have changed (without a corresponding
			 * reference or modification to a page) to be
			 * recognized.
			 */
			if (lcol->lcol_stat.lcols_scan_count > 0)
				unignore_mappings(lpc);
		} else {
			/*
			 * Ensure mappings with referenced or modified pages
			 * are not in the ignored set.  Their usage might mean
			 * the condition which made them unpageable is gone.
			 */
			unignore_referenced_mappings(lpc);
		}
next:
		lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
		    lpc->lpc_next) : NULL;
	}
	if (col_unrm_size < excess) {
		lpc = vic;
		debug("will not reduce excess with only unreferenced pages\n");
		while (lpc != NULL && should_run) {
			if (lpc->lpc_prpageheader != NULL) {
				(void) count_pages(lpc->lpc_prpageheader,
				    CP_CLEAR, 0, 0);
				if (lpc->lpc_pgdata_fd >= 0) {
					if (rfd_close(lpc->lpc_pgdata_fd) != 0)
						debug("could not close %d"
						    " lpc_pgdata_fd %d",
						    (int)lpc->lpc_pid,
						    lpc->lpc_pgdata_fd);
					lpc->lpc_pgdata_fd = -1;
				}
			}
			lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
			    lpc->lpc_next) : NULL;
		}
	}

	/*
	 * Examine each process for pages to remove until the excess is
	 * reduced.
	 */
	while (vic != NULL && excess > 0 && should_run) {
		/*
		 * Skip processes whose death was reported when the merging of
		 * pagedata was attempted.
		 */
		if (vic->lpc_prpageheader == NULL)
			goto nextproc;

		/*
		 * Obtain optional segment residency information.
		 */
		if (lpc_xmap_update(vic) != 0)
			st_debug(STDL_NORMAL, lcol, "process %d: xmap"
			    " unreadable; ignoring", (int)vic->lpc_pid);

#ifdef DEBUG_MSG
		{
			void *ovicaddr = vicaddr;
#endif /* DEBUG_MSG */
		vicaddr = set_prpageheader_cur_addr(&cur, vic->lpc_prpageheader,
		    vic->lpc_xmap, vic->lpc_nxmap, vicaddr);
#ifdef DEBUG_MSG
		st_debug(STDL_NORMAL, lcol, "trying to resume from"
		    " 0x%p, next 0x%p\n", ovicaddr, vicaddr);
		}
#endif /* DEBUG_MSG */

		/*
		 * Take control of the victim.
		 */
		if (get_psinfo(vic->lpc_pid, &old_psinfo,
		    vic->lpc_psinfo_fd, lprocess_update_psinfo_fd_cb,
		    vic, vic) != 0) {
			st_debug(STDL_NORMAL, lcol, "cannot get %d psinfo",
			    (int)vic->lpc_pid);
			goto nextproc;
		}
		(void) rfd_reserve(PGRAB_FD_COUNT);
		if ((scan_pr = Pgrab(vic->lpc_pid, 0, &res)) == NULL) {
			st_debug(STDL_NORMAL, lcol, "cannot grab %d (%d)",
			    (int)vic->lpc_pid, res);
			goto nextproc;
		}
		if (Pcreate_agent(scan_pr) != 0) {
			st_debug(STDL_NORMAL, lcol, "cannot control %d",
			    (int)vic->lpc_pid);
			goto nextproc;
		}
		/*
		 * Be very pessimistic about the state of the agent LWP --
		 * verify it's actually stopped.
		 */
		errno = 0;
		while (Pstate(scan_pr) == PS_RUN)
			(void) Pwait(scan_pr, 0);
		if (Pstate(scan_pr) != PS_STOP) {
			st_debug(STDL_NORMAL, lcol, "agent not in expected"
			    " state (%d)", Pstate(scan_pr));
			goto nextproc;
		}

		/*
		 * Within the victim's address space, find contiguous ranges of
		 * unreferenced pages to page out.
		 */
		st_debug(STDL_NORMAL, lcol, "paging out process %d\n",
		    (int)vic->lpc_pid);
		while (excess > 0 && vicaddr != NULL && should_run) {
			/*
			 * Skip mappings in the ignored set.  Mappings get
			 * placed in the ignored set when all their resident
			 * pages are unreferenced and unmodified, yet
			 * unpageable -- such as when they are locked, or
			 * involved in asynchronous I/O.  They will be scanned
			 * again when some page is referenced or modified.
			 */
			if (lmapping_contains(vic->lpc_ignore, cur.pr_addr,
			    cur.pr_npage * cur.pr_pagesize)) {
				debug("ignored mapping at 0x%p\n",
				    (void *)cur.pr_addr);
				/*
				 * Update statistics.
				 */
				lcol->lcol_stat.lcols_pg_att +=
				    cur.pr_npage * cur.pr_pagesize / 1024;

				vicaddr = (void *)
				    advance_prpageheader_cur_nextmapping(&cur);
				continue;
			}

			/*
			 * Determine a range of unreferenced pages to page out,
			 * and clear the R/M bits in the preceding referenced
			 * range.
			 */
			st_debug(STDL_HIGH, lcol, "start from mapping at 0x%p,"
			    " npage %llu\n", vicaddr,
			    (unsigned long long)cur.pr_npage);
			while (vicaddr != NULL &&
			    *(caddr_t)cur.pr_pdaddr != 0) {
				*(caddr_t)cur.pr_pdaddr = 0;
				vicaddr = advance_prpageheader_cur(&cur);
			}
			st_debug(STDL_HIGH, lcol, "advance, vicaddr %p, pdaddr"
			    " %p\n", vicaddr, cur.pr_pdaddr);
			if (vicaddr == NULL) {
				/*
				 * The end of mapping was reached before any
				 * unreferenced pages were seen.
				 */
				vicaddr = (void *)
				    advance_prpageheader_cur_nextmapping(&cur);
				continue;
			}
			do
				endaddr = advance_prpageheader_cur(&cur);
			while (endaddr != NULL &&
			    *(caddr_t)cur.pr_pdaddr == 0 &&
			    (((intptr_t)endaddr - (intptr_t)vicaddr) /
			    1024) < excess);
			st_debug(STDL_HIGH, lcol, "endaddr %p, *cur %d\n",
			    endaddr, *(caddr_t)cur.pr_pdaddr);

			/*
			 * Page out from vicaddr to the end of the mapping, or
			 * endaddr if set, then continue scanning after
			 * endaddr, or the next mapping, if not set.
			 */
			nvicaddr = endaddr;
			if (endaddr == NULL)
				endaddr = (caddr_t)cur.pr_addr +
				    cur.pr_pagesize * cur.pr_npage;
			if (pageout(vic->lpc_pid, scan_pr, vicaddr, endaddr) ==
			    0) {
				int64_t d_rss, att;
				int willignore = 0;

				excess += (d_rss = rss_delta(
				    &new_psinfo, &old_psinfo, vic));

				/*
				 * If this pageout attempt was unsuccessful
				 * (the resident portion was not affected), and
				 * was for the whole mapping, put it in the
				 * ignored set, so it will not be scanned again
				 * until some page is referenced or modified.
				 */
				if (d_rss >= 0 && (void *)cur.pr_addr ==
				    vicaddr && (cur.pr_pagesize * cur.pr_npage)
				    == ((uintptr_t)endaddr -
				    (uintptr_t)vicaddr)) {
					if (lmapping_insert(
					    &vic->lpc_ignore,
					    cur.pr_addr,
					    cur.pr_pagesize *
					    cur.pr_npage) != 0)
						debug("not enough memory to add"
						    " mapping at %p to ignored"
						    " set\n",
						    (void *)cur.pr_addr);
					willignore = 1;
				}

				/*
				 * Update statistics.
				 */
				lcol->lcol_stat.lcols_pg_att += (att =
				    ((intptr_t)endaddr - (intptr_t)vicaddr) /
				    1024);
				st_debug(STDL_NORMAL, lcol, "paged out 0x%p"
				    "+0t(%llu/%llu)kB%s\n", vicaddr,
				    (unsigned long long)((d_rss <
				    0) ? - d_rss : 0), (unsigned long long)att,
				    willignore ? " (will ignore)" : "");
			} else {
				st_debug(STDL_NORMAL, lcol,
				    "process %d: exited/unscannable\n",
				    (int)vic->lpc_pid);
				vic->lpc_unscannable = 1;
				goto nextproc;
			}

			/*
			 * Update the statistics file, if it's time.
			 */
			check_update_statistics();

			vicaddr = (nvicaddr != NULL) ? nvicaddr : (void
			    *)advance_prpageheader_cur_nextmapping(&cur);
		}
		excess += rss_delta(&new_psinfo, &old_psinfo, vic);
		st_debug(STDL_NORMAL, lcol, "done, excess %lld\n",
		    (long long)excess);
nextproc:
		/*
		 * If a process was grabbed, release it, destroying its agent.
		 */
		if (scan_pr != NULL) {
			(void) Prelease(scan_pr, 0);
			scan_pr = NULL;
		}
		lcol->lcol_victim = vic;
		/*
		 * Scan the collection at most once.  Only if scanning was not
		 * aborted for any reason, and the end of the process list has
		 * not been reached, determine the next victim and scan it.
		 */
		if (vic != NULL) {
			if (vic->lpc_next != NULL) {
				/*
				 * Determine the next process to be scanned.
				 */
				if (excess > 0) {
					vic = get_valid_victim(lcol,
					    vic->lpc_next);
					vicaddr = 0;
				}
			} else {
				/*
				 * A complete scan of the collection was made,
				 * so tick the scan counter and stop scanning
				 * until the next request.
				 */
				lcol->lcol_stat.lcols_scan_count++;
				lcol->lcol_stat.lcols_scan_time_complete
				    = lcol->lcol_stat.lcols_scan_time;
				/*
				 * If an excess still exists, tick the
				 * "ineffective scan" counter, signalling that
				 * the cap may be unenforceable.
				 */
				if (resumed == 0 && excess > 0)
					lcol->lcol_stat
					    .lcols_scan_ineffective++;
				/*
				 * Scanning should start at the beginning of
				 * the process list at the next request.
				 */
				if (excess > 0)
					vic = NULL;
			}
		}
	}
	lcol->lcol_stat.lcols_scan_time += (gethrtime() - scan_start);
	st_debug(STDL_HIGH, lcol, "done scanning; excess %lld\n",
	    (long long)excess);

	lcol->lcol_resaddr = vicaddr;
	if (lcol->lcol_resaddr == NULL && lcol->lcol_victim != NULL) {
		lcol->lcol_victim = get_valid_victim(lcol,
		    lcol->lcol_victim->lpc_next);
	}
}

/*
 * Abort the scan in progress, and destroy the agent LWP of any grabbed
 * processes.
 */
void
scan_abort(void)
{
	if (scan_pr != NULL)
		(void) Prelease(scan_pr, NULL);
}

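/*
 * rfd revocation callback for a process's xmap file descriptor; note that the
 * descriptor has been closed, so that it will be reopened at next use.
 */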
static void
revoke_xmap(rfd_t *rfd)
{
	lprocess_t *lpc = rfd->rfd_data;

	debug("revoking xmap for process %d\n", (int)lpc->lpc_pid);
	ASSERT(lpc->lpc_xmap_fd != -1);
	lpc->lpc_xmap_fd = -1;
}

/*
 * Retrieve the process's current xmap, which is used to determine the size of
 * the resident portion of its segments.  Return zero if successful.
 */
static int
lpc_xmap_update(lprocess_t *lpc)
{
	int res;
	struct stat st;

	free(lpc->lpc_xmap);
	lpc->lpc_xmap = NULL;
	lpc->lpc_nxmap = -1;

	if (lpc->lpc_xmap_fd == -1) {
		char pathbuf[PROC_PATH_MAX];

		(void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/xmap",
		    (int)lpc->lpc_pid);
		if ((lpc->lpc_xmap_fd = rfd_open(pathbuf, 1, RFD_XMAP,
		    revoke_xmap, lpc, O_RDONLY, 0)) < 0)
			return (-1);
	}

redo:
	errno = 0;
	if (fstat(lpc->lpc_xmap_fd, &st) != 0) {
		debug("cannot stat xmap\n");
		(void) rfd_close(lpc->lpc_xmap_fd);
		lpc->lpc_xmap_fd = -1;
		return (-1);
	}

	if ((st.st_size % sizeof (*lpc->lpc_xmap)) != 0) {
		debug("xmap wrong size\n");
		(void) rfd_close(lpc->lpc_xmap_fd);
		lpc->lpc_xmap_fd = -1;
		return (-1);
	}

	lpc->lpc_xmap = malloc(st.st_size);
	if (lpc->lpc_xmap == NULL) {
		debug("cannot malloc() %ld bytes for xmap", st.st_size);
		(void) rfd_close(lpc->lpc_xmap_fd);
		lpc->lpc_xmap_fd = -1;
		return (-1);
	}

	if ((res = pread(lpc->lpc_xmap_fd, lpc->lpc_xmap, st.st_size, 0)) !=
	    st.st_size) {
		free(lpc->lpc_xmap);
		lpc->lpc_xmap = NULL;
		if (res > 0) {
			debug("xmap changed size, retrying\n");
			goto redo;
		} else {
			debug("cannot read xmap");
			return (-1);
		}
	}
	lpc->lpc_nxmap = st.st_size / sizeof (*lpc->lpc_xmap);

	return (0);
}