xref: /onnv-gate/usr/src/uts/sun4u/os/memscrub.c (revision 11474:857f9db4ef05)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
52895Svb70745  * Common Development and Distribution License (the "License").
62895Svb70745  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
22*11474SJonathan.Adams@Sun.COM  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate /*
270Sstevel@tonic-gate  * sun4u Memory Scrubbing
280Sstevel@tonic-gate  *
 * On detection of a correctable memory ECC error, the sun4u kernel
 * returns the corrected data to the requester and re-writes it
 * to memory (DRAM).  So if the correctable error was transient,
 * the error has effectively been cleaned (scrubbed) from memory.
330Sstevel@tonic-gate  *
 * Scrubbing thus reduces the likelihood that multiple transient errors
 * will occur in the same memory word, making uncorrectable errors due
 * to transients less likely.
370Sstevel@tonic-gate  *
380Sstevel@tonic-gate  * Thus is born the desire that every memory location be periodically
390Sstevel@tonic-gate  * accessed.
400Sstevel@tonic-gate  *
410Sstevel@tonic-gate  * This file implements a memory scrubbing thread.  This scrubber
420Sstevel@tonic-gate  * guarantees that all of physical memory is accessed periodically
430Sstevel@tonic-gate  * (memscrub_period_sec -- 12 hours).
440Sstevel@tonic-gate  *
 * It attempts to do this as unobtrusively as possible.  The thread
 * schedules itself to wake up at an interval such that if it reads
 * memscrub_span_pages (32MB) on each wakeup, it will read all of physical
 * memory in memscrub_period_sec (12 hours).
490Sstevel@tonic-gate  *
 * The scrubber uses the block load and prefetch hardware to read memory
 * @ 1300MB/s, so it reads spans of 32MB in 0.025 seconds.  Unlike the
 * original sun4d scrubber the sun4u scrubber does not read ahead if the
 * system is idle because we can read memory very efficiently.
540Sstevel@tonic-gate  *
550Sstevel@tonic-gate  * The scrubber maintains a private copy of the phys_install memory list
560Sstevel@tonic-gate  * to keep track of what memory should be scrubbed.
570Sstevel@tonic-gate  *
580Sstevel@tonic-gate  * The global routines memscrub_add_span() and memscrub_delete_span() are
590Sstevel@tonic-gate  * used to add and delete from this list.  If hotplug memory is later
600Sstevel@tonic-gate  * supported these two routines can be used to notify the scrubber of
610Sstevel@tonic-gate  * memory configuration changes.
620Sstevel@tonic-gate  *
 * The following parameters can be set via /etc/system
 *
 * memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES (32MB)
 * memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC (12 hours)
 * memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI (MINCLSYSPRI)
 * memscrub_delay_start_sec = (5 minutes)
 * memscrub_verbose = (0)
 * memscrub_override_ticks = (1 tick)
 * disable_memscrub = (0)
 * pause_memscrub = (0)
 * read_all_memscrub = (0)
740Sstevel@tonic-gate  *
750Sstevel@tonic-gate  * The scrubber will print NOTICE messages of what it is doing if
760Sstevel@tonic-gate  * "memscrub_verbose" is set.
770Sstevel@tonic-gate  *
780Sstevel@tonic-gate  * If the scrubber's sleep time calculation drops to zero ticks,
790Sstevel@tonic-gate  * memscrub_override_ticks will be used as the sleep time instead. The
803876Spt157919  * sleep time should only drop to zero on a system with over 131.84
810Sstevel@tonic-gate  * terabytes of memory, or where the default scrubber parameters have
820Sstevel@tonic-gate  * been adjusted. For example, reducing memscrub_span_pages or
830Sstevel@tonic-gate  * memscrub_period_sec causes the sleep time to drop to zero with less
840Sstevel@tonic-gate  * memory. Note that since the sleep time is calculated in clock ticks,
850Sstevel@tonic-gate  * using hires clock ticks allows for more memory before the sleep time
860Sstevel@tonic-gate  * becomes zero.
870Sstevel@tonic-gate  *
880Sstevel@tonic-gate  * The scrubber will exit (or never be started) if it finds the variable
890Sstevel@tonic-gate  * "disable_memscrub" set.
900Sstevel@tonic-gate  *
910Sstevel@tonic-gate  * The scrubber will pause (not read memory) when "pause_memscrub"
920Sstevel@tonic-gate  * is set.  It will check the state of pause_memscrub at each wakeup
930Sstevel@tonic-gate  * period.  The scrubber will not make up for lost time.  If you
940Sstevel@tonic-gate  * pause the scrubber for a prolonged period of time you can use
950Sstevel@tonic-gate  * the "read_all_memscrub" switch (see below) to catch up. In addition,
960Sstevel@tonic-gate  * pause_memscrub is used internally by the post memory DR callbacks.
970Sstevel@tonic-gate  * It is set for the small period of time during which the callbacks
980Sstevel@tonic-gate  * are executing. This ensures "memscrub_lock" will be released,
990Sstevel@tonic-gate  * allowing the callbacks to finish.
1000Sstevel@tonic-gate  *
1010Sstevel@tonic-gate  * The scrubber will read all memory if "read_all_memscrub" is set.
1020Sstevel@tonic-gate  * The normal span read will also occur during the wakeup.
1030Sstevel@tonic-gate  *
1040Sstevel@tonic-gate  * MEMSCRUB_MIN_PAGES (32MB) is the minimum amount of memory a system
1050Sstevel@tonic-gate  * must have before we'll start the scrubber.
1060Sstevel@tonic-gate  *
1073876Spt157919  * MEMSCRUB_DFL_SPAN_PAGES (32MB) is based on the guess that 0.025 sec
1080Sstevel@tonic-gate  * is a "good" amount of minimum time for the thread to run at a time.
1090Sstevel@tonic-gate  *
1100Sstevel@tonic-gate  * MEMSCRUB_DFL_PERIOD_SEC (12 hours) is nearly a total guess --
1110Sstevel@tonic-gate  * twice the frequency the hardware folk estimated would be necessary.
1120Sstevel@tonic-gate  *
 * MEMSCRUB_DFL_THREAD_PRI (MINCLSYSPRI) is based on the assumption
 * that the scrubber should get its fair share of time (since it
 * is short).  At a priority of 0 the scrubber will be starved.
1160Sstevel@tonic-gate  */
1170Sstevel@tonic-gate 
1180Sstevel@tonic-gate #include <sys/systm.h>		/* timeout, types, t_lock */
1190Sstevel@tonic-gate #include <sys/cmn_err.h>
1200Sstevel@tonic-gate #include <sys/sysmacros.h>	/* MIN */
1210Sstevel@tonic-gate #include <sys/memlist.h>	/* memlist */
1220Sstevel@tonic-gate #include <sys/mem_config.h>	/* memory add/delete */
1230Sstevel@tonic-gate #include <sys/kmem.h>		/* KMEM_NOSLEEP */
1240Sstevel@tonic-gate #include <sys/cpuvar.h>		/* ncpus_online */
1250Sstevel@tonic-gate #include <sys/debug.h>		/* ASSERTs */
1260Sstevel@tonic-gate #include <sys/machsystm.h>	/* lddphys */
1270Sstevel@tonic-gate #include <sys/cpu_module.h>	/* vtag_flushpage */
1280Sstevel@tonic-gate #include <sys/kstat.h>
1290Sstevel@tonic-gate #include <sys/atomic.h>		/* atomic_add_32 */
1300Sstevel@tonic-gate 
1310Sstevel@tonic-gate #include <vm/hat.h>
1320Sstevel@tonic-gate #include <vm/seg_kmem.h>
1330Sstevel@tonic-gate #include <vm/hat_sfmmu.h>	/* XXX FIXME - delete */
1340Sstevel@tonic-gate 
1350Sstevel@tonic-gate #include <sys/time.h>
1360Sstevel@tonic-gate #include <sys/callb.h>		/* CPR callback */
1370Sstevel@tonic-gate #include <sys/ontrap.h>
1380Sstevel@tonic-gate 
1390Sstevel@tonic-gate /*
1400Sstevel@tonic-gate  * Should really have paddr_t defined, but it is broken.  Use
1410Sstevel@tonic-gate  * ms_paddr_t in the meantime to make the code cleaner
1420Sstevel@tonic-gate  */
1430Sstevel@tonic-gate typedef uint64_t ms_paddr_t;
1440Sstevel@tonic-gate 
1450Sstevel@tonic-gate /*
1460Sstevel@tonic-gate  * Global Routines:
1470Sstevel@tonic-gate  */
1480Sstevel@tonic-gate int memscrub_add_span(pfn_t pfn, pgcnt_t pages);
1490Sstevel@tonic-gate int memscrub_delete_span(pfn_t pfn, pgcnt_t pages);
1500Sstevel@tonic-gate int memscrub_init(void);
1512895Svb70745 void memscrub_induced_error(void);
1520Sstevel@tonic-gate 
1530Sstevel@tonic-gate /*
1540Sstevel@tonic-gate  * Global Data:
1550Sstevel@tonic-gate  */
1560Sstevel@tonic-gate 
/*
 * Minimum amount of installed memory (32MB, expressed in pages) a
 * system must have before the scrubber is started.
 */
#define	MEMSCRUB_MIN_PAGES	((32 * 1024 * 1024) / PAGESIZE)

/*
 * Default scrub period: all of physical memory is scanned at least
 * once every MEMSCRUB_DFL_PERIOD_SEC seconds.
 */
#define	MEMSCRUB_DFL_PERIOD_SEC	(12 * 60 * 60)	/* 12 hours */

/*
 * Default span: scan at least MEMSCRUB_DFL_SPAN_PAGES (32MB worth of
 * pages) each iteration.
 */
#define	MEMSCRUB_DFL_SPAN_PAGES	((32 * 1024 * 1024) / PAGESIZE)

/*
 * Default thread priority: almost anything is higher priority than
 * scrubbing.
 */
#define	MEMSCRUB_DFL_THREAD_PRI	MINCLSYSPRI
1760Sstevel@tonic-gate 
/*
 * Size of one scan block, and the number of such blocks in a page.
 *
 * MEMSCRUB_BLOCK_SIZE_SHIFT must remain log2(MEMSCRUB_BLOCK_SIZE); every
 * blocks-per-page value below is derived by shifting a page size right
 * by it.
 */
#define	MEMSCRUB_BLOCK_SIZE		256
#define	MEMSCRUB_BLOCK_SIZE_SHIFT	8 	/* log2(MEMSCRUB_BLOCK_SIZE) */
#define	MEMSCRUB_BLOCKS_PER_PAGE	(PAGESIZE >> MEMSCRUB_BLOCK_SIZE_SHIFT)

/*
 * Blocks per page for each MMU page size.  The replacement lists are
 * fully parenthesized so the macros expand correctly inside a larger
 * expression ('>>' binds more loosely than '+', '-' and '*').
 */
#define	MEMSCRUB_BPP4M		(MMU_PAGESIZE4M >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define	MEMSCRUB_BPP512K	(MMU_PAGESIZE512K >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define	MEMSCRUB_BPP64K		(MMU_PAGESIZE64K >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define	MEMSCRUB_BPP		(MMU_PAGESIZE >> MEMSCRUB_BLOCK_SIZE_SHIFT)
1880Sstevel@tonic-gate 
/*
 * Message issued when the computed sleep time is zero, i.e. the
 * limitations of the memscrubber have been exceeded (see the file
 * header for what causes that).  In DEBUG mode the message is logged on
 * the console and in the messages file; in non-DEBUG mode the leading
 * '!' restricts it to the messages file.
 */
#define	MEMSCRUB_OVERRIDE_TEXT	"Memory scrubber sleep time is zero " \
	"seconds, consuming entire CPU."

#ifdef DEBUG
#define	MEMSCRUB_OVERRIDE_MSG	MEMSCRUB_OVERRIDE_TEXT
#else
#define	MEMSCRUB_OVERRIDE_MSG	"!" MEMSCRUB_OVERRIDE_TEXT
#endif /* DEBUG */
2030Sstevel@tonic-gate 
2040Sstevel@tonic-gate /*
2050Sstevel@tonic-gate  * we can patch these defaults in /etc/system if necessary
2060Sstevel@tonic-gate  */
2070Sstevel@tonic-gate uint_t disable_memscrub = 0;
2080Sstevel@tonic-gate uint_t pause_memscrub = 0;
2090Sstevel@tonic-gate uint_t read_all_memscrub = 0;
2100Sstevel@tonic-gate uint_t memscrub_verbose = 0;
2110Sstevel@tonic-gate uint_t memscrub_all_idle = 0;
2120Sstevel@tonic-gate uint_t memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES;
2130Sstevel@tonic-gate uint_t memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC;
2140Sstevel@tonic-gate uint_t memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI;
2150Sstevel@tonic-gate uint_t memscrub_delay_start_sec = 5 * 60;
2160Sstevel@tonic-gate uint_t memscrub_override_ticks = 1;
2170Sstevel@tonic-gate 
2180Sstevel@tonic-gate /*
2190Sstevel@tonic-gate  * Static Routines
2200Sstevel@tonic-gate  */
2210Sstevel@tonic-gate static void memscrubber(void);
2220Sstevel@tonic-gate static void memscrub_cleanup(void);
2230Sstevel@tonic-gate static int memscrub_add_span_gen(pfn_t, pgcnt_t, struct memlist **, uint_t *);
2240Sstevel@tonic-gate static int memscrub_verify_span(ms_paddr_t *addrp, pgcnt_t *pagesp);
2250Sstevel@tonic-gate static void memscrub_scan(uint_t blks, ms_paddr_t src);
2260Sstevel@tonic-gate 
2270Sstevel@tonic-gate /*
2280Sstevel@tonic-gate  * Static Data
2290Sstevel@tonic-gate  */
2300Sstevel@tonic-gate 
2310Sstevel@tonic-gate static struct memlist *memscrub_memlist;
2320Sstevel@tonic-gate static uint_t memscrub_phys_pages;
2330Sstevel@tonic-gate 
2340Sstevel@tonic-gate static kcondvar_t memscrub_cv;
2350Sstevel@tonic-gate static kmutex_t memscrub_lock;
2360Sstevel@tonic-gate /*
2370Sstevel@tonic-gate  * memscrub_lock protects memscrub_memlist, interval_ticks, cprinfo, ...
2380Sstevel@tonic-gate  */
2390Sstevel@tonic-gate static void memscrub_init_mem_config(void);
2400Sstevel@tonic-gate static void memscrub_uninit_mem_config(void);
2410Sstevel@tonic-gate 
2420Sstevel@tonic-gate /*
2432895Svb70745  * Linked list of memscrub aware spans having retired pages.
2442895Svb70745  * Currently enabled only on sun4u USIII-based platforms.
2452895Svb70745  */
2462895Svb70745 typedef struct memscrub_page_retire_span {
2472895Svb70745 	ms_paddr_t				address;
2482895Svb70745 	struct memscrub_page_retire_span	*next;
2492895Svb70745 } memscrub_page_retire_span_t;
2502895Svb70745 
2512895Svb70745 static memscrub_page_retire_span_t *memscrub_page_retire_span_list = NULL;
2522895Svb70745 
2532895Svb70745 static void memscrub_page_retire_span_add(ms_paddr_t);
2542895Svb70745 static void memscrub_page_retire_span_delete(ms_paddr_t);
2552895Svb70745 static int memscrub_page_retire_span_search(ms_paddr_t);
2562895Svb70745 static void memscrub_page_retire_span_list_update(void);
2572895Svb70745 
2582895Svb70745 /*
2592895Svb70745  * add_to_page_retire_list: Set by cpu_async_log_err() routine
2602895Svb70745  * by calling memscrub_induced_error() when CE/UE occurs on a retired
2612895Svb70745  * page due to memscrub reading.  Cleared by memscrub after updating
2622895Svb70745  * global page retire span list.  Piggybacking on protection of
2632895Svb70745  * memscrub_lock, which is held during set and clear.
2642895Svb70745  * Note: When cpu_async_log_err() calls memscrub_induced_error(), it is running
2652895Svb70745  * on softint context, which gets fired on a cpu memscrub thread currently
2662895Svb70745  * running.  Memscrub thread has affinity set during memscrub_read(), hence
2672895Svb70745  * migration to new cpu not expected.
2682895Svb70745  */
2692895Svb70745 static int add_to_page_retire_list = 0;
2702895Svb70745 
2712895Svb70745 /*
2720Sstevel@tonic-gate  * Keep track of some interesting statistics
2730Sstevel@tonic-gate  */
2740Sstevel@tonic-gate static struct memscrub_kstats {
2750Sstevel@tonic-gate 	kstat_named_t	done_early;	/* ahead of schedule */
2760Sstevel@tonic-gate 	kstat_named_t	early_sec;	/* by cumulative num secs */
2770Sstevel@tonic-gate 	kstat_named_t	done_late;	/* behind schedule */
2780Sstevel@tonic-gate 	kstat_named_t	late_sec;	/* by cumulative num secs */
2790Sstevel@tonic-gate 	kstat_named_t	interval_ticks;	/* num ticks between intervals */
2800Sstevel@tonic-gate 	kstat_named_t	force_run;	/* forced to run, non-timeout */
2810Sstevel@tonic-gate 	kstat_named_t	errors_found;	/* num errors found by memscrub */
2820Sstevel@tonic-gate } memscrub_counts = {
2830Sstevel@tonic-gate 	{ "done_early",		KSTAT_DATA_UINT32 },
2840Sstevel@tonic-gate 	{ "early_sec", 		KSTAT_DATA_UINT32 },
2850Sstevel@tonic-gate 	{ "done_late", 		KSTAT_DATA_UINT32 },
2860Sstevel@tonic-gate 	{ "late_sec",		KSTAT_DATA_UINT32 },
2870Sstevel@tonic-gate 	{ "interval_ticks",	KSTAT_DATA_UINT32 },
2880Sstevel@tonic-gate 	{ "force_run",		KSTAT_DATA_UINT32 },
2890Sstevel@tonic-gate 	{ "errors_found",	KSTAT_DATA_UINT32 },
2900Sstevel@tonic-gate };
2910Sstevel@tonic-gate static struct kstat *memscrub_ksp = (struct kstat *)NULL;
2920Sstevel@tonic-gate 
2930Sstevel@tonic-gate static timeout_id_t memscrub_tid = 0;	/* keep track of timeout id */
2940Sstevel@tonic-gate 
2950Sstevel@tonic-gate /*
2960Sstevel@tonic-gate  * create memscrub_memlist from phys_install list
2970Sstevel@tonic-gate  * initialize locks, set memscrub_phys_pages.
2980Sstevel@tonic-gate  */
2990Sstevel@tonic-gate int
3000Sstevel@tonic-gate memscrub_init(void)
3010Sstevel@tonic-gate {
3020Sstevel@tonic-gate 	struct memlist *src;
3030Sstevel@tonic-gate 
3040Sstevel@tonic-gate 	/*
3050Sstevel@tonic-gate 	 * only startup the scrubber if we have a minimum
3060Sstevel@tonic-gate 	 * number of pages
3070Sstevel@tonic-gate 	 */
3080Sstevel@tonic-gate 	if (physinstalled >= MEMSCRUB_MIN_PAGES) {
3090Sstevel@tonic-gate 
3100Sstevel@tonic-gate 		/*
3110Sstevel@tonic-gate 		 * initialize locks
3120Sstevel@tonic-gate 		 */
3130Sstevel@tonic-gate 		mutex_init(&memscrub_lock, NULL, MUTEX_DRIVER, NULL);
3140Sstevel@tonic-gate 		cv_init(&memscrub_cv, NULL, CV_DRIVER, NULL);
3150Sstevel@tonic-gate 
3160Sstevel@tonic-gate 		/*
3170Sstevel@tonic-gate 		 * copy phys_install to memscrub_memlist
3180Sstevel@tonic-gate 		 */
319*11474SJonathan.Adams@Sun.COM 		for (src = phys_install; src; src = src->ml_next) {
3200Sstevel@tonic-gate 			if (memscrub_add_span(
321*11474SJonathan.Adams@Sun.COM 			    (pfn_t)(src->ml_address >> PAGESHIFT),
322*11474SJonathan.Adams@Sun.COM 			    (pgcnt_t)(src->ml_size >> PAGESHIFT))) {
3230Sstevel@tonic-gate 				memscrub_cleanup();
3240Sstevel@tonic-gate 				return (-1);
3250Sstevel@tonic-gate 			}
3260Sstevel@tonic-gate 		}
3270Sstevel@tonic-gate 
3280Sstevel@tonic-gate 		/*
3290Sstevel@tonic-gate 		 * initialize kstats
3300Sstevel@tonic-gate 		 */
3310Sstevel@tonic-gate 		memscrub_ksp = kstat_create("unix", 0, "memscrub_kstat",
332*11474SJonathan.Adams@Sun.COM 		    "misc", KSTAT_TYPE_NAMED,
333*11474SJonathan.Adams@Sun.COM 		    sizeof (memscrub_counts) / sizeof (kstat_named_t),
334*11474SJonathan.Adams@Sun.COM 		    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
3350Sstevel@tonic-gate 
3360Sstevel@tonic-gate 		if (memscrub_ksp) {
3370Sstevel@tonic-gate 			memscrub_ksp->ks_data = (void *)&memscrub_counts;
3380Sstevel@tonic-gate 			kstat_install(memscrub_ksp);
3390Sstevel@tonic-gate 		} else {
3400Sstevel@tonic-gate 			cmn_err(CE_NOTE, "Memscrubber cannot create kstats\n");
3410Sstevel@tonic-gate 		}
3420Sstevel@tonic-gate 
3430Sstevel@tonic-gate 		/*
3440Sstevel@tonic-gate 		 * create memscrubber thread
3450Sstevel@tonic-gate 		 */
3460Sstevel@tonic-gate 		(void) thread_create(NULL, 0, (void (*)())memscrubber,
3470Sstevel@tonic-gate 		    NULL, 0, &p0, TS_RUN, memscrub_thread_pri);
3480Sstevel@tonic-gate 
3490Sstevel@tonic-gate 		/*
3500Sstevel@tonic-gate 		 * We don't want call backs changing the list
3510Sstevel@tonic-gate 		 * if there is no thread running. We do not
3520Sstevel@tonic-gate 		 * attempt to deal with stopping/starting scrubbing
3530Sstevel@tonic-gate 		 * on memory size changes.
3540Sstevel@tonic-gate 		 */
3550Sstevel@tonic-gate 		memscrub_init_mem_config();
3560Sstevel@tonic-gate 	}
3570Sstevel@tonic-gate 
3580Sstevel@tonic-gate 	return (0);
3590Sstevel@tonic-gate }
3600Sstevel@tonic-gate 
3610Sstevel@tonic-gate static void
3620Sstevel@tonic-gate memscrub_cleanup(void)
3630Sstevel@tonic-gate {
3640Sstevel@tonic-gate 	memscrub_uninit_mem_config();
3650Sstevel@tonic-gate 	while (memscrub_memlist) {
3660Sstevel@tonic-gate 		(void) memscrub_delete_span(
367*11474SJonathan.Adams@Sun.COM 		    (pfn_t)(memscrub_memlist->ml_address >> PAGESHIFT),
368*11474SJonathan.Adams@Sun.COM 		    (pgcnt_t)(memscrub_memlist->ml_size >> PAGESHIFT));
3690Sstevel@tonic-gate 	}
3700Sstevel@tonic-gate 	if (memscrub_ksp)
3710Sstevel@tonic-gate 		kstat_delete(memscrub_ksp);
3720Sstevel@tonic-gate 	cv_destroy(&memscrub_cv);
3730Sstevel@tonic-gate 	mutex_destroy(&memscrub_lock);
3740Sstevel@tonic-gate }
3750Sstevel@tonic-gate 
#ifdef MEMSCRUB_DEBUG
/*
 * memscrub_printmemlist: debug aid that dumps a memlist.
 *
 * Prints "title:" followed by one "addr = ..., size = ..." line per
 * entry of listp, walking the ml_next links until NULL.  An empty list
 * prints only the title.
 *
 * The address and size are cast to unsigned long long so the arguments
 * match the %llx conversions regardless of how the platform defines the
 * 64-bit type of the memlist fields.
 */
static void
memscrub_printmemlist(char *title, struct memlist *listp)
{
	struct memlist *list;

	cmn_err(CE_CONT, "%s:\n", title);

	for (list = listp; list != NULL; list = list->ml_next) {
		cmn_err(CE_CONT, "addr = 0x%llx, size = 0x%llx\n",
		    (unsigned long long)list->ml_address,
		    (unsigned long long)list->ml_size);
	}
}
#endif /* MEMSCRUB_DEBUG */
3900Sstevel@tonic-gate 
3910Sstevel@tonic-gate /* ARGSUSED */
3920Sstevel@tonic-gate static void
3930Sstevel@tonic-gate memscrub_wakeup(void *c)
3940Sstevel@tonic-gate {
3950Sstevel@tonic-gate 	/*
3960Sstevel@tonic-gate 	 * grab mutex to guarantee that our wakeup call
3970Sstevel@tonic-gate 	 * arrives after we go to sleep -- so we can't sleep forever.
3980Sstevel@tonic-gate 	 */
3990Sstevel@tonic-gate 	mutex_enter(&memscrub_lock);
4000Sstevel@tonic-gate 	cv_signal(&memscrub_cv);
4010Sstevel@tonic-gate 	mutex_exit(&memscrub_lock);
4020Sstevel@tonic-gate }
4030Sstevel@tonic-gate 
4040Sstevel@tonic-gate /*
4050Sstevel@tonic-gate  * provide an interface external to the memscrubber
4060Sstevel@tonic-gate  * which will force the memscrub thread to run vs.
4070Sstevel@tonic-gate  * waiting for the timeout, if one is set
4080Sstevel@tonic-gate  */
4090Sstevel@tonic-gate void
4100Sstevel@tonic-gate memscrub_run(void)
4110Sstevel@tonic-gate {
4120Sstevel@tonic-gate 	memscrub_counts.force_run.value.ui32++;
4130Sstevel@tonic-gate 	if (memscrub_tid) {
4140Sstevel@tonic-gate 		(void) untimeout(memscrub_tid);
4150Sstevel@tonic-gate 		memscrub_wakeup((void *)NULL);
4160Sstevel@tonic-gate 	}
4170Sstevel@tonic-gate }
4180Sstevel@tonic-gate 
4190Sstevel@tonic-gate /*
4200Sstevel@tonic-gate  * this calculation doesn't account for the time
4210Sstevel@tonic-gate  * that the actual scan consumes -- so we'd fall
4220Sstevel@tonic-gate  * slightly behind schedule with this interval.
4230Sstevel@tonic-gate  * It's very small.
4240Sstevel@tonic-gate  */
4250Sstevel@tonic-gate 
4260Sstevel@tonic-gate static uint_t
4270Sstevel@tonic-gate compute_interval_ticks(void)
4280Sstevel@tonic-gate {
4290Sstevel@tonic-gate 	/*
4300Sstevel@tonic-gate 	 * We use msp_safe mpp_safe below to insure somebody
4310Sstevel@tonic-gate 	 * doesn't set memscrub_span_pages or memscrub_phys_pages
4320Sstevel@tonic-gate 	 * to 0 on us.
4330Sstevel@tonic-gate 	 */
4340Sstevel@tonic-gate 	static uint_t msp_safe, mpp_safe;
4350Sstevel@tonic-gate 	static uint_t interval_ticks, period_ticks;
4360Sstevel@tonic-gate 	msp_safe = memscrub_span_pages;
4370Sstevel@tonic-gate 	mpp_safe = memscrub_phys_pages;
4380Sstevel@tonic-gate 
4390Sstevel@tonic-gate 	period_ticks = memscrub_period_sec * hz;
4400Sstevel@tonic-gate 	interval_ticks = period_ticks;
4410Sstevel@tonic-gate 
4420Sstevel@tonic-gate 	ASSERT(mutex_owned(&memscrub_lock));
4430Sstevel@tonic-gate 
4440Sstevel@tonic-gate 	if ((msp_safe != 0) && (mpp_safe != 0)) {
4450Sstevel@tonic-gate 		if (memscrub_phys_pages <= msp_safe) {
4460Sstevel@tonic-gate 			interval_ticks = period_ticks;
4470Sstevel@tonic-gate 		} else {
4480Sstevel@tonic-gate 			interval_ticks = (period_ticks /
4490Sstevel@tonic-gate 			    (mpp_safe / msp_safe));
4500Sstevel@tonic-gate 		}
4510Sstevel@tonic-gate 	}
4520Sstevel@tonic-gate 	return (interval_ticks);
4530Sstevel@tonic-gate }
4540Sstevel@tonic-gate 
4550Sstevel@tonic-gate void
4560Sstevel@tonic-gate memscrubber(void)
4570Sstevel@tonic-gate {
4580Sstevel@tonic-gate 	ms_paddr_t address, addr;
4590Sstevel@tonic-gate 	time_t deadline;
4600Sstevel@tonic-gate 	pgcnt_t pages;
4610Sstevel@tonic-gate 	uint_t reached_end = 1;
4620Sstevel@tonic-gate 	uint_t paused_message = 0;
4630Sstevel@tonic-gate 	uint_t interval_ticks = 0;
4640Sstevel@tonic-gate 	uint_t sleep_warn_printed = 0;
4650Sstevel@tonic-gate 	callb_cpr_t cprinfo;
4660Sstevel@tonic-gate 
4670Sstevel@tonic-gate 	/*
4680Sstevel@tonic-gate 	 * notify CPR of our existence
4690Sstevel@tonic-gate 	 */
4700Sstevel@tonic-gate 	CALLB_CPR_INIT(&cprinfo, &memscrub_lock, callb_generic_cpr, "memscrub");
4710Sstevel@tonic-gate 
4720Sstevel@tonic-gate 	mutex_enter(&memscrub_lock);
4730Sstevel@tonic-gate 
4740Sstevel@tonic-gate 	if (memscrub_memlist == NULL) {
4750Sstevel@tonic-gate 		cmn_err(CE_WARN, "memscrub_memlist not initialized.");
4760Sstevel@tonic-gate 		goto memscrub_exit;
4770Sstevel@tonic-gate 	}
4780Sstevel@tonic-gate 
479*11474SJonathan.Adams@Sun.COM 	address = memscrub_memlist->ml_address;
4800Sstevel@tonic-gate 
4810Sstevel@tonic-gate 	deadline = gethrestime_sec() + memscrub_delay_start_sec;
4820Sstevel@tonic-gate 
4830Sstevel@tonic-gate 	for (;;) {
4840Sstevel@tonic-gate 		if (disable_memscrub)
4850Sstevel@tonic-gate 			break;
4860Sstevel@tonic-gate 
4870Sstevel@tonic-gate 		/*
4880Sstevel@tonic-gate 		 * compute interval_ticks
4890Sstevel@tonic-gate 		 */
4900Sstevel@tonic-gate 		interval_ticks = compute_interval_ticks();
4910Sstevel@tonic-gate 
4920Sstevel@tonic-gate 		/*
4930Sstevel@tonic-gate 		 * If the calculated sleep time is zero, and pause_memscrub
4940Sstevel@tonic-gate 		 * has been set, make sure we sleep so that another thread
4950Sstevel@tonic-gate 		 * can acquire memscrub_lock.
4960Sstevel@tonic-gate 		 */
4970Sstevel@tonic-gate 		if (interval_ticks == 0 && pause_memscrub) {
4980Sstevel@tonic-gate 			interval_ticks = hz;
4990Sstevel@tonic-gate 		}
5000Sstevel@tonic-gate 
5010Sstevel@tonic-gate 		/*
5020Sstevel@tonic-gate 		 * And as a fail safe, under normal non-paused operation, do
5030Sstevel@tonic-gate 		 * not allow the sleep time to be zero.
5040Sstevel@tonic-gate 		 */
5050Sstevel@tonic-gate 		if (interval_ticks == 0) {
5060Sstevel@tonic-gate 			interval_ticks = memscrub_override_ticks;
5070Sstevel@tonic-gate 			if (!sleep_warn_printed) {
5080Sstevel@tonic-gate 				cmn_err(CE_NOTE, MEMSCRUB_OVERRIDE_MSG);
5090Sstevel@tonic-gate 				sleep_warn_printed = 1;
5100Sstevel@tonic-gate 			}
5110Sstevel@tonic-gate 		}
5120Sstevel@tonic-gate 
5130Sstevel@tonic-gate 		memscrub_counts.interval_ticks.value.ui32 = interval_ticks;
5140Sstevel@tonic-gate 
5150Sstevel@tonic-gate 		/*
5160Sstevel@tonic-gate 		 * Did we just reach the end of memory? If we are at the
5170Sstevel@tonic-gate 		 * end of memory, delay end of memory processing until
5180Sstevel@tonic-gate 		 * pause_memscrub is not set.
5190Sstevel@tonic-gate 		 */
5200Sstevel@tonic-gate 		if (reached_end && !pause_memscrub) {
5210Sstevel@tonic-gate 			time_t now = gethrestime_sec();
5220Sstevel@tonic-gate 
5230Sstevel@tonic-gate 			if (now >= deadline) {
5240Sstevel@tonic-gate 				memscrub_counts.done_late.value.ui32++;
5250Sstevel@tonic-gate 				memscrub_counts.late_sec.value.ui32 +=
526*11474SJonathan.Adams@Sun.COM 				    (now - deadline);
5270Sstevel@tonic-gate 				/*
5280Sstevel@tonic-gate 				 * past deadline, start right away
5290Sstevel@tonic-gate 				 */
5300Sstevel@tonic-gate 				interval_ticks = 0;
5310Sstevel@tonic-gate 
5320Sstevel@tonic-gate 				deadline = now + memscrub_period_sec;
5330Sstevel@tonic-gate 			} else {
5340Sstevel@tonic-gate 				/*
5350Sstevel@tonic-gate 				 * we finished ahead of schedule.
5360Sstevel@tonic-gate 				 * wait till previous deadline before re-start.
5370Sstevel@tonic-gate 				 */
5380Sstevel@tonic-gate 				interval_ticks = (deadline - now) * hz;
5390Sstevel@tonic-gate 				memscrub_counts.done_early.value.ui32++;
5400Sstevel@tonic-gate 				memscrub_counts.early_sec.value.ui32 +=
541*11474SJonathan.Adams@Sun.COM 				    (deadline - now);
5420Sstevel@tonic-gate 				deadline += memscrub_period_sec;
5430Sstevel@tonic-gate 			}
5440Sstevel@tonic-gate 			reached_end = 0;
5450Sstevel@tonic-gate 			sleep_warn_printed = 0;
5460Sstevel@tonic-gate 		}
5470Sstevel@tonic-gate 
5480Sstevel@tonic-gate 		if (interval_ticks != 0) {
5490Sstevel@tonic-gate 			/*
5500Sstevel@tonic-gate 			 * it is safe from our standpoint for CPR to
5510Sstevel@tonic-gate 			 * suspend the system
5520Sstevel@tonic-gate 			 */
5530Sstevel@tonic-gate 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
5540Sstevel@tonic-gate 
5550Sstevel@tonic-gate 			/*
5560Sstevel@tonic-gate 			 * hit the snooze bar
5570Sstevel@tonic-gate 			 */
5580Sstevel@tonic-gate 			memscrub_tid = timeout(memscrub_wakeup, NULL,
5590Sstevel@tonic-gate 			    interval_ticks);
5600Sstevel@tonic-gate 
5610Sstevel@tonic-gate 			/*
5620Sstevel@tonic-gate 			 * go to sleep
5630Sstevel@tonic-gate 			 */
5640Sstevel@tonic-gate 			cv_wait(&memscrub_cv, &memscrub_lock);
5650Sstevel@tonic-gate 
5660Sstevel@tonic-gate 			/*
5670Sstevel@tonic-gate 			 * at this point, no timeout should be set
5680Sstevel@tonic-gate 			 */
5690Sstevel@tonic-gate 			memscrub_tid = 0;
5700Sstevel@tonic-gate 
5710Sstevel@tonic-gate 			/*
5720Sstevel@tonic-gate 			 * we need to goto work and will be modifying
5730Sstevel@tonic-gate 			 * our internal state and mapping/unmapping
5740Sstevel@tonic-gate 			 * TTEs
5750Sstevel@tonic-gate 			 */
5760Sstevel@tonic-gate 			CALLB_CPR_SAFE_END(&cprinfo, &memscrub_lock);
5770Sstevel@tonic-gate 		}
5780Sstevel@tonic-gate 
5790Sstevel@tonic-gate 
5800Sstevel@tonic-gate 		if (memscrub_phys_pages == 0) {
5810Sstevel@tonic-gate 			cmn_err(CE_WARN, "Memory scrubber has 0 pages to read");
5820Sstevel@tonic-gate 			goto memscrub_exit;
5830Sstevel@tonic-gate 		}
5840Sstevel@tonic-gate 
5850Sstevel@tonic-gate 		if (!pause_memscrub) {
5860Sstevel@tonic-gate 			if (paused_message) {
5870Sstevel@tonic-gate 				paused_message = 0;
5880Sstevel@tonic-gate 				if (memscrub_verbose)
5890Sstevel@tonic-gate 					cmn_err(CE_NOTE, "Memory scrubber "
5900Sstevel@tonic-gate 					    "resuming");
5910Sstevel@tonic-gate 			}
5920Sstevel@tonic-gate 
5930Sstevel@tonic-gate 			if (read_all_memscrub) {
5940Sstevel@tonic-gate 				if (memscrub_verbose)
5950Sstevel@tonic-gate 					cmn_err(CE_NOTE, "Memory scrubber "
5960Sstevel@tonic-gate 					    "reading all memory per request");
5970Sstevel@tonic-gate 
598*11474SJonathan.Adams@Sun.COM 				addr = memscrub_memlist->ml_address;
5990Sstevel@tonic-gate 				reached_end = 0;
6000Sstevel@tonic-gate 				while (!reached_end) {
6010Sstevel@tonic-gate 					if (disable_memscrub)
6020Sstevel@tonic-gate 						break;
6030Sstevel@tonic-gate 					pages = memscrub_phys_pages;
6040Sstevel@tonic-gate 					reached_end = memscrub_verify_span(
6050Sstevel@tonic-gate 					    &addr, &pages);
6060Sstevel@tonic-gate 					memscrub_scan(pages *
6070Sstevel@tonic-gate 					    MEMSCRUB_BLOCKS_PER_PAGE, addr);
6080Sstevel@tonic-gate 					addr += ((uint64_t)pages * PAGESIZE);
6090Sstevel@tonic-gate 				}
6100Sstevel@tonic-gate 				read_all_memscrub = 0;
6110Sstevel@tonic-gate 			}
6120Sstevel@tonic-gate 
6130Sstevel@tonic-gate 			/*
6140Sstevel@tonic-gate 			 * read 1 span
6150Sstevel@tonic-gate 			 */
6160Sstevel@tonic-gate 			pages = memscrub_span_pages;
6170Sstevel@tonic-gate 
6180Sstevel@tonic-gate 			if (disable_memscrub)
6190Sstevel@tonic-gate 				break;
6200Sstevel@tonic-gate 
6210Sstevel@tonic-gate 			/*
6220Sstevel@tonic-gate 			 * determine physical address range
6230Sstevel@tonic-gate 			 */
6240Sstevel@tonic-gate 			reached_end = memscrub_verify_span(&address,
6250Sstevel@tonic-gate 			    &pages);
6260Sstevel@tonic-gate 
6270Sstevel@tonic-gate 			memscrub_scan(pages * MEMSCRUB_BLOCKS_PER_PAGE,
6280Sstevel@tonic-gate 			    address);
6290Sstevel@tonic-gate 
6300Sstevel@tonic-gate 			address += ((uint64_t)pages * PAGESIZE);
6310Sstevel@tonic-gate 		}
6320Sstevel@tonic-gate 
6330Sstevel@tonic-gate 		if (pause_memscrub && !paused_message) {
6340Sstevel@tonic-gate 			paused_message = 1;
6350Sstevel@tonic-gate 			if (memscrub_verbose)
6360Sstevel@tonic-gate 				cmn_err(CE_NOTE, "Memory scrubber paused");
6370Sstevel@tonic-gate 		}
6380Sstevel@tonic-gate 	}
6390Sstevel@tonic-gate 
6400Sstevel@tonic-gate memscrub_exit:
6410Sstevel@tonic-gate 	cmn_err(CE_NOTE, "Memory scrubber exiting");
6420Sstevel@tonic-gate 	CALLB_CPR_EXIT(&cprinfo);
6430Sstevel@tonic-gate 	memscrub_cleanup();
6440Sstevel@tonic-gate 	thread_exit();
6450Sstevel@tonic-gate 	/* NOTREACHED */
6460Sstevel@tonic-gate }
6470Sstevel@tonic-gate 
6480Sstevel@tonic-gate /*
6490Sstevel@tonic-gate  * condition address and size
6500Sstevel@tonic-gate  * such that they span legal physical addresses.
6510Sstevel@tonic-gate  *
6520Sstevel@tonic-gate  * when appropriate, address will be rounded up to start of next
6530Sstevel@tonic-gate  * struct memlist, and pages will be rounded down to the end of the
6540Sstevel@tonic-gate  * memlist size.
6550Sstevel@tonic-gate  *
6560Sstevel@tonic-gate  * returns 1 if reached end of list, else returns 0.
6570Sstevel@tonic-gate  */
6580Sstevel@tonic-gate static int
6590Sstevel@tonic-gate memscrub_verify_span(ms_paddr_t *addrp, pgcnt_t *pagesp)
6600Sstevel@tonic-gate {
6610Sstevel@tonic-gate 	struct memlist *mlp;
6620Sstevel@tonic-gate 	ms_paddr_t address = *addrp;
6630Sstevel@tonic-gate 	uint64_t bytes = (uint64_t)*pagesp * PAGESIZE;
6640Sstevel@tonic-gate 	uint64_t bytes_remaining;
6650Sstevel@tonic-gate 	int reached_end = 0;
6660Sstevel@tonic-gate 
6670Sstevel@tonic-gate 	ASSERT(mutex_owned(&memscrub_lock));
6680Sstevel@tonic-gate 
6690Sstevel@tonic-gate 	/*
6700Sstevel@tonic-gate 	 * find memlist struct that contains addrp
6710Sstevel@tonic-gate 	 * assumes memlist is sorted by ascending address.
6720Sstevel@tonic-gate 	 */
673*11474SJonathan.Adams@Sun.COM 	for (mlp = memscrub_memlist; mlp != NULL; mlp = mlp->ml_next) {
6740Sstevel@tonic-gate 		/*
6750Sstevel@tonic-gate 		 * if before this chunk, round up to beginning
6760Sstevel@tonic-gate 		 */
677*11474SJonathan.Adams@Sun.COM 		if (address < mlp->ml_address) {
678*11474SJonathan.Adams@Sun.COM 			address = mlp->ml_address;
6790Sstevel@tonic-gate 			break;
6800Sstevel@tonic-gate 		}
6810Sstevel@tonic-gate 		/*
6820Sstevel@tonic-gate 		 * if before end of chunk, then we found it
6830Sstevel@tonic-gate 		 */
684*11474SJonathan.Adams@Sun.COM 		if (address < (mlp->ml_address + mlp->ml_size))
6850Sstevel@tonic-gate 			break;
6860Sstevel@tonic-gate 
6870Sstevel@tonic-gate 		/* else go to next struct memlist */
6880Sstevel@tonic-gate 	}
6890Sstevel@tonic-gate 	/*
6900Sstevel@tonic-gate 	 * if we hit end of list, start at beginning
6910Sstevel@tonic-gate 	 */
6920Sstevel@tonic-gate 	if (mlp == NULL) {
6930Sstevel@tonic-gate 		mlp = memscrub_memlist;
694*11474SJonathan.Adams@Sun.COM 		address = mlp->ml_address;
6950Sstevel@tonic-gate 	}
6960Sstevel@tonic-gate 
6970Sstevel@tonic-gate 	/*
6980Sstevel@tonic-gate 	 * now we have legal address, and its mlp, condition bytes
6990Sstevel@tonic-gate 	 */
700*11474SJonathan.Adams@Sun.COM 	bytes_remaining = (mlp->ml_address + mlp->ml_size) - address;
7010Sstevel@tonic-gate 
7020Sstevel@tonic-gate 	if (bytes > bytes_remaining)
7030Sstevel@tonic-gate 		bytes = bytes_remaining;
7040Sstevel@tonic-gate 
7050Sstevel@tonic-gate 	/*
7060Sstevel@tonic-gate 	 * will this span take us to end of list?
7070Sstevel@tonic-gate 	 */
708*11474SJonathan.Adams@Sun.COM 	if ((mlp->ml_next == NULL) &&
709*11474SJonathan.Adams@Sun.COM 	    ((mlp->ml_address + mlp->ml_size) == (address + bytes)))
7100Sstevel@tonic-gate 		reached_end = 1;
7110Sstevel@tonic-gate 
7120Sstevel@tonic-gate 	/* return values */
7130Sstevel@tonic-gate 	*addrp = address;
7140Sstevel@tonic-gate 	*pagesp = bytes / PAGESIZE;
7150Sstevel@tonic-gate 
7160Sstevel@tonic-gate 	return (reached_end);
7170Sstevel@tonic-gate }
7180Sstevel@tonic-gate 
7190Sstevel@tonic-gate /*
7200Sstevel@tonic-gate  * add a span to the memscrub list
7210Sstevel@tonic-gate  * add to memscrub_phys_pages
7220Sstevel@tonic-gate  */
7230Sstevel@tonic-gate int
7240Sstevel@tonic-gate memscrub_add_span(pfn_t pfn, pgcnt_t pages)
7250Sstevel@tonic-gate {
7260Sstevel@tonic-gate #ifdef MEMSCRUB_DEBUG
7270Sstevel@tonic-gate 	ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
7280Sstevel@tonic-gate 	uint64_t bytes = (uint64_t)pages << PAGESHIFT;
7290Sstevel@tonic-gate #endif /* MEMSCRUB_DEBUG */
7300Sstevel@tonic-gate 
7310Sstevel@tonic-gate 	int retval;
7320Sstevel@tonic-gate 
7330Sstevel@tonic-gate 	mutex_enter(&memscrub_lock);
7340Sstevel@tonic-gate 
7350Sstevel@tonic-gate #ifdef MEMSCRUB_DEBUG
7360Sstevel@tonic-gate 	memscrub_printmemlist("memscrub_memlist before", memscrub_memlist);
7370Sstevel@tonic-gate 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
7380Sstevel@tonic-gate 	cmn_err(CE_CONT, "memscrub_add_span: address: 0x%llx"
7390Sstevel@tonic-gate 	    " size: 0x%llx\n", address, bytes);
7400Sstevel@tonic-gate #endif /* MEMSCRUB_DEBUG */
7410Sstevel@tonic-gate 
7420Sstevel@tonic-gate 	retval = memscrub_add_span_gen(pfn, pages, &memscrub_memlist,
7430Sstevel@tonic-gate 	    &memscrub_phys_pages);
7440Sstevel@tonic-gate 
7450Sstevel@tonic-gate #ifdef MEMSCRUB_DEBUG
7460Sstevel@tonic-gate 	memscrub_printmemlist("memscrub_memlist after", memscrub_memlist);
7470Sstevel@tonic-gate 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
7480Sstevel@tonic-gate #endif /* MEMSCRUB_DEBUG */
7490Sstevel@tonic-gate 
7500Sstevel@tonic-gate 	mutex_exit(&memscrub_lock);
7510Sstevel@tonic-gate 
7520Sstevel@tonic-gate 	return (retval);
7530Sstevel@tonic-gate }
7540Sstevel@tonic-gate 
7550Sstevel@tonic-gate static int
7560Sstevel@tonic-gate memscrub_add_span_gen(
7570Sstevel@tonic-gate 	pfn_t pfn,
7580Sstevel@tonic-gate 	pgcnt_t pages,
7590Sstevel@tonic-gate 	struct memlist **list,
7600Sstevel@tonic-gate 	uint_t *npgs)
7610Sstevel@tonic-gate {
7620Sstevel@tonic-gate 	ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
7630Sstevel@tonic-gate 	uint64_t bytes = (uint64_t)pages << PAGESHIFT;
7640Sstevel@tonic-gate 	struct memlist *dst;
7650Sstevel@tonic-gate 	struct memlist *prev, *next;
7660Sstevel@tonic-gate 	int retval = 0;
7670Sstevel@tonic-gate 
7680Sstevel@tonic-gate 	/*
7690Sstevel@tonic-gate 	 * allocate a new struct memlist
7700Sstevel@tonic-gate 	 */
7710Sstevel@tonic-gate 
7720Sstevel@tonic-gate 	dst = (struct memlist *)
7730Sstevel@tonic-gate 	    kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);
7740Sstevel@tonic-gate 
7750Sstevel@tonic-gate 	if (dst == NULL) {
7760Sstevel@tonic-gate 		retval = -1;
7770Sstevel@tonic-gate 		goto add_done;
7780Sstevel@tonic-gate 	}
7790Sstevel@tonic-gate 
780*11474SJonathan.Adams@Sun.COM 	dst->ml_address = address;
781*11474SJonathan.Adams@Sun.COM 	dst->ml_size = bytes;
7820Sstevel@tonic-gate 
7830Sstevel@tonic-gate 	/*
7840Sstevel@tonic-gate 	 * first insert
7850Sstevel@tonic-gate 	 */
7860Sstevel@tonic-gate 	if (*list == NULL) {
787*11474SJonathan.Adams@Sun.COM 		dst->ml_prev = NULL;
788*11474SJonathan.Adams@Sun.COM 		dst->ml_next = NULL;
7890Sstevel@tonic-gate 		*list = dst;
7900Sstevel@tonic-gate 
7910Sstevel@tonic-gate 		goto add_done;
7920Sstevel@tonic-gate 	}
7930Sstevel@tonic-gate 
7940Sstevel@tonic-gate 	/*
7950Sstevel@tonic-gate 	 * insert into sorted list
7960Sstevel@tonic-gate 	 */
7970Sstevel@tonic-gate 	for (prev = NULL, next = *list;
7980Sstevel@tonic-gate 	    next != NULL;
799*11474SJonathan.Adams@Sun.COM 	    prev = next, next = next->ml_next) {
800*11474SJonathan.Adams@Sun.COM 		if (address > (next->ml_address + next->ml_size))
8010Sstevel@tonic-gate 			continue;
8020Sstevel@tonic-gate 
8030Sstevel@tonic-gate 		/*
8040Sstevel@tonic-gate 		 * else insert here
8050Sstevel@tonic-gate 		 */
8060Sstevel@tonic-gate 
8070Sstevel@tonic-gate 		/*
8080Sstevel@tonic-gate 		 * prepend to next
8090Sstevel@tonic-gate 		 */
810*11474SJonathan.Adams@Sun.COM 		if ((address + bytes) == next->ml_address) {
8110Sstevel@tonic-gate 			kmem_free(dst, sizeof (struct memlist));
8120Sstevel@tonic-gate 
813*11474SJonathan.Adams@Sun.COM 			next->ml_address = address;
814*11474SJonathan.Adams@Sun.COM 			next->ml_size += bytes;
8150Sstevel@tonic-gate 
8160Sstevel@tonic-gate 			goto add_done;
8170Sstevel@tonic-gate 		}
8180Sstevel@tonic-gate 
8190Sstevel@tonic-gate 		/*
8200Sstevel@tonic-gate 		 * append to next
8210Sstevel@tonic-gate 		 */
822*11474SJonathan.Adams@Sun.COM 		if (address == (next->ml_address + next->ml_size)) {
8230Sstevel@tonic-gate 			kmem_free(dst, sizeof (struct memlist));
8240Sstevel@tonic-gate 
825*11474SJonathan.Adams@Sun.COM 			if (next->ml_next) {
8260Sstevel@tonic-gate 				/*
827*11474SJonathan.Adams@Sun.COM 				 * don't overlap with next->ml_next
8280Sstevel@tonic-gate 				 */
829*11474SJonathan.Adams@Sun.COM 				if ((address + bytes) >
830*11474SJonathan.Adams@Sun.COM 				    next->ml_next->ml_address) {
8310Sstevel@tonic-gate 					retval = -1;
8320Sstevel@tonic-gate 					goto add_done;
8330Sstevel@tonic-gate 				}
8340Sstevel@tonic-gate 				/*
835*11474SJonathan.Adams@Sun.COM 				 * concatenate next and next->ml_next
8360Sstevel@tonic-gate 				 */
837*11474SJonathan.Adams@Sun.COM 				if ((address + bytes) ==
838*11474SJonathan.Adams@Sun.COM 				    next->ml_next->ml_address) {
839*11474SJonathan.Adams@Sun.COM 					struct memlist *mlp = next->ml_next;
8400Sstevel@tonic-gate 
8410Sstevel@tonic-gate 					if (next == *list)
842*11474SJonathan.Adams@Sun.COM 						*list = next->ml_next;
8430Sstevel@tonic-gate 
844*11474SJonathan.Adams@Sun.COM 					mlp->ml_address = next->ml_address;
845*11474SJonathan.Adams@Sun.COM 					mlp->ml_size += next->ml_size;
846*11474SJonathan.Adams@Sun.COM 					mlp->ml_size += bytes;
8470Sstevel@tonic-gate 
848*11474SJonathan.Adams@Sun.COM 					if (next->ml_prev)
849*11474SJonathan.Adams@Sun.COM 						next->ml_prev->ml_next = mlp;
850*11474SJonathan.Adams@Sun.COM 					mlp->ml_prev = next->ml_prev;
8510Sstevel@tonic-gate 
8520Sstevel@tonic-gate 					kmem_free(next,
853*11474SJonathan.Adams@Sun.COM 					    sizeof (struct memlist));
8540Sstevel@tonic-gate 					goto add_done;
8550Sstevel@tonic-gate 				}
8560Sstevel@tonic-gate 			}
8570Sstevel@tonic-gate 
858*11474SJonathan.Adams@Sun.COM 			next->ml_size += bytes;
8590Sstevel@tonic-gate 
8600Sstevel@tonic-gate 			goto add_done;
8610Sstevel@tonic-gate 		}
8620Sstevel@tonic-gate 
8630Sstevel@tonic-gate 		/* don't overlap with next */
864*11474SJonathan.Adams@Sun.COM 		if ((address + bytes) > next->ml_address) {
8650Sstevel@tonic-gate 			retval = -1;
8660Sstevel@tonic-gate 			kmem_free(dst, sizeof (struct memlist));
8670Sstevel@tonic-gate 			goto add_done;
8680Sstevel@tonic-gate 		}
8690Sstevel@tonic-gate 
8700Sstevel@tonic-gate 		/*
8710Sstevel@tonic-gate 		 * insert before next
8720Sstevel@tonic-gate 		 */
873*11474SJonathan.Adams@Sun.COM 		dst->ml_prev = prev;
874*11474SJonathan.Adams@Sun.COM 		dst->ml_next = next;
875*11474SJonathan.Adams@Sun.COM 		next->ml_prev = dst;
8760Sstevel@tonic-gate 		if (prev == NULL) {
8770Sstevel@tonic-gate 			*list = dst;
8780Sstevel@tonic-gate 		} else {
879*11474SJonathan.Adams@Sun.COM 			prev->ml_next = dst;
8800Sstevel@tonic-gate 		}
8810Sstevel@tonic-gate 		goto add_done;
8820Sstevel@tonic-gate 	}	/* end for */
8830Sstevel@tonic-gate 
8840Sstevel@tonic-gate 	/*
8850Sstevel@tonic-gate 	 * end of list, prev is valid and next is NULL
8860Sstevel@tonic-gate 	 */
887*11474SJonathan.Adams@Sun.COM 	prev->ml_next = dst;
888*11474SJonathan.Adams@Sun.COM 	dst->ml_prev = prev;
889*11474SJonathan.Adams@Sun.COM 	dst->ml_next = NULL;
8900Sstevel@tonic-gate 
8910Sstevel@tonic-gate add_done:
8920Sstevel@tonic-gate 
8930Sstevel@tonic-gate 	if (retval != -1)
8940Sstevel@tonic-gate 		*npgs += pages;
8950Sstevel@tonic-gate 
8960Sstevel@tonic-gate 	return (retval);
8970Sstevel@tonic-gate }
8980Sstevel@tonic-gate 
8990Sstevel@tonic-gate /*
9000Sstevel@tonic-gate  * delete a span from the memscrub list
9010Sstevel@tonic-gate  * subtract from memscrub_phys_pages
9020Sstevel@tonic-gate  */
9030Sstevel@tonic-gate int
9040Sstevel@tonic-gate memscrub_delete_span(pfn_t pfn, pgcnt_t pages)
9050Sstevel@tonic-gate {
9060Sstevel@tonic-gate 	ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
9070Sstevel@tonic-gate 	uint64_t bytes = (uint64_t)pages << PAGESHIFT;
9080Sstevel@tonic-gate 	struct memlist *dst, *next;
9090Sstevel@tonic-gate 	int retval = 0;
9100Sstevel@tonic-gate 
9110Sstevel@tonic-gate 	mutex_enter(&memscrub_lock);
9120Sstevel@tonic-gate 
9130Sstevel@tonic-gate #ifdef MEMSCRUB_DEBUG
9140Sstevel@tonic-gate 	memscrub_printmemlist("memscrub_memlist Before", memscrub_memlist);
9150Sstevel@tonic-gate 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
9160Sstevel@tonic-gate 	cmn_err(CE_CONT, "memscrub_delete_span: 0x%llx 0x%llx\n",
9170Sstevel@tonic-gate 	    address, bytes);
9180Sstevel@tonic-gate #endif /* MEMSCRUB_DEBUG */
9190Sstevel@tonic-gate 
9200Sstevel@tonic-gate 	/*
9210Sstevel@tonic-gate 	 * find struct memlist containing page
9220Sstevel@tonic-gate 	 */
923*11474SJonathan.Adams@Sun.COM 	for (next = memscrub_memlist; next != NULL; next = next->ml_next) {
924*11474SJonathan.Adams@Sun.COM 		if ((address >= next->ml_address) &&
925*11474SJonathan.Adams@Sun.COM 		    (address < next->ml_address + next->ml_size))
9260Sstevel@tonic-gate 			break;
9270Sstevel@tonic-gate 	}
9280Sstevel@tonic-gate 
9290Sstevel@tonic-gate 	/*
9300Sstevel@tonic-gate 	 * if start address not in list
9310Sstevel@tonic-gate 	 */
9320Sstevel@tonic-gate 	if (next == NULL) {
9330Sstevel@tonic-gate 		retval = -1;
9340Sstevel@tonic-gate 		goto delete_done;
9350Sstevel@tonic-gate 	}
9360Sstevel@tonic-gate 
9370Sstevel@tonic-gate 	/*
9380Sstevel@tonic-gate 	 * error if size goes off end of this struct memlist
9390Sstevel@tonic-gate 	 */
940*11474SJonathan.Adams@Sun.COM 	if (address + bytes > next->ml_address + next->ml_size) {
9410Sstevel@tonic-gate 		retval = -1;
9420Sstevel@tonic-gate 		goto delete_done;
9430Sstevel@tonic-gate 	}
9440Sstevel@tonic-gate 
9450Sstevel@tonic-gate 	/*
9460Sstevel@tonic-gate 	 * pages at beginning of struct memlist
9470Sstevel@tonic-gate 	 */
948*11474SJonathan.Adams@Sun.COM 	if (address == next->ml_address) {
9490Sstevel@tonic-gate 		/*
9500Sstevel@tonic-gate 		 * if start & size match, delete from list
9510Sstevel@tonic-gate 		 */
952*11474SJonathan.Adams@Sun.COM 		if (bytes == next->ml_size) {
9530Sstevel@tonic-gate 			if (next == memscrub_memlist)
954*11474SJonathan.Adams@Sun.COM 				memscrub_memlist = next->ml_next;
955*11474SJonathan.Adams@Sun.COM 			if (next->ml_prev != NULL)
956*11474SJonathan.Adams@Sun.COM 				next->ml_prev->ml_next = next->ml_next;
957*11474SJonathan.Adams@Sun.COM 			if (next->ml_next != NULL)
958*11474SJonathan.Adams@Sun.COM 				next->ml_next->ml_prev = next->ml_prev;
9590Sstevel@tonic-gate 
9600Sstevel@tonic-gate 			kmem_free(next, sizeof (struct memlist));
9610Sstevel@tonic-gate 		} else {
9620Sstevel@tonic-gate 		/*
9630Sstevel@tonic-gate 		 * increment start address by bytes
9640Sstevel@tonic-gate 		 */
965*11474SJonathan.Adams@Sun.COM 			next->ml_address += bytes;
966*11474SJonathan.Adams@Sun.COM 			next->ml_size -= bytes;
9670Sstevel@tonic-gate 		}
9680Sstevel@tonic-gate 		goto delete_done;
9690Sstevel@tonic-gate 	}
9700Sstevel@tonic-gate 
9710Sstevel@tonic-gate 	/*
9720Sstevel@tonic-gate 	 * pages at end of struct memlist
9730Sstevel@tonic-gate 	 */
974*11474SJonathan.Adams@Sun.COM 	if (address + bytes == next->ml_address + next->ml_size) {
9750Sstevel@tonic-gate 		/*
9760Sstevel@tonic-gate 		 * decrement size by bytes
9770Sstevel@tonic-gate 		 */
978*11474SJonathan.Adams@Sun.COM 		next->ml_size -= bytes;
9790Sstevel@tonic-gate 		goto delete_done;
9800Sstevel@tonic-gate 	}
9810Sstevel@tonic-gate 
9820Sstevel@tonic-gate 	/*
9830Sstevel@tonic-gate 	 * delete a span in the middle of the struct memlist
9840Sstevel@tonic-gate 	 */
9850Sstevel@tonic-gate 	{
9860Sstevel@tonic-gate 		/*
9870Sstevel@tonic-gate 		 * create a new struct memlist
9880Sstevel@tonic-gate 		 */
9890Sstevel@tonic-gate 		dst = (struct memlist *)
9900Sstevel@tonic-gate 		    kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);
9910Sstevel@tonic-gate 
9920Sstevel@tonic-gate 		if (dst == NULL) {
9930Sstevel@tonic-gate 			retval = -1;
9940Sstevel@tonic-gate 			goto delete_done;
9950Sstevel@tonic-gate 		}
9960Sstevel@tonic-gate 
9970Sstevel@tonic-gate 		/*
9980Sstevel@tonic-gate 		 * existing struct memlist gets address
9990Sstevel@tonic-gate 		 * and size up to pfn
10000Sstevel@tonic-gate 		 */
1001*11474SJonathan.Adams@Sun.COM 		dst->ml_address = address + bytes;
1002*11474SJonathan.Adams@Sun.COM 		dst->ml_size =
1003*11474SJonathan.Adams@Sun.COM 		    (next->ml_address + next->ml_size) - dst->ml_address;
1004*11474SJonathan.Adams@Sun.COM 		next->ml_size = address - next->ml_address;
10050Sstevel@tonic-gate 
10060Sstevel@tonic-gate 		/*
10070Sstevel@tonic-gate 		 * new struct memlist gets address starting
10080Sstevel@tonic-gate 		 * after pfn, until end
10090Sstevel@tonic-gate 		 */
10100Sstevel@tonic-gate 
10110Sstevel@tonic-gate 		/*
10120Sstevel@tonic-gate 		 * link in new memlist after old
10130Sstevel@tonic-gate 		 */
1014*11474SJonathan.Adams@Sun.COM 		dst->ml_next = next->ml_next;
1015*11474SJonathan.Adams@Sun.COM 		dst->ml_prev = next;
10160Sstevel@tonic-gate 
1017*11474SJonathan.Adams@Sun.COM 		if (next->ml_next != NULL)
1018*11474SJonathan.Adams@Sun.COM 			next->ml_next->ml_prev = dst;
1019*11474SJonathan.Adams@Sun.COM 		next->ml_next = dst;
10200Sstevel@tonic-gate 	}
10210Sstevel@tonic-gate 
10220Sstevel@tonic-gate delete_done:
10230Sstevel@tonic-gate 	if (retval != -1) {
10240Sstevel@tonic-gate 		memscrub_phys_pages -= pages;
10250Sstevel@tonic-gate 		if (memscrub_phys_pages == 0)
10260Sstevel@tonic-gate 			disable_memscrub = 1;
10270Sstevel@tonic-gate 	}
10280Sstevel@tonic-gate 
10290Sstevel@tonic-gate #ifdef MEMSCRUB_DEBUG
10300Sstevel@tonic-gate 	memscrub_printmemlist("memscrub_memlist After", memscrub_memlist);
10310Sstevel@tonic-gate 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
10320Sstevel@tonic-gate #endif /* MEMSCRUB_DEBUG */
10330Sstevel@tonic-gate 
10340Sstevel@tonic-gate 	mutex_exit(&memscrub_lock);
10350Sstevel@tonic-gate 	return (retval);
10360Sstevel@tonic-gate }
10370Sstevel@tonic-gate 
10380Sstevel@tonic-gate static void
10390Sstevel@tonic-gate memscrub_scan(uint_t blks, ms_paddr_t src)
10400Sstevel@tonic-gate {
10410Sstevel@tonic-gate 	uint_t 		psz, bpp, pgsread;
10420Sstevel@tonic-gate 	pfn_t		pfn;
10430Sstevel@tonic-gate 	ms_paddr_t	pa;
10440Sstevel@tonic-gate 	caddr_t		va;
10450Sstevel@tonic-gate 	on_trap_data_t	otd;
10462895Svb70745 	int		scan_mmu_pagesize = 0;
10472895Svb70745 	int		retired_pages = 0;
10480Sstevel@tonic-gate 
10490Sstevel@tonic-gate 	extern void memscrub_read(caddr_t src, uint_t blks);
10500Sstevel@tonic-gate 
10510Sstevel@tonic-gate 	ASSERT(mutex_owned(&memscrub_lock));
10520Sstevel@tonic-gate 
10530Sstevel@tonic-gate 	pgsread = 0;
10540Sstevel@tonic-gate 	pa = src;
10550Sstevel@tonic-gate 
10562895Svb70745 	if (memscrub_page_retire_span_list != NULL) {
10572895Svb70745 		if (memscrub_page_retire_span_search(src)) {
10582895Svb70745 			/* retired pages in current span */
10592895Svb70745 			scan_mmu_pagesize = 1;
10602895Svb70745 		}
10612895Svb70745 	}
10622895Svb70745 
10632895Svb70745 #ifdef MEMSCRUB_DEBUG
10642895Svb70745 	cmn_err(CE_NOTE, "scan_mmu_pagesize = %d\n" scan_mmu_pagesize);
10652895Svb70745 #endif /* MEMSCRUB_DEBUG */
10662895Svb70745 
10670Sstevel@tonic-gate 	while (blks != 0) {
10680Sstevel@tonic-gate 		/* Ensure the PA is properly aligned */
10690Sstevel@tonic-gate 		if (((pa & MMU_PAGEMASK4M) == pa) &&
1070*11474SJonathan.Adams@Sun.COM 		    (blks >= MEMSCRUB_BPP4M)) {
10710Sstevel@tonic-gate 			psz = MMU_PAGESIZE4M;
10720Sstevel@tonic-gate 			bpp = MEMSCRUB_BPP4M;
10730Sstevel@tonic-gate 		} else if (((pa & MMU_PAGEMASK512K) == pa) &&
1074*11474SJonathan.Adams@Sun.COM 		    (blks >= MEMSCRUB_BPP512K)) {
10750Sstevel@tonic-gate 			psz = MMU_PAGESIZE512K;
10760Sstevel@tonic-gate 			bpp = MEMSCRUB_BPP512K;
10770Sstevel@tonic-gate 		} else if (((pa & MMU_PAGEMASK64K) == pa) &&
1078*11474SJonathan.Adams@Sun.COM 		    (blks >= MEMSCRUB_BPP64K)) {
10790Sstevel@tonic-gate 			psz = MMU_PAGESIZE64K;
10800Sstevel@tonic-gate 			bpp = MEMSCRUB_BPP64K;
10810Sstevel@tonic-gate 		} else if ((pa & MMU_PAGEMASK) == pa) {
10820Sstevel@tonic-gate 			psz = MMU_PAGESIZE;
10830Sstevel@tonic-gate 			bpp = MEMSCRUB_BPP;
10840Sstevel@tonic-gate 		} else {
10850Sstevel@tonic-gate 			if (memscrub_verbose) {
10860Sstevel@tonic-gate 				cmn_err(CE_NOTE, "Memory scrubber ignoring "
10870Sstevel@tonic-gate 				    "non-page aligned block starting at 0x%"
10880Sstevel@tonic-gate 				    PRIx64, src);
10890Sstevel@tonic-gate 			}
10900Sstevel@tonic-gate 			return;
10910Sstevel@tonic-gate 		}
10920Sstevel@tonic-gate 		if (blks < bpp) bpp = blks;
10930Sstevel@tonic-gate 
10940Sstevel@tonic-gate #ifdef MEMSCRUB_DEBUG
10950Sstevel@tonic-gate 		cmn_err(CE_NOTE, "Going to run psz=%x, "
10960Sstevel@tonic-gate 		    "bpp=%x pa=%llx\n", psz, bpp, pa);
10970Sstevel@tonic-gate #endif /* MEMSCRUB_DEBUG */
10980Sstevel@tonic-gate 
10990Sstevel@tonic-gate 		/*
11000Sstevel@tonic-gate 		 * MEMSCRUBBASE is a 4MB aligned page in the
11010Sstevel@tonic-gate 		 * kernel so that we can quickly map the PA
11020Sstevel@tonic-gate 		 * to a VA for the block loads performed in
11030Sstevel@tonic-gate 		 * memscrub_read.
11040Sstevel@tonic-gate 		 */
11050Sstevel@tonic-gate 		pfn = mmu_btop(pa);
11060Sstevel@tonic-gate 		va = (caddr_t)MEMSCRUBBASE;
11070Sstevel@tonic-gate 		hat_devload(kas.a_hat, va, psz, pfn, PROT_READ,
1108*11474SJonathan.Adams@Sun.COM 		    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
11090Sstevel@tonic-gate 
11100Sstevel@tonic-gate 		/*
11110Sstevel@tonic-gate 		 * Can't allow the memscrubber to migrate across CPUs as
11120Sstevel@tonic-gate 		 * we need to know whether CEEN is enabled for the current
11130Sstevel@tonic-gate 		 * CPU to enable us to scrub the memory. Don't use
11140Sstevel@tonic-gate 		 * kpreempt_disable as the time we take to scan a span (even
11150Sstevel@tonic-gate 		 * without cpu_check_ce having to manually cpu_check_block)
11160Sstevel@tonic-gate 		 * is too long to hold a higher priority thread (eg, RT)
11170Sstevel@tonic-gate 		 * off cpu.
11180Sstevel@tonic-gate 		 */
11190Sstevel@tonic-gate 		thread_affinity_set(curthread, CPU_CURRENT);
11200Sstevel@tonic-gate 
11210Sstevel@tonic-gate 		/*
11220Sstevel@tonic-gate 		 * Protect read scrub from async faults.  For now, we simply
11230Sstevel@tonic-gate 		 * maintain a count of such faults caught.
11240Sstevel@tonic-gate 		 */
11250Sstevel@tonic-gate 
11262895Svb70745 		if (!scan_mmu_pagesize && !on_trap(&otd, OT_DATA_EC)) {
11270Sstevel@tonic-gate 			memscrub_read(va, bpp);
11280Sstevel@tonic-gate 			/*
11290Sstevel@tonic-gate 			 * Check if CEs require logging
11300Sstevel@tonic-gate 			 */
11310Sstevel@tonic-gate 			cpu_check_ce(SCRUBBER_CEEN_CHECK,
11320Sstevel@tonic-gate 			    (uint64_t)pa, va, psz);
1133102Srjnoe 			no_trap();
11340Sstevel@tonic-gate 			thread_affinity_clear(curthread);
11350Sstevel@tonic-gate 		} else {
11360Sstevel@tonic-gate 			no_trap();
11370Sstevel@tonic-gate 			thread_affinity_clear(curthread);
11380Sstevel@tonic-gate 
11390Sstevel@tonic-gate 			/*
11400Sstevel@tonic-gate 			 * Got an async error..
11410Sstevel@tonic-gate 			 * Try rescanning it at MMU_PAGESIZE
11420Sstevel@tonic-gate 			 * granularity if we were trying to
11430Sstevel@tonic-gate 			 * read at a larger page size.
11440Sstevel@tonic-gate 			 * This is to ensure we continue to
11450Sstevel@tonic-gate 			 * scan the rest of the span.
11462895Svb70745 			 * OR scanning MMU_PAGESIZE granularity to avoid
11472895Svb70745 			 * reading retired pages memory when scan_mmu_pagesize
11482895Svb70745 			 * is set.
11490Sstevel@tonic-gate 			 */
11502895Svb70745 			if (psz > MMU_PAGESIZE || scan_mmu_pagesize) {
11510Sstevel@tonic-gate 			    caddr_t vaddr = va;
11520Sstevel@tonic-gate 			    ms_paddr_t paddr = pa;
11530Sstevel@tonic-gate 			    int tmp = 0;
11540Sstevel@tonic-gate 			    for (; tmp < bpp; tmp += MEMSCRUB_BPP) {
11552895Svb70745 				/* Don't scrub retired pages */
11562895Svb70745 				if (page_retire_check(paddr, NULL) == 0) {
11572895Svb70745 					vaddr += MMU_PAGESIZE;
11582895Svb70745 					paddr += MMU_PAGESIZE;
11592895Svb70745 					retired_pages++;
11602895Svb70745 					continue;
11612895Svb70745 				}
11620Sstevel@tonic-gate 				thread_affinity_set(curthread, CPU_CURRENT);
1163102Srjnoe 				if (!on_trap(&otd, OT_DATA_EC)) {
11640Sstevel@tonic-gate 				    memscrub_read(vaddr, MEMSCRUB_BPP);
1165102Srjnoe 				    cpu_check_ce(SCRUBBER_CEEN_CHECK,
1166102Srjnoe 					(uint64_t)paddr, vaddr, MMU_PAGESIZE);
1167102Srjnoe 				    no_trap();
1168102Srjnoe 				} else {
1169102Srjnoe 				    no_trap();
11700Sstevel@tonic-gate 				    memscrub_counts.errors_found.value.ui32++;
1171102Srjnoe 				}
11720Sstevel@tonic-gate 				thread_affinity_clear(curthread);
11730Sstevel@tonic-gate 				vaddr += MMU_PAGESIZE;
11740Sstevel@tonic-gate 				paddr += MMU_PAGESIZE;
11750Sstevel@tonic-gate 			    }
11760Sstevel@tonic-gate 			}
11770Sstevel@tonic-gate 		}
11780Sstevel@tonic-gate 		hat_unload(kas.a_hat, va, psz, HAT_UNLOAD_UNLOCK);
11790Sstevel@tonic-gate 
11800Sstevel@tonic-gate 		blks -= bpp;
11810Sstevel@tonic-gate 		pa += psz;
11820Sstevel@tonic-gate 		pgsread++;
11830Sstevel@tonic-gate 	}
11842895Svb70745 
11852895Svb70745 	/*
11862895Svb70745 	 * If just finished scrubbing MMU_PAGESIZE at a time, but no retired
11872895Svb70745 	 * pages found so delete span from global list.
11882895Svb70745 	 */
11892895Svb70745 	if (scan_mmu_pagesize && retired_pages == 0)
11902895Svb70745 		memscrub_page_retire_span_delete(src);
11912895Svb70745 
11922895Svb70745 	/*
11932895Svb70745 	 * Encountered CE/UE on a retired page during memscrub read of current
11942895Svb70745 	 * span.  Adding span to global list to enable avoid reading further.
11952895Svb70745 	 */
11962895Svb70745 	if (add_to_page_retire_list) {
11972895Svb70745 		if (!memscrub_page_retire_span_search(src))
11982895Svb70745 			memscrub_page_retire_span_add(src);
11992895Svb70745 		add_to_page_retire_list = 0;
12002895Svb70745 	}
12012895Svb70745 
12020Sstevel@tonic-gate 	if (memscrub_verbose) {
12030Sstevel@tonic-gate 		cmn_err(CE_NOTE, "Memory scrubber read 0x%x pages starting "
12040Sstevel@tonic-gate 		    "at 0x%" PRIx64, pgsread, src);
12050Sstevel@tonic-gate 	}
12060Sstevel@tonic-gate }
12070Sstevel@tonic-gate 
12080Sstevel@tonic-gate /*
12092895Svb70745  * Called by cpu_async_log_err() when memscrub read causes
12102895Svb70745  * CE/UE on a retired page.
12112895Svb70745  */
12122895Svb70745 void
12132895Svb70745 memscrub_induced_error(void)
12142895Svb70745 {
12152895Svb70745 	add_to_page_retire_list = 1;
12162895Svb70745 }
12172895Svb70745 
12182895Svb70745 
12192895Svb70745 /*
12202895Svb70745  * Called by memscrub_scan().
12212895Svb70745  * pa: physical address of span with CE/UE, add to global list.
12222895Svb70745  */
12232895Svb70745 static void
12242895Svb70745 memscrub_page_retire_span_add(ms_paddr_t pa)
12252895Svb70745 {
12262895Svb70745 	memscrub_page_retire_span_t *new_span;
12272895Svb70745 
12282895Svb70745 	new_span = (memscrub_page_retire_span_t *)
12292895Svb70745 	    kmem_zalloc(sizeof (memscrub_page_retire_span_t), KM_NOSLEEP);
12302895Svb70745 
12312895Svb70745 	if (new_span == NULL) {
12322895Svb70745 #ifdef MEMSCRUB_DEBUG
12332895Svb70745 		cmn_err(CE_NOTE, "failed to allocate new span - span with"
12342895Svb70745 		    " retired page/s not tracked.\n");
12352895Svb70745 #endif /* MEMSCRUB_DEBUG */
12362895Svb70745 		return;
12372895Svb70745 	}
12382895Svb70745 
12392895Svb70745 	new_span->address = pa;
12402895Svb70745 	new_span->next = memscrub_page_retire_span_list;
12412895Svb70745 	memscrub_page_retire_span_list = new_span;
12422895Svb70745 }
12432895Svb70745 
12442895Svb70745 /*
12452895Svb70745  * Called by memscrub_scan().
12462895Svb70745  * pa: physical address of span to be removed from global list.
12472895Svb70745  */
12482895Svb70745 static void
12492895Svb70745 memscrub_page_retire_span_delete(ms_paddr_t pa)
12502895Svb70745 {
12512895Svb70745 	memscrub_page_retire_span_t *prev_span, *next_span;
12522895Svb70745 
12532895Svb70745 	prev_span = memscrub_page_retire_span_list;
12542895Svb70745 	next_span = memscrub_page_retire_span_list->next;
12552895Svb70745 
12562895Svb70745 	if (pa == prev_span->address) {
12572895Svb70745 		memscrub_page_retire_span_list = next_span;
12582895Svb70745 		kmem_free(prev_span, sizeof (memscrub_page_retire_span_t));
12592895Svb70745 		return;
12602895Svb70745 	}
12612895Svb70745 
12622895Svb70745 	while (next_span) {
12632895Svb70745 		if (pa == next_span->address) {
12642895Svb70745 			prev_span->next = next_span->next;
12652895Svb70745 			kmem_free(next_span,
12662895Svb70745 			    sizeof (memscrub_page_retire_span_t));
12672895Svb70745 			return;
12682895Svb70745 		}
12692895Svb70745 		prev_span = next_span;
12702895Svb70745 		next_span = next_span->next;
12712895Svb70745 	}
12722895Svb70745 }
12732895Svb70745 
12742895Svb70745 /*
12752895Svb70745  * Called by memscrub_scan().
12762895Svb70745  * pa: physical address of span to be searched in global list.
12772895Svb70745  */
12782895Svb70745 static int
12792895Svb70745 memscrub_page_retire_span_search(ms_paddr_t pa)
12802895Svb70745 {
12812895Svb70745 	memscrub_page_retire_span_t *next_span = memscrub_page_retire_span_list;
12822895Svb70745 
12832895Svb70745 	while (next_span) {
12842895Svb70745 		if (pa == next_span->address)
12852895Svb70745 			return (1);
12862895Svb70745 		next_span = next_span->next;
12872895Svb70745 	}
12882895Svb70745 	return (0);
12892895Svb70745 }
12902895Svb70745 
12912895Svb70745 /*
12922895Svb70745  * Called from new_memscrub() as a result of memory delete.
12932895Svb70745  * Using page_numtopp_nolock() to determine if we have valid PA.
12942895Svb70745  */
12952895Svb70745 static void
12962895Svb70745 memscrub_page_retire_span_list_update(void)
12972895Svb70745 {
12982895Svb70745 	memscrub_page_retire_span_t *prev, *cur, *next;
12992895Svb70745 
13002895Svb70745 	if (memscrub_page_retire_span_list == NULL)
13012895Svb70745 		return;
13022895Svb70745 
13032895Svb70745 	prev = cur = memscrub_page_retire_span_list;
13042895Svb70745 	next = cur->next;
13052895Svb70745 
13062895Svb70745 	while (cur) {
13072895Svb70745 		if (page_numtopp_nolock(mmu_btop(cur->address)) == NULL) {
13082895Svb70745 			if (cur == memscrub_page_retire_span_list) {
13092895Svb70745 				memscrub_page_retire_span_list = next;
13102895Svb70745 				kmem_free(cur,
13112895Svb70745 				    sizeof (memscrub_page_retire_span_t));
13122895Svb70745 				prev = cur = memscrub_page_retire_span_list;
13132895Svb70745 			} else {
13142895Svb70745 				prev->next = cur->next;
13152895Svb70745 				kmem_free(cur,
13162895Svb70745 				    sizeof (memscrub_page_retire_span_t));
13172895Svb70745 				cur = next;
13182895Svb70745 			}
13192895Svb70745 		} else {
13202895Svb70745 			prev = cur;
13212895Svb70745 			cur = next;
13222895Svb70745 		}
13232895Svb70745 		if (cur != NULL)
13242895Svb70745 			next = cur->next;
13252895Svb70745 	}
13262895Svb70745 }
13272895Svb70745 
13282895Svb70745 /*
13290Sstevel@tonic-gate  * The memory add/delete callback mechanism does not pass in the
13300Sstevel@tonic-gate  * page ranges. The phys_install list has been updated though, so
13310Sstevel@tonic-gate  * create a new scrub list from it.
13320Sstevel@tonic-gate  */
13330Sstevel@tonic-gate 
13340Sstevel@tonic-gate static int
13352895Svb70745 new_memscrub(int update_page_retire_list)
13360Sstevel@tonic-gate {
13370Sstevel@tonic-gate 	struct memlist *src, *list, *old_list;
13380Sstevel@tonic-gate 	uint_t npgs;
13390Sstevel@tonic-gate 
13400Sstevel@tonic-gate 	/*
13410Sstevel@tonic-gate 	 * copy phys_install to memscrub_memlist
13420Sstevel@tonic-gate 	 */
13430Sstevel@tonic-gate 	list = NULL;
13440Sstevel@tonic-gate 	npgs = 0;
13450Sstevel@tonic-gate 	memlist_read_lock();
1346*11474SJonathan.Adams@Sun.COM 	for (src = phys_install; src; src = src->ml_next) {
1347*11474SJonathan.Adams@Sun.COM 		if (memscrub_add_span_gen((pfn_t)(src->ml_address >> PAGESHIFT),
1348*11474SJonathan.Adams@Sun.COM 		    (pgcnt_t)(src->ml_size >> PAGESHIFT), &list, &npgs)) {
13490Sstevel@tonic-gate 			memlist_read_unlock();
13500Sstevel@tonic-gate 			while (list) {
13510Sstevel@tonic-gate 				struct memlist *el;
13520Sstevel@tonic-gate 
13530Sstevel@tonic-gate 				el = list;
1354*11474SJonathan.Adams@Sun.COM 				list = list->ml_next;
13550Sstevel@tonic-gate 				kmem_free(el, sizeof (struct memlist));
13560Sstevel@tonic-gate 			}
13570Sstevel@tonic-gate 			return (-1);
13580Sstevel@tonic-gate 		}
13590Sstevel@tonic-gate 	}
13600Sstevel@tonic-gate 	memlist_read_unlock();
13610Sstevel@tonic-gate 
13620Sstevel@tonic-gate 	mutex_enter(&memscrub_lock);
13630Sstevel@tonic-gate 	memscrub_phys_pages = npgs;
13640Sstevel@tonic-gate 	old_list = memscrub_memlist;
13650Sstevel@tonic-gate 	memscrub_memlist = list;
13662895Svb70745 
13672895Svb70745 	if (update_page_retire_list)
13682895Svb70745 		memscrub_page_retire_span_list_update();
13692895Svb70745 
13700Sstevel@tonic-gate 	mutex_exit(&memscrub_lock);
13710Sstevel@tonic-gate 
13720Sstevel@tonic-gate 	while (old_list) {
13730Sstevel@tonic-gate 		struct memlist *el;
13740Sstevel@tonic-gate 
13750Sstevel@tonic-gate 		el = old_list;
1376*11474SJonathan.Adams@Sun.COM 		old_list = old_list->ml_next;
13770Sstevel@tonic-gate 		kmem_free(el, sizeof (struct memlist));
13780Sstevel@tonic-gate 	}
13792895Svb70745 
13800Sstevel@tonic-gate 	return (0);
13810Sstevel@tonic-gate }
13820Sstevel@tonic-gate 
13830Sstevel@tonic-gate /*ARGSUSED*/
13840Sstevel@tonic-gate static void
13850Sstevel@tonic-gate memscrub_mem_config_post_add(
13860Sstevel@tonic-gate 	void *arg,
13870Sstevel@tonic-gate 	pgcnt_t delta_pages)
13880Sstevel@tonic-gate {
13890Sstevel@tonic-gate 	/*
13900Sstevel@tonic-gate 	 * We increment pause_memscrub before entering new_memscrub(). This
13910Sstevel@tonic-gate 	 * will force the memscrubber to sleep, allowing the DR callback
13920Sstevel@tonic-gate 	 * thread to acquire memscrub_lock in new_memscrub(). The use of
13930Sstevel@tonic-gate 	 * atomic_add_32() allows concurrent memory DR operations to use the
13940Sstevel@tonic-gate 	 * callbacks safely.
13950Sstevel@tonic-gate 	 */
13960Sstevel@tonic-gate 	atomic_add_32(&pause_memscrub, 1);
13970Sstevel@tonic-gate 	ASSERT(pause_memscrub != 0);
13980Sstevel@tonic-gate 
13990Sstevel@tonic-gate 	/*
14000Sstevel@tonic-gate 	 * "Don't care" if we are not scrubbing new memory.
14010Sstevel@tonic-gate 	 */
14022895Svb70745 	(void) new_memscrub(0);		/* retain page retire list */
14030Sstevel@tonic-gate 
14040Sstevel@tonic-gate 	/* Restore the pause setting. */
14050Sstevel@tonic-gate 	atomic_add_32(&pause_memscrub, -1);
14060Sstevel@tonic-gate }
14070Sstevel@tonic-gate 
14080Sstevel@tonic-gate /*ARGSUSED*/
14090Sstevel@tonic-gate static int
14100Sstevel@tonic-gate memscrub_mem_config_pre_del(
14110Sstevel@tonic-gate 	void *arg,
14120Sstevel@tonic-gate 	pgcnt_t delta_pages)
14130Sstevel@tonic-gate {
14140Sstevel@tonic-gate 	/* Nothing to do. */
14150Sstevel@tonic-gate 	return (0);
14160Sstevel@tonic-gate }
14170Sstevel@tonic-gate 
14180Sstevel@tonic-gate /*ARGSUSED*/
14190Sstevel@tonic-gate static void
14200Sstevel@tonic-gate memscrub_mem_config_post_del(
14210Sstevel@tonic-gate 	void *arg,
14220Sstevel@tonic-gate 	pgcnt_t delta_pages,
14230Sstevel@tonic-gate 	int cancelled)
14240Sstevel@tonic-gate {
14250Sstevel@tonic-gate 	/*
14260Sstevel@tonic-gate 	 * We increment pause_memscrub before entering new_memscrub(). This
14270Sstevel@tonic-gate 	 * will force the memscrubber to sleep, allowing the DR callback
14280Sstevel@tonic-gate 	 * thread to acquire memscrub_lock in new_memscrub(). The use of
14290Sstevel@tonic-gate 	 * atomic_add_32() allows concurrent memory DR operations to use the
14300Sstevel@tonic-gate 	 * callbacks safely.
14310Sstevel@tonic-gate 	 */
14320Sstevel@tonic-gate 	atomic_add_32(&pause_memscrub, 1);
14330Sstevel@tonic-gate 	ASSERT(pause_memscrub != 0);
14340Sstevel@tonic-gate 
14350Sstevel@tonic-gate 	/*
14360Sstevel@tonic-gate 	 * Must stop scrubbing deleted memory as it may be disconnected.
14370Sstevel@tonic-gate 	 */
14382895Svb70745 	if (new_memscrub(1)) {	/* update page retire list */
14390Sstevel@tonic-gate 		disable_memscrub = 1;
14400Sstevel@tonic-gate 	}
14410Sstevel@tonic-gate 
14420Sstevel@tonic-gate 	/* Restore the pause setting. */
14430Sstevel@tonic-gate 	atomic_add_32(&pause_memscrub, -1);
14440Sstevel@tonic-gate }
14450Sstevel@tonic-gate 
14460Sstevel@tonic-gate static kphysm_setup_vector_t memscrub_mem_config_vec = {
14470Sstevel@tonic-gate 	KPHYSM_SETUP_VECTOR_VERSION,
14480Sstevel@tonic-gate 	memscrub_mem_config_post_add,
14490Sstevel@tonic-gate 	memscrub_mem_config_pre_del,
14500Sstevel@tonic-gate 	memscrub_mem_config_post_del,
14510Sstevel@tonic-gate };
14520Sstevel@tonic-gate 
14530Sstevel@tonic-gate static void
14540Sstevel@tonic-gate memscrub_init_mem_config()
14550Sstevel@tonic-gate {
14560Sstevel@tonic-gate 	int ret;
14570Sstevel@tonic-gate 
14580Sstevel@tonic-gate 	ret = kphysm_setup_func_register(&memscrub_mem_config_vec,
14590Sstevel@tonic-gate 	    (void *)NULL);
14600Sstevel@tonic-gate 	ASSERT(ret == 0);
14610Sstevel@tonic-gate }
14620Sstevel@tonic-gate 
14630Sstevel@tonic-gate static void
14640Sstevel@tonic-gate memscrub_uninit_mem_config()
14650Sstevel@tonic-gate {
14660Sstevel@tonic-gate 	/* This call is OK if the register call was not done. */
14670Sstevel@tonic-gate 	kphysm_setup_func_unregister(&memscrub_mem_config_vec, (void *)NULL);
14680Sstevel@tonic-gate }
1469