/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Page Retire - Big Theory Statement.
 *
 * This file handles removing sections of faulty memory from use when the
 * user land FMA Diagnosis Engine requests that a page be removed or when
 * a CE or UE is detected by the hardware.
 *
 * In the bad old days, the kernel side of Page Retire did a lot of the work
 * on its own. Now, with the DE keeping track of errors, the kernel side is
 * rather simple minded on most platforms.
 *
 * Errors are all reflected to the DE, and after digesting the error and
 * looking at all previously reported errors, the DE decides what should
 * be done about the current error. If the DE wants a particular page to
 * be retired, then the kernel page retire code is invoked via an ioctl.
 * On non-FMA platforms, the ue_drain and ce_drain paths end up calling
 * page retire to handle the error. Since page retire is just a simple
 * mechanism, it doesn't need to differentiate between the different callers.
 *
 * The p_toxic field in the page_t is used to indicate which errors have
 * occurred and what action has been taken on a given page. Because errors are
 * reported without regard to the locked state of a page, no locks are used
 * to SET the error bits in p_toxic. However, in order to clear the error
 * bits, the page_t must be held exclusively locked.
 *
 * When page_retire() is called, it must be able to acquire locks, sleep, etc.
 * It must not be called from high-level interrupt context.
 *
 * Depending on how the requested page is being used at the time of the retire
 * request (and on the availability of sufficient system resources), the page
 * may be retired immediately, or just marked for retirement later. For
 * example, locked pages are marked, while free pages are retired. Multiple
 * requests may be made to retire the same page, although there is no need
 * to: once the p_toxic flags are set, the page will be retired as soon as it
 * can be exclusively locked.
 *
 * The retire mechanism is driven centrally out of page_unlock(). To expedite
 * the retirement of pages, further requests for SE_SHARED locks are denied
 * as long as a page retirement is pending. In addition, as long as pages are
 * pending retirement a background thread runs periodically trying to retire
 * those pages. Pages which could not be retired while the system is running
 * are scrubbed prior to rebooting to avoid latent errors on the next boot.
 *
 * Single CE pages and UE pages without persistent errors are scrubbed and
 * returned to service. Recidivist pages, as well as FMA-directed requests
 * for retirement, result in the page being taken out of service. Once the
 * decision is made to take a page out of service, the page is cleared, hashed
 * onto the retired_pages vnode, marked as retired, and it is unlocked.  No
 * other requesters (except for unretire) are allowed to lock retired pages.
 *
 * The public routines return (sadly) 0 if they worked and a non-zero error
 * value if something went wrong. This is done for the ioctl side of the
 * world to allow errors to be reflected all the way out to user land. The
 * non-zero values are explained in comments atop each function.
 */
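
/*
 * A minimal sketch of the p_toxic discipline described above; the
 * calling context is hypothetical, but page_settoxic(), page_clrtoxic()
 * (both defined later in this file) and the locking rules are real:
 *
 *	page_settoxic(pp, PR_UE);	/ * no lock needed - atomic OR * /
 *
 *	if (page_trylock(pp, SE_EXCL)) {
 *		/ * clearing error bits requires the exclusive lock * /
 *		page_clrtoxic(pp, PR_UE);
 *		page_unlock(pp);
 *	}
 */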

/*
 * Things to fix:
 *
 *	1. Clean up SE_EWANTED.  Since we're aggressive about trying to retire
 *	pages, we can use page_retire_pp() to replace SE_EWANTED and all
 *	the special delete_memory_thread() code just goes away.
 *
 *	2. Trying to retire non-relocatable kvp pages may result in a
 *	quagmire. This is because seg_kmem() no longer keeps its pages locked,
 *	and calls page_lookup() in the free path; since kvp pages are modified
 *	and don't have a usable backing store, page_retire() can't do anything
 *	with them, and we'll keep denying the lock to seg_kmem_free() in a
 *	vicious cycle. To prevent that, we don't deny locks to kvp pages, and
 *	hence only call page_retire_pp() from page_unlock() in the free path.
 *	Since most kernel pages are indefinitely held anyway, and don't
 *	participate in I/O, this is of little consequence.
 *
 *	3. Low memory situations will be interesting. If we don't have
 *	enough memory for page_relocate() to succeed, we won't be able to
 *	retire dirty pages; nobody will be able to push them out to disk
 *	either, since we aggressively deny the page lock. We could change
 *	fsflush so it can recognize this situation, grab the lock, and push
 *	the page out, where we'll catch it in the free path and retire it.
 *
 *	4. Beware of places that have code like this in them:
 *
 *		if (! page_tryupgrade(pp)) {
 *			page_unlock(pp);
 *			while (! page_lock(pp, SE_EXCL, NULL, P_RECLAIM)) {
 *				/ *NOTHING* /
 *			}
 *		}
 *		page_free(pp);
 *
 *	The problem is that pp can change identity right after the
 *	page_unlock() call.  In particular, page_retire() can step in
 *	there, change pp's identity, and hash pp onto the retired_vnode.
 *
 *	Of course, other functions besides page_retire() can have the
 *	same effect. A kmem reader can waltz by, set up a mapping to the
 *	page, and then unlock the page. Page_free() will then go castors
 *	up. So if anybody is doing this, it's already a bug.
 *
 *	5. mdboot()'s call into page_retire_hunt() should probably be
 *	moved lower. Where the call is made now, we can get into trouble
 *	by scrubbing a kernel page that is then accessed later.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cmn_err.h>
#include <sys/ksynch.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/ontrap.h>
#include <sys/vmsystm.h>
#include <sys/mem_config.h>
#include <sys/atomic.h>
#include <sys/callb.h>
#include <vm/page.h>
#include <vm/vm_dep.h>
#include <vm/as.h>
#include <vm/hat.h>

/*
 * vnode for all pages which are retired from the VM system.
 */
vnode_t *retired_pages;

/*
 * Background thread that wakes up periodically to try to retire pending
 * pages. This prevents threads from becoming blocked indefinitely in
 * page_lookup() or some other routine should the page(s) they are waiting
 * on become eligible for social security.
 */
static void page_retire_thread(void);
static kthread_t *pr_thread_id;
static kcondvar_t pr_cv;
static kmutex_t pr_thread_mutex;
static clock_t pr_thread_shortwait;
static clock_t pr_thread_longwait;

/*
 * Make a list of all of the pages that have been marked for retirement
 * but are not yet retired.  At system shutdown, we will scrub all of the
 * pages in the list in case there are outstanding UEs.  Then, we
 * cross-check this list against the number of pages that are yet to be
 * retired, and if we find inconsistencies, we scan every page_t in the
 * whole system looking for any pages that need to be scrubbed for UEs.
 * The background thread also uses this queue to determine which pages
 * it should keep trying to retire.
 */
#ifdef	DEBUG
#define	PR_PENDING_QMAX	32
#else	/* DEBUG */
#define	PR_PENDING_QMAX	256
#endif	/* DEBUG */
page_t		*pr_pending_q[PR_PENDING_QMAX];
kmutex_t	pr_q_mutex;

/*
 * Page retire global kstats
 */
struct page_retire_kstat {
	kstat_named_t	pr_retired;
	kstat_named_t	pr_requested;
	kstat_named_t	pr_requested_free;
	kstat_named_t	pr_enqueue_fail;
	kstat_named_t	pr_dequeue_fail;
	kstat_named_t	pr_pending;
	kstat_named_t	pr_failed;
	kstat_named_t	pr_failed_kernel;
	kstat_named_t	pr_limit;
	kstat_named_t	pr_limit_exceeded;
	kstat_named_t	pr_fma;
	kstat_named_t	pr_mce;
	kstat_named_t	pr_ue;
	kstat_named_t	pr_ue_cleared_retire;
	kstat_named_t	pr_ue_cleared_free;
	kstat_named_t	pr_ue_persistent;
	kstat_named_t	pr_unretired;
};

static struct page_retire_kstat page_retire_kstat = {
	{ "pages_retired",		KSTAT_DATA_UINT64},
	{ "pages_retire_request",	KSTAT_DATA_UINT64},
	{ "pages_retire_request_free",	KSTAT_DATA_UINT64},
	{ "pages_notenqueued",		KSTAT_DATA_UINT64},
	{ "pages_notdequeued",		KSTAT_DATA_UINT64},
	{ "pages_pending",		KSTAT_DATA_UINT64},
	{ "pages_deferred",		KSTAT_DATA_UINT64},
	{ "pages_deferred_kernel",	KSTAT_DATA_UINT64},
	{ "pages_limit",		KSTAT_DATA_UINT64},
	{ "pages_limit_exceeded",	KSTAT_DATA_UINT64},
	{ "pages_fma",			KSTAT_DATA_UINT64},
	{ "pages_multiple_ce",		KSTAT_DATA_UINT64},
	{ "pages_ue",			KSTAT_DATA_UINT64},
	{ "pages_ue_cleared_retired",	KSTAT_DATA_UINT64},
	{ "pages_ue_cleared_freed",	KSTAT_DATA_UINT64},
	{ "pages_ue_persistent",	KSTAT_DATA_UINT64},
	{ "pages_unretired",		KSTAT_DATA_UINT64},
};

static kstat_t  *page_retire_ksp = NULL;

#define	PR_INCR_KSTAT(stat)	\
	atomic_add_64(&(page_retire_kstat.stat.value.ui64), 1)
#define	PR_DECR_KSTAT(stat)	\
	atomic_add_64(&(page_retire_kstat.stat.value.ui64), -1)

#define	PR_KSTAT_RETIRED_CE	(page_retire_kstat.pr_mce.value.ui64)
#define	PR_KSTAT_RETIRED_FMA	(page_retire_kstat.pr_fma.value.ui64)
#define	PR_KSTAT_RETIRED_NOTUE	(PR_KSTAT_RETIRED_CE + PR_KSTAT_RETIRED_FMA)
#define	PR_KSTAT_PENDING	(page_retire_kstat.pr_pending.value.ui64)
#define	PR_KSTAT_EQFAIL		(page_retire_kstat.pr_enqueue_fail.value.ui64)
#define	PR_KSTAT_DQFAIL		(page_retire_kstat.pr_dequeue_fail.value.ui64)

/*
 * Limit the number of multiple CE page retires.
 * The default is 0.1% of physmem, or 1 in 1000 pages. This is set in
 * basis points, where 100 basis points equals one percent.
 */
#define	MCE_BPT	10
uint64_t	max_pages_retired_bps = MCE_BPT;
#define	PAGE_RETIRE_LIMIT	((physmem * max_pages_retired_bps) / 10000)
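
/*
 * Worked example, with illustrative numbers: on a machine with 16 GB of
 * 8 KB pages, physmem is 2097152 pages, so the default of 10 bps yields
 * PAGE_RETIRE_LIMIT = (2097152 * 10) / 10000 = 2097 pages -- one page
 * in a thousand.
 */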

/*
 * Control over the verbosity of page retirement.
 *
 * When set to zero (the default), no messages will be printed.
 * When set to one, summary messages will be printed.
 * When set > one, all messages will be printed.
 *
 * A value of one will trigger detailed messages for retirement operations,
 * and is intended as a platform tunable for processors where FMA's DE does
 * not run (e.g., spitfire). Values > one are intended for debugging only.
 */
int page_retire_messages = 0;

/*
 * Control whether or not we retire dirty UE pages. By default we do
 * since we assume the data is corrupt and the process(es) using it will
 * be killed. This is platform tunable only, and should probably not be
 * changed, ever.
 */
int page_retire_modified = 1;

/*
 * Control whether or not we return scrubbed UE pages to service.
 * By default we do not since FMA wants to run its diagnostics first
 * and then ask us to unretire the page if it passes. Non-FMA platforms
 * may set this to zero so we will only retire recidivist pages. It should
 * not be changed by the user.
 */
int page_retire_first_ue = 1;

/*
 * Master enable for page retire. This prevents a CE or UE early in boot
 * from trying to retire a page before page_retire_init() has finished
 * setting things up. This is internal only and is not a tunable!
 */
static int pr_enable = 0;

extern struct vnode kvp;

#ifdef	DEBUG
struct page_retire_debug {
	int prd_dup;
	int prd_noaction;
	int prd_queued;
	int prd_notqueued;
	int prd_dequeue;
	int prd_top;
	int prd_locked;
	int prd_reloc;
	int prd_modce;
	int prd_modue_fail;
	int prd_modue_retire;
	int prd_kern;
	int prd_free;
	int prd_noreclaim;
	int prd_hashout;
	int prd_fma;
	int prd_uescrubbed;
	int prd_uenotscrubbed;
	int prd_mce;
	int prd_prlocked;
	int prd_prnotlocked;
	int prd_prretired;
	int prd_ulocked;
	int prd_unotretired;
	int prd_udestroy;
	int prd_uhashout;
	int prd_uunretired;
	int prd_unotlocked;
	int prd_checkhit;
	int prd_checkmiss;
	int prd_tctop;
	int prd_tclocked;
	int prd_hunt;
	int prd_dohunt;
	int prd_earlyhunt;
	int prd_latehunt;
	int prd_nofreedemote;
	int prd_nodemote;
	int prd_demoted;
} pr_debug;

#define	PR_DEBUG(foo)	((pr_debug.foo)++)

/*
 * A type histogram. We record the incidence of the various toxic
 * flag combinations along with the interesting page attributes. The
 * goal is to get as many combinations as we can while driving all
 * pr_debug values nonzero (indicating we've exercised all possible
 * code paths across all possible page types). Not all combinations
 * will make sense -- e.g. PRT_MOD|PRT_KERNEL.
 *
 * pr_type offset bit encoding (when examining with a debugger):
 *
 *    PRT_NAMED  - 0x4
 *    PRT_KERNEL - 0x8
 *    PRT_FREE   - 0x10
 *    PRT_MOD    - 0x20
 *    PRT_FMA    - 0x0
 *    PRT_MCE    - 0x40
 *    PRT_UE     - 0x80
 */

#define	PRT_NAMED	0x01
#define	PRT_KERNEL	0x02
#define	PRT_FREE	0x04
#define	PRT_MOD		0x08
#define	PRT_FMA		0x00	/* yes, this is not a mistake */
#define	PRT_MCE		0x10
#define	PRT_UE		0x20
#define	PRT_ALL		0x3F

int pr_types[PRT_ALL+1];

#define	PR_TYPES(pp)	{			\
	int whichtype = 0;			\
	if (pp->p_vnode)			\
		whichtype |= PRT_NAMED;		\
	if (pp->p_vnode == &kvp)		\
		whichtype |= PRT_KERNEL;	\
	if (PP_ISFREE(pp))			\
		whichtype |= PRT_FREE;		\
	if (hat_ismod(pp))			\
		whichtype |= PRT_MOD;		\
	if (pp->p_toxic & PR_UE)		\
		whichtype |= PRT_UE;		\
	if (pp->p_toxic & PR_MCE)		\
		whichtype |= PRT_MCE;		\
	pr_types[whichtype]++;			\
}

int recl_calls;
int recl_mtbf = 3;
int reloc_calls;
int reloc_mtbf = 7;
int pr_calls;
int pr_mtbf = 15;

#define	MTBF(v, f)	(((++(v)) & (f)) != (f))
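
/*
 * MTBF() is DEBUG-only fault injection. With f = 2^n - 1, it evaluates
 * to false once every 2^n calls, sending the caller down its failure
 * path. For example, with pr_mtbf = 15, every 16th call to
 * MTBF(pr_calls, pr_mtbf) returns 0, so DEBUG kernels deliberately skip
 * roughly one in 16 trylock attempts in page_retire() and exercise the
 * deferred-retire path instead.
 */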

#else	/* DEBUG */

#define	PR_DEBUG(foo)	/* nothing */
#define	PR_TYPES(foo)	/* nothing */
#define	MTBF(v, f)	(1)

#endif	/* DEBUG */

/*
 * page_retire_done() - completion processing
 *
 * Used by the page_retire code for common completion processing.
 * It keeps track of how many times a given result has happened,
 * and writes out an occasional message.
 *
 * May be called with a NULL pp (PRD_INVALID_PA case).
 */
#define	PRD_INVALID_KEY		-1
#define	PRD_SUCCESS		0
#define	PRD_PENDING		1
#define	PRD_FAILED		2
#define	PRD_DUPLICATE		3
#define	PRD_INVALID_PA		4
#define	PRD_LIMIT		5
#define	PRD_UE_SCRUBBED		6
#define	PRD_UNR_SUCCESS		7
#define	PRD_UNR_CANTLOCK	8
#define	PRD_UNR_NOT		9

typedef struct page_retire_op {
	int	pr_key;		/* one of the PRD_* defines from above */
	int	pr_count;	/* How many times this has happened */
	int	pr_retval;	/* return value */
	int	pr_msglvl;	/* message level - when to print */
	char	*pr_message;	/* Cryptic message for field service */
} page_retire_op_t;

static page_retire_op_t page_retire_ops[] = {
	/* key			count	retval	msglvl	message */
	{PRD_SUCCESS,		0,	0,	1,
		"Page 0x%08x.%08x removed from service"},
	{PRD_PENDING,		0,	EAGAIN,	2,
		"Page 0x%08x.%08x will be retired on free"},
	{PRD_FAILED,		0,	EAGAIN,	0, NULL},
	{PRD_DUPLICATE,		0,	EBUSY,	2,
		"Page 0x%08x.%08x already retired"},
	{PRD_INVALID_PA,	0,	EINVAL, 2,
		"PA 0x%08x.%08x is not a relocatable page"},
	{PRD_LIMIT,		0,	0,	1,
		"Page 0x%08x.%08x not retired due to limit exceeded"},
	{PRD_UE_SCRUBBED,	0,	0,	1,
		"Previously reported error on page 0x%08x.%08x cleared"},
	{PRD_UNR_SUCCESS,	0,	0,	1,
		"Page 0x%08x.%08x returned to service"},
	{PRD_UNR_CANTLOCK,	0,	EAGAIN,	2,
		"Page 0x%08x.%08x could not be unretired"},
	{PRD_UNR_NOT,		0,	EBADF,	2,
		"Page 0x%08x.%08x is not retired"},
	{PRD_INVALID_KEY,	0,	0,	0, NULL} /* MUST BE LAST! */
};

/*
 * print a message if page_retire_messages is true.
 */
#define	PR_MESSAGE(debuglvl, msglvl, msg, pa)				\
{									\
	uint64_t p = (uint64_t)pa;					\
	if (page_retire_messages >= msglvl && msg != NULL) {		\
		cmn_err(debuglvl, msg,					\
		    (uint32_t)(p >> 32), (uint32_t)p);			\
	}								\
}

/*
 * Note that multiple bits may be set in a single settoxic operation.
 * May be called without the page locked.
 */
void
page_settoxic(page_t *pp, uchar_t bits)
{
	atomic_or_8(&pp->p_toxic, bits);
}

/*
 * Note that multiple bits may be cleared in a single clrtoxic operation.
 * Must be called with the page exclusively locked.
 */
void
page_clrtoxic(page_t *pp, uchar_t bits)
{
	ASSERT(PAGE_EXCL(pp));
	atomic_and_8(&pp->p_toxic, ~bits);
}

/*
 * Prints any page retire messages to the user, and decides what
 * error code is appropriate for the condition reported.
 */
static int
page_retire_done(page_t *pp, int code)
{
	page_retire_op_t *prop;
	uint64_t	pa = 0;
	int		i;

	if (pp != NULL) {
		pa = mmu_ptob(pp->p_pagenum);
	}

	prop = NULL;
	for (i = 0; page_retire_ops[i].pr_key != PRD_INVALID_KEY; i++) {
		if (page_retire_ops[i].pr_key == code) {
			prop = &page_retire_ops[i];
			break;
		}
	}

#ifdef	DEBUG
	if (page_retire_ops[i].pr_key == PRD_INVALID_KEY) {
		cmn_err(CE_PANIC, "page_retire_done: Invalid opcode %d", code);
	}
#endif

	ASSERT(prop->pr_key == code);

	prop->pr_count++;

	PR_MESSAGE(CE_NOTE, prop->pr_msglvl, prop->pr_message, pa);
	if (pp != NULL) {
		page_settoxic(pp, PR_MSG);
	}

	return (prop->pr_retval);
}

/*
 * On a reboot, our friend mdboot() wants to clear up any PP_PR_REQ() pages
 * that we were not able to retire. On large machines, walking the complete
 * page_t array and looking at every page_t takes too long. So, as a page is
 * marked toxic, we track it using a list that can be processed at reboot
 * time.  page_retire_enqueue() does its best to avoid duplicate entries,
 * but if we get too many errors at once the queue can overflow, in which
 * case we will end up walking every page_t as a last resort.
 * The background thread also makes use of this queue to find which pages
 * are pending retirement.
 */
static void
page_retire_enqueue(page_t *pp)
{
	int	nslot = -1;
	int	i;

	mutex_enter(&pr_q_mutex);

	/*
	 * Check to make sure retire hasn't already dequeued it.
	 * In the meantime if the page was cleaned up, no need
	 * to enqueue it.
	 */
	if (PP_RETIRED(pp) || pp->p_toxic == 0) {
		mutex_exit(&pr_q_mutex);
		PR_DEBUG(prd_noaction);
		return;
	}

	for (i = 0; i < PR_PENDING_QMAX; i++) {
		if (pr_pending_q[i] == pp) {
			mutex_exit(&pr_q_mutex);
			PR_DEBUG(prd_dup);
			return;
		} else if (nslot == -1 && pr_pending_q[i] == NULL) {
			nslot = i;
		}
	}

	PR_INCR_KSTAT(pr_pending);

	if (nslot != -1) {
		pr_pending_q[nslot] = pp;
		PR_DEBUG(prd_queued);
	} else {
		PR_INCR_KSTAT(pr_enqueue_fail);
		PR_DEBUG(prd_notqueued);
	}
	mutex_exit(&pr_q_mutex);
}

static void
page_retire_dequeue(page_t *pp)
{
	int i;

	mutex_enter(&pr_q_mutex);

	for (i = 0; i < PR_PENDING_QMAX; i++) {
		if (pr_pending_q[i] == pp) {
			pr_pending_q[i] = NULL;
			break;
		}
	}

	if (i == PR_PENDING_QMAX) {
		PR_INCR_KSTAT(pr_dequeue_fail);
	}

	PR_DECR_KSTAT(pr_pending);
	PR_DEBUG(prd_dequeue);

	mutex_exit(&pr_q_mutex);
}

/*
 * Act like page_destroy(), but instead of freeing the page, hash it onto
 * the retired_pages vnode, and mark it retired.
 *
 * For fun, we try to scrub the page until it's squeaky clean.
 * availrmem is adjusted here.
 */
static void
page_retire_destroy(page_t *pp)
{
	ASSERT(PAGE_EXCL(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_szc == 0);
	ASSERT(!hat_page_is_mapped(pp));
	ASSERT(!pp->p_vnode);

	page_clr_all_props(pp);
	pagescrub(pp, 0, MMU_PAGESIZE);

	pp->p_next = NULL;
	pp->p_prev = NULL;
	if (page_hashin(pp, retired_pages, (u_offset_t)pp, NULL) == 0) {
		cmn_err(CE_PANIC, "retired page %p hashin failed", (void *)pp);
	}

	page_settoxic(pp, PR_RETIRED);
	page_clrtoxic(pp, PR_BUSY);
	page_retire_dequeue(pp);
	PR_INCR_KSTAT(pr_retired);

	if (pp->p_toxic & PR_FMA) {
		PR_INCR_KSTAT(pr_fma);
	} else if (pp->p_toxic & PR_UE) {
		PR_INCR_KSTAT(pr_ue);
	} else {
		PR_INCR_KSTAT(pr_mce);
	}

	mutex_enter(&freemem_lock);
	availrmem--;
	mutex_exit(&freemem_lock);

	page_unlock(pp);
}

/*
 * Check whether the number of pages which have been retired already exceeds
 * the maximum allowable percentage of memory which may be retired.
 *
 * Returns 1 if the limit has been exceeded.
 */
static int
page_retire_limit(void)
{
	if (PR_KSTAT_RETIRED_NOTUE >= (uint64_t)PAGE_RETIRE_LIMIT) {
		PR_INCR_KSTAT(pr_limit_exceeded);
		return (1);
	}

	return (0);
}

#define	MSG_DM	"Data Mismatch occurred at PA 0x%08x.%08x "		\
	"[ 0x%x != 0x%x ] while attempting to clear previously "	\
	"reported error; page removed from service"

#define	MSG_UE	"Uncorrectable Error occurred at PA 0x%08x.%08x while "	\
	"attempting to clear previously reported error; page removed "	\
	"from service"

/*
 * Attempt to clear a UE from a page.
 * Returns 1 if the error has been successfully cleared.
 */
static int
page_clear_transient_ue(page_t *pp)
{
	caddr_t		kaddr;
	uint8_t		rb, wb;
	uint64_t	pa;
	uint32_t	pa_hi, pa_lo;
	on_trap_data_t	otd;
	int		errors = 0;
	int		i;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_PR_REQ(pp));
	ASSERT(pp->p_szc == 0);
	ASSERT(!hat_page_is_mapped(pp));

	/*
	 * Clear the page and attempt to clear the UE.  If we trap
	 * on the next access to the page, we know the UE has recurred.
	 */
	pagescrub(pp, 0, PAGESIZE);

	/*
	 * Map the page and write a bunch of bit patterns to compare
	 * what we wrote with what we read back.  This isn't a perfect
	 * test but it should be good enough to catch most of the
	 * recurring UEs. If this fails to catch a recurrent UE, we'll
	 * retire the page the next time we see a UE on the page.
	 */
	kaddr = ppmapin(pp, PROT_READ|PROT_WRITE, (caddr_t)-1);

	pa = ptob((uint64_t)page_pptonum(pp));
	pa_hi = (uint32_t)(pa >> 32);
	pa_lo = (uint32_t)pa;

	/*
	 * Fill the page with each (0x00 - 0xFF] bit pattern, flushing
	 * the cache in between reading and writing.  We do this under
	 * on_trap() protection to avoid recursion.
	 */
	if (on_trap(&otd, OT_DATA_EC)) {
		PR_MESSAGE(CE_WARN, 1, MSG_UE, pa);
		errors = 1;
	} else {
		for (wb = 0xff; wb > 0; wb--) {
			for (i = 0; i < PAGESIZE; i++) {
				kaddr[i] = wb;
			}

			sync_data_memory(kaddr, PAGESIZE);

			for (i = 0; i < PAGESIZE; i++) {
				rb = kaddr[i];
				if (rb != wb) {
					/*
					 * We had a mismatch without a trap.
					 * Uh-oh. Something is really wrong
					 * with this system.
					 */
					if (page_retire_messages) {
						cmn_err(CE_WARN, MSG_DM,
						    pa_hi, pa_lo, rb, wb);
					}
					errors = 1;
					goto out;	/* double break */
				}
			}
		}
	}
out:
	no_trap();
	ppmapout(kaddr);

	return (errors ? 0 : 1);
}

/*
 * Try to clear a page_t with a single UE. If the UE was transient, it is
 * returned to service, and we return 1. Otherwise we return 0 meaning
 * that further processing is required to retire the page.
 */
static int
page_retire_transient_ue(page_t *pp)
{
	ASSERT(PAGE_EXCL(pp));
	ASSERT(!hat_page_is_mapped(pp));

	/*
	 * If this page is a repeat offender, retire him under the
	 * "two strikes and you're out" rule. The caller is responsible
	 * for scrubbing the page to try to clear the error.
	 */
	if (pp->p_toxic & PR_UE_SCRUBBED) {
		PR_INCR_KSTAT(pr_ue_persistent);
		return (0);
	}

	if (page_clear_transient_ue(pp)) {
		/*
		 * We set the PR_UE_SCRUBBED bit; if we ever see this
		 * page again, we will retire it, no questions asked.
		 */
		page_settoxic(pp, PR_UE_SCRUBBED);

		if (page_retire_first_ue) {
			PR_INCR_KSTAT(pr_ue_cleared_retire);
			return (0);
		} else {
			PR_INCR_KSTAT(pr_ue_cleared_free);

			page_clrtoxic(pp, PR_UE | PR_MCE | PR_MSG | PR_BUSY);
			page_retire_dequeue(pp);

			/*
			 * Clear the free bit if it's set, since the
			 * page free code will get cranky if we don't.
			 */
			PP_CLRFREE(pp);

			/* LINTED: CONSTCOND */
			VN_DISPOSE(pp, B_FREE, 1, kcred);
			return (1);
		}
	}

	PR_INCR_KSTAT(pr_ue_persistent);
	return (0);
}

/*
 * Update the statistics dynamically when our kstat is read.
 */
static int
page_retire_kstat_update(kstat_t *ksp, int rw)
{
	struct page_retire_kstat *pr;

	if (ksp == NULL)
		return (EINVAL);

	switch (rw) {

	case KSTAT_READ:
		pr = (struct page_retire_kstat *)ksp->ks_data;
		ASSERT(pr == &page_retire_kstat);
		pr->pr_limit.value.ui64 = PAGE_RETIRE_LIMIT;
		return (0);

	case KSTAT_WRITE:
		return (EACCES);

	default:
		return (EINVAL);
	}
	/*NOTREACHED*/
}

/*
 * Initialize the page retire mechanism:
 *
 *   - Establish the correctable error retire limit.
 *   - Initialize locks.
 *   - Build the retired_pages vnode.
 *   - Set up the kstats.
 *   - Fire off the background thread.
 *   - Tell page_tryretire() it's OK to start retiring pages.
 */
void
page_retire_init(void)
{
	const fs_operation_def_t retired_vnodeops_template[] = {NULL, NULL};
	struct vnodeops *vops;

	const uint_t page_retire_ndata =
	    sizeof (page_retire_kstat) / sizeof (kstat_named_t);

	ASSERT(page_retire_ksp == NULL);

	if (max_pages_retired_bps <= 0) {
		max_pages_retired_bps = MCE_BPT;
	}

	mutex_init(&pr_q_mutex, NULL, MUTEX_DEFAULT, NULL);

	retired_pages = vn_alloc(KM_SLEEP);
	if (vn_make_ops("retired_pages", retired_vnodeops_template, &vops)) {
		cmn_err(CE_PANIC,
		    "page_retire_init: can't make retired vnodeops");
	}
	vn_setops(retired_pages, vops);

	if ((page_retire_ksp = kstat_create("unix", 0, "page_retire",
	    "misc", KSTAT_TYPE_NAMED, page_retire_ndata,
	    KSTAT_FLAG_VIRTUAL)) == NULL) {
		cmn_err(CE_WARN, "kstat_create for page_retire failed");
	} else {
		page_retire_ksp->ks_data = (void *)&page_retire_kstat;
		page_retire_ksp->ks_update = page_retire_kstat_update;
		kstat_install(page_retire_ksp);
	}

	pr_thread_shortwait = 23 * hz;
	pr_thread_longwait = 1201 * hz;
	mutex_init(&pr_thread_mutex, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&pr_cv, NULL, CV_DEFAULT, NULL);
	pr_thread_id = thread_create(NULL, 0, page_retire_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);

	pr_enable = 1;
}

/*
 * page_retire_hunt() callback for the retire thread.
 */
static void
page_retire_thread_cb(page_t *pp)
{
	PR_DEBUG(prd_tctop);
	if (pp->p_vnode != &kvp && page_trylock(pp, SE_EXCL)) {
		PR_DEBUG(prd_tclocked);
		page_unlock(pp);
	}
}

/*
 * page_retire_hunt() callback for mdboot().
 *
 * It is necessary to scrub any failing pages prior to reboot in order to
 * prevent a latent error trap from occurring on the next boot.
 */
void
page_retire_mdboot_cb(page_t *pp)
{
	/*
	 * Don't scrub the kernel, since we might still need it, unless
	 * we have UEs on the page, in which case we have nothing to lose.
	 */
	if (pp->p_vnode != &kvp || PP_TOXIC(pp)) {
		pp->p_selock = -1;	/* pacify ASSERTs */
		pagescrub(pp, 0, PAGESIZE);
		pp->p_selock = 0;
	}
	pp->p_toxic = 0;
}

/*
 * Hunt down any pages in the system that have not yet been retired, invoking
 * the provided callback function on each of them.
 */
void
page_retire_hunt(void (*callback)(page_t *))
{
	page_t *pp;
	page_t *first;
	int i, found;

	PR_DEBUG(prd_hunt);

	if (PR_KSTAT_PENDING == 0) {
		return;
	}

	PR_DEBUG(prd_dohunt);

	found = 0;
	mutex_enter(&pr_q_mutex);

	for (i = 0; i < PR_PENDING_QMAX; i++) {
		if ((pp = pr_pending_q[i]) != NULL) {
			mutex_exit(&pr_q_mutex);
			callback(pp);
			mutex_enter(&pr_q_mutex);
			found++;
		}
	}

	if (PR_KSTAT_EQFAIL == PR_KSTAT_DQFAIL && found == PR_KSTAT_PENDING) {
		mutex_exit(&pr_q_mutex);
		PR_DEBUG(prd_earlyhunt);
		return;
	}
	mutex_exit(&pr_q_mutex);

	PR_DEBUG(prd_latehunt);

	/*
	 * We've lost track of a page somewhere. Hunt it down.
	 */
	memsegs_lock(0);
	pp = first = page_first();
	do {
		if (PP_PR_REQ(pp)) {
			callback(pp);
			if (++found == PR_KSTAT_PENDING) {
				break;	/* got 'em all */
			}
		}
	} while ((pp = page_next(pp)) != first);
	memsegs_unlock(0);
}

/*
 * The page_retire_thread loops forever, looking to see if there are
 * pages still waiting to be retired.
 */
static void
page_retire_thread(void)
{
	callb_cpr_t c;

	CALLB_CPR_INIT(&c, &pr_thread_mutex, callb_generic_cpr, "page_retire");

	mutex_enter(&pr_thread_mutex);
	for (;;) {
		if (pr_enable && PR_KSTAT_PENDING) {
			kmem_reap();
			seg_preap();
			page_retire_hunt(page_retire_thread_cb);
			CALLB_CPR_SAFE_BEGIN(&c);
			(void) cv_timedwait(&pr_cv, &pr_thread_mutex,
			    lbolt + pr_thread_shortwait);
			CALLB_CPR_SAFE_END(&c, &pr_thread_mutex);
		} else {
			CALLB_CPR_SAFE_BEGIN(&c);
			(void) cv_timedwait(&pr_cv, &pr_thread_mutex,
			    lbolt + pr_thread_longwait);
			CALLB_CPR_SAFE_END(&c, &pr_thread_mutex);
		}
	}
	/*NOTREACHED*/
}

/*
 * page_retire_pp() decides what to do with a failing page.
 *
 * When we get a free page (e.g. the scrubber or in the free path) life is
 * nice because the page is clean and marked free -- those always retire
 * nicely. From there we go by order of difficulty. If the page has data,
 * we attempt to relocate its contents to a suitable replacement page. If
 * that does not succeed, we look to see if it is clean. If after all of
 * this we have a clean, unmapped page (which we usually do!), we retire it.
 * If the page is not clean, we still retire it when it has seen a UE; for
 * CEs or FMA requests, we fail, leaving the page in service. The page will
 * eventually be tried again later. We always return with the page unlocked
 * since we are called from page_unlock().
 *
 * We don't call panic or do anything fancy down in here. Our boss the DE
 * gets paid handsomely to do his job of figuring out what to do when errors
 * occur. We just do what he tells us to do.
 */
static int
page_retire_pp(page_t *pp)
{
	int		toxic;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_iolock_state == 0);
	ASSERT(pp->p_szc == 0);

	PR_DEBUG(prd_top);
	PR_TYPES(pp);

	toxic = pp->p_toxic;
	ASSERT(toxic & PR_REASONS);

	if ((toxic & (PR_FMA | PR_MCE)) && !(toxic & PR_UE) &&
	    page_retire_limit()) {
		page_clrtoxic(pp, PR_FMA | PR_MCE | PR_MSG | PR_BUSY);
		page_retire_dequeue(pp);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_LIMIT));
	}

	if (PP_ISFREE(pp)) {
		PR_DEBUG(prd_free);
		if (!MTBF(recl_calls, recl_mtbf) || !page_reclaim(pp, NULL)) {
			PR_DEBUG(prd_noreclaim);
			PR_INCR_KSTAT(pr_failed);
			page_unlock(pp);
			return (page_retire_done(pp, PRD_FAILED));
		}
	}

	if ((toxic & PR_UE) == 0 && pp->p_vnode && !PP_ISFREE(pp) &&
	    !PP_ISNORELOC(pp) && MTBF(reloc_calls, reloc_mtbf)) {
		page_t *newpp;
		spgcnt_t count;

		/*
		 * If we can relocate the page, great! newpp will go
		 * on without us, and everything is fine.  Regardless
		 * of whether the relocation succeeds, we are still
		 * going to take `pp' around back and shoot it.
		 */
		PR_DEBUG(prd_reloc);
		newpp = NULL;
		if (page_relocate(&pp, &newpp, 0, 0, &count, NULL) == 0) {
			page_unlock(newpp);
			ASSERT(hat_page_getattr(pp, P_MOD) == 0);
		}
	}

	if (pp->p_vnode == &kvp) {
		PR_DEBUG(prd_kern);
		PR_INCR_KSTAT(pr_failed_kernel);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_FAILED));
	}

	if (pp->p_lckcnt || pp->p_cowcnt) {
		if (toxic & PR_UE) {
			(void) page_clear_lck_cow(pp, 1);
		} else {
			PR_DEBUG(prd_locked);
			PR_INCR_KSTAT(pr_failed);
			page_unlock(pp);
			return (page_retire_done(pp, PRD_FAILED));
		}
	}

	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	ASSERT(!PP_ISFREE(pp));
	ASSERT(!hat_page_is_mapped(pp));

	/*
	 * If the page is modified, was not relocated, and not toxic,
	 * we can't retire it without dropping data on the floor.
	 *
	 * RFE: we could change fsflush so that it (and only it) will
	 * be allowed to lock this page and push it out.  Once it cleans
	 * the page, we'd then be able to retire it on the free path.
	 * In practice, this should be exceedingly rare.
	 */
	if (hat_ismod(pp)) {
		if ((toxic & PR_UE) == 0) {
			PR_DEBUG(prd_modce);
			PR_INCR_KSTAT(pr_failed);
			page_unlock(pp);
			return (page_retire_done(pp, PRD_FAILED));
		} else if (page_retire_modified == 0) {
			PR_DEBUG(prd_modue_fail);
			PR_INCR_KSTAT(pr_failed);
			page_unlock(pp);
			return (page_retire_done(pp, PRD_FAILED));
		}
		PR_DEBUG(prd_modue_retire);
	}

	if (pp->p_vnode) {
		PR_DEBUG(prd_hashout);
		page_hashout(pp, NULL);
	}
	ASSERT(!pp->p_vnode);

	/*
	 * The problem page is locked, demoted, unmapped, not free,
	 * hashed out, and not COW or mlocked (whew!).
	 *
	 * Now we select our ammunition, take it around back, and shoot it.
	 */
	if (toxic & PR_UE) {
		if (hat_ismod(pp)) {
			/*
			 * Let the user know we are dropping their data
			 * on the floor.
			 */
			PR_MESSAGE(CE_WARN, 1, "Removing modified page "
			    "0x%08x.%08x from service",
			    mmu_ptob(pp->p_pagenum));
		}
		if (page_retire_transient_ue(pp)) {
			PR_DEBUG(prd_uescrubbed);
			return (page_retire_done(pp, PRD_UE_SCRUBBED));
		} else {
			PR_DEBUG(prd_uenotscrubbed);
			page_retire_destroy(pp);
			return (page_retire_done(pp, PRD_SUCCESS));
		}
	} else if (toxic & PR_FMA) {
		PR_DEBUG(prd_fma);
		page_retire_destroy(pp);
		return (page_retire_done(pp, PRD_SUCCESS));
	} else if (toxic & PR_MCE) {
		PR_DEBUG(prd_mce);
		page_retire_destroy(pp);
		return (page_retire_done(pp, PRD_SUCCESS));
	}
	panic("page_retire_pp: bad toxic flags %d", toxic);
	/*NOTREACHED*/
}

/*
 * Try to retire a page when we stumble onto it in the page lock routines.
 */
void
page_tryretire(page_t *pp)
{
	ASSERT(PAGE_EXCL(pp));

	if (!pr_enable) {
		page_unlock(pp);
		return;
	}

	/*
	 * If the page is a big page, try to break it up.
	 *
	 * If there are other bad pages besides `pp', they will be
	 * recursively retired for us thanks to a bit of magic.
	 * If the page is a small page with errors, try to retire it.
	 */
	if (pp->p_szc > 0) {
		if (PP_ISFREE(pp) && !page_try_demote_free_pages(pp)) {
			page_unlock(pp);
			PR_DEBUG(prd_nofreedemote);
			return;
		} else if (!page_try_demote_pages(pp)) {
			page_unlock(pp);
			PR_DEBUG(prd_nodemote);
			return;
		}
		PR_DEBUG(prd_demoted);
		page_unlock(pp);
	} else {
		(void) page_retire_pp(pp);
	}
}
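
/*
 * Illustrative sketch of a page_retire() caller; aflt_pa is a
 * hypothetical physical address, but the reason bit and return values
 * are the real ones documented atop page_retire() below:
 *
 *	int err = page_retire(aflt_pa, PR_FMA);
 *
 *	if (err == 0 || err == EBUSY)
 *		;	/ * page is now (or was already) out of service * /
 *	else if (err == EAGAIN)
 *		;	/ * marked toxic; will retire at next unlock * /
 *	else
 *		;	/ * EINVAL - pa is not relocatable memory * /
 */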

/*
 * page_retire() - the front door used to retire a page.
 *
 * Ideally, page_retire() would instantly retire the requested page.
 * Unfortunately, some pages are locked or otherwise tied up and cannot be
 * retired right away. To deal with that, bits are set in p_toxic of the
 * page_t. An attempt is made to lock the page; if the attempt is successful,
 * we instantly unlock the page, counting on page_unlock() to notice p_toxic
 * is nonzero and to call back into page_retire_pp(). Success is determined
 * by looking to see whether the page has been retired once it has been
 * unlocked.
 *
 * Returns:
 *
 *   - 0 on success,
 *   - EINVAL when the PA is whacko,
 *   - EBUSY if the page is already retired, or
 *   - EAGAIN if the page could not be _immediately_ retired.
 */
int
page_retire(uint64_t pa, uchar_t reason)
{
	page_t	*pp;

	ASSERT(reason & PR_REASONS);		/* there must be a reason */
	ASSERT(!(reason & ~PR_REASONS));	/* but no other bits */

	pp = page_numtopp_nolock(mmu_btop(pa));
	if (pp == NULL) {
		PR_MESSAGE(CE_WARN, 1, "Cannot schedule clearing of error on"
		    " page 0x%08x.%08x; page is not relocatable memory", pa);
		return (page_retire_done(pp, PRD_INVALID_PA));
	}
	if (PP_RETIRED(pp)) {
		return (page_retire_done(pp, PRD_DUPLICATE));
	}

	if (reason & PR_UE) {
		PR_MESSAGE(CE_NOTE, 1, "Scheduling clearing of error on"
		    " page 0x%08x.%08x", pa);
	} else {
		PR_MESSAGE(CE_NOTE, 1, "Scheduling removal of"
		    " page 0x%08x.%08x", pa);
	}
	page_settoxic(pp, reason);
	page_retire_enqueue(pp);

	/*
	 * And now for some magic.
	 *
	 * We marked this page toxic up above.  All there is left to do is
	 * to try to lock the page and then unlock it.  The page lock routines
	 * will intercept the page and retire it if they can.  If the page
	 * cannot be locked, that's okay -- page_unlock() or the background
	 * thread will eventually get it; until then, the lock routines will
	 * deny further locks on the page.
	 */
	if (MTBF(pr_calls, pr_mtbf) && page_trylock(pp, SE_EXCL)) {
		PR_DEBUG(prd_prlocked);
		page_unlock(pp);
	} else {
		PR_DEBUG(prd_prnotlocked);
	}

	if (PP_RETIRED(pp)) {
		PR_DEBUG(prd_prretired);
		return (0);
	} else {
		cv_signal(&pr_cv);
		PR_INCR_KSTAT(pr_failed);

		if (pp->p_toxic & PR_MSG) {
			return (page_retire_done(pp, PRD_FAILED));
		} else {
			return (page_retire_done(pp, PRD_PENDING));
		}
	}
}

/*
 * Take a retired page off the retired-pages vnode and clear the toxic flags.
 * If "free" is nonzero, lock it and put it back on the freelist. If "free"
 * is zero, the caller already holds SE_EXCL lock so we simply unretire it
 * and don't do anything else with it.
 *
 * Any unretire messages are printed from this routine.
 *
 * Returns 0 if page pp was unretired; else an error code.
 */
int
page_unretire_pp(page_t *pp, int free)
{
	/*
	 * To be retired, a page has to be hashed onto the retired_pages vnode
	 * and have PR_RETIRED set in p_toxic.
	 */
	if (free == 0 || page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) {
		ASSERT(PAGE_EXCL(pp));
		PR_DEBUG(prd_ulocked);
		if (!PP_RETIRED(pp)) {
			PR_DEBUG(prd_unotretired);
			page_unlock(pp);
			return (page_retire_done(pp, PRD_UNR_NOT));
		}

		PR_MESSAGE(CE_NOTE, 1, "unretiring retired"
		    " page 0x%08x.%08x", mmu_ptob(pp->p_pagenum));
		if (pp->p_toxic & PR_FMA) {
			PR_DECR_KSTAT(pr_fma);
		} else if (pp->p_toxic & PR_UE) {
			PR_DECR_KSTAT(pr_ue);
		} else {
			PR_DECR_KSTAT(pr_mce);
		}
		page_clrtoxic(pp, PR_ALLFLAGS);

		if (free) {
			PR_DEBUG(prd_udestroy);
			page_destroy(pp, 0);
		} else {
			PR_DEBUG(prd_uhashout);
			page_hashout(pp, NULL);
		}

		mutex_enter(&freemem_lock);
		availrmem++;
		mutex_exit(&freemem_lock);

		PR_DEBUG(prd_uunretired);
		PR_DECR_KSTAT(pr_retired);
		PR_INCR_KSTAT(pr_unretired);
		return (page_retire_done(pp, PRD_UNR_SUCCESS));
	}
	PR_DEBUG(prd_unotlocked);
	return (page_retire_done(pp, PRD_UNR_CANTLOCK));
}

/*
 * Return a page to service by moving it from the retired_pages vnode
 * onto the freelist.
 *
 * Called from mmioctl_page_retire() on behalf of the FMA DE.
 *
 * Returns:
 *
 *   - 0 if the page is unretired,
 *   - EAGAIN if the pp can not be locked,
 *   - EINVAL if the PA is whacko, and
 *   - EBADF if the pp is not retired.
 */
int
page_unretire(uint64_t pa)
{
	page_t	*pp;

	pp = page_numtopp_nolock(mmu_btop(pa));
	if (pp == NULL) {
		return (page_retire_done(pp, PRD_INVALID_PA));
	}

	return (page_unretire_pp(pp, 1));
}
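
/*
 * Illustrative sketch of the DE-driven unretire flow;
 * diagnosis_says_ok() is a hypothetical fmd-side policy check, while
 * page_retire_check() (below) and page_unretire() are the real
 * entry points:
 *
 *	uint64_t errs;
 *
 *	if (page_retire_check(pa, &errs) == 0 && diagnosis_says_ok(errs))
 *		(void) page_unretire(pa);	/ * 0 on success * /
 */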

/*
 * Test a page to see if it is retired. If errors is non-NULL, the toxic
 * bits of the page are returned. Returns 0 on success, error code on failure.
 */
int
page_retire_check_pp(page_t *pp, uint64_t *errors)
{
	int rc;

	if (PP_RETIRED(pp)) {
		PR_DEBUG(prd_checkhit);
		rc = 0;
	} else {
		PR_DEBUG(prd_checkmiss);
		rc = EAGAIN;
	}

	/*
	 * We have magically arranged the bit values returned to fmd(1M)
	 * to line up with the FMA, MCE, and UE bits of the page_t.
	 */
	if (errors) {
		uint64_t toxic = (uint64_t)(pp->p_toxic & PR_ERRMASK);
		if (toxic & PR_UE_SCRUBBED) {
			toxic &= ~PR_UE_SCRUBBED;
			toxic |= PR_UE;
		}
		*errors = toxic;
	}

	return (rc);
}

/*
 * Test to see if the page_t for a given PA is retired, and return the
 * hardware errors we have seen on the page if requested.
 *
 * Called from mmioctl_page_retire on behalf of the FMA DE.
 *
 * Returns:
 *
 *   - 0 if the page is retired,
 *   - EAGAIN if it is not, and
 *   - EINVAL if the PA is whacko.
 */
int
page_retire_check(uint64_t pa, uint64_t *errors)
{
	page_t	*pp;

	if (errors) {
		*errors = 0;
	}

	pp = page_numtopp_nolock(mmu_btop(pa));
	if (pp == NULL) {
		return (page_retire_done(pp, PRD_INVALID_PA));
	}

	return (page_retire_check_pp(pp, errors));
}

/*
 * Page retire self-test. For now, it always returns 0.
 */
int
page_retire_test(void)
{
	page_t *first, *pp, *cpp, *cpp2, *lpp;

	/*
	 * Tests the corner case where a large page can't be retired
	 * because one of the constituent pages is locked. We mark
	 * one page to be retired and try to retire it, and mark the
	 * other page to be retired but don't try to retire it, so
	 * that page_unlock() in the failure path will recurse and try
	 * to retire THAT page. This is the worst possible situation
	 * we can get ourselves into.
	 */
	memsegs_lock(0);
	pp = first = page_first();
	do {
		if (pp->p_szc && PP_PAGEROOT(pp) == pp) {
			cpp = pp + 1;
			lpp = PP_ISFREE(pp)? pp : pp + 2;
			cpp2 = pp + 3;
			if (!page_trylock(lpp, pp == lpp? SE_EXCL : SE_SHARED))
				continue;
			if (!page_trylock(cpp, SE_EXCL)) {
				page_unlock(lpp);
				continue;
			}
			page_settoxic(cpp, PR_FMA | PR_BUSY);
			page_settoxic(cpp2, PR_FMA);
			page_tryretire(cpp);	/* will fail */
			page_unlock(lpp);
			(void) page_retire(mmu_ptob(cpp->p_pagenum), PR_FMA);
			(void) page_retire(mmu_ptob(cpp2->p_pagenum), PR_FMA);
		}
	} while ((pp = page_next(pp)) != first);
	memsegs_unlock(0);

	return (0);
}