/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Page Retire - Big Theory Statement.
 *
 * This file handles removing sections of faulty memory from use when the
 * user land FMA Diagnosis Engine requests that a page be removed or when
 * a CE or UE is detected by the hardware.
 *
 * In the bad old days, the kernel side of Page Retire did a lot of the work
 * on its own. Now, with the DE keeping track of errors, the kernel side is
 * rather simple minded on most platforms.
 *
 * Errors are all reflected to the DE, and after digesting the error and
 * looking at all previously reported errors, the DE decides what should
 * be done about the current error. If the DE wants a particular page to
 * be retired, then the kernel page retire code is invoked via an ioctl.
 * On non-FMA platforms, the ue_drain and ce_drain paths end up calling
 * page retire to handle the error. Since page retire is just a simple
 * mechanism it doesn't need to differentiate between the different callers.
 *
 * The p_toxic field in the page_t is used to indicate which errors have
 * occurred and what action has been taken on a given page. Because errors are
 * reported without regard to the locked state of a page, no locks are used
 * to SET the error bits in p_toxic. However, in order to clear the error
 * bits, the page_t must be held exclusively locked.
 *
 * When page_retire() is called, it must be able to acquire locks, sleep, etc.
 * It must not be called from high-level interrupt context.
 *
 * Depending on how the requested page is being used at the time of the retire
 * request (and on the availability of sufficient system resources), the page
 * may be retired immediately, or just marked for retirement later. For
 * example, locked pages are marked, while free pages are retired. Multiple
 * requests may be made to retire the same page, although there is no need
 * to: once the p_toxic flags are set, the page will be retired as soon as it
 * can be exclusively locked.
 *
 * The retire mechanism is driven centrally out of page_unlock(). To expedite
 * the retirement of pages, further requests for SE_SHARED locks are denied
 * as long as a page retirement is pending. In addition, as long as pages are
 * pending retirement a background thread runs periodically trying to retire
 * those pages. Pages which could not be retired while the system is running
 * are scrubbed prior to rebooting to avoid latent errors on the next boot.
 *
 * Single CE pages and UE pages without persistent errors are scrubbed and
 * returned to service. Recidivist pages, as well as FMA-directed requests
 * for retirement, result in the page being taken out of service. Once the
 * decision is made to take a page out of service, the page is cleared, hashed
 * onto the retired_pages vnode, marked as retired, and it is unlocked. No
 * other requesters (except for unretire) are allowed to lock retired pages.
 *
 * The public routines return (sadly) 0 if they worked and a non-zero error
 * value if something went wrong. This is done for the ioctl side of the
 * world to allow errors to be reflected all the way out to user land. The
 * non-zero values are explained in comments atop each function.
 */
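
/*
 * A rough sketch of the flow described above (the userland and ioctl
 * plumbing is paraphrased here; see the mmioctl_page_retire() references
 * later in this file):
 *
 *	FMA DE (userland)
 *	  -> ioctl on the mem driver (mmioctl_page_retire())
 *	    -> page_retire(pa, PR_FMA)	sets p_toxic and enqueues the page
 *	      -> page_trylock() / page_unlock()
 *	        -> page_tryretire() -> page_retire_pp()	does the real work
 *
 * On non-FMA platforms the CE/UE drain routines call page_retire()
 * directly, with PR_MCE or PR_UE as the reason.
 */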

/*
 * Things to fix:
 *
 *	1. Cleanup SE_EWANTED. Since we're aggressive about trying to retire
 *	pages, we can use page_retire_pp() to replace SE_EWANTED and all
 *	the special delete_memory_thread() code just goes away.
 *
 *	2. Trying to retire non-relocatable kvp pages may result in a
 *	quagmire. This is because seg_kmem() no longer keeps its pages locked,
 *	and calls page_lookup() in the free path; since kvp pages are modified
 *	and don't have a usable backing store, page_retire() can't do anything
 *	with them, and we'll keep denying the lock to seg_kmem_free() in a
 *	vicious cycle. To prevent that, we don't deny locks to kvp pages, and
 *	hence only call page_retire_pp() from page_unlock() in the free path.
 *	Since most kernel pages are indefinitely held anyway, and don't
 *	participate in I/O, this is of little consequence.
 *
 *	3. Low memory situations will be interesting. If we don't have
 *	enough memory for page_relocate() to succeed, we won't be able to
 *	retire dirty pages; nobody will be able to push them out to disk
 *	either, since we aggressively deny the page lock. We could change
 *	fsflush so it can recognize this situation, grab the lock, and push
 *	the page out, where we'll catch it in the free path and retire it.
 *
 *	4. Beware of places that have code like this in them:
 *
 *		if (! page_tryupgrade(pp)) {
 *			page_unlock(pp);
 *			while (! page_lock(pp, SE_EXCL, NULL, P_RECLAIM)) {
 *				/ *NOTHING* /
 *			}
 *		}
 *		page_free(pp);
 *
 *	The problem is that pp can change identity right after the
 *	page_unlock() call. In particular, page_retire() can step in
 *	there, change pp's identity, and hash pp onto the retired_vnode.
 *
 *	Of course, other functions besides page_retire() can have the
 *	same effect. A kmem reader can waltz by, set up a mapping to the
 *	page, and then unlock the page. Page_free() will then go castors
 *	up. So if anybody is doing this, it's already a bug.
 *
 *	5. mdboot()'s call into page_retire_hunt() should probably be
 *	moved lower. Where the call is made now, we can get into trouble
 *	by scrubbing a kernel page that is then accessed later.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cmn_err.h>
#include <sys/ksynch.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/ontrap.h>
#include <sys/vmsystm.h>
#include <sys/mem_config.h>
#include <sys/atomic.h>
#include <sys/callb.h>
#include <vm/page.h>
#include <vm/vm_dep.h>
#include <vm/as.h>
#include <vm/hat.h>

/*
 * vnode for all pages which are retired from the VM system;
 */
vnode_t *retired_pages;

/*
 * Background thread that wakes up periodically to try to retire pending
 * pages. This prevents threads from becoming blocked indefinitely in
 * page_lookup() or some other routine should the page(s) they are waiting
 * on become eligible for social security.
 */
static void page_retire_thread(void);
static kthread_t *pr_thread_id;
static kcondvar_t pr_cv;
static kmutex_t pr_thread_mutex;
static clock_t pr_thread_shortwait;
static clock_t pr_thread_longwait;

/*
 * Make a list of all of the pages that have been marked for retirement
 * but are not yet retired. At system shutdown, we will scrub all of the
 * pages in the list in case there are outstanding UEs. Then, we
 * cross-check this list against the number of pages that are yet to be
 * retired, and if we find inconsistencies, we scan every page_t in the
 * whole system looking for any pages that need to be scrubbed for UEs.
 * The background thread also uses this queue to determine which pages
 * it should keep trying to retire.
 */
#ifdef	DEBUG
#define	PR_PENDING_QMAX	32
#else	/* DEBUG */
#define	PR_PENDING_QMAX	256
#endif	/* DEBUG */
page_t *pr_pending_q[PR_PENDING_QMAX];
kmutex_t pr_q_mutex;

/*
 * Page retire global kstats
 */
struct page_retire_kstat {
	kstat_named_t	pr_retired;
	kstat_named_t	pr_requested;
	kstat_named_t	pr_requested_free;
	kstat_named_t	pr_enqueue_fail;
	kstat_named_t	pr_dequeue_fail;
	kstat_named_t	pr_pending;
	kstat_named_t	pr_failed;
	kstat_named_t	pr_failed_kernel;
	kstat_named_t	pr_limit;
	kstat_named_t	pr_limit_exceeded;
	kstat_named_t	pr_fma;
	kstat_named_t	pr_mce;
	kstat_named_t	pr_ue;
	kstat_named_t	pr_ue_cleared_retire;
	kstat_named_t	pr_ue_cleared_free;
	kstat_named_t	pr_ue_persistent;
	kstat_named_t	pr_unretired;
};

static struct page_retire_kstat page_retire_kstat = {
	{ "pages_retired",		KSTAT_DATA_UINT64},
	{ "pages_retire_request",	KSTAT_DATA_UINT64},
	{ "pages_retire_request_free",	KSTAT_DATA_UINT64},
	{ "pages_notenqueued",		KSTAT_DATA_UINT64},
	{ "pages_notdequeued",		KSTAT_DATA_UINT64},
	{ "pages_pending",		KSTAT_DATA_UINT64},
	{ "pages_deferred",		KSTAT_DATA_UINT64},
	{ "pages_deferred_kernel",	KSTAT_DATA_UINT64},
	{ "pages_limit",		KSTAT_DATA_UINT64},
	{ "pages_limit_exceeded",	KSTAT_DATA_UINT64},
	{ "pages_fma",			KSTAT_DATA_UINT64},
	{ "pages_multiple_ce",		KSTAT_DATA_UINT64},
	{ "pages_ue",			KSTAT_DATA_UINT64},
	{ "pages_ue_cleared_retired",	KSTAT_DATA_UINT64},
	{ "pages_ue_cleared_freed",	KSTAT_DATA_UINT64},
	{ "pages_ue_persistent",	KSTAT_DATA_UINT64},
	{ "pages_unretired",		KSTAT_DATA_UINT64},
};

static kstat_t *page_retire_ksp = NULL;

#define	PR_INCR_KSTAT(stat)	\
	atomic_add_64(&(page_retire_kstat.stat.value.ui64), 1)
#define	PR_DECR_KSTAT(stat)	\
	atomic_add_64(&(page_retire_kstat.stat.value.ui64), -1)

#define	PR_KSTAT_RETIRED_CE	(page_retire_kstat.pr_mce.value.ui64)
#define	PR_KSTAT_RETIRED_FMA	(page_retire_kstat.pr_fma.value.ui64)
#define	PR_KSTAT_RETIRED_NOTUE	(PR_KSTAT_RETIRED_CE + PR_KSTAT_RETIRED_FMA)
#define	PR_KSTAT_PENDING	(page_retire_kstat.pr_pending.value.ui64)
#define	PR_KSTAT_EQFAIL		(page_retire_kstat.pr_enqueue_fail.value.ui64)
#define	PR_KSTAT_DQFAIL		(page_retire_kstat.pr_dequeue_fail.value.ui64)

/*
 * Limit the number of multiple CE page retires.
 * The default is 0.1% of physmem, or 1 in 1000 pages. This is set in
 * basis points, where 100 basis points equals one percent.
 */
#define	MCE_BPT	10
uint64_t	max_pages_retired_bps = MCE_BPT;
#define	PAGE_RETIRE_LIMIT	((physmem * max_pages_retired_bps) / 10000)
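
/*
 * Worked example (illustrative numbers only): with physmem of 1048576
 * pages (4GB of 4K pages) and the default of 10 basis points, the limit
 * evaluates to (1048576 * 10) / 10000 = 1048 pages, i.e. roughly 0.1%
 * of physical memory may be retired for non-UE errors.
 */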

/*
 * Control over the verbosity of page retirement.
 *
 * When set to zero (the default), no messages will be printed.
 * When set to one, summary messages will be printed.
 * When set > one, all messages will be printed.
 *
 * A value of one will trigger detailed messages for retirement operations,
 * and is intended as a platform tunable for processors where FMA's DE does
 * not run (e.g., spitfire). Values > one are intended for debugging only.
 */
int page_retire_messages = 0;

/*
 * Control whether or not we retire dirty UE pages. By default we do
 * since we assume the data is corrupt and the process(es) using it will
 * be killed. This is platform tunable only, and should probably not be
 * changed, ever.
 */
int page_retire_modified = 1;

/*
 * Control whether or not we return scrubbed UE pages to service.
 * By default we do not since FMA wants to run its diagnostics first
 * and then ask us to unretire the page if it passes. Non-FMA platforms
 * may set this to zero so we will only retire recidivist pages. It should
 * not be changed by the user.
 */
int page_retire_first_ue = 1;

/*
 * Master enable for page retire. This prevents a CE or UE early in boot
 * from trying to retire a page before page_retire_init() has finished
 * setting things up. This is internal only and is not a tunable!
 */
static int pr_enable = 0;

extern struct vnode kvp;

#ifdef	DEBUG
struct page_retire_debug {
	int prd_dup;
	int prd_noaction;
	int prd_queued;
	int prd_notqueued;
	int prd_dequeue;
	int prd_top;
	int prd_locked;
	int prd_reloc;
	int prd_modce;
	int prd_modue_fail;
	int prd_modue_retire;
	int prd_kern;
	int prd_free;
	int prd_noreclaim;
	int prd_hashout;
	int prd_fma;
	int prd_uescrubbed;
	int prd_uenotscrubbed;
	int prd_mce;
	int prd_prlocked;
	int prd_prnotlocked;
	int prd_prretired;
	int prd_ulocked;
	int prd_unotretired;
	int prd_udestroy;
	int prd_uhashout;
	int prd_uunretired;
	int prd_unotlocked;
	int prd_checkhit;
	int prd_checkmiss;
	int prd_tctop;
	int prd_tclocked;
	int prd_hunt;
	int prd_dohunt;
	int prd_earlyhunt;
	int prd_latehunt;
	int prd_nofreedemote;
	int prd_nodemote;
	int prd_demoted;
} pr_debug;

#define	PR_DEBUG(foo)	((pr_debug.foo)++)

/*
 * A type histogram. We record the incidence of the various toxic
 * flag combinations along with the interesting page attributes. The
 * goal is to get as many combinations as we can while driving all
 * pr_debug values nonzero (indicating we've exercised all possible
 * code paths across all possible page types). Not all combinations
 * will make sense -- e.g. PRT_MOD|PRT_KERNEL.
 *
 * pr_type offset bit encoding (when examining with a debugger):
 *
 *	PRT_NAMED - 0x4
 *	PRT_KERNEL - 0x8
 *	PRT_FREE - 0x10
 *	PRT_MOD - 0x20
 *	PRT_FMA - 0x0
 *	PRT_MCE - 0x40
 *	PRT_UE - 0x80
 */

#define	PRT_NAMED	0x01
#define	PRT_KERNEL	0x02
#define	PRT_FREE	0x04
#define	PRT_MOD		0x08
#define	PRT_FMA		0x00	/* yes, this is not a mistake */
#define	PRT_MCE		0x10
#define	PRT_UE		0x20
#define	PRT_ALL		0x3F

int	pr_types[PRT_ALL+1];

#define	PR_TYPES(pp)	{			\
	int whichtype = 0;			\
	if (pp->p_vnode)			\
		whichtype |= PRT_NAMED;		\
	if (pp->p_vnode == &kvp)		\
		whichtype |= PRT_KERNEL;	\
	if (PP_ISFREE(pp))			\
		whichtype |= PRT_FREE;		\
	if (hat_ismod(pp))			\
		whichtype |= PRT_MOD;		\
	if (pp->p_toxic & PR_UE)		\
		whichtype |= PRT_UE;		\
	if (pp->p_toxic & PR_MCE)		\
		whichtype |= PRT_MCE;		\
	pr_types[whichtype]++;			\
}

int recl_calls;
int recl_mtbf = 3;
int reloc_calls;
int reloc_mtbf = 7;
int pr_calls;
int pr_mtbf = 15;

#define	MTBF(v, f)	(((++(v)) & (f)) != (f))

#else	/* DEBUG */

#define	PR_DEBUG(foo)	/* nothing */
#define	PR_TYPES(foo)	/* nothing */
#define	MTBF(v, f)	(1)

#endif	/* DEBUG */
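
/*
 * A note on the DEBUG-only MTBF() macro above: with a mask of the form
 * 2^n - 1 it evaluates to false once every 2^n calls, so recl_mtbf = 3,
 * reloc_mtbf = 7 and pr_mtbf = 15 make roughly 1 in 4 reclaim attempts,
 * 1 in 8 relocations and 1 in 16 retire attempts artificially "fail" on
 * DEBUG kernels in order to exercise the failure paths. Non-DEBUG
 * kernels define it to 1, so it never interferes.
 */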

/*
 * page_retire_done() - completion processing
 *
 * Used by the page_retire code for common completion processing.
 * It keeps track of how many times a given result has happened,
 * and writes out an occasional message.
 *
 * May be called with a NULL pp (PRD_INVALID_PA case).
 */
#define	PRD_INVALID_KEY		-1
#define	PRD_SUCCESS		0
#define	PRD_PENDING		1
#define	PRD_FAILED		2
#define	PRD_DUPLICATE		3
#define	PRD_INVALID_PA		4
#define	PRD_LIMIT		5
#define	PRD_UE_SCRUBBED		6
#define	PRD_UNR_SUCCESS		7
#define	PRD_UNR_CANTLOCK	8
#define	PRD_UNR_NOT		9

typedef struct page_retire_op {
	int	pr_key;		/* one of the PRD_* defines from above */
	int	pr_count;	/* How many times this has happened */
	int	pr_retval;	/* return value */
	int	pr_msglvl;	/* message level - when to print */
	char	*pr_message;	/* Cryptic message for field service */
} page_retire_op_t;

static page_retire_op_t page_retire_ops[] = {
	/* key			count	retval	msglvl	message */
	{PRD_SUCCESS,		0,	0,	1,
		"Page 0x%08x.%08x removed from service"},
	{PRD_PENDING,		0,	EAGAIN,	2,
		"Page 0x%08x.%08x will be retired on free"},
	{PRD_FAILED,		0,	EAGAIN,	0, NULL},
	{PRD_DUPLICATE,		0,	EBUSY,	2,
		"Page 0x%08x.%08x already retired"},
	{PRD_INVALID_PA,	0,	EINVAL,	2,
		"PA 0x%08x.%08x is not a relocatable page"},
	{PRD_LIMIT,		0,	0,	1,
		"Page 0x%08x.%08x not retired due to limit exceeded"},
	{PRD_UE_SCRUBBED,	0,	0,	1,
		"Previously reported error on page 0x%08x.%08x cleared"},
	{PRD_UNR_SUCCESS,	0,	0,	1,
		"Page 0x%08x.%08x returned to service"},
	{PRD_UNR_CANTLOCK,	0,	EAGAIN,	2,
		"Page 0x%08x.%08x could not be unretired"},
	{PRD_UNR_NOT,		0,	EBADF,	2,
		"Page 0x%08x.%08x is not retired"},
	{PRD_INVALID_KEY,	0,	0,	0, NULL} /* MUST BE LAST! */
};

/*
 * print a message if page_retire_messages is true.
 */
#define	PR_MESSAGE(debuglvl, msglvl, msg, pa)				\
{									\
	uint64_t p = (uint64_t)pa;					\
	if (page_retire_messages >= msglvl && msg != NULL) {		\
		cmn_err(debuglvl, msg,					\
		    (uint32_t)(p >> 32), (uint32_t)p);			\
	}								\
}
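
/*
 * For example, a PA of 0x123456000 passed to PR_MESSAGE() with one of
 * the format strings above is split into its high and low 32-bit halves
 * and printed as "0x00000001.23456000".
 */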

/*
 * Note that multiple bits may be set in a single settoxic operation.
 * May be called without the page locked.
 */
void
page_settoxic(page_t *pp, uchar_t bits)
{
	atomic_or_8(&pp->p_toxic, bits);
}

/*
 * Note that multiple bits may be cleared in a single clrtoxic operation.
 * Must be called with the page exclusively locked.
 */
void
page_clrtoxic(page_t *pp, uchar_t bits)
{
	ASSERT(PAGE_EXCL(pp));
	atomic_and_8(&pp->p_toxic, ~bits);
}

/*
 * Prints any page retire messages to the user, and decides what
 * error code is appropriate for the condition reported.
 */
static int
page_retire_done(page_t *pp, int code)
{
	page_retire_op_t *prop;
	uint64_t	pa = 0;
	int		i;

	if (pp != NULL) {
		pa = mmu_ptob(pp->p_pagenum);
	}

	prop = NULL;
	for (i = 0; page_retire_ops[i].pr_key != PRD_INVALID_KEY; i++) {
		if (page_retire_ops[i].pr_key == code) {
			prop = &page_retire_ops[i];
			break;
		}
	}

#ifdef	DEBUG
	if (page_retire_ops[i].pr_key == PRD_INVALID_KEY) {
		cmn_err(CE_PANIC, "page_retire_done: Invalid opcode %d", code);
	}
#endif

	ASSERT(prop->pr_key == code);

	prop->pr_count++;

	PR_MESSAGE(CE_NOTE, prop->pr_msglvl, prop->pr_message, pa);
	if (pp != NULL) {
		page_settoxic(pp, PR_MSG);
	}

	return (prop->pr_retval);
}
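
/*
 * For example, a successful retire reported via page_retire_done(pp,
 * PRD_SUCCESS) bumps that entry's count, prints "Page 0x%08x.%08x
 * removed from service" (with the PA filled in) when
 * page_retire_messages >= 1, marks the page with PR_MSG to record that
 * a message has been issued, and returns 0 to the caller.
 */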

/*
 * On a reboot, our friend mdboot() wants to clear up any PP_PR_REQ() pages
 * that we were not able to retire. On large machines, walking the complete
 * page_t array and looking at every page_t takes too long. So, as a page is
 * marked toxic, we track it using a list that can be processed at reboot
 * time. page_retire_enqueue() will do its best to try to avoid duplicate
 * entries, but if we get too many errors at once the queue can overflow,
 * in which case we will end up walking every page_t as a last resort.
 * The background thread also makes use of this queue to find which pages
 * are pending retirement.
 */
static void
page_retire_enqueue(page_t *pp)
{
	int	nslot = -1;
	int	i;

	mutex_enter(&pr_q_mutex);

	/*
	 * Check to make sure retire hasn't already dequeued it.
	 * In the meantime if the page was cleaned up, no need
	 * to enqueue it.
	 */
	if (PP_RETIRED(pp) || pp->p_toxic == 0) {
		mutex_exit(&pr_q_mutex);
		PR_DEBUG(prd_noaction);
		return;
	}

	for (i = 0; i < PR_PENDING_QMAX; i++) {
		if (pr_pending_q[i] == pp) {
			mutex_exit(&pr_q_mutex);
			PR_DEBUG(prd_dup);
			return;
		} else if (nslot == -1 && pr_pending_q[i] == NULL) {
			nslot = i;
		}
	}

	PR_INCR_KSTAT(pr_pending);

	if (nslot != -1) {
		pr_pending_q[nslot] = pp;
		PR_DEBUG(prd_queued);
	} else {
		PR_INCR_KSTAT(pr_enqueue_fail);
		PR_DEBUG(prd_notqueued);
	}
	mutex_exit(&pr_q_mutex);
}

static void
page_retire_dequeue(page_t *pp)
{
	int i;

	mutex_enter(&pr_q_mutex);

	for (i = 0; i < PR_PENDING_QMAX; i++) {
		if (pr_pending_q[i] == pp) {
			pr_pending_q[i] = NULL;
			break;
		}
	}

	if (i == PR_PENDING_QMAX) {
		PR_INCR_KSTAT(pr_dequeue_fail);
	}

	PR_DECR_KSTAT(pr_pending);
	PR_DEBUG(prd_dequeue);

	mutex_exit(&pr_q_mutex);
}

/*
 * Act like page_destroy(), but instead of freeing the page, hash it onto
 * the retired_pages vnode, and mark it retired.
 *
 * For fun, we try to scrub the page until it's squeaky clean.
 * availrmem is adjusted here.
 */
static void
page_retire_destroy(page_t *pp)
{
	ASSERT(PAGE_EXCL(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_szc == 0);
	ASSERT(!hat_page_is_mapped(pp));
	ASSERT(!pp->p_vnode);

	page_clr_all_props(pp);
	pagescrub(pp, 0, MMU_PAGESIZE);

	pp->p_next = NULL;
	pp->p_prev = NULL;
	if (page_hashin(pp, retired_pages, (u_offset_t)pp, NULL) == 0) {
		cmn_err(CE_PANIC, "retired page %p hashin failed", (void *)pp);
	}

	page_settoxic(pp, PR_RETIRED);
	page_clrtoxic(pp, PR_BUSY);
	page_retire_dequeue(pp);
	PR_INCR_KSTAT(pr_retired);

	if (pp->p_toxic & PR_FMA) {
		PR_INCR_KSTAT(pr_fma);
	} else if (pp->p_toxic & PR_UE) {
		PR_INCR_KSTAT(pr_ue);
	} else {
		PR_INCR_KSTAT(pr_mce);
	}

	mutex_enter(&freemem_lock);
	availrmem--;
	mutex_exit(&freemem_lock);

	page_unlock(pp);
}

/*
 * Check whether the number of pages which have been retired already exceeds
 * the maximum allowable percentage of memory which may be retired.
 *
 * Returns 1 if the limit has been exceeded.
 */
static int
page_retire_limit(void)
{
	if (PR_KSTAT_RETIRED_NOTUE >= (uint64_t)PAGE_RETIRE_LIMIT) {
		PR_INCR_KSTAT(pr_limit_exceeded);
		return (1);
	}

	return (0);
}

#define	MSG_DM	"Data Mismatch occurred at PA 0x%08x.%08x"		\
	"[ 0x%x != 0x%x ] while attempting to clear previously "	\
	"reported error; page removed from service"

#define	MSG_UE	"Uncorrectable Error occurred at PA 0x%08x.%08x while "	\
	"attempting to clear previously reported error; page removed "	\
	"from service"

/*
 * Attempt to clear a UE from a page.
 * Returns 1 if the error has been successfully cleared.
 */
static int
page_clear_transient_ue(page_t *pp)
{
	caddr_t		kaddr;
	uint8_t		rb, wb;
	uint64_t	pa;
	uint32_t	pa_hi, pa_lo;
	on_trap_data_t	otd;
	int		errors = 0;
	int		i;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_PR_REQ(pp));
	ASSERT(pp->p_szc == 0);
	ASSERT(!hat_page_is_mapped(pp));

	/*
	 * Clear the page and attempt to clear the UE. If we trap
	 * on the next access to the page, we know the UE has recurred.
	 */
	pagescrub(pp, 0, PAGESIZE);

	/*
	 * Map the page and write a bunch of bit patterns to compare
	 * what we wrote with what we read back. This isn't a perfect
	 * test but it should be good enough to catch most of the
	 * recurring UEs. If this fails to catch a recurrent UE, we'll
	 * retire the page the next time we see a UE on the page.
	 */
	kaddr = ppmapin(pp, PROT_READ|PROT_WRITE, (caddr_t)-1);

	pa = ptob((uint64_t)page_pptonum(pp));
	pa_hi = (uint32_t)(pa >> 32);
	pa_lo = (uint32_t)pa;

	/*
	 * Fill the page with each (0x00 - 0xFF] bit pattern, flushing
	 * the cache in between reading and writing. We do this under
	 * on_trap() protection to avoid recursion.
	 */
	if (on_trap(&otd, OT_DATA_EC)) {
		PR_MESSAGE(CE_WARN, 1, MSG_UE, pa);
		errors = 1;
	} else {
		for (wb = 0xff; wb > 0; wb--) {
			for (i = 0; i < PAGESIZE; i++) {
				kaddr[i] = wb;
			}

			sync_data_memory(kaddr, PAGESIZE);

			for (i = 0; i < PAGESIZE; i++) {
				rb = kaddr[i];
				if (rb != wb) {
					/*
					 * We had a mismatch without a trap.
					 * Uh-oh. Something is really wrong
					 * with this system.
					 */
					if (page_retire_messages) {
						cmn_err(CE_WARN, MSG_DM,
						    pa_hi, pa_lo, rb, wb);
					}
					errors = 1;
					goto out;	/* double break */
				}
			}
		}
	}
out:
	no_trap();
	ppmapout(kaddr);

	return (errors ? 0 : 1);
}

/*
 * Try to clear a page_t with a single UE. If the UE was transient, it is
 * returned to service, and we return 1. Otherwise we return 0 meaning
 * that further processing is required to retire the page.
 */
static int
page_retire_transient_ue(page_t *pp)
{
	ASSERT(PAGE_EXCL(pp));
	ASSERT(!hat_page_is_mapped(pp));

	/*
	 * If this page is a repeat offender, retire him under the
	 * "two strikes and you're out" rule. The caller is responsible
	 * for scrubbing the page to try to clear the error.
	 */
	if (pp->p_toxic & PR_UE_SCRUBBED) {
		PR_INCR_KSTAT(pr_ue_persistent);
		return (0);
	}

	if (page_clear_transient_ue(pp)) {
		/*
		 * We set the PR_UE_SCRUBBED bit; if we ever see this
		 * page again, we will retire it, no questions asked.
		 */
		page_settoxic(pp, PR_UE_SCRUBBED);

		if (page_retire_first_ue) {
			PR_INCR_KSTAT(pr_ue_cleared_retire);
			return (0);
		} else {
			PR_INCR_KSTAT(pr_ue_cleared_free);

			page_clrtoxic(pp, PR_UE | PR_MCE | PR_MSG | PR_BUSY);
			page_retire_dequeue(pp);

			/*
			 * Clear the free bit if it's set, since the
			 * page free code will get cranky if we don't.
			 */
			PP_CLRFREE(pp);

			/* LINTED: CONSTCOND */
			VN_DISPOSE(pp, B_FREE, 1, kcred);
			return (1);
		}
	}

	PR_INCR_KSTAT(pr_ue_persistent);
	return (0);
}

/*
 * Update the statistics dynamically when our kstat is read.
 */
static int
page_retire_kstat_update(kstat_t *ksp, int rw)
{
	struct page_retire_kstat *pr;

	if (ksp == NULL)
		return (EINVAL);

	switch (rw) {

	case KSTAT_READ:
		pr = (struct page_retire_kstat *)ksp->ks_data;
		ASSERT(pr == &page_retire_kstat);
		pr->pr_limit.value.ui64 = PAGE_RETIRE_LIMIT;
		return (0);

	case KSTAT_WRITE:
		return (EACCES);

	default:
		return (EINVAL);
	}
	/*NOTREACHED*/
}
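
/*
 * The page_retire kstat installed by page_retire_init() below can be
 * inspected from userland; for example (assuming the usual kstat(1M)
 * utility is available):
 *
 *	$ kstat -m unix -n page_retire
 *
 * reports pages_retired, pages_pending, pages_limit and the other
 * counters named in page_retire_kstat above.
 */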

/*
 * Initialize the page retire mechanism:
 *
 *	- Establish the correctable error retire limit.
 *	- Initialize locks.
 *	- Build the retired_pages vnode.
 *	- Set up the kstats.
 *	- Fire off the background thread.
 *	- Tell page_tryretire() it's OK to start retiring pages.
 */
void
page_retire_init(void)
{
	const fs_operation_def_t retired_vnodeops_template[] = {NULL, NULL};
	struct vnodeops *vops;

	const uint_t page_retire_ndata =
	    sizeof (page_retire_kstat) / sizeof (kstat_named_t);

	ASSERT(page_retire_ksp == NULL);

	if (max_pages_retired_bps <= 0) {
		max_pages_retired_bps = MCE_BPT;
	}

	mutex_init(&pr_q_mutex, NULL, MUTEX_DEFAULT, NULL);

	retired_pages = vn_alloc(KM_SLEEP);
	if (vn_make_ops("retired_pages", retired_vnodeops_template, &vops)) {
		cmn_err(CE_PANIC,
		    "page_retire_init: can't make retired vnodeops");
	}
	vn_setops(retired_pages, vops);

	if ((page_retire_ksp = kstat_create("unix", 0, "page_retire",
	    "misc", KSTAT_TYPE_NAMED, page_retire_ndata,
	    KSTAT_FLAG_VIRTUAL)) == NULL) {
		cmn_err(CE_WARN, "kstat_create for page_retire failed");
	} else {
		page_retire_ksp->ks_data = (void *)&page_retire_kstat;
		page_retire_ksp->ks_update = page_retire_kstat_update;
		kstat_install(page_retire_ksp);
	}

	pr_thread_shortwait = 23 * hz;
	pr_thread_longwait = 1201 * hz;
	mutex_init(&pr_thread_mutex, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&pr_cv, NULL, CV_DEFAULT, NULL);
	pr_thread_id = thread_create(NULL, 0, page_retire_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);

	pr_enable = 1;
}

/*
 * page_retire_hunt() callback for the retire thread.
 */
static void
page_retire_thread_cb(page_t *pp)
{
	PR_DEBUG(prd_tctop);
	if (pp->p_vnode != &kvp && page_trylock(pp, SE_EXCL)) {
		PR_DEBUG(prd_tclocked);
		page_unlock(pp);
	}
}

/*
 * page_retire_hunt() callback for mdboot().
 *
 * It is necessary to scrub any failing pages prior to reboot in order to
 * prevent a latent error trap from occurring on the next boot.
 */
void
page_retire_mdboot_cb(page_t *pp)
{
	/*
	 * Don't scrub the kernel, since we might still need it, unless
	 * we have UEs on the page, in which case we have nothing to lose.
	 */
	if (pp->p_vnode != &kvp || PP_TOXIC(pp)) {
		pp->p_selock = -1;	/* pacify ASSERTs */
		pagescrub(pp, 0, PAGESIZE);
		pp->p_selock = 0;
	}
	pp->p_toxic = 0;
}

/*
 * Hunt down any pages in the system that have not yet been retired, invoking
 * the provided callback function on each of them.
 */
void
page_retire_hunt(void (*callback)(page_t *))
{
	page_t *pp;
	page_t *first;
	int i, found;

	PR_DEBUG(prd_hunt);

	if (PR_KSTAT_PENDING == 0) {
		return;
	}

	PR_DEBUG(prd_dohunt);

	found = 0;
	mutex_enter(&pr_q_mutex);

	for (i = 0; i < PR_PENDING_QMAX; i++) {
		if ((pp = pr_pending_q[i]) != NULL) {
			mutex_exit(&pr_q_mutex);
			callback(pp);
			mutex_enter(&pr_q_mutex);
			found++;
		}
	}

	if (PR_KSTAT_EQFAIL == PR_KSTAT_DQFAIL && found == PR_KSTAT_PENDING) {
		mutex_exit(&pr_q_mutex);
		PR_DEBUG(prd_earlyhunt);
		return;
	}
	mutex_exit(&pr_q_mutex);

	PR_DEBUG(prd_latehunt);

	/*
	 * We've lost track of a page somewhere. Hunt it down.
	 */
	memsegs_lock(0);
	pp = first = page_first();
	do {
		if (PP_PR_REQ(pp)) {
			callback(pp);
			if (++found == PR_KSTAT_PENDING) {
				break;	/* got 'em all */
			}
		}
	} while ((pp = page_next(pp)) != first);
	memsegs_unlock(0);
}

/*
 * The page_retire_thread loops forever, looking to see if there are
 * pages still waiting to be retired.
 */
static void
page_retire_thread(void)
{
	callb_cpr_t c;

	CALLB_CPR_INIT(&c, &pr_thread_mutex, callb_generic_cpr, "page_retire");

	mutex_enter(&pr_thread_mutex);
	for (;;) {
		if (pr_enable && PR_KSTAT_PENDING) {
			kmem_reap();
			seg_preap();
			page_retire_hunt(page_retire_thread_cb);
			CALLB_CPR_SAFE_BEGIN(&c);
			(void) cv_timedwait(&pr_cv, &pr_thread_mutex,
			    lbolt + pr_thread_shortwait);
			CALLB_CPR_SAFE_END(&c, &pr_thread_mutex);
		} else {
			CALLB_CPR_SAFE_BEGIN(&c);
			(void) cv_timedwait(&pr_cv, &pr_thread_mutex,
			    lbolt + pr_thread_longwait);
			CALLB_CPR_SAFE_END(&c, &pr_thread_mutex);
		}
	}
	/*NOTREACHED*/
}
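
/*
 * Note that pr_thread_shortwait and pr_thread_longwait are set by
 * page_retire_init() to 23 and 1201 seconds respectively, so the thread
 * above retries pending retirements roughly every 23 seconds while work
 * is outstanding and otherwise wakes up only about every 20 minutes.
 */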

/*
 * page_retire_pp() decides what to do with a failing page.
 *
 * When we get a free page (e.g. the scrubber or in the free path) life is
 * nice because the page is clean and marked free -- those always retire
 * nicely. From there we go by order of difficulty. If the page has data,
 * we attempt to relocate its contents to a suitable replacement page. If
 * that does not succeed, we look to see if it is clean. If after all of
 * this we have a clean, unmapped page (which we usually do!), we retire it.
 * If the page is not clean, we still go ahead and process it on a UE; for
 * CEs or FMA requests, we fail, leaving the page in service. The page will
 * eventually be tried again later. We always return with the page unlocked
 * since we are called from page_unlock().
 *
 * We don't call panic or do anything fancy down in here. Our boss the DE
 * gets paid handsomely to do his job of figuring out what to do when errors
 * occur. We just do what he tells us to do.
 */
static int
page_retire_pp(page_t *pp)
{
	int		toxic;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_iolock_state == 0);
	ASSERT(pp->p_szc == 0);

	PR_DEBUG(prd_top);
	PR_TYPES(pp);

	toxic = pp->p_toxic;
	ASSERT(toxic & PR_REASONS);

	if ((toxic & (PR_FMA | PR_MCE)) && !(toxic & PR_UE) &&
	    page_retire_limit()) {
		page_clrtoxic(pp, PR_FMA | PR_MCE | PR_MSG | PR_BUSY);
		page_retire_dequeue(pp);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_LIMIT));
	}

	if (PP_ISFREE(pp)) {
		PR_DEBUG(prd_free);
		if (!MTBF(recl_calls, recl_mtbf) || !page_reclaim(pp, NULL)) {
			PR_DEBUG(prd_noreclaim);
			PR_INCR_KSTAT(pr_failed);
			page_unlock(pp);
			return (page_retire_done(pp, PRD_FAILED));
		}
	}

	if ((toxic & PR_UE) == 0 && pp->p_vnode && !PP_ISFREE(pp) &&
	    !PP_ISNORELOC(pp) && MTBF(reloc_calls, reloc_mtbf)) {
		page_t *newpp;
		spgcnt_t count;

		/*
		 * If we can relocate the page, great! newpp will go
		 * on without us, and everything is fine. Regardless
		 * of whether the relocation succeeds, we are still
		 * going to take `pp' around back and shoot it.
		 */
		PR_DEBUG(prd_reloc);
		newpp = NULL;
		if (page_relocate(&pp, &newpp, 0, 0, &count, NULL) == 0) {
			page_unlock(newpp);
			ASSERT(hat_page_getattr(pp, P_MOD) == 0);
		}
	}

	if (pp->p_vnode == &kvp) {
		PR_DEBUG(prd_kern);
		PR_INCR_KSTAT(pr_failed_kernel);
		page_unlock(pp);
		return (page_retire_done(pp, PRD_FAILED));
	}

	if (pp->p_lckcnt || pp->p_cowcnt) {
		if (toxic & PR_UE) {
			(void) page_clear_lck_cow(pp, 1);
		} else {
			PR_DEBUG(prd_locked);
			PR_INCR_KSTAT(pr_failed);
			page_unlock(pp);
			return (page_retire_done(pp, PRD_FAILED));
		}
	}

	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	ASSERT(!PP_ISFREE(pp));
	ASSERT(!hat_page_is_mapped(pp));

	/*
	 * If the page is modified, was not relocated, and not toxic,
	 * we can't retire it without dropping data on the floor.
	 *
	 * RFE: we could change fsflush so that it (and only it) will
	 * be allowed to lock this page and push it out. Once it cleans
Once it cleans 1109*917Selowe * the page, we'd then be able to retire it on the free path. 1110*917Selowe * In practice, this should be exceedingly rare. 1111*917Selowe */ 1112*917Selowe if (hat_ismod(pp)) { 1113*917Selowe if ((toxic & PR_UE) == 0) { 1114*917Selowe PR_DEBUG(prd_modce); 1115*917Selowe PR_INCR_KSTAT(pr_failed); 1116*917Selowe page_unlock(pp); 1117*917Selowe return (page_retire_done(pp, PRD_FAILED)); 1118*917Selowe } else if (page_retire_modified == 0) { 1119*917Selowe PR_DEBUG(prd_modue_fail); 1120*917Selowe PR_INCR_KSTAT(pr_failed); 1121*917Selowe page_unlock(pp); 1122*917Selowe return (page_retire_done(pp, PRD_FAILED)); 1123*917Selowe } 1124*917Selowe PR_DEBUG(prd_modue_retire); 1125*917Selowe } 1126*917Selowe 1127*917Selowe if (pp->p_vnode) { 1128*917Selowe PR_DEBUG(prd_hashout); 1129*917Selowe page_hashout(pp, NULL); 1130*917Selowe } 1131*917Selowe ASSERT(!pp->p_vnode); 1132*917Selowe 1133*917Selowe /* 1134*917Selowe * The problem page is locked, demoted, unmapped, not free, 1135*917Selowe * hashed out, and not COW or mlocked (whew!). 1136*917Selowe * 1137*917Selowe * Now we select our ammunition, take it around back, and shoot it. 1138*917Selowe */ 1139*917Selowe if (toxic & PR_UE) { 1140*917Selowe if (hat_ismod(pp)) { 1141*917Selowe /* 1142*917Selowe * Let the user know we are dropping their data 1143*917Selowe * on the floor. 1144*917Selowe */ 1145*917Selowe PR_MESSAGE(CE_WARN, 1, "Removing modified page " 1146*917Selowe "0x%08x.%08x from service", 1147*917Selowe mmu_ptob(pp->p_pagenum)); 1148*917Selowe } 1149*917Selowe if (page_retire_transient_ue(pp)) { 1150*917Selowe PR_DEBUG(prd_uescrubbed); 1151*917Selowe return (page_retire_done(pp, PRD_UE_SCRUBBED)); 1152*917Selowe } else { 1153*917Selowe PR_DEBUG(prd_uenotscrubbed); 1154*917Selowe page_retire_destroy(pp); 1155*917Selowe return (page_retire_done(pp, PRD_SUCCESS)); 1156*917Selowe } 1157*917Selowe } else if (toxic & PR_FMA) { 1158*917Selowe PR_DEBUG(prd_fma); 1159*917Selowe page_retire_destroy(pp); 1160*917Selowe return (page_retire_done(pp, PRD_SUCCESS)); 1161*917Selowe } else if (toxic & PR_MCE) { 1162*917Selowe PR_DEBUG(prd_mce); 1163*917Selowe page_retire_destroy(pp); 1164*917Selowe return (page_retire_done(pp, PRD_SUCCESS)); 1165*917Selowe } 1166*917Selowe panic("page_retire_pp: bad toxic flags %d", toxic); 1167*917Selowe /*NOTREACHED*/ 1168*917Selowe } 1169*917Selowe 1170*917Selowe /* 1171*917Selowe * Try to retire a page when we stumble onto it in the page lock routines. 1172*917Selowe */ 1173*917Selowe void 1174*917Selowe page_tryretire(page_t *pp) 1175*917Selowe { 1176*917Selowe ASSERT(PAGE_EXCL(pp)); 1177*917Selowe 1178*917Selowe if (!pr_enable) { 1179*917Selowe page_unlock(pp); 1180*917Selowe return; 1181*917Selowe } 1182*917Selowe 1183*917Selowe /* 1184*917Selowe * If the page is a big page, try to break it up. 1185*917Selowe * 1186*917Selowe * If there are other bad pages besides `pp', they will be 1187*917Selowe * recursively retired for us thanks to a bit of magic. 1188*917Selowe * If the page is a small page with errors, try to retire it. 
	 */
	if (pp->p_szc > 0) {
		if (PP_ISFREE(pp) && !page_try_demote_free_pages(pp)) {
			page_unlock(pp);
			PR_DEBUG(prd_nofreedemote);
			return;
		} else if (!page_try_demote_pages(pp)) {
			page_unlock(pp);
			PR_DEBUG(prd_nodemote);
			return;
		}
		PR_DEBUG(prd_demoted);
		page_unlock(pp);
	} else {
		(void) page_retire_pp(pp);
	}
}
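
/*
 * Illustrative call into page_retire() below (the real ioctl plumbing
 * lives elsewhere; see the mmioctl_page_retire() references further
 * down):
 *
 *	if (page_retire(pa, PR_FMA) == EAGAIN) {
 *		/ * recorded in p_toxic; retires when it can be locked * /
 *	}
 *
 * A return of 0 means the page is already out of service; EINVAL and
 * EBUSY are described in the block comment that follows.
 */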

/*
 * page_retire() - the front door in to retire a page.
 *
 * Ideally, page_retire() would instantly retire the requested page.
 * Unfortunately, some pages are locked or otherwise tied up and cannot be
 * retired right away. To deal with that, bits are set in p_toxic of the
 * page_t. An attempt is made to lock the page; if the attempt is successful,
 * we instantly unlock the page counting on page_unlock() to notice p_toxic
 * is nonzero and to call back into page_retire_pp(). Success is determined
 * by looking to see whether the page has been retired once it has been
 * unlocked.
 *
 * Returns:
 *
 *	- 0 on success,
 *	- EINVAL when the PA is whacko,
 *	- EBUSY if the page is already retired, or
 *	- EAGAIN if the page could not be _immediately_ retired.
 */
int
page_retire(uint64_t pa, uchar_t reason)
{
	page_t	*pp;

	ASSERT(reason & PR_REASONS);		/* there must be a reason */
	ASSERT(!(reason & ~PR_REASONS));	/* but no other bits */

	pp = page_numtopp_nolock(mmu_btop(pa));
	if (pp == NULL) {
		PR_MESSAGE(CE_WARN, 1, "Cannot schedule clearing of error on"
		    " page 0x%08x.%08x; page is not relocatable memory", pa);
		return (page_retire_done(pp, PRD_INVALID_PA));
	}
	if (PP_RETIRED(pp)) {
		return (page_retire_done(pp, PRD_DUPLICATE));
	}

	if (reason & PR_UE) {
		PR_MESSAGE(CE_NOTE, 1, "Scheduling clearing of error on"
		    " page 0x%08x.%08x", pa);
	} else {
		PR_MESSAGE(CE_NOTE, 1, "Scheduling removal of"
		    " page 0x%08x.%08x", pa);
	}
	page_settoxic(pp, reason);
	page_retire_enqueue(pp);

	/*
	 * And now for some magic.
	 *
	 * We marked this page toxic up above. All there is left to do is
	 * to try to lock the page and then unlock it. The page lock routines
	 * will intercept the page and retire it if they can. If the page
	 * cannot be locked, 's okay -- page_unlock() will eventually get it,
	 * or the background thread, until then the lock routines will deny
	 * further locks on it.
	 */
	if (MTBF(pr_calls, pr_mtbf) && page_trylock(pp, SE_EXCL)) {
		PR_DEBUG(prd_prlocked);
		page_unlock(pp);
	} else {
		PR_DEBUG(prd_prnotlocked);
	}

	if (PP_RETIRED(pp)) {
		PR_DEBUG(prd_prretired);
		return (0);
	} else {
		cv_signal(&pr_cv);
		PR_INCR_KSTAT(pr_failed);

		if (pp->p_toxic & PR_MSG) {
			return (page_retire_done(pp, PRD_FAILED));
		} else {
			return (page_retire_done(pp, PRD_PENDING));
		}
	}
}

/*
 * Take a retired page off the retired-pages vnode and clear the toxic flags.
 * If "free" is nonzero, lock it and put it back on the freelist. If "free"
 * is zero, the caller already holds SE_EXCL lock so we simply unretire it
 * and don't do anything else with it.
 *
 * Any unretire messages are printed from this routine.
 *
 * Returns 0 if page pp was unretired; else an error code.
 */
int
page_unretire_pp(page_t *pp, int free)
{
	/*
	 * To be retired, a page has to be hashed onto the retired_pages vnode
	 * and have PR_RETIRED set in p_toxic.
	 */
	if (free == 0 || page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) {
		ASSERT(PAGE_EXCL(pp));
		PR_DEBUG(prd_ulocked);
		if (!PP_RETIRED(pp)) {
			PR_DEBUG(prd_unotretired);
			page_unlock(pp);
			return (page_retire_done(pp, PRD_UNR_NOT));
		}

		PR_MESSAGE(CE_NOTE, 1, "unretiring retired"
		    " page 0x%08x.%08x", mmu_ptob(pp->p_pagenum));
		if (pp->p_toxic & PR_FMA) {
			PR_DECR_KSTAT(pr_fma);
		} else if (pp->p_toxic & PR_UE) {
			PR_DECR_KSTAT(pr_ue);
		} else {
			PR_DECR_KSTAT(pr_mce);
		}
		page_clrtoxic(pp, PR_ALLFLAGS);

		if (free) {
			PR_DEBUG(prd_udestroy);
			page_destroy(pp, 0);
		} else {
			PR_DEBUG(prd_uhashout);
			page_hashout(pp, NULL);
		}

		mutex_enter(&freemem_lock);
		availrmem++;
		mutex_exit(&freemem_lock);

		PR_DEBUG(prd_uunretired);
		PR_DECR_KSTAT(pr_retired);
		PR_INCR_KSTAT(pr_unretired);
		return (page_retire_done(pp, PRD_UNR_SUCCESS));
	}
	PR_DEBUG(prd_unotlocked);
	return (page_retire_done(pp, PRD_UNR_CANTLOCK));
}

/*
 * Return a page to service by moving it from the retired_pages vnode
 * onto the freelist.
 *
 * Called from mmioctl_page_retire() on behalf of the FMA DE.
 *
 * Returns:
 *
 *	- 0 if the page is unretired,
 *	- EAGAIN if the pp can not be locked,
 *	- EINVAL if the PA is whacko, and
 *	- EBADF if the pp is not retired.
 */
int
page_unretire(uint64_t pa)
{
	page_t	*pp;

	pp = page_numtopp_nolock(mmu_btop(pa));
	if (pp == NULL) {
		return (page_retire_done(pp, PRD_INVALID_PA));
	}

	return (page_unretire_pp(pp, 1));
}

/*
 * Test a page to see if it is retired. If errors is non-NULL, the toxic
 * bits of the page are returned. Returns 0 on success, error code on failure.
 */
int
page_retire_check_pp(page_t *pp, uint64_t *errors)
{
	int rc;

	if (PP_RETIRED(pp)) {
		PR_DEBUG(prd_checkhit);
		rc = 0;
	} else {
		PR_DEBUG(prd_checkmiss);
		rc = EAGAIN;
	}

	/*
	 * We have magically arranged the bit values returned to fmd(1M)
	 * to line up with the FMA, MCE, and UE bits of the page_t.
	 */
	if (errors) {
		uint64_t toxic = (uint64_t)(pp->p_toxic & PR_ERRMASK);
		if (toxic & PR_UE_SCRUBBED) {
			toxic &= ~PR_UE_SCRUBBED;
			toxic |= PR_UE;
		}
		*errors = toxic;
	}

	return (rc);
}

/*
 * Test to see if the page_t for a given PA is retired, and return the
 * hardware errors we have seen on the page if requested.
 *
 * Called from mmioctl_page_retire on behalf of the FMA DE.
 *
 * Returns:
 *
 *	- 0 if the page is retired,
 *	- EAGAIN if it is not, and
 *	- EINVAL if the PA is whacko.
 */
int
page_retire_check(uint64_t pa, uint64_t *errors)
{
	page_t	*pp;

	if (errors) {
		*errors = 0;
	}

	pp = page_numtopp_nolock(mmu_btop(pa));
	if (pp == NULL) {
		return (page_retire_done(pp, PRD_INVALID_PA));
	}

	return (page_retire_check_pp(pp, errors));
}

/*
 * Page retire self-test. For now, it always returns 0.
 */
int
page_retire_test(void)
{
	page_t *first, *pp, *cpp, *cpp2, *lpp;

	/*
	 * Tests the corner case where a large page can't be retired
	 * because one of the constituent pages is locked. We mark
	 * one page to be retired and try to retire it, and mark the
	 * other page to be retired but don't try to retire it, so
	 * that page_unlock() in the failure path will recurse and try
	 * to retire THAT page. This is the worst possible situation
	 * we can get ourselves into.
	 */
	memsegs_lock(0);
	pp = first = page_first();
	do {
		if (pp->p_szc && PP_PAGEROOT(pp) == pp) {
			cpp = pp + 1;
			lpp = PP_ISFREE(pp)? pp : pp + 2;
			cpp2 = pp + 3;
			if (!page_trylock(lpp, pp == lpp? SE_EXCL : SE_SHARED))
				continue;
			if (!page_trylock(cpp, SE_EXCL)) {
				page_unlock(lpp);
				continue;
			}
			page_settoxic(cpp, PR_FMA | PR_BUSY);
			page_settoxic(cpp2, PR_FMA);
			page_tryretire(cpp);	/* will fail */
			page_unlock(lpp);
			(void) page_retire(cpp->p_pagenum, PR_FMA);
			(void) page_retire(cpp2->p_pagenum, PR_FMA);
		}
	} while ((pp = page_next(pp)) != first);
	memsegs_unlock(0);

	return (0);
}