1917Selowe /*
2917Selowe * CDDL HEADER START
3917Selowe *
4917Selowe * The contents of this file are subject to the terms of the
5917Selowe * Common Development and Distribution License, Version 1.0 only
6917Selowe * (the "License"). You may not use this file except in compliance
7917Selowe * with the License.
8917Selowe *
9917Selowe * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10917Selowe * or http://www.opensolaris.org/os/licensing.
11917Selowe * See the License for the specific language governing permissions
12917Selowe * and limitations under the License.
13917Selowe *
14917Selowe * When distributing Covered Code, include this CDDL HEADER in each
15917Selowe * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16917Selowe * If applicable, add the following below this CDDL HEADER, with the
17917Selowe * fields enclosed by brackets "[]" replaced with your own identifying
18917Selowe * information: Portions Copyright [yyyy] [name of copyright owner]
19917Selowe *
20917Selowe * CDDL HEADER END
21917Selowe */
22917Selowe /*
231338Selowe * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
24917Selowe * Use is subject to license terms.
25917Selowe */
26917Selowe
27917Selowe #pragma ident "%Z%%M% %I% %E% SMI"
28917Selowe
29917Selowe /*
30917Selowe * Page Retire - Big Theory Statement.
31917Selowe *
32917Selowe * This file handles removing sections of faulty memory from use when the
33917Selowe * user land FMA Diagnosis Engine requests that a page be removed or when
34917Selowe * a CE or UE is detected by the hardware.
35917Selowe *
36917Selowe * In the bad old days, the kernel side of Page Retire did a lot of the work
37917Selowe * on its own. Now, with the DE keeping track of errors, the kernel side is
38917Selowe * rather simple-minded on most platforms.
39917Selowe *
40917Selowe * Errors are all reflected to the DE, and after digesting the error and
41917Selowe * looking at all previously reported errors, the DE decides what should
42917Selowe * be done about the current error. If the DE wants a particular page to
43917Selowe * be retired, then the kernel page retire code is invoked via an ioctl.
44917Selowe * On non-FMA platforms, the ue_drain and ce_drain paths end up calling
45917Selowe * page retire to handle the error. Since page retire is just a simple
46917Selowe * mechanism, it doesn't need to differentiate between the different callers.
47917Selowe *
48917Selowe * The p_toxic field in the page_t is used to indicate which errors have
49917Selowe * occurred and what action has been taken on a given page. Because errors are
50917Selowe * reported without regard to the locked state of a page, no locks are used
51917Selowe * to SET the error bits in p_toxic. However, in order to clear the error
52917Selowe * bits, the page_t must be held exclusively locked.
53917Selowe *
54917Selowe * When page_retire() is called, it must be able to acquire locks, sleep, etc.
55917Selowe * It must not be called from high-level interrupt context.
56917Selowe *
57917Selowe * Depending on how the requested page is being used at the time of the retire
58917Selowe * request (and on the availability of sufficient system resources), the page
59917Selowe * may be retired immediately, or just marked for retirement later. For
60917Selowe * example, locked pages are marked, while free pages are retired.
Multiple 61917Selowe * requests may be made to retire the same page, although there is no need 62917Selowe * to: once the p_toxic flags are set, the page will be retired as soon as it 63917Selowe * can be exclusively locked. 64917Selowe * 65917Selowe * The retire mechanism is driven centrally out of page_unlock(). To expedite 66917Selowe * the retirement of pages, further requests for SE_SHARED locks are denied 67917Selowe * as long as a page retirement is pending. In addition, as long as pages are 68917Selowe * pending retirement a background thread runs periodically trying to retire 69917Selowe * those pages. Pages which could not be retired while the system is running 70917Selowe * are scrubbed prior to rebooting to avoid latent errors on the next boot. 71917Selowe * 721338Selowe * UE pages without persistent errors are scrubbed and returned to service. 731338Selowe * Recidivist pages, as well as FMA-directed requests for retirement, result 741338Selowe * in the page being taken out of service. Once the decision is made to take 751338Selowe * a page out of service, the page is cleared, hashed onto the retired_pages 761338Selowe * vnode, marked as retired, and it is unlocked. No other requesters (except 771338Selowe * for unretire) are allowed to lock retired pages. 78917Selowe * 79917Selowe * The public routines return (sadly) 0 if they worked and a non-zero error 80917Selowe * value if something went wrong. This is done for the ioctl side of the 81917Selowe * world to allow errors to be reflected all the way out to user land. The 82917Selowe * non-zero values are explained in comments atop each function. 83917Selowe */ 84917Selowe 85917Selowe /* 86917Selowe * Things to fix: 87917Selowe * 88917Selowe * 1. Cleanup SE_EWANTED. Since we're aggressive about trying to retire 89917Selowe * pages, we can use page_retire_pp() to replace SE_EWANTED and all 90917Selowe * the special delete_memory_thread() code just goes away. 91917Selowe * 92917Selowe * 2. Trying to retire non-relocatable kvp pages may result in a 93917Selowe * quagmire. This is because seg_kmem() no longer keeps its pages locked, 94917Selowe * and calls page_lookup() in the free path; since kvp pages are modified 95917Selowe * and don't have a usable backing store, page_retire() can't do anything 96917Selowe * with them, and we'll keep denying the lock to seg_kmem_free() in a 97917Selowe * vicious cycle. To prevent that, we don't deny locks to kvp pages, and 98917Selowe * hence only call page_retire_pp() from page_unlock() in the free path. 99917Selowe * Since most kernel pages are indefinitely held anyway, and don't 100917Selowe * participate in I/O, this is of little consequence. 101917Selowe * 102917Selowe * 3. Low memory situations will be interesting. If we don't have 103917Selowe * enough memory for page_relocate() to succeed, we won't be able to 104917Selowe * retire dirty pages; nobody will be able to push them out to disk 105917Selowe * either, since we aggressively deny the page lock. We could change 106917Selowe * fsflush so it can recognize this situation, grab the lock, and push 107917Selowe * the page out, where we'll catch it in the free path and retire it. 108917Selowe * 109917Selowe * 4. Beware of places that have code like this in them: 110917Selowe * 111917Selowe * if (! page_tryupgrade(pp)) { 112917Selowe * page_unlock(pp); 113917Selowe * while (! 
page_lock(pp, SE_EXCL, NULL, P_RECLAIM)) { 114917Selowe * / *NOTHING* / 115917Selowe * } 116917Selowe * } 117917Selowe * page_free(pp); 118917Selowe * 119917Selowe * The problem is that pp can change identity right after the 120917Selowe * page_unlock() call. In particular, page_retire() can step in 121917Selowe * there, change pp's identity, and hash pp onto the retired_vnode. 122917Selowe * 123917Selowe * Of course, other functions besides page_retire() can have the 124917Selowe * same effect. A kmem reader can waltz by, set up a mapping to the 125917Selowe * page, and then unlock the page. Page_free() will then go castors 126917Selowe * up. So if anybody is doing this, it's already a bug. 127917Selowe * 128917Selowe * 5. mdboot()'s call into page_retire_hunt() should probably be 129917Selowe * moved lower. Where the call is made now, we can get into trouble 130917Selowe * by scrubbing a kernel page that is then accessed later. 131917Selowe */ 132917Selowe 133917Selowe #include <sys/types.h> 134917Selowe #include <sys/param.h> 135917Selowe #include <sys/systm.h> 136917Selowe #include <sys/mman.h> 137917Selowe #include <sys/vnode.h> 138917Selowe #include <sys/cmn_err.h> 139917Selowe #include <sys/ksynch.h> 140917Selowe #include <sys/thread.h> 141917Selowe #include <sys/disp.h> 142917Selowe #include <sys/ontrap.h> 143917Selowe #include <sys/vmsystm.h> 144917Selowe #include <sys/mem_config.h> 145917Selowe #include <sys/atomic.h> 146917Selowe #include <sys/callb.h> 147917Selowe #include <vm/page.h> 148917Selowe #include <vm/vm_dep.h> 149917Selowe #include <vm/as.h> 150917Selowe #include <vm/hat.h> 151917Selowe 152917Selowe /* 153917Selowe * vnode for all pages which are retired from the VM system; 154917Selowe */ 155917Selowe vnode_t *retired_pages; 156917Selowe 157917Selowe /* 158917Selowe * Background thread that wakes up periodically to try to retire pending 159917Selowe * pages. This prevents threads from becoming blocked indefinitely in 160917Selowe * page_lookup() or some other routine should the page(s) they are waiting 161917Selowe * on become eligible for social security. 162917Selowe */ 163917Selowe static void page_retire_thread(void); 164917Selowe static kthread_t *pr_thread_id; 165917Selowe static kcondvar_t pr_cv; 166917Selowe static kmutex_t pr_thread_mutex; 167917Selowe static clock_t pr_thread_shortwait; 168917Selowe static clock_t pr_thread_longwait; 169917Selowe 170917Selowe /* 171917Selowe * Make a list of all of the pages that have been marked for retirement 172917Selowe * but are not yet retired. At system shutdown, we will scrub all of the 173917Selowe * pages in the list in case there are outstanding UEs. Then, we 174917Selowe * cross-check this list against the number of pages that are yet to be 175917Selowe * retired, and if we find inconsistencies, we scan every page_t in the 176917Selowe * whole system looking for any pages that need to be scrubbed for UEs. 177917Selowe * The background thread also uses this queue to determine which pages 178917Selowe * it should keep trying to retire. 
179917Selowe */ 180917Selowe #ifdef DEBUG 181917Selowe #define PR_PENDING_QMAX 32 182917Selowe #else /* DEBUG */ 183917Selowe #define PR_PENDING_QMAX 256 184917Selowe #endif /* DEBUG */ 185917Selowe page_t *pr_pending_q[PR_PENDING_QMAX]; 186917Selowe kmutex_t pr_q_mutex; 187917Selowe 188917Selowe /* 189917Selowe * Page retire global kstats 190917Selowe */ 191917Selowe struct page_retire_kstat { 192917Selowe kstat_named_t pr_retired; 193917Selowe kstat_named_t pr_requested; 194917Selowe kstat_named_t pr_requested_free; 195917Selowe kstat_named_t pr_enqueue_fail; 196917Selowe kstat_named_t pr_dequeue_fail; 197917Selowe kstat_named_t pr_pending; 198917Selowe kstat_named_t pr_failed; 199917Selowe kstat_named_t pr_failed_kernel; 200917Selowe kstat_named_t pr_limit; 201917Selowe kstat_named_t pr_limit_exceeded; 202917Selowe kstat_named_t pr_fma; 203917Selowe kstat_named_t pr_mce; 204917Selowe kstat_named_t pr_ue; 205917Selowe kstat_named_t pr_ue_cleared_retire; 206917Selowe kstat_named_t pr_ue_cleared_free; 207917Selowe kstat_named_t pr_ue_persistent; 208917Selowe kstat_named_t pr_unretired; 209917Selowe }; 210917Selowe 211917Selowe static struct page_retire_kstat page_retire_kstat = { 212917Selowe { "pages_retired", KSTAT_DATA_UINT64}, 213917Selowe { "pages_retire_request", KSTAT_DATA_UINT64}, 214917Selowe { "pages_retire_request_free", KSTAT_DATA_UINT64}, 215917Selowe { "pages_notenqueued", KSTAT_DATA_UINT64}, 216917Selowe { "pages_notdequeued", KSTAT_DATA_UINT64}, 217917Selowe { "pages_pending", KSTAT_DATA_UINT64}, 218917Selowe { "pages_deferred", KSTAT_DATA_UINT64}, 219917Selowe { "pages_deferred_kernel", KSTAT_DATA_UINT64}, 220917Selowe { "pages_limit", KSTAT_DATA_UINT64}, 221917Selowe { "pages_limit_exceeded", KSTAT_DATA_UINT64}, 222917Selowe { "pages_fma", KSTAT_DATA_UINT64}, 223917Selowe { "pages_multiple_ce", KSTAT_DATA_UINT64}, 224917Selowe { "pages_ue", KSTAT_DATA_UINT64}, 225917Selowe { "pages_ue_cleared_retired", KSTAT_DATA_UINT64}, 226917Selowe { "pages_ue_cleared_freed", KSTAT_DATA_UINT64}, 227917Selowe { "pages_ue_persistent", KSTAT_DATA_UINT64}, 228917Selowe { "pages_unretired", KSTAT_DATA_UINT64}, 229917Selowe }; 230917Selowe 231917Selowe static kstat_t *page_retire_ksp = NULL; 232917Selowe 233917Selowe #define PR_INCR_KSTAT(stat) \ 234917Selowe atomic_add_64(&(page_retire_kstat.stat.value.ui64), 1) 235917Selowe #define PR_DECR_KSTAT(stat) \ 236917Selowe atomic_add_64(&(page_retire_kstat.stat.value.ui64), -1) 237917Selowe 238917Selowe #define PR_KSTAT_RETIRED_CE (page_retire_kstat.pr_mce.value.ui64) 239917Selowe #define PR_KSTAT_RETIRED_FMA (page_retire_kstat.pr_fma.value.ui64) 240917Selowe #define PR_KSTAT_RETIRED_NOTUE (PR_KSTAT_RETIRED_CE + PR_KSTAT_RETIRED_FMA) 241917Selowe #define PR_KSTAT_PENDING (page_retire_kstat.pr_pending.value.ui64) 242917Selowe #define PR_KSTAT_EQFAIL (page_retire_kstat.pr_enqueue_fail.value.ui64) 243917Selowe #define PR_KSTAT_DQFAIL (page_retire_kstat.pr_dequeue_fail.value.ui64) 244917Selowe 245917Selowe /* 246917Selowe * Limit the number of multiple CE page retires. 247917Selowe * The default is 0.1% of physmem, or 1 in 1000 pages. This is set in 248917Selowe * basis points, where 100 basis points equals one percent. 249917Selowe */ 250917Selowe #define MCE_BPT 10 251917Selowe uint64_t max_pages_retired_bps = MCE_BPT; 252917Selowe #define PAGE_RETIRE_LIMIT ((physmem * max_pages_retired_bps) / 10000) 253917Selowe 254917Selowe /* 255917Selowe * Control over the verbosity of page retirement. 
256917Selowe * 257917Selowe * When set to zero (the default), no messages will be printed. 258917Selowe * When set to one, summary messages will be printed. 259917Selowe * When set > one, all messages will be printed. 260917Selowe * 261917Selowe * A value of one will trigger detailed messages for retirement operations, 262917Selowe * and is intended as a platform tunable for processors where FMA's DE does 263917Selowe * not run (e.g., spitfire). Values > one are intended for debugging only. 264917Selowe */ 265917Selowe int page_retire_messages = 0; 266917Selowe 267917Selowe /* 268917Selowe * Control whether or not we return scrubbed UE pages to service. 269917Selowe * By default we do not since FMA wants to run its diagnostics first 270917Selowe * and then ask us to unretire the page if it passes. Non-FMA platforms 271917Selowe * may set this to zero so we will only retire recidivist pages. It should 272917Selowe * not be changed by the user. 273917Selowe */ 274917Selowe int page_retire_first_ue = 1; 275917Selowe 276917Selowe /* 277917Selowe * Master enable for page retire. This prevents a CE or UE early in boot 278917Selowe * from trying to retire a page before page_retire_init() has finished 279917Selowe * setting things up. This is internal only and is not a tunable! 280917Selowe */ 281917Selowe static int pr_enable = 0; 282917Selowe 283917Selowe extern struct vnode kvp; 284917Selowe 285917Selowe #ifdef DEBUG 286917Selowe struct page_retire_debug { 287*1381Selowe int prd_dup1; 288*1381Selowe int prd_dup2; 289*1381Selowe int prd_qdup; 290917Selowe int prd_noaction; 291917Selowe int prd_queued; 292917Selowe int prd_notqueued; 293917Selowe int prd_dequeue; 294917Selowe int prd_top; 295917Selowe int prd_locked; 296917Selowe int prd_reloc; 297973Selowe int prd_relocfail; 298973Selowe int prd_mod; 299973Selowe int prd_mod_late; 300917Selowe int prd_kern; 301917Selowe int prd_free; 302917Selowe int prd_noreclaim; 303917Selowe int prd_hashout; 304917Selowe int prd_fma; 305917Selowe int prd_uescrubbed; 306917Selowe int prd_uenotscrubbed; 307917Selowe int prd_mce; 308917Selowe int prd_prlocked; 309917Selowe int prd_prnotlocked; 310917Selowe int prd_prretired; 311917Selowe int prd_ulocked; 312917Selowe int prd_unotretired; 313917Selowe int prd_udestroy; 314917Selowe int prd_uhashout; 315917Selowe int prd_uunretired; 316917Selowe int prd_unotlocked; 317917Selowe int prd_checkhit; 318*1381Selowe int prd_checkmiss_pend; 319*1381Selowe int prd_checkmiss_noerr; 320917Selowe int prd_tctop; 321917Selowe int prd_tclocked; 322917Selowe int prd_hunt; 323917Selowe int prd_dohunt; 324917Selowe int prd_earlyhunt; 325917Selowe int prd_latehunt; 326917Selowe int prd_nofreedemote; 327917Selowe int prd_nodemote; 328917Selowe int prd_demoted; 329917Selowe } pr_debug; 330917Selowe 331917Selowe #define PR_DEBUG(foo) ((pr_debug.foo)++) 332917Selowe 333917Selowe /* 334917Selowe * A type histogram. We record the incidence of the various toxic 335917Selowe * flag combinations along with the interesting page attributes. The 336917Selowe * goal is to get as many combinations as we can while driving all 337917Selowe * pr_debug values nonzero (indicating we've exercised all possible 338917Selowe * code paths across all possible page types). Not all combinations 339917Selowe * will make sense -- e.g. PRT_MOD|PRT_KERNEL. 
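 *
 * Note that the offsets listed below appear to be the byte offsets of each
 * counter within the pr_types[] array (index times sizeof (int), assuming
 * 4-byte ints), not the PRT_* flag values themselves. For example, PRT_NAMED
 * is the flag 0x01, so its counter sits at offset 0x4; a named page that also
 * saw a CE bumps pr_types[PRT_NAMED | PRT_MCE] = pr_types[0x11], at offset
 * 0x44.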
340917Selowe * 341917Selowe * pr_type offset bit encoding (when examining with a debugger): 342917Selowe * 343917Selowe * PRT_NAMED - 0x4 344917Selowe * PRT_KERNEL - 0x8 345917Selowe * PRT_FREE - 0x10 346917Selowe * PRT_MOD - 0x20 347917Selowe * PRT_FMA - 0x0 348917Selowe * PRT_MCE - 0x40 349917Selowe * PRT_UE - 0x80 350917Selowe */ 351917Selowe 352917Selowe #define PRT_NAMED 0x01 353917Selowe #define PRT_KERNEL 0x02 354917Selowe #define PRT_FREE 0x04 355917Selowe #define PRT_MOD 0x08 356917Selowe #define PRT_FMA 0x00 /* yes, this is not a mistake */ 357917Selowe #define PRT_MCE 0x10 358917Selowe #define PRT_UE 0x20 359917Selowe #define PRT_ALL 0x3F 360917Selowe 361917Selowe int pr_types[PRT_ALL+1]; 362917Selowe 363917Selowe #define PR_TYPES(pp) { \ 364917Selowe int whichtype = 0; \ 365917Selowe if (pp->p_vnode) \ 366917Selowe whichtype |= PRT_NAMED; \ 367973Selowe if (PP_ISKVP(pp)) \ 368917Selowe whichtype |= PRT_KERNEL; \ 369917Selowe if (PP_ISFREE(pp)) \ 370917Selowe whichtype |= PRT_FREE; \ 371917Selowe if (hat_ismod(pp)) \ 372917Selowe whichtype |= PRT_MOD; \ 373917Selowe if (pp->p_toxic & PR_UE) \ 374917Selowe whichtype |= PRT_UE; \ 375917Selowe if (pp->p_toxic & PR_MCE) \ 376917Selowe whichtype |= PRT_MCE; \ 377917Selowe pr_types[whichtype]++; \ 378917Selowe } 379917Selowe 380917Selowe int recl_calls; 381917Selowe int recl_mtbf = 3; 382917Selowe int reloc_calls; 383917Selowe int reloc_mtbf = 7; 384917Selowe int pr_calls; 385917Selowe int pr_mtbf = 15; 386917Selowe 387917Selowe #define MTBF(v, f) (((++(v)) & (f)) != (f)) 388917Selowe 389917Selowe #else /* DEBUG */ 390917Selowe 391917Selowe #define PR_DEBUG(foo) /* nothing */ 392917Selowe #define PR_TYPES(foo) /* nothing */ 393917Selowe #define MTBF(v, f) (1) 394917Selowe 395917Selowe #endif /* DEBUG */ 396917Selowe 397917Selowe /* 398917Selowe * page_retire_done() - completion processing 399917Selowe * 400917Selowe * Used by the page_retire code for common completion processing. 401917Selowe * It keeps track of how many times a given result has happened, 402917Selowe * and writes out an occasional message. 403917Selowe * 404917Selowe * May be called with a NULL pp (PRD_INVALID_PA case). 
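 *
 * A typical failure return in this file follows the pattern below (a sketch
 * of what the callers later in this file do; per the page_retire_ops[] table
 * that follows, PRD_FAILED maps to EAGAIN and prints no message):
 *
 *        PR_INCR_KSTAT(pr_failed);
 *        page_unlock(pp);
 *        return (page_retire_done(pp, PRD_FAILED));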
405917Selowe */
406917Selowe #define PRD_INVALID_KEY -1
407917Selowe #define PRD_SUCCESS 0
408917Selowe #define PRD_PENDING 1
409917Selowe #define PRD_FAILED 2
410917Selowe #define PRD_DUPLICATE 3
411917Selowe #define PRD_INVALID_PA 4
412917Selowe #define PRD_LIMIT 5
413917Selowe #define PRD_UE_SCRUBBED 6
414917Selowe #define PRD_UNR_SUCCESS 7
415917Selowe #define PRD_UNR_CANTLOCK 8
416917Selowe #define PRD_UNR_NOT 9
417917Selowe
418917Selowe typedef struct page_retire_op {
419917Selowe int pr_key; /* one of the PRD_* defines from above */
420917Selowe int pr_count; /* How many times this has happened */
421917Selowe int pr_retval; /* return value */
422917Selowe int pr_msglvl; /* message level - when to print */
423917Selowe char *pr_message; /* Cryptic message for field service */
424917Selowe } page_retire_op_t;
425917Selowe
426917Selowe static page_retire_op_t page_retire_ops[] = {
427917Selowe /* key count retval msglvl message */
428917Selowe {PRD_SUCCESS, 0, 0, 1,
429917Selowe "Page 0x%08x.%08x removed from service"},
430917Selowe {PRD_PENDING, 0, EAGAIN, 2,
431917Selowe "Page 0x%08x.%08x will be retired on free"},
432917Selowe {PRD_FAILED, 0, EAGAIN, 0, NULL},
433*1381Selowe {PRD_DUPLICATE, 0, EIO, 2,
434*1381Selowe "Page 0x%08x.%08x already retired or pending"},
435917Selowe {PRD_INVALID_PA, 0, EINVAL, 2,
436917Selowe "PA 0x%08x.%08x is not a relocatable page"},
437917Selowe {PRD_LIMIT, 0, 0, 1,
438917Selowe "Page 0x%08x.%08x not retired due to limit exceeded"},
439917Selowe {PRD_UE_SCRUBBED, 0, 0, 1,
440917Selowe "Previously reported error on page 0x%08x.%08x cleared"},
441917Selowe {PRD_UNR_SUCCESS, 0, 0, 1,
442917Selowe "Page 0x%08x.%08x returned to service"},
443917Selowe {PRD_UNR_CANTLOCK, 0, EAGAIN, 2,
444917Selowe "Page 0x%08x.%08x could not be unretired"},
445*1381Selowe {PRD_UNR_NOT, 0, EIO, 2,
446917Selowe "Page 0x%08x.%08x is not retired"},
447917Selowe {PRD_INVALID_KEY, 0, 0, 0, NULL} /* MUST BE LAST! */
448917Selowe };
449917Selowe
450917Selowe /*
451917Selowe * Print a message if page_retire_messages is at or above msglvl.
452917Selowe */
453917Selowe #define PR_MESSAGE(debuglvl, msglvl, msg, pa) \
454917Selowe { \
455917Selowe uint64_t p = (uint64_t)pa; \
456917Selowe if (page_retire_messages >= msglvl && msg != NULL) { \
457917Selowe cmn_err(debuglvl, msg, \
458917Selowe (uint32_t)(p >> 32), (uint32_t)p); \
459917Selowe } \
460917Selowe }
461917Selowe
462917Selowe /*
463917Selowe * Note that multiple bits may be set in a single settoxic operation.
464917Selowe * May be called without the page locked.
465917Selowe */
466917Selowe void
467917Selowe page_settoxic(page_t *pp, uchar_t bits)
468917Selowe {
469917Selowe atomic_or_8(&pp->p_toxic, bits);
470917Selowe }
471917Selowe
472917Selowe /*
473917Selowe * Note that multiple bits may be cleared in a single clrtoxic operation.
4741338Selowe * Must be called with the page exclusively locked to prevent races which
4751338Selowe * may attempt to retire a page without any toxic bits set.
476917Selowe */
477917Selowe void
478917Selowe page_clrtoxic(page_t *pp, uchar_t bits)
479917Selowe {
480917Selowe ASSERT(PAGE_EXCL(pp));
481917Selowe atomic_and_8(&pp->p_toxic, ~bits);
482917Selowe }
483917Selowe
484917Selowe /*
485917Selowe * Prints any page retire messages to the user, and decides what
486917Selowe * error code is appropriate for the condition reported.
487917Selowe */ 488917Selowe static int 489917Selowe page_retire_done(page_t *pp, int code) 490917Selowe { 491917Selowe page_retire_op_t *prop; 492917Selowe uint64_t pa = 0; 493917Selowe int i; 494917Selowe 495917Selowe if (pp != NULL) { 4961338Selowe pa = mmu_ptob((uint64_t)pp->p_pagenum); 497917Selowe } 498917Selowe 499917Selowe prop = NULL; 500917Selowe for (i = 0; page_retire_ops[i].pr_key != PRD_INVALID_KEY; i++) { 501917Selowe if (page_retire_ops[i].pr_key == code) { 502917Selowe prop = &page_retire_ops[i]; 503917Selowe break; 504917Selowe } 505917Selowe } 506917Selowe 507917Selowe #ifdef DEBUG 508917Selowe if (page_retire_ops[i].pr_key == PRD_INVALID_KEY) { 509917Selowe cmn_err(CE_PANIC, "page_retire_done: Invalid opcode %d", code); 510917Selowe } 511917Selowe #endif 512917Selowe 513917Selowe ASSERT(prop->pr_key == code); 514917Selowe 515917Selowe prop->pr_count++; 516917Selowe 517917Selowe PR_MESSAGE(CE_NOTE, prop->pr_msglvl, prop->pr_message, pa); 518917Selowe if (pp != NULL) { 519917Selowe page_settoxic(pp, PR_MSG); 520917Selowe } 521917Selowe 522917Selowe return (prop->pr_retval); 523917Selowe } 524917Selowe 525917Selowe /* 526917Selowe * On a reboot, our friend mdboot() wants to clear up any PP_PR_REQ() pages 527917Selowe * that we were not able to retire. On large machines, walking the complete 528917Selowe * page_t array and looking at every page_t takes too long. So, as a page is 529917Selowe * marked toxic, we track it using a list that can be processed at reboot 530917Selowe * time. page_retire_enqueue() will do its best to try to avoid duplicate 531917Selowe * entries, but if we get too many errors at once the queue can overflow, 532917Selowe * in which case we will end up walking every page_t as a last resort. 533917Selowe * The background thread also makes use of this queue to find which pages 534917Selowe * are pending retirement. 535917Selowe */ 536917Selowe static void 537917Selowe page_retire_enqueue(page_t *pp) 538917Selowe { 539917Selowe int nslot = -1; 540917Selowe int i; 541917Selowe 542917Selowe mutex_enter(&pr_q_mutex); 543917Selowe 544917Selowe /* 545917Selowe * Check to make sure retire hasn't already dequeued it. 546917Selowe * In the meantime if the page was cleaned up, no need 547917Selowe * to enqueue it. 
548917Selowe */ 549917Selowe if (PP_RETIRED(pp) || pp->p_toxic == 0) { 550917Selowe mutex_exit(&pr_q_mutex); 551917Selowe PR_DEBUG(prd_noaction); 552917Selowe return; 553917Selowe } 554917Selowe 555917Selowe for (i = 0; i < PR_PENDING_QMAX; i++) { 556917Selowe if (pr_pending_q[i] == pp) { 557917Selowe mutex_exit(&pr_q_mutex); 558*1381Selowe PR_DEBUG(prd_qdup); 559917Selowe return; 560917Selowe } else if (nslot == -1 && pr_pending_q[i] == NULL) { 561917Selowe nslot = i; 562917Selowe } 563917Selowe } 564917Selowe 565917Selowe PR_INCR_KSTAT(pr_pending); 566917Selowe 567917Selowe if (nslot != -1) { 568917Selowe pr_pending_q[nslot] = pp; 569917Selowe PR_DEBUG(prd_queued); 570917Selowe } else { 571917Selowe PR_INCR_KSTAT(pr_enqueue_fail); 572917Selowe PR_DEBUG(prd_notqueued); 573917Selowe } 574917Selowe mutex_exit(&pr_q_mutex); 575917Selowe } 576917Selowe 577917Selowe static void 578917Selowe page_retire_dequeue(page_t *pp) 579917Selowe { 580917Selowe int i; 581917Selowe 582917Selowe mutex_enter(&pr_q_mutex); 583917Selowe 584917Selowe for (i = 0; i < PR_PENDING_QMAX; i++) { 585917Selowe if (pr_pending_q[i] == pp) { 586917Selowe pr_pending_q[i] = NULL; 587917Selowe break; 588917Selowe } 589917Selowe } 590917Selowe 591917Selowe if (i == PR_PENDING_QMAX) { 592917Selowe PR_INCR_KSTAT(pr_dequeue_fail); 593917Selowe } 594917Selowe 595917Selowe PR_DECR_KSTAT(pr_pending); 596917Selowe PR_DEBUG(prd_dequeue); 597917Selowe 598917Selowe mutex_exit(&pr_q_mutex); 599917Selowe } 600917Selowe 601917Selowe /* 602917Selowe * Act like page_destroy(), but instead of freeing the page, hash it onto 603917Selowe * the retired_pages vnode, and mark it retired. 604917Selowe * 605917Selowe * For fun, we try to scrub the page until it's squeaky clean. 606917Selowe * availrmem is adjusted here. 607917Selowe */ 608917Selowe static void 609917Selowe page_retire_destroy(page_t *pp) 610917Selowe { 611973Selowe u_offset_t off = (u_offset_t)((uintptr_t)pp); 612973Selowe 613917Selowe ASSERT(PAGE_EXCL(pp)); 614917Selowe ASSERT(!PP_ISFREE(pp)); 615917Selowe ASSERT(pp->p_szc == 0); 616917Selowe ASSERT(!hat_page_is_mapped(pp)); 617917Selowe ASSERT(!pp->p_vnode); 618917Selowe 619917Selowe page_clr_all_props(pp); 620917Selowe pagescrub(pp, 0, MMU_PAGESIZE); 621917Selowe 622917Selowe pp->p_next = NULL; 623917Selowe pp->p_prev = NULL; 624973Selowe if (page_hashin(pp, retired_pages, off, NULL) == 0) { 625917Selowe cmn_err(CE_PANIC, "retired page %p hashin failed", (void *)pp); 626917Selowe } 627917Selowe 628917Selowe page_settoxic(pp, PR_RETIRED); 629917Selowe page_clrtoxic(pp, PR_BUSY); 630917Selowe page_retire_dequeue(pp); 631917Selowe PR_INCR_KSTAT(pr_retired); 632917Selowe 633917Selowe if (pp->p_toxic & PR_FMA) { 634917Selowe PR_INCR_KSTAT(pr_fma); 635917Selowe } else if (pp->p_toxic & PR_UE) { 636917Selowe PR_INCR_KSTAT(pr_ue); 637917Selowe } else { 638917Selowe PR_INCR_KSTAT(pr_mce); 639917Selowe } 640917Selowe 641917Selowe mutex_enter(&freemem_lock); 642917Selowe availrmem--; 643917Selowe mutex_exit(&freemem_lock); 644917Selowe 645917Selowe page_unlock(pp); 646917Selowe } 647917Selowe 648917Selowe /* 649917Selowe * Check whether the number of pages which have been retired already exceeds 650917Selowe * the maximum allowable percentage of memory which may be retired. 651917Selowe * 652917Selowe * Returns 1 if the limit has been exceeded. 
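 *
 * For example, with the default max_pages_retired_bps of 10 (0.1%) and a
 * physmem of 1,000,000 pages, PAGE_RETIRE_LIMIT works out to
 * (1000000 * 10) / 10000 = 1000 pages. Only FMA- and CE-driven retirements
 * are counted against the limit (PR_KSTAT_RETIRED_NOTUE); UE retirements
 * are not.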
653917Selowe */ 654917Selowe static int 655917Selowe page_retire_limit(void) 656917Selowe { 657917Selowe if (PR_KSTAT_RETIRED_NOTUE >= (uint64_t)PAGE_RETIRE_LIMIT) { 658917Selowe PR_INCR_KSTAT(pr_limit_exceeded); 659917Selowe return (1); 660917Selowe } 661917Selowe 662917Selowe return (0); 663917Selowe } 664917Selowe 665917Selowe #define MSG_DM "Data Mismatch occurred at PA 0x%08x.%08x" \ 666917Selowe "[ 0x%x != 0x%x ] while attempting to clear previously " \ 667917Selowe "reported error; page removed from service" 668917Selowe 669917Selowe #define MSG_UE "Uncorrectable Error occurred at PA 0x%08x.%08x while " \ 670917Selowe "attempting to clear previously reported error; page removed " \ 671917Selowe "from service" 672917Selowe 673917Selowe /* 674917Selowe * Attempt to clear a UE from a page. 675917Selowe * Returns 1 if the error has been successfully cleared. 676917Selowe */ 677917Selowe static int 678917Selowe page_clear_transient_ue(page_t *pp) 679917Selowe { 680917Selowe caddr_t kaddr; 681917Selowe uint8_t rb, wb; 682917Selowe uint64_t pa; 683917Selowe uint32_t pa_hi, pa_lo; 684917Selowe on_trap_data_t otd; 685917Selowe int errors = 0; 686917Selowe int i; 687917Selowe 688917Selowe ASSERT(PAGE_EXCL(pp)); 689917Selowe ASSERT(PP_PR_REQ(pp)); 690917Selowe ASSERT(pp->p_szc == 0); 691917Selowe ASSERT(!hat_page_is_mapped(pp)); 692917Selowe 693917Selowe /* 694917Selowe * Clear the page and attempt to clear the UE. If we trap 695917Selowe * on the next access to the page, we know the UE has recurred. 696917Selowe */ 697917Selowe pagescrub(pp, 0, PAGESIZE); 698917Selowe 699917Selowe /* 700917Selowe * Map the page and write a bunch of bit patterns to compare 701917Selowe * what we wrote with what we read back. This isn't a perfect 702917Selowe * test but it should be good enough to catch most of the 703917Selowe * recurring UEs. If this fails to catch a recurrent UE, we'll 704917Selowe * retire the page the next time we see a UE on the page. 705917Selowe */ 706917Selowe kaddr = ppmapin(pp, PROT_READ|PROT_WRITE, (caddr_t)-1); 707917Selowe 708917Selowe pa = ptob((uint64_t)page_pptonum(pp)); 709917Selowe pa_hi = (uint32_t)(pa >> 32); 710917Selowe pa_lo = (uint32_t)pa; 711917Selowe 712917Selowe /* 713917Selowe * Fill the page with each (0x00 - 0xFF] bit pattern, flushing 714917Selowe * the cache in between reading and writing. We do this under 715917Selowe * on_trap() protection to avoid recursion. 716917Selowe */ 717917Selowe if (on_trap(&otd, OT_DATA_EC)) { 718917Selowe PR_MESSAGE(CE_WARN, 1, MSG_UE, pa); 719917Selowe errors = 1; 720917Selowe } else { 721917Selowe for (wb = 0xff; wb > 0; wb--) { 722917Selowe for (i = 0; i < PAGESIZE; i++) { 723917Selowe kaddr[i] = wb; 724917Selowe } 725917Selowe 726917Selowe sync_data_memory(kaddr, PAGESIZE); 727917Selowe 728917Selowe for (i = 0; i < PAGESIZE; i++) { 729917Selowe rb = kaddr[i]; 730917Selowe if (rb != wb) { 731917Selowe /* 732917Selowe * We had a mismatch without a trap. 733917Selowe * Uh-oh. Something is really wrong 734917Selowe * with this system. 735917Selowe */ 736917Selowe if (page_retire_messages) { 737917Selowe cmn_err(CE_WARN, MSG_DM, 738917Selowe pa_hi, pa_lo, rb, wb); 739917Selowe } 740917Selowe errors = 1; 741917Selowe goto out; /* double break */ 742917Selowe } 743917Selowe } 744917Selowe } 745917Selowe } 746917Selowe out: 747917Selowe no_trap(); 748917Selowe ppmapout(kaddr); 749917Selowe 750917Selowe return (errors ? 0 : 1); 751917Selowe } 752917Selowe 753917Selowe /* 754917Selowe * Try to clear a page_t with a single UE. 
If the UE was transient, it is
755917Selowe * returned to service, and we return 1. Otherwise we return 0 meaning
756917Selowe * that further processing is required to retire the page.
757917Selowe */
758917Selowe static int
759917Selowe page_retire_transient_ue(page_t *pp)
760917Selowe {
761917Selowe ASSERT(PAGE_EXCL(pp));
762917Selowe ASSERT(!hat_page_is_mapped(pp));
763917Selowe
764917Selowe /*
765917Selowe * If this page is a repeat offender, retire him under the
766917Selowe * "two strikes and you're out" rule. The caller is responsible
767917Selowe * for scrubbing the page to try to clear the error.
768917Selowe */
769917Selowe if (pp->p_toxic & PR_UE_SCRUBBED) {
770917Selowe PR_INCR_KSTAT(pr_ue_persistent);
771917Selowe return (0);
772917Selowe }
773917Selowe
774917Selowe if (page_clear_transient_ue(pp)) {
775917Selowe /*
776917Selowe * We set the PR_UE_SCRUBBED bit; if we ever see this
777917Selowe * page again, we will retire it, no questions asked.
778917Selowe */
779917Selowe page_settoxic(pp, PR_UE_SCRUBBED);
780917Selowe
781917Selowe if (page_retire_first_ue) {
782917Selowe PR_INCR_KSTAT(pr_ue_cleared_retire);
783917Selowe return (0);
784917Selowe } else {
785917Selowe PR_INCR_KSTAT(pr_ue_cleared_free);
786917Selowe
787917Selowe page_clrtoxic(pp, PR_UE | PR_MCE | PR_MSG | PR_BUSY);
788917Selowe page_retire_dequeue(pp);
789917Selowe
790917Selowe /* LINTED: CONSTCOND */
791917Selowe VN_DISPOSE(pp, B_FREE, 1, kcred);
792917Selowe return (1);
793917Selowe }
794917Selowe }
795917Selowe
796917Selowe PR_INCR_KSTAT(pr_ue_persistent);
797917Selowe return (0);
798917Selowe }
799917Selowe
800917Selowe /*
801917Selowe * Update the statistics dynamically when our kstat is read.
802917Selowe */
803917Selowe static int
804917Selowe page_retire_kstat_update(kstat_t *ksp, int rw)
805917Selowe {
806917Selowe struct page_retire_kstat *pr;
807917Selowe
808917Selowe if (ksp == NULL)
809917Selowe return (EINVAL);
810917Selowe
811917Selowe switch (rw) {
812917Selowe
813917Selowe case KSTAT_READ:
814917Selowe pr = (struct page_retire_kstat *)ksp->ks_data;
815917Selowe ASSERT(pr == &page_retire_kstat);
816917Selowe pr->pr_limit.value.ui64 = PAGE_RETIRE_LIMIT;
817917Selowe return (0);
818917Selowe
819917Selowe case KSTAT_WRITE:
820917Selowe return (EACCES);
821917Selowe
822917Selowe default:
823917Selowe return (EINVAL);
824917Selowe }
825917Selowe /*NOTREACHED*/
826917Selowe }
827917Selowe
828917Selowe /*
829917Selowe * Initialize the page retire mechanism:
830917Selowe *
831917Selowe * - Establish the correctable error retire limit.
832917Selowe * - Initialize locks.
833917Selowe * - Build the retired_pages vnode.
834917Selowe * - Set up the kstats.
835917Selowe * - Fire off the background thread.
836917Selowe * - Tell page_tryretire() it's OK to start retiring pages.
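 *
 * The kstat set up here is virtual and points straight at page_retire_kstat,
 * so the counters can be inspected from userland with, for example,
 * kstat(1M):
 *
 *        # kstat -m unix -n page_retire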
837917Selowe */ 838917Selowe void 839917Selowe page_retire_init(void) 840917Selowe { 841917Selowe const fs_operation_def_t retired_vnodeops_template[] = {NULL, NULL}; 842917Selowe struct vnodeops *vops; 843917Selowe 844917Selowe const uint_t page_retire_ndata = 845917Selowe sizeof (page_retire_kstat) / sizeof (kstat_named_t); 846917Selowe 847917Selowe ASSERT(page_retire_ksp == NULL); 848917Selowe 849917Selowe if (max_pages_retired_bps <= 0) { 850917Selowe max_pages_retired_bps = MCE_BPT; 851917Selowe } 852917Selowe 853917Selowe mutex_init(&pr_q_mutex, NULL, MUTEX_DEFAULT, NULL); 854917Selowe 855917Selowe retired_pages = vn_alloc(KM_SLEEP); 856917Selowe if (vn_make_ops("retired_pages", retired_vnodeops_template, &vops)) { 857917Selowe cmn_err(CE_PANIC, 858917Selowe "page_retired_init: can't make retired vnodeops"); 859917Selowe } 860917Selowe vn_setops(retired_pages, vops); 861917Selowe 862917Selowe if ((page_retire_ksp = kstat_create("unix", 0, "page_retire", 863917Selowe "misc", KSTAT_TYPE_NAMED, page_retire_ndata, 864917Selowe KSTAT_FLAG_VIRTUAL)) == NULL) { 865917Selowe cmn_err(CE_WARN, "kstat_create for page_retire failed"); 866917Selowe } else { 867917Selowe page_retire_ksp->ks_data = (void *)&page_retire_kstat; 868917Selowe page_retire_ksp->ks_update = page_retire_kstat_update; 869917Selowe kstat_install(page_retire_ksp); 870917Selowe } 871917Selowe 872917Selowe pr_thread_shortwait = 23 * hz; 873917Selowe pr_thread_longwait = 1201 * hz; 874917Selowe mutex_init(&pr_thread_mutex, NULL, MUTEX_DEFAULT, NULL); 875917Selowe cv_init(&pr_cv, NULL, CV_DEFAULT, NULL); 876917Selowe pr_thread_id = thread_create(NULL, 0, page_retire_thread, NULL, 0, &p0, 877917Selowe TS_RUN, minclsyspri); 878917Selowe 879917Selowe pr_enable = 1; 880917Selowe } 881917Selowe 882917Selowe /* 883917Selowe * page_retire_hunt() callback for the retire thread. 884917Selowe */ 885917Selowe static void 886917Selowe page_retire_thread_cb(page_t *pp) 887917Selowe { 888917Selowe PR_DEBUG(prd_tctop); 889973Selowe if (!PP_ISKVP(pp) && page_trylock(pp, SE_EXCL)) { 890917Selowe PR_DEBUG(prd_tclocked); 891917Selowe page_unlock(pp); 892917Selowe } 893917Selowe } 894917Selowe 895917Selowe /* 896917Selowe * page_retire_hunt() callback for mdboot(). 897917Selowe * 898917Selowe * It is necessary to scrub any failing pages prior to reboot in order to 899917Selowe * prevent a latent error trap from occurring on the next boot. 900917Selowe */ 901917Selowe void 902917Selowe page_retire_mdboot_cb(page_t *pp) 903917Selowe { 904917Selowe /* 905917Selowe * Don't scrub the kernel, since we might still need it, unless 906917Selowe * we have UEs on the page, in which case we have nothing to lose. 907917Selowe */ 908973Selowe if (!PP_ISKVP(pp) || PP_TOXIC(pp)) { 909917Selowe pp->p_selock = -1; /* pacify ASSERTs */ 910973Selowe PP_CLRFREE(pp); 911917Selowe pagescrub(pp, 0, PAGESIZE); 912917Selowe pp->p_selock = 0; 913917Selowe } 914917Selowe pp->p_toxic = 0; 915917Selowe } 916917Selowe 917917Selowe /* 918917Selowe * Hunt down any pages in the system that have not yet been retired, invoking 919917Selowe * the provided callback function on each of them. 
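 *
 * The callback runs without pr_q_mutex and without the page locked, so a
 * callback that needs the page locked should use page_trylock() rather than
 * blocking (cf. page_retire_thread_cb() above). A minimal, hypothetical
 * callback would look like:
 *
 *        static void
 *        sample_retire_cb(page_t *pp)
 *        {
 *                if (!PP_ISKVP(pp) && page_trylock(pp, SE_EXCL))
 *                        page_unlock(pp);
 *        }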
920917Selowe */ 921917Selowe void 922917Selowe page_retire_hunt(void (*callback)(page_t *)) 923917Selowe { 924917Selowe page_t *pp; 925917Selowe page_t *first; 926973Selowe uint64_t tbr, found; 927973Selowe int i; 928917Selowe 929917Selowe PR_DEBUG(prd_hunt); 930917Selowe 931917Selowe if (PR_KSTAT_PENDING == 0) { 932917Selowe return; 933917Selowe } 934917Selowe 935917Selowe PR_DEBUG(prd_dohunt); 936917Selowe 937917Selowe found = 0; 938917Selowe mutex_enter(&pr_q_mutex); 939917Selowe 940973Selowe tbr = PR_KSTAT_PENDING; 941973Selowe 942917Selowe for (i = 0; i < PR_PENDING_QMAX; i++) { 943917Selowe if ((pp = pr_pending_q[i]) != NULL) { 944917Selowe mutex_exit(&pr_q_mutex); 945917Selowe callback(pp); 946917Selowe mutex_enter(&pr_q_mutex); 947917Selowe found++; 948917Selowe } 949917Selowe } 950917Selowe 951973Selowe if (PR_KSTAT_EQFAIL == PR_KSTAT_DQFAIL && found == tbr) { 952917Selowe mutex_exit(&pr_q_mutex); 953917Selowe PR_DEBUG(prd_earlyhunt); 954917Selowe return; 955917Selowe } 956917Selowe mutex_exit(&pr_q_mutex); 957917Selowe 958917Selowe PR_DEBUG(prd_latehunt); 959917Selowe 960917Selowe /* 961917Selowe * We've lost track of a page somewhere. Hunt it down. 962917Selowe */ 963917Selowe memsegs_lock(0); 964917Selowe pp = first = page_first(); 965917Selowe do { 966917Selowe if (PP_PR_REQ(pp)) { 967917Selowe callback(pp); 968973Selowe if (++found == tbr) { 969917Selowe break; /* got 'em all */ 970917Selowe } 971917Selowe } 972917Selowe } while ((pp = page_next(pp)) != first); 973917Selowe memsegs_unlock(0); 974917Selowe } 975917Selowe 976917Selowe /* 977917Selowe * The page_retire_thread loops forever, looking to see if there are 978917Selowe * pages still waiting to be retired. 979917Selowe */ 980917Selowe static void 981917Selowe page_retire_thread(void) 982917Selowe { 983917Selowe callb_cpr_t c; 984917Selowe 985917Selowe CALLB_CPR_INIT(&c, &pr_thread_mutex, callb_generic_cpr, "page_retire"); 986917Selowe 987917Selowe mutex_enter(&pr_thread_mutex); 988917Selowe for (;;) { 989917Selowe if (pr_enable && PR_KSTAT_PENDING) { 9901338Selowe /* 9911338Selowe * Sigh. It's SO broken how we have to try to shake 9921338Selowe * loose the holder of the page. Since we have no 9931338Selowe * idea who or what has it locked, we go bang on 9941338Selowe * every door in the city to try to locate it. 9951338Selowe */ 996917Selowe kmem_reap(); 997917Selowe seg_preap(); 998917Selowe page_retire_hunt(page_retire_thread_cb); 999917Selowe CALLB_CPR_SAFE_BEGIN(&c); 1000917Selowe (void) cv_timedwait(&pr_cv, &pr_thread_mutex, 1001917Selowe lbolt + pr_thread_shortwait); 1002917Selowe CALLB_CPR_SAFE_END(&c, &pr_thread_mutex); 1003917Selowe } else { 1004917Selowe CALLB_CPR_SAFE_BEGIN(&c); 1005917Selowe (void) cv_timedwait(&pr_cv, &pr_thread_mutex, 1006917Selowe lbolt + pr_thread_longwait); 1007917Selowe CALLB_CPR_SAFE_END(&c, &pr_thread_mutex); 1008917Selowe } 1009917Selowe } 1010917Selowe /*NOTREACHED*/ 1011917Selowe } 1012917Selowe 1013917Selowe /* 1014917Selowe * page_retire_pp() decides what to do with a failing page. 1015917Selowe * 1016917Selowe * When we get a free page (e.g. the scrubber or in the free path) life is 1017917Selowe * nice because the page is clean and marked free -- those always retire 1018917Selowe * nicely. From there we go by order of difficulty. If the page has data, 1019917Selowe * we attempt to relocate its contents to a suitable replacement page. If 1020917Selowe * that does not succeed, we look to see if it is clean. 
If after all of 1021917Selowe * this we have a clean, unmapped page (which we usually do!), we retire it. 1022917Selowe * If the page is not clean, we still process it regardless on a UE; for 1023917Selowe * CEs or FMA requests, we fail leaving the page in service. The page will 1024917Selowe * eventually be tried again later. We always return with the page unlocked 1025917Selowe * since we are called from page_unlock(). 1026917Selowe * 1027917Selowe * We don't call panic or do anything fancy down in here. Our boss the DE 1028917Selowe * gets paid handsomely to do his job of figuring out what to do when errors 1029917Selowe * occur. We just do what he tells us to do. 1030917Selowe */ 1031917Selowe static int 1032917Selowe page_retire_pp(page_t *pp) 1033917Selowe { 1034917Selowe int toxic; 1035917Selowe 1036917Selowe ASSERT(PAGE_EXCL(pp)); 1037917Selowe ASSERT(pp->p_iolock_state == 0); 1038917Selowe ASSERT(pp->p_szc == 0); 1039917Selowe 1040917Selowe PR_DEBUG(prd_top); 1041917Selowe PR_TYPES(pp); 1042917Selowe 1043917Selowe toxic = pp->p_toxic; 1044917Selowe ASSERT(toxic & PR_REASONS); 1045917Selowe 1046917Selowe if ((toxic & (PR_FMA | PR_MCE)) && !(toxic & PR_UE) && 1047917Selowe page_retire_limit()) { 1048917Selowe page_clrtoxic(pp, PR_FMA | PR_MCE | PR_MSG | PR_BUSY); 1049917Selowe page_retire_dequeue(pp); 1050917Selowe page_unlock(pp); 1051917Selowe return (page_retire_done(pp, PRD_LIMIT)); 1052917Selowe } 1053917Selowe 1054917Selowe if (PP_ISFREE(pp)) { 10551338Selowe int dbgnoreclaim = MTBF(recl_calls, recl_mtbf) == 0; 10561338Selowe 1057917Selowe PR_DEBUG(prd_free); 10581338Selowe 10591338Selowe if (dbgnoreclaim || !page_reclaim(pp, NULL)) { 1060917Selowe PR_DEBUG(prd_noreclaim); 1061917Selowe PR_INCR_KSTAT(pr_failed); 10621338Selowe /* 10631338Selowe * page_reclaim() returns with `pp' unlocked when 10641338Selowe * it fails. 10651338Selowe */ 10661338Selowe if (dbgnoreclaim) 10671338Selowe page_unlock(pp); 1068917Selowe return (page_retire_done(pp, PRD_FAILED)); 1069917Selowe } 1070917Selowe } 10711338Selowe ASSERT(!PP_ISFREE(pp)); 1072917Selowe 10731338Selowe if ((toxic & PR_UE) == 0 && pp->p_vnode && !PP_ISNORELOCKERNEL(pp) && 10741338Selowe MTBF(reloc_calls, reloc_mtbf)) { 1075917Selowe page_t *newpp; 1076917Selowe spgcnt_t count; 1077917Selowe 1078917Selowe /* 1079917Selowe * If we can relocate the page, great! newpp will go 1080917Selowe * on without us, and everything is fine. Regardless 1081917Selowe * of whether the relocation succeeds, we are still 1082917Selowe * going to take `pp' around back and shoot it. 
1083917Selowe */ 1084917Selowe newpp = NULL; 1085917Selowe if (page_relocate(&pp, &newpp, 0, 0, &count, NULL) == 0) { 1086973Selowe PR_DEBUG(prd_reloc); 1087917Selowe page_unlock(newpp); 1088917Selowe ASSERT(hat_page_getattr(pp, P_MOD) == 0); 1089973Selowe } else { 1090973Selowe PR_DEBUG(prd_relocfail); 1091917Selowe } 1092917Selowe } 1093917Selowe 1094973Selowe if (hat_ismod(pp)) { 1095973Selowe PR_DEBUG(prd_mod); 1096973Selowe PR_INCR_KSTAT(pr_failed); 1097973Selowe page_unlock(pp); 1098973Selowe return (page_retire_done(pp, PRD_FAILED)); 1099973Selowe } 1100973Selowe 1101973Selowe if (PP_ISKVP(pp)) { 1102917Selowe PR_DEBUG(prd_kern); 1103917Selowe PR_INCR_KSTAT(pr_failed_kernel); 1104917Selowe page_unlock(pp); 1105917Selowe return (page_retire_done(pp, PRD_FAILED)); 1106917Selowe } 1107917Selowe 1108917Selowe if (pp->p_lckcnt || pp->p_cowcnt) { 1109973Selowe PR_DEBUG(prd_locked); 1110973Selowe PR_INCR_KSTAT(pr_failed); 1111973Selowe page_unlock(pp); 1112973Selowe return (page_retire_done(pp, PRD_FAILED)); 1113917Selowe } 1114917Selowe 1115917Selowe (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1116917Selowe ASSERT(!hat_page_is_mapped(pp)); 1117917Selowe 1118917Selowe /* 1119973Selowe * If the page is modified, and was not relocated; we can't 1120973Selowe * retire it without dropping data on the floor. We have to 1121973Selowe * recheck after unloading since the dirty bit could have been 1122973Selowe * set since we last checked. 1123917Selowe */ 1124917Selowe if (hat_ismod(pp)) { 1125973Selowe PR_DEBUG(prd_mod_late); 1126973Selowe PR_INCR_KSTAT(pr_failed); 1127973Selowe page_unlock(pp); 1128973Selowe return (page_retire_done(pp, PRD_FAILED)); 1129917Selowe } 1130917Selowe 1131917Selowe if (pp->p_vnode) { 1132917Selowe PR_DEBUG(prd_hashout); 1133917Selowe page_hashout(pp, NULL); 1134917Selowe } 1135917Selowe ASSERT(!pp->p_vnode); 1136917Selowe 1137917Selowe /* 1138917Selowe * The problem page is locked, demoted, unmapped, not free, 1139917Selowe * hashed out, and not COW or mlocked (whew!). 1140917Selowe * 1141917Selowe * Now we select our ammunition, take it around back, and shoot it. 1142917Selowe */ 1143917Selowe if (toxic & PR_UE) { 1144917Selowe if (page_retire_transient_ue(pp)) { 1145917Selowe PR_DEBUG(prd_uescrubbed); 1146917Selowe return (page_retire_done(pp, PRD_UE_SCRUBBED)); 1147917Selowe } else { 1148917Selowe PR_DEBUG(prd_uenotscrubbed); 1149917Selowe page_retire_destroy(pp); 1150917Selowe return (page_retire_done(pp, PRD_SUCCESS)); 1151917Selowe } 1152917Selowe } else if (toxic & PR_FMA) { 1153917Selowe PR_DEBUG(prd_fma); 1154917Selowe page_retire_destroy(pp); 1155917Selowe return (page_retire_done(pp, PRD_SUCCESS)); 1156917Selowe } else if (toxic & PR_MCE) { 1157917Selowe PR_DEBUG(prd_mce); 1158917Selowe page_retire_destroy(pp); 1159917Selowe return (page_retire_done(pp, PRD_SUCCESS)); 1160917Selowe } 1161917Selowe panic("page_retire_pp: bad toxic flags %d", toxic); 1162917Selowe /*NOTREACHED*/ 1163917Selowe } 1164917Selowe 1165917Selowe /* 1166917Selowe * Try to retire a page when we stumble onto it in the page lock routines. 1167917Selowe */ 1168917Selowe void 1169917Selowe page_tryretire(page_t *pp) 1170917Selowe { 1171917Selowe ASSERT(PAGE_EXCL(pp)); 1172917Selowe 1173917Selowe if (!pr_enable) { 1174917Selowe page_unlock(pp); 1175917Selowe return; 1176917Selowe } 1177917Selowe 1178917Selowe /* 1179917Selowe * If the page is a big page, try to break it up. 
1180917Selowe * 1181917Selowe * If there are other bad pages besides `pp', they will be 1182917Selowe * recursively retired for us thanks to a bit of magic. 1183917Selowe * If the page is a small page with errors, try to retire it. 1184917Selowe */ 1185917Selowe if (pp->p_szc > 0) { 1186917Selowe if (PP_ISFREE(pp) && !page_try_demote_free_pages(pp)) { 1187917Selowe page_unlock(pp); 1188917Selowe PR_DEBUG(prd_nofreedemote); 1189917Selowe return; 1190917Selowe } else if (!page_try_demote_pages(pp)) { 1191917Selowe page_unlock(pp); 1192917Selowe PR_DEBUG(prd_nodemote); 1193917Selowe return; 1194917Selowe } 1195917Selowe PR_DEBUG(prd_demoted); 1196917Selowe page_unlock(pp); 1197917Selowe } else { 1198917Selowe (void) page_retire_pp(pp); 1199917Selowe } 1200917Selowe } 1201917Selowe 1202917Selowe /* 1203917Selowe * page_retire() - the front door in to retire a page. 1204917Selowe * 1205917Selowe * Ideally, page_retire() would instantly retire the requested page. 1206917Selowe * Unfortunately, some pages are locked or otherwise tied up and cannot be 1207917Selowe * retired right away. To deal with that, bits are set in p_toxic of the 1208917Selowe * page_t. An attempt is made to lock the page; if the attempt is successful, 1209917Selowe * we instantly unlock the page counting on page_unlock() to notice p_toxic 1210917Selowe * is nonzero and to call back into page_retire_pp(). Success is determined 1211917Selowe * by looking to see whether the page has been retired once it has been 1212917Selowe * unlocked. 1213917Selowe * 1214917Selowe * Returns: 1215917Selowe * 1216917Selowe * - 0 on success, 1217917Selowe * - EINVAL when the PA is whacko, 1218*1381Selowe * - EIO if the page is already retired or already pending retirement, or 1219*1381Selowe * - EAGAIN if the page could not be _immediately_ retired but is pending. 1220917Selowe */ 1221917Selowe int 1222917Selowe page_retire(uint64_t pa, uchar_t reason) 1223917Selowe { 1224917Selowe page_t *pp; 1225917Selowe 1226917Selowe ASSERT(reason & PR_REASONS); /* there must be a reason */ 1227917Selowe ASSERT(!(reason & ~PR_REASONS)); /* but no other bits */ 1228917Selowe 1229917Selowe pp = page_numtopp_nolock(mmu_btop(pa)); 1230917Selowe if (pp == NULL) { 1231917Selowe PR_MESSAGE(CE_WARN, 1, "Cannot schedule clearing of error on" 1232917Selowe " page 0x%08x.%08x; page is not relocatable memory", pa); 1233917Selowe return (page_retire_done(pp, PRD_INVALID_PA)); 1234917Selowe } 1235917Selowe if (PP_RETIRED(pp)) { 1236*1381Selowe PR_DEBUG(prd_dup1); 1237917Selowe return (page_retire_done(pp, PRD_DUPLICATE)); 1238917Selowe } 1239917Selowe 1240*1381Selowe if ((reason & PR_UE) && !PP_TOXIC(pp)) { 1241917Selowe PR_MESSAGE(CE_NOTE, 1, "Scheduling clearing of error on" 1242917Selowe " page 0x%08x.%08x", pa); 1243*1381Selowe } else if (PP_PR_REQ(pp)) { 1244*1381Selowe PR_DEBUG(prd_dup2); 1245*1381Selowe return (page_retire_done(pp, PRD_DUPLICATE)); 1246917Selowe } else { 1247917Selowe PR_MESSAGE(CE_NOTE, 1, "Scheduling removal of" 1248917Selowe " page 0x%08x.%08x", pa); 1249917Selowe } 1250917Selowe page_settoxic(pp, reason); 1251917Selowe page_retire_enqueue(pp); 1252917Selowe 1253917Selowe /* 1254917Selowe * And now for some magic. 1255917Selowe * 1256917Selowe * We marked this page toxic up above. All there is left to do is 1257917Selowe * to try to lock the page and then unlock it. The page lock routines 1258917Selowe * will intercept the page and retire it if they can. 
If the page 1259917Selowe * cannot be locked, 's okay -- page_unlock() will eventually get it, 1260917Selowe * or the background thread, until then the lock routines will deny 1261917Selowe * further locks on it. 1262917Selowe */ 1263917Selowe if (MTBF(pr_calls, pr_mtbf) && page_trylock(pp, SE_EXCL)) { 1264917Selowe PR_DEBUG(prd_prlocked); 1265917Selowe page_unlock(pp); 1266917Selowe } else { 1267917Selowe PR_DEBUG(prd_prnotlocked); 1268917Selowe } 1269917Selowe 1270917Selowe if (PP_RETIRED(pp)) { 1271917Selowe PR_DEBUG(prd_prretired); 1272917Selowe return (0); 1273917Selowe } else { 1274917Selowe cv_signal(&pr_cv); 1275917Selowe PR_INCR_KSTAT(pr_failed); 1276917Selowe 1277917Selowe if (pp->p_toxic & PR_MSG) { 1278917Selowe return (page_retire_done(pp, PRD_FAILED)); 1279917Selowe } else { 1280917Selowe return (page_retire_done(pp, PRD_PENDING)); 1281917Selowe } 1282917Selowe } 1283917Selowe } 1284917Selowe 1285917Selowe /* 1286917Selowe * Take a retired page off the retired-pages vnode and clear the toxic flags. 1287917Selowe * If "free" is nonzero, lock it and put it back on the freelist. If "free" 1288917Selowe * is zero, the caller already holds SE_EXCL lock so we simply unretire it 1289917Selowe * and don't do anything else with it. 1290917Selowe * 1291917Selowe * Any unretire messages are printed from this routine. 1292917Selowe * 1293917Selowe * Returns 0 if page pp was unretired; else an error code. 1294917Selowe */ 1295917Selowe int 1296917Selowe page_unretire_pp(page_t *pp, int free) 1297917Selowe { 1298917Selowe /* 1299917Selowe * To be retired, a page has to be hashed onto the retired_pages vnode 1300917Selowe * and have PR_RETIRED set in p_toxic. 1301917Selowe */ 1302917Selowe if (free == 0 || page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) { 1303917Selowe ASSERT(PAGE_EXCL(pp)); 1304917Selowe PR_DEBUG(prd_ulocked); 1305917Selowe if (!PP_RETIRED(pp)) { 1306917Selowe PR_DEBUG(prd_unotretired); 1307917Selowe page_unlock(pp); 1308917Selowe return (page_retire_done(pp, PRD_UNR_NOT)); 1309917Selowe } 1310917Selowe 1311917Selowe PR_MESSAGE(CE_NOTE, 1, "unretiring retired" 13121338Selowe " page 0x%08x.%08x", mmu_ptob((uint64_t)pp->p_pagenum)); 1313917Selowe if (pp->p_toxic & PR_FMA) { 1314917Selowe PR_DECR_KSTAT(pr_fma); 1315917Selowe } else if (pp->p_toxic & PR_UE) { 1316917Selowe PR_DECR_KSTAT(pr_ue); 1317917Selowe } else { 1318917Selowe PR_DECR_KSTAT(pr_mce); 1319917Selowe } 1320917Selowe page_clrtoxic(pp, PR_ALLFLAGS); 1321917Selowe 1322917Selowe if (free) { 1323917Selowe PR_DEBUG(prd_udestroy); 1324917Selowe page_destroy(pp, 0); 1325917Selowe } else { 1326917Selowe PR_DEBUG(prd_uhashout); 1327917Selowe page_hashout(pp, NULL); 1328917Selowe } 1329917Selowe 1330917Selowe mutex_enter(&freemem_lock); 1331917Selowe availrmem++; 1332917Selowe mutex_exit(&freemem_lock); 1333917Selowe 1334917Selowe PR_DEBUG(prd_uunretired); 1335917Selowe PR_DECR_KSTAT(pr_retired); 1336917Selowe PR_INCR_KSTAT(pr_unretired); 1337917Selowe return (page_retire_done(pp, PRD_UNR_SUCCESS)); 1338917Selowe } 1339917Selowe PR_DEBUG(prd_unotlocked); 1340917Selowe return (page_retire_done(pp, PRD_UNR_CANTLOCK)); 1341917Selowe } 1342917Selowe 1343917Selowe /* 1344917Selowe * Return a page to service by moving it from the retired_pages vnode 1345917Selowe * onto the freelist. 1346917Selowe * 1347917Selowe * Called from mmioctl_page_retire() on behalf of the FMA DE. 
1348917Selowe * 1349917Selowe * Returns: 1350917Selowe * 1351917Selowe * - 0 if the page is unretired, 1352917Selowe * - EAGAIN if the pp can not be locked, 1353917Selowe * - EINVAL if the PA is whacko, and 1354*1381Selowe * - EIO if the pp is not retired. 1355917Selowe */ 1356917Selowe int 1357917Selowe page_unretire(uint64_t pa) 1358917Selowe { 1359917Selowe page_t *pp; 1360917Selowe 1361917Selowe pp = page_numtopp_nolock(mmu_btop(pa)); 1362917Selowe if (pp == NULL) { 1363917Selowe return (page_retire_done(pp, PRD_INVALID_PA)); 1364917Selowe } 1365917Selowe 1366917Selowe return (page_unretire_pp(pp, 1)); 1367917Selowe } 1368917Selowe 1369917Selowe /* 1370917Selowe * Test a page to see if it is retired. If errors is non-NULL, the toxic 1371917Selowe * bits of the page are returned. Returns 0 on success, error code on failure. 1372917Selowe */ 1373917Selowe int 1374917Selowe page_retire_check_pp(page_t *pp, uint64_t *errors) 1375917Selowe { 1376917Selowe int rc; 1377917Selowe 1378917Selowe if (PP_RETIRED(pp)) { 1379917Selowe PR_DEBUG(prd_checkhit); 1380917Selowe rc = 0; 1381*1381Selowe } else if (PP_PR_REQ(pp)) { 1382*1381Selowe PR_DEBUG(prd_checkmiss_pend); 1383*1381Selowe rc = EAGAIN; 1384917Selowe } else { 1385*1381Selowe PR_DEBUG(prd_checkmiss_noerr); 1386*1381Selowe rc = EIO; 1387917Selowe } 1388917Selowe 1389917Selowe /* 1390917Selowe * We have magically arranged the bit values returned to fmd(1M) 1391917Selowe * to line up with the FMA, MCE, and UE bits of the page_t. 1392917Selowe */ 1393917Selowe if (errors) { 1394917Selowe uint64_t toxic = (uint64_t)(pp->p_toxic & PR_ERRMASK); 1395917Selowe if (toxic & PR_UE_SCRUBBED) { 1396917Selowe toxic &= ~PR_UE_SCRUBBED; 1397917Selowe toxic |= PR_UE; 1398917Selowe } 1399917Selowe *errors = toxic; 1400917Selowe } 1401917Selowe 1402917Selowe return (rc); 1403917Selowe } 1404917Selowe 1405917Selowe /* 1406917Selowe * Test to see if the page_t for a given PA is retired, and return the 1407917Selowe * hardware errors we have seen on the page if requested. 1408917Selowe * 1409917Selowe * Called from mmioctl_page_retire on behalf of the FMA DE. 1410917Selowe * 1411917Selowe * Returns: 1412917Selowe * 1413917Selowe * - 0 if the page is retired, 1414*1381Selowe * - EIO if the page is not retired and has no errors, 1415*1381Selowe * - EAGAIN if the page is not retired but is pending; and 1416917Selowe * - EINVAL if the PA is whacko. 1417917Selowe */ 1418917Selowe int 1419917Selowe page_retire_check(uint64_t pa, uint64_t *errors) 1420917Selowe { 1421917Selowe page_t *pp; 1422917Selowe 1423917Selowe if (errors) { 1424917Selowe *errors = 0; 1425917Selowe } 1426917Selowe 1427917Selowe pp = page_numtopp_nolock(mmu_btop(pa)); 1428917Selowe if (pp == NULL) { 1429917Selowe return (page_retire_done(pp, PRD_INVALID_PA)); 1430917Selowe } 1431917Selowe 1432917Selowe return (page_retire_check_pp(pp, errors)); 1433917Selowe } 1434917Selowe 1435917Selowe /* 1436917Selowe * Page retire self-test. For now, it always returns 0. 1437917Selowe */ 1438917Selowe int 1439917Selowe page_retire_test(void) 1440917Selowe { 1441917Selowe page_t *first, *pp, *cpp, *cpp2, *lpp; 1442917Selowe 1443917Selowe /* 1444917Selowe * Tests the corner case where a large page can't be retired 1445917Selowe * because one of the constituent pages is locked. 
We mark 1446917Selowe * one page to be retired and try to retire it, and mark the 1447917Selowe * other page to be retired but don't try to retire it, so 1448917Selowe * that page_unlock() in the failure path will recurse and try 1449917Selowe * to retire THAT page. This is the worst possible situation 1450917Selowe * we can get ourselves into. 1451917Selowe */ 1452917Selowe memsegs_lock(0); 1453917Selowe pp = first = page_first(); 1454917Selowe do { 1455917Selowe if (pp->p_szc && PP_PAGEROOT(pp) == pp) { 1456917Selowe cpp = pp + 1; 1457917Selowe lpp = PP_ISFREE(pp)? pp : pp + 2; 1458917Selowe cpp2 = pp + 3; 1459917Selowe if (!page_trylock(lpp, pp == lpp? SE_EXCL : SE_SHARED)) 1460917Selowe continue; 1461917Selowe if (!page_trylock(cpp, SE_EXCL)) { 1462917Selowe page_unlock(lpp); 1463917Selowe continue; 1464917Selowe } 1465917Selowe page_settoxic(cpp, PR_FMA | PR_BUSY); 1466917Selowe page_settoxic(cpp2, PR_FMA); 1467917Selowe page_tryretire(cpp); /* will fail */ 1468917Selowe page_unlock(lpp); 1469917Selowe (void) page_retire(cpp->p_pagenum, PR_FMA); 1470917Selowe (void) page_retire(cpp2->p_pagenum, PR_FMA); 1471917Selowe } 1472917Selowe } while ((pp = page_next(pp)) != first); 1473917Selowe memsegs_unlock(0); 1474917Selowe 1475917Selowe return (0); 1476917Selowe } 1477
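
/*
 * Usage sketch (illustrative only; this block is a comment and is not
 * compiled): how a hypothetical in-kernel consumer could drive the retire
 * cycle using nothing but the return codes documented above. The FMA DE
 * does the equivalent from userland via mmioctl_page_retire(). "pa" stands
 * for the physical address of the suspect page.
 *
 * Ask for the page to be retired; EINVAL means the PA is not relocatable
 * memory, EIO means it is already retired or already pending, EAGAIN means
 * the request is now pending, and 0 means it is already done:
 *
 *        uint64_t errors;
 *        int rc = page_retire(pa, PR_FMA);
 *
 * Poll for completion; page_retire_check() returns EAGAIN while the
 * retirement is still pending and 0 once the page is retired, filling
 * "errors" with the accumulated PR_FMA/PR_MCE/PR_UE bits:
 *
 *        while (page_retire_check(pa, &errors) == EAGAIN)
 *                delay(hz);
 *
 * If the DE later decides the page is healthy, return it to service:
 *
 *        rc = page_unretire(pa);
 */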