/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Page Retire - Big Theory Statement.
 *
 * This file handles removing sections of faulty memory from use when the
 * user land FMA Diagnosis Engine requests that a page be removed or when
 * a CE or UE is detected by the hardware.
 *
 * In the bad old days, the kernel side of Page Retire did a lot of the work
 * on its own. Now, with the DE keeping track of errors, the kernel side is
 * rather simple-minded on most platforms.
 *
 * Errors are all reflected to the DE, and after digesting the error and
 * looking at all previously reported errors, the DE decides what should
 * be done about the current error. If the DE wants a particular page to
 * be retired, then the kernel page retire code is invoked via an ioctl.
 * On non-FMA platforms, the ue_drain and ce_drain paths end up calling
 * page retire to handle the error. Since page retire is just a simple
 * mechanism, it doesn't need to differentiate between the different callers.
 *
 * The p_toxic field in the page_t is used to indicate which errors have
 * occurred and what action has been taken on a given page. Because errors are
 * reported without regard to the locked state of a page, no locks are used
 * to SET the error bits in p_toxic. However, in order to clear the error
 * bits, the page_t must be held exclusively locked.
 *
 * When page_retire() is called, it must be able to acquire locks, sleep, etc.
 * It must not be called from high-level interrupt context.
 *
 * Depending on how the requested page is being used at the time of the retire
 * request (and on the availability of sufficient system resources), the page
 * may be retired immediately, or just marked for retirement later. For
 * example, locked pages are marked, while free pages are retired. Multiple
 * requests may be made to retire the same page, although there is no need
 * to: once the p_toxic flags are set, the page will be retired as soon as it
 * can be exclusively locked.
 *
 * The retire mechanism is driven centrally out of page_unlock(). To expedite
 * the retirement of pages, further requests for SE_SHARED locks are denied
 * as long as a page retirement is pending. In addition, as long as pages are
 * pending retirement a background thread runs periodically trying to retire
 * those pages. Pages which could not be retired while the system is running
 * are scrubbed prior to rebooting to avoid latent errors on the next boot.
 *
 * UE pages without persistent errors are scrubbed and returned to service.
 * Recidivist pages, as well as FMA-directed requests for retirement, result
 * in the page being taken out of service. Once the decision is made to take
 * a page out of service, the page is cleared, hashed onto the retired_pages
 * vnode, marked as retired, and it is unlocked. No other requesters (except
 * for unretire) are allowed to lock retired pages.
 *
 * The public routines return (sadly) 0 if they worked and a non-zero error
 * value if something went wrong. This is done for the ioctl side of the
 * world to allow errors to be reflected all the way out to user land. The
 * non-zero values are explained in comments atop each function.
 */

/*
 * Things to fix:
 *
 *	1. Trying to retire non-relocatable kvp pages may result in a
 *	quagmire. This is because seg_kmem() no longer keeps its pages locked,
 *	and calls page_lookup() in the free path; since kvp pages are modified
 *	and don't have a usable backing store, page_retire() can't do anything
 *	with them, and we'll keep denying the lock to seg_kmem_free() in a
 *	vicious cycle. To prevent that, we don't deny locks to kvp pages, and
 *	hence only try to retire a page from page_unlock() in the free path.
 *	Since most kernel pages are indefinitely held anyway, and don't
 *	participate in I/O, this is of little consequence.
 *
 *	2. Low memory situations will be interesting. If we don't have
 *	enough memory for page_relocate() to succeed, we won't be able to
 *	retire dirty pages; nobody will be able to push them out to disk
 *	either, since we aggressively deny the page lock. We could change
 *	fsflush so it can recognize this situation, grab the lock, and push
 *	the page out, where we'll catch it in the free path and retire it.
 *
 *	3. Beware of places that have code like this in them:
 *
 *		if (! page_tryupgrade(pp)) {
 *			page_unlock(pp);
 *			while (! page_lock(pp, SE_EXCL, NULL, P_RECLAIM)) {
 *				/ *NOTHING* /
 *			}
 *		}
 *		page_free(pp);
 *
 *	The problem is that pp can change identity right after the
 *	page_unlock() call. In particular, page_retire() can step in
 *	there, change pp's identity, and hash pp onto the retired_vnode.
 *
 *	Of course, other functions besides page_retire() can have the
 *	same effect. A kmem reader can waltz by, set up a mapping to the
 *	page, and then unlock the page. Page_free() will then go castors
 *	up. So if anybody is doing this, it's already a bug.
 *
 *	4. mdboot()'s call into page_retire_mdboot() should probably be
 *	moved lower. Where the call is made now, we can get into trouble
 *	by scrubbing a kernel page that is then accessed later.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/vfs_opreg.h>
#include <sys/cmn_err.h>
#include <sys/ksynch.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/ontrap.h>
#include <sys/vmsystm.h>
#include <sys/mem_config.h>
#include <sys/atomic.h>
#include <sys/callb.h>
#include <sys/kobj.h>
#include <vm/page.h>
#include <vm/vm_dep.h>
#include <vm/as.h>
#include <vm/hat.h>
#include <vm/seg_kmem.h>

/*
 * vnode for all pages which are retired from the VM system.
 */
vnode_t *retired_pages;

static int page_retire_pp_finish(page_t *, void *, uint_t);

/*
 * Make a list of all of the pages that have been marked for retirement
 * but are not yet retired.  At system shutdown, we will scrub all of the
 * pages in the list in case there are outstanding UEs.  Then, we
 * cross-check this list against the number of pages that are yet to be
 * retired, and if we find inconsistencies, we scan every page_t in the
 * whole system looking for any pages that need to be scrubbed for UEs.
 * The background thread also uses this queue to determine which pages
 * it should keep trying to retire.
 */
#ifdef	DEBUG
#define	PR_PENDING_QMAX	32
#else	/* DEBUG */
#define	PR_PENDING_QMAX	256
#endif	/* DEBUG */
page_t *pr_pending_q[PR_PENDING_QMAX];
kmutex_t pr_q_mutex;

/*
 * Page retire global kstats
 */
struct page_retire_kstat {
	kstat_named_t	pr_retired;
	kstat_named_t	pr_requested;
	kstat_named_t	pr_requested_free;
	kstat_named_t	pr_enqueue_fail;
	kstat_named_t	pr_dequeue_fail;
	kstat_named_t	pr_pending;
	kstat_named_t	pr_pending_kas;
	kstat_named_t	pr_failed;
	kstat_named_t	pr_failed_kernel;
	kstat_named_t	pr_limit;
	kstat_named_t	pr_limit_exceeded;
	kstat_named_t	pr_fma;
	kstat_named_t	pr_mce;
	kstat_named_t	pr_ue;
	kstat_named_t	pr_ue_cleared_retire;
	kstat_named_t	pr_ue_cleared_free;
	kstat_named_t	pr_ue_persistent;
	kstat_named_t	pr_unretired;
};

static struct page_retire_kstat page_retire_kstat = {
	{ "pages_retired",		KSTAT_DATA_UINT64},
	{ "pages_retire_request",	KSTAT_DATA_UINT64},
	{ "pages_retire_request_free",	KSTAT_DATA_UINT64},
	{ "pages_notenqueued",		KSTAT_DATA_UINT64},
	{ "pages_notdequeued",		KSTAT_DATA_UINT64},
	{ "pages_pending",		KSTAT_DATA_UINT64},
	{ "pages_pending_kas",		KSTAT_DATA_UINT64},
	{ "pages_deferred",		KSTAT_DATA_UINT64},
	{ "pages_deferred_kernel",	KSTAT_DATA_UINT64},
	{ "pages_limit",		KSTAT_DATA_UINT64},
	{ "pages_limit_exceeded",	KSTAT_DATA_UINT64},
	{ "pages_fma",			KSTAT_DATA_UINT64},
	{ "pages_multiple_ce",		KSTAT_DATA_UINT64},
	{ "pages_ue",			KSTAT_DATA_UINT64},
	{ "pages_ue_cleared_retired",	KSTAT_DATA_UINT64},
	{ "pages_ue_cleared_freed",	KSTAT_DATA_UINT64},
	{ "pages_ue_persistent",	KSTAT_DATA_UINT64},
	{ "pages_unretired",		KSTAT_DATA_UINT64},
};

static kstat_t *page_retire_ksp = NULL;

#define	PR_INCR_KSTAT(stat)	\
	atomic_add_64(&(page_retire_kstat.stat.value.ui64), 1)
#define	PR_DECR_KSTAT(stat)	\
	atomic_add_64(&(page_retire_kstat.stat.value.ui64), -1)

#define	PR_KSTAT_RETIRED_CE	(page_retire_kstat.pr_mce.value.ui64)
#define	PR_KSTAT_RETIRED_FMA	(page_retire_kstat.pr_fma.value.ui64)
#define	PR_KSTAT_RETIRED_NOTUE	(PR_KSTAT_RETIRED_CE + PR_KSTAT_RETIRED_FMA)
#define	PR_KSTAT_PENDING	(page_retire_kstat.pr_pending.value.ui64)
#define	PR_KSTAT_PENDING_KAS	(page_retire_kstat.pr_pending_kas.value.ui64)
#define	PR_KSTAT_EQFAIL		(page_retire_kstat.pr_enqueue_fail.value.ui64)
#define	PR_KSTAT_DQFAIL		(page_retire_kstat.pr_dequeue_fail.value.ui64)

/*
 * Page retire kstats to list all retired pages.
 */
static int pr_list_kstat_update(kstat_t *ksp, int rw);
static int pr_list_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
kmutex_t pr_list_kstat_mutex;

/*
 * Limit the number of multiple CE page retires.
 * The default is 0.1% of physmem, or 1 in 1000 pages. This is set in
 * basis points, where 100 basis points equals one percent.
 */
#define	MCE_BPT	10
uint64_t max_pages_retired_bps = MCE_BPT;
#define	PAGE_RETIRE_LIMIT	((physmem * max_pages_retired_bps) / 10000)

/*
 * Control over the verbosity of page retirement.
 *
 * When set to zero (the default), no messages will be printed.
 * When set to one, summary messages will be printed.
 * When set > one, all messages will be printed.
 *
 * A value of one will trigger detailed messages for retirement operations,
 * and is intended as a platform tunable for processors where FMA's DE does
 * not run (e.g., spitfire). Values > one are intended for debugging only.
 */
int page_retire_messages = 0;

/*
 * Control whether or not we return scrubbed UE pages to service.
 * By default we do not since FMA wants to run its diagnostics first
 * and then ask us to unretire the page if it passes. Non-FMA platforms
 * may set this to zero so we will only retire recidivist pages. It should
 * not be changed by the user.
 */
int page_retire_first_ue = 1;

/*
 * Master enable for page retire. This prevents a CE or UE early in boot
 * from trying to retire a page before page_retire_init() has finished
 * setting things up. This is internal only and is not a tunable!
 */
static int pr_enable = 0;

static void (*memscrub_notify_func)(uint64_t);

#ifdef	DEBUG
struct page_retire_debug {
	int prd_dup1;
	int prd_dup2;
	int prd_qdup;
	int prd_noaction;
	int prd_queued;
	int prd_notqueued;
	int prd_dequeue;
	int prd_top;
	int prd_locked;
	int prd_reloc;
	int prd_relocfail;
	int prd_mod;
	int prd_mod_late;
	int prd_kern;
	int prd_free;
	int prd_noreclaim;
	int prd_hashout;
	int prd_fma;
	int prd_uescrubbed;
	int prd_uenotscrubbed;
	int prd_mce;
	int prd_prlocked;
	int prd_prnotlocked;
	int prd_prretired;
	int prd_ulocked;
	int prd_unotretired;
	int prd_udestroy;
	int prd_uhashout;
	int prd_uunretired;
	int prd_unotlocked;
	int prd_checkhit;
	int prd_checkmiss_pend;
	int prd_checkmiss_noerr;
	int prd_tctop;
	int prd_tclocked;
	int prd_hunt;
	int prd_dohunt;
	int prd_earlyhunt;
	int prd_latehunt;
	int prd_nofreedemote;
	int prd_nodemote;
	int prd_demoted;
} pr_debug;

#define	PR_DEBUG(foo)	((pr_debug.foo)++)

/*
 * A type histogram. We record the incidence of the various toxic
 * flag combinations along with the interesting page attributes. The
 * goal is to get as many combinations as we can while driving all
 * pr_debug values nonzero (indicating we've exercised all possible
 * code paths across all possible page types). Not all combinations
 * will make sense -- e.g. PRT_MOD|PRT_KERNEL.
 *
 * pr_type offset bit encoding (when examining with a debugger):
 *
 *	PRT_NAMED - 0x4
 *	PRT_KERNEL - 0x8
 *	PRT_FREE - 0x10
 *	PRT_MOD - 0x20
 *	PRT_FMA - 0x0
 *	PRT_MCE - 0x40
 *	PRT_UE - 0x80
 */

#define	PRT_NAMED	0x01
#define	PRT_KERNEL	0x02
#define	PRT_FREE	0x04
#define	PRT_MOD		0x08
#define	PRT_FMA		0x00	/* yes, this is not a mistake */
#define	PRT_MCE		0x10
#define	PRT_UE		0x20
#define	PRT_ALL		0x3F

int pr_types[PRT_ALL+1];

#define	PR_TYPES(pp)	{			\
	int whichtype = 0;			\
	if (pp->p_vnode)			\
		whichtype |= PRT_NAMED;		\
	if (PP_ISKAS(pp))			\
		whichtype |= PRT_KERNEL;	\
	if (PP_ISFREE(pp))			\
		whichtype |= PRT_FREE;		\
	if (hat_ismod(pp))			\
		whichtype |= PRT_MOD;		\
	if (pp->p_toxic & PR_UE)		\
		whichtype |= PRT_UE;		\
	if (pp->p_toxic & PR_MCE)		\
		whichtype |= PRT_MCE;		\
	pr_types[whichtype]++;			\
}

/*
 * DEBUG-only fault injection: MTBF(v, f) increments counter v and evaluates
 * to false once every (f + 1) calls (f is a power of two minus one), so the
 * failure paths below get exercised occasionally on DEBUG kernels.
 */
int recl_calls;
int recl_mtbf = 3;
int reloc_calls;
int reloc_mtbf = 7;
int pr_calls;
int pr_mtbf = 15;

#define	MTBF(v, f)	(((++(v)) & (f)) != (f))

#else	/* DEBUG */

#define	PR_DEBUG(foo)	/* nothing */
#define	PR_TYPES(foo)	/* nothing */
#define	MTBF(v, f)	(1)

#endif	/* DEBUG */

/*
 * page_retire_done() - completion processing
 *
 * Used by the page_retire code for common completion processing.
 * It keeps track of how many times a given result has happened,
 * and writes out an occasional message.
 *
 * May be called with a NULL pp (PRD_INVALID_PA case).
 */
#define	PRD_INVALID_KEY		-1
#define	PRD_SUCCESS		0
#define	PRD_PENDING		1
#define	PRD_FAILED		2
#define	PRD_DUPLICATE		3
#define	PRD_INVALID_PA		4
#define	PRD_LIMIT		5
#define	PRD_UE_SCRUBBED		6
#define	PRD_UNR_SUCCESS		7
#define	PRD_UNR_CANTLOCK	8
#define	PRD_UNR_NOT		9

typedef struct page_retire_op {
	int	pr_key;		/* one of the PRD_* defines from above */
	int	pr_count;	/* How many times this has happened */
	int	pr_retval;	/* return value */
	int	pr_msglvl;	/* message level - when to print */
	char	*pr_message;	/* Cryptic message for field service */
} page_retire_op_t;

static page_retire_op_t page_retire_ops[] = {
	/* key			count	retval	msglvl	message */
	{PRD_SUCCESS,		0,	0,	1,
		"Page 0x%08x.%08x removed from service"},
	{PRD_PENDING,		0,	EAGAIN,	2,
		"Page 0x%08x.%08x will be retired on free"},
	{PRD_FAILED,		0,	EAGAIN,	0, NULL},
	{PRD_DUPLICATE,		0,	EIO,	2,
		"Page 0x%08x.%08x already retired or pending"},
	{PRD_INVALID_PA,	0,	EINVAL,	2,
		"PA 0x%08x.%08x is not a relocatable page"},
	{PRD_LIMIT,		0,	0,	1,
		"Page 0x%08x.%08x not retired due to limit exceeded"},
	{PRD_UE_SCRUBBED,	0,	0,	1,
		"Previously reported error on page 0x%08x.%08x cleared"},
	{PRD_UNR_SUCCESS,	0,	0,	1,
		"Page 0x%08x.%08x returned to service"},
	{PRD_UNR_CANTLOCK,	0,	EAGAIN,	2,
		"Page 0x%08x.%08x could not be unretired"},
	{PRD_UNR_NOT,		0,	EIO,	2,
		"Page 0x%08x.%08x is not retired"},
	{PRD_INVALID_KEY,	0,	0,	0, NULL} /* MUST BE LAST! */
};

/*
 * Print a message if page_retire_messages is true.
 */
#define	PR_MESSAGE(debuglvl, msglvl, msg, pa)			\
{								\
	uint64_t p = (uint64_t)pa;				\
	if (page_retire_messages >= msglvl && msg != NULL) {	\
		cmn_err(debuglvl, msg,				\
		    (uint32_t)(p >> 32), (uint32_t)p);		\
	}							\
}

/*
 * Note that multiple bits may be set in a single settoxic operation.
 * May be called without the page locked.
 */
void
page_settoxic(page_t *pp, uchar_t bits)
{
	atomic_or_8(&pp->p_toxic, bits);
}

/*
 * Note that multiple bits may be cleared in a single clrtoxic operation.
 * Must be called with the page exclusively locked to prevent races which
 * may attempt to retire a page without any toxic bits set.
 * Note that the PR_CAPTURE bit can be cleared without the exclusive lock
 * being held as there is a separate mutex which protects that bit.
 */
void
page_clrtoxic(page_t *pp, uchar_t bits)
{
	ASSERT((bits & PR_CAPTURE) || PAGE_EXCL(pp));
	atomic_and_8(&pp->p_toxic, ~bits);
}

/*
 * Prints any page retire messages to the user, and decides what
 * error code is appropriate for the condition reported.
 */
static int
page_retire_done(page_t *pp, int code)
{
	page_retire_op_t *prop;
	uint64_t	pa = 0;
	int		i;

	if (pp != NULL) {
		pa = mmu_ptob((uint64_t)pp->p_pagenum);
	}

	prop = NULL;
	for (i = 0; page_retire_ops[i].pr_key != PRD_INVALID_KEY; i++) {
		if (page_retire_ops[i].pr_key == code) {
			prop = &page_retire_ops[i];
			break;
		}
	}

#ifdef	DEBUG
	if (page_retire_ops[i].pr_key == PRD_INVALID_KEY) {
		cmn_err(CE_PANIC, "page_retire_done: Invalid opcode %d", code);
	}
#endif

	ASSERT(prop->pr_key == code);

	prop->pr_count++;

	PR_MESSAGE(CE_NOTE, prop->pr_msglvl, prop->pr_message, pa);
	if (pp != NULL) {
		page_settoxic(pp, PR_MSG);
	}

	return (prop->pr_retval);
}

/*
 * Act like page_destroy(), but instead of freeing the page, hash it onto
 * the retired_pages vnode, and mark it retired.
 *
 * For fun, we try to scrub the page until it's squeaky clean.
 * availrmem is adjusted here.
 */
static void
page_retire_destroy(page_t *pp)
{
	u_offset_t off = (u_offset_t)((uintptr_t)pp);

	ASSERT(PAGE_EXCL(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_szc == 0);
	ASSERT(!hat_page_is_mapped(pp));
	ASSERT(!pp->p_vnode);

	page_clr_all_props(pp);
	pagescrub(pp, 0, MMU_PAGESIZE);

	pp->p_next = NULL;
	pp->p_prev = NULL;
	if (page_hashin(pp, retired_pages, off, NULL) == 0) {
		cmn_err(CE_PANIC, "retired page %p hashin failed", (void *)pp);
	}

	page_settoxic(pp, PR_RETIRED);
	PR_INCR_KSTAT(pr_retired);

	if (pp->p_toxic & PR_FMA) {
		PR_INCR_KSTAT(pr_fma);
	} else if (pp->p_toxic & PR_UE) {
		PR_INCR_KSTAT(pr_ue);
	} else {
		PR_INCR_KSTAT(pr_mce);
	}

	mutex_enter(&freemem_lock);
	availrmem--;
	mutex_exit(&freemem_lock);

	page_unlock(pp);
}

/*
 * Check whether the number of pages which have been retired already exceeds
 * the maximum allowable percentage of memory which may be retired.
 *
 * Returns 1 if the limit has been exceeded.
 */
static int
page_retire_limit(void)
{
	if (PR_KSTAT_RETIRED_NOTUE >= (uint64_t)PAGE_RETIRE_LIMIT) {
		PR_INCR_KSTAT(pr_limit_exceeded);
		return (1);
	}

	return (0);
}

#define	MSG_DM	"Data Mismatch occurred at PA 0x%08x.%08x"		\
	"[ 0x%x != 0x%x ] while attempting to clear previously "	\
	"reported error; page removed from service"

#define	MSG_UE	"Uncorrectable Error occurred at PA 0x%08x.%08x while "	\
	"attempting to clear previously reported error; page removed "	\
	"from service"

/*
 * Attempt to clear a UE from a page.
 * Returns 1 if the error has been successfully cleared.
 */
static int
page_clear_transient_ue(page_t *pp)
{
	caddr_t		kaddr;
	uint8_t		rb, wb;
	uint64_t	pa;
	uint32_t	pa_hi, pa_lo;
	on_trap_data_t	otd;
	int		errors = 0;
	int		i;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_PR_REQ(pp));
	ASSERT(pp->p_szc == 0);
	ASSERT(!hat_page_is_mapped(pp));

	/*
	 * Clear the page and attempt to clear the UE.  If we trap
	 * on the next access to the page, we know the UE has recurred.
	 */
	pagescrub(pp, 0, PAGESIZE);

	/*
	 * Map the page and write a bunch of bit patterns to compare
	 * what we wrote with what we read back.  This isn't a perfect
	 * test but it should be good enough to catch most of the
	 * recurring UEs. If this fails to catch a recurrent UE, we'll
	 * retire the page the next time we see a UE on the page.
	 */
	kaddr = ppmapin(pp, PROT_READ|PROT_WRITE, (caddr_t)-1);

	pa = ptob((uint64_t)page_pptonum(pp));
	pa_hi = (uint32_t)(pa >> 32);
	pa_lo = (uint32_t)pa;

	/*
	 * Disable preemption to prevent the off chance that
	 * we migrate while in the middle of running through
	 * the bit pattern and run on a different processor
	 * than what we started on.
	 */
	kpreempt_disable();

	/*
	 * Fill the page with each (0x00 - 0xFF] bit pattern, flushing
	 * the cache in between reading and writing.  We do this under
	 * on_trap() protection to avoid recursion.
	 */
	if (on_trap(&otd, OT_DATA_EC)) {
		PR_MESSAGE(CE_WARN, 1, MSG_UE, pa);
		errors = 1;
	} else {
		for (wb = 0xff; wb > 0; wb--) {
			for (i = 0; i < PAGESIZE; i++) {
				kaddr[i] = wb;
			}

			sync_data_memory(kaddr, PAGESIZE);

			for (i = 0; i < PAGESIZE; i++) {
				rb = kaddr[i];
				if (rb != wb) {
					/*
					 * We had a mismatch without a trap.
					 * Uh-oh. Something is really wrong
					 * with this system.
					 */
					if (page_retire_messages) {
						cmn_err(CE_WARN, MSG_DM,
						    pa_hi, pa_lo, rb, wb);
					}
					errors = 1;
					goto out;	/* double break */
				}
			}
		}
	}
out:
	no_trap();
	kpreempt_enable();
	ppmapout(kaddr);

	return (errors ? 0 : 1);
}

/*
 * Try to clear a page_t with a single UE. If the UE was transient, it is
 * returned to service, and we return 1. Otherwise we return 0 meaning
 * that further processing is required to retire the page.
 */
static int
page_retire_transient_ue(page_t *pp)
{
	ASSERT(PAGE_EXCL(pp));
	ASSERT(!hat_page_is_mapped(pp));

	/*
	 * If this page is a repeat offender, retire him under the
	 * "two strikes and you're out" rule.  The caller is responsible
	 * for scrubbing the page to try to clear the error.
	 */
	if (pp->p_toxic & PR_UE_SCRUBBED) {
		PR_INCR_KSTAT(pr_ue_persistent);
		return (0);
	}

	if (page_clear_transient_ue(pp)) {
		/*
		 * We set the PR_UE_SCRUBBED bit; if we ever see this
		 * page again, we will retire it, no questions asked.
		 */
		page_settoxic(pp, PR_UE_SCRUBBED);

		if (page_retire_first_ue) {
			PR_INCR_KSTAT(pr_ue_cleared_retire);
			return (0);
		} else {
			PR_INCR_KSTAT(pr_ue_cleared_free);

			page_clrtoxic(pp, PR_UE | PR_MCE | PR_MSG);

			/* LINTED: CONSTCOND */
			VN_DISPOSE(pp, B_FREE, 1, kcred);
			return (1);
		}
	}

	PR_INCR_KSTAT(pr_ue_persistent);
	return (0);
}

/*
 * Update the statistics dynamically when our kstat is read.
 */
static int
page_retire_kstat_update(kstat_t *ksp, int rw)
{
	struct page_retire_kstat *pr;

	if (ksp == NULL)
		return (EINVAL);

	switch (rw) {

	case KSTAT_READ:
		pr = (struct page_retire_kstat *)ksp->ks_data;
		ASSERT(pr == &page_retire_kstat);
		pr->pr_limit.value.ui64 = PAGE_RETIRE_LIMIT;
		return (0);

	case KSTAT_WRITE:
		return (EACCES);

	default:
		return (EINVAL);
	}
	/*NOTREACHED*/
}

static int
pr_list_kstat_update(kstat_t *ksp, int rw)
{
	uint_t count;
	page_t *pp;
	kmutex_t *vphm;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	vphm = page_vnode_mutex(retired_pages);
	mutex_enter(vphm);
	/* Needs to be under a lock so that for loop will work right */
	if (retired_pages->v_pages == NULL) {
		mutex_exit(vphm);
		ksp->ks_ndata = 0;
		ksp->ks_data_size = 0;
		return (0);
	}

	count = 1;
	for (pp = retired_pages->v_pages->p_vpnext;
	    pp != retired_pages->v_pages; pp = pp->p_vpnext) {
		count++;
	}
	mutex_exit(vphm);

	ksp->ks_ndata = count;
	ksp->ks_data_size = count * 2 * sizeof (uint64_t);

	return (0);
}

/*
 * All spans will be pagesize and no coalescing will be done with the
 * list produced.
 */
static int
pr_list_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
	kmutex_t *vphm;
	page_t *pp;
	struct memunit {
		uint64_t address;
		uint64_t size;
	} *kspmem;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ksp->ks_snaptime = gethrtime();

	kspmem = (struct memunit *)buf;

	vphm = page_vnode_mutex(retired_pages);
	mutex_enter(vphm);
	pp = retired_pages->v_pages;
	if (((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size) ||
	    (pp == NULL)) {
		mutex_exit(vphm);
		return (0);
	}
	kspmem->address = ptob(pp->p_pagenum);
	kspmem->size = PAGESIZE;
	kspmem++;
	for (pp = pp->p_vpnext; pp != retired_pages->v_pages;
	    pp = pp->p_vpnext, kspmem++) {
		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
			break;
		kspmem->address = ptob(pp->p_pagenum);
		kspmem->size = PAGESIZE;
	}
	mutex_exit(vphm);

	return (0);
}

/*
 * page_retire_pend_count -- helper function for page_capture_thread,
 * returns the number of pages pending retirement.
 */
uint64_t
page_retire_pend_count(void)
{
	return (PR_KSTAT_PENDING);
}

uint64_t
page_retire_pend_kas_count(void)
{
	return (PR_KSTAT_PENDING_KAS);
}

void
page_retire_incr_pend_count(void *datap)
{
	PR_INCR_KSTAT(pr_pending);

	if ((datap == &kvp) || (datap == &zvp)) {
		PR_INCR_KSTAT(pr_pending_kas);
	}
}

void
page_retire_decr_pend_count(void *datap)
{
	PR_DECR_KSTAT(pr_pending);

	if ((datap == &kvp) || (datap == &zvp)) {
		PR_DECR_KSTAT(pr_pending_kas);
	}
}

/*
 * Initialize the page retire mechanism:
 *
 *	- Establish the correctable error retire limit.
 *	- Initialize locks.
 *	- Build the retired_pages vnode.
 *	- Set up the kstats.
 *	- Fire off the background thread.
 *	- Tell page_retire() it's OK to start retiring pages.
 */
void
page_retire_init(void)
{
	const fs_operation_def_t retired_vnodeops_template[] = {
		{ NULL, NULL }
	};
	struct vnodeops *vops;
	kstat_t *ksp;

	const uint_t page_retire_ndata =
	    sizeof (page_retire_kstat) / sizeof (kstat_named_t);

	ASSERT(page_retire_ksp == NULL);

	if (max_pages_retired_bps <= 0) {
		max_pages_retired_bps = MCE_BPT;
	}

	mutex_init(&pr_q_mutex, NULL, MUTEX_DEFAULT, NULL);

	retired_pages = vn_alloc(KM_SLEEP);
	if (vn_make_ops("retired_pages", retired_vnodeops_template, &vops)) {
		cmn_err(CE_PANIC,
		    "page_retired_init: can't make retired vnodeops");
	}
	vn_setops(retired_pages, vops);

	if ((page_retire_ksp = kstat_create("unix", 0, "page_retire",
	    "misc", KSTAT_TYPE_NAMED, page_retire_ndata,
	    KSTAT_FLAG_VIRTUAL)) == NULL) {
		cmn_err(CE_WARN, "kstat_create for page_retire failed");
	} else {
		page_retire_ksp->ks_data = (void *)&page_retire_kstat;
		page_retire_ksp->ks_update = page_retire_kstat_update;
		kstat_install(page_retire_ksp);
	}

	mutex_init(&pr_list_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
	ksp = kstat_create("unix", 0, "page_retire_list", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_update = pr_list_kstat_update;
		ksp->ks_snapshot = pr_list_kstat_snapshot;
		ksp->ks_lock = &pr_list_kstat_mutex;
		kstat_install(ksp);
	}
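
	/*
	 * Illustrative note (not part of the original code): the kstats
	 * created above are typically observable from userland with
	 * kstat(1M), e.g.
	 *
	 *	kstat -m unix -n page_retire
	 *	kstat -m unix -n page_retire_list
	 *
	 * The exact invocation and output format may vary by release.
	 */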

	memscrub_notify_func =
	    (void(*)(uint64_t))kobj_getsymvalue("memscrub_notify", 0);

	page_capture_register_callback(PC_RETIRE, -1, page_retire_pp_finish);
	pr_enable = 1;
}

/*
 * page_retire_hunt() callback for the retire thread.
 */
static void
page_retire_thread_cb(page_t *pp)
{
	PR_DEBUG(prd_tctop);
	if (!PP_ISKAS(pp) && page_trylock(pp, SE_EXCL)) {
		PR_DEBUG(prd_tclocked);
		page_unlock(pp);
	}
}

/*
 * Callback used by page_trycapture() to finish off retiring a page.
 * The page has already been cleaned and we've been given sole access
 * to it.
 * Always returns 0 to indicate that the callback succeeded, as the callback
 * never fails to finish retiring the given page.
 */
/*ARGSUSED*/
static int
page_retire_pp_finish(page_t *pp, void *notused, uint_t flags)
{
	int		toxic;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_iolock_state == 0);
	ASSERT(pp->p_szc == 0);

	toxic = pp->p_toxic;

	/*
	 * The problem page is locked, demoted, unmapped, not free,
	 * hashed out, and not COW or mlocked (whew!).
	 *
	 * Now we select our ammunition, take it around back, and shoot it.
	 */
	if (toxic & PR_UE) {
ue_error:
		if (page_retire_transient_ue(pp)) {
			PR_DEBUG(prd_uescrubbed);
			(void) page_retire_done(pp, PRD_UE_SCRUBBED);
		} else {
			PR_DEBUG(prd_uenotscrubbed);
			page_retire_destroy(pp);
			(void) page_retire_done(pp, PRD_SUCCESS);
		}
		return (0);
	} else if (toxic & PR_FMA) {
		PR_DEBUG(prd_fma);
		page_retire_destroy(pp);
		(void) page_retire_done(pp, PRD_SUCCESS);
		return (0);
	} else if (toxic & PR_MCE) {
		PR_DEBUG(prd_mce);
		page_retire_destroy(pp);
		(void) page_retire_done(pp, PRD_SUCCESS);
		return (0);
	}

	/*
	 * When page_retire_first_ue is set to zero and a UE occurs which is
	 * transient, it's possible that we clear some flags set by a second
	 * UE error on the page which occurs while the first is currently
	 * being handled and thus we need to handle the case where none of
	 * the above are set. In this instance, PR_UE_SCRUBBED should be set
	 * and thus we should execute the UE code above.
	 */
	if (toxic & PR_UE_SCRUBBED) {
		goto ue_error;
	}

	/*
	 * It's impossible to get here.
	 */
	panic("bad toxic flags 0x%x in page_retire_pp_finish\n", toxic);
	return (0);
}

/*
 * page_retire() - the front door used to retire a page.
 *
 * Ideally, page_retire() would instantly retire the requested page.
 * Unfortunately, some pages are locked or otherwise tied up and cannot be
 * retired right away.  We use the page capture logic to deal with this
 * situation as it will continuously try to retire the page in the background
 * if the first attempt fails.  Success is determined by looking to see
 * whether the page has been retired after the page_trycapture() attempt.
 *
 * Returns:
 *
 *	- 0 on success,
 *	- EINVAL when the PA is whacko,
 *	- EIO if the page is already retired or already pending retirement, or
 *	- EAGAIN if the page could not be _immediately_ retired but is pending.
 */
int
page_retire(uint64_t pa, uchar_t reason)
{
	page_t	*pp;

	ASSERT(reason & PR_REASONS);		/* there must be a reason */
	ASSERT(!(reason & ~PR_REASONS));	/* but no other bits */

	pp = page_numtopp_nolock(mmu_btop(pa));
	if (pp == NULL) {
		PR_MESSAGE(CE_WARN, 1, "Cannot schedule clearing of error on"
		    " page 0x%08x.%08x; page is not relocatable memory", pa);
		return (page_retire_done(pp, PRD_INVALID_PA));
	}
	if (PP_RETIRED(pp)) {
		PR_DEBUG(prd_dup1);
		return (page_retire_done(pp, PRD_DUPLICATE));
	}

	if (memscrub_notify_func != NULL) {
		(void) memscrub_notify_func(pa);
	}

	if ((reason & PR_UE) && !PP_TOXIC(pp)) {
		PR_MESSAGE(CE_NOTE, 1, "Scheduling clearing of error on"
		    " page 0x%08x.%08x", pa);
	} else if (PP_PR_REQ(pp)) {
		PR_DEBUG(prd_dup2);
		return (page_retire_done(pp, PRD_DUPLICATE));
	} else {
		PR_MESSAGE(CE_NOTE, 1, "Scheduling removal of"
		    " page 0x%08x.%08x", pa);
	}

	/* Avoid setting toxic bits in the first place */
	if ((reason & (PR_FMA | PR_MCE)) && !(reason & PR_UE) &&
	    page_retire_limit()) {
		return (page_retire_done(pp, PRD_LIMIT));
	}

	if (MTBF(pr_calls, pr_mtbf)) {
		page_settoxic(pp, reason);
		if (page_trycapture(pp, 0, CAPTURE_RETIRE, pp->p_vnode) == 0) {
			PR_DEBUG(prd_prlocked);
		} else {
			PR_DEBUG(prd_prnotlocked);
		}
	} else {
		PR_DEBUG(prd_prnotlocked);
	}

	if (PP_RETIRED(pp)) {
		PR_DEBUG(prd_prretired);
		return (0);
	} else {
		cv_signal(&pc_cv);
		PR_INCR_KSTAT(pr_failed);

		if (pp->p_toxic & PR_MSG) {
			return (page_retire_done(pp, PRD_FAILED));
		} else {
			return (page_retire_done(pp, PRD_PENDING));
		}
	}
}

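/*
 * Illustrative sketch only, not an actual caller in this file: a consumer
 * such as the mmioctl_page_retire() ioctl path mentioned in the Big Theory
 * Statement would interpret page_retire()'s return value per the contract
 * documented above, roughly:
 *
 *	int err = page_retire(pa, PR_FMA);
 *
 *	if (err == 0)
 *		... the page was removed from service ...
 *	else if (err == EAGAIN)
 *		... retirement is pending and will complete later ...
 *	else if (err == EIO)
 *		... the page was already retired or already pending ...
 *	else if (err == EINVAL)
 *		... pa did not refer to relocatable memory ...
 */
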
/*
 * Take a retired page off the retired-pages vnode and clear the toxic flags.
 * If "flags" is PR_UNR_FREE, lock the page and put it back on the freelist.
 * Otherwise the page is unretired and handed back to the caller; in the
 * PR_UNR_CLEAN case the caller already holds the SE_EXCL lock.
 *
 * Any unretire messages are printed from this routine.
 *
 * Returns 0 if page pp was unretired; else an error code.
 *
 * If flags is:
 *	PR_UNR_FREE - lock the page, clear the toxic flags and free it
 *	    to the freelist.
 *	PR_UNR_TEMP - lock the page, unretire it, leave the toxic
 *	    bits set as is and return it to the caller.
 *	PR_UNR_CLEAN - page is SE_EXCL locked, unretire it, clear the
 *	    toxic flags and return it to caller as is.
 */
int
page_unretire_pp(page_t *pp, int flags)
{
	/*
	 * To be retired, a page has to be hashed onto the retired_pages vnode
	 * and have PR_RETIRED set in p_toxic.
	 */
	if (flags == PR_UNR_CLEAN ||
	    page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) {
		ASSERT(PAGE_EXCL(pp));
		PR_DEBUG(prd_ulocked);
		if (!PP_RETIRED(pp)) {
			PR_DEBUG(prd_unotretired);
			page_unlock(pp);
			return (page_retire_done(pp, PRD_UNR_NOT));
		}

		PR_MESSAGE(CE_NOTE, 1, "unretiring retired"
		    " page 0x%08x.%08x", mmu_ptob((uint64_t)pp->p_pagenum));
		if (pp->p_toxic & PR_FMA) {
			PR_DECR_KSTAT(pr_fma);
		} else if (pp->p_toxic & PR_UE) {
			PR_DECR_KSTAT(pr_ue);
		} else {
			PR_DECR_KSTAT(pr_mce);
		}

		if (flags == PR_UNR_TEMP)
			page_clrtoxic(pp, PR_RETIRED);
		else
			page_clrtoxic(pp, PR_TOXICFLAGS);

		if (flags == PR_UNR_FREE) {
			PR_DEBUG(prd_udestroy);
			page_destroy(pp, 0);
		} else {
			PR_DEBUG(prd_uhashout);
			page_hashout(pp, NULL);
		}

		mutex_enter(&freemem_lock);
		availrmem++;
		mutex_exit(&freemem_lock);

		PR_DEBUG(prd_uunretired);
		PR_DECR_KSTAT(pr_retired);
		PR_INCR_KSTAT(pr_unretired);
		return (page_retire_done(pp, PRD_UNR_SUCCESS));
	}
	PR_DEBUG(prd_unotlocked);
	return (page_retire_done(pp, PRD_UNR_CANTLOCK));
}

/*
 * Return a page to service by moving it from the retired_pages vnode
 * onto the freelist.
 *
 * Called from mmioctl_page_retire() on behalf of the FMA DE.
 *
 * Returns:
 *
 *	- 0 if the page is unretired,
 *	- EAGAIN if the pp cannot be locked,
 *	- EINVAL if the PA is whacko, and
 *	- EIO if the pp is not retired.
 */
int
page_unretire(uint64_t pa)
{
	page_t	*pp;

	pp = page_numtopp_nolock(mmu_btop(pa));
	if (pp == NULL) {
		return (page_retire_done(pp, PRD_INVALID_PA));
	}

	return (page_unretire_pp(pp, PR_UNR_FREE));
}

/*
 * Test a page to see if it is retired. If errors is non-NULL, the toxic
 * bits of the page are returned. Returns 0 on success, error code on failure.
 */
int
page_retire_check_pp(page_t *pp, uint64_t *errors)
{
	int rc;

	if (PP_RETIRED(pp)) {
		PR_DEBUG(prd_checkhit);
		rc = 0;
	} else if (PP_PR_REQ(pp)) {
		PR_DEBUG(prd_checkmiss_pend);
		rc = EAGAIN;
	} else {
		PR_DEBUG(prd_checkmiss_noerr);
		rc = EIO;
	}

	/*
	 * We have magically arranged the bit values returned to fmd(1M)
	 * to line up with the FMA, MCE, and UE bits of the page_t.
	 */
	if (errors) {
		uint64_t toxic = (uint64_t)(pp->p_toxic & PR_ERRMASK);
		if (toxic & PR_UE_SCRUBBED) {
			toxic &= ~PR_UE_SCRUBBED;
			toxic |= PR_UE;
		}
		*errors = toxic;
	}

	return (rc);
}

/*
 * Test to see if the page_t for a given PA is retired, and return the
 * hardware errors we have seen on the page if requested.
 *
 * Called from mmioctl_page_retire() on behalf of the FMA DE.
 *
 * Returns:
 *
 *	- 0 if the page is retired,
 *	- EIO if the page is not retired and has no errors,
 *	- EAGAIN if the page is not retired but is pending; and
 *	- EINVAL if the PA is whacko.
 */
int
page_retire_check(uint64_t pa, uint64_t *errors)
{
	page_t	*pp;

	if (errors) {
		*errors = 0;
	}

	pp = page_numtopp_nolock(mmu_btop(pa));
	if (pp == NULL) {
		return (page_retire_done(pp, PRD_INVALID_PA));
	}

	return (page_retire_check_pp(pp, errors));
}

/*
 * Page retire self-test. For now, it always returns 0.
 */
int
page_retire_test(void)
{
	page_t *first, *pp, *cpp, *cpp2, *lpp;

	/*
	 * Tests the corner case where a large page can't be retired
	 * because one of the constituent pages is locked. We mark
	 * one page to be retired and try to retire it, and mark the
	 * other page to be retired but don't try to retire it, so
	 * that page_unlock() in the failure path will recurse and try
	 * to retire THAT page. This is the worst possible situation
	 * we can get ourselves into.
	 */
	memsegs_lock(0);
	pp = first = page_first();
	do {
		if (pp->p_szc && PP_PAGEROOT(pp) == pp) {
			cpp = pp + 1;
			lpp = PP_ISFREE(pp)? pp : pp + 2;
			cpp2 = pp + 3;
			if (!page_trylock(lpp, pp == lpp? SE_EXCL : SE_SHARED))
				continue;
			if (!page_trylock(cpp, SE_EXCL)) {
				page_unlock(lpp);
				continue;
			}

			/* fails */
			(void) page_retire(ptob(cpp->p_pagenum), PR_FMA);

			page_unlock(lpp);
			page_unlock(cpp);
			(void) page_retire(ptob(cpp->p_pagenum), PR_FMA);
			(void) page_retire(ptob(cpp2->p_pagenum), PR_FMA);
		}
	} while ((pp = page_next(pp)) != first);
	memsegs_unlock(0);

	return (0);
}