xref: /onnv-gate/usr/src/cmd/fm/modules/common/cpumem-retire/cma_cpu.c (revision 8221:28cd31f237ad)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
51772Sjl139090  * Common Development and Distribution License (the "License").
61772Sjl139090  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
226111Scy152378  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate #include <cma.h>
270Sstevel@tonic-gate 
280Sstevel@tonic-gate #include <fcntl.h>
290Sstevel@tonic-gate #include <unistd.h>
300Sstevel@tonic-gate #include <strings.h>
310Sstevel@tonic-gate #include <errno.h>
320Sstevel@tonic-gate #include <time.h>
330Sstevel@tonic-gate #include <fm/fmd_api.h>
347532SSean.Ye@Sun.COM #include <fm/fmd_agent.h>
350Sstevel@tonic-gate #include <sys/fm/protocol.h>
360Sstevel@tonic-gate #include <sys/bl.h>
370Sstevel@tonic-gate #include <sys/processor.h>
380Sstevel@tonic-gate 
39*8221SSean.Ye@Sun.COM static int cpu_statechange(fmd_hdl_t *, nvlist_t *, nvlist_t *, const char *,
40*8221SSean.Ye@Sun.COM     uint32_t, boolean_t);
41*8221SSean.Ye@Sun.COM 
42*8221SSean.Ye@Sun.COM #ifndef opl
437532SSean.Ye@Sun.COM /*
44*8221SSean.Ye@Sun.COM  * Perform retire/unretire by invoking the topo methods registered in the
45*8221SSean.Ye@Sun.COM  * hc-scheme resource.
46*8221SSean.Ye@Sun.COM  *
47*8221SSean.Ye@Sun.COM  * If the fault is found to be diagnosed under the old topology, the resource
48*8221SSean.Ye@Sun.COM  * will not exist in the current topology, then we fall back to legacy retire
49*8221SSean.Ye@Sun.COM  * (using the "cpu" scheme ASRU).
507532SSean.Ye@Sun.COM  */
517532SSean.Ye@Sun.COM 
527532SSean.Ye@Sun.COM static boolean_t
old_topo_fault(nvlist_t * nvl)537532SSean.Ye@Sun.COM old_topo_fault(nvlist_t *nvl)
547532SSean.Ye@Sun.COM {
55*8221SSean.Ye@Sun.COM 	nvlist_t *rsrc;
56*8221SSean.Ye@Sun.COM #ifdef i386
57*8221SSean.Ye@Sun.COM 	nvlist_t **hcl;
587532SSean.Ye@Sun.COM 	uint_t nhcl = 0;
597532SSean.Ye@Sun.COM 	char *name;
60*8221SSean.Ye@Sun.COM #endif
617532SSean.Ye@Sun.COM 
62*8221SSean.Ye@Sun.COM 	if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) != 0)
63*8221SSean.Ye@Sun.COM 		return (B_TRUE);
64*8221SSean.Ye@Sun.COM #ifdef i386
65*8221SSean.Ye@Sun.COM 	/*
66*8221SSean.Ye@Sun.COM 	 * x86 has moved from "motherboard/chip/cpu" topo to
67*8221SSean.Ye@Sun.COM 	 * "motherboard/chip/core/strand"
68*8221SSean.Ye@Sun.COM 	 */
69*8221SSean.Ye@Sun.COM 	if (nvlist_lookup_nvlist_array(rsrc, FM_FMRI_HC_LIST, &hcl, &nhcl)
707532SSean.Ye@Sun.COM 	    == 0 && nhcl == 3 &&
717532SSean.Ye@Sun.COM 	    nvlist_lookup_string(hcl[0], FM_FMRI_HC_NAME, &name) == 0 &&
727532SSean.Ye@Sun.COM 	    strcmp(name, "motherboard") == 0 &&
737532SSean.Ye@Sun.COM 	    nvlist_lookup_string(hcl[1], FM_FMRI_HC_NAME, &name) == 0 &&
747532SSean.Ye@Sun.COM 	    strcmp(name, "chip") == 0 &&
757532SSean.Ye@Sun.COM 	    nvlist_lookup_string(hcl[2], FM_FMRI_HC_NAME, &name) == 0 &&
767532SSean.Ye@Sun.COM 	    strcmp(name, "cpu") == 0)
77*8221SSean.Ye@Sun.COM 		return (B_TRUE);
78*8221SSean.Ye@Sun.COM #endif
797532SSean.Ye@Sun.COM 
80*8221SSean.Ye@Sun.COM 	return (B_FALSE);
817532SSean.Ye@Sun.COM }
827532SSean.Ye@Sun.COM 
837532SSean.Ye@Sun.COM /* ARGSUSED */
847532SSean.Ye@Sun.COM int
cma_cpu_hc_retire(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,const char * uuid,boolean_t repair)857532SSean.Ye@Sun.COM cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
867532SSean.Ye@Sun.COM     const char *uuid, boolean_t repair)
877532SSean.Ye@Sun.COM {
88*8221SSean.Ye@Sun.COM 	int i, err;
897532SSean.Ye@Sun.COM 	int rc = CMA_RA_SUCCESS;
907532SSean.Ye@Sun.COM 	nvlist_t *rsrc;
917532SSean.Ye@Sun.COM 
927532SSean.Ye@Sun.COM 	/*
937532SSean.Ye@Sun.COM 	 * For the cached faults which were diagnosed under the old
94*8221SSean.Ye@Sun.COM 	 * topology,  we fall back to retire by using cpu-scheme ASRUs.
95*8221SSean.Ye@Sun.COM 	 * Under xVM Dom0, since logic cpuid in "cpu" scheme ASRU makes no
96*8221SSean.Ye@Sun.COM 	 * sense, the fault should be ignored.
977532SSean.Ye@Sun.COM 	 */
987532SSean.Ye@Sun.COM 	if (old_topo_fault(nvl)) {
99*8221SSean.Ye@Sun.COM #ifdef i386
100*8221SSean.Ye@Sun.COM 		if (! cma_is_native)
101*8221SSean.Ye@Sun.COM 			return (CMA_RA_FAILURE);
102*8221SSean.Ye@Sun.COM #endif
103*8221SSean.Ye@Sun.COM 		return (cma_cpu_cpu_retire(hdl, nvl, asru, uuid, repair));
1047532SSean.Ye@Sun.COM 	}
1057532SSean.Ye@Sun.COM 
1067532SSean.Ye@Sun.COM 	/*
1077532SSean.Ye@Sun.COM 	 * Lookup the resource and call its topo methods to do retire/unretire
1087532SSean.Ye@Sun.COM 	 */
1097532SSean.Ye@Sun.COM 	if ((! repair && ! cma.cma_cpu_dooffline) ||
1107532SSean.Ye@Sun.COM 	    (repair && ! cma.cma_cpu_doonline)) {
1117532SSean.Ye@Sun.COM 		fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
1127532SSean.Ye@Sun.COM 		    repair ? "unretire" : "retire");
1137532SSean.Ye@Sun.COM 		cma_stats.cpu_supp.fmds_value.ui64++;
1147532SSean.Ye@Sun.COM 	} else {
1157532SSean.Ye@Sun.COM 		err = FMD_AGENT_RETIRE_FAIL;
1167532SSean.Ye@Sun.COM 		if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) {
117*8221SSean.Ye@Sun.COM 			if (repair) {
118*8221SSean.Ye@Sun.COM 				err = fmd_nvl_fmri_unretire(hdl, rsrc);
119*8221SSean.Ye@Sun.COM 			} else {
120*8221SSean.Ye@Sun.COM 				for (i = 0; i < cma.cma_cpu_tries; i++) {
121*8221SSean.Ye@Sun.COM 					err = fmd_nvl_fmri_retire(hdl, rsrc);
122*8221SSean.Ye@Sun.COM 					if (err == FMD_AGENT_RETIRE_DONE)
123*8221SSean.Ye@Sun.COM 						break;
124*8221SSean.Ye@Sun.COM 					(void) nanosleep(&cma.cma_cpu_delay,
125*8221SSean.Ye@Sun.COM 					    NULL);
126*8221SSean.Ye@Sun.COM 				}
127*8221SSean.Ye@Sun.COM 			}
1287532SSean.Ye@Sun.COM 		}
1297532SSean.Ye@Sun.COM 		if (err == FMD_AGENT_RETIRE_DONE) {
1307532SSean.Ye@Sun.COM 			if (repair)
1317532SSean.Ye@Sun.COM 				cma_stats.cpu_repairs.fmds_value.ui64++;
1327532SSean.Ye@Sun.COM 			else
1337532SSean.Ye@Sun.COM 				cma_stats.cpu_flts.fmds_value.ui64++;
1347532SSean.Ye@Sun.COM 		} else {
1357532SSean.Ye@Sun.COM 			rc = CMA_RA_FAILURE;
1367532SSean.Ye@Sun.COM 			cma_stats.bad_flts.fmds_value.ui64++;
137*8221SSean.Ye@Sun.COM #ifdef sun4v
138*8221SSean.Ye@Sun.COM 			/* libldom requests are processed asynchronously */
139*8221SSean.Ye@Sun.COM 			cma_cpu_start_retry(hdl, nvl, uuid, repair);
140*8221SSean.Ye@Sun.COM #endif
1417532SSean.Ye@Sun.COM 		}
1427532SSean.Ye@Sun.COM 	}
1437532SSean.Ye@Sun.COM 
1447532SSean.Ye@Sun.COM 	if ((! repair && ! cma.cma_cpu_doblacklist) ||
1457532SSean.Ye@Sun.COM 	    (repair && ! cma.cma_cpu_dounblacklist)) {
1467532SSean.Ye@Sun.COM 		fmd_hdl_debug(hdl, "suppressed %s of CPU\n",
1477532SSean.Ye@Sun.COM 		    repair ? "unblacklist" : "blacklist");
1487532SSean.Ye@Sun.COM 		cma_stats.cpu_blsupp.fmds_value.ui64++;
1497532SSean.Ye@Sun.COM 	} else {
1507532SSean.Ye@Sun.COM 		if (cma_cpu_blacklist(hdl, nvl, asru, repair) < 0)
1517532SSean.Ye@Sun.COM 			cma_stats.cpu_blfails.fmds_value.ui64++;
1527532SSean.Ye@Sun.COM 	}
1537532SSean.Ye@Sun.COM 
1547532SSean.Ye@Sun.COM 	return (rc);
1557532SSean.Ye@Sun.COM }
156*8221SSean.Ye@Sun.COM 
157*8221SSean.Ye@Sun.COM #else /* opl */
158*8221SSean.Ye@Sun.COM 
159*8221SSean.Ye@Sun.COM /* ARGSUSED 4 */
160*8221SSean.Ye@Sun.COM int
cma_cpu_hc_retire(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,const char * uuid,boolean_t repair)161*8221SSean.Ye@Sun.COM cma_cpu_hc_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
162*8221SSean.Ye@Sun.COM     const char *uuid, boolean_t repair)
163*8221SSean.Ye@Sun.COM {
164*8221SSean.Ye@Sun.COM 	uint_t cpuid;
165*8221SSean.Ye@Sun.COM 	uint_t i, nprs;
166*8221SSean.Ye@Sun.COM 	nvlist_t **hc_prs = NULL, *hc_spec_nvl;
167*8221SSean.Ye@Sun.COM 
168*8221SSean.Ye@Sun.COM 	/* OPL has ASRU in "hc" scheme */
169*8221SSean.Ye@Sun.COM 	if (nvlist_lookup_nvlist(asru, FM_FMRI_HC_SPECIFIC,
170*8221SSean.Ye@Sun.COM 	    &hc_spec_nvl) != 0) {
171*8221SSean.Ye@Sun.COM 		cma_stats.bad_flts.fmds_value.ui64++;
172*8221SSean.Ye@Sun.COM 		fmd_hdl_debug(hdl,
173*8221SSean.Ye@Sun.COM 		    "cma_cpu_hc_retire lookup hc_spec_nvl failed\n");
174*8221SSean.Ye@Sun.COM 		return (CMA_RA_FAILURE);
175*8221SSean.Ye@Sun.COM 	}
176*8221SSean.Ye@Sun.COM 
177*8221SSean.Ye@Sun.COM 	if (nvlist_lookup_nvlist_array(hc_spec_nvl, FM_FMRI_HC_CPUIDS,
178*8221SSean.Ye@Sun.COM 	    &hc_prs, &nprs) != 0) {
179*8221SSean.Ye@Sun.COM 		cma_stats.bad_flts.fmds_value.ui64++;
180*8221SSean.Ye@Sun.COM 		fmd_hdl_debug(hdl,
181*8221SSean.Ye@Sun.COM 		    "cma_cpu_hc_retire lookup cpuid array failed\n");
182*8221SSean.Ye@Sun.COM 		return (CMA_RA_FAILURE);
183*8221SSean.Ye@Sun.COM 	}
184*8221SSean.Ye@Sun.COM 
185*8221SSean.Ye@Sun.COM 	for (i = 0; i < nprs; i++) {
186*8221SSean.Ye@Sun.COM 		if (nvlist_lookup_uint32(hc_prs[i],
187*8221SSean.Ye@Sun.COM 		    FM_FMRI_CPU_ID, &cpuid) != 0) {
188*8221SSean.Ye@Sun.COM 			cma_stats.bad_flts.fmds_value.ui64++;
189*8221SSean.Ye@Sun.COM 			return (CMA_RA_FAILURE);
190*8221SSean.Ye@Sun.COM 		}
191*8221SSean.Ye@Sun.COM 
192*8221SSean.Ye@Sun.COM 		if (cpu_statechange(hdl, nvl, hc_prs[i], uuid, cpuid, repair)
193*8221SSean.Ye@Sun.COM 		    != CMA_RA_SUCCESS) {
194*8221SSean.Ye@Sun.COM 			cma_stats.bad_flts.fmds_value.ui64++;
195*8221SSean.Ye@Sun.COM 			return (CMA_RA_FAILURE);
196*8221SSean.Ye@Sun.COM 		}
197*8221SSean.Ye@Sun.COM 	}
198*8221SSean.Ye@Sun.COM 
199*8221SSean.Ye@Sun.COM 	return (CMA_RA_SUCCESS);
200*8221SSean.Ye@Sun.COM }
201*8221SSean.Ye@Sun.COM #endif /* opl */
202*8221SSean.Ye@Sun.COM 
/*
 * The rest of this file uses ASRUs to do retire.  This is no longer the
 * preferred way, but it is still needed in some circumstances where
 * retire via topo methods cannot work, i.e.:
 *
 * 1) There are legacy platforms which don't have a full topology.
 * 2) The resources in the FMD cached faults may not be set, or may not
 *    exist in the up-to-date topology.
 */
2127532SSean.Ye@Sun.COM 
2136111Scy152378 /* ARGSUSED */
2140Sstevel@tonic-gate static int
cpu_online(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,const char * uuid,uint32_t cpuid)2156111Scy152378 cpu_online(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
2166111Scy152378     uint32_t cpuid)
2170Sstevel@tonic-gate {
2186111Scy152378 	int err = CMA_RA_SUCCESS;
2190Sstevel@tonic-gate 
2206111Scy152378 	if (cma.cma_cpu_doonline) {
2216111Scy152378 		err = cma_cpu_statechange(hdl, asru, uuid, P_ONLINE,
2226111Scy152378 		    B_TRUE);
2236111Scy152378 	} else {
2246111Scy152378 		fmd_hdl_debug(hdl, "suppressed online of CPU %u\n",
2256111Scy152378 		    cpuid);
2266111Scy152378 		cma_stats.cpu_supp.fmds_value.ui64++;
2270Sstevel@tonic-gate 	}
2280Sstevel@tonic-gate 
2296111Scy152378 	/* OPL performs the blacklist in the service processor */
2306111Scy152378 #ifndef opl
2316111Scy152378 	if (cma.cma_cpu_dounblacklist) {
2326111Scy152378 		if (cma_cpu_blacklist(hdl, nvl, asru, B_TRUE) < 0)
2336111Scy152378 			cma_stats.cpu_blfails.fmds_value.ui64++;
2346111Scy152378 	} else {
2356111Scy152378 		fmd_hdl_debug(hdl, "suppressed unblacklist of CPU %u\n", cpuid);
2366111Scy152378 		cma_stats.cpu_blsupp.fmds_value.ui64++;
2376111Scy152378 	}
2386111Scy152378 #endif /* opl */
2396111Scy152378 
2406111Scy152378 	return (err);
2416111Scy152378 }
2420Sstevel@tonic-gate 
2436111Scy152378 /* ARGSUSED */
2446111Scy152378 static int
cpu_offline(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,const char * uuid,uint32_t cpuid)2456111Scy152378 cpu_offline(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
2466111Scy152378     uint32_t cpuid)
2476111Scy152378 {
2486111Scy152378 	int err = CMA_RA_FAILURE;
2490Sstevel@tonic-gate 
2506111Scy152378 	if (cma.cma_cpu_dooffline) {
2516111Scy152378 		int cpustate = P_FAULTED;
2526111Scy152378 
2536111Scy152378 		if (cma.cma_cpu_forcedoffline)
2546111Scy152378 			cpustate |= P_FORCED;
2556111Scy152378 		err = cma_cpu_statechange(hdl, asru, uuid, cpustate,
2566111Scy152378 		    B_FALSE);
2576111Scy152378 	} else {
2586111Scy152378 		fmd_hdl_debug(hdl, "suppressed offline of CPU %u\n",
2596111Scy152378 		    cpuid);
2606111Scy152378 		cma_stats.cpu_supp.fmds_value.ui64++;
2610Sstevel@tonic-gate 	}
2620Sstevel@tonic-gate 
2636111Scy152378 	/* OPL performs the blacklist in the service processor */
2646111Scy152378 #ifndef opl
2656111Scy152378 	if (cma.cma_cpu_doblacklist) {
2666111Scy152378 		if (cma_cpu_blacklist(hdl, nvl, asru, B_FALSE) < 0)
2676111Scy152378 			cma_stats.cpu_blfails.fmds_value.ui64++;
2686111Scy152378 	} else {
2696111Scy152378 		fmd_hdl_debug(hdl, "suppressed blacklist of CPU %u\n",
2706111Scy152378 		    cpuid);
2716111Scy152378 		cma_stats.cpu_blsupp.fmds_value.ui64++;
2726111Scy152378 	}
2736111Scy152378 #endif /* opl */
2740Sstevel@tonic-gate 
2756111Scy152378 	return (err);
2766111Scy152378 }
2770Sstevel@tonic-gate 
2786111Scy152378 static int
cpu_statechange(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,const char * uuid,uint32_t cpuid,boolean_t repair)2796111Scy152378 cpu_statechange(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid,
2806111Scy152378     uint32_t cpuid, boolean_t repair)
2816111Scy152378 {
2826111Scy152378 	if (repair)
2836111Scy152378 		return (cpu_online(hdl, nvl, asru, uuid, cpuid));
2846111Scy152378 	else
2856111Scy152378 		return (cpu_offline(hdl, nvl, asru, uuid, cpuid));
2866111Scy152378 }
2870Sstevel@tonic-gate 
2886111Scy152378 const char *
p_online_state_fmt(int state)2896111Scy152378 p_online_state_fmt(int state)
2906111Scy152378 {
2916111Scy152378 	state &= ~P_FORCED;
2926111Scy152378 	switch (state) {
2936111Scy152378 	case P_OFFLINE:
2946111Scy152378 		return (PS_OFFLINE);
2956111Scy152378 	case P_ONLINE:
2966111Scy152378 		return (PS_ONLINE);
2976111Scy152378 	case P_FAULTED:
2986111Scy152378 		return (PS_FAULTED);
2996111Scy152378 	case P_POWEROFF:
3006111Scy152378 		return (PS_POWEROFF);
3016111Scy152378 	case P_NOINTR:
3026111Scy152378 		return (PS_NOINTR);
3036111Scy152378 	case P_SPARE:
3046111Scy152378 		return (PS_SPARE);
3056111Scy152378 	default:
3066111Scy152378 		return ("unknown");
3070Sstevel@tonic-gate 	}
3080Sstevel@tonic-gate }
3090Sstevel@tonic-gate 
3102112Sav145390 int
cma_cpu_cpu_retire(fmd_hdl_t * hdl,nvlist_t * nvl,nvlist_t * asru,const char * uuid,boolean_t repair)311*8221SSean.Ye@Sun.COM cma_cpu_cpu_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
312*8221SSean.Ye@Sun.COM     const char *uuid, boolean_t repair)
3130Sstevel@tonic-gate {
3146111Scy152378 	uint_t cpuid;
315962Stsien 
3160Sstevel@tonic-gate 	if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
3170Sstevel@tonic-gate 		fmd_hdl_debug(hdl, "cpu fault missing '%s'\n", FM_FMRI_CPU_ID);
3180Sstevel@tonic-gate 		cma_stats.bad_flts.fmds_value.ui64++;
3191772Sjl139090 		return (CMA_RA_FAILURE);
3200Sstevel@tonic-gate 	}
3210Sstevel@tonic-gate 
3226111Scy152378 	return (cpu_statechange(hdl, nvl, asru, uuid, cpuid, repair));
3236111Scy152378 }
324