/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cpu_pm.h>
#include <sys/cmn_err.h>
#include <sys/sdt.h>

/*
 * Solaris Event Based CPU Power Manager
 *
 * This file implements platform-independent, event-based CPU power management.
 * When CPUs are configured into the system, the CMT scheduling subsystem will
 * query the platform to determine if the CPU belongs to any power management
 * domains. That is, sets of CPUs that share power management states.
 *
 * Active Power Management domains represent a group of CPUs across which the
 * Operating System can request speed changes (which may in turn result
 * in voltage changes). This allows the operating system to trade off
 * performance for power savings.
 *
 * Idle Power Management domains can enter power savings states when they are
 * unutilized. These states allow the Operating System to trade off power
 * for performance (in the form of latency to transition from the idle state
 * to an active one).
 *
 * For each active and idle power domain the CMT subsystem instantiates, a
 * cpupm_domain_t structure is created. As the dispatcher schedules threads
 * to run on the system's CPUs, it will also track the utilization of the
 * enumerated power domains. Significant changes in utilization will result
 * in the dispatcher sending the power manager events that relate to the
 * utilization of the power domain. The power manager receives the events,
 * and in the context of the policy objectives in force, may decide to request
 * the domain's power/performance state be changed.
 *
 * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
 * manager will request the CPUs in the domain run at their fastest (and most
 * power consuming) state. When the domain becomes idle (utilization at zero),
 * the power manager will request that the CPUs run at a speed that saves the
 * most power.
 *
 * The advantage of this scheme is that the CPU power manager, working with
 * the dispatcher, can be extremely responsive to changes in utilization:
 * optimizing for performance when there is work to do, and for power savings
 * in the presence of idleness. Such close collaboration with the dispatcher
 * has other benefits that will play out in the form of more sophisticated
 * power / performance policy in the near future.
 *
 * Avoiding state thrashing in the presence of transient periods of utilization
 * and idleness while still being responsive to non-transient periods is key.
 * The power manager implements several "governors" that are used to throttle
 * state transitions when a significant amount of transient idle or transient
 * work is detected.
 *
 * Kernel background activity (e.g. taskq threads) is by far the most common
 * form of transient utilization. Ungoverned in the face of this utilization,
 * hundreds of state transitions per second would result on an idle system.
 *
 * Transient idleness is common when a thread briefly yields the CPU to
 * wait for an event elsewhere in the system. Where the idle period is short
 * enough, the overhead associated with making the state transition doesn't
 * justify the power savings.
 */

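/*
 * Illustrative sketch only (the actual callers live in the dispatcher/CMT
 * and platform code): the interfaces below are typically used in roughly
 * the following sequence, where "cp" is a CPU being brought into a domain
 * and "dom" is its active power domain:
 *
 *	(void) cpupm_set_policy(CPUPM_POLICY_ELASTIC);
 *	dom = cpupm_domain_init(cp, CPUPM_DTYPE_ACTIVE);   (cpu_lock held)
 *	...
 *	cpupm_utilization_event(cp, gethrtime_unscaled(), dom,
 *	    CPUPM_DOM_BUSY_FROM_IDLE);
 */
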
static cpupm_domain_t *cpupm_domains = NULL;

/*
 * CPU power management is disabled until a policy is explicitly set.
 */
cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED;

/*
 * Periods of utilization lasting less than this time interval are
 * characterized as transient. State changes associated with transient work
 * are considered to be mispredicted. That is, it's not worth raising and
 * lowering power states where the utilization lasts for less than this
 * interval.
 */
hrtime_t cpupm_tw_predict_interval;

/*
 * Periods of idleness lasting less than this time interval are characterized
 * as transient. State changes associated with transient idle are considered
 * to be mispredicted. That is, it's not worth lowering and raising power
 * states where the idleness lasts for less than this interval.
 */
hrtime_t cpupm_ti_predict_interval;

/*
 * Number of mispredictions after which future transitions will be governed.
 */
int cpupm_mispredict_thresh = 2;

/*
 * Likewise, the number of mispredicted governed transitions after which the
 * governor will be removed.
 */
int cpupm_mispredict_gov_thresh = 10;

/*
 * The transient work and transient idle prediction intervals are initialized
 * to be some multiple of the measured amount of time it takes to transition
 * a power domain from the highest to the lowest power state, and back again.
 *
 * The default values of those multiples are specified here. Tuning them
 * higher will result in the transient work and transient idle governors
 * being used more aggressively, which limits the frequency of state
 * transitions at the expense of performance and power savings, respectively.
 */
#define	CPUPM_TI_GOV_DEFAULT_MULTIPLE 600
#define	CPUPM_TW_GOV_DEFAULT_MULTIPLE 25

/*
 * Number of high=>low=>high measurements performed, of which the average
 * is taken.
 */
#define	CPUPM_BENCHMARK_ITERS 5

int cpupm_ti_gov_multiple = CPUPM_TI_GOV_DEFAULT_MULTIPLE;
int cpupm_tw_gov_multiple = CPUPM_TW_GOV_DEFAULT_MULTIPLE;


static int	cpupm_governor_initialize(void);
static void	cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);

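/*
 * Return the current CPU power management policy.
 */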
cpupm_policy_t
cpupm_get_policy(void)
{
	return (cpupm_policy);
}

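/*
 * Set the system's CPU power management policy. Returns 0 on success, or
 * a negative value if the new policy could not be put into effect (e.g.
 * PAD could not be enabled, or the governor parameters could not be
 * initialized).
 */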
int
cpupm_set_policy(cpupm_policy_t new_policy)
{
	static int	gov_init = 0;
	int		result = 0;

	mutex_enter(&cpu_lock);
	if (new_policy == cpupm_policy) {
		mutex_exit(&cpu_lock);
		return (result);
	}

	/*
	 * Pausing CPUs causes a high priority thread to be scheduled
	 * on all other CPUs (besides the current one). This locks out
	 * other CPUs from making CPUPM state transitions.
	 */
	switch (new_policy) {
	case CPUPM_POLICY_DISABLED:
		pause_cpus(NULL);
		cpupm_policy = CPUPM_POLICY_DISABLED;
		start_cpus();

		result = cmt_pad_disable(PGHW_POW_ACTIVE);

		/*
		 * Once PAD has been enabled, it should always be possible
		 * to disable it.
		 */
		ASSERT(result == 0);

		/*
		 * Bring all the active power domains to the maximum
		 * performance state.
		 */
		cpupm_state_change_global(CPUPM_DTYPE_ACTIVE,
		    CPUPM_STATE_MAX_PERF);

		break;
	case CPUPM_POLICY_ELASTIC:

		result = cmt_pad_enable(PGHW_POW_ACTIVE);
		if (result < 0) {
			/*
			 * Failed to enable PAD across the active power
			 * domains, which may well be because none were
			 * enumerated.
			 */
			break;
		}

		pause_cpus(NULL);
		/*
		 * Attempt to initialize the governor parameters the first
		 * time through.
		 */
		if (gov_init == 0) {
			result = cpupm_governor_initialize();
			if (result == 0) {
				gov_init = 1;
			} else {
				/*
				 * Failed to initialize the governor parameters
				 */
				start_cpus();
				break;
			}
		}
		cpupm_policy = CPUPM_POLICY_ELASTIC;
		start_cpus();

		break;
	default:
		cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n",
		    new_policy);
		ASSERT(0);
		break;
	}
	mutex_exit(&cpu_lock);

	return (result);
}

/*
 * Look for an existing power domain
 */
static cpupm_domain_t *
cpupm_domain_find(id_t id, cpupm_dtype_t type)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	cpupm_domain_t *dom;

	dom = cpupm_domains;
	while (dom != NULL) {
		if (id == dom->cpd_id && type == dom->cpd_type)
			return (dom);
		dom = dom->cpd_next;
	}
	return (NULL);
}

/*
 * Create a new domain
 */
static cpupm_domain_t *
cpupm_domain_create(id_t id, cpupm_dtype_t type)
{
	cpupm_domain_t *dom;

	ASSERT(MUTEX_HELD(&cpu_lock));

	dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP);
	dom->cpd_id = id;
	dom->cpd_type = type;

	/* Link into the known domain list */
	dom->cpd_next = cpupm_domains;
	cpupm_domains = dom;

	return (dom);
}

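/*
 * Enumerate (or re-enumerate) the power states for the given domain on
 * behalf of cp, using the platform driver.
 */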
static void
cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom)
{
	/*
	 * In the event we're enumerating because the domain's state
	 * configuration has changed, toss any existing states.
	 */
	if (dom->cpd_nstates > 0) {
		kmem_free(dom->cpd_states,
		    sizeof (cpupm_state_t) * dom->cpd_nstates);
		dom->cpd_nstates = 0;
	}

	/*
	 * Query to determine the number of states, allocate storage
	 * large enough to hold the state information, and pass it back
	 * to the platform driver to complete the enumeration.
	 */
	dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL);

	if (dom->cpd_nstates == 0)
		return;

	dom->cpd_states =
	    kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP);
	(void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states);
}

/*
 * Initialize the specified type of power domain on behalf of the CPU
 */
cpupm_domain_t *
cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type)
{
	cpupm_domain_t	*dom;
	id_t		did;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Instantiate the domain if it doesn't already exist
	 * and enumerate its power states.
	 */
	did = cpupm_domain_id(cp, type);
	dom = cpupm_domain_find(did, type);
	if (dom == NULL) {
		dom = cpupm_domain_create(did, type);
		cpupm_domain_state_enum(cp, dom);
	}

	/*
	 * Named state initialization
	 */
	if (type == CPUPM_DTYPE_ACTIVE) {
		/*
		 * For active power domains, the highest performance
		 * state is defined as the first state returned from
		 * the domain enumeration.
		 */
		dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
		    &dom->cpd_states[0];
		dom->cpd_named_states[CPUPM_STATE_LOW_POWER] =
		    &dom->cpd_states[dom->cpd_nstates - 1];

		/*
		 * Begin by assuming CPU is running at the max perf state.
		 */
		dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
	}

	return (dom);
}

/*
 * Return the id associated with the given type of domain
 * to which cp belongs
 */
id_t
cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type)
{
	return (cpupm_plat_domain_id(cp, type));
}

/*
 * Initiate a state change for the specified domain on behalf of cp
 */
int
cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state)
{
	if (cpupm_plat_change_state(cp, state) < 0)
		return (-1);

	DTRACE_PROBE2(cpupm__change__state,
	    cpupm_domain_t *, dom,
	    cpupm_state_t *, state);

	dom->cpd_state = state;
	return (0);
}

/*
 * Interface into the CPU power manager to indicate a significant change
 * in utilization of the specified active power domain
 */
void
cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom,
			    cpupm_util_event_t event)
{
	cpupm_state_t	*new_state = NULL;
	hrtime_t	last;

	if (cpupm_policy == CPUPM_POLICY_DISABLED) {
		return;
	}

	/*
	 * What follows is a simple elastic power state management policy.
	 *
	 * If the utilization has become non-zero, and the domain was
	 * previously at its lowest power state, then transition it
	 * to the highest state in the spirit of "race to idle".
	 *
	 * If the utilization has dropped to zero, then transition the
	 * domain to its lowest power state.
	 *
	 * Statistics are maintained to implement governors to reduce state
	 * transitions resulting from either transient work, or periods of
	 * transient idleness on the domain.
	 */
	switch (event) {
	case CPUPM_DOM_REMAIN_BUSY:

		/*
		 * We've received an event that the domain is running a thread
		 * that's made it to the end of its time slice. If we are at
		 * low power, then raise it. If the transient work governor
		 * is engaged, then remove it.
		 */
		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
			new_state =
			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
			if (dom->cpd_tw_governed == B_TRUE) {
				dom->cpd_tw_governed = B_FALSE;
				dom->cpd_tw = 0;
			}
		}
		break;

	case CPUPM_DOM_BUSY_FROM_IDLE:
		last = dom->cpd_last_lower;
		dom->cpd_last_raise = now;

		DTRACE_PROBE3(cpupm__raise__req,
		    cpupm_domain_t *, dom,
		    hrtime_t, last,
		    hrtime_t, now);

		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {

			/*
			 * There's non-zero utilization, and the domain is
			 * running in the lower power state. Before we
			 * consider raising power, perform some book keeping
			 * for the transient idle governor.
			 */
			if (dom->cpd_ti_governed == B_FALSE) {
				if ((now - last) < cpupm_ti_predict_interval) {
					/*
					 * We're raising the domain power and
					 * we *just* lowered it. Consider
					 * this a mispredicted power state
					 * transition due to a transient
					 * idle period.
					 */
					if (++dom->cpd_ti >=
					    cpupm_mispredict_thresh) {
						/*
						 * There's enough transient
						 * idle transitions to
						 * justify governing future
						 * lowering requests.
						 */
						dom->cpd_ti_governed = B_TRUE;
						dom->cpd_ti = 0;
						DTRACE_PROBE1(
						    cpupm__ti__governed,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * We correctly predicted the last
					 * lowering.
					 */
					dom->cpd_ti = 0;
				}
			}
			if (dom->cpd_tw_governed == B_TRUE) {
				/*
				 * Raise requests are governed due to
				 * transient work.
				 */
				DTRACE_PROBE1(cpupm__raise__governed,
				    cpupm_domain_t *, dom);

				/*
				 * It's likely that we'll be governed for a
				 * while. If the transient idle governor is
				 * also in place, examine the preceding idle
				 * interval to see if that still makes sense.
				 */
				if (dom->cpd_ti_governed == B_TRUE &&
				    ((now - last) >=
				    cpupm_ti_predict_interval)) {
					if (++dom->cpd_ti >=
					    cpupm_mispredict_gov_thresh) {
						dom->cpd_ti_governed =
						    B_FALSE;
						dom->cpd_ti = 0;
					}
				}
				return;
			}
			/*
			 * Prepare to transition to the higher power state
			 */
			new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];

		} else if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {

			/*
			 * Utilization is non-zero, and we're already running
			 * in the higher power state. Take this opportunity to
			 * perform some book keeping if the last lowering
			 * request was governed.
			 */
			if (dom->cpd_ti_governed == B_TRUE) {
				if ((now - last) >= cpupm_ti_predict_interval) {
					/*
					 * The domain is transient idle
					 * governed, and we mispredicted
					 * governing the last lowering request.
					 */
					if (++dom->cpd_ti >=
					    cpupm_mispredict_gov_thresh) {
						/*
						 * There's enough non-transient
						 * idle periods to justify
						 * removing the governor.
						 */
						dom->cpd_ti_governed = B_FALSE;
						dom->cpd_ti = 0;
						DTRACE_PROBE1(
						    cpupm__ti__ungoverned,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * Correctly predicted governing the
					 * last lowering request.
					 */
					dom->cpd_ti = 0;
				}
			}
		}
		break;

	case CPUPM_DOM_IDLE_FROM_BUSY:
		last = dom->cpd_last_raise;
		dom->cpd_last_lower = now;

		DTRACE_PROBE3(cpupm__lower__req,
		    cpupm_domain_t *, dom,
		    hrtime_t, last,
		    hrtime_t, now);

		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {

			/*
			 * The domain is idle, and is running in the highest
			 * performance state. Before we consider lowering power,
			 * perform some book keeping for the transient work
			 * governor.
			 */
			if (dom->cpd_tw_governed == B_FALSE) {
				if ((now - last) < cpupm_tw_predict_interval) {
					/*
					 * We're lowering the domain power and
					 * we *just* raised it. Consider the
					 * last raise mispredicted due to
					 * transient work.
					 */
					if (++dom->cpd_tw >=
					    cpupm_mispredict_thresh) {
						/*
						 * There's enough transient
						 * work transitions to justify
						 * governing future raising
						 * requests.
						 */
						dom->cpd_tw_governed = B_TRUE;
						dom->cpd_tw = 0;
						DTRACE_PROBE1(
						    cpupm__tw__governed,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * We correctly predicted during the
					 * last raise.
					 */
					dom->cpd_tw = 0;
				}
			}
			if (dom->cpd_ti_governed == B_TRUE) {
				/*
				 * Lowering requests are governed due to
				 * transient idleness.
				 */
				DTRACE_PROBE1(cpupm__lowering__governed,
				    cpupm_domain_t *, dom);

				/*
				 * It's likely that we'll be governed for a
				 * while. If the transient work governor is
				 * also in place, examine the preceding busy
				 * interval to see if that still makes sense.
				 */
				if (dom->cpd_tw_governed == B_TRUE &&
				    ((now - last) >=
				    cpupm_tw_predict_interval)) {
					if (++dom->cpd_tw >=
					    cpupm_mispredict_gov_thresh) {
						dom->cpd_tw_governed =
						    B_FALSE;
						dom->cpd_tw = 0;
					}
				}
				return;
			}

			/*
			 * Prepare to transition to a lower power state.
			 */
			new_state =
			    dom->cpd_named_states[CPUPM_STATE_LOW_POWER];

		} else if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {

			/*
			 * The domain is idle, and we're already running in
			 * the lower power state. Take this opportunity to
			 * perform some book keeping if the last raising
			 * request was governed.
			 */
			if (dom->cpd_tw_governed == B_TRUE) {
				if ((now - last) >= cpupm_tw_predict_interval) {
					/*
					 * The domain is transient work
					 * governed, and we mispredicted
					 * governing the last raising request.
					 */
					if (++dom->cpd_tw >=
					    cpupm_mispredict_gov_thresh) {
						/*
						 * There's enough non-transient
						 * work to justify removing
						 * the governor.
						 */
						dom->cpd_tw_governed = B_FALSE;
						dom->cpd_tw = 0;
						DTRACE_PROBE1(
						    cpupm__tw__ungoverned,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * We correctly predicted governing
					 * the last raise.
					 */
					dom->cpd_tw = 0;
				}
			}
		}
		break;
	}
	/*
	 * Change the power state.
	 * Not much is currently done if this doesn't succeed.
	 */
	if (new_state)
		(void) cpupm_change_state(cp, dom, new_state);
}


/*
 * Interface called by platforms to dynamically change the
 * MAX performance cpupm state
 */
void
cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
{
	cpupm_domain_t	*dom;
	id_t		did;
	cpupm_dtype_t	type = CPUPM_DTYPE_ACTIVE;
	boolean_t	change_state = B_FALSE;
	cpupm_state_t	*new_state = NULL;

	did = cpupm_domain_id(cp, type);
	mutex_enter(&cpu_lock);
	dom = cpupm_domain_find(did, type);
	mutex_exit(&cpu_lock);

	/*
	 * A lock could be used here to avoid changing the power state of the
	 * CPU while CPUPM_STATE_MAX_PERF is being changed. Since events that
	 * change MAX_PERF are infrequent, it may not be a good idea to
	 * overburden the path with locks. In the worst case, for one cycle
	 * the power may not get changed to the required level.
	 */
	if (dom != NULL) {
		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
			change_state = B_TRUE;
		}

		/*
		 * If an out of range level is passed, use the lowest supported
		 * speed.
		 */
		if (max_perf_level >= dom->cpd_nstates &&
		    dom->cpd_nstates > 1) {
			max_perf_level = dom->cpd_nstates - 1;
		}

		dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
		    &dom->cpd_states[max_perf_level];

		/*
		 * If the current state is MAX_PERF, change the current state
		 * to the new MAX_PERF
		 */
		if (change_state) {
			new_state =
			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
			if (new_state) {
				(void) cpupm_change_state(cp, dom, new_state);
			}
		}
	}
}

/*
 * Benchmark some power state transitions and use the transition latencies as
 * a basis for initializing parameters for the transient idle and transient
 * work governors.
 *
 * Returns 0 on success or -1 if the governor parameters could not be
 * initialized.
 */
static int
cpupm_governor_initialize(void)
{
	cpu_t		*cp = CPU;
	cpupm_domain_t	*dom;
	cpupm_state_t	*low, *high;
	id_t		did;
	hrtime_t	start, delta, deltas = 0;
	int		iterations;

	did = cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE);
	if (did == CPUPM_NO_DOMAIN)
		return (-1);

	dom = cpupm_domain_find(did, CPUPM_DTYPE_ACTIVE);
	if (dom == NULL)
		return (-1);

	low = dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
	high = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];

	for (iterations = 0; iterations < CPUPM_BENCHMARK_ITERS; iterations++) {

		/*
		 * Measure the amount of time it takes to transition the
		 * domain down to the lowest, and back to the highest power
		 * state.
		 */
		start = gethrtime_unscaled();
		(void) cpupm_change_state(cp, dom, low);
		(void) cpupm_change_state(cp, dom, high);
		delta = gethrtime_unscaled() - start;

		DTRACE_PROBE1(cpupm__benchmark__latency,
		    hrtime_t, delta);

		deltas += delta;
	}

	/*
	 * Figure the average latency, and tune the transient work and
	 * transient idle prediction intervals accordingly.
	 */
	delta = deltas / iterations;

	cpupm_ti_predict_interval = delta * cpupm_ti_gov_multiple;
	cpupm_tw_predict_interval = delta * cpupm_tw_gov_multiple;

	return (0);
}

/*
 * Initiate a state change in all CPUPM domain instances of the specified type
 */
static void
cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state)
{
	cpu_t		*cp;
	pg_cmt_t	*pwr_pg;
	cpupm_domain_t	*dom;
	group_t		*hwset;
	group_iter_t	giter;
	pg_cpu_itr_t	cpu_iter;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	switch (type) {
	case CPUPM_DTYPE_ACTIVE:
		hw = PGHW_POW_ACTIVE;
		break;
	default:
		/*
		 * Power domain types other than "active" unsupported.
		 */
		ASSERT(type == CPUPM_DTYPE_ACTIVE);
		return;
	}

	if ((hwset = pghw_set_lookup(hw)) == NULL)
		return;

	/*
	 * Iterate over the power domains
	 */
	group_iter_init(&giter);
	while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) {

		dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle;

		/*
		 * Iterate over the CPUs in each domain
		 */
		PG_CPU_ITR_INIT(pwr_pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			(void) cpupm_change_state(cp, dom,
			    dom->cpd_named_states[state]);
		}
	}
}