/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cpu_pm.h>
#include <sys/cmn_err.h>
#include <sys/sdt.h>

/*
 * Solaris Event Based CPU Power Manager
 *
 * This file implements platform independent event based CPU power management.
 * When CPUs are configured into the system, the CMT scheduling subsystem will
 * query the platform to determine if the CPU belongs to any power management
 * domains. That is, sets of CPUs that share power management states.
 *
 * Active Power Management domains represent a group of CPUs across which the
 * Operating System can request speed changes (which may in turn result
 * in voltage changes). This allows the operating system to trade off
 * performance for power savings.
 *
 * Idle Power Management domains can enter power saving states when they are
 * unutilized. These states allow the Operating System to trade off power
 * for performance (in the form of latency to transition from the idle state
 * to an active one).
 *
 * For each active and idle power domain the CMT subsystem instantiates, a
 * cpupm_domain_t structure is created. As the dispatcher schedules threads
 * to run on the system's CPUs, it will also track the utilization of the
 * enumerated power domains. Significant changes in utilization will result
 * in the dispatcher sending the power manager events that relate to the
 * utilization of the power domain. The power manager receives the events,
 * and in the context of the policy objectives in force, may decide to request
 * that the domain's power/performance state be changed.
 *
 * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
 * manager will request that the CPUs in the domain run at their fastest (and
 * most power consuming) state. When the domain becomes idle (utilization at
 * zero), the power manager will request that the CPUs run at a speed that
 * saves the most power.
 *
 * The advantage of this scheme is that the CPU power manager, working with
 * the dispatcher, can be extremely responsive to changes in utilization:
 * optimizing for performance in the presence of utilization, and for power
 * savings in the presence of idleness. Such close collaboration with the
 * dispatcher has other benefits that will play out in the form of more
 * sophisticated power / performance policy in the near future.
 *
 * Avoiding state thrashing in the presence of transient periods of
 * utilization and idleness while still being responsive to non-transient
 * periods is key. The power manager implements several "governors" that are
 * used to throttle state transitions when a significant amount of transient
 * idle or transient work is detected.
 *
 * Kernel background activity (e.g. taskq threads) is by far the most common
 * form of transient utilization. Ungoverned in the face of this utilization,
 * hundreds of state transitions per second would result on an idle system.
 *
 * Transient idleness is common when a thread briefly yields the CPU to
 * wait for an event elsewhere in the system. Where the idle period is short
 * enough, the overhead associated with making the state transition doesn't
 * justify the power savings.
 */
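
/*
 * Editorial sketch of the expected flow through this file (illustrative
 * only, not an exhaustive list of callers): the PM framework selects a
 * policy, the CMT subsystem enumerates domains as CPUs are configured, and
 * the dispatcher delivers utilization events as load comes and goes.
 *
 *	(void) cpupm_set_policy(CPUPM_POLICY_ELASTIC);
 *
 *	dom = cpupm_domain_init(cp, CPUPM_DTYPE_ACTIVE);
 *
 *	cpupm_utilization_event(cp, gethrtime_unscaled(), dom,
 *	    CPUPM_DOM_BUSY_FROM_IDLE);
 *	...
 *	cpupm_utilization_event(cp, gethrtime_unscaled(), dom,
 *	    CPUPM_DOM_IDLE_FROM_BUSY);
 */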

static cpupm_domain_t *cpupm_domains = NULL;

/*
 * Uninitialized state of CPU power management is disabled
 */
cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED;

/*
 * Periods of utilization lasting less than this time interval are
 * characterized as transient. State changes associated with transient work
 * are considered to be mispredicted. That is, it's not worth raising and
 * lowering power states where the utilization lasts for less than this
 * interval.
 */
hrtime_t cpupm_tw_predict_interval;

/*
 * Periods of idleness lasting less than this time interval are characterized
 * as transient. State changes associated with transient idle are considered
 * to be mispredicted. That is, it's not worth lowering and raising power
 * states where the idleness lasts for less than this interval.
 */
hrtime_t cpupm_ti_predict_interval;

/*
 * Number of mispredictions after which future transitions will be governed.
 */
int cpupm_mispredict_thresh = 2;

/*
 * Likewise, the number of mispredicted governed transitions after which the
 * governor will be removed.
 */
int cpupm_mispredict_gov_thresh = 10;

/*
 * The transient work and transient idle prediction intervals are initialized
 * to be some multiple of the amount of time it takes to transition a power
 * domain from the highest to the lowest power state, and back again, which
 * is measured.
 *
 * The default values of those multiples are specified here. Tuning them
 * higher will result in the transient work and transient idle governors
 * being used more aggressively, which limits the frequency of state
 * transitions at the expense of performance and power savings, respectively.
 */
#define	CPUPM_TI_GOV_DEFAULT_MULTIPLE	600
#define	CPUPM_TW_GOV_DEFAULT_MULTIPLE	25

/*
 * Number of high=>low=>high measurements performed, of which the average
 * is taken.
 */
#define	CPUPM_BENCHMARK_ITERS	5

int cpupm_ti_gov_multiple = CPUPM_TI_GOV_DEFAULT_MULTIPLE;
int cpupm_tw_gov_multiple = CPUPM_TW_GOV_DEFAULT_MULTIPLE;
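
/*
 * The multiples above are plain global int tunables. On a Solaris-derived
 * kernel they could presumably be overridden at boot time, for example with
 * a (hypothetical, illustrative) /etc/system entry such as the one below,
 * which would make the transient idle governor engage twice as readily as
 * the default:
 *
 *	set cpupm_ti_gov_multiple = 1200
 */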

static int	cpupm_governor_initialize(void);
static void	cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);

cpupm_policy_t
cpupm_get_policy(void)
{
	return (cpupm_policy);
}

int
cpupm_set_policy(cpupm_policy_t new_policy)
{
	static int	gov_init = 0;
	int		result = 0;

	mutex_enter(&cpu_lock);
	if (new_policy == cpupm_policy) {
		mutex_exit(&cpu_lock);
		return (result);
	}

	/*
	 * Pausing CPUs causes a high priority thread to be scheduled
	 * on all other CPUs (besides the current one). This locks out
	 * other CPUs from making CPUPM state transitions.
	 */
	switch (new_policy) {
	case CPUPM_POLICY_DISABLED:
		pause_cpus(NULL);
		cpupm_policy = CPUPM_POLICY_DISABLED;
		start_cpus();

		result = cmt_pad_disable(PGHW_POW_ACTIVE);

		/*
		 * Once PAD has been enabled, it should always be possible
		 * to disable it.
		 */
		ASSERT(result == 0);

		/*
		 * Bring all the active power domains to the maximum
		 * performance state.
		 */
		cpupm_state_change_global(CPUPM_DTYPE_ACTIVE,
		    CPUPM_STATE_MAX_PERF);

		break;
	case CPUPM_POLICY_ELASTIC:

		result = cmt_pad_enable(PGHW_POW_ACTIVE);
		if (result < 0) {
			/*
			 * Failed to enable PAD across the active power
			 * domains, which may well be because none were
			 * enumerated.
			 */
			break;
		}

		pause_cpus(NULL);
		/*
		 * Attempt to initialize the governor parameters the first
		 * time through.
		 */
		if (gov_init == 0) {
			result = cpupm_governor_initialize();
			if (result == 0) {
				gov_init = 1;
			} else {
				/*
				 * Failed to initialize the governor
				 * parameters.
				 */
				start_cpus();
				break;
			}
		}
		cpupm_policy = CPUPM_POLICY_ELASTIC;
		start_cpus();

		break;
	default:
		cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n",
		    new_policy);
		ASSERT(0);
		break;
	}
	mutex_exit(&cpu_lock);

	return (result);
}
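
/*
 * Sketch of how a caller might switch policies and handle failure
 * (editorial example only; the actual policy selection path is platform and
 * PM framework specific):
 *
 *	if (cpupm_set_policy(CPUPM_POLICY_ELASTIC) < 0) {
 *		No active power domains were enumerated, or the governor
 *		parameters could not be initialized; the previous policy
 *		remains in effect.
 *	}
 */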

/*
 * Look for an existing power domain
 */
static cpupm_domain_t *
cpupm_domain_find(id_t id, cpupm_dtype_t type)
{
	cpupm_domain_t	*dom;

	ASSERT(MUTEX_HELD(&cpu_lock));

	dom = cpupm_domains;
	while (dom != NULL) {
		if (id == dom->cpd_id && type == dom->cpd_type)
			return (dom);
		dom = dom->cpd_next;
	}
	return (NULL);
}

/*
 * Create a new domain
 */
static cpupm_domain_t *
cpupm_domain_create(id_t id, cpupm_dtype_t type)
{
	cpupm_domain_t	*dom;

	ASSERT(MUTEX_HELD(&cpu_lock));

	dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP);
	dom->cpd_id = id;
	dom->cpd_type = type;

	/* Link into the known domain list */
	dom->cpd_next = cpupm_domains;
	cpupm_domains = dom;

	return (dom);
}

static void
cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom)
{
	/*
	 * In the event we're enumerating because the domain's state
	 * configuration has changed, toss any existing states.
	 */
	if (dom->cpd_nstates > 0) {
		kmem_free(dom->cpd_states,
		    sizeof (cpupm_state_t) * dom->cpd_nstates);
		dom->cpd_nstates = 0;
	}

	/*
	 * Query to determine the number of states, allocate storage
	 * large enough to hold the state information, and pass it back
	 * to the platform driver to complete the enumeration.
	 */
	dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL);

	if (dom->cpd_nstates == 0)
		return;

	dom->cpd_states =
	    kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP);
	(void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states);
}

/*
 * Initialize the specified type of power domain on behalf of the CPU
 */
cpupm_domain_t *
cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type)
{
	cpupm_domain_t	*dom;
	id_t		did;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Instantiate the domain if it doesn't already exist
	 * and enumerate its power states.
	 */
	did = cpupm_domain_id(cp, type);
	dom = cpupm_domain_find(did, type);
	if (dom == NULL) {
		dom = cpupm_domain_create(did, type);
		cpupm_domain_state_enum(cp, dom);
	}

	/*
	 * Named state initialization
	 */
	if (type == CPUPM_DTYPE_ACTIVE) {
		/*
		 * For active power domains, the highest performance
		 * state is defined as the first state returned from
		 * the domain enumeration, and the lowest power state
		 * as the last.
		 */
		dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
		    &dom->cpd_states[0];
		dom->cpd_named_states[CPUPM_STATE_LOW_POWER] =
		    &dom->cpd_states[dom->cpd_nstates - 1];

		/*
		 * Begin by assuming the CPU is running at the max perf state.
		 */
		dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
	}

	return (dom);
}

/*
 * Return the id associated with the given type of domain
 * to which cp belongs
 */
id_t
cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type)
{
	return (cpupm_plat_domain_id(cp, type));
}
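
/*
 * For example (illustrative numbers), if the platform enumerates three
 * active states ordered fastest to slowest, the named states above map as:
 *
 *	cpd_states[0]	-> CPUPM_STATE_MAX_PERF
 *	cpd_states[2]	-> CPUPM_STATE_LOW_POWER	(cpd_nstates - 1)
 */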

/*
 * Initiate a state change for the specified domain on behalf of cp
 */
int
cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state)
{
	if (cpupm_plat_change_state(cp, state) < 0)
		return (-1);

	DTRACE_PROBE2(cpupm__change__state,
	    cpupm_domain_t *, dom,
	    cpupm_state_t *, state);

	dom->cpd_state = state;
	return (0);
}
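
/*
 * The SDT probe above can be used to observe state transitions from
 * userland. As a rough sketch (double underscores in kernel SDT probe names
 * surface as dashes), something like the following should fire on each
 * successful transition, with the domain and state pointers as arguments:
 *
 *	dtrace -n 'sdt:::cpupm-change-state { trace(arg0); trace(arg1); }'
 */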

/*
 * Interface into the CPU power manager to indicate a significant change
 * in utilization of the specified active power domain
 */
void
cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom,
    cpupm_util_event_t event)
{
	cpupm_state_t	*new_state = NULL;
	hrtime_t	last;

	if (cpupm_policy == CPUPM_POLICY_DISABLED) {
		return;
	}

	/*
	 * What follows is a simple elastic power state management policy.
	 *
	 * If the utilization has become non-zero, and the domain was
	 * previously at its lowest power state, then transition it
	 * to the highest state in the spirit of "race to idle".
	 *
	 * If the utilization has dropped to zero, then transition the
	 * domain to its lowest power state.
	 *
	 * Statistics are maintained to implement governors to reduce state
	 * transitions resulting from either transient work, or periods of
	 * transient idleness on the domain.
	 */
	switch (event) {
	case CPUPM_DOM_REMAIN_BUSY:

		/*
		 * We've received an event that the domain is running a thread
		 * that's made it to the end of its time slice. If we are at
		 * low power, then raise it. If the transient work governor
		 * is engaged, then remove it.
		 */
		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
			new_state =
			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
			if (dom->cpd_tw_governed == B_TRUE) {
				dom->cpd_tw_governed = B_FALSE;
				dom->cpd_tw = 0;
			}
		}
		break;

	case CPUPM_DOM_BUSY_FROM_IDLE:
		last = dom->cpd_last_lower;
		dom->cpd_last_raise = now;

		DTRACE_PROBE3(cpupm__raise__req,
		    cpupm_domain_t *, dom,
		    hrtime_t, last,
		    hrtime_t, now);

		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {

			/*
			 * There's non-zero utilization, and the domain is
			 * running in the lower power state. Before we
			 * consider raising power, perform some bookkeeping
			 * for the transient idle governor.
			 */
			if (dom->cpd_ti_governed == B_FALSE) {
				if ((now - last) < cpupm_ti_predict_interval) {
					/*
					 * We're raising the domain power and
					 * we *just* lowered it. Consider
					 * this a mispredicted power state
					 * transition due to a transient
					 * idle period.
					 */
					if (++dom->cpd_ti >=
					    cpupm_mispredict_thresh) {
						/*
						 * There have been enough
						 * transient idle transitions
						 * to justify governing future
						 * lowering requests.
						 */
						dom->cpd_ti_governed = B_TRUE;
						dom->cpd_ti = 0;
						DTRACE_PROBE1(
						    cpupm__ti__governed,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * We correctly predicted the last
					 * lowering.
					 */
					dom->cpd_ti = 0;
				}
			}
			if (dom->cpd_tw_governed == B_TRUE) {
				/*
				 * Raise requests are governed due to
				 * transient work.
				 */
				DTRACE_PROBE1(cpupm__raise__governed,
				    cpupm_domain_t *, dom);

				/*
				 * It's likely that we'll be governed for a
				 * while. If the transient idle governor is
				 * also in place, examine the preceding idle
				 * interval to see if that still makes sense.
				 */
				if (dom->cpd_ti_governed == B_TRUE &&
				    ((now - last) >=
				    cpupm_ti_predict_interval)) {
					if (++dom->cpd_ti >=
					    cpupm_mispredict_gov_thresh) {
						dom->cpd_ti_governed =
						    B_FALSE;
						dom->cpd_ti = 0;
					}
				}
				return;
			}
			/*
			 * Prepare to transition to the higher power state
			 */
			new_state =
			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];

		} else if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {

			/*
			 * Utilization is non-zero, and we're already running
			 * in the higher power state. Take this opportunity to
			 * perform some bookkeeping if the last lowering
			 * request was governed.
			 */
			if (dom->cpd_ti_governed == B_TRUE) {
				if ((now - last) >=
				    cpupm_ti_predict_interval) {
					/*
					 * The domain is transient idle
					 * governed, and we mispredicted
					 * governing the last lowering request.
					 */
					if (++dom->cpd_ti >=
					    cpupm_mispredict_gov_thresh) {
						/*
						 * There have been enough
						 * non-transient idle periods
						 * to justify removing the
						 * governor.
						 */
						dom->cpd_ti_governed = B_FALSE;
						dom->cpd_ti = 0;
						DTRACE_PROBE1(
						    cpupm__ti__ungoverned,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * Correctly predicted governing the
					 * last lowering request.
					 */
					dom->cpd_ti = 0;
				}
			}
		}
		break;

	case CPUPM_DOM_IDLE_FROM_BUSY:
		last = dom->cpd_last_raise;
		dom->cpd_last_lower = now;

		DTRACE_PROBE3(cpupm__lower__req,
		    cpupm_domain_t *, dom,
		    hrtime_t, last,
		    hrtime_t, now);

		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {

			/*
			 * The domain is idle, and is running in the highest
			 * performance state. Before we consider lowering
			 * power, perform some bookkeeping for the transient
			 * work governor.
			 */
			if (dom->cpd_tw_governed == B_FALSE) {
				if ((now - last) < cpupm_tw_predict_interval) {
					/*
					 * We're lowering the domain power and
					 * we *just* raised it. Consider the
					 * last raise mispredicted due to
					 * transient work.
					 */
					if (++dom->cpd_tw >=
					    cpupm_mispredict_thresh) {
						/*
						 * There have been enough
						 * transient work transitions
						 * to justify governing future
						 * raise requests.
						 */
						dom->cpd_tw_governed = B_TRUE;
						dom->cpd_tw = 0;
						DTRACE_PROBE1(
						    cpupm__tw__governed,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * We correctly predicted during the
					 * last raise.
					 */
					dom->cpd_tw = 0;
				}
			}
			if (dom->cpd_ti_governed == B_TRUE) {
				/*
				 * Lowering requests are governed due to
				 * transient idleness.
				 */
				DTRACE_PROBE1(cpupm__lowering__governed,
				    cpupm_domain_t *, dom);

				/*
				 * It's likely that we'll be governed for a
				 * while. If the transient work governor is
				 * also in place, examine the preceding busy
				 * interval to see if that still makes sense.
				 */
				if (dom->cpd_tw_governed == B_TRUE &&
				    ((now - last) >=
				    cpupm_tw_predict_interval)) {
					if (++dom->cpd_tw >=
					    cpupm_mispredict_gov_thresh) {
						dom->cpd_tw_governed =
						    B_FALSE;
						dom->cpd_tw = 0;
					}
				}
				return;
			}

			/*
			 * Prepare to transition to a lower power state.
			 */
			new_state =
			    dom->cpd_named_states[CPUPM_STATE_LOW_POWER];

		} else if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {

			/*
			 * The domain is idle, and we're already running in
			 * the lower power state. Take this opportunity to
			 * perform some bookkeeping if the last raising
			 * request was governed.
			 */
			if (dom->cpd_tw_governed == B_TRUE) {
				if ((now - last) >=
				    cpupm_tw_predict_interval) {
					/*
					 * The domain is transient work
					 * governed, and we mispredicted
					 * governing the last raising request.
					 */
					if (++dom->cpd_tw >=
					    cpupm_mispredict_gov_thresh) {
						/*
						 * There has been enough
						 * non-transient work to
						 * justify removing the
						 * governor.
						 */
						dom->cpd_tw_governed = B_FALSE;
						dom->cpd_tw = 0;
						DTRACE_PROBE1(
						    cpupm__tw__ungoverned,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * We correctly predicted governing
					 * the last raise.
					 */
					dom->cpd_tw = 0;
				}
			}
		}
		break;
	}
	/*
	 * Change the power state.
	 * Not much is currently done if this doesn't succeed.
	 */
	if (new_state)
		(void) cpupm_change_state(cp, dom, new_state);
}
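
/*
 * Compact summary of the governor bookkeeping above (editorial restatement
 * of the logic in cpupm_utilization_event(), not additional behavior):
 *
 *	raise request, (now - last_lower) <  ti interval: count a transient
 *	    idle mispredict; after cpupm_mispredict_thresh of them, govern
 *	    future lowering requests.
 *	raise request, (now - last_lower) >= ti interval: reset the ti count,
 *	    or if already governed, count toward removing the ti governor.
 *
 *	lower request, (now - last_raise) <  tw interval: count a transient
 *	    work mispredict; after cpupm_mispredict_thresh of them, govern
 *	    future raise requests.
 *	lower request, (now - last_raise) >= tw interval: reset the tw count,
 *	    or if already governed, count toward removing the tw governor.
 */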

/*
 * Interface called by platforms to dynamically change the
 * MAX performance cpupm state
 */
void
cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
{
	cpupm_domain_t	*dom;
	id_t		did;
	cpupm_dtype_t	type = CPUPM_DTYPE_ACTIVE;
	boolean_t	change_state = B_FALSE;
	cpupm_state_t	*new_state = NULL;

	did = cpupm_domain_id(cp, type);
	mutex_enter(&cpu_lock);
	dom = cpupm_domain_find(did, type);
	mutex_exit(&cpu_lock);

	/*
	 * A lock could be used here to avoid changing the power state of the
	 * CPU while CPUPM_STATE_MAX_PERF is being redefined. Since events
	 * that change MAX_PERF occur infrequently, it may not be worth the
	 * locking overhead. In the worst case, the power may not be changed
	 * to the required level for one cycle.
	 */
	if (dom != NULL) {
		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
			change_state = B_TRUE;
		}

		/*
		 * If an out of range level is passed, use the lowest
		 * supported speed.
		 */
		if (max_perf_level >= dom->cpd_nstates &&
		    dom->cpd_nstates > 1) {
			max_perf_level = dom->cpd_nstates - 1;
		}

		dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
		    &dom->cpd_states[max_perf_level];

		/*
		 * If the current state is MAX_PERF, change the current state
		 * to the new MAX_PERF.
		 */
		if (change_state) {
			new_state =
			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
			if (new_state) {
				(void) cpupm_change_state(cp, dom, new_state);
			}
		}
	}
}

/*
 * Benchmark some power state transitions and use the transition latencies as
 * a basis for initializing parameters for the transient idle and transient
 * work governors.
 *
 * Returns 0 on success or -1 if the governor parameters could not be
 * initialized.
 */
static int
cpupm_governor_initialize(void)
{
	cpu_t		*cp = CPU;
	cpupm_domain_t	*dom;
	cpupm_state_t	*low, *high;
	id_t		did;
	hrtime_t	start, delta, deltas = 0;
	int		iterations;

	did = cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE);
	if (did == CPUPM_NO_DOMAIN)
		return (-1);

	dom = cpupm_domain_find(did, CPUPM_DTYPE_ACTIVE);
	if (dom == NULL)
		return (-1);

	low = dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
	high = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];

	for (iterations = 0; iterations < CPUPM_BENCHMARK_ITERS;
	    iterations++) {

		/*
		 * Measure the amount of time it takes to transition the
		 * domain down to the lowest, and back to the highest power
		 * state.
		 */
		start = gethrtime_unscaled();
		(void) cpupm_change_state(cp, dom, low);
		(void) cpupm_change_state(cp, dom, high);
		delta = gethrtime_unscaled() - start;

		DTRACE_PROBE1(cpupm__benchmark__latency,
		    hrtime_t, delta);

		deltas += delta;
	}

	/*
	 * Figure the average latency, and tune the transient work and
	 * transient idle prediction intervals accordingly.
	 */
	delta = deltas / iterations;

	cpupm_ti_predict_interval = delta * cpupm_ti_gov_multiple;
	cpupm_tw_predict_interval = delta * cpupm_tw_gov_multiple;

	return (0);
}
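
/*
 * Worked example with made-up numbers: if the five benchmark iterations
 * average a high=>low=>high round trip equivalent to roughly 40us of
 * unscaled hrtime, then with the default multiples idle periods shorter
 * than about 600 * 40us = 24ms are treated as transient idle, and busy
 * periods shorter than about 25 * 40us = 1ms are treated as transient work.
 */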

/*
 * Initiate a state change in all CPUPM domain instances of the specified type
 */
static void
cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state)
{
	cpu_t		*cp;
	pg_cmt_t	*pwr_pg;
	cpupm_domain_t	*dom;
	group_t		*hwset;
	group_iter_t	giter;
	pg_cpu_itr_t	cpu_iter;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	switch (type) {
	case CPUPM_DTYPE_ACTIVE:
		hw = PGHW_POW_ACTIVE;
		break;
	default:
		/*
		 * Power domain types other than "active" are unsupported.
		 */
		ASSERT(type == CPUPM_DTYPE_ACTIVE);
		return;
	}

	if ((hwset = pghw_set_lookup(hw)) == NULL)
		return;

	/*
	 * Iterate over the power domains
	 */
	group_iter_init(&giter);
	while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) {

		dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle;

		/*
		 * Iterate over the CPUs in each domain
		 */
		PG_CPU_ITR_INIT(pwr_pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			(void) cpupm_change_state(cp, dom,
			    dom->cpd_named_states[state]);
		}
	}
}