xref: /netbsd-src/sys/arch/x86/acpi/acpi_cpu_md.c (revision a5847cc334d9a7029f6352b847e9e8d71a0f9e0c)
1 /* $NetBSD: acpi_cpu_md.c,v 1.68 2011/10/18 05:08:24 jruoho Exp $ */
2 
3 /*-
4  * Copyright (c) 2010, 2011 Jukka Ruohonen <jruohonen@iki.fi>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 #include <sys/cdefs.h>
30 __KERNEL_RCSID(0, "$NetBSD: acpi_cpu_md.c,v 1.68 2011/10/18 05:08:24 jruoho Exp $");
31 
32 #include <sys/param.h>
33 #include <sys/bus.h>
34 #include <sys/cpufreq.h>
35 #include <sys/device.h>
36 #include <sys/kcore.h>
37 #include <sys/sysctl.h>
38 #include <sys/xcall.h>
39 
40 #include <x86/cpu.h>
41 #include <x86/cpufunc.h>
42 #include <x86/cputypes.h>
43 #include <x86/cpuvar.h>
44 #include <x86/cpu_msr.h>
45 #include <x86/machdep.h>
46 
47 #include <dev/acpi/acpica.h>
48 #include <dev/acpi/acpi_cpu.h>
49 
50 #include <dev/pci/pcivar.h>
51 #include <dev/pci/pcidevs.h>
52 
53 #include <machine/acpi_machdep.h>
54 
55 /*
56  * Intel IA32_MISC_ENABLE.
57  */
58 #define MSR_MISC_ENABLE_EST	__BIT(16)
59 #define MSR_MISC_ENABLE_TURBO	__BIT(38)
60 
61 /*
62  * AMD C1E.
63  */
64 #define MSR_CMPHALT		0xc0010055
65 
66 #define MSR_CMPHALT_SMI		__BIT(27)
67 #define MSR_CMPHALT_C1E		__BIT(28)
68 #define MSR_CMPHALT_BMSTS	__BIT(29)
69 
70 /*
71  * AMD families 10h, 11h, and 14h.
72  */
73 #define MSR_10H_LIMIT		0xc0010061
74 #define MSR_10H_CONTROL		0xc0010062
75 #define MSR_10H_STATUS		0xc0010063
76 #define MSR_10H_CONFIG		0xc0010064
77 
78 /*
79  * AMD family 0Fh.
80  */
81 #define MSR_0FH_CONTROL		0xc0010041
82 #define MSR_0FH_STATUS		0xc0010042
83 
84 #define MSR_0FH_STATUS_CFID	__BITS( 0,  5)
85 #define MSR_0FH_STATUS_CVID	__BITS(32, 36)
86 #define MSR_0FH_STATUS_PENDING	__BITS(31, 31)
87 
88 #define MSR_0FH_CONTROL_FID	__BITS( 0,  5)
89 #define MSR_0FH_CONTROL_VID	__BITS( 8, 12)
90 #define MSR_0FH_CONTROL_CHG	__BITS(16, 16)
91 #define MSR_0FH_CONTROL_CNT	__BITS(32, 51)
92 
93 #define ACPI_0FH_STATUS_FID	__BITS( 0,  5)
94 #define ACPI_0FH_STATUS_VID	__BITS( 6, 10)
95 
96 #define ACPI_0FH_CONTROL_FID	__BITS( 0,  5)
97 #define ACPI_0FH_CONTROL_VID	__BITS( 6, 10)
98 #define ACPI_0FH_CONTROL_VST	__BITS(11, 17)
99 #define ACPI_0FH_CONTROL_MVS	__BITS(18, 19)
100 #define ACPI_0FH_CONTROL_PLL	__BITS(20, 26)
101 #define ACPI_0FH_CONTROL_RVO	__BITS(28, 29)
102 #define ACPI_0FH_CONTROL_IRT	__BITS(30, 31)
103 
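/*
 * Map a FID to the corresponding VCO FID, as used by the family
 * 0Fh transition code below. For illustration: FID 0 maps to
 * VCO FID 8, FID 4 to 16, and FIDs of 8 or greater map to
 * themselves.
 */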
104 #define FID_TO_VCO_FID(fid)	(((fid) < 8) ? (8 + ((fid) << 1)) : (fid))
105 
106 static char	  native_idle_text[16];
107 void		(*native_idle)(void) = NULL;
108 
109 static int	 acpicpu_md_quirk_piix4(const struct pci_attach_args *);
110 static void	 acpicpu_md_quirk_amd(struct acpicpu_pstate *, uint32_t);
111 static void	 acpicpu_md_pstate_hwf_reset(void *, void *);
112 static int	 acpicpu_md_pstate_fidvid_get(struct acpicpu_softc *,
113                                               uint32_t *);
114 static int	 acpicpu_md_pstate_fidvid_set(struct acpicpu_pstate *);
115 static int	 acpicpu_md_pstate_fidvid_read(uint32_t *, uint32_t *);
116 static void	 acpicpu_md_pstate_fidvid_write(uint32_t, uint32_t,
117 					        uint32_t, uint32_t);
118 static int	 acpicpu_md_pstate_sysctl_init(void);
119 static int	 acpicpu_md_pstate_sysctl_get(SYSCTLFN_PROTO);
120 static int	 acpicpu_md_pstate_sysctl_set(SYSCTLFN_PROTO);
121 static int	 acpicpu_md_pstate_sysctl_all(SYSCTLFN_PROTO);
122 
123 extern struct acpicpu_softc **acpicpu_sc;
124 static struct sysctllog *acpicpu_log = NULL;
125 
126 struct cpu_info *
127 acpicpu_md_match(device_t parent, cfdata_t match, void *aux)
128 {
129 	struct cpufeature_attach_args *cfaa = aux;
130 
131 	if (strcmp(cfaa->name, "frequency") != 0)
132 		return NULL;
133 
134 	return cfaa->ci;
135 }
136 
137 struct cpu_info *
138 acpicpu_md_attach(device_t parent, device_t self, void *aux)
139 {
140 	struct cpufeature_attach_args *cfaa = aux;
141 
142 	return cfaa->ci;
143 }
144 
145 uint32_t
146 acpicpu_md_flags(void)
147 {
148 	struct cpu_info *ci = curcpu();
149 	struct pci_attach_args pa;
150 	uint32_t family, val = 0;
151 	uint32_t regs[4];
152 	uint64_t msr;
153 
154 	if (acpi_md_ncpus() == 1)
155 		val |= ACPICPU_FLAG_C_BM;
156 
157 	if ((ci->ci_feat_val[1] & CPUID2_MONITOR) != 0)
158 		val |= ACPICPU_FLAG_C_FFH;
159 
160 	/*
161 	 * By default, assume that both the local APIC timer
162 	 * and the TSC stall during C3 sleep.
163 	 */
164 	val |= ACPICPU_FLAG_C_APIC | ACPICPU_FLAG_C_TSC;
165 
166 	switch (cpu_vendor) {
167 
168 	case CPUVENDOR_IDT:
169 
170 		if ((ci->ci_feat_val[1] & CPUID2_EST) != 0)
171 			val |= ACPICPU_FLAG_P_FFH;
172 
173 		if ((ci->ci_feat_val[0] & CPUID_ACPI) != 0)
174 			val |= ACPICPU_FLAG_T_FFH;
175 
176 		break;
177 
178 	case CPUVENDOR_INTEL:
179 
180 		/*
181 		 * Bus master control and arbitration should be
182 		 * available on all supported Intel CPUs (to be
183 		 * sure, this is double-checked later from the
184 		 * firmware data). These flags imply that it is
185 		 * not necessary to flush caches before C3 state.
186 		 */
187 		val |= ACPICPU_FLAG_C_BM | ACPICPU_FLAG_C_ARB;
188 
189 		/*
190 		 * Check if we can use "native", MSR-based access.
191 		 * If not, we have to resort to I/O.
192 		 */
193 		if ((ci->ci_feat_val[1] & CPUID2_EST) != 0)
194 			val |= ACPICPU_FLAG_P_FFH;
195 
196 		if ((ci->ci_feat_val[0] & CPUID_ACPI) != 0)
197 			val |= ACPICPU_FLAG_T_FFH;
198 
199 		/*
200 		 * Check whether MSR_APERF, MSR_MPERF, and Turbo
201 		 * Boost are available. Also see if we might have
202 		 * an invariant local APIC timer ("ARAT").
203 		 */
204 		if (cpuid_level >= 0x06) {
205 
206 			x86_cpuid(0x00000006, regs);
207 
208 			if ((regs[2] & CPUID_DSPM_HWF) != 0)
209 				val |= ACPICPU_FLAG_P_HWF;
210 
211 			if ((regs[0] & CPUID_DSPM_IDA) != 0)
212 				val |= ACPICPU_FLAG_P_TURBO;
213 
214 			if ((regs[0] & CPUID_DSPM_ARAT) != 0)
215 				val &= ~ACPICPU_FLAG_C_APIC;
216 		}
217 
218 		/*
219 		 * Detect whether TSC is invariant. If it is not,
220 		 * we keep the flag to note that the TSC will not run
221 		 * at a constant rate. Depending on the CPU, this may
222 		 * affect P- and T-state changes, but C-states are
223 		 * especially relevant; with a variant TSC, states
224 		 * deeper than C1 may stop the counter completely.
225 		 */
226 		x86_cpuid(0x80000000, regs);
227 
228 		if (regs[0] >= 0x80000007) {
229 
230 			x86_cpuid(0x80000007, regs);
231 
232 			if ((regs[3] & __BIT(8)) != 0)
233 				val &= ~ACPICPU_FLAG_C_TSC;
234 		}
235 
236 		break;
237 
238 	case CPUVENDOR_AMD:
239 
240 		x86_cpuid(0x80000000, regs);
241 
242 		if (regs[0] < 0x80000007)
243 			break;
244 
245 		x86_cpuid(0x80000007, regs);
246 
247 		family = CPUID2FAMILY(ci->ci_signature);
248 
249 		if (family == 0xf)
250 			family += CPUID2EXTFAMILY(ci->ci_signature);
251 
252 		switch (family) {
253 
254 		case 0x0f:
255 
256 			/*
257 			 * Evaluate support for the "FID/VID
258 			 * algorithm" also used by powernow(4).
259 			 */
260 			if ((regs[3] & CPUID_APM_FID) == 0)
261 				break;
262 
263 			if ((regs[3] & CPUID_APM_VID) == 0)
264 				break;
265 
266 			val |= ACPICPU_FLAG_P_FFH | ACPICPU_FLAG_P_FIDVID;
267 			break;
268 
269 		case 0x10:
270 		case 0x11:
271 
272 			if (rdmsr_safe(MSR_CMPHALT, &msr) != EFAULT)
273 				val |= ACPICPU_FLAG_C_C1E;
274 
275 			/* FALLTHROUGH */
276 
277 		case 0x14: /* AMD Fusion */
278 
279 			/*
280 			 * Like with Intel, detect invariant TSC,
281 			 * MSR-based P-states, and AMD's "turbo"
282 			 * (Core Performance Boost), respectively.
283 			 */
284 			if ((regs[3] & CPUID_APM_TSC) != 0)
285 				val &= ~ACPICPU_FLAG_C_TSC;
286 
287 			if ((regs[3] & CPUID_APM_HWP) != 0)
288 				val |= ACPICPU_FLAG_P_FFH;
289 
290 			if ((regs[3] & CPUID_APM_CPB) != 0)
291 				val |= ACPICPU_FLAG_P_TURBO;
292 
293 			/*
294 			 * Also check for APERF and MPERF,
295 			 * first available in family 10h.
296 			 */
297 			if (cpuid_level >= 0x06) {
298 
299 				x86_cpuid(0x00000006, regs);
300 
301 				if ((regs[2] & CPUID_DSPM_HWF) != 0)
302 					val |= ACPICPU_FLAG_P_HWF;
303 			}
304 
305 			break;
306 		}
307 
308 		break;
309 	}
310 
311 	/*
312 	 * There are several errata for the PIIX4.
313 	 */
314 	if (pci_find_device(&pa, acpicpu_md_quirk_piix4) != 0)
315 		val |= ACPICPU_FLAG_PIIX4;
316 
317 	return val;
318 }
319 
320 static int
321 acpicpu_md_quirk_piix4(const struct pci_attach_args *pa)
322 {
323 
324 	/*
325 	 * XXX: The pci_find_device(9) function only
326 	 *	deals with attached devices. Change this
327 	 *	to use something like pci_device_foreach().
328 	 */
329 	if (PCI_VENDOR(pa->pa_id) != PCI_VENDOR_INTEL)
330 		return 0;
331 
332 	if (PCI_PRODUCT(pa->pa_id) == PCI_PRODUCT_INTEL_82371AB_ISA ||
333 	    PCI_PRODUCT(pa->pa_id) == PCI_PRODUCT_INTEL_82440MX_PMC)
334 		return 1;
335 
336 	return 0;
337 }
338 
339 static void
340 acpicpu_md_quirk_amd(struct acpicpu_pstate *ps, uint32_t i)
341 {
342 	struct cpu_info *ci = &cpu_info_primary;
343 	uint32_t family, fid, freq, did, zeta;
344 	uint64_t val;
345 
346 	if (i > 7 || cpu_vendor != CPUVENDOR_AMD)
347 		return;
348 
349 	family = CPUID2FAMILY(ci->ci_signature);
350 
351 	if (family == 0xf)
352 		family += CPUID2EXTFAMILY(ci->ci_signature);
353 
354 	switch (family) {
355 
356 	case 0x10:
357 		zeta = 0x10;
358 		break;
359 
360 	case 0x11:
361 		zeta = 0x08;
362 		break;
363 
364 	default:
365 		return;
366 	}
367 
368 	/*
369 	 * The following eight P-state control MSRs define
370 	 * the static per-core values; the MSB indicates
371 	 * whether the state is enabled, and the low bits hold
372 	 * the frequency multiplier (FID) and divisor (DID).
373 	 */
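	/*
	 * For illustration (hypothetical register values): on family
	 * 10h, zeta is 0x10, so a CpuFid of 0x0c with a CpuDid of 1
	 * yields 100 * (0x0c + 0x10) >> 1 = 1400 MHz.
	 */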
374 	val = rdmsr(MSR_10H_CONFIG + i);
375 
376 	if ((val & __BIT(63)) == 0)
377 		return;
378 
379 	fid = __SHIFTOUT(val, __BITS(0, 5));
380 	did = __SHIFTOUT(val, __BITS(6, 8));
381 
382 	freq = 100 * (fid + zeta) >> did;
383 
384 	if (freq != 0 && ps->ps_freq != freq)
385 		ps->ps_freq = freq;
386 }
387 
388 void
389 acpicpu_md_quirk_c1e(void)
390 {
391 	const uint64_t c1e = MSR_CMPHALT_SMI | MSR_CMPHALT_C1E;
392 	uint64_t val;
393 
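	/*
	 * Disable C1E by clearing the SMI and C1E enable bits; the
	 * exact semantics of MSR 0xc0010055 are inferred here from
	 * the bit names defined above.
	 */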
394 	val = rdmsr(MSR_CMPHALT);
395 
396 	if ((val & c1e) != 0)
397 		wrmsr(MSR_CMPHALT, val & ~c1e);
398 }
399 
400 int
401 acpicpu_md_cstate_start(struct acpicpu_softc *sc)
402 {
403 	const size_t size = sizeof(native_idle_text);
404 	struct acpicpu_cstate *cs;
405 	bool ipi = false;
406 	int i;
407 
408 	/*
409 	 * Save the cpu_idle(9) loop used by default.
410 	 */
411 	x86_cpu_idle_get(&native_idle, native_idle_text, size);
412 
413 	for (i = 0; i < ACPI_C_STATE_COUNT; i++) {
414 
415 		cs = &sc->sc_cstate[i];
416 
417 		if (cs->cs_method == ACPICPU_C_STATE_HALT) {
418 			ipi = true;
419 			break;
420 		}
421 	}
422 
423 	x86_cpu_idle_set(acpicpu_cstate_idle, "acpi", ipi);
424 
425 	return 0;
426 }
427 
428 int
429 acpicpu_md_cstate_stop(void)
430 {
431 	static char text[16];
432 	void (*func)(void);
433 	uint64_t xc;
434 	bool ipi;
435 
436 	x86_cpu_idle_get(&func, text, sizeof(text));
437 
438 	if (func == native_idle)
439 		return EALREADY;
440 
441 	ipi = (native_idle == x86_cpu_idle_halt);
442 	x86_cpu_idle_set(native_idle, native_idle_text, ipi);
443 
444 	/*
445 	 * Run a cross-call to ensure that all CPUs are
446 	 * out of the ACPI idle loop before detachment.
447 	 */
448 	xc = xc_broadcast(0, (xcfunc_t)nullop, NULL, NULL);
449 	xc_wait(xc);
450 
451 	return 0;
452 }
453 
454 /*
455  * Called with interrupts enabled.
456  */
457 void
458 acpicpu_md_cstate_enter(int method, int state)
459 {
460 	struct cpu_info *ci = curcpu();
461 
462 	KASSERT(ci->ci_ilevel == IPL_NONE);
463 
464 	switch (method) {
465 
466 	case ACPICPU_C_STATE_FFH:
467 
468 		x86_monitor(&ci->ci_want_resched, 0, 0);
469 
470 		if (__predict_false(ci->ci_want_resched != 0))
471 			return;
472 
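		/*
		 * The MWAIT hint selects the target C-state in bits
		 * 7:4, with a value of zero meaning C1; hence
		 * (state - 1) << 4. The sub-state in the low nibble
		 * is left at zero.
		 */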
473 		x86_mwait((state - 1) << 4, 0);
474 		break;
475 
476 	case ACPICPU_C_STATE_HALT:
477 
478 		x86_disable_intr();
479 
480 		if (__predict_false(ci->ci_want_resched != 0)) {
481 			x86_enable_intr();
482 			return;
483 		}
484 
485 		x86_stihlt();
486 		break;
487 	}
488 }
489 
490 int
491 acpicpu_md_pstate_start(struct acpicpu_softc *sc)
492 {
493 	uint64_t xc, val;
494 
495 	switch (cpu_vendor) {
496 
497 	case CPUVENDOR_IDT:
498 	case CPUVENDOR_INTEL:
499 
500 		/*
501 		 * Make sure EST is enabled.
502 		 */
503 		if ((sc->sc_flags & ACPICPU_FLAG_P_FFH) != 0) {
504 
505 			val = rdmsr(MSR_MISC_ENABLE);
506 
507 			if ((val & MSR_MISC_ENABLE_EST) == 0) {
508 
509 				val |= MSR_MISC_ENABLE_EST;
510 				wrmsr(MSR_MISC_ENABLE, val);
511 				val = rdmsr(MSR_MISC_ENABLE);
512 
513 				if ((val & MSR_MISC_ENABLE_EST) == 0)
514 					return ENOTTY;
515 			}
516 		}
517 	}
518 
519 	/*
520 	 * Reset the APERF and MPERF counters.
521 	 */
522 	if ((sc->sc_flags & ACPICPU_FLAG_P_HWF) != 0) {
523 		xc = xc_broadcast(0, acpicpu_md_pstate_hwf_reset, NULL, NULL);
524 		xc_wait(xc);
525 	}
526 
527 	return acpicpu_md_pstate_sysctl_init();
528 }
529 
530 int
531 acpicpu_md_pstate_stop(void)
532 {
533 
534 	if (acpicpu_log == NULL)
535 		return EALREADY;
536 
537 	sysctl_teardown(&acpicpu_log);
538 	acpicpu_log = NULL;
539 
540 	return 0;
541 }
542 
543 int
544 acpicpu_md_pstate_init(struct acpicpu_softc *sc)
545 {
546 	struct cpu_info *ci = sc->sc_ci;
547 	struct acpicpu_pstate *ps, msr;
548 	uint32_t family, i = 0;
549 
550 	(void)memset(&msr, 0, sizeof(struct acpicpu_pstate));
551 
552 	switch (cpu_vendor) {
553 
554 	case CPUVENDOR_IDT:
555 	case CPUVENDOR_INTEL:
556 
557 		/*
558 		 * If the so-called Turbo Boost is present,
559 		 * the P0-state is always the "turbo state".
560 		 * It is shown as the P1 frequency + 1 MHz.
561 		 *
562 		 * For discussion, see:
563 		 *
564 		 *	Intel Corporation: Intel Turbo Boost Technology
565 		 *	in Intel Core(tm) Microarchitectures (Nehalem)
566 		 *	Based Processors. White Paper, November 2008.
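		 *
		 * For example (hypothetical _PSS values), a table
		 * listing 2668 MHz for P0 and 2667 MHz for P1 marks
		 * P0 as the turbo state.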
567 		 */
568 		if (sc->sc_pstate_count >= 2 &&
569 		   (sc->sc_flags & ACPICPU_FLAG_P_TURBO) != 0) {
570 
571 			ps = &sc->sc_pstate[0];
572 
573 			if (ps->ps_freq == sc->sc_pstate[1].ps_freq + 1)
574 				ps->ps_flags |= ACPICPU_FLAG_P_TURBO;
575 		}
576 
577 		msr.ps_control_addr = MSR_PERF_CTL;
578 		msr.ps_control_mask = __BITS(0, 15);
579 
580 		msr.ps_status_addr  = MSR_PERF_STATUS;
581 		msr.ps_status_mask  = __BITS(0, 15);
582 		break;
583 
584 	case CPUVENDOR_AMD:
585 
586 		if ((sc->sc_flags & ACPICPU_FLAG_P_FIDVID) != 0)
587 			msr.ps_flags |= ACPICPU_FLAG_P_FIDVID;
588 
589 		family = CPUID2FAMILY(ci->ci_signature);
590 
591 		if (family == 0xf)
592 			family += CPUID2EXTFAMILY(ci->ci_signature);
593 
594 		switch (family) {
595 
596 		case 0x0f:
597 			msr.ps_control_addr = MSR_0FH_CONTROL;
598 			msr.ps_status_addr  = MSR_0FH_STATUS;
599 			break;
600 
601 		case 0x10:
602 		case 0x11:
603 		case 0x14: /* AMD Fusion */
604 			msr.ps_control_addr = MSR_10H_CONTROL;
605 			msr.ps_control_mask = __BITS(0, 2);
606 
607 			msr.ps_status_addr  = MSR_10H_STATUS;
608 			msr.ps_status_mask  = __BITS(0, 2);
609 			break;
610 
611 		default:
612 			/*
613 			 * If we have an unknown AMD CPU, rely on XPSS.
614 			 */
615 			if ((sc->sc_flags & ACPICPU_FLAG_P_XPSS) == 0)
616 				return EOPNOTSUPP;
617 		}
618 
619 		break;
620 
621 	default:
622 		return ENODEV;
623 	}
624 
625 	/*
626 	 * Fill the P-state structures with MSR addresses that are
627 	 * known to be correct. If we do not know the addresses,
628 	 * leave the values intact. If a vendor uses XPSS, we do
629 	 * not necessarily need to do anything to support new CPUs.
630 	 */
631 	while (i < sc->sc_pstate_count) {
632 
633 		ps = &sc->sc_pstate[i];
634 
635 		if (msr.ps_flags != 0)
636 			ps->ps_flags |= msr.ps_flags;
637 
638 		if (msr.ps_status_addr != 0)
639 			ps->ps_status_addr = msr.ps_status_addr;
640 
641 		if (msr.ps_status_mask != 0)
642 			ps->ps_status_mask = msr.ps_status_mask;
643 
644 		if (msr.ps_control_addr != 0)
645 			ps->ps_control_addr = msr.ps_control_addr;
646 
647 		if (msr.ps_control_mask != 0)
648 			ps->ps_control_mask = msr.ps_control_mask;
649 
650 		/*
651 		 * Some AMD systems may round the frequencies
652 		 * reported in the tables. Try to fix these.
653 		 */
654 		if (cpu_vendor == CPUVENDOR_AMD)
655 			acpicpu_md_quirk_amd(ps, i);
656 
657 		i++;
658 	}
659 
660 	return 0;
661 }
662 
663 /*
664  * Read the IA32_APERF and IA32_MPERF counters. The latter
665  * increments at the rate of the fixed maximum frequency
666  * configured during boot, whereas IA32_APERF counts at the
667  * rate of the actual frequency. Note that the MSRs must be
668  * read without delay, and that only the ratio between
669  * IA32_APERF and IA32_MPERF is architecturally defined.
670  *
671  * The function thus returns the percentage of the actual
672  * frequency in terms of the maximum frequency of the calling
673  * CPU since the last call. A value zero implies an error.
674  *
675  * For further details, refer to:
676  *
677  *	Intel Corporation: Intel 64 and IA-32 Architectures
678  *	Software Developer's Manual. Section 13.2, Volume 3A:
679  *	System Programming Guide, Part 1. July, 2008.
680  *
681  *	Advanced Micro Devices: BIOS and Kernel Developer's
682  *	Guide (BKDG) for AMD Family 10h Processors. Section
683  *	2.4.5, Revision 3.48, April 2010.
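 *
 * As a hypothetical example, if IA32_APERF advanced by 600 million
 * ticks and IA32_MPERF by 1200 million ticks between two calls, the
 * function returns 50, i.e. the CPU ran at roughly half of its
 * maximum frequency during that interval.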
684  */
685 uint8_t
686 acpicpu_md_pstate_hwf(struct cpu_info *ci)
687 {
688 	struct acpicpu_softc *sc;
689 	uint64_t aperf, mperf;
690 	uint8_t rv = 0;
691 
692 	sc = acpicpu_sc[ci->ci_acpiid];
693 
694 	if (__predict_false(sc == NULL))
695 		return 0;
696 
697 	if (__predict_false((sc->sc_flags & ACPICPU_FLAG_P_HWF) == 0))
698 		return 0;
699 
700 	aperf = sc->sc_pstate_aperf;
701 	mperf = sc->sc_pstate_mperf;
702 
703 	x86_disable_intr();
704 
705 	sc->sc_pstate_aperf = rdmsr(MSR_APERF);
706 	sc->sc_pstate_mperf = rdmsr(MSR_MPERF);
707 
708 	x86_enable_intr();
709 
710 	aperf = sc->sc_pstate_aperf - aperf;
711 	mperf = sc->sc_pstate_mperf - mperf;
712 
713 	if (__predict_true(mperf != 0))
714 		rv = (aperf * 100) / mperf;
715 
716 	return rv;
717 }
718 
719 static void
720 acpicpu_md_pstate_hwf_reset(void *arg1, void *arg2)
721 {
722 	struct cpu_info *ci = curcpu();
723 	struct acpicpu_softc *sc;
724 
725 	sc = acpicpu_sc[ci->ci_acpiid];
726 
727 	if (__predict_false(sc == NULL))
728 		return;
729 
730 	x86_disable_intr();
731 
732 	wrmsr(MSR_APERF, 0);
733 	wrmsr(MSR_MPERF, 0);
734 
735 	x86_enable_intr();
736 
737 	sc->sc_pstate_aperf = 0;
738 	sc->sc_pstate_mperf = 0;
739 }
740 
741 int
742 acpicpu_md_pstate_get(struct acpicpu_softc *sc, uint32_t *freq)
743 {
744 	struct acpicpu_pstate *ps = NULL;
745 	uint64_t val;
746 	uint32_t i;
747 
748 	if ((sc->sc_flags & ACPICPU_FLAG_P_FIDVID) != 0)
749 		return acpicpu_md_pstate_fidvid_get(sc, freq);
750 
751 	/*
752 	 * Pick any P-state for the status address.
753 	 */
754 	for (i = 0; i < sc->sc_pstate_count; i++) {
755 
756 		ps = &sc->sc_pstate[i];
757 
758 		if (__predict_true(ps->ps_freq != 0))
759 			break;
760 	}
761 
762 	if (__predict_false(ps == NULL))
763 		return ENODEV;
764 
765 	if (__predict_false(ps->ps_status_addr == 0))
766 		return EINVAL;
767 
768 	val = rdmsr(ps->ps_status_addr);
769 
770 	if (__predict_true(ps->ps_status_mask != 0))
771 		val = val & ps->ps_status_mask;
772 
773 	/*
774 	 * Search for the value from known P-states.
775 	 */
776 	for (i = 0; i < sc->sc_pstate_count; i++) {
777 
778 		ps = &sc->sc_pstate[i];
779 
780 		if (__predict_false(ps->ps_freq == 0))
781 			continue;
782 
783 		if (val == ps->ps_status) {
784 			*freq = ps->ps_freq;
785 			return 0;
786 		}
787 	}
788 
789 	/*
790 	 * If the value was not found, try APERF/MPERF.
791 	 * The state is P0 if the return value is 100 %.
792 	 */
793 	if ((sc->sc_flags & ACPICPU_FLAG_P_HWF) != 0) {
794 
795 		KASSERT(sc->sc_pstate_count > 0);
796 		KASSERT(sc->sc_pstate[0].ps_freq != 0);
797 
798 		if (acpicpu_md_pstate_hwf(sc->sc_ci) == 100) {
799 			*freq = sc->sc_pstate[0].ps_freq;
800 			return 0;
801 		}
802 	}
803 
804 	return EIO;
805 }
806 
807 int
808 acpicpu_md_pstate_set(struct acpicpu_pstate *ps)
809 {
810 	uint64_t val = 0;
811 
812 	if (__predict_false(ps->ps_control_addr == 0))
813 		return EINVAL;
814 
815 	if ((ps->ps_flags & ACPICPU_FLAG_P_FIDVID) != 0)
816 		return acpicpu_md_pstate_fidvid_set(ps);
817 
818 	/*
819 	 * If the mask is set, do a read-modify-write.
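	 * With the Intel setup from acpicpu_md_pstate_init(), for
	 * instance, this preserves everything outside the low 16 bits
	 * of IA32_PERF_CTL and writes only the control value taken
	 * from the P-state table.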
820 	 */
821 	if (__predict_true(ps->ps_control_mask != 0)) {
822 		val = rdmsr(ps->ps_control_addr);
823 		val &= ~ps->ps_control_mask;
824 	}
825 
826 	val |= ps->ps_control;
827 
828 	wrmsr(ps->ps_control_addr, val);
829 	DELAY(ps->ps_latency);
830 
831 	return 0;
832 }
833 
834 static int
835 acpicpu_md_pstate_fidvid_get(struct acpicpu_softc *sc, uint32_t *freq)
836 {
837 	struct acpicpu_pstate *ps;
838 	uint32_t fid, i, vid;
839 	uint32_t cfid, cvid;
840 	int rv;
841 
842 	/*
843 	 * AMD family 0Fh needs special treatment.
844 	 * While it wants to use ACPI, it does not
845 	 * comply with the ACPI specifications.
846 	 */
847 	rv = acpicpu_md_pstate_fidvid_read(&cfid, &cvid);
848 
849 	if (rv != 0)
850 		return rv;
851 
852 	for (i = 0; i < sc->sc_pstate_count; i++) {
853 
854 		ps = &sc->sc_pstate[i];
855 
856 		if (__predict_false(ps->ps_freq == 0))
857 			continue;
858 
859 		fid = __SHIFTOUT(ps->ps_status, ACPI_0FH_STATUS_FID);
860 		vid = __SHIFTOUT(ps->ps_status, ACPI_0FH_STATUS_VID);
861 
862 		if (cfid == fid && cvid == vid) {
863 			*freq = ps->ps_freq;
864 			return 0;
865 		}
866 	}
867 
868 	return EIO;
869 }
870 
871 static int
872 acpicpu_md_pstate_fidvid_set(struct acpicpu_pstate *ps)
873 {
874 	const uint64_t ctrl = ps->ps_control;
875 	uint32_t cfid, cvid, fid, i, irt;
876 	uint32_t pll, vco_cfid, vco_fid;
877 	uint32_t val, vid, vst;
878 	int rv;
879 
880 	rv = acpicpu_md_pstate_fidvid_read(&cfid, &cvid);
881 
882 	if (rv != 0)
883 		return rv;
884 
885 	fid = __SHIFTOUT(ctrl, ACPI_0FH_CONTROL_FID);
886 	vid = __SHIFTOUT(ctrl, ACPI_0FH_CONTROL_VID);
887 	irt = __SHIFTOUT(ctrl, ACPI_0FH_CONTROL_IRT);
888 	vst = __SHIFTOUT(ctrl, ACPI_0FH_CONTROL_VST);
889 	pll = __SHIFTOUT(ctrl, ACPI_0FH_CONTROL_PLL);
890 
891 	vst = vst * 20;
892 	pll = pll * 1000 / 5;
893 	irt = 10 * __BIT(irt);
894 
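	/*
	 * The transition below loosely follows the three-phase
	 * FID/VID sequence of the family 0Fh BKDG: phase 1 steps the
	 * current VID down to the target and then applies the ramp
	 * voltage offset, phase 2 changes the FID in steps bounded by
	 * the VCO frequency, and phase 3 programs the final VID.
	 */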
895 	/*
896 	 * Phase 1.
897 	 */
898 	while (cvid > vid) {
899 
900 		val = 1 << __SHIFTOUT(ctrl, ACPI_0FH_CONTROL_MVS);
901 		val = (val > cvid) ? 0 : cvid - val;
902 
903 		acpicpu_md_pstate_fidvid_write(cfid, val, 1, vst);
904 		rv = acpicpu_md_pstate_fidvid_read(NULL, &cvid);
905 
906 		if (rv != 0)
907 			return rv;
908 	}
909 
910 	i = __SHIFTOUT(ctrl, ACPI_0FH_CONTROL_RVO);
911 
912 	for (; i > 0 && cvid > 0; --i) {
913 
914 		acpicpu_md_pstate_fidvid_write(cfid, cvid - 1, 1, vst);
915 		rv = acpicpu_md_pstate_fidvid_read(NULL, &cvid);
916 
917 		if (rv != 0)
918 			return rv;
919 	}
920 
921 	/*
922 	 * Phase 2.
923 	 */
924 	if (cfid != fid) {
925 
926 		vco_fid  = FID_TO_VCO_FID(fid);
927 		vco_cfid = FID_TO_VCO_FID(cfid);
928 
929 		while (abs(vco_fid - vco_cfid) > 2) {
930 
931 			if (fid <= cfid)
932 				val = cfid - 2;
933 			else {
934 				val = (cfid > 6) ? cfid + 2 :
935 				    FID_TO_VCO_FID(cfid) + 2;
936 			}
937 
938 			acpicpu_md_pstate_fidvid_write(val, cvid, pll, irt);
939 			rv = acpicpu_md_pstate_fidvid_read(&cfid, NULL);
940 
941 			if (rv != 0)
942 				return rv;
943 
944 			vco_cfid = FID_TO_VCO_FID(cfid);
945 		}
946 
947 		acpicpu_md_pstate_fidvid_write(fid, cvid, pll, irt);
948 		rv = acpicpu_md_pstate_fidvid_read(&cfid, NULL);
949 
950 		if (rv != 0)
951 			return rv;
952 	}
953 
954 	/*
955 	 * Phase 3.
956 	 */
957 	if (cvid != vid) {
958 
959 		acpicpu_md_pstate_fidvid_write(cfid, vid, 1, vst);
960 		rv = acpicpu_md_pstate_fidvid_read(NULL, &cvid);
961 
962 		if (rv != 0)
963 			return rv;
964 	}
965 
966 	return 0;
967 }
968 
969 static int
970 acpicpu_md_pstate_fidvid_read(uint32_t *cfid, uint32_t *cvid)
971 {
972 	int i = ACPICPU_P_STATE_RETRY * 100;
973 	uint64_t val;
974 
975 	do {
976 		val = rdmsr(MSR_0FH_STATUS);
977 
978 	} while (__SHIFTOUT(val, MSR_0FH_STATUS_PENDING) != 0 && --i >= 0);
979 
980 	if (i < 0)
981 		return EAGAIN;
982 
983 	if (cfid != NULL)
984 		*cfid = __SHIFTOUT(val, MSR_0FH_STATUS_CFID);
985 
986 	if (cvid != NULL)
987 		*cvid = __SHIFTOUT(val, MSR_0FH_STATUS_CVID);
988 
989 	return 0;
990 }
991 
992 static void
993 acpicpu_md_pstate_fidvid_write(uint32_t fid,
994     uint32_t vid, uint32_t cnt, uint32_t tmo)
995 {
996 	uint64_t val = 0;
997 
998 	val |= __SHIFTIN(fid, MSR_0FH_CONTROL_FID);
999 	val |= __SHIFTIN(vid, MSR_0FH_CONTROL_VID);
1000 	val |= __SHIFTIN(cnt, MSR_0FH_CONTROL_CNT);
1001 	val |= __SHIFTIN(0x1, MSR_0FH_CONTROL_CHG);
1002 
1003 	wrmsr(MSR_0FH_CONTROL, val);
1004 	DELAY(tmo);
1005 }
1006 
1007 int
1008 acpicpu_md_tstate_get(struct acpicpu_softc *sc, uint32_t *percent)
1009 {
1010 	struct acpicpu_tstate *ts;
1011 	uint64_t val;
1012 	uint32_t i;
1013 
1014 	val = rdmsr(MSR_THERM_CONTROL);
1015 
1016 	for (i = 0; i < sc->sc_tstate_count; i++) {
1017 
1018 		ts = &sc->sc_tstate[i];
1019 
1020 		if (ts->ts_percent == 0)
1021 			continue;
1022 
1023 		if (val == ts->ts_status) {
1024 			*percent = ts->ts_percent;
1025 			return 0;
1026 		}
1027 	}
1028 
1029 	return EIO;
1030 }
1031 
1032 int
1033 acpicpu_md_tstate_set(struct acpicpu_tstate *ts)
1034 {
1035 	uint64_t val;
1036 	uint8_t i;
1037 
1038 	val = ts->ts_control;
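	/*
	 * Keep what is assumed to be the clock modulation duty cycle
	 * field and its enable bit; everything else is masked off.
	 */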
1039 	val = val & __BITS(1, 4);
1040 
1041 	wrmsr(MSR_THERM_CONTROL, val);
1042 
1043 	if (ts->ts_status == 0) {
1044 		DELAY(ts->ts_latency);
1045 		return 0;
1046 	}
1047 
1048 	for (i = val = 0; i < ACPICPU_T_STATE_RETRY; i++) {
1049 
1050 		val = rdmsr(MSR_THERM_CONTROL);
1051 
1052 		if (val == ts->ts_status)
1053 			return 0;
1054 
1055 		DELAY(ts->ts_latency);
1056 	}
1057 
1058 	return EAGAIN;
1059 }
1060 
1061 /*
1062  * A kludge for backwards compatibility.
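 * The nodes mirror the old est(4) and powernow(4) interfaces; on an
 * Intel CPU, for example, the values appear under
 * machdep.est.frequency.{target,current,available}.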
1063  */
1064 static int
1065 acpicpu_md_pstate_sysctl_init(void)
1066 {
1067 	const struct sysctlnode	*fnode, *mnode, *rnode;
1068 	const char *str;
1069 	int rv;
1070 
1071 	switch (cpu_vendor) {
1072 
1073 	case CPUVENDOR_IDT:
1074 	case CPUVENDOR_INTEL:
1075 		str = "est";
1076 		break;
1077 
1078 	case CPUVENDOR_AMD:
1079 		str = "powernow";
1080 		break;
1081 
1082 	default:
1083 		return ENODEV;
1084 	}
1085 
1086 
1087 	rv = sysctl_createv(&acpicpu_log, 0, NULL, &rnode,
1088 	    CTLFLAG_PERMANENT, CTLTYPE_NODE, "machdep", NULL,
1089 	    NULL, 0, NULL, 0, CTL_MACHDEP, CTL_EOL);
1090 
1091 	if (rv != 0)
1092 		goto fail;
1093 
1094 	rv = sysctl_createv(&acpicpu_log, 0, &rnode, &mnode,
1095 	    0, CTLTYPE_NODE, str, NULL,
1096 	    NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL);
1097 
1098 	if (rv != 0)
1099 		goto fail;
1100 
1101 	rv = sysctl_createv(&acpicpu_log, 0, &mnode, &fnode,
1102 	    0, CTLTYPE_NODE, "frequency", NULL,
1103 	    NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL);
1104 
1105 	if (rv != 0)
1106 		goto fail;
1107 
1108 	rv = sysctl_createv(&acpicpu_log, 0, &fnode, &rnode,
1109 	    CTLFLAG_READWRITE, CTLTYPE_INT, "target", NULL,
1110 	    acpicpu_md_pstate_sysctl_set, 0, NULL, 0, CTL_CREATE, CTL_EOL);
1111 
1112 	if (rv != 0)
1113 		goto fail;
1114 
1115 	rv = sysctl_createv(&acpicpu_log, 0, &fnode, &rnode,
1116 	    CTLFLAG_READONLY, CTLTYPE_INT, "current", NULL,
1117 	    acpicpu_md_pstate_sysctl_get, 0, NULL, 0, CTL_CREATE, CTL_EOL);
1118 
1119 	if (rv != 0)
1120 		goto fail;
1121 
1122 	rv = sysctl_createv(&acpicpu_log, 0, &fnode, &rnode,
1123 	    CTLFLAG_READONLY, CTLTYPE_STRING, "available", NULL,
1124 	    acpicpu_md_pstate_sysctl_all, 0, NULL, 0, CTL_CREATE, CTL_EOL);
1125 
1126 	if (rv != 0)
1127 		goto fail;
1128 
1129 	return 0;
1130 
1131 fail:
1132 	if (acpicpu_log != NULL) {
1133 		sysctl_teardown(&acpicpu_log);
1134 		acpicpu_log = NULL;
1135 	}
1136 
1137 	return rv;
1138 }
1139 
1140 static int
1141 acpicpu_md_pstate_sysctl_get(SYSCTLFN_ARGS)
1142 {
1143 	struct sysctlnode node;
1144 	uint32_t freq;
1145 	int err;
1146 
1147 	freq = cpufreq_get(curcpu());
1148 
1149 	if (freq == 0)
1150 		return ENXIO;
1151 
1152 	node = *rnode;
1153 	node.sysctl_data = &freq;
1154 
1155 	err = sysctl_lookup(SYSCTLFN_CALL(&node));
1156 
1157 	if (err != 0 || newp == NULL)
1158 		return err;
1159 
1160 	return 0;
1161 }
1162 
1163 static int
1164 acpicpu_md_pstate_sysctl_set(SYSCTLFN_ARGS)
1165 {
1166 	struct sysctlnode node;
1167 	uint32_t freq;
1168 	int err;
1169 
1170 	freq = cpufreq_get(curcpu());
1171 
1172 	if (freq == 0)
1173 		return ENXIO;
1174 
1175 	node = *rnode;
1176 	node.sysctl_data = &freq;
1177 
1178 	err = sysctl_lookup(SYSCTLFN_CALL(&node));
1179 
1180 	if (err != 0 || newp == NULL)
1181 		return err;
1182 
1183 	cpufreq_set_all(freq);
1184 
1185 	return 0;
1186 }
1187 
1188 static int
1189 acpicpu_md_pstate_sysctl_all(SYSCTLFN_ARGS)
1190 {
1191 	struct cpu_info *ci = curcpu();
1192 	struct acpicpu_softc *sc;
1193 	struct sysctlnode node;
1194 	char buf[1024];
1195 	size_t len;
1196 	uint32_t i;
1197 	int err;
1198 
1199 	sc = acpicpu_sc[ci->ci_acpiid];
1200 
1201 	if (sc == NULL)
1202 		return ENXIO;
1203 
1204 	(void)memset(&buf, 0, sizeof(buf));
1205 
1206 	mutex_enter(&sc->sc_mtx);
1207 
1208 	for (len = 0, i = sc->sc_pstate_max; i < sc->sc_pstate_count; i++) {
1209 
1210 		if (sc->sc_pstate[i].ps_freq == 0)
1211 			continue;
1212 
1213 		len += snprintf(buf + len, sizeof(buf) - len, "%u%s",
1214 		    sc->sc_pstate[i].ps_freq,
1215 		    i < (sc->sc_pstate_count - 1) ? " " : "");
1216 	}
1217 
1218 	mutex_exit(&sc->sc_mtx);
1219 
1220 	node = *rnode;
1221 	node.sysctl_data = buf;
1222 
1223 	err = sysctl_lookup(SYSCTLFN_CALL(&node));
1224 
1225 	if (err != 0 || newp == NULL)
1226 		return err;
1227 
1228 	return 0;
1229 }
1230 
1231