/*	$OpenBSD: tsc.c,v 1.32 2024/04/03 02:01:21 guenther Exp $	*/
/*
 * Copyright (c) 2008 The NetBSD Foundation, Inc.
 * Copyright (c) 2016,2017 Reyk Floeter <reyk@openbsd.org>
 * Copyright (c) 2017 Adam Steen <adam@adamsteen.com.au>
 * Copyright (c) 2017 Mike Belopuhov <mike@openbsd.org>
 * Copyright (c) 2019 Paul Irofti <paul@irofti.net>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/timetc.h>
#include <sys/atomic.h>

#include <machine/cpu.h>
#include <machine/cpufunc.h>

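/*
 * Recalibration parameters (see get_tsc_and_timecount() and
 * measure_tsc_freq() below):
 *
 * RECALIBRATE_MAX_RETRIES	attempts to bracket a reference timecounter
 *				read between two TSC reads
 * RECALIBRATE_SMI_THRESHOLD	maximum TSC cycles the bracketing reads may
 *				be apart before we assume the sample was
 *				disturbed (e.g. by an SMI) and retry
 * RECALIBRATE_DELAY_THRESHOLD	maximum deviation, in microseconds, of the
 *				measured delay from the intended delay
 *				before a sample is discarded
 */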
#define RECALIBRATE_MAX_RETRIES		5
#define RECALIBRATE_SMI_THRESHOLD	50000
#define RECALIBRATE_DELAY_THRESHOLD	50

int		tsc_recalibrate;

uint64_t	tsc_frequency;
int		tsc_is_invariant;

u_int		tsc_get_timecount_lfence(struct timecounter *tc);
u_int		tsc_get_timecount_rdtscp(struct timecounter *tc);
void		tsc_delay(int usecs);

#include "lapic.h"
#if NLAPIC > 0
extern u_int32_t lapic_per_second;
#endif

u_int64_t (*tsc_rdtsc)(void) = rdtsc_lfence;

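/*
 * The TSC timecounter starts out with negative quality, so it is only
 * used if explicitly selected.  Its quality is raised to 2000 once a
 * trustworthy frequency is known (tsc_timecounter_init()) and dropped
 * back to -1000 if the multiprocessor sync test ever fails.  tc_user
 * records the userland TSC access method (LFENCE;RDTSC or RDTSCP).
 */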
struct timecounter tsc_timecounter = {
	.tc_get_timecount = tsc_get_timecount_lfence,
	.tc_counter_mask = ~0u,
	.tc_frequency = 0,
	.tc_name = "tsc",
	.tc_quality = -1000,
	.tc_priv = NULL,
	.tc_user = TC_TSC_LFENCE,
};

uint64_t
tsc_freq_cpuid(struct cpu_info *ci)
{
	uint64_t count;
	uint32_t eax, ebx, khz, dummy;

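	/*
	 * CPUID leaf 0x15 reports the TSC/core crystal clock ratio as
	 * EBX/EAX and, on CPUs that provide it, the crystal frequency
	 * itself in ECX (Hz), so the TSC frequency is
	 * crystal * EBX / EAX.  Many parts leave ECX at zero, in which
	 * case the nominal crystal frequency is filled in by model below.
	 */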
	if (ci->ci_vendor == CPUV_INTEL &&
	    ci->ci_cpuid_level >= 0x15) {
		eax = ebx = khz = dummy = 0;
		CPUID(0x15, eax, ebx, khz, dummy);
		khz /= 1000;
		if (khz == 0) {
			switch (ci->ci_model) {
			case 0x4e: /* Skylake mobile */
			case 0x5e: /* Skylake desktop */
			case 0x8e: /* Kabylake mobile */
			case 0x9e: /* Kabylake desktop */
			case 0xa5: /* CML-H CML-S62 CML-S102 */
			case 0xa6: /* CML-U62 */
				khz = 24000; /* 24.0 MHz */
				break;
			case 0x5f: /* Atom Denverton */
				khz = 25000; /* 25.0 MHz */
				break;
			case 0x5c: /* Atom Goldmont */
				khz = 19200; /* 19.2 MHz */
				break;
			}
		}
		if (ebx == 0 || eax == 0)
			count = 0;
		else if ((count = (uint64_t)khz * (uint64_t)ebx / eax) != 0) {
#if NLAPIC > 0
			lapic_per_second = khz * 1000;
#endif
			return (count * 1000);
		}
	}

	return (0);
}

uint64_t
tsc_freq_msr(struct cpu_info *ci)
{
	uint64_t base, def, divisor, multiplier;

	if (ci->ci_vendor != CPUV_AMD)
		return 0;

	/*
	 * All 10h+ CPUs have Core::X86::Msr::HWCR and the TscFreqSel
	 * bit.  If TscFreqSel hasn't been set, the TSC isn't advancing
	 * at the core P0 frequency and we need to calibrate by hand.
	 */
	if (ci->ci_family < 0x10)
		return 0;
	if (!ISSET(rdmsr(MSR_HWCR), HWCR_TSCFREQSEL))
		return 0;

	/*
	 * In 10h+ CPUs, Core::X86::Msr::PStateDef defines the voltage
	 * and frequency for each core P-state.  We want the P0 frequency.
	 * If the En bit isn't set, the register doesn't define a valid
	 * P-state.
	 */
	def = rdmsr(MSR_PSTATEDEF(0));
	if (!ISSET(def, PSTATEDEF_EN))
		return 0;

	switch (ci->ci_family) {
	case 0x17:
	case 0x19:
		/*
		 * PPR for AMD Family 17h [...]:
		 * Models 01h,08h B2, Rev 3.03, pp. 33, 139-140
		 * Model 18h B1, Rev 3.16, pp. 36, 143-144
		 * Model 60h A1, Rev 3.06, pp. 33, 155-157
		 * Model 71h B0, Rev 3.06, pp. 28, 150-151
		 *
		 * PPR for AMD Family 19h [...]:
		 * Model 21h B0, Rev 3.05, pp. 33, 166-167
		 *
		 * OSRR for AMD Family 17h processors,
		 * Models 00h-2Fh, Rev 3.03, pp. 130-131
		 */
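		/*
		 * Per the references above, the core P0 frequency is
		 * CpuFid[7:0] * 200 MHz / CpuDfsId, where CpuFid counts
		 * in 25 MHz steps and CpuDfsId encodes a divisor in
		 * eighths (0x08 means divide by 1).  Reserved encodings
		 * are rejected below.
		 */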
		base = 200000000;			/* 200.0 MHz */
		divisor = (def >> 8) & 0x3f;
		if (divisor <= 0x07 || divisor >= 0x2d)
			return 0;			/* reserved */
		if (divisor >= 0x1b && divisor % 2 == 1)
			return 0;			/* reserved */
		multiplier = def & 0xff;
		if (multiplier <= 0x0f)
			return 0;			/* reserved */
		break;
	default:
		return 0;
	}

	return base * multiplier / divisor;
}

void
tsc_identify(struct cpu_info *ci)
{
	if (!(ci->ci_flags & CPUF_PRIMARY) ||
	    !(ci->ci_flags & CPUF_CONST_TSC) ||
	    !(ci->ci_flags & CPUF_INVAR_TSC))
		return;

	/* Prefer RDTSCP where supported. */
	if (ISSET(ci->ci_feature_eflags, CPUID_RDTSCP)) {
		tsc_rdtsc = rdtscp;
		tsc_timecounter.tc_get_timecount = tsc_get_timecount_rdtscp;
		tsc_timecounter.tc_user = TC_TSC_RDTSCP;
	}

	tsc_is_invariant = 1;

	tsc_frequency = tsc_freq_cpuid(ci);
	if (tsc_frequency == 0)
		tsc_frequency = tsc_freq_msr(ci);
	if (tsc_frequency > 0)
		delay_init(tsc_delay, 5000);
}

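/*
 * Read the TSC and the reference timecounter as close together as
 * possible.  If the two TSC reads bracketing the timecounter read are
 * more than RECALIBRATE_SMI_THRESHOLD cycles apart, assume the sample
 * was disturbed (e.g. by an SMI) and try again, up to
 * RECALIBRATE_MAX_RETRIES times.  Returns 0 on success, 1 on failure.
 */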
static inline int
get_tsc_and_timecount(struct timecounter *tc, uint64_t *tsc, uint64_t *count)
{
	uint64_t n, tsc1, tsc2;
	int i;

	for (i = 0; i < RECALIBRATE_MAX_RETRIES; i++) {
		tsc1 = tsc_rdtsc();
		n = (tc->tc_get_timecount(tc) & tc->tc_counter_mask);
		tsc2 = tsc_rdtsc();

		if ((tsc2 - tsc1) < RECALIBRATE_SMI_THRESHOLD) {
			*count = n;
			*tsc = tsc2;
			return (0);
		}
	}
	return (1);
}

static inline uint64_t
calculate_tsc_freq(uint64_t tsc1, uint64_t tsc2, int usec)
{
	uint64_t delta;

	delta = (tsc2 - tsc1);
	return (delta * 1000000 / usec);
}

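/*
 * Convert the elapsed reference timecounter ticks to microseconds,
 * allowing for a single wrap of the counter between the two reads.
 */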
static inline uint64_t
calculate_tc_delay(struct timecounter *tc, uint64_t count1, uint64_t count2)
{
	uint64_t delta;

	if (count2 < count1)
		count2 += tc->tc_counter_mask;

	delta = (count2 - count1);
	return (delta * 1000000 / tc->tc_frequency);
}

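/*
 * Estimate the TSC frequency against the given reference timecounter:
 * take up to three samples, each bracketing a delay() of roughly 100 ms,
 * discard samples whose measured duration deviates from the intended
 * delay by more than RECALIBRATE_DELAY_THRESHOLD microseconds, and
 * return the smallest frequency seen.  Returns 0 unless at least two
 * samples succeed.
 */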
uint64_t
measure_tsc_freq(struct timecounter *tc)
{
	uint64_t count1, count2, frequency, min_freq, tsc1, tsc2;
	u_long s;
	int delay_usec, i, err1, err2, usec, success = 0;

	/* warmup the timers */
	for (i = 0; i < 3; i++) {
		(void)tc->tc_get_timecount(tc);
		(void)rdtsc();
	}

	min_freq = ULLONG_MAX;

	delay_usec = 100000;
	for (i = 0; i < 3; i++) {
		s = intr_disable();

		err1 = get_tsc_and_timecount(tc, &tsc1, &count1);
		delay(delay_usec);
		err2 = get_tsc_and_timecount(tc, &tsc2, &count2);

		intr_restore(s);

		if (err1 || err2)
			continue;

		usec = calculate_tc_delay(tc, count1, count2);

		if ((usec < (delay_usec - RECALIBRATE_DELAY_THRESHOLD)) ||
		    (usec > (delay_usec + RECALIBRATE_DELAY_THRESHOLD)))
			continue;

		frequency = calculate_tsc_freq(tsc1, tsc2, usec);

		min_freq = MIN(min_freq, frequency);
		success++;
	}

	return (success > 1 ? min_freq : 0);
}

void
calibrate_tsc_freq(void)
{
	struct timecounter *reference = tsc_timecounter.tc_priv;
	uint64_t freq;

	if (!reference || !tsc_recalibrate)
		return;

	if ((freq = measure_tsc_freq(reference)) == 0)
		return;
	tsc_frequency = freq;
	tsc_timecounter.tc_frequency = freq;
	if (tsc_is_invariant)
		tsc_timecounter.tc_quality = 2000;
}

void
cpu_recalibrate_tsc(struct timecounter *tc)
{
	struct timecounter *reference = tsc_timecounter.tc_priv;

	/* Prevent recalibration with a worse timecounter source */
	if (reference && reference->tc_quality > tc->tc_quality)
		return;

	tsc_timecounter.tc_priv = tc;
	calibrate_tsc_freq();
}

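/*
 * Timecounter read routines.  rdtsc_lfence() issues an LFENCE ahead of
 * RDTSC, and RDTSCP waits for prior instructions to complete, so the
 * TSC read cannot be speculatively executed ahead of earlier
 * instructions and appear to run backwards across a pair of reads.
 */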
u_int
tsc_get_timecount_lfence(struct timecounter *tc)
{
	return rdtsc_lfence();
}

u_int
tsc_get_timecount_rdtscp(struct timecounter *tc)
{
	return rdtscp();
}

void
tsc_timecounter_init(struct cpu_info *ci, uint64_t cpufreq)
{
	if (!(ci->ci_flags & CPUF_PRIMARY) ||
	    !(ci->ci_flags & CPUF_CONST_TSC) ||
	    !(ci->ci_flags & CPUF_INVAR_TSC))
		return;

	/* Newer CPUs don't require recalibration */
	if (tsc_frequency > 0) {
		tsc_timecounter.tc_frequency = tsc_frequency;
		tsc_timecounter.tc_quality = 2000;
	} else {
		tsc_recalibrate = 1;
		tsc_frequency = cpufreq;
		tsc_timecounter.tc_frequency = cpufreq;
		calibrate_tsc_freq();
	}

	tc_init(&tsc_timecounter);
}

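/*
 * Busy-wait for at least the requested number of microseconds by
 * converting it to a TSC tick count at the known TSC frequency and
 * spinning until that many ticks have elapsed.
 */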
void
tsc_delay(int usecs)
{
	uint64_t interval, start;

	interval = (uint64_t)usecs * tsc_frequency / 1000000;
	start = tsc_rdtsc();
	while (tsc_rdtsc() - start < interval)
		CPU_BUSY_CYCLE();
}

#ifdef MULTIPROCESSOR

/*
 * Protections for global variables in this code:
 *
 *	a	Modified atomically
 *	b	Protected by a barrier
 *	p	Only modified by the primary CPU
 */

#define TSC_TEST_MSECS		1	/* Test round duration */
#define TSC_TEST_ROUNDS		2	/* Number of test rounds */

/*
 * tsc_test_status.val is isolated to its own cache line to limit
 * false sharing and reduce the test's margin of error.
 */
struct tsc_test_status {
	volatile uint64_t val;		/* [a] Latest RDTSC value */
	uint64_t pad1[7];
	uint64_t lag_count;		/* [b] Number of lags seen by CPU */
	uint64_t lag_max;		/* [b] Biggest lag seen by CPU */
	int64_t adj;			/* [b] Initial IA32_TSC_ADJUST value */
	uint64_t pad2[5];
} __aligned(64);
struct tsc_test_status tsc_ap_status;	/* Test results from AP */
struct tsc_test_status tsc_bp_status;	/* Test results from BP */
uint64_t tsc_test_cycles;		/* [p] TSC cycles per test round */
const char *tsc_ap_name;		/* [b] Name of AP running test */
volatile u_int tsc_egress_barrier;	/* [a] Test end barrier */
volatile u_int tsc_ingress_barrier;	/* [a] Test start barrier */
volatile u_int tsc_test_rounds;		/* [p] Remaining test rounds */
int tsc_is_synchronized = 1;		/* [p] Have we ever failed the test? */

void tsc_adjust_reset(struct cpu_info *, struct tsc_test_status *);
void tsc_report_test_results(void);
void tsc_test_ap(void);
void tsc_test_bp(void);

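/*
 * TSC synchronization test, primary (BP) side.  The BP and an AP
 * rendezvous at the ingress barrier, concurrently watch each other's
 * TSC for TSC_TEST_MSECS, then rendezvous again at the egress barrier.
 * The round is repeated up to TSC_TEST_ROUNDS times, or until a lag is
 * observed, in which case the TSC's timecounter quality is dropped.
 */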
void
tsc_test_sync_bp(struct cpu_info *ci)
{
	if (!tsc_is_invariant)
		return;
#ifndef TSC_DEBUG
	/* No point in testing again if we already failed. */
	if (!tsc_is_synchronized)
		return;
#endif
	/* Reset IA32_TSC_ADJUST if it exists. */
	tsc_adjust_reset(ci, &tsc_bp_status);

	/* Reset the test cycle limit and round count. */
	tsc_test_cycles = TSC_TEST_MSECS * tsc_frequency / 1000;
	tsc_test_rounds = TSC_TEST_ROUNDS;

	do {
		/*
		 * Pass through the ingress barrier, run the test,
		 * then wait for the AP to reach the egress barrier.
		 */
		atomic_inc_int(&tsc_ingress_barrier);
		while (tsc_ingress_barrier != 2)
			CPU_BUSY_CYCLE();
		tsc_test_bp();
		while (tsc_egress_barrier != 1)
			CPU_BUSY_CYCLE();

		/*
		 * Report what happened.  Adjust the TSC's quality
		 * if this is the first time we've failed the test.
		 */
		tsc_report_test_results();
		if (tsc_ap_status.lag_count || tsc_bp_status.lag_count) {
			if (tsc_is_synchronized) {
				tsc_is_synchronized = 0;
				tc_reset_quality(&tsc_timecounter, -1000);
			}
			tsc_test_rounds = 0;
		} else
			tsc_test_rounds--;

		/*
		 * Clean up for the next round.  It is safe to reset the
		 * ingress barrier because at this point we know the AP
		 * has reached the egress barrier.
		 */
		memset(&tsc_ap_status, 0, sizeof tsc_ap_status);
		memset(&tsc_bp_status, 0, sizeof tsc_bp_status);
		tsc_ingress_barrier = 0;
		if (tsc_test_rounds == 0)
			tsc_ap_name = NULL;

		/*
		 * Pass through the egress barrier and release the AP.
		 * The AP is responsible for resetting the egress barrier.
		 */
		if (atomic_inc_int_nv(&tsc_egress_barrier) != 2)
			panic("%s: unexpected egress count", __func__);
	} while (tsc_test_rounds > 0);
}

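/*
 * TSC synchronization test, secondary (AP) side: the AP publishes its
 * name, resets IA32_TSC_ADJUST, and runs the same barrier handshake as
 * tsc_test_sync_bp() from the other side.
 */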
void
tsc_test_sync_ap(struct cpu_info *ci)
{
	if (!tsc_is_invariant)
		return;
#ifndef TSC_DEBUG
	if (!tsc_is_synchronized)
		return;
#endif
	/* The BP needs our name in order to report any problems. */
	if (atomic_cas_ptr(&tsc_ap_name, NULL, ci->ci_dev->dv_xname) != NULL) {
		panic("%s: %s: tsc_ap_name is not NULL: %s",
		    __func__, ci->ci_dev->dv_xname, tsc_ap_name);
	}

	tsc_adjust_reset(ci, &tsc_ap_status);

	/*
	 * The AP is only responsible for running the test and
	 * resetting the egress barrier.  The BP handles everything
	 * else.
	 */
	do {
		atomic_inc_int(&tsc_ingress_barrier);
		while (tsc_ingress_barrier != 2)
			CPU_BUSY_CYCLE();
		tsc_test_ap();
		atomic_inc_int(&tsc_egress_barrier);
		while (atomic_cas_uint(&tsc_egress_barrier, 2, 0) != 2)
			CPU_BUSY_CYCLE();
	} while (tsc_test_rounds > 0);
}

void
tsc_report_test_results(void)
{
#ifdef TSC_DEBUG
	u_int round = TSC_TEST_ROUNDS - tsc_test_rounds + 1;

	if (tsc_bp_status.adj != 0) {
		printf("tsc: cpu0: IA32_TSC_ADJUST: %lld -> 0\n",
		    tsc_bp_status.adj);
	}
	if (tsc_ap_status.adj != 0) {
		printf("tsc: %s: IA32_TSC_ADJUST: %lld -> 0\n",
		    tsc_ap_name, tsc_ap_status.adj);
	}
	if (tsc_ap_status.lag_count > 0 || tsc_bp_status.lag_count > 0) {
		printf("tsc: cpu0/%s: sync test round %u/%u failed\n",
		    tsc_ap_name, round, TSC_TEST_ROUNDS);
	}
	if (tsc_bp_status.lag_count > 0) {
		printf("tsc: cpu0/%s: cpu0: %llu lags %llu cycles\n",
		    tsc_ap_name, tsc_bp_status.lag_count,
		    tsc_bp_status.lag_max);
	}
	if (tsc_ap_status.lag_count > 0) {
		printf("tsc: cpu0/%s: %s: %llu lags %llu cycles\n",
		    tsc_ap_name, tsc_ap_name, tsc_ap_status.lag_count,
		    tsc_ap_status.lag_max);
	}
#else
	if (tsc_ap_status.lag_count > 0 || tsc_bp_status.lag_count > 0)
		printf("tsc: cpu0/%s: sync test failed\n", tsc_ap_name);
#endif /* TSC_DEBUG */
}

/*
 * Reset IA32_TSC_ADJUST if we have it.
 */
void
tsc_adjust_reset(struct cpu_info *ci, struct tsc_test_status *tts)
{
	if (ISSET(ci->ci_feature_sefflags_ebx, SEFF0EBX_TSC_ADJUST)) {
		tts->adj = rdmsr(MSR_TSC_ADJUST);
		if (tts->adj != 0)
			wrmsr(MSR_TSC_ADJUST, 0);
	}
}

void
tsc_test_ap(void)
{
	uint64_t ap_val, bp_val, end, lag;

	ap_val = tsc_rdtsc();
	end = ap_val + tsc_test_cycles;
	while (__predict_true(ap_val < end)) {
		/*
		 * Get the BP's latest TSC value, then read the AP's
		 * TSC.  LFENCE is a serializing instruction, so we
		 * know bp_val predates ap_val.  If ap_val is smaller
		 * than bp_val then the AP's TSC must trail that of
		 * the BP and the counters cannot be synchronized.
		 */
		bp_val = tsc_bp_status.val;
		ap_val = tsc_rdtsc();
		tsc_ap_status.val = ap_val;

		/*
		 * Record the magnitude of the problem if the AP's TSC
		 * trails the BP's TSC.
		 */
		if (__predict_false(ap_val < bp_val)) {
			tsc_ap_status.lag_count++;
			lag = bp_val - ap_val;
			if (tsc_ap_status.lag_max < lag)
				tsc_ap_status.lag_max = lag;
		}
	}
}

/*
 * This is similar to tsc_test_ap(), but with all relevant variables
 * flipped around to run the test from the BP's perspective.
 */
void
tsc_test_bp(void)
{
	uint64_t ap_val, bp_val, end, lag;

	bp_val = tsc_rdtsc();
	end = bp_val + tsc_test_cycles;
	while (__predict_true(bp_val < end)) {
		ap_val = tsc_ap_status.val;
		bp_val = tsc_rdtsc();
		tsc_bp_status.val = bp_val;

		if (__predict_false(bp_val < ap_val)) {
			tsc_bp_status.lag_count++;
			lag = ap_val - bp_val;
			if (tsc_bp_status.lag_max < lag)
				tsc_bp_status.lag_max = lag;
		}
	}
}

#endif /* MULTIPROCESSOR */
