/*	$OpenBSD: tsc.c,v 1.32 2024/04/03 02:01:21 guenther Exp $	*/
/*
 * Copyright (c) 2008 The NetBSD Foundation, Inc.
 * Copyright (c) 2016,2017 Reyk Floeter <reyk@openbsd.org>
 * Copyright (c) 2017 Adam Steen <adam@adamsteen.com.au>
 * Copyright (c) 2017 Mike Belopuhov <mike@openbsd.org>
 * Copyright (c) 2019 Paul Irofti <paul@irofti.net>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/timetc.h>
#include <sys/atomic.h>

#include <machine/cpu.h>
#include <machine/cpufunc.h>

#define RECALIBRATE_MAX_RETRIES		5
#define RECALIBRATE_SMI_THRESHOLD	50000
#define RECALIBRATE_DELAY_THRESHOLD	50

int		tsc_recalibrate;

uint64_t	tsc_frequency;
int		tsc_is_invariant;

u_int		tsc_get_timecount_lfence(struct timecounter *tc);
u_int		tsc_get_timecount_rdtscp(struct timecounter *tc);
void		tsc_delay(int usecs);

#include "lapic.h"
#if NLAPIC > 0
extern u_int32_t lapic_per_second;
#endif

u_int64_t	(*tsc_rdtsc)(void) = rdtsc_lfence;

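/*
 * The TSC starts out with a negative quality so it is not selected as
 * the system timecounter until it has been identified as invariant and
 * a frequency has been established; the quality is raised to 2000 once
 * that happens.
 */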
struct timecounter tsc_timecounter = {
	.tc_get_timecount = tsc_get_timecount_lfence,
	.tc_counter_mask = ~0u,
	.tc_frequency = 0,
	.tc_name = "tsc",
	.tc_quality = -1000,
	.tc_priv = NULL,
	.tc_user = TC_TSC_LFENCE,
};

uint64_t
tsc_freq_cpuid(struct cpu_info *ci)
{
	uint64_t count;
	uint32_t eax, ebx, khz, dummy;

	if (ci->ci_vendor == CPUV_INTEL &&
	    ci->ci_cpuid_level >= 0x15) {
		eax = ebx = khz = dummy = 0;
		CPUID(0x15, eax, ebx, khz, dummy);
		khz /= 1000;
		if (khz == 0) {
			switch (ci->ci_model) {
			case 0x4e: /* Skylake mobile */
			case 0x5e: /* Skylake desktop */
			case 0x8e: /* Kabylake mobile */
			case 0x9e: /* Kabylake desktop */
			case 0xa5: /* CML-H CML-S62 CML-S102 */
			case 0xa6: /* CML-U62 */
				khz = 24000;	/* 24.0 MHz */
				break;
			case 0x5f: /* Atom Denverton */
				khz = 25000;	/* 25.0 MHz */
				break;
			case 0x5c: /* Atom Goldmont */
				khz = 19200;	/* 19.2 MHz */
				break;
			}
		}
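		/*
		 * CPUID leaf 0x15 reports the TSC/core-crystal ratio as
		 * EBX/EAX and, where the CPU knows it, the crystal
		 * frequency itself.  As an illustrative example, a
		 * 24.0 MHz crystal with a ratio of 216/2 works out to
		 * 24000 kHz * 216 / 2 = 2592000 kHz, i.e. a 2.592 GHz TSC.
		 */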
		if (ebx == 0 || eax == 0)
			count = 0;
		else if ((count = (uint64_t)khz * (uint64_t)ebx / eax) != 0) {
#if NLAPIC > 0
			lapic_per_second = khz * 1000;
#endif
			return (count * 1000);
		}
	}

	return (0);
}

uint64_t
tsc_freq_msr(struct cpu_info *ci)
{
	uint64_t base, def, divisor, multiplier;

	if (ci->ci_vendor != CPUV_AMD)
		return 0;

	/*
	 * All 10h+ CPUs have Core::X86::Msr::HWCR and the TscFreqSel
	 * bit.  If TscFreqSel hasn't been set, the TSC isn't advancing
	 * at the core P0 frequency and we need to calibrate by hand.
	 */
	if (ci->ci_family < 0x10)
		return 0;
	if (!ISSET(rdmsr(MSR_HWCR), HWCR_TSCFREQSEL))
		return 0;

	/*
	 * In 10h+ CPUs, Core::X86::Msr::PStateDef defines the voltage
	 * and frequency for each core P-state.  We want the P0 frequency.
	 * If the En bit isn't set, the register doesn't define a valid
	 * P-state.
	 */
	def = rdmsr(MSR_PSTATEDEF(0));
	if (!ISSET(def, PSTATEDEF_EN))
		return 0;

	switch (ci->ci_family) {
	case 0x17:
	case 0x19:
		/*
		 * PPR for AMD Family 17h [...]:
		 * Models 01h,08h B2, Rev 3.03, pp. 33, 139-140
		 * Model 18h B1, Rev 3.16, pp. 36, 143-144
		 * Model 60h A1, Rev 3.06, pp. 33, 155-157
		 * Model 71h B0, Rev 3.06, pp. 28, 150-151
		 *
		 * PPR for AMD Family 19h [...]:
		 * Model 21h B0, Rev 3.05, pp. 33, 166-167
		 *
		 * OSRR for AMD Family 17h processors,
		 * Models 00h-2Fh, Rev 3.03, pp. 130-131
		 */
		base = 200000000;	/* 200.0 MHz */
		divisor = (def >> 8) & 0x3f;
		if (divisor <= 0x07 || divisor >= 0x2d)
			return 0;	/* reserved */
		if (divisor >= 0x1b && divisor % 2 == 1)
			return 0;	/* reserved */
		multiplier = def & 0xff;
		if (multiplier <= 0x0f)
			return 0;	/* reserved */
		break;
	default:
		return 0;
	}

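	/*
	 * The P0 core frequency works out to base * CpuFid / CpuDfsId.
	 * As an illustrative example (not taken from any particular
	 * part), CpuFid = 0x88 (136) with CpuDfsId = 0x08 yields
	 * 200 MHz * 136 / 8 = 3400 MHz.
	 */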
	return base * multiplier / divisor;
}

void
tsc_identify(struct cpu_info *ci)
{
	if (!(ci->ci_flags & CPUF_PRIMARY) ||
	    !(ci->ci_flags & CPUF_CONST_TSC) ||
	    !(ci->ci_flags & CPUF_INVAR_TSC))
		return;

	/* Prefer RDTSCP where supported. */
	if (ISSET(ci->ci_feature_eflags, CPUID_RDTSCP)) {
		tsc_rdtsc = rdtscp;
		tsc_timecounter.tc_get_timecount = tsc_get_timecount_rdtscp;
		tsc_timecounter.tc_user = TC_TSC_RDTSCP;
	}

	tsc_is_invariant = 1;

	tsc_frequency = tsc_freq_cpuid(ci);
	if (tsc_frequency == 0)
		tsc_frequency = tsc_freq_msr(ci);
	if (tsc_frequency > 0)
		delay_init(tsc_delay, 5000);
}

static inline int
get_tsc_and_timecount(struct timecounter *tc, uint64_t *tsc, uint64_t *count)
{
	uint64_t n, tsc1, tsc2;
	int i;

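	/*
	 * Bracket the reference timecounter read between two TSC reads.
	 * If the bracket is suspiciously wide, something (e.g. an SMI)
	 * probably interrupted us in the middle, so retry.
	 */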
	for (i = 0; i < RECALIBRATE_MAX_RETRIES; i++) {
		tsc1 = tsc_rdtsc();
		n = (tc->tc_get_timecount(tc) & tc->tc_counter_mask);
		tsc2 = tsc_rdtsc();

		if ((tsc2 - tsc1) < RECALIBRATE_SMI_THRESHOLD) {
			*count = n;
			*tsc = tsc2;
			return (0);
		}
	}
	return (1);
}

static inline uint64_t
calculate_tsc_freq(uint64_t tsc1, uint64_t tsc2, int usec)
{
	uint64_t delta;

	delta = (tsc2 - tsc1);
	return (delta * 1000000 / usec);
}

static inline uint64_t
calculate_tc_delay(struct timecounter *tc, uint64_t count1, uint64_t count2)
{
	uint64_t delta;

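	/* Unwrap count2 if the reference counter rolled over mid-delay. */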
	if (count2 < count1)
		count2 += tc->tc_counter_mask;

	delta = (count2 - count1);
	return (delta * 1000000 / tc->tc_frequency);
}

uint64_t
measure_tsc_freq(struct timecounter *tc)
{
	uint64_t count1, count2, frequency, min_freq, tsc1, tsc2;
	u_long s;
	int delay_usec, i, err1, err2, usec, success = 0;

	/* warm up the timers */
	for (i = 0; i < 3; i++) {
		(void)tc->tc_get_timecount(tc);
		(void)rdtsc();
	}

	min_freq = ULLONG_MAX;

	delay_usec = 100000;
	for (i = 0; i < 3; i++) {
		s = intr_disable();

		err1 = get_tsc_and_timecount(tc, &tsc1, &count1);
		delay(delay_usec);
		err2 = get_tsc_and_timecount(tc, &tsc2, &count2);

		intr_restore(s);

		if (err1 || err2)
			continue;

		usec = calculate_tc_delay(tc, count1, count2);

		if ((usec < (delay_usec - RECALIBRATE_DELAY_THRESHOLD)) ||
		    (usec > (delay_usec + RECALIBRATE_DELAY_THRESHOLD)))
			continue;

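		/*
		 * Convert the observed TSC delta into Hz.  As an
		 * illustrative example, 260000000 ticks over a measured
		 * interval of 100000 usec works out to 2.6 GHz.
		 */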
		frequency = calculate_tsc_freq(tsc1, tsc2, usec);

		min_freq = MIN(min_freq, frequency);
		success++;
	}

	return (success > 1 ? min_freq : 0);
}

void
calibrate_tsc_freq(void)
{
	struct timecounter *reference = tsc_timecounter.tc_priv;
	uint64_t freq;

	if (!reference || !tsc_recalibrate)
		return;

	if ((freq = measure_tsc_freq(reference)) == 0)
		return;
	tsc_frequency = freq;
	tsc_timecounter.tc_frequency = freq;
	if (tsc_is_invariant)
		tsc_timecounter.tc_quality = 2000;
}

void
cpu_recalibrate_tsc(struct timecounter *tc)
{
	struct timecounter *reference = tsc_timecounter.tc_priv;

	/* Prevent recalibration with a worse timecounter source */
	if (reference && reference->tc_quality > tc->tc_quality)
		return;

	tsc_timecounter.tc_priv = tc;
	calibrate_tsc_freq();
}

u_int
tsc_get_timecount_lfence(struct timecounter *tc)
{
	return rdtsc_lfence();
}

u_int
tsc_get_timecount_rdtscp(struct timecounter *tc)
{
	return rdtscp();
}

void
tsc_timecounter_init(struct cpu_info *ci, uint64_t cpufreq)
{
	if (!(ci->ci_flags & CPUF_PRIMARY) ||
	    !(ci->ci_flags & CPUF_CONST_TSC) ||
	    !(ci->ci_flags & CPUF_INVAR_TSC))
		return;

	/* Newer CPUs don't require recalibration */
	if (tsc_frequency > 0) {
		tsc_timecounter.tc_frequency = tsc_frequency;
		tsc_timecounter.tc_quality = 2000;
	} else {
		tsc_recalibrate = 1;
		tsc_frequency = cpufreq;
		tsc_timecounter.tc_frequency = cpufreq;
		calibrate_tsc_freq();
	}

	tc_init(&tsc_timecounter);
}

void
tsc_delay(int usecs)
{
	uint64_t interval, start;

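	/*
	 * Convert microseconds into TSC ticks; as an illustrative
	 * example, 100 usec on a 2.6 GHz TSC is 260000 ticks.
	 */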
	interval = (uint64_t)usecs * tsc_frequency / 1000000;
	start = tsc_rdtsc();
	while (tsc_rdtsc() - start < interval)
		CPU_BUSY_CYCLE();
}

#ifdef MULTIPROCESSOR

/*
 * Protections for global variables in this code:
 *
 *	a	Modified atomically
 *	b	Protected by a barrier
 *	p	Only modified by the primary CPU
 */

#define TSC_TEST_MSECS		1	/* Test round duration */
#define TSC_TEST_ROUNDS		2	/* Number of test rounds */

/*
 * tsc_test_status.val is isolated to its own cache line to limit
 * false sharing and reduce the test's margin of error.
 */
struct tsc_test_status {
	volatile uint64_t val;		/* [a] Latest RDTSC value */
	uint64_t pad1[7];
	uint64_t lag_count;		/* [b] Number of lags seen by CPU */
	uint64_t lag_max;		/* [b] Biggest lag seen by CPU */
	int64_t adj;			/* [b] Initial IA32_TSC_ADJUST value */
	uint64_t pad2[5];
} __aligned(64);
struct tsc_test_status tsc_ap_status;	/* Test results from AP */
struct tsc_test_status tsc_bp_status;	/* Test results from BP */
uint64_t tsc_test_cycles;		/* [p] TSC cycles per test round */
const char *tsc_ap_name;		/* [b] Name of AP running test */
volatile u_int tsc_egress_barrier;	/* [a] Test end barrier */
volatile u_int tsc_ingress_barrier;	/* [a] Test start barrier */
volatile u_int tsc_test_rounds;		/* [p] Remaining test rounds */
int tsc_is_synchronized = 1;		/* [p] Have we ever failed the test? */

void tsc_adjust_reset(struct cpu_info *, struct tsc_test_status *);
void tsc_report_test_results(void);
void tsc_test_ap(void);
void tsc_test_bp(void);

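/*
 * tsc_test_sync_bp() and tsc_test_sync_ap() frame each test round with
 * a two-step handshake: both CPUs increment tsc_ingress_barrier and
 * spin until it reaches 2 before starting, then meet again at
 * tsc_egress_barrier when the round is over.  The BP resets the ingress
 * barrier and the AP resets the egress barrier before the next round.
 */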
void
tsc_test_sync_bp(struct cpu_info *ci)
{
	if (!tsc_is_invariant)
		return;
#ifndef TSC_DEBUG
	/* No point in testing again if we already failed. */
	if (!tsc_is_synchronized)
		return;
#endif
	/* Reset IA32_TSC_ADJUST if it exists. */
	tsc_adjust_reset(ci, &tsc_bp_status);

	/* Reset the test cycle limit and round count. */
	tsc_test_cycles = TSC_TEST_MSECS * tsc_frequency / 1000;
	tsc_test_rounds = TSC_TEST_ROUNDS;

	do {
		/*
		 * Pass through the ingress barrier, run the test,
		 * then wait for the AP to reach the egress barrier.
		 */
		atomic_inc_int(&tsc_ingress_barrier);
		while (tsc_ingress_barrier != 2)
			CPU_BUSY_CYCLE();
		tsc_test_bp();
		while (tsc_egress_barrier != 1)
			CPU_BUSY_CYCLE();

		/*
		 * Report what happened.  Adjust the TSC's quality
		 * if this is the first time we've failed the test.
		 */
		tsc_report_test_results();
		if (tsc_ap_status.lag_count || tsc_bp_status.lag_count) {
			if (tsc_is_synchronized) {
				tsc_is_synchronized = 0;
				tc_reset_quality(&tsc_timecounter, -1000);
			}
			tsc_test_rounds = 0;
		} else
			tsc_test_rounds--;

		/*
		 * Clean up for the next round.  It is safe to reset the
		 * ingress barrier because at this point we know the AP
		 * has reached the egress barrier.
		 */
		memset(&tsc_ap_status, 0, sizeof tsc_ap_status);
		memset(&tsc_bp_status, 0, sizeof tsc_bp_status);
		tsc_ingress_barrier = 0;
		if (tsc_test_rounds == 0)
			tsc_ap_name = NULL;

		/*
		 * Pass through the egress barrier and release the AP.
		 * The AP is responsible for resetting the egress barrier.
		 */
		if (atomic_inc_int_nv(&tsc_egress_barrier) != 2)
			panic("%s: unexpected egress count", __func__);
	} while (tsc_test_rounds > 0);
}

void
tsc_test_sync_ap(struct cpu_info *ci)
{
	if (!tsc_is_invariant)
		return;
#ifndef TSC_DEBUG
	if (!tsc_is_synchronized)
		return;
#endif
	/* The BP needs our name in order to report any problems. */
	if (atomic_cas_ptr(&tsc_ap_name, NULL, ci->ci_dev->dv_xname) != NULL) {
		panic("%s: %s: tsc_ap_name is not NULL: %s",
		    __func__, ci->ci_dev->dv_xname, tsc_ap_name);
	}

	tsc_adjust_reset(ci, &tsc_ap_status);

	/*
	 * The AP is only responsible for running the test and
	 * resetting the egress barrier.  The BP handles everything
	 * else.
	 */
	do {
		atomic_inc_int(&tsc_ingress_barrier);
		while (tsc_ingress_barrier != 2)
			CPU_BUSY_CYCLE();
		tsc_test_ap();
		atomic_inc_int(&tsc_egress_barrier);
		while (atomic_cas_uint(&tsc_egress_barrier, 2, 0) != 2)
			CPU_BUSY_CYCLE();
	} while (tsc_test_rounds > 0);
}

void
tsc_report_test_results(void)
{
#ifdef TSC_DEBUG
	u_int round = TSC_TEST_ROUNDS - tsc_test_rounds + 1;

	if (tsc_bp_status.adj != 0) {
		printf("tsc: cpu0: IA32_TSC_ADJUST: %lld -> 0\n",
		    tsc_bp_status.adj);
	}
	if (tsc_ap_status.adj != 0) {
		printf("tsc: %s: IA32_TSC_ADJUST: %lld -> 0\n",
		    tsc_ap_name, tsc_ap_status.adj);
	}
	if (tsc_ap_status.lag_count > 0 || tsc_bp_status.lag_count > 0) {
		printf("tsc: cpu0/%s: sync test round %u/%u failed\n",
		    tsc_ap_name, round, TSC_TEST_ROUNDS);
	}
	if (tsc_bp_status.lag_count > 0) {
		printf("tsc: cpu0/%s: cpu0: %llu lags %llu cycles\n",
		    tsc_ap_name, tsc_bp_status.lag_count,
		    tsc_bp_status.lag_max);
	}
	if (tsc_ap_status.lag_count > 0) {
		printf("tsc: cpu0/%s: %s: %llu lags %llu cycles\n",
		    tsc_ap_name, tsc_ap_name, tsc_ap_status.lag_count,
		    tsc_ap_status.lag_max);
	}
#else
	if (tsc_ap_status.lag_count > 0 || tsc_bp_status.lag_count > 0)
		printf("tsc: cpu0/%s: sync test failed\n", tsc_ap_name);
#endif /* TSC_DEBUG */
}

/*
 * Reset IA32_TSC_ADJUST if we have it.
 */
void
tsc_adjust_reset(struct cpu_info *ci, struct tsc_test_status *tts)
{
	if (ISSET(ci->ci_feature_sefflags_ebx, SEFF0EBX_TSC_ADJUST)) {
		tts->adj = rdmsr(MSR_TSC_ADJUST);
		if (tts->adj != 0)
			wrmsr(MSR_TSC_ADJUST, 0);
	}
}

void
tsc_test_ap(void)
{
	uint64_t ap_val, bp_val, end, lag;

	ap_val = tsc_rdtsc();
	end = ap_val + tsc_test_cycles;
	while (__predict_true(ap_val < end)) {
		/*
		 * Get the BP's latest TSC value, then read the AP's
		 * TSC.  LFENCE is a serializing instruction, so we
		 * know bp_val predates ap_val.  If ap_val is smaller
		 * than bp_val then the AP's TSC must trail that of
		 * the BP and the counters cannot be synchronized.
		 */
		bp_val = tsc_bp_status.val;
		ap_val = tsc_rdtsc();
		tsc_ap_status.val = ap_val;

		/*
		 * Record the magnitude of the problem if the AP's TSC
		 * trails the BP's TSC.
		 */
		if (__predict_false(ap_val < bp_val)) {
			tsc_ap_status.lag_count++;
			lag = bp_val - ap_val;
			if (tsc_ap_status.lag_max < lag)
				tsc_ap_status.lag_max = lag;
		}
	}
}

/*
 * This is similar to tsc_test_ap(), but with all relevant variables
 * flipped around to run the test from the BP's perspective.
 */
void
tsc_test_bp(void)
{
	uint64_t ap_val, bp_val, end, lag;

	bp_val = tsc_rdtsc();
	end = bp_val + tsc_test_cycles;
	while (__predict_true(bp_val < end)) {
		ap_val = tsc_ap_status.val;
		bp_val = tsc_rdtsc();
		tsc_bp_status.val = bp_val;

		if (__predict_false(bp_val < ap_val)) {
			tsc_bp_status.lag_count++;
			lag = ap_val - bp_val;
			if (tsc_bp_status.lag_max < lag)
				tsc_bp_status.lag_max = lag;
		}
	}
}

#endif /* MULTIPROCESSOR */