/*	$NetBSD: tsc.c,v 1.61 2024/10/03 12:29:07 riastradh Exp $	*/

/*-
 * Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tsc.c,v 1.61 2024/10/03 12:29:07 riastradh Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/lwp.h>
#include <sys/atomic.h>
#include <sys/kernel.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#include <sys/lock.h>

#include <machine/cpu_counter.h>
#include <machine/cpuvar.h>
#include <machine/cpufunc.h>
#include <machine/specialreg.h>
#include <machine/cputypes.h>

#include "tsc.h"

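/*
 * Number of BP<->AP handshake rounds used when measuring TSC skew; the
 * round with the smallest absolute difference between the two counters
 * is the one that is kept (see tsc_sync_bp()).
 */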
#define	TSC_SYNC_ROUNDS		1000
#define	ABS(a)			((a) >= 0 ? (a) : -(a))

static u_int	tsc_get_timecount(struct timecounter *);

static void	tsc_delay(unsigned int);

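/*
 * Dummy cache line used by the TSC sync handshake: the BP dirties it so
 * that the AP takes a fresh cache miss on it immediately before reading
 * its own counter (see tsc_read_bp() and tsc_post_ap()).
 */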
static uint64_t	tsc_dummy_cacheline __cacheline_aligned;
uint64_t	tsc_freq __read_mostly;	/* exported for sysctl */
static int64_t	tsc_drift_max = 1000;	/* max cycles */
static int64_t	tsc_drift_observed;
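
/*
 * Serialized TSC read functions.  They default to the cpuid-fenced
 * variants and are switched to the lfence or mfence variants by
 * tsc_setfunc() once the CPU's features are known.
 */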
uint64_t	(*rdtsc)(void) = rdtsc_cpuid;
uint64_t	(*cpu_counter)(void) = cpu_counter_cpuid;
uint32_t	(*cpu_counter32)(void) = cpu_counter32_cpuid;

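/*
 * Non-zero when rdtsc is to be permitted from user mode; the actual
 * CR4_TSD updates are performed by tsc_user_enable()/tsc_user_disable(),
 * which are called from outside this file.
 */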
int tsc_user_enabled = 1;

static volatile int64_t	tsc_sync_val;
static volatile struct cpu_info	*tsc_sync_cpu;

static struct timecounter tsc_timecounter = {
	.tc_get_timecount = tsc_get_timecount,
	.tc_counter_mask = ~0U,
	.tc_name = "TSC",
	.tc_quality = 3000,
};

bool
tsc_is_invariant(void)
{
	struct cpu_info *ci;
	uint32_t descs[4];
	uint32_t family;
	bool invariant;

	if (!cpu_hascounter())
		return false;

	ci = curcpu();
	invariant = false;

	if (cpu_vendor == CPUVENDOR_INTEL) {
		/*
		 * From Intel(tm) 64 and IA-32 Architectures Software
		 * Developer's Manual Volume 3A: System Programming Guide,
		 * Part 1, 17.13 TIME_STAMP COUNTER, these are the processors
		 * where the TSC is known invariant:
		 *
		 * Pentium 4, Intel Xeon (family 0f, models 03 and higher)
		 * Core Solo and Core Duo processors (family 06, model 0e)
		 * Xeon 5100 series and Core 2 Duo (family 06, model 0f)
		 * Core 2 and Xeon (family 06, model 17)
		 * Atom (family 06, model 1c)
		 *
		 * We'll also assume that it's safe on the Pentium, and
		 * that it's safe on P-II and P-III Xeons due to the
		 * typical configuration of those systems.
		 */
		switch (CPUID_TO_BASEFAMILY(ci->ci_signature)) {
		case 0x05:
			invariant = true;
			break;
		case 0x06:
			invariant = CPUID_TO_MODEL(ci->ci_signature) == 0x0e ||
			    CPUID_TO_MODEL(ci->ci_signature) == 0x0f ||
			    CPUID_TO_MODEL(ci->ci_signature) == 0x17 ||
			    CPUID_TO_MODEL(ci->ci_signature) == 0x1c;
			break;
		case 0x0f:
			invariant = CPUID_TO_MODEL(ci->ci_signature) >= 0x03;
			break;
		}
	} else if (cpu_vendor == CPUVENDOR_AMD) {
		/*
		 * TSC and Power Management Events on AMD Processors
		 * Nov 2, 2005 Rich Brunner, AMD Fellow
		 * http://lkml.org/lkml/2005/11/4/173
		 *
		 * See Appendix E.4.7 CPUID Fn8000_0007_EDX Advanced Power
		 * Management Features, AMD64 Architecture Programmer's
		 * Manual Volume 3: General-Purpose and System Instructions.
		 * The check is done below.
		 */

		/*
		 * AMD Errata 778: Processor Core Time Stamp Counters May
		 * Experience Drift
		 *
		 * This affects all family 15h and family 16h processors.
		 */
		switch (CPUID_TO_FAMILY(ci->ci_signature)) {
		case 0x15:
		case 0x16:
			return false;
		}
	}

	/*
	 * The most reliable way to check whether the TSC is invariant is
	 * the invariant-TSC bit reported by CPUID leaf 0x80000007.
	 */
	family = CPUID_TO_BASEFAMILY(ci->ci_signature);
	if (((cpu_vendor == CPUVENDOR_INTEL) || (cpu_vendor == CPUVENDOR_AMD))
	    && ((family == 0x06) || (family == 0x0f))) {
		x86_cpuid(0x80000000, descs);
		if (descs[0] >= 0x80000007) {
			x86_cpuid(0x80000007, descs);
			invariant = (descs[3] & CPUID_APM_ITSC) != 0;
		}
	}

	return invariant;
}

/* Set up function pointers for rdtsc() and timecounter(9). */
void
tsc_setfunc(struct cpu_info *ci)
{
	bool use_lfence, use_mfence;

	use_lfence = use_mfence = false;

	/*
	 * XXX On AMD, we might be able to use lfence in some cases:
	 *   a) if MSR_DE_CFG exists and bit 1 is set.
	 *   b) family == 0x0f or 0x11.  Those have no MSR_DE_CFG and
	 *      lfence is always serializing.
	 *
	 * We don't do this because testing showed mfence performed better
	 * than lfence with MSR_DE_CFG.
	 */
	if (cpu_vendor == CPUVENDOR_AMD)
		use_mfence = true;
	else if (cpu_vendor == CPUVENDOR_INTEL)
		use_lfence = true;

	/* LFENCE and MFENCE are only usable if the SSE2 feature bit is set. */
	if ((ci->ci_feat_val[0] & CPUID_SSE2) == 0)
		use_lfence = use_mfence = false;

#define TSC_SETFUNC(fence)						      \
	do {								      \
		rdtsc = rdtsc_##fence;					      \
		cpu_counter = cpu_counter_##fence;			      \
		cpu_counter32 = cpu_counter32_##fence;			      \
	} while (/* CONSTCOND */ 0)

	if (use_lfence)
		TSC_SETFUNC(lfence);
	else if (use_mfence)
		TSC_SETFUNC(mfence);
	else
		TSC_SETFUNC(cpuid);

	aprint_verbose_dev(ci->ci_dev, "Use %s to serialize rdtsc\n",
	    use_lfence ? "lfence" : (use_mfence ? "mfence" : "cpuid"));
}

/*
 * Initialize the TSC timecounter(9) and the TSC-based DELAY() function.
 *
 * This function is called after all secondary processors have been brought
 * up and drift has been measured, and after any other potential delay
 * functions have been installed (e.g. lapic_delay()).
 */
void
tsc_tc_init(void)
{
	struct cpu_info *ci;
	bool invariant;

	if (!cpu_hascounter())
		return;

	ci = curcpu();
	tsc_freq = ci->ci_data.cpu_cc_freq;
	invariant = tsc_is_invariant();
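	/*
	 * A negative quality prevents the TSC from being selected as the
	 * system timecounter automatically; it can still be chosen
	 * explicitly via kern.timecounter.hardware.
	 */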
	if (!invariant) {
		aprint_debug("TSC not known invariant on this CPU\n");
		tsc_timecounter.tc_quality = -100;
	} else if (tsc_drift_observed > tsc_drift_max) {
		aprint_error("ERROR: %lld cycle TSC drift observed\n",
		    (long long)tsc_drift_observed);
		tsc_timecounter.tc_quality = -100;
		invariant = false;
	} else if (vm_guest == VM_GUEST_NO) {
		delay_func = tsc_delay;
	} else if (vm_guest == VM_GUEST_VIRTUALBOX) {
		tsc_timecounter.tc_quality = -100;
	}

	if (tsc_freq != 0) {
		tsc_timecounter.tc_frequency = tsc_freq;
		tc_init(&tsc_timecounter);
	}
}

/*
 * Record drift (in clock cycles).  Called during AP startup.
 */
void
tsc_sync_drift(int64_t drift)
{

	if (drift < 0)
		drift = -drift;
	if (drift > tsc_drift_observed)
		tsc_drift_observed = drift;
}

/*
 * Called during startup of APs, by the boot processor.  Interrupts
 * are disabled on entry.
 */
static void __noinline
tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp)
{
	uint64_t bptsc;

	if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL) {
		panic("tsc_sync_bp: 1");
	}

	/* Prepare a cache miss for the other side. */
	(void)atomic_swap_uint((void *)&tsc_dummy_cacheline, 0);

	/* Flag our readiness. */
	atomic_or_uint(&ci->ci_flags, CPUF_SYNCTSC);

	/* Wait for other side then read our TSC. */
	while ((ci->ci_flags & CPUF_SYNCTSC) != 0) {
		__insn_barrier();
	}
	bptsc = rdtsc();

	/* Wait for the results to come in. */
	while (tsc_sync_cpu == ci) {
		x86_pause();
	}
	if (tsc_sync_cpu != NULL) {
		panic("tsc_sync_bp: 2");
	}

	*bptscp = bptsc;
	*aptscp = tsc_sync_val;
}

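/*
 * Run TSC_SYNC_ROUNDS handshakes with the AP that is currently starting
 * up and record the smallest difference observed between the two counters
 * as that CPU's cpu_cc_skew.
 */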
void
tsc_sync_bp(struct cpu_info *ci)
{
	int64_t bptsc, aptsc, val, diff;

	if (!cpu_hascounter())
		return;

	val = INT64_MAX;
	for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
		tsc_read_bp(ci, &bptsc, &aptsc);
		diff = bptsc - aptsc;
		if (ABS(diff) < ABS(val)) {
			val = diff;
		}
	}

	ci->ci_data.cpu_cc_skew = val;
}

/*
 * Called during startup of AP, by the AP itself.  Interrupts are
 * disabled on entry.
 */
static void __noinline
tsc_post_ap(struct cpu_info *ci)
{
	uint64_t tsc;

	/* Wait for go-ahead from primary. */
	while ((ci->ci_flags & CPUF_SYNCTSC) == 0) {
		__insn_barrier();
	}

	/* Instruct primary to read its counter. */
	atomic_and_uint(&ci->ci_flags, ~CPUF_SYNCTSC);

	/* Suffer a cache miss, then read TSC. */
	__insn_barrier();
	tsc = tsc_dummy_cacheline;
	__insn_barrier();
	tsc += rdtsc();

	/* Post result.  Ensure the whole value goes out atomically. */
	(void)atomic_swap_64(&tsc_sync_val, tsc);

	if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci) {
		panic("tsc_sync_ap");
	}
}

void
tsc_sync_ap(struct cpu_info *ci)
{

	if (!cpu_hascounter())
		return;

	for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
		tsc_post_ap(ci);
	}
}

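/*
 * Cross-call handler: clear CR4_TSD on this CPU so that rdtsc may be
 * executed from user mode, or set it so that rdtsc faults outside of
 * kernel mode.
 */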
static void
tsc_apply_cpu(void *arg1, void *arg2)
{
	bool enable = arg1 != NULL;
	if (enable) {
		lcr4(rcr4() & ~CR4_TSD);
	} else {
		lcr4(rcr4() | CR4_TSD);
	}
}

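/*
 * Enable or disable user-mode rdtsc on every CPU by broadcasting a
 * cross-call; xc_wait() ensures all CPUs have updated CR4 before these
 * functions return.
 */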
void
tsc_user_enable(void)
{
	uint64_t xc;

	xc = xc_broadcast(0, tsc_apply_cpu, (void *)true, NULL);
	xc_wait(xc);
}

void
tsc_user_disable(void)
{
	uint64_t xc;

	xc = xc_broadcast(0, tsc_apply_cpu, (void *)false, NULL);
	xc_wait(xc);
}

uint64_t
cpu_frequency(struct cpu_info *ci)
{

	return ci->ci_data.cpu_cc_freq;
}

int
cpu_hascounter(void)
{

	return cpu_feature[0] & CPUID_TSC;
}

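/*
 * Busy-wait for roughly "us" microseconds: convert the request to TSC
 * cycles using tsc_freq and spin until that many cycles have elapsed.
 * The unsigned subtraction keeps the comparison correct even if the
 * counter wraps during the wait.
 */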
static void
tsc_delay(unsigned int us)
{
	uint64_t start, delta;

	start = cpu_counter();
	delta = (uint64_t)us * tsc_freq / 1000000;

	while ((cpu_counter() - start) < delta) {
		x86_pause();
	}
}

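/*
 * timecounter(9) read method: return the low 32 bits of the TSC.  On
 * LP64 DIAGNOSTIC kernels, also track the last value seen by the current
 * LWP so that a TSC that steps backwards can be reported.
 */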
static u_int
tsc_get_timecount(struct timecounter *tc)
{
#if defined(_LP64) && defined(DIAGNOSTIC) /* requires atomic 64-bit store */
	static __cpu_simple_lock_t lock = __SIMPLELOCK_UNLOCKED;
	static int lastwarn;
	uint64_t cur, prev;
	lwp_t *l = curlwp;
	int ticks;

	/*
	 * The previous value must be read before the counter and stored
	 * back afterwards, because this routine can be called from
	 * interrupt context and may run over the top of an existing
	 * invocation.  Ordering is guaranteed by "volatile" on md_tsc.
	 */
	prev = l->l_md.md_tsc;
	cur = cpu_counter();
	if (__predict_false(cur < prev) && (cur >> 63) == (prev >> 63) &&
	    __cpu_simple_lock_try(&lock)) {
		ticks = getticks();
		if (ticks - lastwarn >= hz) {
			printf(
			    "WARNING: %s TSC went backwards by %u - "
			    "change sysctl(7) kern.timecounter?\n",
			    cpu_name(curcpu()), (unsigned)(prev - cur));
			lastwarn = ticks;
		}
		__cpu_simple_unlock(&lock);
	}
	l->l_md.md_tsc = cur;
	return (uint32_t)cur;
#else
	return cpu_counter32();
#endif
}

/*
 * tsc has been reset; zero the cached tsc of every lwp in the system
 * so we don't spuriously report that the tsc has gone backward.
 * Caller must ensure all LWPs are quiescent (except the current one,
 * obviously) and interrupts are blocked while we update this.
 */
void
tsc_tc_reset(void)
{
	struct lwp *l;

	LIST_FOREACH(l, &alllwp, l_list)
		l->l_md.md_tsc = 0;
}
469