/*	$NetBSD: tsc.c,v 1.61 2024/10/03 12:29:07 riastradh Exp $	*/

/*-
 * Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tsc.c,v 1.61 2024/10/03 12:29:07 riastradh Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/lwp.h>
#include <sys/atomic.h>
#include <sys/kernel.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#include <sys/lock.h>

#include <machine/cpu_counter.h>
#include <machine/cpuvar.h>
#include <machine/cpufunc.h>
#include <machine/specialreg.h>
#include <machine/cputypes.h>

#include "tsc.h"
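
/*
 * Number of BP<->AP handshake rounds run when measuring TSC skew; see
 * tsc_sync_bp() and tsc_sync_ap() below.
 */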
#define	TSC_SYNC_ROUNDS		1000
#define	ABS(a)			((a) >= 0 ? (a) : -(a))

static u_int	tsc_get_timecount(struct timecounter *);

static void	tsc_delay(unsigned int);

static uint64_t	tsc_dummy_cacheline __cacheline_aligned;
uint64_t	tsc_freq __read_mostly;	/* exported for sysctl */
static int64_t	tsc_drift_max = 1000;	/* max cycles */
static int64_t	tsc_drift_observed;
uint64_t	(*rdtsc)(void) = rdtsc_cpuid;
uint64_t	(*cpu_counter)(void) = cpu_counter_cpuid;
uint32_t	(*cpu_counter32)(void) = cpu_counter32_cpuid;

int tsc_user_enabled = 1;

static volatile int64_t		tsc_sync_val;
static volatile struct cpu_info	*tsc_sync_cpu;

static struct timecounter tsc_timecounter = {
	.tc_get_timecount = tsc_get_timecount,
	.tc_counter_mask = ~0U,
	.tc_name = "TSC",
	.tc_quality = 3000,
};

bool
tsc_is_invariant(void)
{
	struct cpu_info *ci;
	uint32_t descs[4];
	uint32_t family;
	bool invariant;

	if (!cpu_hascounter())
		return false;

	ci = curcpu();
	invariant = false;

	if (cpu_vendor == CPUVENDOR_INTEL) {
		/*
		 * From Intel(tm) 64 and IA-32 Architectures Software
		 * Developer's Manual Volume 3A: System Programming Guide,
		 * Part 1, 17.13 TIME_STAMP COUNTER, these are the processors
		 * where the TSC is known invariant:
		 *
		 * Pentium 4, Intel Xeon (family 0f, models 03 and higher)
		 * Core Solo and Core Duo processors (family 06, model 0e)
		 * Xeon 5100 series and Core 2 Duo (family 06, model 0f)
		 * Core 2 and Xeon (family 06, model 17)
		 * Atom (family 06, model 1c)
		 *
		 * We'll also assume that it's safe on the Pentium, and
		 * that it's safe on P-II and P-III Xeons due to the
		 * typical configuration of those systems.
		 *
		 */
		switch (CPUID_TO_BASEFAMILY(ci->ci_signature)) {
		case 0x05:
			invariant = true;
			break;
		case 0x06:
			invariant = CPUID_TO_MODEL(ci->ci_signature) == 0x0e ||
			    CPUID_TO_MODEL(ci->ci_signature) == 0x0f ||
			    CPUID_TO_MODEL(ci->ci_signature) == 0x17 ||
			    CPUID_TO_MODEL(ci->ci_signature) == 0x1c;
			break;
		case 0x0f:
			invariant = CPUID_TO_MODEL(ci->ci_signature) >= 0x03;
			break;
		}
	} else if (cpu_vendor == CPUVENDOR_AMD) {
		/*
		 * TSC and Power Management Events on AMD Processors
		 * Nov 2, 2005 Rich Brunner, AMD Fellow
		 * http://lkml.org/lkml/2005/11/4/173
		 *
		 * See Appendix E.4.7 CPUID Fn8000_0007_EDX Advanced Power
		 * Management Features, AMD64 Architecture Programmer's
		 * Manual Volume 3: General-Purpose and System Instructions.
		 * The check is done below.
		 */

		/*
		 * AMD Errata 778: Processor Core Time Stamp Counters May
		 * Experience Drift
		 *
		 * This affects all family 15h and family 16h processors.
		 */
		switch (CPUID_TO_FAMILY(ci->ci_signature)) {
		case 0x15:
		case 0x16:
			return false;
		}
	}

	/*
	 * The best way to check whether the TSC is invariant is to check
	 * CPUID leaf 0x80000007.
	 */
	family = CPUID_TO_BASEFAMILY(ci->ci_signature);
	if (((cpu_vendor == CPUVENDOR_INTEL) || (cpu_vendor == CPUVENDOR_AMD))
	    && ((family == 0x06) || (family == 0x0f))) {
		x86_cpuid(0x80000000, descs);
		if (descs[0] >= 0x80000007) {
			x86_cpuid(0x80000007, descs);
			invariant = (descs[3] & CPUID_APM_ITSC) != 0;
		}
	}

	return invariant;
}

/* Set up function pointers for rdtsc() and timecounter(9). */
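/*
 * RDTSC is not a serializing instruction, so each variant below pairs
 * the read with LFENCE, MFENCE or CPUID to keep it from being reordered
 * with surrounding instructions.
 */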
void
tsc_setfunc(struct cpu_info *ci)
{
	bool use_lfence, use_mfence;

	use_lfence = use_mfence = false;

	/*
	 * XXX On AMD, we might be able to use lfence for some cases:
	 *   a) if MSR_DE_CFG exists and bit 1 is set.
	 *   b) family == 0x0f or 0x11. Those have no MSR_DE_CFG and
	 *      lfence is always serializing.
	 *
	 * We don't use it because the test result showed mfence was better
	 * than lfence with MSR_DE_CFG.
	 */
	if (cpu_vendor == CPUVENDOR_AMD)
		use_mfence = true;
	else if (cpu_vendor == CPUVENDOR_INTEL)
		use_lfence = true;

	/* LFENCE and MFENCE are applicable if SSE2 is set. */
	if ((ci->ci_feat_val[0] & CPUID_SSE2) == 0)
		use_lfence = use_mfence = false;

#define TSC_SETFUNC(fence)						\
	do {								\
		rdtsc = rdtsc_##fence;					\
		cpu_counter = cpu_counter_##fence;			\
		cpu_counter32 = cpu_counter32_##fence;			\
	} while (/* CONSTCOND */ 0)

	if (use_lfence)
		TSC_SETFUNC(lfence);
	else if (use_mfence)
		TSC_SETFUNC(mfence);
	else
		TSC_SETFUNC(cpuid);

	aprint_verbose_dev(ci->ci_dev, "Use %s to serialize rdtsc\n",
	    use_lfence ? "lfence" : (use_mfence ? "mfence" : "cpuid"));
}

/*
 * Initialize the TSC timecounter(9) and DELAY() function.
 *
 * This function is called after all secondary processors have been
 * brought up and drift has been measured, and after any other potential
 * delay funcs have been installed (e.g. lapic_delay()).
 */
void
tsc_tc_init(void)
{
	struct cpu_info *ci;
	bool invariant;

	if (!cpu_hascounter())
		return;

	ci = curcpu();
	tsc_freq = ci->ci_data.cpu_cc_freq;
	invariant = tsc_is_invariant();
	if (!invariant) {
		aprint_debug("TSC not known invariant on this CPU\n");
		tsc_timecounter.tc_quality = -100;
	} else if (tsc_drift_observed > tsc_drift_max) {
		aprint_error("ERROR: %lld cycle TSC drift observed\n",
		    (long long)tsc_drift_observed);
		tsc_timecounter.tc_quality = -100;
		invariant = false;
	} else if (vm_guest == VM_GUEST_NO) {
		delay_func = tsc_delay;
	} else if (vm_guest == VM_GUEST_VIRTUALBOX) {
		tsc_timecounter.tc_quality = -100;
	}

	if (tsc_freq != 0) {
		tsc_timecounter.tc_frequency = tsc_freq;
		tc_init(&tsc_timecounter);
	}
}

/*
 * Record drift (in clock cycles).  Called during AP startup.
 */
void
tsc_sync_drift(int64_t drift)
{

	if (drift < 0)
		drift = -drift;
	if (drift > tsc_drift_observed)
		tsc_drift_observed = drift;
}
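
/*
 * One synchronization round, as implemented by tsc_read_bp() (boot
 * processor) and tsc_post_ap() (application processor) below:
 *
 *	1. BP installs itself in tsc_sync_cpu and dirties
 *	   tsc_dummy_cacheline.
 *	2. BP sets CPUF_SYNCTSC; the AP spins until it sees the flag.
 *	3. AP clears CPUF_SYNCTSC; the BP spins until the flag clears.
 *	4. BP reads its TSC; the AP takes a cache miss on
 *	   tsc_dummy_cacheline and then reads its TSC.
 *	5. AP posts its value in tsc_sync_val and clears tsc_sync_cpu;
 *	   the BP picks up both values.
 *
 * tsc_sync_bp() repeats this TSC_SYNC_ROUNDS times and records the
 * difference with the smallest magnitude as the AP's cpu_cc_skew.
 */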
/*
 * Called during startup of APs, by the boot processor.  Interrupts
 * are disabled on entry.
 */
static void __noinline
tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp)
{
	uint64_t bptsc;

	if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL) {
		panic("tsc_sync_bp: 1");
	}

	/* Prepare a cache miss for the other side. */
	(void)atomic_swap_uint((void *)&tsc_dummy_cacheline, 0);

	/* Flag our readiness. */
	atomic_or_uint(&ci->ci_flags, CPUF_SYNCTSC);

	/* Wait for other side then read our TSC. */
	while ((ci->ci_flags & CPUF_SYNCTSC) != 0) {
		__insn_barrier();
	}
	bptsc = rdtsc();

	/* Wait for the results to come in. */
	while (tsc_sync_cpu == ci) {
		x86_pause();
	}
	if (tsc_sync_cpu != NULL) {
		panic("tsc_sync_bp: 2");
	}

	*bptscp = bptsc;
	*aptscp = tsc_sync_val;
}

void
tsc_sync_bp(struct cpu_info *ci)
{
	int64_t bptsc, aptsc, val, diff;

	if (!cpu_hascounter())
		return;

	val = INT64_MAX;
	for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
		tsc_read_bp(ci, &bptsc, &aptsc);
		diff = bptsc - aptsc;
		if (ABS(diff) < ABS(val)) {
			val = diff;
		}
	}

	ci->ci_data.cpu_cc_skew = val;
}

/*
 * Called during startup of AP, by the AP itself.  Interrupts are
 * disabled on entry.
 */
static void __noinline
tsc_post_ap(struct cpu_info *ci)
{
	uint64_t tsc;

	/* Wait for go-ahead from primary. */
	while ((ci->ci_flags & CPUF_SYNCTSC) == 0) {
		__insn_barrier();
	}

	/* Instruct primary to read its counter. */
	atomic_and_uint(&ci->ci_flags, ~CPUF_SYNCTSC);

	/* Suffer a cache miss, then read TSC. */
	__insn_barrier();
	tsc = tsc_dummy_cacheline;
	__insn_barrier();
	tsc += rdtsc();

	/* Post result.  Ensure the whole value goes out atomically. */
	(void)atomic_swap_64(&tsc_sync_val, tsc);

	if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci) {
		panic("tsc_sync_ap");
	}
}

void
tsc_sync_ap(struct cpu_info *ci)
{

	if (!cpu_hascounter())
		return;

	for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
		tsc_post_ap(ci);
	}
}
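
/*
 * Enable or disable userland use of RDTSC on every CPU via a cross
 * call.  When CR4.TSD is set, RDTSC and RDTSCP are privileged and
 * raise #GP if executed outside ring 0.
 */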
static void
tsc_apply_cpu(void *arg1, void *arg2)
{
	bool enable = arg1 != NULL;

	if (enable) {
		lcr4(rcr4() & ~CR4_TSD);
	} else {
		lcr4(rcr4() | CR4_TSD);
	}
}

void
tsc_user_enable(void)
{
	uint64_t xc;

	xc = xc_broadcast(0, tsc_apply_cpu, (void *)true, NULL);
	xc_wait(xc);
}

void
tsc_user_disable(void)
{
	uint64_t xc;

	xc = xc_broadcast(0, tsc_apply_cpu, (void *)false, NULL);
	xc_wait(xc);
}

uint64_t
cpu_frequency(struct cpu_info *ci)
{

	return ci->ci_data.cpu_cc_freq;
}

int
cpu_hascounter(void)
{

	return cpu_feature[0] & CPUID_TSC;
}

static void
tsc_delay(unsigned int us)
{
	uint64_t start, delta;

	start = cpu_counter();
	delta = (uint64_t)us * tsc_freq / 1000000;

	while ((cpu_counter() - start) < delta) {
		x86_pause();
	}
}

static u_int
tsc_get_timecount(struct timecounter *tc)
{
#if defined(_LP64) && defined(DIAGNOSTIC)	/* requires atomic 64-bit store */
	static __cpu_simple_lock_t lock = __SIMPLELOCK_UNLOCKED;
	static int lastwarn;
	uint64_t cur, prev;
	lwp_t *l = curlwp;
	int ticks;

	/*
	 * Previous value must be read before the counter and stored to
	 * after, because this routine can be called from interrupt context
	 * and may run over the top of an existing invocation.  Ordering is
	 * guaranteed by "volatile" on md_tsc.
	 */
	prev = l->l_md.md_tsc;
	cur = cpu_counter();
	if (__predict_false(cur < prev) && (cur >> 63) == (prev >> 63) &&
	    __cpu_simple_lock_try(&lock)) {
		ticks = getticks();
		if (ticks - lastwarn >= hz) {
			printf(
			    "WARNING: %s TSC went backwards by %u - "
			    "change sysctl(7) kern.timecounter?\n",
			    cpu_name(curcpu()), (unsigned)(prev - cur));
			lastwarn = ticks;
		}
		__cpu_simple_unlock(&lock);
	}
	l->l_md.md_tsc = cur;
	return (uint32_t)cur;
#else
	return cpu_counter32();
#endif
}

/*
 * tsc has been reset; zero the cached tsc of every lwp in the system
 * so we don't spuriously report that the tsc has gone backward.
 * Caller must ensure all LWPs are quiescent (except the current one,
 * obviously) and interrupts are blocked while we update this.
 */
void
tsc_tc_reset(void)
{
	struct lwp *l;

	LIST_FOREACH(l, &alllwp, l_list)
		l->l_md.md_tsc = 0;
}