/*	$NetBSD: xen_clock.c,v 1.20 2024/12/01 20:36:00 andvar Exp $	*/

/*-
 * Copyright (c) 2017, 2018 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Taylor R. Campbell.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "opt_xen.h"

#ifndef XEN_CLOCK_DEBUG
#define XEN_CLOCK_DEBUG 0
#endif

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xen_clock.c,v 1.20 2024/12/01 20:36:00 andvar Exp $");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/callout.h>
#include <sys/cpu.h>
#include <sys/device.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/lwp.h>
#include <sys/proc.h>
#include <sys/sdt.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/timetc.h>

#include <dev/clock_subr.h>

#include <machine/cpu.h>
#include <machine/cpu_counter.h>
#include <machine/lock.h>

#include <xen/evtchn.h>
#include <xen/hypervisor.h>
#include <xen/include/public/vcpu.h>
#include <xen/xen.h>

#include <x86/rtc.h>

#define	NS_PER_TICK	((uint64_t)1000000000ULL/hz)

static uint64_t	xen_vcputime_systime_ns(void);
static uint64_t	xen_vcputime_raw_systime_ns(void);
static uint64_t	xen_global_systime_ns(void);
static unsigned	xen_get_timecount(struct timecounter *);
static int	xen_timer_handler(void *, struct clockframe *);

/*
 * dtrace probes
 */
SDT_PROBE_DEFINE7(sdt, xen, clock, tsc__backward,
    "uint64_t"/*raw_systime_ns*/,
    "uint64_t"/*tsc_timestamp*/,
    "uint64_t"/*tsc_to_system_mul*/,
    "int"/*tsc_shift*/,
    "uint64_t"/*delta_ns*/,
    "uint64_t"/*tsc*/,
    "uint64_t"/*systime_ns*/);
SDT_PROBE_DEFINE7(sdt, xen, clock, tsc__delta__negative,
    "uint64_t"/*raw_systime_ns*/,
    "uint64_t"/*tsc_timestamp*/,
    "uint64_t"/*tsc_to_system_mul*/,
    "int"/*tsc_shift*/,
    "uint64_t"/*delta_ns*/,
    "uint64_t"/*tsc*/,
    "uint64_t"/*systime_ns*/);
SDT_PROBE_DEFINE7(sdt, xen, clock, systime__wraparound,
    "uint64_t"/*raw_systime_ns*/,
    "uint64_t"/*tsc_timestamp*/,
    "uint64_t"/*tsc_to_system_mul*/,
    "int"/*tsc_shift*/,
    "uint64_t"/*delta_ns*/,
    "uint64_t"/*tsc*/,
    "uint64_t"/*systime_ns*/);
SDT_PROBE_DEFINE7(sdt, xen, clock, systime__backward,
    "uint64_t"/*raw_systime_ns*/,
    "uint64_t"/*tsc_timestamp*/,
    "uint64_t"/*tsc_to_system_mul*/,
    "int"/*tsc_shift*/,
    "uint64_t"/*delta_ns*/,
    "uint64_t"/*tsc*/,
    "uint64_t"/*systime_ns*/);

SDT_PROBE_DEFINE3(sdt, xen, timecounter, backward,
    "uint64_t"/*local*/,
    "uint64_t"/*skew*/,
    "uint64_t"/*global*/);

SDT_PROBE_DEFINE2(sdt, xen, hardclock, systime__backward,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/);
SDT_PROBE_DEFINE2(sdt, xen, hardclock, tick,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/);
SDT_PROBE_DEFINE3(sdt, xen, hardclock, jump,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/,
    "uint64_t"/*nticks*/);
SDT_PROBE_DEFINE3(sdt, xen, hardclock, missed,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/,
    "uint64_t"/*remaining_ns*/);

/*
 * xen timecounter:
 *
 *	Xen vCPU system time, plus an adjustment with rdtsc.
 */
static struct timecounter xen_timecounter = {
	.tc_get_timecount = xen_get_timecount,
	.tc_poll_pps = NULL,
	.tc_counter_mask = ~0U,
	.tc_frequency = 1000000000ULL,	/* 1 GHz, i.e. units of nanoseconds */
	.tc_name = "xen_system_time",
	.tc_quality = 10000,
};

/*
 * xen_global_systime_ns_stamp
 *
 *	The latest Xen vCPU system time that has been observed on any
 *	CPU, for a global monotonic view of the Xen system time clock.
 */
static volatile uint64_t xen_global_systime_ns_stamp __cacheline_aligned;

#ifdef DOM0OPS
/*
 * xen timepush state:
 *
 *	Callout to periodically, after a sysctl-configurable number of
 *	NetBSD ticks, set the Xen hypervisor's wall clock time.
 */
static struct {
	struct callout ch;
	int ticks;
} xen_timepush;

static void	xen_timepush_init(void);
static void	xen_timepush_intr(void *);
static int	sysctl_xen_timepush(SYSCTLFN_ARGS);
#endif

/*
 * xen_rdtsc()
 *
 *	Read the local pCPU's tsc.
 */
static inline uint64_t
xen_rdtsc(void)
{
	uint32_t lo, hi;

	asm volatile("rdtsc" : "=a"(lo), "=d"(hi));

	return ((uint64_t)hi << 32) | lo;
}

/*
 * struct xen_vcputime_ticket
 *
 *	State for a vCPU read section, during which a caller may read
 *	from fields of a struct vcpu_time_info and call xen_rdtsc.
 *	Caller must enter with xen_vcputime_enter, exit with
 *	xen_vcputime_exit, and be prepared to retry if
 *	xen_vcputime_exit fails.
 */
struct xen_vcputime_ticket {
	uint64_t version;
};

/*
 * xen_vcputime_enter(tp)
 *
 *	Enter a vCPU time read section and store a ticket in *tp, which
 *	the caller must use with xen_vcputime_exit.  Return a pointer
 *	to the current CPU's vcpu_time_info structure.  Caller must
 *	already be bound to the CPU.
 */
static inline volatile struct vcpu_time_info *
xen_vcputime_enter(struct xen_vcputime_ticket *tp)
{
	volatile struct vcpu_time_info *vt = &curcpu()->ci_vcpu->time;

	while (__predict_false(1 & (tp->version = vt->version)))
		SPINLOCK_BACKOFF_HOOK;

	/*
	 * Must read the version before reading the tsc on the local
	 * pCPU.  We are racing only with interruption by the
	 * hypervisor, so no need for a stronger memory barrier.
	 */
	__insn_barrier();

	return vt;
}

/*
 * xen_vcputime_exit(vt, tp)
 *
 *	Exit a vCPU time read section with the ticket in *tp from
 *	xen_vcputime_enter.  Return true on success, false if caller
 *	must retry.
 */
static inline bool
xen_vcputime_exit(volatile struct vcpu_time_info *vt,
    struct xen_vcputime_ticket *tp)
{

	KASSERT(vt == &curcpu()->ci_vcpu->time);

	/*
	 * Must read the tsc before re-reading the version on the local
	 * pCPU.  We are racing only with interruption by the
	 * hypervisor, so no need for a stronger memory barrier.
	 */
	__insn_barrier();

	return tp->version == vt->version;
}

/*
 * xen_tsc_to_ns_delta(delta_tsc, mul_frac, shift)
 *
 *	Convert a difference in tsc units to a difference in
 *	nanoseconds given a multiplier and shift for the unit
 *	conversion.
 */
static inline uint64_t
xen_tsc_to_ns_delta(uint64_t delta_tsc, uint32_t tsc_to_system_mul,
    int8_t tsc_shift)
{
	uint32_t delta_tsc_hi, delta_tsc_lo;

	if (tsc_shift < 0)
		delta_tsc >>= -tsc_shift;
	else
		delta_tsc <<= tsc_shift;

	delta_tsc_hi = delta_tsc >> 32;
	delta_tsc_lo = delta_tsc & 0xffffffffUL;

	/* d*m/2^32 = (2^32 d_h + d_l)*m/2^32 = d_h*m + (d_l*m)/2^32 */
	return ((uint64_t)delta_tsc_hi * tsc_to_system_mul) +
	    (((uint64_t)delta_tsc_lo * tsc_to_system_mul) >> 32);
}
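
/*
 * Worked example of the conversion above, with hypothetical parameters
 * (the real ones come from the vcpu_time_info page): for a 2 GHz tsc,
 * Xen could advertise tsc_shift = 0 and tsc_to_system_mul = 2^31,
 * since each tsc tick is half a nanosecond.  Then, for
 * delta_tsc = 2000000 (1 ms at 2 GHz):
 *
 *	delta_tsc_hi = 0, delta_tsc_lo = 2000000
 *	delta_ns = 0*2^31 + ((2000000*2^31) >> 32) = 1000000 ns
 *
 * which is the expected 1 ms.
 */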

/*
 * xen_vcputime_systime_ns()
 *
 *	Return a snapshot of the Xen system time plus an adjustment
 *	from the tsc, in units of nanoseconds.  Caller must be bound to
 *	the current CPU.
 */
static uint64_t
xen_vcputime_systime_ns(void)
{
	volatile struct vcpu_time_info *vt;
	struct cpu_info *ci = curcpu();
	struct xen_vcputime_ticket ticket;
	uint64_t raw_systime_ns, tsc_timestamp, tsc, delta_tsc, delta_ns;
	uint32_t tsc_to_system_mul;
	int8_t tsc_shift;
	uint64_t systime_ns;

	/* We'd better be bound to the CPU in _some_ way.  */
	KASSERT(cpu_intr_p() || cpu_softintr_p() || kpreempt_disabled() ||
	    (curlwp->l_flag & LP_BOUND));

	/*
	 * Repeatedly try to read the system time, corresponding tsc
	 * timestamp, and tsc frequency until we get a consistent view.
	 */
	do {
		vt = xen_vcputime_enter(&ticket);

		/* Grab Xen's snapshot of raw system time and tsc.  */
		raw_systime_ns = vt->system_time;
		tsc_timestamp = vt->tsc_timestamp;

		/* Get Xen's current idea of how fast the tsc is counting.  */
		tsc_to_system_mul = vt->tsc_to_system_mul;
		tsc_shift = vt->tsc_shift;

		/* Read the CPU's tsc.  */
		tsc = xen_rdtsc();
	} while (!xen_vcputime_exit(vt, &ticket));

	/*
	 * Out of paranoia, check whether the tsc has gone backwards
	 * since Xen's timestamp.
	 *
	 * This shouldn't happen because the Xen hypervisor is supposed
	 * to have read the tsc _before_ writing to the vcpu_time_info
	 * page, _before_ we read the tsc.
	 *
	 * Further, if we switched pCPUs after reading the tsc
	 * timestamp but before reading the CPU's tsc, the hypervisor
	 * had better notify us by updating the version too and forcing
	 * us to retry the vCPU time read.
	 */
	if (__predict_false(tsc < tsc_timestamp)) {
		/*
		 * Notify the console that the CPU's tsc appeared to
		 * run behind Xen's idea of it, and pretend it hadn't.
		 */
		SDT_PROBE7(sdt, xen, clock, tsc__backward,
		    raw_systime_ns, tsc_timestamp,
		    tsc_to_system_mul, tsc_shift, /*delta_ns*/0, tsc,
		    /*systime_ns*/raw_systime_ns);
#if XEN_CLOCK_DEBUG
		device_printf(ci->ci_dev, "xen cpu tsc %"PRIu64
		    " ran backwards from timestamp %"PRIu64
		    " by %"PRIu64"\n",
		    tsc, tsc_timestamp, tsc_timestamp - tsc);
#endif
		ci->ci_xen_cpu_tsc_backwards_evcnt.ev_count++;
		delta_ns = delta_tsc = 0;
	} else {
		/* Find how far the CPU's tsc has advanced.  */
		delta_tsc = tsc - tsc_timestamp;

		/* Convert the tsc delta to a nanosecond delta.  */
		delta_ns = xen_tsc_to_ns_delta(delta_tsc, tsc_to_system_mul,
		    tsc_shift);
	}

	/*
	 * Notify the console if the delta computation yielded a
	 * negative value, and pretend it hadn't.
	 *
	 * This doesn't make sense but I include it out of paranoia.
	 */
	if (__predict_false((int64_t)delta_ns < 0)) {
		SDT_PROBE7(sdt, xen, clock, tsc__delta__negative,
		    raw_systime_ns, tsc_timestamp,
		    tsc_to_system_mul, tsc_shift, delta_ns, tsc,
		    /*systime_ns*/raw_systime_ns);
#if XEN_CLOCK_DEBUG
		device_printf(ci->ci_dev, "xen tsc delta in ns went negative:"
		    " %"PRId64"\n", delta_ns);
#endif
		ci->ci_xen_tsc_delta_negative_evcnt.ev_count++;
		delta_ns = 0;
	}

	/*
	 * Compute the TSC-adjusted system time.
	 */
	systime_ns = raw_systime_ns + delta_ns;

	/*
	 * Notify the console if the addition wrapped around.
	 *
	 * This shouldn't happen because system time should be relative
	 * to a reasonable reference point, not centuries in the past.
	 * (2^64 ns is approximately half a millennium.)
	 */
	if (__predict_false(systime_ns < raw_systime_ns)) {
		SDT_PROBE7(sdt, xen, clock, systime__wraparound,
		    raw_systime_ns, tsc_timestamp,
		    tsc_to_system_mul, tsc_shift, delta_ns, tsc,
		    systime_ns);
#if XEN_CLOCK_DEBUG
		printf("xen raw systime + tsc delta wrapped around:"
		    " %"PRIu64" + %"PRIu64" = %"PRIu64"\n",
		    raw_systime_ns, delta_ns, systime_ns);
#endif
		ci->ci_xen_raw_systime_wraparound_evcnt.ev_count++;
	}

	/*
	 * Notify the console if the TSC-adjusted Xen system time
	 * appears to have gone backwards, and pretend we had gone
	 * forward.  This seems to happen pretty regularly under load.
	 */
	if (__predict_false(ci->ci_xen_last_systime_ns > systime_ns)) {
		SDT_PROBE7(sdt, xen, clock, systime__backward,
		    raw_systime_ns, tsc_timestamp,
		    tsc_to_system_mul, tsc_shift, delta_ns, tsc,
		    systime_ns);
#if XEN_CLOCK_DEBUG
		printf("xen raw systime + tsc delta went backwards:"
		    " %"PRIu64" > %"PRIu64"\n",
		    ci->ci_xen_last_systime_ns, systime_ns);
		printf(" raw_systime_ns=%"PRIu64"\n tsc_timestamp=%"PRIu64"\n"
		    " tsc=%"PRIu64"\n tsc_to_system_mul=%"PRIu32"\n"
		    " tsc_shift=%"PRId8"\n delta_tsc=%"PRIu64"\n"
		    " delta_ns=%"PRIu64"\n",
		    raw_systime_ns, tsc_timestamp, tsc, tsc_to_system_mul,
		    tsc_shift, delta_tsc, delta_ns);
#endif
		ci->ci_xen_raw_systime_backwards_evcnt.ev_count++;
		systime_ns = ci->ci_xen_last_systime_ns + 1;
	}

	/* Remember the TSC-adjusted Xen system time.  */
	ci->ci_xen_last_systime_ns = systime_ns;

	/* We had better not have migrated CPUs.  */
	KASSERT(ci == curcpu());

	/* And we're done: return the TSC-adjusted systime in nanoseconds.  */
	return systime_ns;
}
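
/*
 * Note that the clamp above (ci_xen_last_systime_ns + 1) only enforces
 * monotonicity of the TSC-adjusted system time as seen by one CPU;
 * monotonicity across CPUs is handled separately by the skew logic in
 * xen_global_systime_ns below.
 */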

/*
 * xen_vcputime_raw_systime_ns()
 *
 *	Return a snapshot of the current Xen system time to the
 *	resolution of the Xen hypervisor tick, in units of nanoseconds.
 */
static uint64_t
xen_vcputime_raw_systime_ns(void)
{
	volatile struct vcpu_time_info *vt;
	struct xen_vcputime_ticket ticket;
	uint64_t raw_systime_ns;

	do {
		vt = xen_vcputime_enter(&ticket);
		raw_systime_ns = vt->system_time;
	} while (!xen_vcputime_exit(vt, &ticket));

	return raw_systime_ns;
}

/*
 * struct xen_wallclock_ticket
 *
 *	State for a wall clock read section, during which a caller may
 *	read from the wall clock fields of HYPERVISOR_shared_info.
 *	Caller must enter with xen_wallclock_enter, exit with
 *	xen_wallclock_exit, and be prepared to retry if
 *	xen_wallclock_exit fails.
 */
struct xen_wallclock_ticket {
	uint32_t version;
};

/*
 * xen_wallclock_enter(tp)
 *
 *	Enter a wall clock read section and store a ticket in *tp,
 *	which the caller must use with xen_wallclock_exit.
 */
static inline void
xen_wallclock_enter(struct xen_wallclock_ticket *tp)
{

	while (__predict_false(1 & (tp->version =
		    HYPERVISOR_shared_info->wc_version)))
		SPINLOCK_BACKOFF_HOOK;

	/*
	 * Must read the version from memory before reading the
	 * timestamp from memory, as written potentially by another
	 * pCPU.
	 */
	membar_consumer();
}

/*
 * xen_wallclock_exit(tp)
 *
 *	Exit a wall clock read section with the ticket in *tp from
 *	xen_wallclock_enter.  Return true on success, false if caller
 *	must retry.
 */
static inline bool
xen_wallclock_exit(struct xen_wallclock_ticket *tp)
{

	/*
	 * Must read the timestamp from memory before re-reading the
	 * version from memory, as written potentially by another pCPU.
	 */
	membar_consumer();

	return tp->version == HYPERVISOR_shared_info->wc_version;
}

/*
 * xen_global_systime_ns()
 *
 *	Return a global monotonic view of the system time in
 *	nanoseconds, computed by the per-CPU Xen raw system time plus
 *	an rdtsc adjustment, and advance the view of the system time
 *	for all other CPUs.
 */
static uint64_t
xen_global_systime_ns(void)
{
	struct cpu_info *ci;
	uint64_t local, global, skew, result;

	/*
	 * Find the local timecount on this CPU, and make sure it does
	 * not precede the latest global timecount witnessed so far by
	 * any CPU.  If it does, add to the local CPU's skew from the
	 * fastest CPU.
	 *
	 * XXX Can we avoid retrying if the CAS fails?
	 */
	int s = splsched(); /* make sure we won't be interrupted */
	ci = curcpu();
	do {
		local = xen_vcputime_systime_ns();
		skew = ci->ci_xen_systime_ns_skew;
		global = xen_global_systime_ns_stamp;
		if (__predict_false(local + skew < global + 1)) {
			SDT_PROBE3(sdt, xen, timecounter, backward,
			    local, skew, global);
#if XEN_CLOCK_DEBUG
			device_printf(ci->ci_dev,
			    "xen timecounter went backwards:"
			    " local=%"PRIu64" skew=%"PRIu64" global=%"PRIu64","
			    " adding %"PRIu64" to skew\n",
			    local, skew, global, global + 1 - (local + skew));
#endif
			ci->ci_xen_timecounter_backwards_evcnt.ev_count++;
			result = global + 1;
			ci->ci_xen_systime_ns_skew += global + 1 -
			    (local + skew);
		} else {
			result = local + skew;
		}
	} while (atomic_cas_64(&xen_global_systime_ns_stamp, global, result)
	    != global);
	KASSERT(ci == curcpu());
	splx(s);

	return result;
}

/*
 * xen_get_timecount(tc)
 *
 *	Return the low 32 bits of a global monotonic view of the Xen
 *	system time.
 */
static unsigned
xen_get_timecount(struct timecounter *tc)
{

	KASSERT(tc == &xen_timecounter);

	return (unsigned)xen_global_systime_ns();
}
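
/*
 * With tc_counter_mask = ~0U and tc_frequency = 1 GHz, the count
 * returned above wraps modulo 2^32 ns, i.e. roughly every 4.29
 * seconds.  timecounter(9) copes with that as long as the clock is
 * wound up (via hardclock) more often than the wrap period;
 * xen_timer_handler below warns if a hardclock gap ever exceeds this
 * bound.
 */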

/*
 * xen_delay(n)
 *
 *	Wait approximately n microseconds.
 */
void
xen_delay(unsigned n)
{
	int bound;

	/* Bind to the CPU so we don't compare tsc on different CPUs.  */
	bound = curlwp_bind();

	if (curcpu()->ci_vcpu == NULL) {
		curlwp_bindx(bound);
		return;
	}

	/* Short wait (<500us) or long wait?  */
	if (n < 500000) {
		/*
		 * Xen system time is not precise enough for short
		 * delays, so use the tsc instead.
		 *
		 * We work with the current tsc frequency, and figure
		 * that if it changes while we're delaying, we've
		 * probably delayed long enough -- up to 500us.
		 *
		 * We do not use cpu_frequency(ci), which uses a
		 * quantity detected at boot time, and which may have
		 * changed by now if Xen has migrated this vCPU to
		 * another pCPU.
		 *
		 * XXX How long does it take to migrate pCPUs?
		 */
		volatile struct vcpu_time_info *vt;
		struct xen_vcputime_ticket ticket;
		uint64_t tsc_start, last_tsc, tsc;
		uint32_t tsc_to_system_mul;
		int8_t tsc_shift;

		/* Get the starting tsc and tsc frequency.  */
		do {
			vt = xen_vcputime_enter(&ticket);
			tsc_start = last_tsc = xen_rdtsc();
			tsc_to_system_mul = vt->tsc_to_system_mul;
			tsc_shift = vt->tsc_shift;
		} while (!xen_vcputime_exit(vt, &ticket));

		/*
		 * Wait until as many tsc ticks as there are in n
		 * microseconds have elapsed, or the tsc has gone
		 * backwards meaning we've probably migrated pCPUs.
		 */
		for (;;) {
			tsc = xen_rdtsc();
			if (__predict_false(tsc < last_tsc))
				break;
			if (xen_tsc_to_ns_delta(tsc - tsc_start,
				tsc_to_system_mul, tsc_shift)/1000 >= n)
				break;
			last_tsc = tsc;
		}
	} else {
		/*
		 * Use the Xen system time for >=500us delays.  From my
		 * testing, it seems to sometimes run backward by about
		 * 110us, which is not so bad.
		 */
		uint64_t n_ns = 1000*(uint64_t)n;
		uint64_t start_ns;

		/* Get the start time.  */
		start_ns = xen_vcputime_raw_systime_ns();

		/* Wait until the system time has passed the end.  */
		do {
			HYPERVISOR_yield();
		} while (xen_vcputime_raw_systime_ns() - start_ns < n_ns);
	}

	/* Unbind from the CPU if we weren't already bound.  */
	curlwp_bindx(bound);
}
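
/*
 * Usage sketch: xen_delay(10) busy-waits roughly ten microseconds on
 * the tsc, while a long delay such as xen_delay(1000000), i.e. one
 * second, takes the second branch above and yields the vCPU to the
 * hypervisor between polls of the Xen system time.
 */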

/*
 * xen_suspendclocks(ci)
 *
 *	Stop handling the Xen timer event on the CPU of ci.  Caller
 *	must be running on and bound to ci's CPU.
 *
 *	Actually, caller must have kpreemption disabled, because that's
 *	easier to assert at the moment.
 */
void
xen_suspendclocks(struct cpu_info *ci)
{
	int evtch;

	KASSERT(ci == curcpu());
	KASSERT(kpreempt_disabled());

	/*
	 * Find the VIRQ_TIMER event channel and close it so new timer
	 * interrupt events stop getting delivered to it.
	 *
	 * XXX Should this happen later?  This is not the reverse order
	 * of xen_resumeclocks.  It is apparently necessary in this
	 * order only because we don't stash evtchn anywhere, but we
	 * could stash it.
	 */
	evtch = unbind_virq_from_evtch(VIRQ_TIMER);
	KASSERT(evtch != -1);

	/*
	 * Mask the event channel so we stop getting new interrupts on
	 * it.
	 */
	hypervisor_mask_event(evtch);

	/*
	 * Now that we are no longer getting new interrupts, remove the
	 * handler and wait for any existing calls to the handler to
	 * complete.  After this point, there can be no concurrent
	 * calls to xen_timer_handler.
	 */
	event_remove_handler(evtch,
	    __FPTRCAST(int (*)(void *), xen_timer_handler), ci);

	aprint_verbose("Xen clock: removed event channel %d\n", evtch);

	/* We'd better not have switched CPUs.  */
	KASSERT(ci == curcpu());
}

/*
 * xen_resumeclocks(ci)
 *
 *	Start handling the Xen timer event on the CPU of ci.  Arm the
 *	Xen timer.  Caller must be running on and bound to ci's CPU.
 *
 *	Actually, caller must have kpreemption disabled, because that's
 *	easier to assert at the moment.
 */
void
xen_resumeclocks(struct cpu_info *ci)
{
	char intr_xname[INTRDEVNAMEBUF];
	int evtch;
	int error __diagused;

	KASSERT(ci == curcpu());
	KASSERT(kpreempt_disabled());

	/*
	 * Allocate an event channel to receive VIRQ_TIMER events.
	 */
	evtch = bind_virq_to_evtch(VIRQ_TIMER);
	KASSERT(evtch != -1);

	/*
	 * Set an event handler for VIRQ_TIMER events to call
	 * xen_timer_handler.
	 */
	snprintf(intr_xname, sizeof(intr_xname), "%s clock",
	    device_xname(ci->ci_dev));
	/* XXX sketchy function pointer cast -- fix the API, please */
	if (event_set_handler(evtch,
		__FPTRCAST(int (*)(void *), xen_timer_handler),
		ci, IPL_CLOCK, NULL, intr_xname, true, ci) == NULL)
		panic("failed to establish timer interrupt handler");

	aprint_verbose("Xen %s: using event channel %d\n", intr_xname, evtch);

	/* Disarm the periodic timer on Xen>=3.1 which is allegedly buggy.  */
	if (XEN_MAJOR(xen_version) > 3 || XEN_MINOR(xen_version) > 0) {
		error = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
		    ci->ci_vcpuid, NULL);
		KASSERT(error == 0);
	}

	/* Pretend the last hardclock happened right now.  */
	ci->ci_xen_hardclock_systime_ns = xen_vcputime_systime_ns();

	/* Arm the one-shot timer.  */
	error = HYPERVISOR_set_timer_op(ci->ci_xen_hardclock_systime_ns +
	    NS_PER_TICK);
	KASSERT(error == 0);

	/*
	 * Ready to go.  Unmask the event.  After this point, Xen may
	 * start calling xen_timer_handler.
	 */
	hypervisor_unmask_event(evtch);

	/* We'd better not have switched CPUs.  */
	KASSERT(ci == curcpu());
}

/*
 * xen_timer_handler(cookie, frame)
 *
 *	Periodic Xen timer event handler for NetBSD hardclock.  Calls
 *	to this may get delayed, so we run hardclock as many times as
 *	we need to in order to cover the Xen system time that elapsed.
 *	After that, re-arm the timer to run again at the next tick.
 *	The cookie is the pointer to struct cpu_info.
 */
static int
xen_timer_handler(void *cookie, struct clockframe *frame)
{
	const uint64_t ns_per_tick = NS_PER_TICK;
	struct cpu_info *ci = curcpu();
	uint64_t last, now, delta, next;
	int error;

	KASSERT(cpu_intr_p());
	KASSERT(cookie == ci);

#if defined(XENPV)
	frame = NULL; /* We use values cached in curcpu()  */
#endif
	/*
	 * Find how many nanoseconds of Xen system time have elapsed
	 * since the last hardclock tick.
	 */
	last = ci->ci_xen_hardclock_systime_ns;
	now = xen_vcputime_systime_ns();
	SDT_PROBE2(sdt, xen, hardclock, tick, last, now);
	if (__predict_false(now < last)) {
		SDT_PROBE2(sdt, xen, hardclock, systime__backward,
		    last, now);
#if XEN_CLOCK_DEBUG
		device_printf(ci->ci_dev, "xen systime ran backwards"
		    " in hardclock %"PRIu64"ns\n",
		    last - now);
#endif
		ci->ci_xen_systime_backwards_hardclock_evcnt.ev_count++;
		/*
		 * We've lost track of time.  Just pretend that one
		 * tick elapsed, and reset our idea of the last tick.
		 */
		ci->ci_xen_hardclock_systime_ns = last = now - ns_per_tick;
	}
	delta = now - last;

	/*
	 * Play hardclock catchup: run the hardclock timer as many
	 * times as appears necessary based on how much time has
	 * passed.
	 */
	if (__predict_false(delta >= 2*ns_per_tick)) {
		SDT_PROBE3(sdt, xen, hardclock, jump,
		    last, now, delta/ns_per_tick);

		/*
		 * Warn if we violate timecounter(9) contract: with a
		 * k-bit timecounter (here k = 32), and timecounter
		 * frequency f (here f = 1 GHz), the maximum period
		 * between hardclock calls is 2^k / f.
		 */
		if (delta > xen_timecounter.tc_counter_mask) {
			printf("WARNING: hardclock skipped %"PRIu64"ns"
			    " (%"PRIu64" -> %"PRIu64"),"
			    " exceeding maximum of %"PRIu32"ns"
			    " for timecounter(9)\n",
			    delta, last, now,
			    xen_timecounter.tc_counter_mask);
			ci->ci_xen_timecounter_jump_evcnt.ev_count++;
		}
		/* don't try to catch up more than one second at once */
		if (delta > 1000000000UL)
			delta = 1000000000UL;
	}
	while (delta >= ns_per_tick) {
		ci->ci_xen_hardclock_systime_ns += ns_per_tick;
		delta -= ns_per_tick;
		hardclock(frame);
		if (__predict_false(delta >= ns_per_tick)) {
			SDT_PROBE3(sdt, xen, hardclock, missed,
			    last, now, delta);
			ci->ci_xen_missed_hardclock_evcnt.ev_count++;
		}
	}

	/*
	 * Re-arm the timer.  If it fails, it's probably because the
	 * time is in the past, possibly because we're in the
	 * process of catching up missed hardclock calls.
	 * In that case, schedule a tick in the near future.
	 */
	next = ci->ci_xen_hardclock_systime_ns + ns_per_tick;
	error = HYPERVISOR_set_timer_op(next);
	if (error) {
		next = xen_vcputime_systime_ns() + ns_per_tick / 2;
		error = HYPERVISOR_set_timer_op(next);
		if (error) {
			panic("failed to re-arm Xen timer %d", error);
		}
	}

	/* Success!  */
	return 0;
}
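
/*
 * Example of the catch-up logic above, assuming hz = 100 so that
 * ns_per_tick = 10000000 ns: if this vCPU was not run for 35 ms, then
 * delta = 35000000 ns, hardclock() is called three times (advancing
 * ci_xen_hardclock_systime_ns by 30 ms), and the remaining 5 ms just
 * shortens the wait before the next one-shot timer interrupt.
 */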

/*
 * xen_initclocks()
 *
 *	Initialize the Xen clocks on the current CPU.
 */
void
xen_initclocks(void)
{
	struct cpu_info *ci = curcpu();

	/* If this is the primary CPU, do global initialization first.  */
	if (ci == &cpu_info_primary) {
		/* Initialize the systemwide Xen timecounter.  */
		tc_init(&xen_timecounter);
	}

	/* Attach the event counters.  */
	evcnt_attach_dynamic(&ci->ci_xen_cpu_tsc_backwards_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "cpu tsc ran backwards");
	evcnt_attach_dynamic(&ci->ci_xen_tsc_delta_negative_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "tsc delta went negative");
	evcnt_attach_dynamic(&ci->ci_xen_raw_systime_wraparound_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "raw systime wrapped around");
	evcnt_attach_dynamic(&ci->ci_xen_raw_systime_backwards_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "raw systime went backwards");
	evcnt_attach_dynamic(&ci->ci_xen_systime_backwards_hardclock_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "systime went backwards in hardclock");
	evcnt_attach_dynamic(&ci->ci_xen_missed_hardclock_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "missed hardclock");
	evcnt_attach_dynamic(&ci->ci_xen_timecounter_backwards_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "timecounter went backwards");
	evcnt_attach_dynamic(&ci->ci_xen_timecounter_jump_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "hardclock jumped past timecounter max");

	/* Fire up the clocks.  */
	xen_resumeclocks(ci);

#ifdef DOM0OPS
	/*
	 * If this is a privileged dom0, start pushing the wall
	 * clock time back to the Xen hypervisor.
	 */
	if (ci == &cpu_info_primary && xendomain_is_privileged())
		xen_timepush_init();
#endif
}

#ifdef DOM0OPS

/*
 * xen_timepush_init()
 *
 *	Initialize callout to periodically set Xen hypervisor's wall
 *	clock time.
 */
static void
xen_timepush_init(void)
{
	struct sysctllog *log = NULL;
	const struct sysctlnode *node = NULL;
	int error;

	/* Start periodically updating the hypervisor's wall clock time.  */
	callout_init(&xen_timepush.ch, 0);
	callout_setfunc(&xen_timepush.ch, xen_timepush_intr, NULL);

	/* Pick a default frequency for timepush.  */
	xen_timepush.ticks = 53*hz + 3; /* avoid exact # of min/sec */

	/* Create machdep.xen node.  */
	/* XXX Creation of the `machdep.xen' node should be elsewhere.  */
	error = sysctl_createv(&log, 0, NULL, &node, 0,
	    CTLTYPE_NODE, "xen",
	    SYSCTL_DESCR("Xen top level node"),
	    NULL, 0, NULL, 0,
	    CTL_MACHDEP, CTL_CREATE, CTL_EOL);
	if (error)
		goto fail;
	KASSERT(node != NULL);

	/* Create int machdep.xen.timepush_ticks knob.  */
	error = sysctl_createv(&log, 0, &node, NULL, CTLFLAG_READWRITE,
	    CTLTYPE_INT, "timepush_ticks",
	    SYSCTL_DESCR("How often to update the hypervisor's time-of-day;"
		" 0 to disable"),
	    sysctl_xen_timepush, 0, &xen_timepush.ticks, 0,
	    CTL_CREATE, CTL_EOL);
	if (error)
		goto fail;

	/* Start the timepush callout.  */
	callout_schedule(&xen_timepush.ch, xen_timepush.ticks);

	/* Success!  */
	return;

fail:	sysctl_teardown(&log);
}

/*
 * xen_timepush_intr(cookie)
 *
 *	Callout interrupt handler to push NetBSD's idea of the wall
 *	clock time, usually synchronized with NTP, back to the Xen
 *	hypervisor.
 */
static void
xen_timepush_intr(void *cookie)
{

	resettodr();
	if (xen_timepush.ticks)
		callout_schedule(&xen_timepush.ch, xen_timepush.ticks);
}

/*
 * sysctl_xen_timepush(...)
 *
 *	Sysctl handler to set machdep.xen.timepush_ticks.
 */
static int
sysctl_xen_timepush(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int ticks;
	int error;

	ticks = xen_timepush.ticks;
	node = *rnode;
	node.sysctl_data = &ticks;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	if (ticks < 0)
		return EINVAL;

	if (ticks != xen_timepush.ticks) {
		xen_timepush.ticks = ticks;

		if (ticks == 0)
			callout_stop(&xen_timepush.ch);
		else
			callout_schedule(&xen_timepush.ch, ticks);
	}

	return 0;
}
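
/*
 * For example, a dom0 administrator can disable the periodic push with
 *
 *	sysctl -w machdep.xen.timepush_ticks=0
 *
 * or re-enable it by writing back a positive tick count.  The default
 * chosen in xen_timepush_init is 53*hz + 3 ticks, a little under a
 * minute at hz = 100, deliberately not a round number of seconds.
 */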

#endif /* DOM0OPS */

static int xen_rtc_get(struct todr_chip_handle *, struct timeval *);
static int xen_rtc_set(struct todr_chip_handle *, struct timeval *);
static void xen_wallclock_time(struct timespec *);
/*
 * xen time of day register:
 *
 *	Xen wall clock time, plus a Xen vCPU system time adjustment.
 */
static struct todr_chip_handle xen_todr_chip = {
	.todr_gettime = xen_rtc_get,
	.todr_settime = xen_rtc_set,
};

/*
 * xen_startrtclock()
 *
 *	Initialize the real-time clock from x86 machdep autoconf.
 */
void
xen_startrtclock(void)
{

	todr_attach(&xen_todr_chip);
}

/*
 * xen_rtc_get(todr, tv)
 *
 *	Get the current real-time clock from the Xen wall clock time
 *	and vCPU system time adjustment.
 */
static int
xen_rtc_get(struct todr_chip_handle *todr, struct timeval *tvp)
{
	struct timespec ts;

	xen_wallclock_time(&ts);
	TIMESPEC_TO_TIMEVAL(tvp, &ts);

	return 0;
}

/*
 * xen_rtc_set(todr, tv)
 *
 *	Set the Xen wall clock time, if we can.
 */
static int
xen_rtc_set(struct todr_chip_handle *todr, struct timeval *tvp)
{
#ifdef DOM0OPS
	struct clock_ymdhms dt;
	xen_platform_op_t op;
	uint64_t systime_ns;

	if (xendomain_is_privileged()) {
		/* Convert to ymdhms and set the x86 ISA RTC.  */
		clock_secs_to_ymdhms(tvp->tv_sec, &dt);
		rtc_set_ymdhms(NULL, &dt);

		/* Get the global system time so we can preserve it.  */
		systime_ns = xen_global_systime_ns();

		/* Set the hypervisor wall clock time.  */
		memset(&op, 0, sizeof(op));
		op.cmd = XENPF_settime;
		op.u.settime.secs = tvp->tv_sec;
		op.u.settime.nsecs = tvp->tv_usec * 1000;
		op.u.settime.system_time = systime_ns;
		return HYPERVISOR_platform_op(&op);
	}
#endif

	/* XXX Should this fail if not on privileged dom0?  */
	return 0;
}

/*
 * xen_wallclock_time(tsp)
 *
 *	Return a snapshot of the current low-resolution wall clock
 *	time, as reported by the hypervisor, in tsp.
 */
static void
xen_wallclock_time(struct timespec *tsp)
{
	struct xen_wallclock_ticket ticket;
	uint64_t systime_ns;

	int s = splsched(); /* make sure we won't be interrupted */
	/* Read the last wall clock sample from the hypervisor.  */
	do {
		xen_wallclock_enter(&ticket);
		tsp->tv_sec = HYPERVISOR_shared_info->wc_sec;
		tsp->tv_nsec = HYPERVISOR_shared_info->wc_nsec;
	} while (!xen_wallclock_exit(&ticket));

	/* Get the global system time.  */
	systime_ns = xen_global_systime_ns();
	splx(s);

	/* Add the system time to the wall clock time.  */
	systime_ns += tsp->tv_nsec;
	tsp->tv_sec += systime_ns / 1000000000ull;
	tsp->tv_nsec = systime_ns % 1000000000ull;
}
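
/*
 * Illustrative arithmetic for the combination above: if the hypervisor
 * reports wc_sec = 1000000000 and wc_nsec = 900000000, and the global
 * system time is 2200000000 ns, then systime_ns becomes 3100000000 ns,
 * yielding tv_sec = 1000000003 and tv_nsec = 100000000.
 */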

#ifdef XENPV
/*
 * setstatclockrate(rate)
 *
 *	Set the statclock to run at rate, in units of ticks per second.
 *
 *	Currently Xen does not have a separate statclock, so this is a
 *	noop; instead the statclock runs in hardclock.
 */
void
setstatclockrate(int rate)
{
}
#endif /* XENPV */