1 /* $OpenBSD: kern_tc.c,v 1.38 2019/03/09 23:04:56 cheloha Exp $ */ 2 3 /* 4 * Copyright (c) 2000 Poul-Henning Kamp <phk@FreeBSD.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 /* 20 * If we meet some day, and you think this stuff is worth it, you 21 * can buy me a beer in return. Poul-Henning Kamp 22 */ 23 24 #include <sys/param.h> 25 #include <sys/atomic.h> 26 #include <sys/kernel.h> 27 #include <sys/mutex.h> 28 #include <sys/timeout.h> 29 #include <sys/sysctl.h> 30 #include <sys/syslog.h> 31 #include <sys/systm.h> 32 #include <sys/timetc.h> 33 #include <sys/malloc.h> 34 #include <dev/rndvar.h> 35 36 /* 37 * A large step happens on boot. This constant detects such steps. 38 * It is relatively small so that ntp_update_second gets called enough 39 * in the typical 'missed a couple of seconds' case, but doesn't loop 40 * forever when the time step is large. 41 */ 42 #define LARGE_STEP 200 43 44 u_int dummy_get_timecount(struct timecounter *); 45 46 void ntp_update_second(int64_t *); 47 int sysctl_tc_hardware(void *, size_t *, void *, size_t); 48 int sysctl_tc_choice(void *, size_t *, void *, size_t); 49 50 /* 51 * Implement a dummy timecounter which we can use until we get a real one 52 * in the air. This allows the console and other early stuff to use 53 * time services. 54 */ 55 56 u_int 57 dummy_get_timecount(struct timecounter *tc) 58 { 59 static u_int now; 60 61 return (++now); 62 } 63 64 static struct timecounter dummy_timecounter = { 65 dummy_get_timecount, 0, ~0u, 1000000, "dummy", -1000000 66 }; 67 68 struct timehands { 69 /* These fields must be initialized by the driver. */ 70 struct timecounter *th_counter; 71 int64_t th_adjustment; 72 u_int64_t th_scale; 73 u_int th_offset_count; 74 struct bintime th_boottime; 75 struct bintime th_offset; 76 struct timeval th_microtime; 77 struct timespec th_nanotime; 78 /* Fields not to be copied in tc_windup start with th_generation. */ 79 volatile u_int th_generation; 80 struct timehands *th_next; 81 }; 82 83 static struct timehands th0; 84 static struct timehands th9 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th0}; 85 static struct timehands th8 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th9}; 86 static struct timehands th7 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th8}; 87 static struct timehands th6 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th7}; 88 static struct timehands th5 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th6}; 89 static struct timehands th4 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th5}; 90 static struct timehands th3 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th4}; 91 static struct timehands th2 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th3}; 92 static struct timehands th1 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th2}; 93 static struct timehands th0 = { 94 &dummy_timecounter, 95 0, 96 (uint64_t)-1 / 1000000, 97 0, 98 {0, 0}, 99 {1, 0}, 100 {0, 0}, 101 {0, 0}, 102 1, 103 &th1 104 }; 105 106 /* 107 * Protects writes to anything accessed during tc_windup(). 108 * tc_windup() must be called before leaving this mutex. 109 */ 110 struct mutex timecounter_mtx = MUTEX_INITIALIZER(IPL_CLOCK); 111 112 static struct timehands *volatile timehands = &th0; 113 struct timecounter *timecounter = &dummy_timecounter; 114 static struct timecounter *timecounters = &dummy_timecounter; 115 116 volatile time_t time_second = 1; 117 volatile time_t time_uptime = 0; 118 119 struct bintime naptime; 120 static int timestepwarnings; 121 122 void tc_windup(void); 123 124 /* 125 * Return the difference between the timehands' counter value now and what 126 * was when we copied it to the timehands' offset_count. 127 */ 128 static __inline u_int 129 tc_delta(struct timehands *th) 130 { 131 struct timecounter *tc; 132 133 tc = th->th_counter; 134 return ((tc->tc_get_timecount(tc) - th->th_offset_count) & 135 tc->tc_counter_mask); 136 } 137 138 /* 139 * Functions for reading the time. We have to loop until we are sure that 140 * the timehands that we operated on was not updated under our feet. See 141 * the comment in <sys/time.h> for a description of these functions. 142 */ 143 144 void 145 binboottime(struct bintime *bt) 146 { 147 struct timehands *th; 148 u_int gen; 149 150 do { 151 th = timehands; 152 gen = th->th_generation; 153 membar_consumer(); 154 *bt = th->th_boottime; 155 membar_consumer(); 156 } while (gen == 0 || gen != th->th_generation); 157 } 158 159 void 160 microboottime(struct timeval *tvp) 161 { 162 struct bintime bt; 163 164 binboottime(&bt); 165 bintime2timeval(&bt, tvp); 166 } 167 168 void 169 binuptime(struct bintime *bt) 170 { 171 struct timehands *th; 172 u_int gen; 173 174 do { 175 th = timehands; 176 gen = th->th_generation; 177 membar_consumer(); 178 *bt = th->th_offset; 179 bintime_addx(bt, th->th_scale * tc_delta(th)); 180 membar_consumer(); 181 } while (gen == 0 || gen != th->th_generation); 182 } 183 184 void 185 nanouptime(struct timespec *tsp) 186 { 187 struct bintime bt; 188 189 binuptime(&bt); 190 bintime2timespec(&bt, tsp); 191 } 192 193 void 194 microuptime(struct timeval *tvp) 195 { 196 struct bintime bt; 197 198 binuptime(&bt); 199 bintime2timeval(&bt, tvp); 200 } 201 202 void 203 bintime(struct bintime *bt) 204 { 205 struct timehands *th; 206 u_int gen; 207 208 do { 209 th = timehands; 210 gen = th->th_generation; 211 membar_consumer(); 212 *bt = th->th_offset; 213 bintime_addx(bt, th->th_scale * tc_delta(th)); 214 bintime_add(bt, &th->th_boottime); 215 membar_consumer(); 216 } while (gen == 0 || gen != th->th_generation); 217 } 218 219 void 220 nanotime(struct timespec *tsp) 221 { 222 struct bintime bt; 223 224 bintime(&bt); 225 bintime2timespec(&bt, tsp); 226 } 227 228 void 229 microtime(struct timeval *tvp) 230 { 231 struct bintime bt; 232 233 bintime(&bt); 234 bintime2timeval(&bt, tvp); 235 } 236 237 void 238 getnanouptime(struct timespec *tsp) 239 { 240 struct timehands *th; 241 u_int gen; 242 243 do { 244 th = timehands; 245 gen = th->th_generation; 246 membar_consumer(); 247 bintime2timespec(&th->th_offset, tsp); 248 membar_consumer(); 249 } while (gen == 0 || gen != th->th_generation); 250 } 251 252 void 253 getmicrouptime(struct timeval *tvp) 254 { 255 struct timehands *th; 256 u_int gen; 257 258 do { 259 th = timehands; 260 gen = th->th_generation; 261 membar_consumer(); 262 bintime2timeval(&th->th_offset, tvp); 263 membar_consumer(); 264 } while (gen == 0 || gen != th->th_generation); 265 } 266 267 void 268 getnanotime(struct timespec *tsp) 269 { 270 struct timehands *th; 271 u_int gen; 272 273 do { 274 th = timehands; 275 gen = th->th_generation; 276 membar_consumer(); 277 *tsp = th->th_nanotime; 278 membar_consumer(); 279 } while (gen == 0 || gen != th->th_generation); 280 } 281 282 void 283 getmicrotime(struct timeval *tvp) 284 { 285 struct timehands *th; 286 u_int gen; 287 288 do { 289 th = timehands; 290 gen = th->th_generation; 291 membar_consumer(); 292 *tvp = th->th_microtime; 293 membar_consumer(); 294 } while (gen == 0 || gen != th->th_generation); 295 } 296 297 /* 298 * Initialize a new timecounter and possibly use it. 299 */ 300 void 301 tc_init(struct timecounter *tc) 302 { 303 u_int u; 304 305 u = tc->tc_frequency / tc->tc_counter_mask; 306 /* XXX: We need some margin here, 10% is a guess */ 307 u *= 11; 308 u /= 10; 309 if (tc->tc_quality >= 0) { 310 if (u > hz) { 311 tc->tc_quality = -2000; 312 printf("Timecounter \"%s\" frequency %lu Hz", 313 tc->tc_name, (unsigned long)tc->tc_frequency); 314 printf(" -- Insufficient hz, needs at least %u\n", u); 315 } 316 } 317 318 tc->tc_next = timecounters; 319 timecounters = tc; 320 /* 321 * Never automatically use a timecounter with negative quality. 322 * Even though we run on the dummy counter, switching here may be 323 * worse since this timecounter may not be monotonic. 324 */ 325 if (tc->tc_quality < 0) 326 return; 327 if (tc->tc_quality < timecounter->tc_quality) 328 return; 329 if (tc->tc_quality == timecounter->tc_quality && 330 tc->tc_frequency < timecounter->tc_frequency) 331 return; 332 (void)tc->tc_get_timecount(tc); 333 enqueue_randomness(tc->tc_get_timecount(tc)); 334 335 timecounter = tc; 336 } 337 338 /* Report the frequency of the current timecounter. */ 339 u_int64_t 340 tc_getfrequency(void) 341 { 342 343 return (timehands->th_counter->tc_frequency); 344 } 345 346 /* 347 * Step our concept of UTC, aka the realtime clock. 348 * This is done by modifying our estimate of when we booted. 349 */ 350 void 351 tc_setrealtimeclock(const struct timespec *ts) 352 { 353 struct timespec ts2; 354 struct bintime bt, bt2; 355 356 mtx_enter(&timecounter_mtx); 357 binuptime(&bt2); 358 timespec2bintime(ts, &bt); 359 bintime_sub(&bt, &bt2); 360 bintime_add(&bt2, &timehands->th_boottime); 361 timehands->th_boottime = bt; 362 363 /* XXX fiddle all the little crinkly bits around the fiords... */ 364 tc_windup(); 365 mtx_leave(&timecounter_mtx); 366 367 enqueue_randomness(ts->tv_sec); 368 369 if (timestepwarnings) { 370 bintime2timespec(&bt2, &ts2); 371 log(LOG_INFO, "Time stepped from %lld.%09ld to %lld.%09ld\n", 372 (long long)ts2.tv_sec, ts2.tv_nsec, 373 (long long)ts->tv_sec, ts->tv_nsec); 374 } 375 } 376 377 /* 378 * Step the monotonic and realtime clocks, triggering any timeouts that 379 * should have occurred across the interval. 380 */ 381 void 382 tc_setclock(const struct timespec *ts) 383 { 384 struct bintime bt, bt2; 385 struct timespec earlier; 386 static int first = 1; 387 #ifndef SMALL_KERNEL 388 long long adj_ticks; 389 #endif 390 391 /* 392 * When we're called for the first time, during boot when 393 * the root partition is mounted, we need to set boottime. 394 */ 395 if (first) { 396 tc_setrealtimeclock(ts); 397 first = 0; 398 return; 399 } 400 401 enqueue_randomness(ts->tv_sec); 402 403 mtx_enter(&timecounter_mtx); 404 timespec2bintime(ts, &bt); 405 bintime_sub(&bt, &timehands->th_boottime); 406 407 /* 408 * Don't rewind the offset. 409 */ 410 if (bt.sec < timehands->th_offset.sec || 411 (bt.sec == timehands->th_offset.sec && 412 bt.frac < timehands->th_offset.frac)) { 413 mtx_leave(&timecounter_mtx); 414 bintime2timespec(&bt, &earlier); 415 printf("%s: cannot rewind uptime to %lld.%09ld\n", 416 __func__, (long long)earlier.tv_sec, earlier.tv_nsec); 417 return; 418 } 419 420 bt2 = timehands->th_offset; 421 timehands->th_offset = bt; 422 423 /* XXX fiddle all the little crinkly bits around the fiords... */ 424 tc_windup(); 425 mtx_leave(&timecounter_mtx); 426 427 #ifndef SMALL_KERNEL 428 /* convert the bintime to ticks */ 429 bintime_sub(&bt, &bt2); 430 bintime_add(&naptime, &bt); 431 adj_ticks = (uint64_t)hz * bt.sec + 432 (((uint64_t)1000000 * (uint32_t)(bt.frac >> 32)) >> 32) / tick; 433 if (adj_ticks > 0) { 434 if (adj_ticks > INT_MAX) 435 adj_ticks = INT_MAX; 436 timeout_adjust_ticks(adj_ticks); 437 } 438 #endif 439 } 440 441 /* 442 * Initialize the next struct timehands in the ring and make 443 * it the active timehands. Along the way we might switch to a different 444 * timecounter and/or do seconds processing in NTP. Slightly magic. 445 */ 446 void 447 tc_windup(void) 448 { 449 struct bintime bt; 450 struct timecounter *active_tc; 451 struct timehands *th, *tho; 452 u_int64_t scale; 453 u_int delta, ncount, ogen; 454 int i; 455 456 MUTEX_ASSERT_LOCKED(&timecounter_mtx); 457 458 active_tc = timecounter; 459 460 /* 461 * Make the next timehands a copy of the current one, but do not 462 * overwrite the generation or next pointer. While we update 463 * the contents, the generation must be zero. 464 */ 465 tho = timehands; 466 th = tho->th_next; 467 ogen = th->th_generation; 468 th->th_generation = 0; 469 membar_producer(); 470 memcpy(th, tho, offsetof(struct timehands, th_generation)); 471 472 /* 473 * Capture a timecounter delta on the current timecounter and if 474 * changing timecounters, a counter value from the new timecounter. 475 * Update the offset fields accordingly. 476 */ 477 delta = tc_delta(th); 478 if (th->th_counter != active_tc) 479 ncount = active_tc->tc_get_timecount(active_tc); 480 else 481 ncount = 0; 482 th->th_offset_count += delta; 483 th->th_offset_count &= th->th_counter->tc_counter_mask; 484 bintime_addx(&th->th_offset, th->th_scale * delta); 485 486 #ifdef notyet 487 /* 488 * Hardware latching timecounters may not generate interrupts on 489 * PPS events, so instead we poll them. There is a finite risk that 490 * the hardware might capture a count which is later than the one we 491 * got above, and therefore possibly in the next NTP second which might 492 * have a different rate than the current NTP second. It doesn't 493 * matter in practice. 494 */ 495 if (tho->th_counter->tc_poll_pps) 496 tho->th_counter->tc_poll_pps(tho->th_counter); 497 #endif 498 499 /* 500 * Deal with NTP second processing. The for loop normally 501 * iterates at most once, but in extreme situations it might 502 * keep NTP sane if timeouts are not run for several seconds. 503 * At boot, the time step can be large when the TOD hardware 504 * has been read, so on really large steps, we call 505 * ntp_update_second only twice. We need to call it twice in 506 * case we missed a leap second. 507 */ 508 bt = th->th_offset; 509 bintime_add(&bt, &th->th_boottime); 510 i = bt.sec - tho->th_microtime.tv_sec; 511 if (i > LARGE_STEP) 512 i = 2; 513 for (; i > 0; i--) 514 ntp_update_second(&th->th_adjustment); 515 516 /* Update the UTC timestamps used by the get*() functions. */ 517 /* XXX shouldn't do this here. Should force non-`get' versions. */ 518 bintime2timeval(&bt, &th->th_microtime); 519 bintime2timespec(&bt, &th->th_nanotime); 520 521 /* Now is a good time to change timecounters. */ 522 if (th->th_counter != active_tc) { 523 th->th_counter = active_tc; 524 th->th_offset_count = ncount; 525 } 526 527 /*- 528 * Recalculate the scaling factor. We want the number of 1/2^64 529 * fractions of a second per period of the hardware counter, taking 530 * into account the th_adjustment factor which the NTP PLL/adjtime(2) 531 * processing provides us with. 532 * 533 * The th_adjustment is nanoseconds per second with 32 bit binary 534 * fraction and we want 64 bit binary fraction of second: 535 * 536 * x = a * 2^32 / 10^9 = a * 4.294967296 537 * 538 * The range of th_adjustment is +/- 5000PPM so inside a 64bit int 539 * we can only multiply by about 850 without overflowing, but that 540 * leaves suitably precise fractions for multiply before divide. 541 * 542 * Divide before multiply with a fraction of 2199/512 results in a 543 * systematic undercompensation of 10PPM of th_adjustment. On a 544 * 5000PPM adjustment this is a 0.05PPM error. This is acceptable. 545 * 546 * We happily sacrifice the lowest of the 64 bits of our result 547 * to the goddess of code clarity. 548 * 549 */ 550 scale = (u_int64_t)1 << 63; 551 scale += (th->th_adjustment / 1024) * 2199; 552 scale /= th->th_counter->tc_frequency; 553 th->th_scale = scale * 2; 554 555 /* 556 * Now that the struct timehands is again consistent, set the new 557 * generation number, making sure to not make it zero. 558 */ 559 if (++ogen == 0) 560 ogen = 1; 561 membar_producer(); 562 th->th_generation = ogen; 563 564 /* Go live with the new struct timehands. */ 565 time_second = th->th_microtime.tv_sec; 566 time_uptime = th->th_offset.sec; 567 membar_producer(); 568 timehands = th; 569 } 570 571 /* Report or change the active timecounter hardware. */ 572 int 573 sysctl_tc_hardware(void *oldp, size_t *oldlenp, void *newp, size_t newlen) 574 { 575 char newname[32]; 576 struct timecounter *newtc, *tc; 577 int error; 578 579 tc = timecounter; 580 strlcpy(newname, tc->tc_name, sizeof(newname)); 581 582 error = sysctl_string(oldp, oldlenp, newp, newlen, newname, sizeof(newname)); 583 if (error != 0 || strcmp(newname, tc->tc_name) == 0) 584 return (error); 585 for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) { 586 if (strcmp(newname, newtc->tc_name) != 0) 587 continue; 588 589 /* Warm up new timecounter. */ 590 (void)newtc->tc_get_timecount(newtc); 591 (void)newtc->tc_get_timecount(newtc); 592 593 timecounter = newtc; 594 return (0); 595 } 596 return (EINVAL); 597 } 598 599 /* Report or change the active timecounter hardware. */ 600 int 601 sysctl_tc_choice(void *oldp, size_t *oldlenp, void *newp, size_t newlen) 602 { 603 char buf[32], *spc, *choices; 604 struct timecounter *tc; 605 int error, maxlen; 606 607 spc = ""; 608 maxlen = 0; 609 for (tc = timecounters; tc != NULL; tc = tc->tc_next) 610 maxlen += sizeof(buf); 611 choices = malloc(maxlen, M_TEMP, M_WAITOK); 612 *choices = '\0'; 613 for (tc = timecounters; tc != NULL; tc = tc->tc_next) { 614 snprintf(buf, sizeof(buf), "%s%s(%d)", 615 spc, tc->tc_name, tc->tc_quality); 616 spc = " "; 617 strlcat(choices, buf, maxlen); 618 } 619 error = sysctl_rdstring(oldp, oldlenp, newp, choices); 620 free(choices, M_TEMP, maxlen); 621 return (error); 622 } 623 624 /* 625 * Timecounters need to be updated every so often to prevent the hardware 626 * counter from overflowing. Updating also recalculates the cached values 627 * used by the get*() family of functions, so their precision depends on 628 * the update frequency. 629 */ 630 static int tc_tick; 631 632 void 633 tc_ticktock(void) 634 { 635 static int count; 636 637 if (++count < tc_tick) 638 return; 639 if (!mtx_enter_try(&timecounter_mtx)) 640 return; 641 count = 0; 642 tc_windup(); 643 mtx_leave(&timecounter_mtx); 644 } 645 646 void 647 inittimecounter(void) 648 { 649 #ifdef DEBUG 650 u_int p; 651 #endif 652 653 /* 654 * Set the initial timeout to 655 * max(1, <approx. number of hardclock ticks in a millisecond>). 656 * People should probably not use the sysctl to set the timeout 657 * to smaller than its initial value, since that value is the 658 * smallest reasonable one. If they want better timestamps they 659 * should use the non-"get"* functions. 660 */ 661 if (hz > 1000) 662 tc_tick = (hz + 500) / 1000; 663 else 664 tc_tick = 1; 665 #ifdef DEBUG 666 p = (tc_tick * 1000000) / hz; 667 printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000); 668 #endif 669 670 /* warm up new timecounter (again) and get rolling. */ 671 (void)timecounter->tc_get_timecount(timecounter); 672 (void)timecounter->tc_get_timecount(timecounter); 673 } 674 675 /* 676 * Return timecounter-related information. 677 */ 678 int 679 sysctl_tc(int *name, u_int namelen, void *oldp, size_t *oldlenp, 680 void *newp, size_t newlen) 681 { 682 if (namelen != 1) 683 return (ENOTDIR); 684 685 switch (name[0]) { 686 case KERN_TIMECOUNTER_TICK: 687 return (sysctl_rdint(oldp, oldlenp, newp, tc_tick)); 688 case KERN_TIMECOUNTER_TIMESTEPWARNINGS: 689 return (sysctl_int(oldp, oldlenp, newp, newlen, 690 ×tepwarnings)); 691 case KERN_TIMECOUNTER_HARDWARE: 692 return (sysctl_tc_hardware(oldp, oldlenp, newp, newlen)); 693 case KERN_TIMECOUNTER_CHOICE: 694 return (sysctl_tc_choice(oldp, oldlenp, newp, newlen)); 695 default: 696 return (EOPNOTSUPP); 697 } 698 /* NOTREACHED */ 699 } 700 701 void 702 ntp_update_second(int64_t *adjust) 703 { 704 int64_t adj; 705 706 /* Skew time according to any adjtime(2) adjustments. */ 707 if (adjtimedelta > 0) 708 adj = MIN(5000, adjtimedelta); 709 else 710 adj = MAX(-5000, adjtimedelta); 711 adjtimedelta -= adj; 712 *adjust = (adj * 1000) << 32; 713 *adjust += timecounter->tc_freq_adj; 714 } 715 716 int 717 tc_adjfreq(int64_t *old, int64_t *new) 718 { 719 if (old != NULL) { 720 *old = timecounter->tc_freq_adj; 721 } 722 if (new != NULL) { 723 timecounter->tc_freq_adj = *new; 724 } 725 return 0; 726 } 727