1 /* $OpenBSD: kern_tc.c,v 1.36 2019/01/20 01:13:03 cheloha Exp $ */ 2 3 /* 4 * Copyright (c) 2000 Poul-Henning Kamp <phk@FreeBSD.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 /* 20 * If we meet some day, and you think this stuff is worth it, you 21 * can buy me a beer in return. Poul-Henning Kamp 22 */ 23 24 #include <sys/param.h> 25 #include <sys/atomic.h> 26 #include <sys/kernel.h> 27 #include <sys/mutex.h> 28 #include <sys/timeout.h> 29 #include <sys/sysctl.h> 30 #include <sys/syslog.h> 31 #include <sys/systm.h> 32 #include <sys/timetc.h> 33 #include <sys/malloc.h> 34 #include <dev/rndvar.h> 35 36 /* 37 * A large step happens on boot. This constant detects such steps. 38 * It is relatively small so that ntp_update_second gets called enough 39 * in the typical 'missed a couple of seconds' case, but doesn't loop 40 * forever when the time step is large. 41 */ 42 #define LARGE_STEP 200 43 44 u_int dummy_get_timecount(struct timecounter *); 45 46 void ntp_update_second(int64_t *); 47 int sysctl_tc_hardware(void *, size_t *, void *, size_t); 48 int sysctl_tc_choice(void *, size_t *, void *, size_t); 49 50 /* 51 * Implement a dummy timecounter which we can use until we get a real one 52 * in the air. This allows the console and other early stuff to use 53 * time services. 54 */ 55 56 u_int 57 dummy_get_timecount(struct timecounter *tc) 58 { 59 static u_int now; 60 61 return (++now); 62 } 63 64 static struct timecounter dummy_timecounter = { 65 dummy_get_timecount, 0, ~0u, 1000000, "dummy", -1000000 66 }; 67 68 struct timehands { 69 /* These fields must be initialized by the driver. */ 70 struct timecounter *th_counter; 71 int64_t th_adjustment; 72 u_int64_t th_scale; 73 u_int th_offset_count; 74 struct bintime th_boottime; 75 struct bintime th_offset; 76 struct timeval th_microtime; 77 struct timespec th_nanotime; 78 /* Fields not to be copied in tc_windup start with th_generation. */ 79 volatile u_int th_generation; 80 struct timehands *th_next; 81 }; 82 83 static struct timehands th0; 84 static struct timehands th9 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th0}; 85 static struct timehands th8 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th9}; 86 static struct timehands th7 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th8}; 87 static struct timehands th6 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th7}; 88 static struct timehands th5 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th6}; 89 static struct timehands th4 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th5}; 90 static struct timehands th3 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th4}; 91 static struct timehands th2 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th3}; 92 static struct timehands th1 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 0, &th2}; 93 static struct timehands th0 = { 94 &dummy_timecounter, 95 0, 96 (uint64_t)-1 / 1000000, 97 0, 98 {0, 0}, 99 {1, 0}, 100 {0, 0}, 101 {0, 0}, 102 1, 103 &th1 104 }; 105 106 /* 107 * Protects writes to anything accessed during tc_windup(). 108 * tc_windup() must be called before leaving this mutex. 109 */ 110 struct mutex timecounter_mtx = MUTEX_INITIALIZER(IPL_CLOCK); 111 112 static struct timehands *volatile timehands = &th0; 113 struct timecounter *timecounter = &dummy_timecounter; 114 static struct timecounter *timecounters = &dummy_timecounter; 115 116 volatile time_t time_second = 1; 117 volatile time_t time_uptime = 0; 118 119 struct bintime naptime; 120 static int timestepwarnings; 121 122 void tc_windup(void); 123 124 /* 125 * Return the difference between the timehands' counter value now and what 126 * was when we copied it to the timehands' offset_count. 127 */ 128 static __inline u_int 129 tc_delta(struct timehands *th) 130 { 131 struct timecounter *tc; 132 133 tc = th->th_counter; 134 return ((tc->tc_get_timecount(tc) - th->th_offset_count) & 135 tc->tc_counter_mask); 136 } 137 138 /* 139 * Functions for reading the time. We have to loop until we are sure that 140 * the timehands that we operated on was not updated under our feet. See 141 * the comment in <sys/time.h> for a description of these functions. 142 */ 143 144 void 145 binboottime(struct bintime *bt) 146 { 147 struct timehands *th; 148 u_int gen; 149 150 do { 151 th = timehands; 152 gen = th->th_generation; 153 membar_consumer(); 154 *bt = th->th_boottime; 155 membar_consumer(); 156 } while (gen == 0 || gen != th->th_generation); 157 } 158 159 void 160 microboottime(struct timeval *tvp) 161 { 162 struct bintime bt; 163 164 binboottime(&bt); 165 bintime2timeval(&bt, tvp); 166 } 167 168 void 169 binuptime(struct bintime *bt) 170 { 171 struct timehands *th; 172 u_int gen; 173 174 do { 175 th = timehands; 176 gen = th->th_generation; 177 membar_consumer(); 178 *bt = th->th_offset; 179 bintime_addx(bt, th->th_scale * tc_delta(th)); 180 membar_consumer(); 181 } while (gen == 0 || gen != th->th_generation); 182 } 183 184 void 185 nanouptime(struct timespec *tsp) 186 { 187 struct bintime bt; 188 189 binuptime(&bt); 190 bintime2timespec(&bt, tsp); 191 } 192 193 void 194 microuptime(struct timeval *tvp) 195 { 196 struct bintime bt; 197 198 binuptime(&bt); 199 bintime2timeval(&bt, tvp); 200 } 201 202 void 203 bintime(struct bintime *bt) 204 { 205 struct timehands *th; 206 u_int gen; 207 208 do { 209 th = timehands; 210 gen = th->th_generation; 211 membar_consumer(); 212 *bt = th->th_offset; 213 bintime_addx(bt, th->th_scale * tc_delta(th)); 214 bintime_add(bt, &th->th_boottime); 215 membar_consumer(); 216 } while (gen == 0 || gen != th->th_generation); 217 } 218 219 void 220 nanotime(struct timespec *tsp) 221 { 222 struct bintime bt; 223 224 bintime(&bt); 225 bintime2timespec(&bt, tsp); 226 } 227 228 void 229 microtime(struct timeval *tvp) 230 { 231 struct bintime bt; 232 233 bintime(&bt); 234 bintime2timeval(&bt, tvp); 235 } 236 237 void 238 getnanouptime(struct timespec *tsp) 239 { 240 struct timehands *th; 241 u_int gen; 242 243 do { 244 th = timehands; 245 gen = th->th_generation; 246 membar_consumer(); 247 bintime2timespec(&th->th_offset, tsp); 248 membar_consumer(); 249 } while (gen == 0 || gen != th->th_generation); 250 } 251 252 void 253 getmicrouptime(struct timeval *tvp) 254 { 255 struct timehands *th; 256 u_int gen; 257 258 do { 259 th = timehands; 260 gen = th->th_generation; 261 membar_consumer(); 262 bintime2timeval(&th->th_offset, tvp); 263 membar_consumer(); 264 } while (gen == 0 || gen != th->th_generation); 265 } 266 267 void 268 getnanotime(struct timespec *tsp) 269 { 270 struct timehands *th; 271 u_int gen; 272 273 do { 274 th = timehands; 275 gen = th->th_generation; 276 membar_consumer(); 277 *tsp = th->th_nanotime; 278 membar_consumer(); 279 } while (gen == 0 || gen != th->th_generation); 280 } 281 282 void 283 getmicrotime(struct timeval *tvp) 284 { 285 struct timehands *th; 286 u_int gen; 287 288 do { 289 th = timehands; 290 gen = th->th_generation; 291 membar_consumer(); 292 *tvp = th->th_microtime; 293 membar_consumer(); 294 } while (gen == 0 || gen != th->th_generation); 295 } 296 297 /* 298 * Initialize a new timecounter and possibly use it. 299 */ 300 void 301 tc_init(struct timecounter *tc) 302 { 303 u_int u; 304 305 u = tc->tc_frequency / tc->tc_counter_mask; 306 /* XXX: We need some margin here, 10% is a guess */ 307 u *= 11; 308 u /= 10; 309 if (tc->tc_quality >= 0) { 310 if (u > hz) { 311 tc->tc_quality = -2000; 312 printf("Timecounter \"%s\" frequency %lu Hz", 313 tc->tc_name, (unsigned long)tc->tc_frequency); 314 printf(" -- Insufficient hz, needs at least %u\n", u); 315 } 316 } 317 318 tc->tc_next = timecounters; 319 timecounters = tc; 320 /* 321 * Never automatically use a timecounter with negative quality. 322 * Even though we run on the dummy counter, switching here may be 323 * worse since this timecounter may not be monotonic. 324 */ 325 if (tc->tc_quality < 0) 326 return; 327 if (tc->tc_quality < timecounter->tc_quality) 328 return; 329 if (tc->tc_quality == timecounter->tc_quality && 330 tc->tc_frequency < timecounter->tc_frequency) 331 return; 332 (void)tc->tc_get_timecount(tc); 333 enqueue_randomness(tc->tc_get_timecount(tc)); 334 335 timecounter = tc; 336 } 337 338 /* Report the frequency of the current timecounter. */ 339 u_int64_t 340 tc_getfrequency(void) 341 { 342 343 return (timehands->th_counter->tc_frequency); 344 } 345 346 /* 347 * Step our concept of UTC, aka the realtime clock. 348 * This is done by modifying our estimate of when we booted. 349 */ 350 void 351 tc_setrealtimeclock(const struct timespec *ts) 352 { 353 struct timespec ts2; 354 struct bintime bt, bt2; 355 356 mtx_enter(&timecounter_mtx); 357 binuptime(&bt2); 358 timespec2bintime(ts, &bt); 359 bintime_sub(&bt, &bt2); 360 bintime_add(&bt2, &timehands->th_boottime); 361 timehands->th_boottime = bt; 362 363 /* XXX fiddle all the little crinkly bits around the fiords... */ 364 tc_windup(); 365 mtx_leave(&timecounter_mtx); 366 367 enqueue_randomness(ts->tv_sec); 368 369 if (timestepwarnings) { 370 bintime2timespec(&bt2, &ts2); 371 log(LOG_INFO, "Time stepped from %lld.%09ld to %lld.%09ld\n", 372 (long long)ts2.tv_sec, ts2.tv_nsec, 373 (long long)ts->tv_sec, ts->tv_nsec); 374 } 375 } 376 377 /* 378 * Step the monotonic and realtime clocks, triggering any timeouts that 379 * should have occurred across the interval. 380 */ 381 void 382 tc_setclock(const struct timespec *ts) 383 { 384 struct bintime bt, bt2; 385 static int first = 1; 386 #ifndef SMALL_KERNEL 387 long long adj_ticks; 388 #endif 389 390 /* 391 * When we're called for the first time, during boot when 392 * the root partition is mounted, we need to set boottime. 393 */ 394 if (first) { 395 tc_setrealtimeclock(ts); 396 first = 0; 397 return; 398 } 399 400 enqueue_randomness(ts->tv_sec); 401 402 mtx_enter(&timecounter_mtx); 403 timespec2bintime(ts, &bt); 404 bintime_sub(&bt, &timehands->th_boottime); 405 bt2 = timehands->th_offset; 406 timehands->th_offset = bt; 407 408 /* XXX fiddle all the little crinkly bits around the fiords... */ 409 tc_windup(); 410 mtx_leave(&timecounter_mtx); 411 412 #ifndef SMALL_KERNEL 413 /* convert the bintime to ticks */ 414 bintime_sub(&bt, &bt2); 415 bintime_add(&naptime, &bt); 416 adj_ticks = (uint64_t)hz * bt.sec + 417 (((uint64_t)1000000 * (uint32_t)(bt.frac >> 32)) >> 32) / tick; 418 if (adj_ticks > 0) { 419 if (adj_ticks > INT_MAX) 420 adj_ticks = INT_MAX; 421 timeout_adjust_ticks(adj_ticks); 422 } 423 #endif 424 } 425 426 /* 427 * Initialize the next struct timehands in the ring and make 428 * it the active timehands. Along the way we might switch to a different 429 * timecounter and/or do seconds processing in NTP. Slightly magic. 430 */ 431 void 432 tc_windup(void) 433 { 434 struct bintime bt; 435 struct timehands *th, *tho; 436 u_int64_t scale; 437 u_int delta, ncount, ogen; 438 int i; 439 440 MUTEX_ASSERT_LOCKED(&timecounter_mtx); 441 442 /* 443 * Make the next timehands a copy of the current one, but do not 444 * overwrite the generation or next pointer. While we update 445 * the contents, the generation must be zero. 446 */ 447 tho = timehands; 448 th = tho->th_next; 449 ogen = th->th_generation; 450 th->th_generation = 0; 451 membar_producer(); 452 memcpy(th, tho, offsetof(struct timehands, th_generation)); 453 454 /* 455 * Capture a timecounter delta on the current timecounter and if 456 * changing timecounters, a counter value from the new timecounter. 457 * Update the offset fields accordingly. 458 */ 459 delta = tc_delta(th); 460 if (th->th_counter != timecounter) 461 ncount = timecounter->tc_get_timecount(timecounter); 462 else 463 ncount = 0; 464 th->th_offset_count += delta; 465 th->th_offset_count &= th->th_counter->tc_counter_mask; 466 bintime_addx(&th->th_offset, th->th_scale * delta); 467 468 #ifdef notyet 469 /* 470 * Hardware latching timecounters may not generate interrupts on 471 * PPS events, so instead we poll them. There is a finite risk that 472 * the hardware might capture a count which is later than the one we 473 * got above, and therefore possibly in the next NTP second which might 474 * have a different rate than the current NTP second. It doesn't 475 * matter in practice. 476 */ 477 if (tho->th_counter->tc_poll_pps) 478 tho->th_counter->tc_poll_pps(tho->th_counter); 479 #endif 480 481 /* 482 * Deal with NTP second processing. The for loop normally 483 * iterates at most once, but in extreme situations it might 484 * keep NTP sane if timeouts are not run for several seconds. 485 * At boot, the time step can be large when the TOD hardware 486 * has been read, so on really large steps, we call 487 * ntp_update_second only twice. We need to call it twice in 488 * case we missed a leap second. 489 */ 490 bt = th->th_offset; 491 bintime_add(&bt, &th->th_boottime); 492 i = bt.sec - tho->th_microtime.tv_sec; 493 if (i > LARGE_STEP) 494 i = 2; 495 for (; i > 0; i--) 496 ntp_update_second(&th->th_adjustment); 497 498 /* Update the UTC timestamps used by the get*() functions. */ 499 /* XXX shouldn't do this here. Should force non-`get' versions. */ 500 bintime2timeval(&bt, &th->th_microtime); 501 bintime2timespec(&bt, &th->th_nanotime); 502 503 /* Now is a good time to change timecounters. */ 504 if (th->th_counter != timecounter) { 505 th->th_counter = timecounter; 506 th->th_offset_count = ncount; 507 } 508 509 /*- 510 * Recalculate the scaling factor. We want the number of 1/2^64 511 * fractions of a second per period of the hardware counter, taking 512 * into account the th_adjustment factor which the NTP PLL/adjtime(2) 513 * processing provides us with. 514 * 515 * The th_adjustment is nanoseconds per second with 32 bit binary 516 * fraction and we want 64 bit binary fraction of second: 517 * 518 * x = a * 2^32 / 10^9 = a * 4.294967296 519 * 520 * The range of th_adjustment is +/- 5000PPM so inside a 64bit int 521 * we can only multiply by about 850 without overflowing, but that 522 * leaves suitably precise fractions for multiply before divide. 523 * 524 * Divide before multiply with a fraction of 2199/512 results in a 525 * systematic undercompensation of 10PPM of th_adjustment. On a 526 * 5000PPM adjustment this is a 0.05PPM error. This is acceptable. 527 * 528 * We happily sacrifice the lowest of the 64 bits of our result 529 * to the goddess of code clarity. 530 * 531 */ 532 scale = (u_int64_t)1 << 63; 533 scale += (th->th_adjustment / 1024) * 2199; 534 scale /= th->th_counter->tc_frequency; 535 th->th_scale = scale * 2; 536 537 /* 538 * Now that the struct timehands is again consistent, set the new 539 * generation number, making sure to not make it zero. 540 */ 541 if (++ogen == 0) 542 ogen = 1; 543 membar_producer(); 544 th->th_generation = ogen; 545 546 /* Go live with the new struct timehands. */ 547 time_second = th->th_microtime.tv_sec; 548 time_uptime = th->th_offset.sec; 549 membar_producer(); 550 timehands = th; 551 } 552 553 /* Report or change the active timecounter hardware. */ 554 int 555 sysctl_tc_hardware(void *oldp, size_t *oldlenp, void *newp, size_t newlen) 556 { 557 char newname[32]; 558 struct timecounter *newtc, *tc; 559 int error; 560 561 tc = timecounter; 562 strlcpy(newname, tc->tc_name, sizeof(newname)); 563 564 error = sysctl_string(oldp, oldlenp, newp, newlen, newname, sizeof(newname)); 565 if (error != 0 || strcmp(newname, tc->tc_name) == 0) 566 return (error); 567 for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) { 568 if (strcmp(newname, newtc->tc_name) != 0) 569 continue; 570 571 /* Warm up new timecounter. */ 572 (void)newtc->tc_get_timecount(newtc); 573 (void)newtc->tc_get_timecount(newtc); 574 575 timecounter = newtc; 576 return (0); 577 } 578 return (EINVAL); 579 } 580 581 /* Report or change the active timecounter hardware. */ 582 int 583 sysctl_tc_choice(void *oldp, size_t *oldlenp, void *newp, size_t newlen) 584 { 585 char buf[32], *spc, *choices; 586 struct timecounter *tc; 587 int error, maxlen; 588 589 spc = ""; 590 maxlen = 0; 591 for (tc = timecounters; tc != NULL; tc = tc->tc_next) 592 maxlen += sizeof(buf); 593 choices = malloc(maxlen, M_TEMP, M_WAITOK); 594 *choices = '\0'; 595 for (tc = timecounters; tc != NULL; tc = tc->tc_next) { 596 snprintf(buf, sizeof(buf), "%s%s(%d)", 597 spc, tc->tc_name, tc->tc_quality); 598 spc = " "; 599 strlcat(choices, buf, maxlen); 600 } 601 error = sysctl_rdstring(oldp, oldlenp, newp, choices); 602 free(choices, M_TEMP, maxlen); 603 return (error); 604 } 605 606 /* 607 * Timecounters need to be updated every so often to prevent the hardware 608 * counter from overflowing. Updating also recalculates the cached values 609 * used by the get*() family of functions, so their precision depends on 610 * the update frequency. 611 */ 612 static int tc_tick; 613 614 void 615 tc_ticktock(void) 616 { 617 static int count; 618 619 if (++count < tc_tick) 620 return; 621 if (!mtx_enter_try(&timecounter_mtx)) 622 return; 623 count = 0; 624 tc_windup(); 625 mtx_leave(&timecounter_mtx); 626 } 627 628 void 629 inittimecounter(void) 630 { 631 #ifdef DEBUG 632 u_int p; 633 #endif 634 635 /* 636 * Set the initial timeout to 637 * max(1, <approx. number of hardclock ticks in a millisecond>). 638 * People should probably not use the sysctl to set the timeout 639 * to smaller than its initial value, since that value is the 640 * smallest reasonable one. If they want better timestamps they 641 * should use the non-"get"* functions. 642 */ 643 if (hz > 1000) 644 tc_tick = (hz + 500) / 1000; 645 else 646 tc_tick = 1; 647 #ifdef DEBUG 648 p = (tc_tick * 1000000) / hz; 649 printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000); 650 #endif 651 652 /* warm up new timecounter (again) and get rolling. */ 653 (void)timecounter->tc_get_timecount(timecounter); 654 (void)timecounter->tc_get_timecount(timecounter); 655 } 656 657 /* 658 * Return timecounter-related information. 659 */ 660 int 661 sysctl_tc(int *name, u_int namelen, void *oldp, size_t *oldlenp, 662 void *newp, size_t newlen) 663 { 664 if (namelen != 1) 665 return (ENOTDIR); 666 667 switch (name[0]) { 668 case KERN_TIMECOUNTER_TICK: 669 return (sysctl_rdint(oldp, oldlenp, newp, tc_tick)); 670 case KERN_TIMECOUNTER_TIMESTEPWARNINGS: 671 return (sysctl_int(oldp, oldlenp, newp, newlen, 672 ×tepwarnings)); 673 case KERN_TIMECOUNTER_HARDWARE: 674 return (sysctl_tc_hardware(oldp, oldlenp, newp, newlen)); 675 case KERN_TIMECOUNTER_CHOICE: 676 return (sysctl_tc_choice(oldp, oldlenp, newp, newlen)); 677 default: 678 return (EOPNOTSUPP); 679 } 680 /* NOTREACHED */ 681 } 682 683 void 684 ntp_update_second(int64_t *adjust) 685 { 686 int64_t adj; 687 688 /* Skew time according to any adjtime(2) adjustments. */ 689 if (adjtimedelta > 0) 690 adj = MIN(5000, adjtimedelta); 691 else 692 adj = MAX(-5000, adjtimedelta); 693 adjtimedelta -= adj; 694 *adjust = (adj * 1000) << 32; 695 *adjust += timecounter->tc_freq_adj; 696 } 697 698 int 699 tc_adjfreq(int64_t *old, int64_t *new) 700 { 701 if (old != NULL) { 702 *old = timecounter->tc_freq_adj; 703 } 704 if (new != NULL) { 705 timecounter->tc_freq_adj = *new; 706 } 707 return 0; 708 } 709