1 /* $NetBSD: kern_tc.c,v 1.61 2021/04/08 06:20:47 simonb Exp $ */ 2 3 /*- 4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /*- 33 * ---------------------------------------------------------------------------- 34 * "THE BEER-WARE LICENSE" (Revision 42): 35 * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you 36 * can do whatever you want with this stuff. If we meet some day, and you think 37 * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp 38 * --------------------------------------------------------------------------- 39 */ 40 41 #include <sys/cdefs.h> 42 /* __FBSDID("$FreeBSD: src/sys/kern/kern_tc.c,v 1.166 2005/09/19 22:16:31 andre Exp $"); */ 43 __KERNEL_RCSID(0, "$NetBSD: kern_tc.c,v 1.61 2021/04/08 06:20:47 simonb Exp $"); 44 45 #ifdef _KERNEL_OPT 46 #include "opt_ntp.h" 47 #endif 48 49 #include <sys/param.h> 50 #include <sys/atomic.h> 51 #include <sys/evcnt.h> 52 #include <sys/kauth.h> 53 #include <sys/kernel.h> 54 #include <sys/mutex.h> 55 #include <sys/reboot.h> /* XXX just to get AB_VERBOSE */ 56 #include <sys/sysctl.h> 57 #include <sys/syslog.h> 58 #include <sys/systm.h> 59 #include <sys/timepps.h> 60 #include <sys/timetc.h> 61 #include <sys/timex.h> 62 #include <sys/xcall.h> 63 64 /* 65 * A large step happens on boot. This constant detects such steps. 66 * It is relatively small so that ntp_update_second gets called enough 67 * in the typical 'missed a couple of seconds' case, but doesn't loop 68 * forever when the time step is large. 69 */ 70 #define LARGE_STEP 200 71 72 /* 73 * Implement a dummy timecounter which we can use until we get a real one 74 * in the air. This allows the console and other early stuff to use 75 * time services. 76 */ 77 78 static u_int 79 dummy_get_timecount(struct timecounter *tc) 80 { 81 static u_int now; 82 83 return ++now; 84 } 85 86 static struct timecounter dummy_timecounter = { 87 .tc_get_timecount = dummy_get_timecount, 88 .tc_counter_mask = ~0u, 89 .tc_frequency = 1000000, 90 .tc_name = "dummy", 91 .tc_quality = -1000000, 92 .tc_priv = NULL, 93 }; 94 95 struct timehands { 96 /* These fields must be initialized by the driver. */ 97 struct timecounter *th_counter; /* active timecounter */ 98 int64_t th_adjustment; /* frequency adjustment */ 99 /* (NTP/adjtime) */ 100 uint64_t th_scale; /* scale factor (counter */ 101 /* tick->time) */ 102 uint64_t th_offset_count; /* offset at last time */ 103 /* update (tc_windup()) */ 104 struct bintime th_offset; /* bin (up)time at windup */ 105 struct timeval th_microtime; /* cached microtime */ 106 struct timespec th_nanotime; /* cached nanotime */ 107 /* Fields not to be copied in tc_windup start with th_generation. */ 108 volatile u_int th_generation; /* current genration */ 109 struct timehands *th_next; /* next timehand */ 110 }; 111 112 static struct timehands th0; 113 static struct timehands th9 = { .th_next = &th0, }; 114 static struct timehands th8 = { .th_next = &th9, }; 115 static struct timehands th7 = { .th_next = &th8, }; 116 static struct timehands th6 = { .th_next = &th7, }; 117 static struct timehands th5 = { .th_next = &th6, }; 118 static struct timehands th4 = { .th_next = &th5, }; 119 static struct timehands th3 = { .th_next = &th4, }; 120 static struct timehands th2 = { .th_next = &th3, }; 121 static struct timehands th1 = { .th_next = &th2, }; 122 static struct timehands th0 = { 123 .th_counter = &dummy_timecounter, 124 .th_scale = (uint64_t)-1 / 1000000, 125 .th_offset = { .sec = 1, .frac = 0 }, 126 .th_generation = 1, 127 .th_next = &th1, 128 }; 129 130 static struct timehands *volatile timehands = &th0; 131 struct timecounter *timecounter = &dummy_timecounter; 132 static struct timecounter *timecounters = &dummy_timecounter; 133 134 volatile time_t time_second __cacheline_aligned = 1; 135 volatile time_t time_uptime __cacheline_aligned = 1; 136 137 static struct bintime timebasebin; 138 139 static int timestepwarnings; 140 141 kmutex_t timecounter_lock; 142 static u_int timecounter_mods; 143 static volatile int timecounter_removals = 1; 144 static u_int timecounter_bad; 145 146 /* 147 * sysctl helper routine for kern.timercounter.hardware 148 */ 149 static int 150 sysctl_kern_timecounter_hardware(SYSCTLFN_ARGS) 151 { 152 struct sysctlnode node; 153 int error; 154 char newname[MAX_TCNAMELEN]; 155 struct timecounter *newtc, *tc; 156 157 tc = timecounter; 158 159 strlcpy(newname, tc->tc_name, sizeof(newname)); 160 161 node = *rnode; 162 node.sysctl_data = newname; 163 node.sysctl_size = sizeof(newname); 164 165 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 166 167 if (error || 168 newp == NULL || 169 strncmp(newname, tc->tc_name, sizeof(newname)) == 0) 170 return error; 171 172 if (l != NULL && (error = kauth_authorize_system(l->l_cred, 173 KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_TIMECOUNTERS, newname, 174 NULL, NULL)) != 0) 175 return error; 176 177 if (!cold) 178 mutex_spin_enter(&timecounter_lock); 179 error = EINVAL; 180 for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) { 181 if (strcmp(newname, newtc->tc_name) != 0) 182 continue; 183 /* Warm up new timecounter. */ 184 (void)newtc->tc_get_timecount(newtc); 185 (void)newtc->tc_get_timecount(newtc); 186 timecounter = newtc; 187 error = 0; 188 break; 189 } 190 if (!cold) 191 mutex_spin_exit(&timecounter_lock); 192 return error; 193 } 194 195 static int 196 sysctl_kern_timecounter_choice(SYSCTLFN_ARGS) 197 { 198 char buf[MAX_TCNAMELEN+48]; 199 char *where; 200 const char *spc; 201 struct timecounter *tc; 202 size_t needed, left, slen; 203 int error, mods; 204 205 if (newp != NULL) 206 return EPERM; 207 if (namelen != 0) 208 return EINVAL; 209 210 mutex_spin_enter(&timecounter_lock); 211 retry: 212 spc = ""; 213 error = 0; 214 needed = 0; 215 left = *oldlenp; 216 where = oldp; 217 for (tc = timecounters; error == 0 && tc != NULL; tc = tc->tc_next) { 218 if (where == NULL) { 219 needed += sizeof(buf); /* be conservative */ 220 } else { 221 slen = snprintf(buf, sizeof(buf), "%s%s(q=%d, f=%" PRId64 222 " Hz)", spc, tc->tc_name, tc->tc_quality, 223 tc->tc_frequency); 224 if (left < slen + 1) 225 break; 226 mods = timecounter_mods; 227 mutex_spin_exit(&timecounter_lock); 228 error = copyout(buf, where, slen + 1); 229 mutex_spin_enter(&timecounter_lock); 230 if (mods != timecounter_mods) { 231 goto retry; 232 } 233 spc = " "; 234 where += slen; 235 needed += slen; 236 left -= slen; 237 } 238 } 239 mutex_spin_exit(&timecounter_lock); 240 241 *oldlenp = needed; 242 return error; 243 } 244 245 SYSCTL_SETUP(sysctl_timecounter_setup, "sysctl timecounter setup") 246 { 247 const struct sysctlnode *node; 248 249 sysctl_createv(clog, 0, NULL, &node, 250 CTLFLAG_PERMANENT, 251 CTLTYPE_NODE, "timecounter", 252 SYSCTL_DESCR("time counter information"), 253 NULL, 0, NULL, 0, 254 CTL_KERN, CTL_CREATE, CTL_EOL); 255 256 if (node != NULL) { 257 sysctl_createv(clog, 0, NULL, NULL, 258 CTLFLAG_PERMANENT, 259 CTLTYPE_STRING, "choice", 260 SYSCTL_DESCR("available counters"), 261 sysctl_kern_timecounter_choice, 0, NULL, 0, 262 CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL); 263 264 sysctl_createv(clog, 0, NULL, NULL, 265 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 266 CTLTYPE_STRING, "hardware", 267 SYSCTL_DESCR("currently active time counter"), 268 sysctl_kern_timecounter_hardware, 0, NULL, MAX_TCNAMELEN, 269 CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL); 270 271 sysctl_createv(clog, 0, NULL, NULL, 272 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 273 CTLTYPE_INT, "timestepwarnings", 274 SYSCTL_DESCR("log time steps"), 275 NULL, 0, ×tepwarnings, 0, 276 CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL); 277 } 278 } 279 280 #ifdef TC_COUNTERS 281 #define TC_STATS(name) \ 282 static struct evcnt n##name = \ 283 EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "timecounter", #name); \ 284 EVCNT_ATTACH_STATIC(n##name) 285 TC_STATS(binuptime); TC_STATS(nanouptime); TC_STATS(microuptime); 286 TC_STATS(bintime); TC_STATS(nanotime); TC_STATS(microtime); 287 TC_STATS(getbinuptime); TC_STATS(getnanouptime); TC_STATS(getmicrouptime); 288 TC_STATS(getbintime); TC_STATS(getnanotime); TC_STATS(getmicrotime); 289 TC_STATS(setclock); 290 #define TC_COUNT(var) var.ev_count++ 291 #undef TC_STATS 292 #else 293 #define TC_COUNT(var) /* nothing */ 294 #endif /* TC_COUNTERS */ 295 296 static void tc_windup(void); 297 298 /* 299 * Return the difference between the timehands' counter value now and what 300 * was when we copied it to the timehands' offset_count. 301 */ 302 static inline u_int 303 tc_delta(struct timehands *th) 304 { 305 struct timecounter *tc; 306 307 tc = th->th_counter; 308 return (tc->tc_get_timecount(tc) - 309 th->th_offset_count) & tc->tc_counter_mask; 310 } 311 312 /* 313 * Functions for reading the time. We have to loop until we are sure that 314 * the timehands that we operated on was not updated under our feet. See 315 * the comment in <sys/timevar.h> for a description of these 12 functions. 316 */ 317 318 void 319 binuptime(struct bintime *bt) 320 { 321 struct timehands *th; 322 lwp_t *l; 323 u_int lgen, gen; 324 325 TC_COUNT(nbinuptime); 326 327 /* 328 * Provide exclusion against tc_detach(). 329 * 330 * We record the number of timecounter removals before accessing 331 * timecounter state. Note that the LWP can be using multiple 332 * "generations" at once, due to interrupts (interrupted while in 333 * this function). Hardware interrupts will borrow the interrupted 334 * LWP's l_tcgen value for this purpose, and can themselves be 335 * interrupted by higher priority interrupts. In this case we need 336 * to ensure that the oldest generation in use is recorded. 337 * 338 * splsched() is too expensive to use, so we take care to structure 339 * this code in such a way that it is not required. Likewise, we 340 * do not disable preemption. 341 * 342 * Memory barriers are also too expensive to use for such a 343 * performance critical function. The good news is that we do not 344 * need memory barriers for this type of exclusion, as the thread 345 * updating timecounter_removals will issue a broadcast cross call 346 * before inspecting our l_tcgen value (this elides memory ordering 347 * issues). 348 */ 349 l = curlwp; 350 lgen = l->l_tcgen; 351 if (__predict_true(lgen == 0)) { 352 l->l_tcgen = timecounter_removals; 353 } 354 __insn_barrier(); 355 356 do { 357 th = timehands; 358 gen = th->th_generation; 359 *bt = th->th_offset; 360 bintime_addx(bt, th->th_scale * tc_delta(th)); 361 } while (gen == 0 || gen != th->th_generation); 362 363 __insn_barrier(); 364 l->l_tcgen = lgen; 365 } 366 367 void 368 nanouptime(struct timespec *tsp) 369 { 370 struct bintime bt; 371 372 TC_COUNT(nnanouptime); 373 binuptime(&bt); 374 bintime2timespec(&bt, tsp); 375 } 376 377 void 378 microuptime(struct timeval *tvp) 379 { 380 struct bintime bt; 381 382 TC_COUNT(nmicrouptime); 383 binuptime(&bt); 384 bintime2timeval(&bt, tvp); 385 } 386 387 void 388 bintime(struct bintime *bt) 389 { 390 391 TC_COUNT(nbintime); 392 binuptime(bt); 393 bintime_add(bt, &timebasebin); 394 } 395 396 void 397 nanotime(struct timespec *tsp) 398 { 399 struct bintime bt; 400 401 TC_COUNT(nnanotime); 402 bintime(&bt); 403 bintime2timespec(&bt, tsp); 404 } 405 406 void 407 microtime(struct timeval *tvp) 408 { 409 struct bintime bt; 410 411 TC_COUNT(nmicrotime); 412 bintime(&bt); 413 bintime2timeval(&bt, tvp); 414 } 415 416 void 417 getbinuptime(struct bintime *bt) 418 { 419 struct timehands *th; 420 u_int gen; 421 422 TC_COUNT(ngetbinuptime); 423 do { 424 th = timehands; 425 gen = th->th_generation; 426 *bt = th->th_offset; 427 } while (gen == 0 || gen != th->th_generation); 428 } 429 430 void 431 getnanouptime(struct timespec *tsp) 432 { 433 struct timehands *th; 434 u_int gen; 435 436 TC_COUNT(ngetnanouptime); 437 do { 438 th = timehands; 439 gen = th->th_generation; 440 bintime2timespec(&th->th_offset, tsp); 441 } while (gen == 0 || gen != th->th_generation); 442 } 443 444 void 445 getmicrouptime(struct timeval *tvp) 446 { 447 struct timehands *th; 448 u_int gen; 449 450 TC_COUNT(ngetmicrouptime); 451 do { 452 th = timehands; 453 gen = th->th_generation; 454 bintime2timeval(&th->th_offset, tvp); 455 } while (gen == 0 || gen != th->th_generation); 456 } 457 458 void 459 getbintime(struct bintime *bt) 460 { 461 struct timehands *th; 462 u_int gen; 463 464 TC_COUNT(ngetbintime); 465 do { 466 th = timehands; 467 gen = th->th_generation; 468 *bt = th->th_offset; 469 } while (gen == 0 || gen != th->th_generation); 470 bintime_add(bt, &timebasebin); 471 } 472 473 static inline void 474 dogetnanotime(struct timespec *tsp) 475 { 476 struct timehands *th; 477 u_int gen; 478 479 TC_COUNT(ngetnanotime); 480 do { 481 th = timehands; 482 gen = th->th_generation; 483 *tsp = th->th_nanotime; 484 } while (gen == 0 || gen != th->th_generation); 485 } 486 487 void 488 getnanotime(struct timespec *tsp) 489 { 490 491 dogetnanotime(tsp); 492 } 493 494 void dtrace_getnanotime(struct timespec *tsp); 495 496 void 497 dtrace_getnanotime(struct timespec *tsp) 498 { 499 500 dogetnanotime(tsp); 501 } 502 503 void 504 getmicrotime(struct timeval *tvp) 505 { 506 struct timehands *th; 507 u_int gen; 508 509 TC_COUNT(ngetmicrotime); 510 do { 511 th = timehands; 512 gen = th->th_generation; 513 *tvp = th->th_microtime; 514 } while (gen == 0 || gen != th->th_generation); 515 } 516 517 void 518 getnanoboottime(struct timespec *tsp) 519 { 520 struct bintime bt; 521 522 getbinboottime(&bt); 523 bintime2timespec(&bt, tsp); 524 } 525 526 void 527 getmicroboottime(struct timeval *tvp) 528 { 529 struct bintime bt; 530 531 getbinboottime(&bt); 532 bintime2timeval(&bt, tvp); 533 } 534 535 void 536 getbinboottime(struct bintime *bt) 537 { 538 539 /* 540 * XXX Need lockless read synchronization around timebasebin 541 * (and not just here). 542 */ 543 *bt = timebasebin; 544 } 545 546 /* 547 * Initialize a new timecounter and possibly use it. 548 */ 549 void 550 tc_init(struct timecounter *tc) 551 { 552 u_int u; 553 554 KASSERTMSG(tc->tc_next == NULL, "timecounter %s already initialised", 555 tc->tc_name); 556 557 u = tc->tc_frequency / tc->tc_counter_mask; 558 /* XXX: We need some margin here, 10% is a guess */ 559 u *= 11; 560 u /= 10; 561 if (u > hz && tc->tc_quality >= 0) { 562 tc->tc_quality = -2000; 563 aprint_verbose( 564 "timecounter: Timecounter \"%s\" frequency %ju Hz", 565 tc->tc_name, (uintmax_t)tc->tc_frequency); 566 aprint_verbose(" -- Insufficient hz, needs at least %u\n", u); 567 } else if (tc->tc_quality >= 0 || bootverbose) { 568 aprint_verbose( 569 "timecounter: Timecounter \"%s\" frequency %ju Hz " 570 "quality %d\n", tc->tc_name, (uintmax_t)tc->tc_frequency, 571 tc->tc_quality); 572 } 573 574 mutex_spin_enter(&timecounter_lock); 575 tc->tc_next = timecounters; 576 timecounters = tc; 577 timecounter_mods++; 578 /* 579 * Never automatically use a timecounter with negative quality. 580 * Even though we run on the dummy counter, switching here may be 581 * worse since this timecounter may not be monotonous. 582 */ 583 if (tc->tc_quality >= 0 && (tc->tc_quality > timecounter->tc_quality || 584 (tc->tc_quality == timecounter->tc_quality && 585 tc->tc_frequency > timecounter->tc_frequency))) { 586 (void)tc->tc_get_timecount(tc); 587 (void)tc->tc_get_timecount(tc); 588 timecounter = tc; 589 tc_windup(); 590 } 591 mutex_spin_exit(&timecounter_lock); 592 } 593 594 /* 595 * Pick a new timecounter due to the existing counter going bad. 596 */ 597 static void 598 tc_pick(void) 599 { 600 struct timecounter *best, *tc; 601 602 KASSERT(mutex_owned(&timecounter_lock)); 603 604 for (best = tc = timecounters; tc != NULL; tc = tc->tc_next) { 605 if (tc->tc_quality > best->tc_quality) 606 best = tc; 607 else if (tc->tc_quality < best->tc_quality) 608 continue; 609 else if (tc->tc_frequency > best->tc_frequency) 610 best = tc; 611 } 612 (void)best->tc_get_timecount(best); 613 (void)best->tc_get_timecount(best); 614 timecounter = best; 615 } 616 617 /* 618 * A timecounter has gone bad, arrange to pick a new one at the next 619 * clock tick. 620 */ 621 void 622 tc_gonebad(struct timecounter *tc) 623 { 624 625 tc->tc_quality = -100; 626 membar_producer(); 627 atomic_inc_uint(&timecounter_bad); 628 } 629 630 /* 631 * Stop using a timecounter and remove it from the timecounters list. 632 */ 633 int 634 tc_detach(struct timecounter *target) 635 { 636 struct timecounter *tc; 637 struct timecounter **tcp = NULL; 638 int removals; 639 lwp_t *l; 640 641 /* First, find the timecounter. */ 642 mutex_spin_enter(&timecounter_lock); 643 for (tcp = &timecounters, tc = timecounters; 644 tc != NULL; 645 tcp = &tc->tc_next, tc = tc->tc_next) { 646 if (tc == target) 647 break; 648 } 649 if (tc == NULL) { 650 mutex_spin_exit(&timecounter_lock); 651 return ESRCH; 652 } 653 654 /* And now, remove it. */ 655 *tcp = tc->tc_next; 656 if (timecounter == target) { 657 tc_pick(); 658 tc_windup(); 659 } 660 timecounter_mods++; 661 removals = timecounter_removals++; 662 mutex_spin_exit(&timecounter_lock); 663 664 /* 665 * We now have to determine if any threads in the system are still 666 * making use of this timecounter. 667 * 668 * We issue a broadcast cross call to elide memory ordering issues, 669 * then scan all LWPs in the system looking at each's timecounter 670 * generation number. We need to see a value of zero (not actively 671 * using a timecounter) or a value greater than our removal value. 672 * 673 * We may race with threads that read `timecounter_removals' and 674 * and then get preempted before updating `l_tcgen'. This is not 675 * a problem, since it means that these threads have not yet started 676 * accessing timecounter state. All we do need is one clean 677 * snapshot of the system where every thread appears not to be using 678 * old timecounter state. 679 */ 680 for (;;) { 681 xc_barrier(0); 682 683 mutex_enter(&proc_lock); 684 LIST_FOREACH(l, &alllwp, l_list) { 685 if (l->l_tcgen == 0 || l->l_tcgen > removals) { 686 /* 687 * Not using timecounter or old timecounter 688 * state at time of our xcall or later. 689 */ 690 continue; 691 } 692 break; 693 } 694 mutex_exit(&proc_lock); 695 696 /* 697 * If the timecounter is still in use, wait at least 10ms 698 * before retrying. 699 */ 700 if (l == NULL) { 701 return 0; 702 } 703 (void)kpause("tcdetach", false, mstohz(10), NULL); 704 } 705 } 706 707 /* Report the frequency of the current timecounter. */ 708 uint64_t 709 tc_getfrequency(void) 710 { 711 712 return timehands->th_counter->tc_frequency; 713 } 714 715 /* 716 * Step our concept of UTC. This is done by modifying our estimate of 717 * when we booted. 718 */ 719 void 720 tc_setclock(const struct timespec *ts) 721 { 722 struct timespec ts2; 723 struct bintime bt, bt2; 724 725 mutex_spin_enter(&timecounter_lock); 726 TC_COUNT(nsetclock); 727 binuptime(&bt2); 728 timespec2bintime(ts, &bt); 729 bintime_sub(&bt, &bt2); 730 bintime_add(&bt2, &timebasebin); 731 timebasebin = bt; 732 tc_windup(); 733 mutex_spin_exit(&timecounter_lock); 734 735 if (timestepwarnings) { 736 bintime2timespec(&bt2, &ts2); 737 log(LOG_INFO, 738 "Time stepped from %lld.%09ld to %lld.%09ld\n", 739 (long long)ts2.tv_sec, ts2.tv_nsec, 740 (long long)ts->tv_sec, ts->tv_nsec); 741 } 742 } 743 744 /* 745 * Initialize the next struct timehands in the ring and make 746 * it the active timehands. Along the way we might switch to a different 747 * timecounter and/or do seconds processing in NTP. Slightly magic. 748 */ 749 static void 750 tc_windup(void) 751 { 752 struct bintime bt; 753 struct timehands *th, *tho; 754 uint64_t scale; 755 u_int delta, ncount, ogen; 756 int i, s_update; 757 time_t t; 758 759 KASSERT(mutex_owned(&timecounter_lock)); 760 761 s_update = 0; 762 763 /* 764 * Make the next timehands a copy of the current one, but do not 765 * overwrite the generation or next pointer. While we update 766 * the contents, the generation must be zero. Ensure global 767 * visibility of the generation before proceeding. 768 */ 769 tho = timehands; 770 th = tho->th_next; 771 ogen = th->th_generation; 772 th->th_generation = 0; 773 membar_producer(); 774 bcopy(tho, th, offsetof(struct timehands, th_generation)); 775 776 /* 777 * Capture a timecounter delta on the current timecounter and if 778 * changing timecounters, a counter value from the new timecounter. 779 * Update the offset fields accordingly. 780 */ 781 delta = tc_delta(th); 782 if (th->th_counter != timecounter) 783 ncount = timecounter->tc_get_timecount(timecounter); 784 else 785 ncount = 0; 786 th->th_offset_count += delta; 787 bintime_addx(&th->th_offset, th->th_scale * delta); 788 789 /* 790 * Hardware latching timecounters may not generate interrupts on 791 * PPS events, so instead we poll them. There is a finite risk that 792 * the hardware might capture a count which is later than the one we 793 * got above, and therefore possibly in the next NTP second which might 794 * have a different rate than the current NTP second. It doesn't 795 * matter in practice. 796 */ 797 if (tho->th_counter->tc_poll_pps) 798 tho->th_counter->tc_poll_pps(tho->th_counter); 799 800 /* 801 * Deal with NTP second processing. The for loop normally 802 * iterates at most once, but in extreme situations it might 803 * keep NTP sane if timeouts are not run for several seconds. 804 * At boot, the time step can be large when the TOD hardware 805 * has been read, so on really large steps, we call 806 * ntp_update_second only twice. We need to call it twice in 807 * case we missed a leap second. 808 * If NTP is not compiled in ntp_update_second still calculates 809 * the adjustment resulting from adjtime() calls. 810 */ 811 bt = th->th_offset; 812 bintime_add(&bt, &timebasebin); 813 i = bt.sec - tho->th_microtime.tv_sec; 814 if (i > LARGE_STEP) 815 i = 2; 816 for (; i > 0; i--) { 817 t = bt.sec; 818 ntp_update_second(&th->th_adjustment, &bt.sec); 819 s_update = 1; 820 if (bt.sec != t) 821 timebasebin.sec += bt.sec - t; 822 } 823 824 /* Update the UTC timestamps used by the get*() functions. */ 825 /* XXX shouldn't do this here. Should force non-`get' versions. */ 826 bintime2timeval(&bt, &th->th_microtime); 827 bintime2timespec(&bt, &th->th_nanotime); 828 /* Now is a good time to change timecounters. */ 829 if (th->th_counter != timecounter) { 830 th->th_counter = timecounter; 831 th->th_offset_count = ncount; 832 s_update = 1; 833 } 834 835 /*- 836 * Recalculate the scaling factor. We want the number of 1/2^64 837 * fractions of a second per period of the hardware counter, taking 838 * into account the th_adjustment factor which the NTP PLL/adjtime(2) 839 * processing provides us with. 840 * 841 * The th_adjustment is nanoseconds per second with 32 bit binary 842 * fraction and we want 64 bit binary fraction of second: 843 * 844 * x = a * 2^32 / 10^9 = a * 4.294967296 845 * 846 * The range of th_adjustment is +/- 5000PPM so inside a 64bit int 847 * we can only multiply by about 850 without overflowing, but that 848 * leaves suitably precise fractions for multiply before divide. 849 * 850 * Divide before multiply with a fraction of 2199/512 results in a 851 * systematic undercompensation of 10PPM of th_adjustment. On a 852 * 5000PPM adjustment this is a 0.05PPM error. This is acceptable. 853 * 854 * We happily sacrifice the lowest of the 64 bits of our result 855 * to the goddess of code clarity. 856 * 857 */ 858 if (s_update) { 859 scale = (uint64_t)1 << 63; 860 scale += (th->th_adjustment / 1024) * 2199; 861 scale /= th->th_counter->tc_frequency; 862 th->th_scale = scale * 2; 863 } 864 /* 865 * Now that the struct timehands is again consistent, set the new 866 * generation number, making sure to not make it zero. Ensure 867 * changes are globally visible before changing. 868 */ 869 if (++ogen == 0) 870 ogen = 1; 871 membar_producer(); 872 th->th_generation = ogen; 873 874 /* 875 * Go live with the new struct timehands. Ensure changes are 876 * globally visible before changing. 877 */ 878 time_second = th->th_microtime.tv_sec; 879 time_uptime = th->th_offset.sec; 880 membar_producer(); 881 timehands = th; 882 883 /* 884 * Force users of the old timehand to move on. This is 885 * necessary for MP systems; we need to ensure that the 886 * consumers will move away from the old timehand before 887 * we begin updating it again when we eventually wrap 888 * around. 889 */ 890 if (++tho->th_generation == 0) 891 tho->th_generation = 1; 892 } 893 894 /* 895 * RFC 2783 PPS-API implementation. 896 */ 897 898 int 899 pps_ioctl(u_long cmd, void *data, struct pps_state *pps) 900 { 901 pps_params_t *app; 902 pps_info_t *pipi; 903 #ifdef PPS_SYNC 904 int *epi; 905 #endif 906 907 KASSERT(mutex_owned(&timecounter_lock)); 908 909 KASSERT(pps != NULL); 910 911 switch (cmd) { 912 case PPS_IOC_CREATE: 913 return 0; 914 case PPS_IOC_DESTROY: 915 return 0; 916 case PPS_IOC_SETPARAMS: 917 app = (pps_params_t *)data; 918 if (app->mode & ~pps->ppscap) 919 return EINVAL; 920 pps->ppsparam = *app; 921 return 0; 922 case PPS_IOC_GETPARAMS: 923 app = (pps_params_t *)data; 924 *app = pps->ppsparam; 925 app->api_version = PPS_API_VERS_1; 926 return 0; 927 case PPS_IOC_GETCAP: 928 *(int*)data = pps->ppscap; 929 return 0; 930 case PPS_IOC_FETCH: 931 pipi = (pps_info_t *)data; 932 pps->ppsinfo.current_mode = pps->ppsparam.mode; 933 *pipi = pps->ppsinfo; 934 return 0; 935 case PPS_IOC_KCBIND: 936 #ifdef PPS_SYNC 937 epi = (int *)data; 938 /* XXX Only root should be able to do this */ 939 if (*epi & ~pps->ppscap) 940 return EINVAL; 941 pps->kcmode = *epi; 942 return 0; 943 #else 944 return EOPNOTSUPP; 945 #endif 946 default: 947 return EPASSTHROUGH; 948 } 949 } 950 951 void 952 pps_init(struct pps_state *pps) 953 { 954 955 KASSERT(mutex_owned(&timecounter_lock)); 956 957 pps->ppscap |= PPS_TSFMT_TSPEC; 958 if (pps->ppscap & PPS_CAPTUREASSERT) 959 pps->ppscap |= PPS_OFFSETASSERT; 960 if (pps->ppscap & PPS_CAPTURECLEAR) 961 pps->ppscap |= PPS_OFFSETCLEAR; 962 } 963 964 /* 965 * capture a timetamp in the pps structure 966 */ 967 void 968 pps_capture(struct pps_state *pps) 969 { 970 struct timehands *th; 971 972 KASSERT(mutex_owned(&timecounter_lock)); 973 KASSERT(pps != NULL); 974 975 th = timehands; 976 pps->capgen = th->th_generation; 977 pps->capth = th; 978 pps->capcount = (uint64_t)tc_delta(th) + th->th_offset_count; 979 if (pps->capgen != th->th_generation) 980 pps->capgen = 0; 981 } 982 983 #ifdef PPS_DEBUG 984 int ppsdebug = 0; 985 #endif 986 987 /* 988 * process a pps_capture()ed event 989 */ 990 void 991 pps_event(struct pps_state *pps, int event) 992 { 993 pps_ref_event(pps, event, NULL, PPS_REFEVNT_PPS|PPS_REFEVNT_CAPTURE); 994 } 995 996 /* 997 * extended pps api / kernel pll/fll entry point 998 * 999 * feed reference time stamps to PPS engine 1000 * 1001 * will simulate a PPS event and feed 1002 * the NTP PLL/FLL if requested. 1003 * 1004 * the ref time stamps should be roughly once 1005 * a second but do not need to be exactly in phase 1006 * with the UTC second but should be close to it. 1007 * this relaxation of requirements allows callout 1008 * driven timestamping mechanisms to feed to pps 1009 * capture/kernel pll logic. 1010 * 1011 * calling pattern is: 1012 * pps_capture() (for PPS_REFEVNT_{CAPTURE|CAPCUR}) 1013 * read timestamp from reference source 1014 * pps_ref_event() 1015 * 1016 * supported refmodes: 1017 * PPS_REFEVNT_CAPTURE 1018 * use system timestamp of pps_capture() 1019 * PPS_REFEVNT_CURRENT 1020 * use system timestamp of this call 1021 * PPS_REFEVNT_CAPCUR 1022 * use average of read capture and current system time stamp 1023 * PPS_REFEVNT_PPS 1024 * assume timestamp on second mark - ref_ts is ignored 1025 * 1026 */ 1027 1028 void 1029 pps_ref_event(struct pps_state *pps, 1030 int event, 1031 struct bintime *ref_ts, 1032 int refmode 1033 ) 1034 { 1035 struct bintime bt; /* current time */ 1036 struct bintime btd; /* time difference */ 1037 struct bintime bt_ref; /* reference time */ 1038 struct timespec ts, *tsp, *osp; 1039 struct timehands *th; 1040 uint64_t tcount, acount, dcount, *pcount; 1041 int foff, gen; 1042 #ifdef PPS_SYNC 1043 int fhard; 1044 #endif 1045 pps_seq_t *pseq; 1046 1047 KASSERT(mutex_owned(&timecounter_lock)); 1048 1049 KASSERT(pps != NULL); 1050 1051 /* pick up current time stamp if needed */ 1052 if (refmode & (PPS_REFEVNT_CURRENT|PPS_REFEVNT_CAPCUR)) { 1053 /* pick up current time stamp */ 1054 th = timehands; 1055 gen = th->th_generation; 1056 tcount = (uint64_t)tc_delta(th) + th->th_offset_count; 1057 if (gen != th->th_generation) 1058 gen = 0; 1059 1060 /* If the timecounter was wound up underneath us, bail out. */ 1061 if (pps->capgen == 0 || 1062 pps->capgen != pps->capth->th_generation || 1063 gen == 0 || 1064 gen != pps->capgen) { 1065 #ifdef PPS_DEBUG 1066 if (ppsdebug & 0x1) { 1067 log(LOG_DEBUG, 1068 "pps_ref_event(pps=%p, event=%d, ...): DROP (wind-up)\n", 1069 pps, event); 1070 } 1071 #endif 1072 return; 1073 } 1074 } else { 1075 tcount = 0; /* keep GCC happy */ 1076 } 1077 1078 #ifdef PPS_DEBUG 1079 if (ppsdebug & 0x1) { 1080 struct timespec tmsp; 1081 1082 if (ref_ts == NULL) { 1083 tmsp.tv_sec = 0; 1084 tmsp.tv_nsec = 0; 1085 } else { 1086 bintime2timespec(ref_ts, &tmsp); 1087 } 1088 1089 log(LOG_DEBUG, 1090 "pps_ref_event(pps=%p, event=%d, ref_ts=%"PRIi64 1091 ".%09"PRIi32", refmode=0x%1x)\n", 1092 pps, event, tmsp.tv_sec, (int32_t)tmsp.tv_nsec, refmode); 1093 } 1094 #endif 1095 1096 /* setup correct event references */ 1097 if (event == PPS_CAPTUREASSERT) { 1098 tsp = &pps->ppsinfo.assert_timestamp; 1099 osp = &pps->ppsparam.assert_offset; 1100 foff = pps->ppsparam.mode & PPS_OFFSETASSERT; 1101 #ifdef PPS_SYNC 1102 fhard = pps->kcmode & PPS_CAPTUREASSERT; 1103 #endif 1104 pcount = &pps->ppscount[0]; 1105 pseq = &pps->ppsinfo.assert_sequence; 1106 } else { 1107 tsp = &pps->ppsinfo.clear_timestamp; 1108 osp = &pps->ppsparam.clear_offset; 1109 foff = pps->ppsparam.mode & PPS_OFFSETCLEAR; 1110 #ifdef PPS_SYNC 1111 fhard = pps->kcmode & PPS_CAPTURECLEAR; 1112 #endif 1113 pcount = &pps->ppscount[1]; 1114 pseq = &pps->ppsinfo.clear_sequence; 1115 } 1116 1117 /* determine system time stamp according to refmode */ 1118 dcount = 0; /* keep GCC happy */ 1119 switch (refmode & PPS_REFEVNT_RMASK) { 1120 case PPS_REFEVNT_CAPTURE: 1121 acount = pps->capcount; /* use capture timestamp */ 1122 break; 1123 1124 case PPS_REFEVNT_CURRENT: 1125 acount = tcount; /* use current timestamp */ 1126 break; 1127 1128 case PPS_REFEVNT_CAPCUR: 1129 /* 1130 * calculate counter value between pps_capture() and 1131 * pps_ref_event() 1132 */ 1133 dcount = tcount - pps->capcount; 1134 acount = (dcount / 2) + pps->capcount; 1135 break; 1136 1137 default: /* ignore call error silently */ 1138 return; 1139 } 1140 1141 /* 1142 * If the timecounter changed, we cannot compare the count values, so 1143 * we have to drop the rest of the PPS-stuff until the next event. 1144 */ 1145 if (pps->ppstc != pps->capth->th_counter) { 1146 pps->ppstc = pps->capth->th_counter; 1147 pps->capcount = acount; 1148 *pcount = acount; 1149 pps->ppscount[2] = acount; 1150 #ifdef PPS_DEBUG 1151 if (ppsdebug & 0x1) { 1152 log(LOG_DEBUG, 1153 "pps_ref_event(pps=%p, event=%d, ...): DROP (time-counter change)\n", 1154 pps, event); 1155 } 1156 #endif 1157 return; 1158 } 1159 1160 pps->capcount = acount; 1161 1162 /* Convert the count to a bintime. */ 1163 bt = pps->capth->th_offset; 1164 bintime_addx(&bt, pps->capth->th_scale * (acount - pps->capth->th_offset_count)); 1165 bintime_add(&bt, &timebasebin); 1166 1167 if ((refmode & PPS_REFEVNT_PPS) == 0) { 1168 /* determine difference to reference time stamp */ 1169 bt_ref = *ref_ts; 1170 1171 btd = bt; 1172 bintime_sub(&btd, &bt_ref); 1173 1174 /* 1175 * simulate a PPS timestamp by dropping the fraction 1176 * and applying the offset 1177 */ 1178 if (bt.frac >= (uint64_t)1<<63) /* skip to nearest second */ 1179 bt.sec++; 1180 bt.frac = 0; 1181 bintime_add(&bt, &btd); 1182 } else { 1183 /* 1184 * create ref_ts from current time - 1185 * we are supposed to be called on 1186 * the second mark 1187 */ 1188 bt_ref = bt; 1189 if (bt_ref.frac >= (uint64_t)1<<63) /* skip to nearest second */ 1190 bt_ref.sec++; 1191 bt_ref.frac = 0; 1192 } 1193 1194 /* convert bintime to timestamp */ 1195 bintime2timespec(&bt, &ts); 1196 1197 /* If the timecounter was wound up underneath us, bail out. */ 1198 if (pps->capgen != pps->capth->th_generation) 1199 return; 1200 1201 /* store time stamp */ 1202 *pcount = pps->capcount; 1203 (*pseq)++; 1204 *tsp = ts; 1205 1206 /* add offset correction */ 1207 if (foff) { 1208 timespecadd(tsp, osp, tsp); 1209 if (tsp->tv_nsec < 0) { 1210 tsp->tv_nsec += 1000000000; 1211 tsp->tv_sec -= 1; 1212 } 1213 } 1214 1215 #ifdef PPS_DEBUG 1216 if (ppsdebug & 0x2) { 1217 struct timespec ts2; 1218 struct timespec ts3; 1219 1220 bintime2timespec(&bt_ref, &ts2); 1221 1222 bt.sec = 0; 1223 bt.frac = 0; 1224 1225 if (refmode & PPS_REFEVNT_CAPCUR) { 1226 bintime_addx(&bt, pps->capth->th_scale * dcount); 1227 } 1228 bintime2timespec(&bt, &ts3); 1229 1230 log(LOG_DEBUG, "ref_ts=%"PRIi64".%09"PRIi32 1231 ", ts=%"PRIi64".%09"PRIi32", read latency=%"PRIi64" ns\n", 1232 ts2.tv_sec, (int32_t)ts2.tv_nsec, 1233 tsp->tv_sec, (int32_t)tsp->tv_nsec, 1234 timespec2ns(&ts3)); 1235 } 1236 #endif 1237 1238 #ifdef PPS_SYNC 1239 if (fhard) { 1240 uint64_t scale; 1241 uint64_t div; 1242 1243 /* 1244 * Feed the NTP PLL/FLL. 1245 * The FLL wants to know how many (hardware) nanoseconds 1246 * elapsed since the previous event (mod 1 second) thus 1247 * we are actually looking at the frequency difference scaled 1248 * in nsec. 1249 * As the counter time stamps are not truly at 1Hz 1250 * we need to scale the count by the elapsed 1251 * reference time. 1252 * valid sampling interval: [0.5..2[ sec 1253 */ 1254 1255 /* calculate elapsed raw count */ 1256 tcount = pps->capcount - pps->ppscount[2]; 1257 pps->ppscount[2] = pps->capcount; 1258 tcount &= pps->capth->th_counter->tc_counter_mask; 1259 1260 /* calculate elapsed ref time */ 1261 btd = bt_ref; 1262 bintime_sub(&btd, &pps->ref_time); 1263 pps->ref_time = bt_ref; 1264 1265 /* check that we stay below 2 sec */ 1266 if (btd.sec < 0 || btd.sec > 1) 1267 return; 1268 1269 /* we want at least 0.5 sec between samples */ 1270 if (btd.sec == 0 && btd.frac < (uint64_t)1<<63) 1271 return; 1272 1273 /* 1274 * calculate cycles per period by multiplying 1275 * the frequency with the elapsed period 1276 * we pick a fraction of 30 bits 1277 * ~1ns resolution for elapsed time 1278 */ 1279 div = (uint64_t)btd.sec << 30; 1280 div |= (btd.frac >> 34) & (((uint64_t)1 << 30) - 1); 1281 div *= pps->capth->th_counter->tc_frequency; 1282 div >>= 30; 1283 1284 if (div == 0) /* safeguard */ 1285 return; 1286 1287 scale = (uint64_t)1 << 63; 1288 scale /= div; 1289 scale *= 2; 1290 1291 bt.sec = 0; 1292 bt.frac = 0; 1293 bintime_addx(&bt, scale * tcount); 1294 bintime2timespec(&bt, &ts); 1295 1296 #ifdef PPS_DEBUG 1297 if (ppsdebug & 0x4) { 1298 struct timespec ts2; 1299 int64_t df; 1300 1301 bintime2timespec(&bt_ref, &ts2); 1302 df = timespec2ns(&ts); 1303 if (df > 500000000) 1304 df -= 1000000000; 1305 log(LOG_DEBUG, "hardpps: ref_ts=%"PRIi64 1306 ".%09"PRIi32", ts=%"PRIi64".%09"PRIi32 1307 ", freqdiff=%"PRIi64" ns/s\n", 1308 ts2.tv_sec, (int32_t)ts2.tv_nsec, 1309 tsp->tv_sec, (int32_t)tsp->tv_nsec, 1310 df); 1311 } 1312 #endif 1313 1314 hardpps(tsp, timespec2ns(&ts)); 1315 } 1316 #endif 1317 } 1318 1319 /* 1320 * Timecounters need to be updated every so often to prevent the hardware 1321 * counter from overflowing. Updating also recalculates the cached values 1322 * used by the get*() family of functions, so their precision depends on 1323 * the update frequency. 1324 */ 1325 1326 static int tc_tick; 1327 1328 void 1329 tc_ticktock(void) 1330 { 1331 static int count; 1332 1333 if (++count < tc_tick) 1334 return; 1335 count = 0; 1336 mutex_spin_enter(&timecounter_lock); 1337 if (__predict_false(timecounter_bad != 0)) { 1338 /* An existing timecounter has gone bad, pick a new one. */ 1339 (void)atomic_swap_uint(&timecounter_bad, 0); 1340 if (timecounter->tc_quality < 0) { 1341 tc_pick(); 1342 } 1343 } 1344 tc_windup(); 1345 mutex_spin_exit(&timecounter_lock); 1346 } 1347 1348 void 1349 inittimecounter(void) 1350 { 1351 u_int p; 1352 1353 mutex_init(&timecounter_lock, MUTEX_DEFAULT, IPL_HIGH); 1354 1355 /* 1356 * Set the initial timeout to 1357 * max(1, <approx. number of hardclock ticks in a millisecond>). 1358 * People should probably not use the sysctl to set the timeout 1359 * to smaller than its initial value, since that value is the 1360 * smallest reasonable one. If they want better timestamps they 1361 * should use the non-"get"* functions. 1362 */ 1363 if (hz > 1000) 1364 tc_tick = (hz + 500) / 1000; 1365 else 1366 tc_tick = 1; 1367 p = (tc_tick * 1000000) / hz; 1368 aprint_verbose("timecounter: Timecounters tick every %d.%03u msec\n", 1369 p / 1000, p % 1000); 1370 1371 /* warm up new timecounter (again) and get rolling. */ 1372 (void)timecounter->tc_get_timecount(timecounter); 1373 (void)timecounter->tc_get_timecount(timecounter); 1374 } 1375