1 /* $NetBSD: kern_tc.c,v 1.62 2021/06/02 21:34:58 riastradh Exp $ */ 2 3 /*- 4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /*- 33 * ---------------------------------------------------------------------------- 34 * "THE BEER-WARE LICENSE" (Revision 42): 35 * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you 36 * can do whatever you want with this stuff. If we meet some day, and you think 37 * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp 38 * --------------------------------------------------------------------------- 39 */ 40 41 #include <sys/cdefs.h> 42 /* __FBSDID("$FreeBSD: src/sys/kern/kern_tc.c,v 1.166 2005/09/19 22:16:31 andre Exp $"); */ 43 __KERNEL_RCSID(0, "$NetBSD: kern_tc.c,v 1.62 2021/06/02 21:34:58 riastradh Exp $"); 44 45 #ifdef _KERNEL_OPT 46 #include "opt_ntp.h" 47 #endif 48 49 #include <sys/param.h> 50 #include <sys/atomic.h> 51 #include <sys/evcnt.h> 52 #include <sys/kauth.h> 53 #include <sys/kernel.h> 54 #include <sys/mutex.h> 55 #include <sys/reboot.h> /* XXX just to get AB_VERBOSE */ 56 #include <sys/sysctl.h> 57 #include <sys/syslog.h> 58 #include <sys/systm.h> 59 #include <sys/timepps.h> 60 #include <sys/timetc.h> 61 #include <sys/timex.h> 62 #include <sys/xcall.h> 63 64 /* 65 * A large step happens on boot. This constant detects such steps. 66 * It is relatively small so that ntp_update_second gets called enough 67 * in the typical 'missed a couple of seconds' case, but doesn't loop 68 * forever when the time step is large. 69 */ 70 #define LARGE_STEP 200 71 72 /* 73 * Implement a dummy timecounter which we can use until we get a real one 74 * in the air. This allows the console and other early stuff to use 75 * time services. 76 */ 77 78 static u_int 79 dummy_get_timecount(struct timecounter *tc) 80 { 81 static u_int now; 82 83 return ++now; 84 } 85 86 static struct timecounter dummy_timecounter = { 87 .tc_get_timecount = dummy_get_timecount, 88 .tc_counter_mask = ~0u, 89 .tc_frequency = 1000000, 90 .tc_name = "dummy", 91 .tc_quality = -1000000, 92 .tc_priv = NULL, 93 }; 94 95 struct timehands { 96 /* These fields must be initialized by the driver. */ 97 struct timecounter *th_counter; /* active timecounter */ 98 int64_t th_adjustment; /* frequency adjustment */ 99 /* (NTP/adjtime) */ 100 uint64_t th_scale; /* scale factor (counter */ 101 /* tick->time) */ 102 uint64_t th_offset_count; /* offset at last time */ 103 /* update (tc_windup()) */ 104 struct bintime th_offset; /* bin (up)time at windup */ 105 struct timeval th_microtime; /* cached microtime */ 106 struct timespec th_nanotime; /* cached nanotime */ 107 /* Fields not to be copied in tc_windup start with th_generation. */ 108 volatile u_int th_generation; /* current genration */ 109 struct timehands *th_next; /* next timehand */ 110 }; 111 112 static struct timehands th0; 113 static struct timehands th9 = { .th_next = &th0, }; 114 static struct timehands th8 = { .th_next = &th9, }; 115 static struct timehands th7 = { .th_next = &th8, }; 116 static struct timehands th6 = { .th_next = &th7, }; 117 static struct timehands th5 = { .th_next = &th6, }; 118 static struct timehands th4 = { .th_next = &th5, }; 119 static struct timehands th3 = { .th_next = &th4, }; 120 static struct timehands th2 = { .th_next = &th3, }; 121 static struct timehands th1 = { .th_next = &th2, }; 122 static struct timehands th0 = { 123 .th_counter = &dummy_timecounter, 124 .th_scale = (uint64_t)-1 / 1000000, 125 .th_offset = { .sec = 1, .frac = 0 }, 126 .th_generation = 1, 127 .th_next = &th1, 128 }; 129 130 static struct timehands *volatile timehands = &th0; 131 struct timecounter *timecounter = &dummy_timecounter; 132 static struct timecounter *timecounters = &dummy_timecounter; 133 134 volatile time_t time_second __cacheline_aligned = 1; 135 volatile time_t time_uptime __cacheline_aligned = 1; 136 137 static struct bintime timebasebin; 138 139 static int timestepwarnings; 140 141 kmutex_t timecounter_lock; 142 static u_int timecounter_mods; 143 static volatile int timecounter_removals = 1; 144 static u_int timecounter_bad; 145 146 /* 147 * sysctl helper routine for kern.timercounter.hardware 148 */ 149 static int 150 sysctl_kern_timecounter_hardware(SYSCTLFN_ARGS) 151 { 152 struct sysctlnode node; 153 int error; 154 char newname[MAX_TCNAMELEN]; 155 struct timecounter *newtc, *tc; 156 157 tc = timecounter; 158 159 strlcpy(newname, tc->tc_name, sizeof(newname)); 160 161 node = *rnode; 162 node.sysctl_data = newname; 163 node.sysctl_size = sizeof(newname); 164 165 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 166 167 if (error || 168 newp == NULL || 169 strncmp(newname, tc->tc_name, sizeof(newname)) == 0) 170 return error; 171 172 if (l != NULL && (error = kauth_authorize_system(l->l_cred, 173 KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_TIMECOUNTERS, newname, 174 NULL, NULL)) != 0) 175 return error; 176 177 if (!cold) 178 mutex_spin_enter(&timecounter_lock); 179 error = EINVAL; 180 for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) { 181 if (strcmp(newname, newtc->tc_name) != 0) 182 continue; 183 /* Warm up new timecounter. */ 184 (void)newtc->tc_get_timecount(newtc); 185 (void)newtc->tc_get_timecount(newtc); 186 timecounter = newtc; 187 error = 0; 188 break; 189 } 190 if (!cold) 191 mutex_spin_exit(&timecounter_lock); 192 return error; 193 } 194 195 static int 196 sysctl_kern_timecounter_choice(SYSCTLFN_ARGS) 197 { 198 char buf[MAX_TCNAMELEN+48]; 199 char *where; 200 const char *spc; 201 struct timecounter *tc; 202 size_t needed, left, slen; 203 int error, mods; 204 205 if (newp != NULL) 206 return EPERM; 207 if (namelen != 0) 208 return EINVAL; 209 210 mutex_spin_enter(&timecounter_lock); 211 retry: 212 spc = ""; 213 error = 0; 214 needed = 0; 215 left = *oldlenp; 216 where = oldp; 217 for (tc = timecounters; error == 0 && tc != NULL; tc = tc->tc_next) { 218 if (where == NULL) { 219 needed += sizeof(buf); /* be conservative */ 220 } else { 221 slen = snprintf(buf, sizeof(buf), "%s%s(q=%d, f=%" PRId64 222 " Hz)", spc, tc->tc_name, tc->tc_quality, 223 tc->tc_frequency); 224 if (left < slen + 1) 225 break; 226 mods = timecounter_mods; 227 mutex_spin_exit(&timecounter_lock); 228 error = copyout(buf, where, slen + 1); 229 mutex_spin_enter(&timecounter_lock); 230 if (mods != timecounter_mods) { 231 goto retry; 232 } 233 spc = " "; 234 where += slen; 235 needed += slen; 236 left -= slen; 237 } 238 } 239 mutex_spin_exit(&timecounter_lock); 240 241 *oldlenp = needed; 242 return error; 243 } 244 245 SYSCTL_SETUP(sysctl_timecounter_setup, "sysctl timecounter setup") 246 { 247 const struct sysctlnode *node; 248 249 sysctl_createv(clog, 0, NULL, &node, 250 CTLFLAG_PERMANENT, 251 CTLTYPE_NODE, "timecounter", 252 SYSCTL_DESCR("time counter information"), 253 NULL, 0, NULL, 0, 254 CTL_KERN, CTL_CREATE, CTL_EOL); 255 256 if (node != NULL) { 257 sysctl_createv(clog, 0, NULL, NULL, 258 CTLFLAG_PERMANENT, 259 CTLTYPE_STRING, "choice", 260 SYSCTL_DESCR("available counters"), 261 sysctl_kern_timecounter_choice, 0, NULL, 0, 262 CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL); 263 264 sysctl_createv(clog, 0, NULL, NULL, 265 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 266 CTLTYPE_STRING, "hardware", 267 SYSCTL_DESCR("currently active time counter"), 268 sysctl_kern_timecounter_hardware, 0, NULL, MAX_TCNAMELEN, 269 CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL); 270 271 sysctl_createv(clog, 0, NULL, NULL, 272 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 273 CTLTYPE_INT, "timestepwarnings", 274 SYSCTL_DESCR("log time steps"), 275 NULL, 0, ×tepwarnings, 0, 276 CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL); 277 } 278 } 279 280 #ifdef TC_COUNTERS 281 #define TC_STATS(name) \ 282 static struct evcnt n##name = \ 283 EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "timecounter", #name); \ 284 EVCNT_ATTACH_STATIC(n##name) 285 TC_STATS(binuptime); TC_STATS(nanouptime); TC_STATS(microuptime); 286 TC_STATS(bintime); TC_STATS(nanotime); TC_STATS(microtime); 287 TC_STATS(getbinuptime); TC_STATS(getnanouptime); TC_STATS(getmicrouptime); 288 TC_STATS(getbintime); TC_STATS(getnanotime); TC_STATS(getmicrotime); 289 TC_STATS(setclock); 290 #define TC_COUNT(var) var.ev_count++ 291 #undef TC_STATS 292 #else 293 #define TC_COUNT(var) /* nothing */ 294 #endif /* TC_COUNTERS */ 295 296 static void tc_windup(void); 297 298 /* 299 * Return the difference between the timehands' counter value now and what 300 * was when we copied it to the timehands' offset_count. 301 */ 302 static inline u_int 303 tc_delta(struct timehands *th) 304 { 305 struct timecounter *tc; 306 307 tc = th->th_counter; 308 return (tc->tc_get_timecount(tc) - 309 th->th_offset_count) & tc->tc_counter_mask; 310 } 311 312 /* 313 * Functions for reading the time. We have to loop until we are sure that 314 * the timehands that we operated on was not updated under our feet. See 315 * the comment in <sys/timevar.h> for a description of these 12 functions. 316 */ 317 318 void 319 binuptime(struct bintime *bt) 320 { 321 struct timehands *th; 322 lwp_t *l; 323 u_int lgen, gen; 324 325 TC_COUNT(nbinuptime); 326 327 /* 328 * Provide exclusion against tc_detach(). 329 * 330 * We record the number of timecounter removals before accessing 331 * timecounter state. Note that the LWP can be using multiple 332 * "generations" at once, due to interrupts (interrupted while in 333 * this function). Hardware interrupts will borrow the interrupted 334 * LWP's l_tcgen value for this purpose, and can themselves be 335 * interrupted by higher priority interrupts. In this case we need 336 * to ensure that the oldest generation in use is recorded. 337 * 338 * splsched() is too expensive to use, so we take care to structure 339 * this code in such a way that it is not required. Likewise, we 340 * do not disable preemption. 341 * 342 * Memory barriers are also too expensive to use for such a 343 * performance critical function. The good news is that we do not 344 * need memory barriers for this type of exclusion, as the thread 345 * updating timecounter_removals will issue a broadcast cross call 346 * before inspecting our l_tcgen value (this elides memory ordering 347 * issues). 348 */ 349 l = curlwp; 350 lgen = l->l_tcgen; 351 if (__predict_true(lgen == 0)) { 352 l->l_tcgen = timecounter_removals; 353 } 354 __insn_barrier(); 355 356 do { 357 th = timehands; 358 gen = th->th_generation; 359 *bt = th->th_offset; 360 bintime_addx(bt, th->th_scale * tc_delta(th)); 361 } while (gen == 0 || gen != th->th_generation); 362 363 __insn_barrier(); 364 l->l_tcgen = lgen; 365 } 366 367 void 368 nanouptime(struct timespec *tsp) 369 { 370 struct bintime bt; 371 372 TC_COUNT(nnanouptime); 373 binuptime(&bt); 374 bintime2timespec(&bt, tsp); 375 } 376 377 void 378 microuptime(struct timeval *tvp) 379 { 380 struct bintime bt; 381 382 TC_COUNT(nmicrouptime); 383 binuptime(&bt); 384 bintime2timeval(&bt, tvp); 385 } 386 387 void 388 bintime(struct bintime *bt) 389 { 390 391 TC_COUNT(nbintime); 392 binuptime(bt); 393 bintime_add(bt, &timebasebin); 394 } 395 396 void 397 nanotime(struct timespec *tsp) 398 { 399 struct bintime bt; 400 401 TC_COUNT(nnanotime); 402 bintime(&bt); 403 bintime2timespec(&bt, tsp); 404 } 405 406 void 407 microtime(struct timeval *tvp) 408 { 409 struct bintime bt; 410 411 TC_COUNT(nmicrotime); 412 bintime(&bt); 413 bintime2timeval(&bt, tvp); 414 } 415 416 void 417 getbinuptime(struct bintime *bt) 418 { 419 struct timehands *th; 420 u_int gen; 421 422 TC_COUNT(ngetbinuptime); 423 do { 424 th = timehands; 425 gen = th->th_generation; 426 *bt = th->th_offset; 427 } while (gen == 0 || gen != th->th_generation); 428 } 429 430 void 431 getnanouptime(struct timespec *tsp) 432 { 433 struct timehands *th; 434 u_int gen; 435 436 TC_COUNT(ngetnanouptime); 437 do { 438 th = timehands; 439 gen = th->th_generation; 440 bintime2timespec(&th->th_offset, tsp); 441 } while (gen == 0 || gen != th->th_generation); 442 } 443 444 void 445 getmicrouptime(struct timeval *tvp) 446 { 447 struct timehands *th; 448 u_int gen; 449 450 TC_COUNT(ngetmicrouptime); 451 do { 452 th = timehands; 453 gen = th->th_generation; 454 bintime2timeval(&th->th_offset, tvp); 455 } while (gen == 0 || gen != th->th_generation); 456 } 457 458 void 459 getbintime(struct bintime *bt) 460 { 461 struct timehands *th; 462 u_int gen; 463 464 TC_COUNT(ngetbintime); 465 do { 466 th = timehands; 467 gen = th->th_generation; 468 *bt = th->th_offset; 469 } while (gen == 0 || gen != th->th_generation); 470 bintime_add(bt, &timebasebin); 471 } 472 473 static inline void 474 dogetnanotime(struct timespec *tsp) 475 { 476 struct timehands *th; 477 u_int gen; 478 479 TC_COUNT(ngetnanotime); 480 do { 481 th = timehands; 482 gen = th->th_generation; 483 *tsp = th->th_nanotime; 484 } while (gen == 0 || gen != th->th_generation); 485 } 486 487 void 488 getnanotime(struct timespec *tsp) 489 { 490 491 dogetnanotime(tsp); 492 } 493 494 void dtrace_getnanotime(struct timespec *tsp); 495 496 void 497 dtrace_getnanotime(struct timespec *tsp) 498 { 499 500 dogetnanotime(tsp); 501 } 502 503 void 504 getmicrotime(struct timeval *tvp) 505 { 506 struct timehands *th; 507 u_int gen; 508 509 TC_COUNT(ngetmicrotime); 510 do { 511 th = timehands; 512 gen = th->th_generation; 513 *tvp = th->th_microtime; 514 } while (gen == 0 || gen != th->th_generation); 515 } 516 517 void 518 getnanoboottime(struct timespec *tsp) 519 { 520 struct bintime bt; 521 522 getbinboottime(&bt); 523 bintime2timespec(&bt, tsp); 524 } 525 526 void 527 getmicroboottime(struct timeval *tvp) 528 { 529 struct bintime bt; 530 531 getbinboottime(&bt); 532 bintime2timeval(&bt, tvp); 533 } 534 535 void 536 getbinboottime(struct bintime *bt) 537 { 538 539 /* 540 * XXX Need lockless read synchronization around timebasebin 541 * (and not just here). 542 */ 543 *bt = timebasebin; 544 } 545 546 /* 547 * Initialize a new timecounter and possibly use it. 548 */ 549 void 550 tc_init(struct timecounter *tc) 551 { 552 u_int u; 553 554 KASSERTMSG(tc->tc_next == NULL, "timecounter %s already initialised", 555 tc->tc_name); 556 557 u = tc->tc_frequency / tc->tc_counter_mask; 558 /* XXX: We need some margin here, 10% is a guess */ 559 u *= 11; 560 u /= 10; 561 if (u > hz && tc->tc_quality >= 0) { 562 tc->tc_quality = -2000; 563 aprint_verbose( 564 "timecounter: Timecounter \"%s\" frequency %ju Hz", 565 tc->tc_name, (uintmax_t)tc->tc_frequency); 566 aprint_verbose(" -- Insufficient hz, needs at least %u\n", u); 567 } else if (tc->tc_quality >= 0 || bootverbose) { 568 aprint_verbose( 569 "timecounter: Timecounter \"%s\" frequency %ju Hz " 570 "quality %d\n", tc->tc_name, (uintmax_t)tc->tc_frequency, 571 tc->tc_quality); 572 } 573 574 mutex_spin_enter(&timecounter_lock); 575 tc->tc_next = timecounters; 576 timecounters = tc; 577 timecounter_mods++; 578 /* 579 * Never automatically use a timecounter with negative quality. 580 * Even though we run on the dummy counter, switching here may be 581 * worse since this timecounter may not be monotonous. 582 */ 583 if (tc->tc_quality >= 0 && (tc->tc_quality > timecounter->tc_quality || 584 (tc->tc_quality == timecounter->tc_quality && 585 tc->tc_frequency > timecounter->tc_frequency))) { 586 (void)tc->tc_get_timecount(tc); 587 (void)tc->tc_get_timecount(tc); 588 timecounter = tc; 589 tc_windup(); 590 } 591 mutex_spin_exit(&timecounter_lock); 592 } 593 594 /* 595 * Pick a new timecounter due to the existing counter going bad. 596 */ 597 static void 598 tc_pick(void) 599 { 600 struct timecounter *best, *tc; 601 602 KASSERT(mutex_owned(&timecounter_lock)); 603 604 for (best = tc = timecounters; tc != NULL; tc = tc->tc_next) { 605 if (tc->tc_quality > best->tc_quality) 606 best = tc; 607 else if (tc->tc_quality < best->tc_quality) 608 continue; 609 else if (tc->tc_frequency > best->tc_frequency) 610 best = tc; 611 } 612 (void)best->tc_get_timecount(best); 613 (void)best->tc_get_timecount(best); 614 timecounter = best; 615 } 616 617 /* 618 * A timecounter has gone bad, arrange to pick a new one at the next 619 * clock tick. 620 */ 621 void 622 tc_gonebad(struct timecounter *tc) 623 { 624 625 tc->tc_quality = -100; 626 membar_producer(); 627 atomic_inc_uint(&timecounter_bad); 628 } 629 630 /* 631 * Stop using a timecounter and remove it from the timecounters list. 632 */ 633 int 634 tc_detach(struct timecounter *target) 635 { 636 struct timecounter *tc; 637 struct timecounter **tcp = NULL; 638 int removals; 639 lwp_t *l; 640 641 /* First, find the timecounter. */ 642 mutex_spin_enter(&timecounter_lock); 643 for (tcp = &timecounters, tc = timecounters; 644 tc != NULL; 645 tcp = &tc->tc_next, tc = tc->tc_next) { 646 if (tc == target) 647 break; 648 } 649 if (tc == NULL) { 650 mutex_spin_exit(&timecounter_lock); 651 return ESRCH; 652 } 653 654 /* And now, remove it. */ 655 *tcp = tc->tc_next; 656 if (timecounter == target) { 657 tc_pick(); 658 tc_windup(); 659 } 660 timecounter_mods++; 661 removals = timecounter_removals++; 662 mutex_spin_exit(&timecounter_lock); 663 664 /* 665 * We now have to determine if any threads in the system are still 666 * making use of this timecounter. 667 * 668 * We issue a broadcast cross call to elide memory ordering issues, 669 * then scan all LWPs in the system looking at each's timecounter 670 * generation number. We need to see a value of zero (not actively 671 * using a timecounter) or a value greater than our removal value. 672 * 673 * We may race with threads that read `timecounter_removals' and 674 * and then get preempted before updating `l_tcgen'. This is not 675 * a problem, since it means that these threads have not yet started 676 * accessing timecounter state. All we do need is one clean 677 * snapshot of the system where every thread appears not to be using 678 * old timecounter state. 679 */ 680 for (;;) { 681 xc_barrier(0); 682 683 mutex_enter(&proc_lock); 684 LIST_FOREACH(l, &alllwp, l_list) { 685 if (l->l_tcgen == 0 || l->l_tcgen > removals) { 686 /* 687 * Not using timecounter or old timecounter 688 * state at time of our xcall or later. 689 */ 690 continue; 691 } 692 break; 693 } 694 mutex_exit(&proc_lock); 695 696 /* 697 * If the timecounter is still in use, wait at least 10ms 698 * before retrying. 699 */ 700 if (l == NULL) { 701 break; 702 } 703 (void)kpause("tcdetach", false, mstohz(10), NULL); 704 } 705 706 tc->tc_next = NULL; 707 return 0; 708 } 709 710 /* Report the frequency of the current timecounter. */ 711 uint64_t 712 tc_getfrequency(void) 713 { 714 715 return timehands->th_counter->tc_frequency; 716 } 717 718 /* 719 * Step our concept of UTC. This is done by modifying our estimate of 720 * when we booted. 721 */ 722 void 723 tc_setclock(const struct timespec *ts) 724 { 725 struct timespec ts2; 726 struct bintime bt, bt2; 727 728 mutex_spin_enter(&timecounter_lock); 729 TC_COUNT(nsetclock); 730 binuptime(&bt2); 731 timespec2bintime(ts, &bt); 732 bintime_sub(&bt, &bt2); 733 bintime_add(&bt2, &timebasebin); 734 timebasebin = bt; 735 tc_windup(); 736 mutex_spin_exit(&timecounter_lock); 737 738 if (timestepwarnings) { 739 bintime2timespec(&bt2, &ts2); 740 log(LOG_INFO, 741 "Time stepped from %lld.%09ld to %lld.%09ld\n", 742 (long long)ts2.tv_sec, ts2.tv_nsec, 743 (long long)ts->tv_sec, ts->tv_nsec); 744 } 745 } 746 747 /* 748 * Initialize the next struct timehands in the ring and make 749 * it the active timehands. Along the way we might switch to a different 750 * timecounter and/or do seconds processing in NTP. Slightly magic. 751 */ 752 static void 753 tc_windup(void) 754 { 755 struct bintime bt; 756 struct timehands *th, *tho; 757 uint64_t scale; 758 u_int delta, ncount, ogen; 759 int i, s_update; 760 time_t t; 761 762 KASSERT(mutex_owned(&timecounter_lock)); 763 764 s_update = 0; 765 766 /* 767 * Make the next timehands a copy of the current one, but do not 768 * overwrite the generation or next pointer. While we update 769 * the contents, the generation must be zero. Ensure global 770 * visibility of the generation before proceeding. 771 */ 772 tho = timehands; 773 th = tho->th_next; 774 ogen = th->th_generation; 775 th->th_generation = 0; 776 membar_producer(); 777 bcopy(tho, th, offsetof(struct timehands, th_generation)); 778 779 /* 780 * Capture a timecounter delta on the current timecounter and if 781 * changing timecounters, a counter value from the new timecounter. 782 * Update the offset fields accordingly. 783 */ 784 delta = tc_delta(th); 785 if (th->th_counter != timecounter) 786 ncount = timecounter->tc_get_timecount(timecounter); 787 else 788 ncount = 0; 789 th->th_offset_count += delta; 790 bintime_addx(&th->th_offset, th->th_scale * delta); 791 792 /* 793 * Hardware latching timecounters may not generate interrupts on 794 * PPS events, so instead we poll them. There is a finite risk that 795 * the hardware might capture a count which is later than the one we 796 * got above, and therefore possibly in the next NTP second which might 797 * have a different rate than the current NTP second. It doesn't 798 * matter in practice. 799 */ 800 if (tho->th_counter->tc_poll_pps) 801 tho->th_counter->tc_poll_pps(tho->th_counter); 802 803 /* 804 * Deal with NTP second processing. The for loop normally 805 * iterates at most once, but in extreme situations it might 806 * keep NTP sane if timeouts are not run for several seconds. 807 * At boot, the time step can be large when the TOD hardware 808 * has been read, so on really large steps, we call 809 * ntp_update_second only twice. We need to call it twice in 810 * case we missed a leap second. 811 * If NTP is not compiled in ntp_update_second still calculates 812 * the adjustment resulting from adjtime() calls. 813 */ 814 bt = th->th_offset; 815 bintime_add(&bt, &timebasebin); 816 i = bt.sec - tho->th_microtime.tv_sec; 817 if (i > LARGE_STEP) 818 i = 2; 819 for (; i > 0; i--) { 820 t = bt.sec; 821 ntp_update_second(&th->th_adjustment, &bt.sec); 822 s_update = 1; 823 if (bt.sec != t) 824 timebasebin.sec += bt.sec - t; 825 } 826 827 /* Update the UTC timestamps used by the get*() functions. */ 828 /* XXX shouldn't do this here. Should force non-`get' versions. */ 829 bintime2timeval(&bt, &th->th_microtime); 830 bintime2timespec(&bt, &th->th_nanotime); 831 /* Now is a good time to change timecounters. */ 832 if (th->th_counter != timecounter) { 833 th->th_counter = timecounter; 834 th->th_offset_count = ncount; 835 s_update = 1; 836 } 837 838 /*- 839 * Recalculate the scaling factor. We want the number of 1/2^64 840 * fractions of a second per period of the hardware counter, taking 841 * into account the th_adjustment factor which the NTP PLL/adjtime(2) 842 * processing provides us with. 843 * 844 * The th_adjustment is nanoseconds per second with 32 bit binary 845 * fraction and we want 64 bit binary fraction of second: 846 * 847 * x = a * 2^32 / 10^9 = a * 4.294967296 848 * 849 * The range of th_adjustment is +/- 5000PPM so inside a 64bit int 850 * we can only multiply by about 850 without overflowing, but that 851 * leaves suitably precise fractions for multiply before divide. 852 * 853 * Divide before multiply with a fraction of 2199/512 results in a 854 * systematic undercompensation of 10PPM of th_adjustment. On a 855 * 5000PPM adjustment this is a 0.05PPM error. This is acceptable. 856 * 857 * We happily sacrifice the lowest of the 64 bits of our result 858 * to the goddess of code clarity. 859 * 860 */ 861 if (s_update) { 862 scale = (uint64_t)1 << 63; 863 scale += (th->th_adjustment / 1024) * 2199; 864 scale /= th->th_counter->tc_frequency; 865 th->th_scale = scale * 2; 866 } 867 /* 868 * Now that the struct timehands is again consistent, set the new 869 * generation number, making sure to not make it zero. Ensure 870 * changes are globally visible before changing. 871 */ 872 if (++ogen == 0) 873 ogen = 1; 874 membar_producer(); 875 th->th_generation = ogen; 876 877 /* 878 * Go live with the new struct timehands. Ensure changes are 879 * globally visible before changing. 880 */ 881 time_second = th->th_microtime.tv_sec; 882 time_uptime = th->th_offset.sec; 883 membar_producer(); 884 timehands = th; 885 886 /* 887 * Force users of the old timehand to move on. This is 888 * necessary for MP systems; we need to ensure that the 889 * consumers will move away from the old timehand before 890 * we begin updating it again when we eventually wrap 891 * around. 892 */ 893 if (++tho->th_generation == 0) 894 tho->th_generation = 1; 895 } 896 897 /* 898 * RFC 2783 PPS-API implementation. 899 */ 900 901 int 902 pps_ioctl(u_long cmd, void *data, struct pps_state *pps) 903 { 904 pps_params_t *app; 905 pps_info_t *pipi; 906 #ifdef PPS_SYNC 907 int *epi; 908 #endif 909 910 KASSERT(mutex_owned(&timecounter_lock)); 911 912 KASSERT(pps != NULL); 913 914 switch (cmd) { 915 case PPS_IOC_CREATE: 916 return 0; 917 case PPS_IOC_DESTROY: 918 return 0; 919 case PPS_IOC_SETPARAMS: 920 app = (pps_params_t *)data; 921 if (app->mode & ~pps->ppscap) 922 return EINVAL; 923 pps->ppsparam = *app; 924 return 0; 925 case PPS_IOC_GETPARAMS: 926 app = (pps_params_t *)data; 927 *app = pps->ppsparam; 928 app->api_version = PPS_API_VERS_1; 929 return 0; 930 case PPS_IOC_GETCAP: 931 *(int*)data = pps->ppscap; 932 return 0; 933 case PPS_IOC_FETCH: 934 pipi = (pps_info_t *)data; 935 pps->ppsinfo.current_mode = pps->ppsparam.mode; 936 *pipi = pps->ppsinfo; 937 return 0; 938 case PPS_IOC_KCBIND: 939 #ifdef PPS_SYNC 940 epi = (int *)data; 941 /* XXX Only root should be able to do this */ 942 if (*epi & ~pps->ppscap) 943 return EINVAL; 944 pps->kcmode = *epi; 945 return 0; 946 #else 947 return EOPNOTSUPP; 948 #endif 949 default: 950 return EPASSTHROUGH; 951 } 952 } 953 954 void 955 pps_init(struct pps_state *pps) 956 { 957 958 KASSERT(mutex_owned(&timecounter_lock)); 959 960 pps->ppscap |= PPS_TSFMT_TSPEC; 961 if (pps->ppscap & PPS_CAPTUREASSERT) 962 pps->ppscap |= PPS_OFFSETASSERT; 963 if (pps->ppscap & PPS_CAPTURECLEAR) 964 pps->ppscap |= PPS_OFFSETCLEAR; 965 } 966 967 /* 968 * capture a timetamp in the pps structure 969 */ 970 void 971 pps_capture(struct pps_state *pps) 972 { 973 struct timehands *th; 974 975 KASSERT(mutex_owned(&timecounter_lock)); 976 KASSERT(pps != NULL); 977 978 th = timehands; 979 pps->capgen = th->th_generation; 980 pps->capth = th; 981 pps->capcount = (uint64_t)tc_delta(th) + th->th_offset_count; 982 if (pps->capgen != th->th_generation) 983 pps->capgen = 0; 984 } 985 986 #ifdef PPS_DEBUG 987 int ppsdebug = 0; 988 #endif 989 990 /* 991 * process a pps_capture()ed event 992 */ 993 void 994 pps_event(struct pps_state *pps, int event) 995 { 996 pps_ref_event(pps, event, NULL, PPS_REFEVNT_PPS|PPS_REFEVNT_CAPTURE); 997 } 998 999 /* 1000 * extended pps api / kernel pll/fll entry point 1001 * 1002 * feed reference time stamps to PPS engine 1003 * 1004 * will simulate a PPS event and feed 1005 * the NTP PLL/FLL if requested. 1006 * 1007 * the ref time stamps should be roughly once 1008 * a second but do not need to be exactly in phase 1009 * with the UTC second but should be close to it. 1010 * this relaxation of requirements allows callout 1011 * driven timestamping mechanisms to feed to pps 1012 * capture/kernel pll logic. 1013 * 1014 * calling pattern is: 1015 * pps_capture() (for PPS_REFEVNT_{CAPTURE|CAPCUR}) 1016 * read timestamp from reference source 1017 * pps_ref_event() 1018 * 1019 * supported refmodes: 1020 * PPS_REFEVNT_CAPTURE 1021 * use system timestamp of pps_capture() 1022 * PPS_REFEVNT_CURRENT 1023 * use system timestamp of this call 1024 * PPS_REFEVNT_CAPCUR 1025 * use average of read capture and current system time stamp 1026 * PPS_REFEVNT_PPS 1027 * assume timestamp on second mark - ref_ts is ignored 1028 * 1029 */ 1030 1031 void 1032 pps_ref_event(struct pps_state *pps, 1033 int event, 1034 struct bintime *ref_ts, 1035 int refmode 1036 ) 1037 { 1038 struct bintime bt; /* current time */ 1039 struct bintime btd; /* time difference */ 1040 struct bintime bt_ref; /* reference time */ 1041 struct timespec ts, *tsp, *osp; 1042 struct timehands *th; 1043 uint64_t tcount, acount, dcount, *pcount; 1044 int foff, gen; 1045 #ifdef PPS_SYNC 1046 int fhard; 1047 #endif 1048 pps_seq_t *pseq; 1049 1050 KASSERT(mutex_owned(&timecounter_lock)); 1051 1052 KASSERT(pps != NULL); 1053 1054 /* pick up current time stamp if needed */ 1055 if (refmode & (PPS_REFEVNT_CURRENT|PPS_REFEVNT_CAPCUR)) { 1056 /* pick up current time stamp */ 1057 th = timehands; 1058 gen = th->th_generation; 1059 tcount = (uint64_t)tc_delta(th) + th->th_offset_count; 1060 if (gen != th->th_generation) 1061 gen = 0; 1062 1063 /* If the timecounter was wound up underneath us, bail out. */ 1064 if (pps->capgen == 0 || 1065 pps->capgen != pps->capth->th_generation || 1066 gen == 0 || 1067 gen != pps->capgen) { 1068 #ifdef PPS_DEBUG 1069 if (ppsdebug & 0x1) { 1070 log(LOG_DEBUG, 1071 "pps_ref_event(pps=%p, event=%d, ...): DROP (wind-up)\n", 1072 pps, event); 1073 } 1074 #endif 1075 return; 1076 } 1077 } else { 1078 tcount = 0; /* keep GCC happy */ 1079 } 1080 1081 #ifdef PPS_DEBUG 1082 if (ppsdebug & 0x1) { 1083 struct timespec tmsp; 1084 1085 if (ref_ts == NULL) { 1086 tmsp.tv_sec = 0; 1087 tmsp.tv_nsec = 0; 1088 } else { 1089 bintime2timespec(ref_ts, &tmsp); 1090 } 1091 1092 log(LOG_DEBUG, 1093 "pps_ref_event(pps=%p, event=%d, ref_ts=%"PRIi64 1094 ".%09"PRIi32", refmode=0x%1x)\n", 1095 pps, event, tmsp.tv_sec, (int32_t)tmsp.tv_nsec, refmode); 1096 } 1097 #endif 1098 1099 /* setup correct event references */ 1100 if (event == PPS_CAPTUREASSERT) { 1101 tsp = &pps->ppsinfo.assert_timestamp; 1102 osp = &pps->ppsparam.assert_offset; 1103 foff = pps->ppsparam.mode & PPS_OFFSETASSERT; 1104 #ifdef PPS_SYNC 1105 fhard = pps->kcmode & PPS_CAPTUREASSERT; 1106 #endif 1107 pcount = &pps->ppscount[0]; 1108 pseq = &pps->ppsinfo.assert_sequence; 1109 } else { 1110 tsp = &pps->ppsinfo.clear_timestamp; 1111 osp = &pps->ppsparam.clear_offset; 1112 foff = pps->ppsparam.mode & PPS_OFFSETCLEAR; 1113 #ifdef PPS_SYNC 1114 fhard = pps->kcmode & PPS_CAPTURECLEAR; 1115 #endif 1116 pcount = &pps->ppscount[1]; 1117 pseq = &pps->ppsinfo.clear_sequence; 1118 } 1119 1120 /* determine system time stamp according to refmode */ 1121 dcount = 0; /* keep GCC happy */ 1122 switch (refmode & PPS_REFEVNT_RMASK) { 1123 case PPS_REFEVNT_CAPTURE: 1124 acount = pps->capcount; /* use capture timestamp */ 1125 break; 1126 1127 case PPS_REFEVNT_CURRENT: 1128 acount = tcount; /* use current timestamp */ 1129 break; 1130 1131 case PPS_REFEVNT_CAPCUR: 1132 /* 1133 * calculate counter value between pps_capture() and 1134 * pps_ref_event() 1135 */ 1136 dcount = tcount - pps->capcount; 1137 acount = (dcount / 2) + pps->capcount; 1138 break; 1139 1140 default: /* ignore call error silently */ 1141 return; 1142 } 1143 1144 /* 1145 * If the timecounter changed, we cannot compare the count values, so 1146 * we have to drop the rest of the PPS-stuff until the next event. 1147 */ 1148 if (pps->ppstc != pps->capth->th_counter) { 1149 pps->ppstc = pps->capth->th_counter; 1150 pps->capcount = acount; 1151 *pcount = acount; 1152 pps->ppscount[2] = acount; 1153 #ifdef PPS_DEBUG 1154 if (ppsdebug & 0x1) { 1155 log(LOG_DEBUG, 1156 "pps_ref_event(pps=%p, event=%d, ...): DROP (time-counter change)\n", 1157 pps, event); 1158 } 1159 #endif 1160 return; 1161 } 1162 1163 pps->capcount = acount; 1164 1165 /* Convert the count to a bintime. */ 1166 bt = pps->capth->th_offset; 1167 bintime_addx(&bt, pps->capth->th_scale * (acount - pps->capth->th_offset_count)); 1168 bintime_add(&bt, &timebasebin); 1169 1170 if ((refmode & PPS_REFEVNT_PPS) == 0) { 1171 /* determine difference to reference time stamp */ 1172 bt_ref = *ref_ts; 1173 1174 btd = bt; 1175 bintime_sub(&btd, &bt_ref); 1176 1177 /* 1178 * simulate a PPS timestamp by dropping the fraction 1179 * and applying the offset 1180 */ 1181 if (bt.frac >= (uint64_t)1<<63) /* skip to nearest second */ 1182 bt.sec++; 1183 bt.frac = 0; 1184 bintime_add(&bt, &btd); 1185 } else { 1186 /* 1187 * create ref_ts from current time - 1188 * we are supposed to be called on 1189 * the second mark 1190 */ 1191 bt_ref = bt; 1192 if (bt_ref.frac >= (uint64_t)1<<63) /* skip to nearest second */ 1193 bt_ref.sec++; 1194 bt_ref.frac = 0; 1195 } 1196 1197 /* convert bintime to timestamp */ 1198 bintime2timespec(&bt, &ts); 1199 1200 /* If the timecounter was wound up underneath us, bail out. */ 1201 if (pps->capgen != pps->capth->th_generation) 1202 return; 1203 1204 /* store time stamp */ 1205 *pcount = pps->capcount; 1206 (*pseq)++; 1207 *tsp = ts; 1208 1209 /* add offset correction */ 1210 if (foff) { 1211 timespecadd(tsp, osp, tsp); 1212 if (tsp->tv_nsec < 0) { 1213 tsp->tv_nsec += 1000000000; 1214 tsp->tv_sec -= 1; 1215 } 1216 } 1217 1218 #ifdef PPS_DEBUG 1219 if (ppsdebug & 0x2) { 1220 struct timespec ts2; 1221 struct timespec ts3; 1222 1223 bintime2timespec(&bt_ref, &ts2); 1224 1225 bt.sec = 0; 1226 bt.frac = 0; 1227 1228 if (refmode & PPS_REFEVNT_CAPCUR) { 1229 bintime_addx(&bt, pps->capth->th_scale * dcount); 1230 } 1231 bintime2timespec(&bt, &ts3); 1232 1233 log(LOG_DEBUG, "ref_ts=%"PRIi64".%09"PRIi32 1234 ", ts=%"PRIi64".%09"PRIi32", read latency=%"PRIi64" ns\n", 1235 ts2.tv_sec, (int32_t)ts2.tv_nsec, 1236 tsp->tv_sec, (int32_t)tsp->tv_nsec, 1237 timespec2ns(&ts3)); 1238 } 1239 #endif 1240 1241 #ifdef PPS_SYNC 1242 if (fhard) { 1243 uint64_t scale; 1244 uint64_t div; 1245 1246 /* 1247 * Feed the NTP PLL/FLL. 1248 * The FLL wants to know how many (hardware) nanoseconds 1249 * elapsed since the previous event (mod 1 second) thus 1250 * we are actually looking at the frequency difference scaled 1251 * in nsec. 1252 * As the counter time stamps are not truly at 1Hz 1253 * we need to scale the count by the elapsed 1254 * reference time. 1255 * valid sampling interval: [0.5..2[ sec 1256 */ 1257 1258 /* calculate elapsed raw count */ 1259 tcount = pps->capcount - pps->ppscount[2]; 1260 pps->ppscount[2] = pps->capcount; 1261 tcount &= pps->capth->th_counter->tc_counter_mask; 1262 1263 /* calculate elapsed ref time */ 1264 btd = bt_ref; 1265 bintime_sub(&btd, &pps->ref_time); 1266 pps->ref_time = bt_ref; 1267 1268 /* check that we stay below 2 sec */ 1269 if (btd.sec < 0 || btd.sec > 1) 1270 return; 1271 1272 /* we want at least 0.5 sec between samples */ 1273 if (btd.sec == 0 && btd.frac < (uint64_t)1<<63) 1274 return; 1275 1276 /* 1277 * calculate cycles per period by multiplying 1278 * the frequency with the elapsed period 1279 * we pick a fraction of 30 bits 1280 * ~1ns resolution for elapsed time 1281 */ 1282 div = (uint64_t)btd.sec << 30; 1283 div |= (btd.frac >> 34) & (((uint64_t)1 << 30) - 1); 1284 div *= pps->capth->th_counter->tc_frequency; 1285 div >>= 30; 1286 1287 if (div == 0) /* safeguard */ 1288 return; 1289 1290 scale = (uint64_t)1 << 63; 1291 scale /= div; 1292 scale *= 2; 1293 1294 bt.sec = 0; 1295 bt.frac = 0; 1296 bintime_addx(&bt, scale * tcount); 1297 bintime2timespec(&bt, &ts); 1298 1299 #ifdef PPS_DEBUG 1300 if (ppsdebug & 0x4) { 1301 struct timespec ts2; 1302 int64_t df; 1303 1304 bintime2timespec(&bt_ref, &ts2); 1305 df = timespec2ns(&ts); 1306 if (df > 500000000) 1307 df -= 1000000000; 1308 log(LOG_DEBUG, "hardpps: ref_ts=%"PRIi64 1309 ".%09"PRIi32", ts=%"PRIi64".%09"PRIi32 1310 ", freqdiff=%"PRIi64" ns/s\n", 1311 ts2.tv_sec, (int32_t)ts2.tv_nsec, 1312 tsp->tv_sec, (int32_t)tsp->tv_nsec, 1313 df); 1314 } 1315 #endif 1316 1317 hardpps(tsp, timespec2ns(&ts)); 1318 } 1319 #endif 1320 } 1321 1322 /* 1323 * Timecounters need to be updated every so often to prevent the hardware 1324 * counter from overflowing. Updating also recalculates the cached values 1325 * used by the get*() family of functions, so their precision depends on 1326 * the update frequency. 1327 */ 1328 1329 static int tc_tick; 1330 1331 void 1332 tc_ticktock(void) 1333 { 1334 static int count; 1335 1336 if (++count < tc_tick) 1337 return; 1338 count = 0; 1339 mutex_spin_enter(&timecounter_lock); 1340 if (__predict_false(timecounter_bad != 0)) { 1341 /* An existing timecounter has gone bad, pick a new one. */ 1342 (void)atomic_swap_uint(&timecounter_bad, 0); 1343 if (timecounter->tc_quality < 0) { 1344 tc_pick(); 1345 } 1346 } 1347 tc_windup(); 1348 mutex_spin_exit(&timecounter_lock); 1349 } 1350 1351 void 1352 inittimecounter(void) 1353 { 1354 u_int p; 1355 1356 mutex_init(&timecounter_lock, MUTEX_DEFAULT, IPL_HIGH); 1357 1358 /* 1359 * Set the initial timeout to 1360 * max(1, <approx. number of hardclock ticks in a millisecond>). 1361 * People should probably not use the sysctl to set the timeout 1362 * to smaller than its initial value, since that value is the 1363 * smallest reasonable one. If they want better timestamps they 1364 * should use the non-"get"* functions. 1365 */ 1366 if (hz > 1000) 1367 tc_tick = (hz + 500) / 1000; 1368 else 1369 tc_tick = 1; 1370 p = (tc_tick * 1000000) / hz; 1371 aprint_verbose("timecounter: Timecounters tick every %d.%03u msec\n", 1372 p / 1000, p % 1000); 1373 1374 /* warm up new timecounter (again) and get rolling. */ 1375 (void)timecounter->tc_get_timecount(timecounter); 1376 (void)timecounter->tc_get_timecount(timecounter); 1377 } 1378