/* $NetBSD: kern_tc.c,v 1.52 2019/10/06 15:11:17 uwe Exp $ */

/*-
 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
 * ---------------------------------------------------------------------------
 */

#include <sys/cdefs.h>
/* __FBSDID("$FreeBSD: src/sys/kern/kern_tc.c,v 1.166 2005/09/19 22:16:31 andre Exp $"); */
__KERNEL_RCSID(0, "$NetBSD: kern_tc.c,v 1.52 2019/10/06 15:11:17 uwe Exp $");

#ifdef _KERNEL_OPT
#include "opt_ntp.h"
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/reboot.h>	/* XXX just to get AB_VERBOSE */
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/timepps.h>
#include <sys/timetc.h>
#include <sys/timex.h>
#include <sys/evcnt.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/xcall.h>

/*
 * A large step happens on boot.  This constant detects such steps.
 * It is relatively small so that ntp_update_second gets called enough
 * in the typical 'missed a couple of seconds' case, but doesn't loop
 * forever when the time step is large.
 */
#define LARGE_STEP	200

/*
 * Implement a dummy timecounter which we can use until we get a real one
 * in the air.  This allows the console and other early stuff to use
 * time services.
 */

static u_int
dummy_get_timecount(struct timecounter *tc)
{
	static u_int now;

	return (++now);
}

static struct timecounter dummy_timecounter = {
	.tc_get_timecount	= dummy_get_timecount,
	.tc_counter_mask	= ~0u,
	.tc_frequency		= 1000000,
	.tc_name		= "dummy",
	.tc_quality		= -1000000,
	.tc_priv		= NULL,
};

struct timehands {
	/* These fields must be initialized by the driver. */
	struct timecounter *th_counter;		/* active timecounter */
	int64_t th_adjustment;			/* frequency adjustment */
						/* (NTP/adjtime) */
	u_int64_t th_scale;			/* scale factor (counter */
						/* tick->time) */
	u_int64_t th_offset_count;		/* offset at last time */
						/* update (tc_windup()) */
	struct bintime th_offset;		/* bin (up)time at windup */
	struct timeval th_microtime;		/* cached microtime */
	struct timespec th_nanotime;		/* cached nanotime */
	/* Fields not to be copied in tc_windup start with th_generation. */
	volatile u_int th_generation;		/* current generation */
	struct timehands *th_next;		/* next timehand */
};

static struct timehands th0;
static struct timehands th9 = { .th_next = &th0, };
static struct timehands th8 = { .th_next = &th9, };
static struct timehands th7 = { .th_next = &th8, };
static struct timehands th6 = { .th_next = &th7, };
static struct timehands th5 = { .th_next = &th6, };
static struct timehands th4 = { .th_next = &th5, };
static struct timehands th3 = { .th_next = &th4, };
static struct timehands th2 = { .th_next = &th3, };
static struct timehands th1 = { .th_next = &th2, };
static struct timehands th0 = {
	.th_counter = &dummy_timecounter,
	.th_scale = (uint64_t)-1 / 1000000,
	.th_offset = { .sec = 1, .frac = 0 },
	.th_generation = 1,
	.th_next = &th1,
};

static struct timehands *volatile timehands = &th0;
struct timecounter *timecounter = &dummy_timecounter;
static struct timecounter *timecounters = &dummy_timecounter;
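
/*
 * The ten timehands above form a static ring.  tc_windup() always
 * rewrites the element following the currently published one, so
 * readers keep seeing a consistent snapshot while the next one is
 * being prepared.  th_generation is set to zero for the duration of
 * the rewrite and to a fresh non-zero value afterwards; the read
 * loops below retry whenever they observe zero or a changed value.
 */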

volatile time_t time_second __cacheline_aligned = 1;
volatile time_t time_uptime __cacheline_aligned = 1;

static struct bintime timebasebin;

static int timestepwarnings;

kmutex_t timecounter_lock;
static u_int timecounter_mods;
static volatile int timecounter_removals = 1;
static u_int timecounter_bad;

#ifdef __FreeBSD__
SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW,
    &timestepwarnings, 0, "");
#endif /* __FreeBSD__ */

/*
 * sysctl helper routine for kern.timecounter.hardware
 */
static int
sysctl_kern_timecounter_hardware(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int error;
	char newname[MAX_TCNAMELEN];
	struct timecounter *newtc, *tc;

	tc = timecounter;

	strlcpy(newname, tc->tc_name, sizeof(newname));

	node = *rnode;
	node.sysctl_data = newname;
	node.sysctl_size = sizeof(newname);

	error = sysctl_lookup(SYSCTLFN_CALL(&node));

	if (error ||
	    newp == NULL ||
	    strncmp(newname, tc->tc_name, sizeof(newname)) == 0)
		return error;

	if (l != NULL && (error = kauth_authorize_system(l->l_cred,
	    KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_TIMECOUNTERS, newname,
	    NULL, NULL)) != 0)
		return (error);

	if (!cold)
		mutex_spin_enter(&timecounter_lock);
	error = EINVAL;
	for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) {
		if (strcmp(newname, newtc->tc_name) != 0)
			continue;
		/* Warm up new timecounter. */
		(void)newtc->tc_get_timecount(newtc);
		(void)newtc->tc_get_timecount(newtc);
		timecounter = newtc;
		error = 0;
		break;
	}
	if (!cold)
		mutex_spin_exit(&timecounter_lock);
	return error;
}
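
/*
 * From userland the same information is reachable through sysctl(8),
 * e.g. (counter names and the available set are machine dependent):
 *
 *	$ sysctl kern.timecounter.choice
 *	$ sysctl -w kern.timecounter.hardware=TSC
 *
 * The nodes themselves are created in sysctl_timecounter_setup() below.
 */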

static int
sysctl_kern_timecounter_choice(SYSCTLFN_ARGS)
{
	char buf[MAX_TCNAMELEN+48];
	char *where;
	const char *spc;
	struct timecounter *tc;
	size_t needed, left, slen;
	int error, mods;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	mutex_spin_enter(&timecounter_lock);
 retry:
	spc = "";
	error = 0;
	needed = 0;
	left = *oldlenp;
	where = oldp;
	for (tc = timecounters; error == 0 && tc != NULL; tc = tc->tc_next) {
		if (where == NULL) {
			needed += sizeof(buf);	/* be conservative */
		} else {
			slen = snprintf(buf, sizeof(buf), "%s%s(q=%d, f=%" PRId64
			    " Hz)", spc, tc->tc_name, tc->tc_quality,
			    tc->tc_frequency);
			if (left < slen + 1)
				break;
			mods = timecounter_mods;
			mutex_spin_exit(&timecounter_lock);
			error = copyout(buf, where, slen + 1);
			mutex_spin_enter(&timecounter_lock);
			if (mods != timecounter_mods) {
				goto retry;
			}
			spc = " ";
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	mutex_spin_exit(&timecounter_lock);

	*oldlenp = needed;
	return (error);
}

SYSCTL_SETUP(sysctl_timecounter_setup, "sysctl timecounter setup")
{
	const struct sysctlnode *node;

	sysctl_createv(clog, 0, NULL, &node,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "timecounter",
	    SYSCTL_DESCR("time counter information"),
	    NULL, 0, NULL, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node != NULL) {
		sysctl_createv(clog, 0, NULL, NULL,
		    CTLFLAG_PERMANENT,
		    CTLTYPE_STRING, "choice",
		    SYSCTL_DESCR("available counters"),
		    sysctl_kern_timecounter_choice, 0, NULL, 0,
		    CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);

		sysctl_createv(clog, 0, NULL, NULL,
		    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		    CTLTYPE_STRING, "hardware",
		    SYSCTL_DESCR("currently active time counter"),
		    sysctl_kern_timecounter_hardware, 0, NULL, MAX_TCNAMELEN,
		    CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);

		sysctl_createv(clog, 0, NULL, NULL,
		    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		    CTLTYPE_INT, "timestepwarnings",
		    SYSCTL_DESCR("log time steps"),
		    NULL, 0, &timestepwarnings, 0,
		    CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);
	}
}

#ifdef TC_COUNTERS
#define TC_STATS(name)							\
static struct evcnt n##name =						\
    EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "timecounter", #name);	\
EVCNT_ATTACH_STATIC(n##name)
TC_STATS(binuptime);    TC_STATS(nanouptime);    TC_STATS(microuptime);
TC_STATS(bintime);      TC_STATS(nanotime);      TC_STATS(microtime);
TC_STATS(getbinuptime); TC_STATS(getnanouptime); TC_STATS(getmicrouptime);
TC_STATS(getbintime);   TC_STATS(getnanotime);   TC_STATS(getmicrotime);
TC_STATS(setclock);
#define	TC_COUNT(var)	var.ev_count++
#undef TC_STATS
#else
#define	TC_COUNT(var)	/* nothing */
#endif	/* TC_COUNTERS */

static void tc_windup(void);

/*
 * Return the difference between the timehands' counter value now and what
 * was when we copied it to the timehands' offset_count.
 */
static inline u_int
tc_delta(struct timehands *th)
{
	struct timecounter *tc;

	tc = th->th_counter;
	return ((tc->tc_get_timecount(tc) -
		 th->th_offset_count) & tc->tc_counter_mask);
}
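
/*
 * The unsigned subtraction and the mask make the delta immune to counter
 * wrap-around.  For example, with a 24-bit counter (tc_counter_mask ==
 * 0xffffff), an offset count of 0xfffff0 and a current reading of 0x000010
 * yield (0x000010 - 0xfffff0) & 0xffffff == 0x20, i.e. 32 ticks, as long
 * as the counter has wrapped at most once since the last tc_windup().
 */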

/*
 * Functions for reading the time.  We have to loop until we are sure that
 * the timehands that we operated on was not updated under our feet.  See
 * the comment in <sys/timevar.h> for a description of these 12 functions.
 */

void
binuptime(struct bintime *bt)
{
	struct timehands *th;
	lwp_t *l;
	u_int lgen, gen;

	TC_COUNT(nbinuptime);

	/*
	 * Provide exclusion against tc_detach().
	 *
	 * We record the number of timecounter removals before accessing
	 * timecounter state.  Note that the LWP can be using multiple
	 * "generations" at once, due to interrupts (interrupted while in
	 * this function).  Hardware interrupts will borrow the interrupted
	 * LWP's l_tcgen value for this purpose, and can themselves be
	 * interrupted by higher priority interrupts.  In this case we need
	 * to ensure that the oldest generation in use is recorded.
	 *
	 * splsched() is too expensive to use, so we take care to structure
	 * this code in such a way that it is not required.  Likewise, we
	 * do not disable preemption.
	 *
	 * Memory barriers are also too expensive to use for such a
	 * performance critical function.  The good news is that we do not
	 * need memory barriers for this type of exclusion, as the thread
	 * updating timecounter_removals will issue a broadcast cross call
	 * before inspecting our l_tcgen value (this elides memory ordering
	 * issues).
	 */
	l = curlwp;
	lgen = l->l_tcgen;
	if (__predict_true(lgen == 0)) {
		l->l_tcgen = timecounter_removals;
	}
	__insn_barrier();

	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
		bintime_addx(bt, th->th_scale * tc_delta(th));
	} while (gen == 0 || gen != th->th_generation);

	__insn_barrier();
	l->l_tcgen = lgen;
}
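
/*
 * The retry loop above is the reader half of the generation protocol:
 * tc_windup() zeroes th_generation before modifying a timehands and
 * stores a new non-zero value (after a membar_producer()) once it is
 * done.  A reader therefore repeats the computation whenever it saw a
 * zero generation or a generation that changed underneath it, so the
 * values it used all came from one consistent update.  The same pattern
 * is used by all of the reading functions below.
 */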

void
nanouptime(struct timespec *tsp)
{
	struct bintime bt;

	TC_COUNT(nnanouptime);
	binuptime(&bt);
	bintime2timespec(&bt, tsp);
}

void
microuptime(struct timeval *tvp)
{
	struct bintime bt;

	TC_COUNT(nmicrouptime);
	binuptime(&bt);
	bintime2timeval(&bt, tvp);
}

void
bintime(struct bintime *bt)
{

	TC_COUNT(nbintime);
	binuptime(bt);
	bintime_add(bt, &timebasebin);
}

void
nanotime(struct timespec *tsp)
{
	struct bintime bt;

	TC_COUNT(nnanotime);
	bintime(&bt);
	bintime2timespec(&bt, tsp);
}

void
microtime(struct timeval *tvp)
{
	struct bintime bt;

	TC_COUNT(nmicrotime);
	bintime(&bt);
	bintime2timeval(&bt, tvp);
}

void
getbinuptime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetbinuptime);
	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
	} while (gen == 0 || gen != th->th_generation);
}

void
getnanouptime(struct timespec *tsp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetnanouptime);
	do {
		th = timehands;
		gen = th->th_generation;
		bintime2timespec(&th->th_offset, tsp);
	} while (gen == 0 || gen != th->th_generation);
}

void
getmicrouptime(struct timeval *tvp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetmicrouptime);
	do {
		th = timehands;
		gen = th->th_generation;
		bintime2timeval(&th->th_offset, tvp);
	} while (gen == 0 || gen != th->th_generation);
}

void
getbintime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetbintime);
	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
	} while (gen == 0 || gen != th->th_generation);
	bintime_add(bt, &timebasebin);
}

static inline void
dogetnanotime(struct timespec *tsp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetnanotime);
	do {
		th = timehands;
		gen = th->th_generation;
		*tsp = th->th_nanotime;
	} while (gen == 0 || gen != th->th_generation);
}

void
getnanotime(struct timespec *tsp)
{

	dogetnanotime(tsp);
}

void dtrace_getnanotime(struct timespec *tsp);

void
dtrace_getnanotime(struct timespec *tsp)
{

	dogetnanotime(tsp);
}

void
getmicrotime(struct timeval *tvp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetmicrotime);
	do {
		th = timehands;
		gen = th->th_generation;
		*tvp = th->th_microtime;
	} while (gen == 0 || gen != th->th_generation);
}

/*
 * Initialize a new timecounter and possibly use it.
 */
void
tc_init(struct timecounter *tc)
{
	u_int u;

	u = tc->tc_frequency / tc->tc_counter_mask;
	/* XXX: We need some margin here, 10% is a guess */
	u *= 11;
	u /= 10;
	if (u > hz && tc->tc_quality >= 0) {
		tc->tc_quality = -2000;
		aprint_verbose(
		    "timecounter: Timecounter \"%s\" frequency %ju Hz",
		    tc->tc_name, (uintmax_t)tc->tc_frequency);
		aprint_verbose(" -- Insufficient hz, needs at least %u\n", u);
	} else if (tc->tc_quality >= 0 || bootverbose) {
		aprint_verbose(
		    "timecounter: Timecounter \"%s\" frequency %ju Hz "
		    "quality %d\n", tc->tc_name, (uintmax_t)tc->tc_frequency,
		    tc->tc_quality);
	}

	mutex_spin_enter(&timecounter_lock);
	tc->tc_next = timecounters;
	timecounters = tc;
	timecounter_mods++;
	/*
	 * Never automatically use a timecounter with negative quality.
	 * Even though we run on the dummy counter, switching here may be
	 * worse since this timecounter may not be monotonic.
	 */
	if (tc->tc_quality >= 0 && (tc->tc_quality > timecounter->tc_quality ||
	    (tc->tc_quality == timecounter->tc_quality &&
	    tc->tc_frequency > timecounter->tc_frequency))) {
		(void)tc->tc_get_timecount(tc);
		(void)tc->tc_get_timecount(tc);
		timecounter = tc;
		tc_windup();
	}
	mutex_spin_exit(&timecounter_lock);
}
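
/*
 * A clock driver typically fills in a static struct timecounter and hands
 * it to tc_init() from its attach routine.  A minimal sketch (hypothetical
 * "foo" device; everything except the timecounter fields and tc_init() is
 * illustrative only):
 *
 *	static u_int
 *	foo_get_timecount(struct timecounter *tc)
 *	{
 *		struct foo_softc *sc = tc->tc_priv;
 *
 *		return bus_space_read_4(sc->sc_bst, sc->sc_bsh, FOO_COUNT);
 *	}
 *
 *	static struct timecounter foo_timecounter = {
 *		.tc_get_timecount = foo_get_timecount,
 *		.tc_counter_mask  = 0xffffffff,
 *		.tc_name          = "foo",
 *		.tc_quality       = 100,
 *	};
 *
 *	foo_timecounter.tc_frequency = sc->sc_freq;
 *	foo_timecounter.tc_priv = sc;
 *	tc_init(&foo_timecounter);
 */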

/*
 * Pick a new timecounter due to the existing counter going bad.
 */
static void
tc_pick(void)
{
	struct timecounter *best, *tc;

	KASSERT(mutex_owned(&timecounter_lock));

	for (best = tc = timecounters; tc != NULL; tc = tc->tc_next) {
		if (tc->tc_quality > best->tc_quality)
			best = tc;
		else if (tc->tc_quality < best->tc_quality)
			continue;
		else if (tc->tc_frequency > best->tc_frequency)
			best = tc;
	}
	(void)best->tc_get_timecount(best);
	(void)best->tc_get_timecount(best);
	timecounter = best;
}

/*
 * A timecounter has gone bad, arrange to pick a new one at the next
 * clock tick.
 */
void
tc_gonebad(struct timecounter *tc)
{

	tc->tc_quality = -100;
	membar_producer();
	atomic_inc_uint(&timecounter_bad);
}

/*
 * Stop using a timecounter and remove it from the timecounters list.
 */
int
tc_detach(struct timecounter *target)
{
	struct timecounter *tc;
	struct timecounter **tcp = NULL;
	int removals;
	lwp_t *l;

	/* First, find the timecounter. */
	mutex_spin_enter(&timecounter_lock);
	for (tcp = &timecounters, tc = timecounters;
	     tc != NULL;
	     tcp = &tc->tc_next, tc = tc->tc_next) {
		if (tc == target)
			break;
	}
	if (tc == NULL) {
		mutex_spin_exit(&timecounter_lock);
		return ESRCH;
	}

	/* And now, remove it. */
	*tcp = tc->tc_next;
	if (timecounter == target) {
		tc_pick();
		tc_windup();
	}
	timecounter_mods++;
	removals = timecounter_removals++;
	mutex_spin_exit(&timecounter_lock);

	/*
	 * We now have to determine if any threads in the system are still
	 * making use of this timecounter.
	 *
	 * We issue a broadcast cross call to elide memory ordering issues,
	 * then scan all LWPs in the system looking at each one's timecounter
	 * generation number.  We need to see a value of zero (not actively
	 * using a timecounter) or a value greater than our removal value.
	 *
	 * We may race with threads that read `timecounter_removals' and
	 * then get preempted before updating `l_tcgen'.  This is not a
	 * problem, since it means that these threads have not yet started
	 * accessing timecounter state.  All we need is one clean snapshot
	 * of the system where every thread appears not to be using old
	 * timecounter state.
	 */
	for (;;) {
		xc_barrier(0);

		mutex_enter(proc_lock);
		LIST_FOREACH(l, &alllwp, l_list) {
			if (l->l_tcgen == 0 || l->l_tcgen > removals) {
				/*
				 * Not using timecounter or old timecounter
				 * state at time of our xcall or later.
				 */
				continue;
			}
			break;
		}
		mutex_exit(proc_lock);

		/*
		 * If the timecounter is still in use, wait at least 10ms
		 * before retrying.
		 */
		if (l == NULL) {
			return 0;
		}
		(void)kpause("tcdetach", false, mstohz(10), NULL);
	}
}

/* Report the frequency of the current timecounter. */
u_int64_t
tc_getfrequency(void)
{

	return (timehands->th_counter->tc_frequency);
}

/*
 * Step our concept of UTC.  This is done by modifying our estimate of
 * when we booted.
 */
void
tc_setclock(const struct timespec *ts)
{
	struct timespec ts2;
	struct bintime bt, bt2;

	mutex_spin_enter(&timecounter_lock);
	TC_COUNT(nsetclock);
	binuptime(&bt2);
	timespec2bintime(ts, &bt);
	bintime_sub(&bt, &bt2);
	bintime_add(&bt2, &timebasebin);
	timebasebin = bt;
	tc_windup();
	mutex_spin_exit(&timecounter_lock);

	if (timestepwarnings) {
		bintime2timespec(&bt2, &ts2);
		log(LOG_INFO,
		    "Time stepped from %lld.%09ld to %lld.%09ld\n",
		    (long long)ts2.tv_sec, ts2.tv_nsec,
		    (long long)ts->tv_sec, ts->tv_nsec);
	}
}
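
/*
 * In other words, timebasebin always holds "UTC at boot": the new base is
 * the requested UTC time minus the current uptime.  For example, if the
 * system has been up for 100 seconds and the clock is stepped to
 * 1000000000, timebasebin.sec becomes 999999900, and every subsequent
 * bintime()/nanotime() call adds that base to the uptime.
 */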

/*
 * Initialize the next struct timehands in the ring and make
 * it the active timehands.  Along the way we might switch to a different
 * timecounter and/or do seconds processing in NTP.  Slightly magic.
 */
static void
tc_windup(void)
{
	struct bintime bt;
	struct timehands *th, *tho;
	u_int64_t scale;
	u_int delta, ncount, ogen;
	int i, s_update;
	time_t t;

	KASSERT(mutex_owned(&timecounter_lock));

	s_update = 0;

	/*
	 * Make the next timehands a copy of the current one, but do not
	 * overwrite the generation or next pointer.  While we update
	 * the contents, the generation must be zero.  Ensure global
	 * visibility of the generation before proceeding.
	 */
	tho = timehands;
	th = tho->th_next;
	ogen = th->th_generation;
	th->th_generation = 0;
	membar_producer();
	bcopy(tho, th, offsetof(struct timehands, th_generation));

	/*
	 * Capture a timecounter delta on the current timecounter and if
	 * changing timecounters, a counter value from the new timecounter.
	 * Update the offset fields accordingly.
	 */
	delta = tc_delta(th);
	if (th->th_counter != timecounter)
		ncount = timecounter->tc_get_timecount(timecounter);
	else
		ncount = 0;
	th->th_offset_count += delta;
	bintime_addx(&th->th_offset, th->th_scale * delta);

	/*
	 * Hardware latching timecounters may not generate interrupts on
	 * PPS events, so instead we poll them.  There is a finite risk that
	 * the hardware might capture a count which is later than the one we
	 * got above, and therefore possibly in the next NTP second which might
	 * have a different rate than the current NTP second.  It doesn't
	 * matter in practice.
	 */
	if (tho->th_counter->tc_poll_pps)
		tho->th_counter->tc_poll_pps(tho->th_counter);

	/*
	 * Deal with NTP second processing.  The for loop normally
	 * iterates at most once, but in extreme situations it might
	 * keep NTP sane if timeouts are not run for several seconds.
	 * At boot, the time step can be large when the TOD hardware
	 * has been read, so on really large steps, we call
	 * ntp_update_second only twice.  We need to call it twice in
	 * case we missed a leap second.
	 * If NTP is not compiled in ntp_update_second still calculates
	 * the adjustment resulting from adjtime() calls.
	 */
	bt = th->th_offset;
	bintime_add(&bt, &timebasebin);
	i = bt.sec - tho->th_microtime.tv_sec;
	if (i > LARGE_STEP)
		i = 2;
	for (; i > 0; i--) {
		t = bt.sec;
		ntp_update_second(&th->th_adjustment, &bt.sec);
		s_update = 1;
		if (bt.sec != t)
			timebasebin.sec += bt.sec - t;
	}

	/* Update the UTC timestamps used by the get*() functions. */
	/* XXX shouldn't do this here.  Should force non-`get' versions. */
	bintime2timeval(&bt, &th->th_microtime);
	bintime2timespec(&bt, &th->th_nanotime);
	/* Now is a good time to change timecounters. */
	if (th->th_counter != timecounter) {
		th->th_counter = timecounter;
		th->th_offset_count = ncount;
		s_update = 1;
	}

	/*-
	 * Recalculate the scaling factor.  We want the number of 1/2^64
	 * fractions of a second per period of the hardware counter, taking
	 * into account the th_adjustment factor which the NTP PLL/adjtime(2)
	 * processing provides us with.
	 *
	 * The th_adjustment is nanoseconds per second with 32 bit binary
	 * fraction and we want 64 bit binary fraction of second:
	 *
	 *	 x = a * 2^32 / 10^9 = a * 4.294967296
	 *
	 * The range of th_adjustment is +/- 5000PPM so inside a 64bit int
	 * we can only multiply by about 850 without overflowing, but that
	 * leaves suitably precise fractions for multiply before divide.
	 *
	 * Divide before multiply with a fraction of 2199/512 results in a
	 * systematic undercompensation of 10PPM of th_adjustment.  On a
	 * 5000PPM adjustment this is a 0.05PPM error.  This is acceptable.
	 *
	 * We happily sacrifice the lowest of the 64 bits of our result
	 * to the goddess of code clarity.
	 *
	 */
	if (s_update) {
		scale = (u_int64_t)1 << 63;
		scale += (th->th_adjustment / 1024) * 2199;
		scale /= th->th_counter->tc_frequency;
		th->th_scale = scale * 2;
	}
	/*
	 * Now that the struct timehands is again consistent, set the new
	 * generation number, making sure to not make it zero.  Ensure
	 * changes are globally visible before changing.
	 */
	if (++ogen == 0)
		ogen = 1;
	membar_producer();
	th->th_generation = ogen;

	/*
	 * Go live with the new struct timehands.  Ensure changes are
	 * globally visible before changing.
	 */
	time_second = th->th_microtime.tv_sec;
	time_uptime = th->th_offset.sec;
	membar_producer();
	timehands = th;

	/*
	 * Force users of the old timehand to move on.  This is
	 * necessary for MP systems; we need to ensure that the
	 * consumers will move away from the old timehand before
	 * we begin updating it again when we eventually wrap
	 * around.
	 */
	if (++tho->th_generation == 0)
		tho->th_generation = 1;
}
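
/*
 * Numerically, th_scale ends up as roughly
 * (2^64 + th_adjustment * 2^32 / 10^9) / tc_frequency.  For a 1 GHz
 * counter with no NTP adjustment that is 2^64 / 10^9 ~= 18446744074, so
 * each counter tick advances the bintime fraction by about one nanosecond
 * worth of 2^-64 second units, and one second worth of ticks advances it
 * by almost exactly 2^64.
 */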

/*
 * RFC 2783 PPS-API implementation.
 */

int
pps_ioctl(u_long cmd, void *data, struct pps_state *pps)
{
	pps_params_t *app;
	pps_info_t *pipi;
#ifdef PPS_SYNC
	int *epi;
#endif

	KASSERT(mutex_owned(&timecounter_lock));

	KASSERT(pps != NULL);

	switch (cmd) {
	case PPS_IOC_CREATE:
		return (0);
	case PPS_IOC_DESTROY:
		return (0);
	case PPS_IOC_SETPARAMS:
		app = (pps_params_t *)data;
		if (app->mode & ~pps->ppscap)
			return (EINVAL);
		pps->ppsparam = *app;
		return (0);
	case PPS_IOC_GETPARAMS:
		app = (pps_params_t *)data;
		*app = pps->ppsparam;
		app->api_version = PPS_API_VERS_1;
		return (0);
	case PPS_IOC_GETCAP:
		*(int*)data = pps->ppscap;
		return (0);
	case PPS_IOC_FETCH:
		pipi = (pps_info_t *)data;
		pps->ppsinfo.current_mode = pps->ppsparam.mode;
		*pipi = pps->ppsinfo;
		return (0);
	case PPS_IOC_KCBIND:
#ifdef PPS_SYNC
		epi = (int *)data;
		/* XXX Only root should be able to do this */
		if (*epi & ~pps->ppscap)
			return (EINVAL);
		pps->kcmode = *epi;
		return (0);
#else
		return (EOPNOTSUPP);
#endif
	default:
		return (EPASSTHROUGH);
	}
}

void
pps_init(struct pps_state *pps)
{

	KASSERT(mutex_owned(&timecounter_lock));

	pps->ppscap |= PPS_TSFMT_TSPEC;
	if (pps->ppscap & PPS_CAPTUREASSERT)
		pps->ppscap |= PPS_OFFSETASSERT;
	if (pps->ppscap & PPS_CAPTURECLEAR)
		pps->ppscap |= PPS_OFFSETCLEAR;
}

/*
 * capture a timestamp in the pps structure
 */
void
pps_capture(struct pps_state *pps)
{
	struct timehands *th;

	KASSERT(mutex_owned(&timecounter_lock));
	KASSERT(pps != NULL);

	th = timehands;
	pps->capgen = th->th_generation;
	pps->capth = th;
	pps->capcount = (u_int64_t)tc_delta(th) + th->th_offset_count;
	if (pps->capgen != th->th_generation)
		pps->capgen = 0;
}

#ifdef PPS_DEBUG
int ppsdebug = 0;
#endif

/*
 * process a pps_capture()ed event
 */
void
pps_event(struct pps_state *pps, int event)
{
	pps_ref_event(pps, event, NULL, PPS_REFEVNT_PPS|PPS_REFEVNT_CAPTURE);
}
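
/*
 * A serial or GPIO driver that detects PPS pulses would, roughly, call
 * pps_init() once with the capabilities it supports and then run the
 * following from its interrupt handler (a sketch only; names other than
 * the pps_* functions, timecounter_lock and PPS_CAPTUREASSERT are
 * illustrative):
 *
 *	mutex_spin_enter(&timecounter_lock);
 *	pps_capture(&sc->sc_pps_state);
 *	pps_event(&sc->sc_pps_state, PPS_CAPTUREASSERT);
 *	mutex_spin_exit(&timecounter_lock);
 *
 * pps_capture() is cheap and records only the raw counter value; the
 * heavier conversion work happens in pps_event()/pps_ref_event().
 */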

/*
 * extended pps api / kernel pll/fll entry point
 *
 * feed reference time stamps to PPS engine
 *
 * will simulate a PPS event and feed
 * the NTP PLL/FLL if requested.
 *
 * the ref time stamps should be roughly once
 * a second but do not need to be exactly in phase
 * with the UTC second but should be close to it.
 * this relaxation of requirements allows callout
 * driven timestamping mechanisms to feed to pps
 * capture/kernel pll logic.
 *
 * calling pattern is:
 *  pps_capture() (for PPS_REFEVNT_{CAPTURE|CAPCUR})
 *  read timestamp from reference source
 *  pps_ref_event()
 *
 * supported refmodes:
 *	PPS_REFEVNT_CAPTURE
 *		use system timestamp of pps_capture()
 *	PPS_REFEVNT_CURRENT
 *		use system timestamp of this call
 *	PPS_REFEVNT_CAPCUR
 *		use average of read capture and current system time stamp
 *	PPS_REFEVNT_PPS
 *		assume timestamp on second mark - ref_ts is ignored
 *
 */

void
pps_ref_event(struct pps_state *pps,
	      int event,
	      struct bintime *ref_ts,
	      int refmode)
{
	struct bintime bt;	/* current time */
	struct bintime btd;	/* time difference */
	struct bintime bt_ref;	/* reference time */
	struct timespec ts, *tsp, *osp;
	struct timehands *th;
	u_int64_t tcount, acount, dcount, *pcount;
	int foff, gen;
#ifdef PPS_SYNC
	int fhard;
#endif
	pps_seq_t *pseq;

	KASSERT(mutex_owned(&timecounter_lock));

	KASSERT(pps != NULL);

	/* pick up current time stamp if needed */
	if (refmode & (PPS_REFEVNT_CURRENT|PPS_REFEVNT_CAPCUR)) {
		/* pick up current time stamp */
		th = timehands;
		gen = th->th_generation;
		tcount = (u_int64_t)tc_delta(th) + th->th_offset_count;
		if (gen != th->th_generation)
			gen = 0;

		/* If the timecounter was wound up underneath us, bail out. */
		if (pps->capgen == 0 ||
		    pps->capgen != pps->capth->th_generation ||
		    gen == 0 ||
		    gen != pps->capgen) {
#ifdef PPS_DEBUG
			if (ppsdebug & 0x1) {
				log(LOG_DEBUG,
				    "pps_ref_event(pps=%p, event=%d, ...): DROP (wind-up)\n",
				    pps, event);
			}
#endif
			return;
		}
	} else {
		tcount = 0;	/* keep GCC happy */
	}

#ifdef PPS_DEBUG
	if (ppsdebug & 0x1) {
		struct timespec tmsp;

		if (ref_ts == NULL) {
			tmsp.tv_sec = 0;
			tmsp.tv_nsec = 0;
		} else {
			bintime2timespec(ref_ts, &tmsp);
		}

		log(LOG_DEBUG,
		    "pps_ref_event(pps=%p, event=%d, ref_ts=%"PRIi64
		    ".%09"PRIi32", refmode=0x%1x)\n",
		    pps, event, tmsp.tv_sec, (int32_t)tmsp.tv_nsec, refmode);
	}
#endif

	/* setup correct event references */
	if (event == PPS_CAPTUREASSERT) {
		tsp = &pps->ppsinfo.assert_timestamp;
		osp = &pps->ppsparam.assert_offset;
		foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
#ifdef PPS_SYNC
		fhard = pps->kcmode & PPS_CAPTUREASSERT;
#endif
		pcount = &pps->ppscount[0];
		pseq = &pps->ppsinfo.assert_sequence;
	} else {
		tsp = &pps->ppsinfo.clear_timestamp;
		osp = &pps->ppsparam.clear_offset;
		foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
#ifdef PPS_SYNC
		fhard = pps->kcmode & PPS_CAPTURECLEAR;
#endif
		pcount = &pps->ppscount[1];
		pseq = &pps->ppsinfo.clear_sequence;
	}

	/* determine system time stamp according to refmode */
	dcount = 0;	/* keep GCC happy */
	switch (refmode & PPS_REFEVNT_RMASK) {
	case PPS_REFEVNT_CAPTURE:
		acount = pps->capcount;	/* use capture timestamp */
		break;

	case PPS_REFEVNT_CURRENT:
		acount = tcount;	/* use current timestamp */
		break;

	case PPS_REFEVNT_CAPCUR:
		/*
		 * calculate counter value between pps_capture() and
		 * pps_ref_event()
		 */
		dcount = tcount - pps->capcount;
		acount = (dcount / 2) + pps->capcount;
		break;

	default:		/* ignore call error silently */
		return;
	}

	/*
	 * If the timecounter changed, we cannot compare the count values, so
	 * we have to drop the rest of the PPS-stuff until the next event.
	 */
	if (pps->ppstc != pps->capth->th_counter) {
		pps->ppstc = pps->capth->th_counter;
		pps->capcount = acount;
		*pcount = acount;
		pps->ppscount[2] = acount;
#ifdef PPS_DEBUG
		if (ppsdebug & 0x1) {
			log(LOG_DEBUG,
			    "pps_ref_event(pps=%p, event=%d, ...): DROP (time-counter change)\n",
			    pps, event);
		}
#endif
		return;
	}

	pps->capcount = acount;

	/* Convert the count to a bintime. */
	bt = pps->capth->th_offset;
	bintime_addx(&bt, pps->capth->th_scale * (acount - pps->capth->th_offset_count));
	bintime_add(&bt, &timebasebin);

	if ((refmode & PPS_REFEVNT_PPS) == 0) {
		/* determine difference to reference time stamp */
		bt_ref = *ref_ts;

		btd = bt;
		bintime_sub(&btd, &bt_ref);

		/*
		 * simulate a PPS timestamp by dropping the fraction
		 * and applying the offset
		 */
		if (bt.frac >= (uint64_t)1<<63)	/* skip to nearest second */
			bt.sec++;
		bt.frac = 0;
		bintime_add(&bt, &btd);
	} else {
		/*
		 * create ref_ts from current time -
		 * we are supposed to be called on
		 * the second mark
		 */
		bt_ref = bt;
		if (bt_ref.frac >= (uint64_t)1<<63)	/* skip to nearest second */
			bt_ref.sec++;
		bt_ref.frac = 0;
	}

	/* convert bintime to timestamp */
	bintime2timespec(&bt, &ts);

	/* If the timecounter was wound up underneath us, bail out. */
	if (pps->capgen != pps->capth->th_generation)
		return;

	/* store time stamp */
	*pcount = pps->capcount;
	(*pseq)++;
	*tsp = ts;

	/* add offset correction */
	if (foff) {
		timespecadd(tsp, osp, tsp);
		if (tsp->tv_nsec < 0) {
			tsp->tv_nsec += 1000000000;
			tsp->tv_sec -= 1;
		}
	}

#ifdef PPS_DEBUG
	if (ppsdebug & 0x2) {
		struct timespec ts2;
		struct timespec ts3;

		bintime2timespec(&bt_ref, &ts2);

		bt.sec = 0;
		bt.frac = 0;

		if (refmode & PPS_REFEVNT_CAPCUR) {
			bintime_addx(&bt, pps->capth->th_scale * dcount);
		}
		bintime2timespec(&bt, &ts3);

		log(LOG_DEBUG, "ref_ts=%"PRIi64".%09"PRIi32
		    ", ts=%"PRIi64".%09"PRIi32", read latency=%"PRIi64" ns\n",
		    ts2.tv_sec, (int32_t)ts2.tv_nsec,
		    tsp->tv_sec, (int32_t)tsp->tv_nsec,
		    timespec2ns(&ts3));
	}
#endif

#ifdef PPS_SYNC
	if (fhard) {
		uint64_t scale;
		uint64_t div;

		/*
		 * Feed the NTP PLL/FLL.
		 * The FLL wants to know how many (hardware) nanoseconds
		 * elapsed since the previous event (mod 1 second) thus
		 * we are actually looking at the frequency difference scaled
		 * in nsec.
		 * As the counter time stamps are not truly at 1Hz
		 * we need to scale the count by the elapsed
		 * reference time.
		 * valid sampling interval: [0.5..2[ sec
		 */

		/* calculate elapsed raw count */
		tcount = pps->capcount - pps->ppscount[2];
		pps->ppscount[2] = pps->capcount;
		tcount &= pps->capth->th_counter->tc_counter_mask;

		/* calculate elapsed ref time */
		btd = bt_ref;
		bintime_sub(&btd, &pps->ref_time);
		pps->ref_time = bt_ref;

		/* check that we stay below 2 sec */
		if (btd.sec < 0 || btd.sec > 1)
			return;

		/* we want at least 0.5 sec between samples */
		if (btd.sec == 0 && btd.frac < (uint64_t)1<<63)
			return;

		/*
		 * calculate cycles per period by multiplying
		 * the frequency with the elapsed period
		 * we pick a fraction of 30 bits
		 * ~1ns resolution for elapsed time
		 */
		div = (uint64_t)btd.sec << 30;
		div |= (btd.frac >> 34) & (((uint64_t)1 << 30) - 1);
		div *= pps->capth->th_counter->tc_frequency;
		div >>= 30;

		if (div == 0)	/* safeguard */
			return;

		scale = (uint64_t)1 << 63;
		scale /= div;
		scale *= 2;

		bt.sec = 0;
		bt.frac = 0;
		bintime_addx(&bt, scale * tcount);
		bintime2timespec(&bt, &ts);

#ifdef PPS_DEBUG
		if (ppsdebug & 0x4) {
			struct timespec ts2;
			int64_t df;

			bintime2timespec(&bt_ref, &ts2);
			df = timespec2ns(&ts);
			if (df > 500000000)
				df -= 1000000000;
			log(LOG_DEBUG, "hardpps: ref_ts=%"PRIi64
			    ".%09"PRIi32", ts=%"PRIi64".%09"PRIi32
			    ", freqdiff=%"PRIi64" ns/s\n",
			    ts2.tv_sec, (int32_t)ts2.tv_nsec,
			    tsp->tv_sec, (int32_t)tsp->tv_nsec,
			    df);
		}
#endif

		hardpps(tsp, timespec2ns(&ts));
	}
#endif
}

/*
 * Timecounters need to be updated every so often to prevent the hardware
 * counter from overflowing.  Updating also recalculates the cached values
 * used by the get*() family of functions, so their precision depends on
 * the update frequency.
 */

static int tc_tick;

void
tc_ticktock(void)
{
	static int count;

	if (++count < tc_tick)
		return;
	count = 0;
	mutex_spin_enter(&timecounter_lock);
	if (timecounter_bad != 0) {
		/* An existing timecounter has gone bad, pick a new one. */
		(void)atomic_swap_uint(&timecounter_bad, 0);
		if (timecounter->tc_quality < 0) {
			tc_pick();
		}
	}
	tc_windup();
	mutex_spin_exit(&timecounter_lock);
}

void
inittimecounter(void)
{
	u_int p;

	mutex_init(&timecounter_lock, MUTEX_DEFAULT, IPL_HIGH);

	/*
	 * Set the initial timeout to
	 * max(1, <approx. number of hardclock ticks in a millisecond>).
	 * People should probably not use the sysctl to set the timeout
	 * to smaller than its initial value, since that value is the
	 * smallest reasonable one.  If they want better timestamps they
	 * should use the non-"get"* functions.
	 */
	if (hz > 1000)
		tc_tick = (hz + 500) / 1000;
	else
		tc_tick = 1;
	p = (tc_tick * 1000000) / hz;
	aprint_verbose("timecounter: Timecounters tick every %d.%03u msec\n",
	    p / 1000, p % 1000);

	/* warm up new timecounter (again) and get rolling. */
	(void)timecounter->tc_get_timecount(timecounter);
	(void)timecounter->tc_get_timecount(timecounter);
}
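
/*
 * For example, with hz = 100 the calculation above gives tc_tick = 1, so
 * tc_windup() runs from tc_ticktock() on every hardclock tick, i.e. every
 * 10 msec; with hz = 8000 it gives tc_tick = 8 and tc_windup() still runs
 * about once per millisecond.  That interval bounds both how stale the
 * get*() timestamps can be and how much of the hardware counter range may
 * be consumed between updates.
 */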