/* $NetBSD: kern_tc.c,v 1.51 2018/07/01 15:12:06 riastradh Exp $ */

/*-
 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.  Poul-Henning Kamp
 * ---------------------------------------------------------------------------
 */

#include <sys/cdefs.h>
/* __FBSDID("$FreeBSD: src/sys/kern/kern_tc.c,v 1.166 2005/09/19 22:16:31 andre Exp $"); */
__KERNEL_RCSID(0, "$NetBSD: kern_tc.c,v 1.51 2018/07/01 15:12:06 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_ntp.h"
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/reboot.h>	/* XXX just to get AB_VERBOSE */
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/timepps.h>
#include <sys/timetc.h>
#include <sys/timex.h>
#include <sys/evcnt.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/xcall.h>

/*
 * A large step happens on boot.  This constant detects such steps.
 * It is relatively small so that ntp_update_second gets called enough
 * in the typical 'missed a couple of seconds' case, but doesn't loop
 * forever when the time step is large.
 */
#define LARGE_STEP	200

/*
 * Implement a dummy timecounter which we can use until we get a real one
 * in the air.  This allows the console and other early stuff to use
 * time services.
 */

static u_int
dummy_get_timecount(struct timecounter *tc)
{
	static u_int now;

	return (++now);
}

static struct timecounter dummy_timecounter = {
	.tc_get_timecount = dummy_get_timecount,
	.tc_counter_mask = ~0u,
	.tc_frequency = 1000000,
	.tc_name = "dummy",
	.tc_quality = -1000000,
	.tc_priv = NULL,
};

struct timehands {
	/* These fields must be initialized by the driver. */
	struct timecounter	*th_counter;	/* active timecounter */
	int64_t			th_adjustment;	/* frequency adjustment */
						/* (NTP/adjtime) */
	u_int64_t		th_scale;	/* scale factor (counter */
						/* tick->time) */
	u_int64_t		th_offset_count; /* offset at last time */
						/* update (tc_windup()) */
	struct bintime		th_offset;	/* bin (up)time at windup */
	struct timeval		th_microtime;	/* cached microtime */
	struct timespec		th_nanotime;	/* cached nanotime */
	/* Fields not to be copied in tc_windup start with th_generation. */
	volatile u_int		th_generation;	/* current generation */
	struct timehands	*th_next;	/* next timehand */
};

static struct timehands th0;
static struct timehands th9 = { .th_next = &th0, };
static struct timehands th8 = { .th_next = &th9, };
static struct timehands th7 = { .th_next = &th8, };
static struct timehands th6 = { .th_next = &th7, };
static struct timehands th5 = { .th_next = &th6, };
static struct timehands th4 = { .th_next = &th5, };
static struct timehands th3 = { .th_next = &th4, };
static struct timehands th2 = { .th_next = &th3, };
static struct timehands th1 = { .th_next = &th2, };
static struct timehands th0 = {
	.th_counter = &dummy_timecounter,
	.th_scale = (uint64_t)-1 / 1000000,
	.th_offset = { .sec = 1, .frac = 0 },
	.th_generation = 1,
	.th_next = &th1,
};
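
/*
 * Editorial note (not in the original source): th_scale holds the number
 * of 2^-64 fractions of a second per counter tick, roughly 2^64 divided
 * by the counter frequency.  For the 1 MHz dummy counter above that is
 * (uint64_t)-1 / 1000000, about 1.8e13, so one tick contributes roughly
 * one microsecond when accumulated with:
 *
 *	bintime_addx(bt, th->th_scale * tc_delta(th));
 */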

static struct timehands *volatile timehands = &th0;
struct timecounter *timecounter = &dummy_timecounter;
static struct timecounter *timecounters = &dummy_timecounter;

volatile time_t time_second __cacheline_aligned = 1;
volatile time_t time_uptime __cacheline_aligned = 1;

static struct bintime timebasebin;

static int timestepwarnings;

kmutex_t timecounter_lock;
static u_int timecounter_mods;
static volatile int timecounter_removals = 1;
static u_int timecounter_bad;

#ifdef __FreeBSD__
SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW,
    &timestepwarnings, 0, "");
#endif /* __FreeBSD__ */

/*
 * sysctl helper routine for kern.timecounter.hardware
 */
static int
sysctl_kern_timecounter_hardware(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int error;
	char newname[MAX_TCNAMELEN];
	struct timecounter *newtc, *tc;

	tc = timecounter;

	strlcpy(newname, tc->tc_name, sizeof(newname));

	node = *rnode;
	node.sysctl_data = newname;
	node.sysctl_size = sizeof(newname);

	error = sysctl_lookup(SYSCTLFN_CALL(&node));

	if (error ||
	    newp == NULL ||
	    strncmp(newname, tc->tc_name, sizeof(newname)) == 0)
		return error;

	if (l != NULL && (error = kauth_authorize_system(l->l_cred,
	    KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_TIMECOUNTERS, newname,
	    NULL, NULL)) != 0)
		return (error);

	if (!cold)
		mutex_spin_enter(&timecounter_lock);
	error = EINVAL;
	for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) {
		if (strcmp(newname, newtc->tc_name) != 0)
			continue;
		/* Warm up new timecounter. */
		(void)newtc->tc_get_timecount(newtc);
		(void)newtc->tc_get_timecount(newtc);
		timecounter = newtc;
		error = 0;
		break;
	}
	if (!cold)
		mutex_spin_exit(&timecounter_lock);
	return error;
}

static int
sysctl_kern_timecounter_choice(SYSCTLFN_ARGS)
{
	char buf[MAX_TCNAMELEN+48];
	char *where;
	const char *spc;
	struct timecounter *tc;
	size_t needed, left, slen;
	int error, mods;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	mutex_spin_enter(&timecounter_lock);
 retry:
	spc = "";
	error = 0;
	needed = 0;
	left = *oldlenp;
	where = oldp;
	for (tc = timecounters; error == 0 && tc != NULL; tc = tc->tc_next) {
		if (where == NULL) {
			needed += sizeof(buf);	/* be conservative */
		} else {
			slen = snprintf(buf, sizeof(buf), "%s%s(q=%d, f=%" PRId64
			    " Hz)", spc, tc->tc_name, tc->tc_quality,
			    tc->tc_frequency);
			if (left < slen + 1)
				break;
			mods = timecounter_mods;
			mutex_spin_exit(&timecounter_lock);
			error = copyout(buf, where, slen + 1);
			mutex_spin_enter(&timecounter_lock);
			if (mods != timecounter_mods) {
				goto retry;
			}
			spc = " ";
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	mutex_spin_exit(&timecounter_lock);

	*oldlenp = needed;
	return (error);
}

SYSCTL_SETUP(sysctl_timecounter_setup, "sysctl timecounter setup")
{
	const struct sysctlnode *node;

	sysctl_createv(clog, 0, NULL, &node,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "timecounter",
	    SYSCTL_DESCR("time counter information"),
	    NULL, 0, NULL, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node != NULL) {
		sysctl_createv(clog, 0, NULL, NULL,
		    CTLFLAG_PERMANENT,
		    CTLTYPE_STRING, "choice",
		    SYSCTL_DESCR("available counters"),
		    sysctl_kern_timecounter_choice, 0, NULL, 0,
		    CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);

		sysctl_createv(clog, 0, NULL, NULL,
		    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		    CTLTYPE_STRING, "hardware",
		    SYSCTL_DESCR("currently active time counter"),
		    sysctl_kern_timecounter_hardware, 0, NULL, MAX_TCNAMELEN,
		    CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);

		sysctl_createv(clog, 0, NULL, NULL,
		    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		    CTLTYPE_INT, "timestepwarnings",
		    SYSCTL_DESCR("log time steps"),
		    NULL, 0, &timestepwarnings, 0,
		    CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);
	}
}
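
/*
 * Illustrative userland usage of the nodes created above (the commands
 * are an editorial assumption, not part of this file, and the counter
 * name is hypothetical -- valid names are whatever drivers register
 * via tc_init()):
 *
 *	sysctl kern.timecounter.choice			# list counters
 *	sysctl -w kern.timecounter.hardware=hpet0	# select by name
 */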

#ifdef TC_COUNTERS
#define	TC_STATS(name)							\
static struct evcnt n##name =						\
    EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "timecounter", #name);	\
EVCNT_ATTACH_STATIC(n##name)
TC_STATS(binuptime);    TC_STATS(nanouptime);    TC_STATS(microuptime);
TC_STATS(bintime);      TC_STATS(nanotime);      TC_STATS(microtime);
TC_STATS(getbinuptime); TC_STATS(getnanouptime); TC_STATS(getmicrouptime);
TC_STATS(getbintime);   TC_STATS(getnanotime);   TC_STATS(getmicrotime);
TC_STATS(setclock);
#define	TC_COUNT(var)	var.ev_count++
#undef TC_STATS
#else
#define	TC_COUNT(var)	/* nothing */
#endif	/* TC_COUNTERS */

static void tc_windup(void);

/*
 * Return the difference between the timehands' counter value now and what
 * was when we copied it to the timehands' offset_count.
 */
static inline u_int
tc_delta(struct timehands *th)
{
	struct timecounter *tc;

	tc = th->th_counter;
	return ((tc->tc_get_timecount(tc) -
		 th->th_offset_count) & tc->tc_counter_mask);
}

/*
 * Functions for reading the time.  We have to loop until we are sure that
 * the timehands that we operated on was not updated under our feet.  See
 * the comment in <sys/timevar.h> for a description of these 12 functions.
 */

void
binuptime(struct bintime *bt)
{
	struct timehands *th;
	lwp_t *l;
	u_int lgen, gen;

	TC_COUNT(nbinuptime);

	/*
	 * Provide exclusion against tc_detach().
	 *
	 * We record the number of timecounter removals before accessing
	 * timecounter state.  Note that the LWP can be using multiple
	 * "generations" at once, due to interrupts (interrupted while in
	 * this function).  Hardware interrupts will borrow the interrupted
	 * LWP's l_tcgen value for this purpose, and can themselves be
	 * interrupted by higher priority interrupts.  In this case we need
	 * to ensure that the oldest generation in use is recorded.
	 *
	 * splsched() is too expensive to use, so we take care to structure
	 * this code in such a way that it is not required.  Likewise, we
	 * do not disable preemption.
	 *
	 * Memory barriers are also too expensive to use for such a
	 * performance critical function.  The good news is that we do not
	 * need memory barriers for this type of exclusion, as the thread
	 * updating timecounter_removals will issue a broadcast cross call
	 * before inspecting our l_tcgen value (this elides memory ordering
	 * issues).
	 */
	l = curlwp;
	lgen = l->l_tcgen;
	if (__predict_true(lgen == 0)) {
		l->l_tcgen = timecounter_removals;
	}
	__insn_barrier();

	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
		bintime_addx(bt, th->th_scale * tc_delta(th));
	} while (gen == 0 || gen != th->th_generation);

	__insn_barrier();
	l->l_tcgen = lgen;
}

void
nanouptime(struct timespec *tsp)
{
	struct bintime bt;

	TC_COUNT(nnanouptime);
	binuptime(&bt);
	bintime2timespec(&bt, tsp);
}

void
microuptime(struct timeval *tvp)
{
	struct bintime bt;

	TC_COUNT(nmicrouptime);
	binuptime(&bt);
	bintime2timeval(&bt, tvp);
}

void
bintime(struct bintime *bt)
{

	TC_COUNT(nbintime);
	binuptime(bt);
	bintime_add(bt, &timebasebin);
}

void
nanotime(struct timespec *tsp)
{
	struct bintime bt;

	TC_COUNT(nnanotime);
	bintime(&bt);
	bintime2timespec(&bt, tsp);
}

void
microtime(struct timeval *tvp)
{
	struct bintime bt;

	TC_COUNT(nmicrotime);
	bintime(&bt);
	bintime2timeval(&bt, tvp);
}
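
/*
 * Editorial note: the get*() variants below return values cached by the
 * most recent tc_windup() call instead of reading the hardware counter,
 * so they are cheaper but only as fresh as the tc_tick update interval
 * set in inittimecounter() at the end of this file.
 */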

void
getbinuptime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetbinuptime);
	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
	} while (gen == 0 || gen != th->th_generation);
}

void
getnanouptime(struct timespec *tsp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetnanouptime);
	do {
		th = timehands;
		gen = th->th_generation;
		bintime2timespec(&th->th_offset, tsp);
	} while (gen == 0 || gen != th->th_generation);
}

void
getmicrouptime(struct timeval *tvp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetmicrouptime);
	do {
		th = timehands;
		gen = th->th_generation;
		bintime2timeval(&th->th_offset, tvp);
	} while (gen == 0 || gen != th->th_generation);
}

void
getbintime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetbintime);
	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
	} while (gen == 0 || gen != th->th_generation);
	bintime_add(bt, &timebasebin);
}

static inline void
dogetnanotime(struct timespec *tsp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetnanotime);
	do {
		th = timehands;
		gen = th->th_generation;
		*tsp = th->th_nanotime;
	} while (gen == 0 || gen != th->th_generation);
}

void
getnanotime(struct timespec *tsp)
{

	dogetnanotime(tsp);
}

void dtrace_getnanotime(struct timespec *tsp);

void
dtrace_getnanotime(struct timespec *tsp)
{

	dogetnanotime(tsp);
}

void
getmicrotime(struct timeval *tvp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetmicrotime);
	do {
		th = timehands;
		gen = th->th_generation;
		*tvp = th->th_microtime;
	} while (gen == 0 || gen != th->th_generation);
}

/*
 * Initialize a new timecounter and possibly use it.
 */
void
tc_init(struct timecounter *tc)
{
	u_int u;

	u = tc->tc_frequency / tc->tc_counter_mask;
	/* XXX: We need some margin here, 10% is a guess */
	u *= 11;
	u /= 10;
	if (u > hz && tc->tc_quality >= 0) {
		tc->tc_quality = -2000;
		aprint_verbose(
		    "timecounter: Timecounter \"%s\" frequency %ju Hz",
		    tc->tc_name, (uintmax_t)tc->tc_frequency);
		aprint_verbose(" -- Insufficient hz, needs at least %u\n", u);
	} else if (tc->tc_quality >= 0 || bootverbose) {
		aprint_verbose(
		    "timecounter: Timecounter \"%s\" frequency %ju Hz "
		    "quality %d\n", tc->tc_name, (uintmax_t)tc->tc_frequency,
		    tc->tc_quality);
	}

	mutex_spin_enter(&timecounter_lock);
	tc->tc_next = timecounters;
	timecounters = tc;
	timecounter_mods++;
	/*
	 * Never automatically use a timecounter with negative quality.
	 * Even though we run on the dummy counter, switching here may be
	 * worse since this timecounter may not be monotonic.
	 */
	if (tc->tc_quality >= 0 && (tc->tc_quality > timecounter->tc_quality ||
	    (tc->tc_quality == timecounter->tc_quality &&
	    tc->tc_frequency > timecounter->tc_frequency))) {
		(void)tc->tc_get_timecount(tc);
		(void)tc->tc_get_timecount(tc);
		timecounter = tc;
		tc_windup();
	}
	mutex_spin_exit(&timecounter_lock);
}
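
/*
 * Worked example for the wrap-rate check above (illustrative numbers,
 * not from the original source): a 16 bit counter (tc_counter_mask
 * 0xffff) running at 10 MHz wraps about 152 times per second; with the
 * 10% margin the check requires an hz of at least roughly 167, since
 * otherwise tc_windup() could miss a full wrap between updates and
 * elapsed time would be undercounted by whole counter periods.
 */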

/*
 * Pick a new timecounter due to the existing counter going bad.
 */
static void
tc_pick(void)
{
	struct timecounter *best, *tc;

	KASSERT(mutex_owned(&timecounter_lock));

	for (best = tc = timecounters; tc != NULL; tc = tc->tc_next) {
		if (tc->tc_quality > best->tc_quality)
			best = tc;
		else if (tc->tc_quality < best->tc_quality)
			continue;
		else if (tc->tc_frequency > best->tc_frequency)
			best = tc;
	}
	(void)best->tc_get_timecount(best);
	(void)best->tc_get_timecount(best);
	timecounter = best;
}

/*
 * A timecounter has gone bad, arrange to pick a new one at the next
 * clock tick.
 */
void
tc_gonebad(struct timecounter *tc)
{

	tc->tc_quality = -100;
	membar_producer();
	atomic_inc_uint(&timecounter_bad);
}

/*
 * Stop using a timecounter and remove it from the timecounters list.
 */
int
tc_detach(struct timecounter *target)
{
	struct timecounter *tc;
	struct timecounter **tcp = NULL;
	int removals;
	uint64_t where;
	lwp_t *l;

	/* First, find the timecounter. */
	mutex_spin_enter(&timecounter_lock);
	for (tcp = &timecounters, tc = timecounters;
	     tc != NULL;
	     tcp = &tc->tc_next, tc = tc->tc_next) {
		if (tc == target)
			break;
	}
	if (tc == NULL) {
		mutex_spin_exit(&timecounter_lock);
		return ESRCH;
	}

	/* And now, remove it. */
	*tcp = tc->tc_next;
	if (timecounter == target) {
		tc_pick();
		tc_windup();
	}
	timecounter_mods++;
	removals = timecounter_removals++;
	mutex_spin_exit(&timecounter_lock);

	/*
	 * We now have to determine if any threads in the system are still
	 * making use of this timecounter.
	 *
	 * We issue a broadcast cross call to elide memory ordering issues,
	 * then scan all LWPs in the system looking at each's timecounter
	 * generation number.  We need to see a value of zero (not actively
	 * using a timecounter) or a value greater than our removal value.
	 *
	 * We may race with threads that read `timecounter_removals' and
	 * then get preempted before updating `l_tcgen'.  This is not a
	 * problem, since it means that these threads have not yet started
	 * accessing timecounter state.  All we need is one clean snapshot
	 * of the system where every thread appears not to be using old
	 * timecounter state.
	 */
	for (;;) {
		where = xc_broadcast(0, (xcfunc_t)nullop, NULL, NULL);
		xc_wait(where);

		mutex_enter(proc_lock);
		LIST_FOREACH(l, &alllwp, l_list) {
			if (l->l_tcgen == 0 || l->l_tcgen > removals) {
				/*
				 * Not using timecounter or old timecounter
				 * state at time of our xcall or later.
				 */
				continue;
			}
			break;
		}
		mutex_exit(proc_lock);

		/*
		 * If the timecounter is still in use, wait at least 10ms
		 * before retrying.
		 */
		if (l == NULL) {
			return 0;
		}
		(void)kpause("tcdetach", false, mstohz(10), NULL);
	}
}

/* Report the frequency of the current timecounter. */
u_int64_t
tc_getfrequency(void)
{

	return (timehands->th_counter->tc_frequency);
}

/*
 * Step our concept of UTC.  This is done by modifying our estimate of
 * when we booted.
 */
void
tc_setclock(const struct timespec *ts)
{
	struct timespec ts2;
	struct bintime bt, bt2;

	mutex_spin_enter(&timecounter_lock);
	TC_COUNT(nsetclock);
	binuptime(&bt2);
	timespec2bintime(ts, &bt);
	bintime_sub(&bt, &bt2);
	bintime_add(&bt2, &timebasebin);
	timebasebin = bt;
	tc_windup();
	mutex_spin_exit(&timecounter_lock);

	if (timestepwarnings) {
		bintime2timespec(&bt2, &ts2);
		log(LOG_INFO,
		    "Time stepped from %lld.%09ld to %lld.%09ld\n",
		    (long long)ts2.tv_sec, ts2.tv_nsec,
		    (long long)ts->tv_sec, ts->tv_nsec);
	}
}
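
/*
 * Editorial note: tc_setclock() only rewrites timebasebin, the estimate
 * of when we booted.  The uptime family (binuptime() and friends) never
 * adds timebasebin, so stepping the clock here leaves uptime monotonic;
 * only the UTC views (bintime(), nanotime(), ...) jump.
 */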

/*
 * Initialize the next struct timehands in the ring and make
 * it the active timehands.  Along the way we might switch to a different
 * timecounter and/or do seconds processing in NTP.  Slightly magic.
 */
static void
tc_windup(void)
{
	struct bintime bt;
	struct timehands *th, *tho;
	u_int64_t scale;
	u_int delta, ncount, ogen;
	int i, s_update;
	time_t t;

	KASSERT(mutex_owned(&timecounter_lock));

	s_update = 0;

	/*
	 * Make the next timehands a copy of the current one, but do not
	 * overwrite the generation or next pointer.  While we update
	 * the contents, the generation must be zero.  Ensure global
	 * visibility of the generation before proceeding.
	 */
	tho = timehands;
	th = tho->th_next;
	ogen = th->th_generation;
	th->th_generation = 0;
	membar_producer();
	bcopy(tho, th, offsetof(struct timehands, th_generation));

	/*
	 * Capture a timecounter delta on the current timecounter and if
	 * changing timecounters, a counter value from the new timecounter.
	 * Update the offset fields accordingly.
	 */
	delta = tc_delta(th);
	if (th->th_counter != timecounter)
		ncount = timecounter->tc_get_timecount(timecounter);
	else
		ncount = 0;
	th->th_offset_count += delta;
	bintime_addx(&th->th_offset, th->th_scale * delta);

	/*
	 * Hardware latching timecounters may not generate interrupts on
	 * PPS events, so instead we poll them.  There is a finite risk that
	 * the hardware might capture a count which is later than the one we
	 * got above, and therefore possibly in the next NTP second which might
	 * have a different rate than the current NTP second.  It doesn't
	 * matter in practice.
	 */
	if (tho->th_counter->tc_poll_pps)
		tho->th_counter->tc_poll_pps(tho->th_counter);

	/*
	 * Deal with NTP second processing.  The for loop normally
	 * iterates at most once, but in extreme situations it might
	 * keep NTP sane if timeouts are not run for several seconds.
	 * At boot, the time step can be large when the TOD hardware
	 * has been read, so on really large steps, we call
	 * ntp_update_second only twice.  We need to call it twice in
	 * case we missed a leap second.
	 * If NTP is not compiled in ntp_update_second still calculates
	 * the adjustment resulting from adjtime() calls.
	 */
	bt = th->th_offset;
	bintime_add(&bt, &timebasebin);
	i = bt.sec - tho->th_microtime.tv_sec;
	if (i > LARGE_STEP)
		i = 2;
	for (; i > 0; i--) {
		t = bt.sec;
		ntp_update_second(&th->th_adjustment, &bt.sec);
		s_update = 1;
		if (bt.sec != t)
			timebasebin.sec += bt.sec - t;
	}

	/* Update the UTC timestamps used by the get*() functions. */
	/* XXX shouldn't do this here.  Should force non-`get' versions. */
	bintime2timeval(&bt, &th->th_microtime);
	bintime2timespec(&bt, &th->th_nanotime);
	/* Now is a good time to change timecounters. */
	if (th->th_counter != timecounter) {
		th->th_counter = timecounter;
		th->th_offset_count = ncount;
		s_update = 1;
	}

	/*-
	 * Recalculate the scaling factor.  We want the number of 1/2^64
	 * fractions of a second per period of the hardware counter, taking
	 * into account the th_adjustment factor which the NTP PLL/adjtime(2)
	 * processing provides us with.
	 *
	 * The th_adjustment is nanoseconds per second with 32 bit binary
	 * fraction and we want 64 bit binary fraction of second:
	 *
	 *	 x = a * 2^32 / 10^9 = a * 4.294967296
	 *
	 * The range of th_adjustment is +/- 5000PPM so inside a 64bit int
	 * we can only multiply by about 850 without overflowing, but that
	 * leaves suitably precise fractions for multiply before divide.
	 *
	 * Divide before multiply with a fraction of 2199/512 results in a
	 * systematic undercompensation of 10PPM of th_adjustment.  On a
	 * 5000PPM adjustment this is a 0.05PPM error.  This is acceptable.
	 *
	 * We happily sacrifice the lowest of the 64 bits of our result
	 * to the goddess of code clarity.
	 */
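	/*
	 * Editorial check of the numbers above: 2199/512 = 4.2949219
	 * versus 2^32/10^9 = 4.2949673, i.e. the approximation is low by
	 * about 10.6 PPM of th_adjustment.  The code below applies it as
	 * (th_adjustment / 1024) * 2199 on a 2^63 base and then doubles
	 * the result, which is the same 2199/512 ratio.
	 */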
	if (s_update) {
		scale = (u_int64_t)1 << 63;
		scale += (th->th_adjustment / 1024) * 2199;
		scale /= th->th_counter->tc_frequency;
		th->th_scale = scale * 2;
	}
	/*
	 * Now that the struct timehands is again consistent, set the new
	 * generation number, making sure to not make it zero.  Ensure
	 * changes are globally visible before changing.
	 */
	if (++ogen == 0)
		ogen = 1;
	membar_producer();
	th->th_generation = ogen;

	/*
	 * Go live with the new struct timehands.  Ensure changes are
	 * globally visible before changing.
	 */
	time_second = th->th_microtime.tv_sec;
	time_uptime = th->th_offset.sec;
	membar_producer();
	timehands = th;

	/*
	 * Force users of the old timehand to move on.  This is
	 * necessary for MP systems; we need to ensure that the
	 * consumers will move away from the old timehand before
	 * we begin updating it again when we eventually wrap
	 * around.
	 */
	if (++tho->th_generation == 0)
		tho->th_generation = 1;
}
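
/*
 * Editorial summary of the generation protocol (a restatement, not an
 * additional interface): tc_windup() marks the next timehands invalid by
 * setting th_generation to zero, updates its fields, then stores a new
 * non-zero generation and finally the timehands pointer, with
 * membar_producer() between the steps.  Readers such as binuptime()
 * retry whenever the generation they sampled is zero or changes while
 * they copy the snapshot.
 */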

/*
 * RFC 2783 PPS-API implementation.
 */

int
pps_ioctl(u_long cmd, void *data, struct pps_state *pps)
{
	pps_params_t *app;
	pps_info_t *pipi;
#ifdef PPS_SYNC
	int *epi;
#endif

	KASSERT(mutex_owned(&timecounter_lock));

	KASSERT(pps != NULL);

	switch (cmd) {
	case PPS_IOC_CREATE:
		return (0);
	case PPS_IOC_DESTROY:
		return (0);
	case PPS_IOC_SETPARAMS:
		app = (pps_params_t *)data;
		if (app->mode & ~pps->ppscap)
			return (EINVAL);
		pps->ppsparam = *app;
		return (0);
	case PPS_IOC_GETPARAMS:
		app = (pps_params_t *)data;
		*app = pps->ppsparam;
		app->api_version = PPS_API_VERS_1;
		return (0);
	case PPS_IOC_GETCAP:
		*(int*)data = pps->ppscap;
		return (0);
	case PPS_IOC_FETCH:
		pipi = (pps_info_t *)data;
		pps->ppsinfo.current_mode = pps->ppsparam.mode;
		*pipi = pps->ppsinfo;
		return (0);
	case PPS_IOC_KCBIND:
#ifdef PPS_SYNC
		epi = (int *)data;
		/* XXX Only root should be able to do this */
		if (*epi & ~pps->ppscap)
			return (EINVAL);
		pps->kcmode = *epi;
		return (0);
#else
		return (EOPNOTSUPP);
#endif
	default:
		return (EPASSTHROUGH);
	}
}

void
pps_init(struct pps_state *pps)
{

	KASSERT(mutex_owned(&timecounter_lock));

	pps->ppscap |= PPS_TSFMT_TSPEC;
	if (pps->ppscap & PPS_CAPTUREASSERT)
		pps->ppscap |= PPS_OFFSETASSERT;
	if (pps->ppscap & PPS_CAPTURECLEAR)
		pps->ppscap |= PPS_OFFSETCLEAR;
}

/*
 * Capture a timestamp in the pps structure.
 */
void
pps_capture(struct pps_state *pps)
{
	struct timehands *th;

	KASSERT(mutex_owned(&timecounter_lock));
	KASSERT(pps != NULL);

	th = timehands;
	pps->capgen = th->th_generation;
	pps->capth = th;
	pps->capcount = (u_int64_t)tc_delta(th) + th->th_offset_count;
	if (pps->capgen != th->th_generation)
		pps->capgen = 0;
}

#ifdef PPS_DEBUG
int ppsdebug = 0;
#endif

/*
 * Process a pps_capture()ed event.
 */
void
pps_event(struct pps_state *pps, int event)
{
	pps_ref_event(pps, event, NULL, PPS_REFEVNT_PPS|PPS_REFEVNT_CAPTURE);
}
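
/*
 * Typical driver usage (an editorial sketch; the softc member name is
 * hypothetical):
 *
 *	mutex_spin_enter(&timecounter_lock);
 *	pps_capture(&sc->sc_pps_state);
 *	pps_event(&sc->sc_pps_state, PPS_CAPTUREASSERT);
 *	mutex_spin_exit(&timecounter_lock);
 */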

/*
 * Extended PPS API / kernel PLL/FLL entry point.
 *
 * Feed reference time stamps to the PPS engine.  Will simulate a PPS
 * event and feed the NTP PLL/FLL if requested.
 *
 * The reference time stamps should arrive roughly once a second.  They
 * do not need to be exactly in phase with the UTC second, but should be
 * close to it.  This relaxation of requirements allows callout driven
 * timestamping mechanisms to feed the PPS capture/kernel PLL logic.
 *
 * Calling pattern:
 *	pps_capture()		(for PPS_REFEVNT_{CAPTURE|CAPCUR})
 *	read timestamp from reference source
 *	pps_ref_event()
 *
 * Supported refmodes:
 *	PPS_REFEVNT_CAPTURE
 *		use system timestamp of pps_capture()
 *	PPS_REFEVNT_CURRENT
 *		use system timestamp of this call
 *	PPS_REFEVNT_CAPCUR
 *		use average of capture and current system time stamp
 *	PPS_REFEVNT_PPS
 *		assume timestamp on second mark - ref_ts is ignored
 */

void
pps_ref_event(struct pps_state *pps,
	int event,
	struct bintime *ref_ts,
	int refmode
	)
{
	struct bintime bt;	/* current time */
	struct bintime btd;	/* time difference */
	struct bintime bt_ref;	/* reference time */
	struct timespec ts, *tsp, *osp;
	struct timehands *th;
	u_int64_t tcount, acount, dcount, *pcount;
	int foff, gen;
#ifdef PPS_SYNC
	int fhard;
#endif
	pps_seq_t *pseq;

	KASSERT(mutex_owned(&timecounter_lock));

	KASSERT(pps != NULL);

	/* pick up current time stamp if needed */
	if (refmode & (PPS_REFEVNT_CURRENT|PPS_REFEVNT_CAPCUR)) {
		/* pick up current time stamp */
		th = timehands;
		gen = th->th_generation;
		tcount = (u_int64_t)tc_delta(th) + th->th_offset_count;
		if (gen != th->th_generation)
			gen = 0;

		/* If the timecounter was wound up underneath us, bail out. */
		if (pps->capgen == 0 ||
		    pps->capgen != pps->capth->th_generation ||
		    gen == 0 ||
		    gen != pps->capgen) {
#ifdef PPS_DEBUG
			if (ppsdebug & 0x1) {
				log(LOG_DEBUG,
				    "pps_ref_event(pps=%p, event=%d, ...): DROP (wind-up)\n",
				    pps, event);
			}
#endif
			return;
		}
	} else {
		tcount = 0;	/* keep GCC happy */
	}

#ifdef PPS_DEBUG
	if (ppsdebug & 0x1) {
		struct timespec tmsp;

		if (ref_ts == NULL) {
			tmsp.tv_sec = 0;
			tmsp.tv_nsec = 0;
		} else {
			bintime2timespec(ref_ts, &tmsp);
		}

		log(LOG_DEBUG,
		    "pps_ref_event(pps=%p, event=%d, ref_ts=%"PRIi64
		    ".%09"PRIi32", refmode=0x%1x)\n",
		    pps, event, tmsp.tv_sec, (int32_t)tmsp.tv_nsec, refmode);
	}
#endif

	/* setup correct event references */
	if (event == PPS_CAPTUREASSERT) {
		tsp = &pps->ppsinfo.assert_timestamp;
		osp = &pps->ppsparam.assert_offset;
		foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
#ifdef PPS_SYNC
		fhard = pps->kcmode & PPS_CAPTUREASSERT;
#endif
		pcount = &pps->ppscount[0];
		pseq = &pps->ppsinfo.assert_sequence;
	} else {
		tsp = &pps->ppsinfo.clear_timestamp;
		osp = &pps->ppsparam.clear_offset;
		foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
#ifdef PPS_SYNC
		fhard = pps->kcmode & PPS_CAPTURECLEAR;
#endif
		pcount = &pps->ppscount[1];
		pseq = &pps->ppsinfo.clear_sequence;
	}

	/* determine system time stamp according to refmode */
	dcount = 0;	/* keep GCC happy */
	switch (refmode & PPS_REFEVNT_RMASK) {
	case PPS_REFEVNT_CAPTURE:
		acount = pps->capcount;	/* use capture timestamp */
		break;

	case PPS_REFEVNT_CURRENT:
		acount = tcount;	/* use current timestamp */
		break;

	case PPS_REFEVNT_CAPCUR:
		/*
		 * calculate counter value between pps_capture() and
		 * pps_ref_event()
		 */
		dcount = tcount - pps->capcount;
		acount = (dcount / 2) + pps->capcount;
		break;

	default:		/* ignore call error silently */
		return;
	}

	/*
	 * If the timecounter changed, we cannot compare the count values, so
	 * we have to drop the rest of the PPS-stuff until the next event.
	 */
	if (pps->ppstc != pps->capth->th_counter) {
		pps->ppstc = pps->capth->th_counter;
		pps->capcount = acount;
		*pcount = acount;
		pps->ppscount[2] = acount;
#ifdef PPS_DEBUG
		if (ppsdebug & 0x1) {
			log(LOG_DEBUG,
			    "pps_ref_event(pps=%p, event=%d, ...): DROP (time-counter change)\n",
			    pps, event);
		}
#endif
		return;
	}

	pps->capcount = acount;

	/* Convert the count to a bintime. */
	bt = pps->capth->th_offset;
	bintime_addx(&bt, pps->capth->th_scale * (acount - pps->capth->th_offset_count));
	bintime_add(&bt, &timebasebin);

	if ((refmode & PPS_REFEVNT_PPS) == 0) {
		/* determine difference to reference time stamp */
		bt_ref = *ref_ts;

		btd = bt;
		bintime_sub(&btd, &bt_ref);

		/*
		 * simulate a PPS timestamp by dropping the fraction
		 * and applying the offset
		 */
		if (bt.frac >= (uint64_t)1<<63)	/* skip to nearest second */
			bt.sec++;
		bt.frac = 0;
		bintime_add(&bt, &btd);
	} else {
		/*
		 * create ref_ts from current time -
		 * we are supposed to be called on
		 * the second mark
		 */
		bt_ref = bt;
		if (bt_ref.frac >= (uint64_t)1<<63)	/* skip to nearest second */
			bt_ref.sec++;
		bt_ref.frac = 0;
	}

	/* convert bintime to timestamp */
	bintime2timespec(&bt, &ts);

	/* If the timecounter was wound up underneath us, bail out. */
	if (pps->capgen != pps->capth->th_generation)
		return;

	/* store time stamp */
	*pcount = pps->capcount;
	(*pseq)++;
	*tsp = ts;

	/* add offset correction */
	if (foff) {
		timespecadd(tsp, osp, tsp);
		if (tsp->tv_nsec < 0) {
			tsp->tv_nsec += 1000000000;
			tsp->tv_sec -= 1;
		}
	}

#ifdef PPS_DEBUG
	if (ppsdebug & 0x2) {
		struct timespec ts2;
		struct timespec ts3;

		bintime2timespec(&bt_ref, &ts2);

		bt.sec = 0;
		bt.frac = 0;

		if (refmode & PPS_REFEVNT_CAPCUR) {
			bintime_addx(&bt, pps->capth->th_scale * dcount);
		}
		bintime2timespec(&bt, &ts3);

		log(LOG_DEBUG, "ref_ts=%"PRIi64".%09"PRIi32
		    ", ts=%"PRIi64".%09"PRIi32", read latency=%"PRIi64" ns\n",
		    ts2.tv_sec, (int32_t)ts2.tv_nsec,
		    tsp->tv_sec, (int32_t)tsp->tv_nsec,
		    timespec2ns(&ts3));
	}
#endif

#ifdef PPS_SYNC
	if (fhard) {
		uint64_t scale;
		uint64_t div;

		/*
		 * Feed the NTP PLL/FLL.
		 * The FLL wants to know how many (hardware) nanoseconds
		 * elapsed since the previous event (mod 1 second) thus
		 * we are actually looking at the frequency difference scaled
		 * in nsec.
		 * As the counter time stamps are not truly at 1Hz
		 * we need to scale the count by the elapsed
		 * reference time.
		 * valid sampling interval: [0.5..2[ sec
		 */

		/* calculate elapsed raw count */
		tcount = pps->capcount - pps->ppscount[2];
		pps->ppscount[2] = pps->capcount;
		tcount &= pps->capth->th_counter->tc_counter_mask;

		/* calculate elapsed ref time */
		btd = bt_ref;
		bintime_sub(&btd, &pps->ref_time);
		pps->ref_time = bt_ref;

		/* check that we stay below 2 sec */
		if (btd.sec < 0 || btd.sec > 1)
			return;

		/* we want at least 0.5 sec between samples */
		if (btd.sec == 0 && btd.frac < (uint64_t)1<<63)
			return;

		/*
		 * calculate cycles per period by multiplying
		 * the frequency with the elapsed period
		 * we pick a fraction of 30 bits
		 * ~1ns resolution for elapsed time
		 */
		div = (uint64_t)btd.sec << 30;
		div |= (btd.frac >> 34) & (((uint64_t)1 << 30) - 1);
		div *= pps->capth->th_counter->tc_frequency;
		div >>= 30;

		if (div == 0)	/* safeguard */
			return;

		scale = (uint64_t)1 << 63;
		scale /= div;
		scale *= 2;

		bt.sec = 0;
		bt.frac = 0;
		bintime_addx(&bt, scale * tcount);
		bintime2timespec(&bt, &ts);

#ifdef PPS_DEBUG
		if (ppsdebug & 0x4) {
			struct timespec ts2;
			int64_t df;

			bintime2timespec(&bt_ref, &ts2);
			df = timespec2ns(&ts);
			if (df > 500000000)
				df -= 1000000000;
			log(LOG_DEBUG, "hardpps: ref_ts=%"PRIi64
			    ".%09"PRIi32", ts=%"PRIi64".%09"PRIi32
			    ", freqdiff=%"PRIi64" ns/s\n",
			    ts2.tv_sec, (int32_t)ts2.tv_nsec,
			    tsp->tv_sec, (int32_t)tsp->tv_nsec,
			    df);
		}
#endif

		hardpps(tsp, timespec2ns(&ts));
	}
#endif
}

/*
 * Timecounters need to be updated every so often to prevent the hardware
 * counter from overflowing.  Updating also recalculates the cached values
 * used by the get*() family of functions, so their precision depends on
 * the update frequency.
 */

static int tc_tick;

void
tc_ticktock(void)
{
	static int count;

	if (++count < tc_tick)
		return;
	count = 0;
	mutex_spin_enter(&timecounter_lock);
	if (timecounter_bad != 0) {
		/* An existing timecounter has gone bad, pick a new one. */
		(void)atomic_swap_uint(&timecounter_bad, 0);
		if (timecounter->tc_quality < 0) {
			tc_pick();
		}
	}
	tc_windup();
	mutex_spin_exit(&timecounter_lock);
}

void
inittimecounter(void)
{
	u_int p;

	mutex_init(&timecounter_lock, MUTEX_DEFAULT, IPL_HIGH);

	/*
	 * Set the initial timeout to
	 * max(1, <approx. number of hardclock ticks in a millisecond>).
	 * People should probably not use the sysctl to set the timeout
	 * to smaller than its initial value, since that value is the
	 * smallest reasonable one.  If they want better timestamps they
	 * should use the non-"get"* functions.
	 */
	if (hz > 1000)
		tc_tick = (hz + 500) / 1000;
	else
		tc_tick = 1;
	p = (tc_tick * 1000000) / hz;
	aprint_verbose("timecounter: Timecounters tick every %d.%03u msec\n",
	    p / 1000, p % 1000);

	/* warm up new timecounter (again) and get rolling. */
	(void)timecounter->tc_get_timecount(timecounter);
	(void)timecounter->tc_get_timecount(timecounter);
}
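
/*
 * Editorial worked example for the interval computed above (illustrative
 * hz values, not from the original source): with hz = 100, tc_tick is 1
 * and tc_windup() runs every 10 msec; with hz = 8000, tc_tick is 8 and
 * the interval is 1 msec, matching the printed "tick every" message.
 */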