/* $NetBSD: kern_tc.c,v 1.54 2020/01/02 15:42:27 thorpej Exp $ */

/*-
 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
 * ---------------------------------------------------------------------------
 */

#include <sys/cdefs.h>
/* __FBSDID("$FreeBSD: src/sys/kern/kern_tc.c,v 1.166 2005/09/19 22:16:31 andre Exp $"); */
__KERNEL_RCSID(0, "$NetBSD: kern_tc.c,v 1.54 2020/01/02 15:42:27 thorpej Exp $");

#ifdef _KERNEL_OPT
#include "opt_ntp.h"
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/reboot.h>	/* XXX just to get AB_VERBOSE */
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/timepps.h>
#include <sys/timetc.h>
#include <sys/timex.h>
#include <sys/evcnt.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/xcall.h>

/*
 * A large step happens on boot.  This constant detects such steps.
 * It is relatively small so that ntp_update_second gets called enough
 * in the typical 'missed a couple of seconds' case, but doesn't loop
 * forever when the time step is large.
 */
#define LARGE_STEP	200

/*
 * Implement a dummy timecounter which we can use until we get a real one
 * in the air.  This allows the console and other early stuff to use
 * time services.
 */

static u_int
dummy_get_timecount(struct timecounter *tc)
{
	static u_int now;

	return (++now);
}

static struct timecounter dummy_timecounter = {
	.tc_get_timecount	= dummy_get_timecount,
	.tc_counter_mask	= ~0u,
	.tc_frequency		= 1000000,
	.tc_name		= "dummy",
	.tc_quality		= -1000000,
	.tc_priv		= NULL,
};

struct timehands {
	/* These fields must be initialized by the driver. */
	struct timecounter	*th_counter;	 /* active timecounter */
	int64_t			th_adjustment;	 /* frequency adjustment */
						 /* (NTP/adjtime) */
	u_int64_t		th_scale;	 /* scale factor (counter */
						 /* tick->time) */
	u_int64_t		th_offset_count; /* offset at last time */
						 /* update (tc_windup()) */
	struct bintime		th_offset;	 /* bin (up)time at windup */
	struct timeval		th_microtime;	 /* cached microtime */
	struct timespec		th_nanotime;	 /* cached nanotime */
	/* Fields not to be copied in tc_windup start with th_generation. */
	volatile u_int		th_generation;	 /* current generation */
	struct timehands	*th_next;	 /* next timehand */
};

static struct timehands th0;
static struct timehands th9 = { .th_next = &th0, };
static struct timehands th8 = { .th_next = &th9, };
static struct timehands th7 = { .th_next = &th8, };
static struct timehands th6 = { .th_next = &th7, };
static struct timehands th5 = { .th_next = &th6, };
static struct timehands th4 = { .th_next = &th5, };
static struct timehands th3 = { .th_next = &th4, };
static struct timehands th2 = { .th_next = &th3, };
static struct timehands th1 = { .th_next = &th2, };
static struct timehands th0 = {
	.th_counter = &dummy_timecounter,
	.th_scale = (uint64_t)-1 / 1000000,
	.th_offset = { .sec = 1, .frac = 0 },
	.th_generation = 1,
	.th_next = &th1,
};
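/*
 * To illustrate the scale factor: th_scale holds seconds-per-tick as a
 * 64-bit binary fraction, roughly 2^64 / tc_frequency.  For the 1 MHz
 * dummy counter above, th0.th_scale = (uint64_t)-1 / 1000000, so one tick
 * adds about 2^64 / 10^6 = 18446744073709 fraction units to the bintime,
 * i.e. one microsecond; a million ticks advance it by a full second.
 * The readers below compute time as
 *
 *	bt = th_offset + th_scale * tc_delta(th)
 *
 * entirely in this fixed-point form.
 */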
static struct timehands *volatile timehands = &th0;
struct timecounter *timecounter = &dummy_timecounter;
static struct timecounter *timecounters = &dummy_timecounter;

volatile time_t time_second __cacheline_aligned = 1;
volatile time_t time_uptime __cacheline_aligned = 1;

static struct bintime timebasebin;

static int timestepwarnings;

kmutex_t timecounter_lock;
static u_int timecounter_mods;
static volatile int timecounter_removals = 1;
static u_int timecounter_bad;

#ifdef __FreeBSD__
SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW,
    &timestepwarnings, 0, "");
#endif /* __FreeBSD__ */

/*
 * sysctl helper routine for kern.timecounter.hardware
 */
static int
sysctl_kern_timecounter_hardware(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int error;
	char newname[MAX_TCNAMELEN];
	struct timecounter *newtc, *tc;

	tc = timecounter;

	strlcpy(newname, tc->tc_name, sizeof(newname));

	node = *rnode;
	node.sysctl_data = newname;
	node.sysctl_size = sizeof(newname);

	error = sysctl_lookup(SYSCTLFN_CALL(&node));

	if (error ||
	    newp == NULL ||
	    strncmp(newname, tc->tc_name, sizeof(newname)) == 0)
		return error;

	if (l != NULL && (error = kauth_authorize_system(l->l_cred,
	    KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_TIMECOUNTERS, newname,
	    NULL, NULL)) != 0)
		return (error);

	if (!cold)
		mutex_spin_enter(&timecounter_lock);
	error = EINVAL;
	for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) {
		if (strcmp(newname, newtc->tc_name) != 0)
			continue;
		/* Warm up new timecounter. */
		(void)newtc->tc_get_timecount(newtc);
		(void)newtc->tc_get_timecount(newtc);
		timecounter = newtc;
		error = 0;
		break;
	}
	if (!cold)
		mutex_spin_exit(&timecounter_lock);
	return error;
}

static int
sysctl_kern_timecounter_choice(SYSCTLFN_ARGS)
{
	char buf[MAX_TCNAMELEN+48];
	char *where;
	const char *spc;
	struct timecounter *tc;
	size_t needed, left, slen;
	int error, mods;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	mutex_spin_enter(&timecounter_lock);
 retry:
	spc = "";
	error = 0;
	needed = 0;
	left = *oldlenp;
	where = oldp;
	for (tc = timecounters; error == 0 && tc != NULL; tc = tc->tc_next) {
		if (where == NULL) {
			needed += sizeof(buf);	/* be conservative */
		} else {
			slen = snprintf(buf, sizeof(buf), "%s%s(q=%d, f=%" PRId64
			    " Hz)", spc, tc->tc_name, tc->tc_quality,
			    tc->tc_frequency);
			if (left < slen + 1)
				break;
			mods = timecounter_mods;
			mutex_spin_exit(&timecounter_lock);
			error = copyout(buf, where, slen + 1);
			mutex_spin_enter(&timecounter_lock);
			if (mods != timecounter_mods) {
				goto retry;
			}
			spc = " ";
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	mutex_spin_exit(&timecounter_lock);

	*oldlenp = needed;
	return (error);
}

SYSCTL_SETUP(sysctl_timecounter_setup, "sysctl timecounter setup")
{
	const struct sysctlnode *node;

	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "timecounter",
		SYSCTL_DESCR("time counter information"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node != NULL) {
		sysctl_createv(clog, 0, NULL, NULL,
			CTLFLAG_PERMANENT,
			CTLTYPE_STRING, "choice",
			SYSCTL_DESCR("available counters"),
			sysctl_kern_timecounter_choice, 0, NULL, 0,
			CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);

		sysctl_createv(clog, 0, NULL, NULL,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_STRING, "hardware",
			SYSCTL_DESCR("currently active time counter"),
			sysctl_kern_timecounter_hardware, 0, NULL, MAX_TCNAMELEN,
			CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);

		sysctl_createv(clog, 0, NULL, NULL,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_INT, "timestepwarnings",
			SYSCTL_DESCR("log time steps"),
			NULL, 0, &timestepwarnings, 0,
			CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);
	}
}
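/*
 * From userland these nodes appear as kern.timecounter.choice (the list of
 * registered counters with quality and frequency) and
 * kern.timecounter.hardware (the counter currently in use).  For example,
 * with counter names that are purely illustrative:
 *
 *	$ sysctl kern.timecounter.choice
 *	kern.timecounter.choice = clockinterrupt(q=0, f=100 Hz) TSC(q=3000, f=2200000000 Hz)
 *	# sysctl -w kern.timecounter.hardware=TSC
 *
 * The set of names actually offered depends on the machine's drivers.
 */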
#ifdef TC_COUNTERS
#define	TC_STATS(name)							\
static struct evcnt n##name =						\
    EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "timecounter", #name);	\
EVCNT_ATTACH_STATIC(n##name)
TC_STATS(binuptime);    TC_STATS(nanouptime);    TC_STATS(microuptime);
TC_STATS(bintime);      TC_STATS(nanotime);      TC_STATS(microtime);
TC_STATS(getbinuptime); TC_STATS(getnanouptime); TC_STATS(getmicrouptime);
TC_STATS(getbintime);   TC_STATS(getnanotime);   TC_STATS(getmicrotime);
TC_STATS(setclock);
#define	TC_COUNT(var)	var.ev_count++
#undef TC_STATS
#else
#define	TC_COUNT(var)	/* nothing */
#endif	/* TC_COUNTERS */

static void tc_windup(void);

/*
 * Return the difference between the timehands' counter value now and what
 * was when we copied it to the timehands' offset_count.
 */
static inline u_int
tc_delta(struct timehands *th)
{
	struct timecounter *tc;

	tc = th->th_counter;
	return ((tc->tc_get_timecount(tc) -
		 th->th_offset_count) & tc->tc_counter_mask);
}
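/*
 * A worked example of the masking above: with a 24-bit counter
 * (tc_counter_mask = 0xffffff), th_offset_count = 0xfffff0 and a current
 * reading of 0x000010, the unsigned subtraction yields 0xff000020 and
 * masking leaves 0x20 = 32 ticks.  Wrap-around is therefore harmless as
 * long as the counter wraps no more than once between tc_windup() calls.
 */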
/*
 * Functions for reading the time.  We have to loop until we are sure that
 * the timehands that we operated on was not updated under our feet.  See
 * the comment in <sys/timevar.h> for a description of these 12 functions.
 */
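/*
 * Briefly, the readers come in three precisions (bintime, nanoseconds,
 * microseconds) crossed with two clocks (uptime since boot, UTC), and each
 * has a cheap "get" variant that returns the value cached at the last
 * tc_windup() instead of reading the hardware counter:
 *
 *	binuptime/nanouptime/microuptime	precise uptime
 *	bintime/nanotime/microtime		precise UTC
 *	getbin*/getnano*/getmicro*		cached, tc_tick resolution
 */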
void
binuptime(struct bintime *bt)
{
	struct timehands *th;
	lwp_t *l;
	u_int lgen, gen;

	TC_COUNT(nbinuptime);

	/*
	 * Provide exclusion against tc_detach().
	 *
	 * We record the number of timecounter removals before accessing
	 * timecounter state.  Note that the LWP can be using multiple
	 * "generations" at once, due to interrupts (interrupted while in
	 * this function).  Hardware interrupts will borrow the interrupted
	 * LWP's l_tcgen value for this purpose, and can themselves be
	 * interrupted by higher priority interrupts.  In this case we need
	 * to ensure that the oldest generation in use is recorded.
	 *
	 * splsched() is too expensive to use, so we take care to structure
	 * this code in such a way that it is not required.  Likewise, we
	 * do not disable preemption.
	 *
	 * Memory barriers are also too expensive to use for such a
	 * performance critical function.  The good news is that we do not
	 * need memory barriers for this type of exclusion, as the thread
	 * updating timecounter_removals will issue a broadcast cross call
	 * before inspecting our l_tcgen value (this elides memory ordering
	 * issues).
	 */
	l = curlwp;
	lgen = l->l_tcgen;
	if (__predict_true(lgen == 0)) {
		l->l_tcgen = timecounter_removals;
	}
	__insn_barrier();

	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
		bintime_addx(bt, th->th_scale * tc_delta(th));
	} while (gen == 0 || gen != th->th_generation);

	__insn_barrier();
	l->l_tcgen = lgen;
}

void
nanouptime(struct timespec *tsp)
{
	struct bintime bt;

	TC_COUNT(nnanouptime);
	binuptime(&bt);
	bintime2timespec(&bt, tsp);
}

void
microuptime(struct timeval *tvp)
{
	struct bintime bt;

	TC_COUNT(nmicrouptime);
	binuptime(&bt);
	bintime2timeval(&bt, tvp);
}

void
bintime(struct bintime *bt)
{

	TC_COUNT(nbintime);
	binuptime(bt);
	bintime_add(bt, &timebasebin);
}

void
nanotime(struct timespec *tsp)
{
	struct bintime bt;

	TC_COUNT(nnanotime);
	bintime(&bt);
	bintime2timespec(&bt, tsp);
}

void
microtime(struct timeval *tvp)
{
	struct bintime bt;

	TC_COUNT(nmicrotime);
	bintime(&bt);
	bintime2timeval(&bt, tvp);
}

void
getbinuptime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetbinuptime);
	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
	} while (gen == 0 || gen != th->th_generation);
}

void
getnanouptime(struct timespec *tsp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetnanouptime);
	do {
		th = timehands;
		gen = th->th_generation;
		bintime2timespec(&th->th_offset, tsp);
	} while (gen == 0 || gen != th->th_generation);
}

void
getmicrouptime(struct timeval *tvp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetmicrouptime);
	do {
		th = timehands;
		gen = th->th_generation;
		bintime2timeval(&th->th_offset, tvp);
	} while (gen == 0 || gen != th->th_generation);
}

void
getbintime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetbintime);
	do {
		th = timehands;
		gen = th->th_generation;
		*bt = th->th_offset;
	} while (gen == 0 || gen != th->th_generation);
	bintime_add(bt, &timebasebin);
}

static inline void
dogetnanotime(struct timespec *tsp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetnanotime);
	do {
		th = timehands;
		gen = th->th_generation;
		*tsp = th->th_nanotime;
	} while (gen == 0 || gen != th->th_generation);
}

void
getnanotime(struct timespec *tsp)
{

	dogetnanotime(tsp);
}

void dtrace_getnanotime(struct timespec *tsp);

void
dtrace_getnanotime(struct timespec *tsp)
{

	dogetnanotime(tsp);
}

void
getmicrotime(struct timeval *tvp)
{
	struct timehands *th;
	u_int gen;

	TC_COUNT(ngetmicrotime);
	do {
		th = timehands;
		gen = th->th_generation;
		*tvp = th->th_microtime;
	} while (gen == 0 || gen != th->th_generation);
}

void
getnanoboottime(struct timespec *tsp)
{
	struct bintime bt;

	getbinboottime(&bt);
	bintime2timespec(&bt, tsp);
}

void
getmicroboottime(struct timeval *tvp)
{
	struct bintime bt;

	getbinboottime(&bt);
	bintime2timeval(&bt, tvp);
}

void
getbinboottime(struct bintime *bt)
{

	/*
	 * XXX Need lockless read synchronization around timebasebin
	 * (and not just here).
	 */
	*bt = timebasebin;
}

/*
 * Initialize a new timecounter and possibly use it.
 */
void
tc_init(struct timecounter *tc)
{
	u_int u;

	u = tc->tc_frequency / tc->tc_counter_mask;
	/* XXX: We need some margin here, 10% is a guess */
	u *= 11;
	u /= 10;
	if (u > hz && tc->tc_quality >= 0) {
		tc->tc_quality = -2000;
		aprint_verbose(
		    "timecounter: Timecounter \"%s\" frequency %ju Hz",
		    tc->tc_name, (uintmax_t)tc->tc_frequency);
		aprint_verbose(" -- Insufficient hz, needs at least %u\n", u);
	} else if (tc->tc_quality >= 0 || bootverbose) {
		aprint_verbose(
		    "timecounter: Timecounter \"%s\" frequency %ju Hz "
		    "quality %d\n", tc->tc_name, (uintmax_t)tc->tc_frequency,
		    tc->tc_quality);
	}

	mutex_spin_enter(&timecounter_lock);
	tc->tc_next = timecounters;
	timecounters = tc;
	timecounter_mods++;
	/*
	 * Never automatically use a timecounter with negative quality.
	 * Even though we run on the dummy counter, switching here may be
	 * worse since this timecounter may not be monotonic.
	 */
	if (tc->tc_quality >= 0 && (tc->tc_quality > timecounter->tc_quality ||
	    (tc->tc_quality == timecounter->tc_quality &&
	    tc->tc_frequency > timecounter->tc_frequency))) {
		(void)tc->tc_get_timecount(tc);
		(void)tc->tc_get_timecount(tc);
		timecounter = tc;
		tc_windup();
	}
	mutex_spin_exit(&timecounter_lock);
}
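/*
 * A registration sketch (all names here are illustrative only): a driver
 * for a hypothetical free-running 32-bit, 25 MHz counter would fill in a
 * struct timecounter and hand it to tc_init() at attach time:
 *
 *	static u_int
 *	xyz_get_timecount(struct timecounter *tc)
 *	{
 *		struct xyz_softc *sc = tc->tc_priv;
 *
 *		return bus_space_read_4(sc->sc_bst, sc->sc_bsh, XYZ_COUNT);
 *	}
 *
 *	static struct timecounter xyz_timecounter = {
 *		.tc_get_timecount = xyz_get_timecount,
 *		.tc_counter_mask  = 0xffffffffu,
 *		.tc_frequency     = 25000000,
 *		.tc_name          = "xyz",
 *		.tc_quality       = 100,
 *	};
 *	...
 *	xyz_timecounter.tc_priv = sc;
 *	tc_init(&xyz_timecounter);
 *
 * The hz check in tc_init() guards against counters that wrap faster than
 * tc_windup() runs: tc_frequency / tc_counter_mask is roughly the number
 * of wraps per second.  A 16-bit counter at 10 MHz wraps about 153 times a
 * second, so with the 10% margin it needs hz of at least ~167 or it is
 * demoted to quality -2000.
 */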
/*
 * Pick a new timecounter due to the existing counter going bad.
 */
static void
tc_pick(void)
{
	struct timecounter *best, *tc;

	KASSERT(mutex_owned(&timecounter_lock));

	for (best = tc = timecounters; tc != NULL; tc = tc->tc_next) {
		if (tc->tc_quality > best->tc_quality)
			best = tc;
		else if (tc->tc_quality < best->tc_quality)
			continue;
		else if (tc->tc_frequency > best->tc_frequency)
			best = tc;
	}
	(void)best->tc_get_timecount(best);
	(void)best->tc_get_timecount(best);
	timecounter = best;
}

/*
 * A timecounter has gone bad, arrange to pick a new one at the next
 * clock tick.
 */
void
tc_gonebad(struct timecounter *tc)
{

	tc->tc_quality = -100;
	membar_producer();
	atomic_inc_uint(&timecounter_bad);
}

/*
 * Stop using a timecounter and remove it from the timecounters list.
 */
int
tc_detach(struct timecounter *target)
{
	struct timecounter *tc;
	struct timecounter **tcp = NULL;
	int removals;
	lwp_t *l;

	/* First, find the timecounter. */
	mutex_spin_enter(&timecounter_lock);
	for (tcp = &timecounters, tc = timecounters;
	     tc != NULL;
	     tcp = &tc->tc_next, tc = tc->tc_next) {
		if (tc == target)
			break;
	}
	if (tc == NULL) {
		mutex_spin_exit(&timecounter_lock);
		return ESRCH;
	}

	/* And now, remove it. */
	*tcp = tc->tc_next;
	if (timecounter == target) {
		tc_pick();
		tc_windup();
	}
	timecounter_mods++;
	removals = timecounter_removals++;
	mutex_spin_exit(&timecounter_lock);

	/*
	 * We now have to determine if any threads in the system are still
	 * making use of this timecounter.
	 *
	 * We issue a broadcast cross call to elide memory ordering issues,
	 * then scan all LWPs in the system looking at each's timecounter
	 * generation number.  We need to see a value of zero (not actively
	 * using a timecounter) or a value greater than our removal value.
	 *
	 * We may race with threads that read `timecounter_removals' and
	 * then get preempted before updating `l_tcgen'.  This is not a
	 * problem, since it means that these threads have not yet started
	 * accessing timecounter state.  All we do need is one clean
	 * snapshot of the system where every thread appears not to be using
	 * old timecounter state.
	 */
	for (;;) {
		xc_barrier(0);

		mutex_enter(proc_lock);
		LIST_FOREACH(l, &alllwp, l_list) {
			if (l->l_tcgen == 0 || l->l_tcgen > removals) {
				/*
				 * Not using timecounter or old timecounter
				 * state at time of our xcall or later.
				 */
				continue;
			}
			break;
		}
		mutex_exit(proc_lock);

		/*
		 * If the timecounter is still in use, wait at least 10ms
		 * before retrying.
		 */
		if (l == NULL) {
			return 0;
		}
		(void)kpause("tcdetach", false, mstohz(10), NULL);
	}
}

/* Report the frequency of the current timecounter. */
u_int64_t
tc_getfrequency(void)
{

	return (timehands->th_counter->tc_frequency);
}
/*
 * Step our concept of UTC.  This is done by modifying our estimate of
 * when we booted.
 */
void
tc_setclock(const struct timespec *ts)
{
	struct timespec ts2;
	struct bintime bt, bt2;

	mutex_spin_enter(&timecounter_lock);
	TC_COUNT(nsetclock);
	binuptime(&bt2);
	timespec2bintime(ts, &bt);
	bintime_sub(&bt, &bt2);
	bintime_add(&bt2, &timebasebin);
	timebasebin = bt;
	tc_windup();
	mutex_spin_exit(&timecounter_lock);

	if (timestepwarnings) {
		bintime2timespec(&bt2, &ts2);
		log(LOG_INFO,
		    "Time stepped from %lld.%09ld to %lld.%09ld\n",
		    (long long)ts2.tv_sec, ts2.tv_nsec,
		    (long long)ts->tv_sec, ts->tv_nsec);
	}
}
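/*
 * In other words, tc_setclock() solves "boot time = new UTC - current
 * uptime".  For example, if uptime (bt2) is 42.0 s and the clock is being
 * set to 1000000042.0 s UTC, timebasebin becomes 1000000000.0, so every
 * subsequent bintime()/nanotime() call, which returns uptime + timebasebin,
 * yields the stepped UTC value while the uptime clock itself is untouched.
 */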
/*
 * Initialize the next struct timehands in the ring and make
 * it the active timehands.  Along the way we might switch to a different
 * timecounter and/or do seconds processing in NTP.  Slightly magic.
 */
static void
tc_windup(void)
{
	struct bintime bt;
	struct timehands *th, *tho;
	u_int64_t scale;
	u_int delta, ncount, ogen;
	int i, s_update;
	time_t t;

	KASSERT(mutex_owned(&timecounter_lock));

	s_update = 0;

	/*
	 * Make the next timehands a copy of the current one, but do not
	 * overwrite the generation or next pointer.  While we update
	 * the contents, the generation must be zero.  Ensure global
	 * visibility of the generation before proceeding.
	 */
	tho = timehands;
	th = tho->th_next;
	ogen = th->th_generation;
	th->th_generation = 0;
	membar_producer();
	bcopy(tho, th, offsetof(struct timehands, th_generation));

	/*
	 * Capture a timecounter delta on the current timecounter and if
	 * changing timecounters, a counter value from the new timecounter.
	 * Update the offset fields accordingly.
	 */
	delta = tc_delta(th);
	if (th->th_counter != timecounter)
		ncount = timecounter->tc_get_timecount(timecounter);
	else
		ncount = 0;
	th->th_offset_count += delta;
	bintime_addx(&th->th_offset, th->th_scale * delta);

	/*
	 * Hardware latching timecounters may not generate interrupts on
	 * PPS events, so instead we poll them.  There is a finite risk that
	 * the hardware might capture a count which is later than the one we
	 * got above, and therefore possibly in the next NTP second which might
	 * have a different rate than the current NTP second.  It doesn't
	 * matter in practice.
	 */
	if (tho->th_counter->tc_poll_pps)
		tho->th_counter->tc_poll_pps(tho->th_counter);

	/*
	 * Deal with NTP second processing.  The for loop normally
	 * iterates at most once, but in extreme situations it might
	 * keep NTP sane if timeouts are not run for several seconds.
	 * At boot, the time step can be large when the TOD hardware
	 * has been read, so on really large steps, we call
	 * ntp_update_second only twice.  We need to call it twice in
	 * case we missed a leap second.
	 * If NTP is not compiled in ntp_update_second still calculates
	 * the adjustment resulting from adjtime() calls.
	 */
	bt = th->th_offset;
	bintime_add(&bt, &timebasebin);
	i = bt.sec - tho->th_microtime.tv_sec;
	if (i > LARGE_STEP)
		i = 2;
	for (; i > 0; i--) {
		t = bt.sec;
		ntp_update_second(&th->th_adjustment, &bt.sec);
		s_update = 1;
		if (bt.sec != t)
			timebasebin.sec += bt.sec - t;
	}

	/* Update the UTC timestamps used by the get*() functions. */
	/* XXX shouldn't do this here.  Should force non-`get' versions. */
	bintime2timeval(&bt, &th->th_microtime);
	bintime2timespec(&bt, &th->th_nanotime);
	/* Now is a good time to change timecounters. */
	if (th->th_counter != timecounter) {
		th->th_counter = timecounter;
		th->th_offset_count = ncount;
		s_update = 1;
	}

	/*-
	 * Recalculate the scaling factor.  We want the number of 1/2^64
	 * fractions of a second per period of the hardware counter, taking
	 * into account the th_adjustment factor which the NTP PLL/adjtime(2)
	 * processing provides us with.
	 *
	 * The th_adjustment is nanoseconds per second with 32 bit binary
	 * fraction and we want 64 bit binary fraction of second:
	 *
	 *	 x = a * 2^32 / 10^9 = a * 4.294967296
	 *
	 * The range of th_adjustment is +/- 5000PPM so inside a 64bit int
	 * we can only multiply by about 850 without overflowing, which
	 * leaves no suitably precise fractions for multiply before divide.
	 *
	 * Divide before multiply with a fraction of 2199/512 results in a
	 * systematic undercompensation of 10PPM of th_adjustment.  On a
	 * 5000PPM adjustment this is a 0.05PPM error.  This is acceptable.
	 *
	 * We happily sacrifice the lowest of the 64 bits of our result
	 * to the goddess of code clarity.
	 *
	 */
	if (s_update) {
		scale = (u_int64_t)1 << 63;
		scale += (th->th_adjustment / 1024) * 2199;
		scale /= th->th_counter->tc_frequency;
		th->th_scale = scale * 2;
	}
	/*
	 * Now that the struct timehands is again consistent, set the new
	 * generation number, making sure to not make it zero.  Ensure
	 * changes are globally visible before changing.
	 */
	if (++ogen == 0)
		ogen = 1;
	membar_producer();
	th->th_generation = ogen;

	/*
	 * Go live with the new struct timehands.  Ensure changes are
	 * globally visible before changing.
	 */
	time_second = th->th_microtime.tv_sec;
	time_uptime = th->th_offset.sec;
	membar_producer();
	timehands = th;

	/*
	 * Force users of the old timehand to move on.  This is
	 * necessary for MP systems; we need to ensure that the
	 * consumers will move away from the old timehand before
	 * we begin updating it again when we eventually wrap
	 * around.
	 */
	if (++tho->th_generation == 0)
		tho->th_generation = 1;
}
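/*
 * To see how the 2199/512 trick above approximates multiplying by
 * 2^32 / 10^9: the code computes ((a / 1024) * 2199) * 2 = a * 2199 / 512
 * = a * 4.2949..., versus the exact factor 4.294967296, i.e. about 10 PPM
 * low, as the comment says.  Concretely, with a 1 GHz counter and
 * th_adjustment = 0 the scale becomes 2 * (2^63 / 10^9), roughly
 * 2^64 / 10^9, exactly one nanosecond per tick in 2^-64 s units.
 */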
/*
 * RFC 2783 PPS-API implementation.
 */

int
pps_ioctl(u_long cmd, void *data, struct pps_state *pps)
{
	pps_params_t *app;
	pps_info_t *pipi;
#ifdef PPS_SYNC
	int *epi;
#endif

	KASSERT(mutex_owned(&timecounter_lock));

	KASSERT(pps != NULL);

	switch (cmd) {
	case PPS_IOC_CREATE:
		return (0);
	case PPS_IOC_DESTROY:
		return (0);
	case PPS_IOC_SETPARAMS:
		app = (pps_params_t *)data;
		if (app->mode & ~pps->ppscap)
			return (EINVAL);
		pps->ppsparam = *app;
		return (0);
	case PPS_IOC_GETPARAMS:
		app = (pps_params_t *)data;
		*app = pps->ppsparam;
		app->api_version = PPS_API_VERS_1;
		return (0);
	case PPS_IOC_GETCAP:
		*(int*)data = pps->ppscap;
		return (0);
	case PPS_IOC_FETCH:
		pipi = (pps_info_t *)data;
		pps->ppsinfo.current_mode = pps->ppsparam.mode;
		*pipi = pps->ppsinfo;
		return (0);
	case PPS_IOC_KCBIND:
#ifdef PPS_SYNC
		epi = (int *)data;
		/* XXX Only root should be able to do this */
		if (*epi & ~pps->ppscap)
			return (EINVAL);
		pps->kcmode = *epi;
		return (0);
#else
		return (EOPNOTSUPP);
#endif
	default:
		return (EPASSTHROUGH);
	}
}

void
pps_init(struct pps_state *pps)
{

	KASSERT(mutex_owned(&timecounter_lock));

	pps->ppscap |= PPS_TSFMT_TSPEC;
	if (pps->ppscap & PPS_CAPTUREASSERT)
		pps->ppscap |= PPS_OFFSETASSERT;
	if (pps->ppscap & PPS_CAPTURECLEAR)
		pps->ppscap |= PPS_OFFSETCLEAR;
}

/*
 * capture a timestamp in the pps structure
 */
void
pps_capture(struct pps_state *pps)
{
	struct timehands *th;

	KASSERT(mutex_owned(&timecounter_lock));
	KASSERT(pps != NULL);

	th = timehands;
	pps->capgen = th->th_generation;
	pps->capth = th;
	pps->capcount = (u_int64_t)tc_delta(th) + th->th_offset_count;
	if (pps->capgen != th->th_generation)
		pps->capgen = 0;
}

#ifdef PPS_DEBUG
int ppsdebug = 0;
#endif

/*
 * process a pps_capture()ed event
 */
void
pps_event(struct pps_state *pps, int event)
{
	pps_ref_event(pps, event, NULL, PPS_REFEVNT_PPS|PPS_REFEVNT_CAPTURE);
}

/*
 * extended pps api / kernel pll/fll entry point
 *
 * feed reference time stamps to PPS engine
 *
 * will simulate a PPS event and feed
 * the NTP PLL/FLL if requested.
 *
 * the ref time stamps should be roughly once
 * a second but do not need to be exactly in phase
 * with the UTC second but should be close to it.
 * this relaxation of requirements allows callout
 * driven timestamping mechanisms to feed to pps
 * capture/kernel pll logic.
 *
 * calling pattern is:
 *  pps_capture() (for PPS_REFEVNT_{CAPTURE|CAPCUR})
 *  read timestamp from reference source
 *  pps_ref_event()
 *
 * supported refmodes:
 *  PPS_REFEVNT_CAPTURE
 *    use system timestamp of pps_capture()
 *  PPS_REFEVNT_CURRENT
 *    use system timestamp of this call
 *  PPS_REFEVNT_CAPCUR
 *    use average of read capture and current system time stamp
 *  PPS_REFEVNT_PPS
 *    assume timestamp on second mark - ref_ts is ignored
 *
 */

void
pps_ref_event(struct pps_state *pps,
	int event,
	struct bintime *ref_ts,
	int refmode
	)
{
	struct bintime bt;	/* current time */
	struct bintime btd;	/* time difference */
	struct bintime bt_ref;	/* reference time */
	struct timespec ts, *tsp, *osp;
	struct timehands *th;
	u_int64_t tcount, acount, dcount, *pcount;
	int foff, gen;
#ifdef PPS_SYNC
	int fhard;
#endif
	pps_seq_t *pseq;

	KASSERT(mutex_owned(&timecounter_lock));

	KASSERT(pps != NULL);

	/* pick up current time stamp if needed */
	if (refmode & (PPS_REFEVNT_CURRENT|PPS_REFEVNT_CAPCUR)) {
		/* pick up current time stamp */
		th = timehands;
		gen = th->th_generation;
		tcount = (u_int64_t)tc_delta(th) + th->th_offset_count;
		if (gen != th->th_generation)
			gen = 0;

		/* If the timecounter was wound up underneath us, bail out. */
		if (pps->capgen == 0 ||
		    pps->capgen != pps->capth->th_generation ||
		    gen == 0 ||
		    gen != pps->capgen) {
#ifdef PPS_DEBUG
			if (ppsdebug & 0x1) {
				log(LOG_DEBUG,
				    "pps_ref_event(pps=%p, event=%d, ...): DROP (wind-up)\n",
				    pps, event);
			}
#endif
			return;
		}
	} else {
		tcount = 0;	/* keep GCC happy */
	}

#ifdef PPS_DEBUG
	if (ppsdebug & 0x1) {
		struct timespec tmsp;

		if (ref_ts == NULL) {
			tmsp.tv_sec = 0;
			tmsp.tv_nsec = 0;
		} else {
			bintime2timespec(ref_ts, &tmsp);
		}

		log(LOG_DEBUG,
		    "pps_ref_event(pps=%p, event=%d, ref_ts=%"PRIi64
		    ".%09"PRIi32", refmode=0x%1x)\n",
		    pps, event, tmsp.tv_sec, (int32_t)tmsp.tv_nsec, refmode);
	}
#endif

	/* setup correct event references */
	if (event == PPS_CAPTUREASSERT) {
		tsp = &pps->ppsinfo.assert_timestamp;
		osp = &pps->ppsparam.assert_offset;
		foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
#ifdef PPS_SYNC
		fhard = pps->kcmode & PPS_CAPTUREASSERT;
#endif
		pcount = &pps->ppscount[0];
		pseq = &pps->ppsinfo.assert_sequence;
	} else {
		tsp = &pps->ppsinfo.clear_timestamp;
		osp = &pps->ppsparam.clear_offset;
		foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
#ifdef PPS_SYNC
		fhard = pps->kcmode & PPS_CAPTURECLEAR;
#endif
		pcount = &pps->ppscount[1];
		pseq = &pps->ppsinfo.clear_sequence;
	}

	/* determine system time stamp according to refmode */
	dcount = 0;	/* keep GCC happy */
	switch (refmode & PPS_REFEVNT_RMASK) {
	case PPS_REFEVNT_CAPTURE:
		acount = pps->capcount;	/* use capture timestamp */
		break;

	case PPS_REFEVNT_CURRENT:
		acount = tcount;	/* use current timestamp */
		break;

	case PPS_REFEVNT_CAPCUR:
		/*
		 * calculate counter value between pps_capture() and
		 * pps_ref_event()
		 */
		dcount = tcount - pps->capcount;
		acount = (dcount / 2) + pps->capcount;
		break;

	default:		/* ignore call error silently */
		return;
	}
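	/*
	 * For PPS_REFEVNT_CAPCUR, acount is simply the midpoint between the
	 * raw count latched by pps_capture() and the count read just above:
	 * if capcount was 1000 and tcount is 1080, the reference timestamp
	 * is attributed to count 1040, on the assumption that the reference
	 * was read roughly halfway between the two.
	 */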
	/*
	 * If the timecounter changed, we cannot compare the count values, so
	 * we have to drop the rest of the PPS-stuff until the next event.
	 */
	if (pps->ppstc != pps->capth->th_counter) {
		pps->ppstc = pps->capth->th_counter;
		pps->capcount = acount;
		*pcount = acount;
		pps->ppscount[2] = acount;
#ifdef PPS_DEBUG
		if (ppsdebug & 0x1) {
			log(LOG_DEBUG,
			    "pps_ref_event(pps=%p, event=%d, ...): DROP (time-counter change)\n",
			    pps, event);
		}
#endif
		return;
	}

	pps->capcount = acount;

	/* Convert the count to a bintime. */
	bt = pps->capth->th_offset;
	bintime_addx(&bt, pps->capth->th_scale * (acount - pps->capth->th_offset_count));
	bintime_add(&bt, &timebasebin);

	if ((refmode & PPS_REFEVNT_PPS) == 0) {
		/* determine difference to reference time stamp */
		bt_ref = *ref_ts;

		btd = bt;
		bintime_sub(&btd, &bt_ref);

		/*
		 * simulate a PPS timestamp by dropping the fraction
		 * and applying the offset
		 */
		if (bt.frac >= (uint64_t)1<<63)	/* skip to nearest second */
			bt.sec++;
		bt.frac = 0;
		bintime_add(&bt, &btd);
	} else {
		/*
		 * create ref_ts from current time -
		 * we are supposed to be called on
		 * the second mark
		 */
		bt_ref = bt;
		if (bt_ref.frac >= (uint64_t)1<<63)	/* skip to nearest second */
			bt_ref.sec++;
		bt_ref.frac = 0;
	}

	/* convert bintime to timestamp */
	bintime2timespec(&bt, &ts);

	/* If the timecounter was wound up underneath us, bail out. */
	if (pps->capgen != pps->capth->th_generation)
		return;

	/* store time stamp */
	*pcount = pps->capcount;
	(*pseq)++;
	*tsp = ts;

	/* add offset correction */
	if (foff) {
		timespecadd(tsp, osp, tsp);
		if (tsp->tv_nsec < 0) {
			tsp->tv_nsec += 1000000000;
			tsp->tv_sec -= 1;
		}
	}

#ifdef PPS_DEBUG
	if (ppsdebug & 0x2) {
		struct timespec ts2;
		struct timespec ts3;

		bintime2timespec(&bt_ref, &ts2);

		bt.sec = 0;
		bt.frac = 0;

		if (refmode & PPS_REFEVNT_CAPCUR) {
			bintime_addx(&bt, pps->capth->th_scale * dcount);
		}
		bintime2timespec(&bt, &ts3);

		log(LOG_DEBUG, "ref_ts=%"PRIi64".%09"PRIi32
		    ", ts=%"PRIi64".%09"PRIi32", read latency=%"PRIi64" ns\n",
		    ts2.tv_sec, (int32_t)ts2.tv_nsec,
		    tsp->tv_sec, (int32_t)tsp->tv_nsec,
		    timespec2ns(&ts3));
	}
#endif

#ifdef PPS_SYNC
	if (fhard) {
		uint64_t scale;
		uint64_t div;

		/*
		 * Feed the NTP PLL/FLL.
		 * The FLL wants to know how many (hardware) nanoseconds
		 * elapsed since the previous event (mod 1 second) thus
		 * we are actually looking at the frequency difference scaled
		 * in nsec.
		 * As the counter time stamps are not truly at 1Hz
		 * we need to scale the count by the elapsed
		 * reference time.
		 * valid sampling interval: [0.5..2[ sec
		 */

		/* calculate elapsed raw count */
		tcount = pps->capcount - pps->ppscount[2];
		pps->ppscount[2] = pps->capcount;
		tcount &= pps->capth->th_counter->tc_counter_mask;

		/* calculate elapsed ref time */
		btd = bt_ref;
		bintime_sub(&btd, &pps->ref_time);
		pps->ref_time = bt_ref;

		/* check that we stay below 2 sec */
		if (btd.sec < 0 || btd.sec > 1)
			return;

		/* we want at least 0.5 sec between samples */
		if (btd.sec == 0 && btd.frac < (uint64_t)1<<63)
			return;

		/*
		 * calculate cycles per period by multiplying
		 * the frequency with the elapsed period
		 * we pick a fraction of 30 bits
		 * ~1ns resolution for elapsed time
		 */
		div = (uint64_t)btd.sec << 30;
		div |= (btd.frac >> 34) & (((uint64_t)1 << 30) - 1);
		div *= pps->capth->th_counter->tc_frequency;
		div >>= 30;
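		/*
		 * The fixed-point step above can be read as
		 * div = frequency * btd, with btd held in a 30-bit binary
		 * fraction (~1 ns resolution).  For example, with a 10 MHz
		 * counter and an elapsed reference interval of exactly 1 s,
		 * div is 10000000; for 0.5 s it would be 5000000, and the
		 * resulting scale (below) doubles accordingly.
		 */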
		if (div == 0)	/* safeguard */
			return;

		scale = (uint64_t)1 << 63;
		scale /= div;
		scale *= 2;

		bt.sec = 0;
		bt.frac = 0;
		bintime_addx(&bt, scale * tcount);
		bintime2timespec(&bt, &ts);

#ifdef PPS_DEBUG
		if (ppsdebug & 0x4) {
			struct timespec ts2;
			int64_t df;

			bintime2timespec(&bt_ref, &ts2);
			df = timespec2ns(&ts);
			if (df > 500000000)
				df -= 1000000000;
			log(LOG_DEBUG, "hardpps: ref_ts=%"PRIi64
			    ".%09"PRIi32", ts=%"PRIi64".%09"PRIi32
			    ", freqdiff=%"PRIi64" ns/s\n",
			    ts2.tv_sec, (int32_t)ts2.tv_nsec,
			    tsp->tv_sec, (int32_t)tsp->tv_nsec,
			    df);
		}
#endif

		hardpps(tsp, timespec2ns(&ts));
	}
#endif
}

/*
 * Timecounters need to be updated every so often to prevent the hardware
 * counter from overflowing.  Updating also recalculates the cached values
 * used by the get*() family of functions, so their precision depends on
 * the update frequency.
 */

static int tc_tick;

void
tc_ticktock(void)
{
	static int count;

	if (++count < tc_tick)
		return;
	count = 0;
	mutex_spin_enter(&timecounter_lock);
	if (timecounter_bad != 0) {
		/* An existing timecounter has gone bad, pick a new one. */
		(void)atomic_swap_uint(&timecounter_bad, 0);
		if (timecounter->tc_quality < 0) {
			tc_pick();
		}
	}
	tc_windup();
	mutex_spin_exit(&timecounter_lock);
}

void
inittimecounter(void)
{
	u_int p;

	mutex_init(&timecounter_lock, MUTEX_DEFAULT, IPL_HIGH);

	/*
	 * Set the initial timeout to
	 * max(1, <approx. number of hardclock ticks in a millisecond>).
	 * People should probably not use the sysctl to set the timeout
	 * to smaller than its initial value, since that value is the
	 * smallest reasonable one.  If they want better timestamps they
	 * should use the non-"get"* functions.
	 */
	if (hz > 1000)
		tc_tick = (hz + 500) / 1000;
	else
		tc_tick = 1;
	p = (tc_tick * 1000000) / hz;
	aprint_verbose("timecounter: Timecounters tick every %d.%03u msec\n",
	    p / 1000, p % 1000);

	/* warm up new timecounter (again) and get rolling. */
	(void)timecounter->tc_get_timecount(timecounter);
	(void)timecounter->tc_get_timecount(timecounter);
}
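/*
 * For reference, the windup cadence works out as follows: with hz = 100,
 * tc_tick is 1 and tc_windup() runs from every hardclock tick, i.e. every
 * 10.000 msec; with hz = 8000, tc_tick is (8000 + 500) / 1000 = 8, again
 * roughly every millisecond.  The get*() functions above therefore return
 * values that are at most about one millisecond (or one tick) stale.
 */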