1 /* $NetBSD: kern_tc.c,v 1.46 2013/09/14 20:52:43 martin Exp $ */ 2 3 /*- 4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /*- 33 * ---------------------------------------------------------------------------- 34 * "THE BEER-WARE LICENSE" (Revision 42): 35 * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you 36 * can do whatever you want with this stuff. If we meet some day, and you think 37 * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp 38 * --------------------------------------------------------------------------- 39 */ 40 41 #include <sys/cdefs.h> 42 /* __FBSDID("$FreeBSD: src/sys/kern/kern_tc.c,v 1.166 2005/09/19 22:16:31 andre Exp $"); */ 43 __KERNEL_RCSID(0, "$NetBSD: kern_tc.c,v 1.46 2013/09/14 20:52:43 martin Exp $"); 44 45 #ifdef _KERNEL_OPT 46 #include "opt_ntp.h" 47 #endif 48 49 #include <sys/param.h> 50 #include <sys/kernel.h> 51 #include <sys/reboot.h> /* XXX just to get AB_VERBOSE */ 52 #include <sys/sysctl.h> 53 #include <sys/syslog.h> 54 #include <sys/systm.h> 55 #include <sys/timepps.h> 56 #include <sys/timetc.h> 57 #include <sys/timex.h> 58 #include <sys/evcnt.h> 59 #include <sys/kauth.h> 60 #include <sys/mutex.h> 61 #include <sys/atomic.h> 62 #include <sys/xcall.h> 63 64 /* 65 * A large step happens on boot. This constant detects such steps. 66 * It is relatively small so that ntp_update_second gets called enough 67 * in the typical 'missed a couple of seconds' case, but doesn't loop 68 * forever when the time step is large. 69 */ 70 #define LARGE_STEP 200 71 72 /* 73 * Implement a dummy timecounter which we can use until we get a real one 74 * in the air. This allows the console and other early stuff to use 75 * time services. 76 */ 77 78 static u_int 79 dummy_get_timecount(struct timecounter *tc) 80 { 81 static u_int now; 82 83 return (++now); 84 } 85 86 static struct timecounter dummy_timecounter = { 87 dummy_get_timecount, 0, ~0u, 1000000, "dummy", -1000000, NULL, NULL, 88 }; 89 90 struct timehands { 91 /* These fields must be initialized by the driver. */ 92 struct timecounter *th_counter; /* active timecounter */ 93 int64_t th_adjustment; /* frequency adjustment */ 94 /* (NTP/adjtime) */ 95 u_int64_t th_scale; /* scale factor (counter */ 96 /* tick->time) */ 97 u_int64_t th_offset_count; /* offset at last time */ 98 /* update (tc_windup()) */ 99 struct bintime th_offset; /* bin (up)time at windup */ 100 struct timeval th_microtime; /* cached microtime */ 101 struct timespec th_nanotime; /* cached nanotime */ 102 /* Fields not to be copied in tc_windup start with th_generation. */ 103 volatile u_int th_generation; /* current genration */ 104 struct timehands *th_next; /* next timehand */ 105 }; 106 107 static struct timehands th0; 108 static struct timehands th9 = { .th_next = &th0, }; 109 static struct timehands th8 = { .th_next = &th9, }; 110 static struct timehands th7 = { .th_next = &th8, }; 111 static struct timehands th6 = { .th_next = &th7, }; 112 static struct timehands th5 = { .th_next = &th6, }; 113 static struct timehands th4 = { .th_next = &th5, }; 114 static struct timehands th3 = { .th_next = &th4, }; 115 static struct timehands th2 = { .th_next = &th3, }; 116 static struct timehands th1 = { .th_next = &th2, }; 117 static struct timehands th0 = { 118 .th_counter = &dummy_timecounter, 119 .th_scale = (uint64_t)-1 / 1000000, 120 .th_offset = { .sec = 1, .frac = 0 }, 121 .th_generation = 1, 122 .th_next = &th1, 123 }; 124 125 static struct timehands *volatile timehands = &th0; 126 struct timecounter *timecounter = &dummy_timecounter; 127 static struct timecounter *timecounters = &dummy_timecounter; 128 129 volatile time_t time_second = 1; 130 volatile time_t time_uptime = 1; 131 132 static struct bintime timebasebin; 133 134 static int timestepwarnings; 135 136 kmutex_t timecounter_lock; 137 static u_int timecounter_mods; 138 static volatile int timecounter_removals = 1; 139 static u_int timecounter_bad; 140 141 #ifdef __FreeBSD__ 142 SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW, 143 ×tepwarnings, 0, ""); 144 #endif /* __FreeBSD__ */ 145 146 /* 147 * sysctl helper routine for kern.timercounter.hardware 148 */ 149 static int 150 sysctl_kern_timecounter_hardware(SYSCTLFN_ARGS) 151 { 152 struct sysctlnode node; 153 int error; 154 char newname[MAX_TCNAMELEN]; 155 struct timecounter *newtc, *tc; 156 157 tc = timecounter; 158 159 strlcpy(newname, tc->tc_name, sizeof(newname)); 160 161 node = *rnode; 162 node.sysctl_data = newname; 163 node.sysctl_size = sizeof(newname); 164 165 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 166 167 if (error || 168 newp == NULL || 169 strncmp(newname, tc->tc_name, sizeof(newname)) == 0) 170 return error; 171 172 if (l != NULL && (error = kauth_authorize_system(l->l_cred, 173 KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_TIMECOUNTERS, newname, 174 NULL, NULL)) != 0) 175 return (error); 176 177 if (!cold) 178 mutex_spin_enter(&timecounter_lock); 179 error = EINVAL; 180 for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) { 181 if (strcmp(newname, newtc->tc_name) != 0) 182 continue; 183 /* Warm up new timecounter. */ 184 (void)newtc->tc_get_timecount(newtc); 185 (void)newtc->tc_get_timecount(newtc); 186 timecounter = newtc; 187 error = 0; 188 break; 189 } 190 if (!cold) 191 mutex_spin_exit(&timecounter_lock); 192 return error; 193 } 194 195 static int 196 sysctl_kern_timecounter_choice(SYSCTLFN_ARGS) 197 { 198 char buf[MAX_TCNAMELEN+48]; 199 char *where; 200 const char *spc; 201 struct timecounter *tc; 202 size_t needed, left, slen; 203 int error, mods; 204 205 if (newp != NULL) 206 return (EPERM); 207 if (namelen != 0) 208 return (EINVAL); 209 210 mutex_spin_enter(&timecounter_lock); 211 retry: 212 spc = ""; 213 error = 0; 214 needed = 0; 215 left = *oldlenp; 216 where = oldp; 217 for (tc = timecounters; error == 0 && tc != NULL; tc = tc->tc_next) { 218 if (where == NULL) { 219 needed += sizeof(buf); /* be conservative */ 220 } else { 221 slen = snprintf(buf, sizeof(buf), "%s%s(q=%d, f=%" PRId64 222 " Hz)", spc, tc->tc_name, tc->tc_quality, 223 tc->tc_frequency); 224 if (left < slen + 1) 225 break; 226 mods = timecounter_mods; 227 mutex_spin_exit(&timecounter_lock); 228 error = copyout(buf, where, slen + 1); 229 mutex_spin_enter(&timecounter_lock); 230 if (mods != timecounter_mods) { 231 goto retry; 232 } 233 spc = " "; 234 where += slen; 235 needed += slen; 236 left -= slen; 237 } 238 } 239 mutex_spin_exit(&timecounter_lock); 240 241 *oldlenp = needed; 242 return (error); 243 } 244 245 SYSCTL_SETUP(sysctl_timecounter_setup, "sysctl timecounter setup") 246 { 247 const struct sysctlnode *node; 248 249 sysctl_createv(clog, 0, NULL, &node, 250 CTLFLAG_PERMANENT, 251 CTLTYPE_NODE, "timecounter", 252 SYSCTL_DESCR("time counter information"), 253 NULL, 0, NULL, 0, 254 CTL_KERN, CTL_CREATE, CTL_EOL); 255 256 if (node != NULL) { 257 sysctl_createv(clog, 0, NULL, NULL, 258 CTLFLAG_PERMANENT, 259 CTLTYPE_STRING, "choice", 260 SYSCTL_DESCR("available counters"), 261 sysctl_kern_timecounter_choice, 0, NULL, 0, 262 CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL); 263 264 sysctl_createv(clog, 0, NULL, NULL, 265 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 266 CTLTYPE_STRING, "hardware", 267 SYSCTL_DESCR("currently active time counter"), 268 sysctl_kern_timecounter_hardware, 0, NULL, MAX_TCNAMELEN, 269 CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL); 270 271 sysctl_createv(clog, 0, NULL, NULL, 272 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 273 CTLTYPE_INT, "timestepwarnings", 274 SYSCTL_DESCR("log time steps"), 275 NULL, 0, ×tepwarnings, 0, 276 CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL); 277 } 278 } 279 280 #ifdef TC_COUNTERS 281 #define TC_STATS(name) \ 282 static struct evcnt n##name = \ 283 EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "timecounter", #name); \ 284 EVCNT_ATTACH_STATIC(n##name) 285 TC_STATS(binuptime); TC_STATS(nanouptime); TC_STATS(microuptime); 286 TC_STATS(bintime); TC_STATS(nanotime); TC_STATS(microtime); 287 TC_STATS(getbinuptime); TC_STATS(getnanouptime); TC_STATS(getmicrouptime); 288 TC_STATS(getbintime); TC_STATS(getnanotime); TC_STATS(getmicrotime); 289 TC_STATS(setclock); 290 #define TC_COUNT(var) var.ev_count++ 291 #undef TC_STATS 292 #else 293 #define TC_COUNT(var) /* nothing */ 294 #endif /* TC_COUNTERS */ 295 296 static void tc_windup(void); 297 298 /* 299 * Return the difference between the timehands' counter value now and what 300 * was when we copied it to the timehands' offset_count. 301 */ 302 static inline u_int 303 tc_delta(struct timehands *th) 304 { 305 struct timecounter *tc; 306 307 tc = th->th_counter; 308 return ((tc->tc_get_timecount(tc) - 309 th->th_offset_count) & tc->tc_counter_mask); 310 } 311 312 /* 313 * Functions for reading the time. We have to loop until we are sure that 314 * the timehands that we operated on was not updated under our feet. See 315 * the comment in <sys/timevar.h> for a description of these 12 functions. 316 */ 317 318 void 319 binuptime(struct bintime *bt) 320 { 321 struct timehands *th; 322 lwp_t *l; 323 u_int lgen, gen; 324 325 TC_COUNT(nbinuptime); 326 327 /* 328 * Provide exclusion against tc_detach(). 329 * 330 * We record the number of timecounter removals before accessing 331 * timecounter state. Note that the LWP can be using multiple 332 * "generations" at once, due to interrupts (interrupted while in 333 * this function). Hardware interrupts will borrow the interrupted 334 * LWP's l_tcgen value for this purpose, and can themselves be 335 * interrupted by higher priority interrupts. In this case we need 336 * to ensure that the oldest generation in use is recorded. 337 * 338 * splsched() is too expensive to use, so we take care to structure 339 * this code in such a way that it is not required. Likewise, we 340 * do not disable preemption. 341 * 342 * Memory barriers are also too expensive to use for such a 343 * performance critical function. The good news is that we do not 344 * need memory barriers for this type of exclusion, as the thread 345 * updating timecounter_removals will issue a broadcast cross call 346 * before inspecting our l_tcgen value (this elides memory ordering 347 * issues). 348 */ 349 l = curlwp; 350 lgen = l->l_tcgen; 351 if (__predict_true(lgen == 0)) { 352 l->l_tcgen = timecounter_removals; 353 } 354 __insn_barrier(); 355 356 do { 357 th = timehands; 358 gen = th->th_generation; 359 *bt = th->th_offset; 360 bintime_addx(bt, th->th_scale * tc_delta(th)); 361 } while (gen == 0 || gen != th->th_generation); 362 363 __insn_barrier(); 364 l->l_tcgen = lgen; 365 } 366 367 void 368 nanouptime(struct timespec *tsp) 369 { 370 struct bintime bt; 371 372 TC_COUNT(nnanouptime); 373 binuptime(&bt); 374 bintime2timespec(&bt, tsp); 375 } 376 377 void 378 microuptime(struct timeval *tvp) 379 { 380 struct bintime bt; 381 382 TC_COUNT(nmicrouptime); 383 binuptime(&bt); 384 bintime2timeval(&bt, tvp); 385 } 386 387 void 388 bintime(struct bintime *bt) 389 { 390 391 TC_COUNT(nbintime); 392 binuptime(bt); 393 bintime_add(bt, &timebasebin); 394 } 395 396 void 397 nanotime(struct timespec *tsp) 398 { 399 struct bintime bt; 400 401 TC_COUNT(nnanotime); 402 bintime(&bt); 403 bintime2timespec(&bt, tsp); 404 } 405 406 void 407 microtime(struct timeval *tvp) 408 { 409 struct bintime bt; 410 411 TC_COUNT(nmicrotime); 412 bintime(&bt); 413 bintime2timeval(&bt, tvp); 414 } 415 416 void 417 getbinuptime(struct bintime *bt) 418 { 419 struct timehands *th; 420 u_int gen; 421 422 TC_COUNT(ngetbinuptime); 423 do { 424 th = timehands; 425 gen = th->th_generation; 426 *bt = th->th_offset; 427 } while (gen == 0 || gen != th->th_generation); 428 } 429 430 void 431 getnanouptime(struct timespec *tsp) 432 { 433 struct timehands *th; 434 u_int gen; 435 436 TC_COUNT(ngetnanouptime); 437 do { 438 th = timehands; 439 gen = th->th_generation; 440 bintime2timespec(&th->th_offset, tsp); 441 } while (gen == 0 || gen != th->th_generation); 442 } 443 444 void 445 getmicrouptime(struct timeval *tvp) 446 { 447 struct timehands *th; 448 u_int gen; 449 450 TC_COUNT(ngetmicrouptime); 451 do { 452 th = timehands; 453 gen = th->th_generation; 454 bintime2timeval(&th->th_offset, tvp); 455 } while (gen == 0 || gen != th->th_generation); 456 } 457 458 void 459 getbintime(struct bintime *bt) 460 { 461 struct timehands *th; 462 u_int gen; 463 464 TC_COUNT(ngetbintime); 465 do { 466 th = timehands; 467 gen = th->th_generation; 468 *bt = th->th_offset; 469 } while (gen == 0 || gen != th->th_generation); 470 bintime_add(bt, &timebasebin); 471 } 472 473 void 474 getnanotime(struct timespec *tsp) 475 { 476 struct timehands *th; 477 u_int gen; 478 479 TC_COUNT(ngetnanotime); 480 do { 481 th = timehands; 482 gen = th->th_generation; 483 *tsp = th->th_nanotime; 484 } while (gen == 0 || gen != th->th_generation); 485 } 486 487 void 488 getmicrotime(struct timeval *tvp) 489 { 490 struct timehands *th; 491 u_int gen; 492 493 TC_COUNT(ngetmicrotime); 494 do { 495 th = timehands; 496 gen = th->th_generation; 497 *tvp = th->th_microtime; 498 } while (gen == 0 || gen != th->th_generation); 499 } 500 501 /* 502 * Initialize a new timecounter and possibly use it. 503 */ 504 void 505 tc_init(struct timecounter *tc) 506 { 507 u_int u; 508 509 u = tc->tc_frequency / tc->tc_counter_mask; 510 /* XXX: We need some margin here, 10% is a guess */ 511 u *= 11; 512 u /= 10; 513 if (u > hz && tc->tc_quality >= 0) { 514 tc->tc_quality = -2000; 515 aprint_verbose( 516 "timecounter: Timecounter \"%s\" frequency %ju Hz", 517 tc->tc_name, (uintmax_t)tc->tc_frequency); 518 aprint_verbose(" -- Insufficient hz, needs at least %u\n", u); 519 } else if (tc->tc_quality >= 0 || bootverbose) { 520 aprint_verbose( 521 "timecounter: Timecounter \"%s\" frequency %ju Hz " 522 "quality %d\n", tc->tc_name, (uintmax_t)tc->tc_frequency, 523 tc->tc_quality); 524 } 525 526 mutex_spin_enter(&timecounter_lock); 527 tc->tc_next = timecounters; 528 timecounters = tc; 529 timecounter_mods++; 530 /* 531 * Never automatically use a timecounter with negative quality. 532 * Even though we run on the dummy counter, switching here may be 533 * worse since this timecounter may not be monotonous. 534 */ 535 if (tc->tc_quality >= 0 && (tc->tc_quality > timecounter->tc_quality || 536 (tc->tc_quality == timecounter->tc_quality && 537 tc->tc_frequency > timecounter->tc_frequency))) { 538 (void)tc->tc_get_timecount(tc); 539 (void)tc->tc_get_timecount(tc); 540 timecounter = tc; 541 tc_windup(); 542 } 543 mutex_spin_exit(&timecounter_lock); 544 } 545 546 /* 547 * Pick a new timecounter due to the existing counter going bad. 548 */ 549 static void 550 tc_pick(void) 551 { 552 struct timecounter *best, *tc; 553 554 KASSERT(mutex_owned(&timecounter_lock)); 555 556 for (best = tc = timecounters; tc != NULL; tc = tc->tc_next) { 557 if (tc->tc_quality > best->tc_quality) 558 best = tc; 559 else if (tc->tc_quality < best->tc_quality) 560 continue; 561 else if (tc->tc_frequency > best->tc_frequency) 562 best = tc; 563 } 564 (void)best->tc_get_timecount(best); 565 (void)best->tc_get_timecount(best); 566 timecounter = best; 567 } 568 569 /* 570 * A timecounter has gone bad, arrange to pick a new one at the next 571 * clock tick. 572 */ 573 void 574 tc_gonebad(struct timecounter *tc) 575 { 576 577 tc->tc_quality = -100; 578 membar_producer(); 579 atomic_inc_uint(&timecounter_bad); 580 } 581 582 /* 583 * Stop using a timecounter and remove it from the timecounters list. 584 */ 585 int 586 tc_detach(struct timecounter *target) 587 { 588 struct timecounter *tc; 589 struct timecounter **tcp = NULL; 590 int removals; 591 uint64_t where; 592 lwp_t *l; 593 594 /* First, find the timecounter. */ 595 mutex_spin_enter(&timecounter_lock); 596 for (tcp = &timecounters, tc = timecounters; 597 tc != NULL; 598 tcp = &tc->tc_next, tc = tc->tc_next) { 599 if (tc == target) 600 break; 601 } 602 if (tc == NULL) { 603 mutex_spin_exit(&timecounter_lock); 604 return ESRCH; 605 } 606 607 /* And now, remove it. */ 608 *tcp = tc->tc_next; 609 if (timecounter == target) { 610 tc_pick(); 611 tc_windup(); 612 } 613 timecounter_mods++; 614 removals = timecounter_removals++; 615 mutex_spin_exit(&timecounter_lock); 616 617 /* 618 * We now have to determine if any threads in the system are still 619 * making use of this timecounter. 620 * 621 * We issue a broadcast cross call to elide memory ordering issues, 622 * then scan all LWPs in the system looking at each's timecounter 623 * generation number. We need to see a value of zero (not actively 624 * using a timecounter) or a value greater than our removal value. 625 * 626 * We may race with threads that read `timecounter_removals' and 627 * and then get preempted before updating `l_tcgen'. This is not 628 * a problem, since it means that these threads have not yet started 629 * accessing timecounter state. All we do need is one clean 630 * snapshot of the system where every thread appears not to be using 631 * old timecounter state. 632 */ 633 for (;;) { 634 where = xc_broadcast(0, (xcfunc_t)nullop, NULL, NULL); 635 xc_wait(where); 636 637 mutex_enter(proc_lock); 638 LIST_FOREACH(l, &alllwp, l_list) { 639 if (l->l_tcgen == 0 || l->l_tcgen > removals) { 640 /* 641 * Not using timecounter or old timecounter 642 * state at time of our xcall or later. 643 */ 644 continue; 645 } 646 break; 647 } 648 mutex_exit(proc_lock); 649 650 /* 651 * If the timecounter is still in use, wait at least 10ms 652 * before retrying. 653 */ 654 if (l == NULL) { 655 return 0; 656 } 657 (void)kpause("tcdetach", false, mstohz(10), NULL); 658 } 659 } 660 661 /* Report the frequency of the current timecounter. */ 662 u_int64_t 663 tc_getfrequency(void) 664 { 665 666 return (timehands->th_counter->tc_frequency); 667 } 668 669 /* 670 * Step our concept of UTC. This is done by modifying our estimate of 671 * when we booted. 672 */ 673 void 674 tc_setclock(const struct timespec *ts) 675 { 676 struct timespec ts2; 677 struct bintime bt, bt2; 678 679 mutex_spin_enter(&timecounter_lock); 680 TC_COUNT(nsetclock); 681 binuptime(&bt2); 682 timespec2bintime(ts, &bt); 683 bintime_sub(&bt, &bt2); 684 bintime_add(&bt2, &timebasebin); 685 timebasebin = bt; 686 tc_windup(); 687 mutex_spin_exit(&timecounter_lock); 688 689 if (timestepwarnings) { 690 bintime2timespec(&bt2, &ts2); 691 log(LOG_INFO, 692 "Time stepped from %lld.%09ld to %lld.%09ld\n", 693 (long long)ts2.tv_sec, ts2.tv_nsec, 694 (long long)ts->tv_sec, ts->tv_nsec); 695 } 696 } 697 698 /* 699 * Initialize the next struct timehands in the ring and make 700 * it the active timehands. Along the way we might switch to a different 701 * timecounter and/or do seconds processing in NTP. Slightly magic. 702 */ 703 static void 704 tc_windup(void) 705 { 706 struct bintime bt; 707 struct timehands *th, *tho; 708 u_int64_t scale; 709 u_int delta, ncount, ogen; 710 int i, s_update; 711 time_t t; 712 713 KASSERT(mutex_owned(&timecounter_lock)); 714 715 s_update = 0; 716 717 /* 718 * Make the next timehands a copy of the current one, but do not 719 * overwrite the generation or next pointer. While we update 720 * the contents, the generation must be zero. Ensure global 721 * visibility of the generation before proceeding. 722 */ 723 tho = timehands; 724 th = tho->th_next; 725 ogen = th->th_generation; 726 th->th_generation = 0; 727 membar_producer(); 728 bcopy(tho, th, offsetof(struct timehands, th_generation)); 729 730 /* 731 * Capture a timecounter delta on the current timecounter and if 732 * changing timecounters, a counter value from the new timecounter. 733 * Update the offset fields accordingly. 734 */ 735 delta = tc_delta(th); 736 if (th->th_counter != timecounter) 737 ncount = timecounter->tc_get_timecount(timecounter); 738 else 739 ncount = 0; 740 th->th_offset_count += delta; 741 bintime_addx(&th->th_offset, th->th_scale * delta); 742 743 /* 744 * Hardware latching timecounters may not generate interrupts on 745 * PPS events, so instead we poll them. There is a finite risk that 746 * the hardware might capture a count which is later than the one we 747 * got above, and therefore possibly in the next NTP second which might 748 * have a different rate than the current NTP second. It doesn't 749 * matter in practice. 750 */ 751 if (tho->th_counter->tc_poll_pps) 752 tho->th_counter->tc_poll_pps(tho->th_counter); 753 754 /* 755 * Deal with NTP second processing. The for loop normally 756 * iterates at most once, but in extreme situations it might 757 * keep NTP sane if timeouts are not run for several seconds. 758 * At boot, the time step can be large when the TOD hardware 759 * has been read, so on really large steps, we call 760 * ntp_update_second only twice. We need to call it twice in 761 * case we missed a leap second. 762 * If NTP is not compiled in ntp_update_second still calculates 763 * the adjustment resulting from adjtime() calls. 764 */ 765 bt = th->th_offset; 766 bintime_add(&bt, &timebasebin); 767 i = bt.sec - tho->th_microtime.tv_sec; 768 if (i > LARGE_STEP) 769 i = 2; 770 for (; i > 0; i--) { 771 t = bt.sec; 772 ntp_update_second(&th->th_adjustment, &bt.sec); 773 s_update = 1; 774 if (bt.sec != t) 775 timebasebin.sec += bt.sec - t; 776 } 777 778 /* Update the UTC timestamps used by the get*() functions. */ 779 /* XXX shouldn't do this here. Should force non-`get' versions. */ 780 bintime2timeval(&bt, &th->th_microtime); 781 bintime2timespec(&bt, &th->th_nanotime); 782 /* Now is a good time to change timecounters. */ 783 if (th->th_counter != timecounter) { 784 th->th_counter = timecounter; 785 th->th_offset_count = ncount; 786 s_update = 1; 787 } 788 789 /*- 790 * Recalculate the scaling factor. We want the number of 1/2^64 791 * fractions of a second per period of the hardware counter, taking 792 * into account the th_adjustment factor which the NTP PLL/adjtime(2) 793 * processing provides us with. 794 * 795 * The th_adjustment is nanoseconds per second with 32 bit binary 796 * fraction and we want 64 bit binary fraction of second: 797 * 798 * x = a * 2^32 / 10^9 = a * 4.294967296 799 * 800 * The range of th_adjustment is +/- 5000PPM so inside a 64bit int 801 * we can only multiply by about 850 without overflowing, but that 802 * leaves suitably precise fractions for multiply before divide. 803 * 804 * Divide before multiply with a fraction of 2199/512 results in a 805 * systematic undercompensation of 10PPM of th_adjustment. On a 806 * 5000PPM adjustment this is a 0.05PPM error. This is acceptable. 807 * 808 * We happily sacrifice the lowest of the 64 bits of our result 809 * to the goddess of code clarity. 810 * 811 */ 812 if (s_update) { 813 scale = (u_int64_t)1 << 63; 814 scale += (th->th_adjustment / 1024) * 2199; 815 scale /= th->th_counter->tc_frequency; 816 th->th_scale = scale * 2; 817 } 818 /* 819 * Now that the struct timehands is again consistent, set the new 820 * generation number, making sure to not make it zero. Ensure 821 * changes are globally visible before changing. 822 */ 823 if (++ogen == 0) 824 ogen = 1; 825 membar_producer(); 826 th->th_generation = ogen; 827 828 /* 829 * Go live with the new struct timehands. Ensure changes are 830 * globally visible before changing. 831 */ 832 time_second = th->th_microtime.tv_sec; 833 time_uptime = th->th_offset.sec; 834 membar_producer(); 835 timehands = th; 836 837 /* 838 * Force users of the old timehand to move on. This is 839 * necessary for MP systems; we need to ensure that the 840 * consumers will move away from the old timehand before 841 * we begin updating it again when we eventually wrap 842 * around. 843 */ 844 if (++tho->th_generation == 0) 845 tho->th_generation = 1; 846 } 847 848 /* 849 * RFC 2783 PPS-API implementation. 850 */ 851 852 int 853 pps_ioctl(u_long cmd, void *data, struct pps_state *pps) 854 { 855 pps_params_t *app; 856 pps_info_t *pipi; 857 #ifdef PPS_SYNC 858 int *epi; 859 #endif 860 861 KASSERT(mutex_owned(&timecounter_lock)); 862 863 KASSERT(pps != NULL); 864 865 switch (cmd) { 866 case PPS_IOC_CREATE: 867 return (0); 868 case PPS_IOC_DESTROY: 869 return (0); 870 case PPS_IOC_SETPARAMS: 871 app = (pps_params_t *)data; 872 if (app->mode & ~pps->ppscap) 873 return (EINVAL); 874 pps->ppsparam = *app; 875 return (0); 876 case PPS_IOC_GETPARAMS: 877 app = (pps_params_t *)data; 878 *app = pps->ppsparam; 879 app->api_version = PPS_API_VERS_1; 880 return (0); 881 case PPS_IOC_GETCAP: 882 *(int*)data = pps->ppscap; 883 return (0); 884 case PPS_IOC_FETCH: 885 pipi = (pps_info_t *)data; 886 pps->ppsinfo.current_mode = pps->ppsparam.mode; 887 *pipi = pps->ppsinfo; 888 return (0); 889 case PPS_IOC_KCBIND: 890 #ifdef PPS_SYNC 891 epi = (int *)data; 892 /* XXX Only root should be able to do this */ 893 if (*epi & ~pps->ppscap) 894 return (EINVAL); 895 pps->kcmode = *epi; 896 return (0); 897 #else 898 return (EOPNOTSUPP); 899 #endif 900 default: 901 return (EPASSTHROUGH); 902 } 903 } 904 905 void 906 pps_init(struct pps_state *pps) 907 { 908 909 KASSERT(mutex_owned(&timecounter_lock)); 910 911 pps->ppscap |= PPS_TSFMT_TSPEC; 912 if (pps->ppscap & PPS_CAPTUREASSERT) 913 pps->ppscap |= PPS_OFFSETASSERT; 914 if (pps->ppscap & PPS_CAPTURECLEAR) 915 pps->ppscap |= PPS_OFFSETCLEAR; 916 } 917 918 /* 919 * capture a timetamp in the pps structure 920 */ 921 void 922 pps_capture(struct pps_state *pps) 923 { 924 struct timehands *th; 925 926 KASSERT(mutex_owned(&timecounter_lock)); 927 KASSERT(pps != NULL); 928 929 th = timehands; 930 pps->capgen = th->th_generation; 931 pps->capth = th; 932 pps->capcount = (u_int64_t)tc_delta(th) + th->th_offset_count; 933 if (pps->capgen != th->th_generation) 934 pps->capgen = 0; 935 } 936 937 #ifdef PPS_DEBUG 938 int ppsdebug = 0; 939 #endif 940 941 /* 942 * process a pps_capture()ed event 943 */ 944 void 945 pps_event(struct pps_state *pps, int event) 946 { 947 pps_ref_event(pps, event, NULL, PPS_REFEVNT_PPS|PPS_REFEVNT_CAPTURE); 948 } 949 950 /* 951 * extended pps api / kernel pll/fll entry point 952 * 953 * feed reference time stamps to PPS engine 954 * 955 * will simulate a PPS event and feed 956 * the NTP PLL/FLL if requested. 957 * 958 * the ref time stamps should be roughly once 959 * a second but do not need to be exactly in phase 960 * with the UTC second but should be close to it. 961 * this relaxation of requirements allows callout 962 * driven timestamping mechanisms to feed to pps 963 * capture/kernel pll logic. 964 * 965 * calling pattern is: 966 * pps_capture() (for PPS_REFEVNT_{CAPTURE|CAPCUR}) 967 * read timestamp from reference source 968 * pps_ref_event() 969 * 970 * supported refmodes: 971 * PPS_REFEVNT_CAPTURE 972 * use system timestamp of pps_capture() 973 * PPS_REFEVNT_CURRENT 974 * use system timestamp of this call 975 * PPS_REFEVNT_CAPCUR 976 * use average of read capture and current system time stamp 977 * PPS_REFEVNT_PPS 978 * assume timestamp on second mark - ref_ts is ignored 979 * 980 */ 981 982 void 983 pps_ref_event(struct pps_state *pps, 984 int event, 985 struct bintime *ref_ts, 986 int refmode 987 ) 988 { 989 struct bintime bt; /* current time */ 990 struct bintime btd; /* time difference */ 991 struct bintime bt_ref; /* reference time */ 992 struct timespec ts, *tsp, *osp; 993 struct timehands *th; 994 u_int64_t tcount, acount, dcount, *pcount; 995 int foff, gen; 996 #ifdef PPS_SYNC 997 int fhard; 998 #endif 999 pps_seq_t *pseq; 1000 1001 KASSERT(mutex_owned(&timecounter_lock)); 1002 1003 KASSERT(pps != NULL); 1004 1005 /* pick up current time stamp if needed */ 1006 if (refmode & (PPS_REFEVNT_CURRENT|PPS_REFEVNT_CAPCUR)) { 1007 /* pick up current time stamp */ 1008 th = timehands; 1009 gen = th->th_generation; 1010 tcount = (u_int64_t)tc_delta(th) + th->th_offset_count; 1011 if (gen != th->th_generation) 1012 gen = 0; 1013 1014 /* If the timecounter was wound up underneath us, bail out. */ 1015 if (pps->capgen == 0 || 1016 pps->capgen != pps->capth->th_generation || 1017 gen == 0 || 1018 gen != pps->capgen) { 1019 #ifdef PPS_DEBUG 1020 if (ppsdebug & 0x1) { 1021 log(LOG_DEBUG, 1022 "pps_ref_event(pps=%p, event=%d, ...): DROP (wind-up)\n", 1023 pps, event); 1024 } 1025 #endif 1026 return; 1027 } 1028 } else { 1029 tcount = 0; /* keep GCC happy */ 1030 } 1031 1032 #ifdef PPS_DEBUG 1033 if (ppsdebug & 0x1) { 1034 struct timespec tmsp; 1035 1036 if (ref_ts == NULL) { 1037 tmsp.tv_sec = 0; 1038 tmsp.tv_nsec = 0; 1039 } else { 1040 bintime2timespec(ref_ts, &tmsp); 1041 } 1042 1043 log(LOG_DEBUG, 1044 "pps_ref_event(pps=%p, event=%d, ref_ts=%"PRIi64 1045 ".%09"PRIi32", refmode=0x%1x)\n", 1046 pps, event, tmsp.tv_sec, (int32_t)tmsp.tv_nsec, refmode); 1047 } 1048 #endif 1049 1050 /* setup correct event references */ 1051 if (event == PPS_CAPTUREASSERT) { 1052 tsp = &pps->ppsinfo.assert_timestamp; 1053 osp = &pps->ppsparam.assert_offset; 1054 foff = pps->ppsparam.mode & PPS_OFFSETASSERT; 1055 #ifdef PPS_SYNC 1056 fhard = pps->kcmode & PPS_CAPTUREASSERT; 1057 #endif 1058 pcount = &pps->ppscount[0]; 1059 pseq = &pps->ppsinfo.assert_sequence; 1060 } else { 1061 tsp = &pps->ppsinfo.clear_timestamp; 1062 osp = &pps->ppsparam.clear_offset; 1063 foff = pps->ppsparam.mode & PPS_OFFSETCLEAR; 1064 #ifdef PPS_SYNC 1065 fhard = pps->kcmode & PPS_CAPTURECLEAR; 1066 #endif 1067 pcount = &pps->ppscount[1]; 1068 pseq = &pps->ppsinfo.clear_sequence; 1069 } 1070 1071 /* determine system time stamp according to refmode */ 1072 dcount = 0; /* keep GCC happy */ 1073 switch (refmode & PPS_REFEVNT_RMASK) { 1074 case PPS_REFEVNT_CAPTURE: 1075 acount = pps->capcount; /* use capture timestamp */ 1076 break; 1077 1078 case PPS_REFEVNT_CURRENT: 1079 acount = tcount; /* use current timestamp */ 1080 break; 1081 1082 case PPS_REFEVNT_CAPCUR: 1083 /* 1084 * calculate counter value between pps_capture() and 1085 * pps_ref_event() 1086 */ 1087 dcount = tcount - pps->capcount; 1088 acount = (dcount / 2) + pps->capcount; 1089 break; 1090 1091 default: /* ignore call error silently */ 1092 return; 1093 } 1094 1095 /* 1096 * If the timecounter changed, we cannot compare the count values, so 1097 * we have to drop the rest of the PPS-stuff until the next event. 1098 */ 1099 if (pps->ppstc != pps->capth->th_counter) { 1100 pps->ppstc = pps->capth->th_counter; 1101 pps->capcount = acount; 1102 *pcount = acount; 1103 pps->ppscount[2] = acount; 1104 #ifdef PPS_DEBUG 1105 if (ppsdebug & 0x1) { 1106 log(LOG_DEBUG, 1107 "pps_ref_event(pps=%p, event=%d, ...): DROP (time-counter change)\n", 1108 pps, event); 1109 } 1110 #endif 1111 return; 1112 } 1113 1114 pps->capcount = acount; 1115 1116 /* Convert the count to a bintime. */ 1117 bt = pps->capth->th_offset; 1118 bintime_addx(&bt, pps->capth->th_scale * (acount - pps->capth->th_offset_count)); 1119 bintime_add(&bt, &timebasebin); 1120 1121 if ((refmode & PPS_REFEVNT_PPS) == 0) { 1122 /* determine difference to reference time stamp */ 1123 bt_ref = *ref_ts; 1124 1125 btd = bt; 1126 bintime_sub(&btd, &bt_ref); 1127 1128 /* 1129 * simulate a PPS timestamp by dropping the fraction 1130 * and applying the offset 1131 */ 1132 if (bt.frac >= (uint64_t)1<<63) /* skip to nearest second */ 1133 bt.sec++; 1134 bt.frac = 0; 1135 bintime_add(&bt, &btd); 1136 } else { 1137 /* 1138 * create ref_ts from current time - 1139 * we are supposed to be called on 1140 * the second mark 1141 */ 1142 bt_ref = bt; 1143 if (bt_ref.frac >= (uint64_t)1<<63) /* skip to nearest second */ 1144 bt_ref.sec++; 1145 bt_ref.frac = 0; 1146 } 1147 1148 /* convert bintime to timestamp */ 1149 bintime2timespec(&bt, &ts); 1150 1151 /* If the timecounter was wound up underneath us, bail out. */ 1152 if (pps->capgen != pps->capth->th_generation) 1153 return; 1154 1155 /* store time stamp */ 1156 *pcount = pps->capcount; 1157 (*pseq)++; 1158 *tsp = ts; 1159 1160 /* add offset correction */ 1161 if (foff) { 1162 timespecadd(tsp, osp, tsp); 1163 if (tsp->tv_nsec < 0) { 1164 tsp->tv_nsec += 1000000000; 1165 tsp->tv_sec -= 1; 1166 } 1167 } 1168 1169 #ifdef PPS_DEBUG 1170 if (ppsdebug & 0x2) { 1171 struct timespec ts2; 1172 struct timespec ts3; 1173 1174 bintime2timespec(&bt_ref, &ts2); 1175 1176 bt.sec = 0; 1177 bt.frac = 0; 1178 1179 if (refmode & PPS_REFEVNT_CAPCUR) { 1180 bintime_addx(&bt, pps->capth->th_scale * dcount); 1181 } 1182 bintime2timespec(&bt, &ts3); 1183 1184 log(LOG_DEBUG, "ref_ts=%"PRIi64".%09"PRIi32 1185 ", ts=%"PRIi64".%09"PRIi32", read latency=%"PRIi64" ns\n", 1186 ts2.tv_sec, (int32_t)ts2.tv_nsec, 1187 tsp->tv_sec, (int32_t)tsp->tv_nsec, 1188 timespec2ns(&ts3)); 1189 } 1190 #endif 1191 1192 #ifdef PPS_SYNC 1193 if (fhard) { 1194 uint64_t scale; 1195 uint64_t div; 1196 1197 /* 1198 * Feed the NTP PLL/FLL. 1199 * The FLL wants to know how many (hardware) nanoseconds 1200 * elapsed since the previous event (mod 1 second) thus 1201 * we are actually looking at the frequency difference scaled 1202 * in nsec. 1203 * As the counter time stamps are not truly at 1Hz 1204 * we need to scale the count by the elapsed 1205 * reference time. 1206 * valid sampling interval: [0.5..2[ sec 1207 */ 1208 1209 /* calculate elapsed raw count */ 1210 tcount = pps->capcount - pps->ppscount[2]; 1211 pps->ppscount[2] = pps->capcount; 1212 tcount &= pps->capth->th_counter->tc_counter_mask; 1213 1214 /* calculate elapsed ref time */ 1215 btd = bt_ref; 1216 bintime_sub(&btd, &pps->ref_time); 1217 pps->ref_time = bt_ref; 1218 1219 /* check that we stay below 2 sec */ 1220 if (btd.sec < 0 || btd.sec > 1) 1221 return; 1222 1223 /* we want at least 0.5 sec between samples */ 1224 if (btd.sec == 0 && btd.frac < (uint64_t)1<<63) 1225 return; 1226 1227 /* 1228 * calculate cycles per period by multiplying 1229 * the frequency with the elapsed period 1230 * we pick a fraction of 30 bits 1231 * ~1ns resolution for elapsed time 1232 */ 1233 div = (uint64_t)btd.sec << 30; 1234 div |= (btd.frac >> 34) & (((uint64_t)1 << 30) - 1); 1235 div *= pps->capth->th_counter->tc_frequency; 1236 div >>= 30; 1237 1238 if (div == 0) /* safeguard */ 1239 return; 1240 1241 scale = (uint64_t)1 << 63; 1242 scale /= div; 1243 scale *= 2; 1244 1245 bt.sec = 0; 1246 bt.frac = 0; 1247 bintime_addx(&bt, scale * tcount); 1248 bintime2timespec(&bt, &ts); 1249 1250 #ifdef PPS_DEBUG 1251 if (ppsdebug & 0x4) { 1252 struct timespec ts2; 1253 int64_t df; 1254 1255 bintime2timespec(&bt_ref, &ts2); 1256 df = timespec2ns(&ts); 1257 if (df > 500000000) 1258 df -= 1000000000; 1259 log(LOG_DEBUG, "hardpps: ref_ts=%"PRIi64 1260 ".%09"PRIi32", ts=%"PRIi64".%09"PRIi32 1261 ", freqdiff=%"PRIi64" ns/s\n", 1262 ts2.tv_sec, (int32_t)ts2.tv_nsec, 1263 tsp->tv_sec, (int32_t)tsp->tv_nsec, 1264 df); 1265 } 1266 #endif 1267 1268 hardpps(tsp, timespec2ns(&ts)); 1269 } 1270 #endif 1271 } 1272 1273 /* 1274 * Timecounters need to be updated every so often to prevent the hardware 1275 * counter from overflowing. Updating also recalculates the cached values 1276 * used by the get*() family of functions, so their precision depends on 1277 * the update frequency. 1278 */ 1279 1280 static int tc_tick; 1281 1282 void 1283 tc_ticktock(void) 1284 { 1285 static int count; 1286 1287 if (++count < tc_tick) 1288 return; 1289 count = 0; 1290 mutex_spin_enter(&timecounter_lock); 1291 if (timecounter_bad != 0) { 1292 /* An existing timecounter has gone bad, pick a new one. */ 1293 (void)atomic_swap_uint(&timecounter_bad, 0); 1294 if (timecounter->tc_quality < 0) { 1295 tc_pick(); 1296 } 1297 } 1298 tc_windup(); 1299 mutex_spin_exit(&timecounter_lock); 1300 } 1301 1302 void 1303 inittimecounter(void) 1304 { 1305 u_int p; 1306 1307 mutex_init(&timecounter_lock, MUTEX_DEFAULT, IPL_HIGH); 1308 1309 /* 1310 * Set the initial timeout to 1311 * max(1, <approx. number of hardclock ticks in a millisecond>). 1312 * People should probably not use the sysctl to set the timeout 1313 * to smaller than its inital value, since that value is the 1314 * smallest reasonable one. If they want better timestamps they 1315 * should use the non-"get"* functions. 1316 */ 1317 if (hz > 1000) 1318 tc_tick = (hz + 500) / 1000; 1319 else 1320 tc_tick = 1; 1321 p = (tc_tick * 1000000) / hz; 1322 aprint_verbose("timecounter: Timecounters tick every %d.%03u msec\n", 1323 p / 1000, p % 1000); 1324 1325 /* warm up new timecounter (again) and get rolling. */ 1326 (void)timecounter->tc_get_timecount(timecounter); 1327 (void)timecounter->tc_get_timecount(timecounter); 1328 } 1329