/*	$NetBSD: kern_clock.c,v 1.144 2021/01/16 02:20:00 riastradh Exp $	*/

/*-
 * Copyright (c) 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_clock.c,v 1.144 2021/01/16 02:20:00 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_dtrace.h"
#include "opt_gprof.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/timex.h>
#include <sys/sched.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/rndsource.h>

#ifdef GPROF
#include <sys/gmon.h>
#endif

#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
#include <sys/cpu.h>

cyclic_clock_func_t	cyclic_clock_func[MAXCPUS];
#endif

static int sysctl_kern_clockrate(SYSCTLFN_PROTO);

/*
 * Clock handling routines.
 *
 * This code is written to operate with two timers that run independently of
 * each other.  The main clock, running hz times per second, is used to keep
 * track of real time.  The second timer handles kernel and user profiling,
 * and does resource use estimation.  If the second timer is programmable,
 * it is randomized to avoid aliasing between the two clocks.  For example,
 * the randomization prevents an adversary from always giving up the CPU
 * just before its quantum expires.  Otherwise, it would never accumulate
 * CPU ticks.  The mean frequency of the second timer is stathz.
 *
 * If no second timer exists, stathz will be zero; in this case we drive
 * profiling and statistics off the main clock.  This WILL NOT be accurate;
 * do not do it unless absolutely necessary.
 *
 * The statistics clock may (or may not) be run at a higher rate while
 * profiling.  This profile clock runs at profhz.  We require that profhz
 * be an integral multiple of stathz.
 *
 * If the statistics clock is running fast, it must be divided by the ratio
 * profhz/stathz for statistics.  (For profiling, every tick counts.)
 */

int	stathz;
int	profhz;
int	profsrc;
int	schedhz;
int	profprocs;
int	hardclock_ticks;
static int hardscheddiv;	/* hard => sched divider (used if schedhz == 0) */
static int psdiv;		/* prof => stat divider */
int	psratio;		/* ratio: prof / stat */
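
/*
 * Illustrative sketch (not part of the kernel): how the profhz/stathz
 * divider described above plays out.  With, say, stathz = 128 and
 * profhz = 1024, psratio = 8, so while any process is being profiled
 * the statistics clock runs 8x fast and only every 8th tick is counted
 * for statistics.  A standalone simulation of that countdown, using
 * hypothetical helpers (fast_stat_tick, take_profile_sample,
 * account_statistics), might look like:
 *
 *	int psratio = 1024 / 128;	// profhz / stathz = 8
 *	int pscnt = psratio;		// per-CPU countdown, cf. spc_pscnt
 *
 *	void
 *	fast_stat_tick(void)
 *	{
 *		take_profile_sample();	// every tick counts for profiling
 *		if (--pscnt > 0)
 *			return;		// not a statistics tick
 *		account_statistics();	// 1 in psratio ticks
 *		pscnt = psratio;
 *	}
 *
 * statclock() below implements exactly this shape against the real
 * per-CPU state.
 */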

struct clockrnd {
	struct krndsource source;
	unsigned needed;
};

static struct clockrnd hardclockrnd __aligned(COHERENCY_UNIT);
static struct clockrnd statclockrnd __aligned(COHERENCY_UNIT);

static void
clockrnd_get(size_t needed, void *cookie)
{
	struct clockrnd *C = cookie;

	/* Start sampling. */
	atomic_store_relaxed(&C->needed, 2*NBBY*needed);
}

static void
clockrnd_sample(struct clockrnd *C)
{
	struct cpu_info *ci = curcpu();

	/* If there's nothing needed right now, stop here. */
	if (__predict_true(C->needed == 0))
		return;

	/*
	 * If we're not the primary core of a package, we're probably
	 * driven by the same clock as the primary core, so don't
	 * bother.
	 */
	if (ci != ci->ci_package1st)
		return;

	/* Take a sample and enter it into the pool. */
	rnd_add_uint32(&C->source, 0);

	/*
	 * On the primary CPU, count down.  Using an atomic decrement
	 * here isn't really necessary -- on every platform we care
	 * about, stores to unsigned int are atomic, and the only other
	 * memory operation that could happen here is for another CPU
	 * to store a higher value for needed.  But using an atomic
	 * decrement avoids giving the impression of data races, and is
	 * unlikely to hurt because only one CPU will ever be writing
	 * to the location.
	 */
	if (CPU_IS_PRIMARY(curcpu())) {
		unsigned needed __diagused;

		needed = atomic_dec_uint_nv(&C->needed);
		KASSERT(needed != UINT_MAX);
	}
}

static u_int	get_intr_timecount(struct timecounter *);

static struct timecounter intr_timecounter = {
	.tc_get_timecount	= get_intr_timecount,
	.tc_poll_pps		= NULL,
	.tc_counter_mask	= ~0u,
	.tc_frequency		= 0,
	.tc_name		= "clockinterrupt",
	/* quality - minimum implementation level for a clock */
	.tc_quality		= 0,
	.tc_priv		= NULL,
};

static u_int
get_intr_timecount(struct timecounter *tc)
{

	return (u_int)getticks();
}

int
getticks(void)
{
	return atomic_load_relaxed(&hardclock_ticks);
}

/*
 * Initialize clock frequencies and start both clocks running.
 */
void
initclocks(void)
{
	static struct sysctllog *clog;
	int i;

	/*
	 * Set divisors to 1 (normal case) and let the machine-specific
	 * code do its bit.
	 */
	psdiv = 1;

	/*
	 * Call cpu_initclocks() before registering the default
	 * timecounter, in case it needs to adjust hz.
	 */
	const int old_hz = hz;
	cpu_initclocks();
	if (old_hz != hz) {
		tick = 1000000 / hz;
		tickadj = (240000 / (60 * hz)) ? (240000 / (60 * hz)) : 1;
	}

	/*
	 * Provide the minimum default time counter; it will only run
	 * at interrupt resolution.
	 */
	intr_timecounter.tc_frequency = hz;
	tc_init(&intr_timecounter);

	/*
	 * Compute profhz and stathz, fix profhz if needed.
	 */
	i = stathz ? stathz : hz;
	if (profhz == 0)
		profhz = i;
	psratio = profhz / i;
	if (schedhz == 0) {
		/* 16 Hz is best. */
		hardscheddiv = hz / 16;
		if (hardscheddiv <= 0)
			panic("hardscheddiv");
	}

	sysctl_createv(&clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_STRUCT, "clockrate",
	    SYSCTL_DESCR("Kernel clock rates"),
	    sysctl_kern_clockrate, 0, NULL,
	    sizeof(struct clockinfo),
	    CTL_KERN, KERN_CLOCKRATE, CTL_EOL);
	sysctl_createv(&clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_INT, "hardclock_ticks",
	    SYSCTL_DESCR("Number of hardclock ticks"),
	    NULL, 0, &hardclock_ticks, sizeof(hardclock_ticks),
	    CTL_KERN, KERN_HARDCLOCK_TICKS, CTL_EOL);

	rndsource_setcb(&hardclockrnd.source, clockrnd_get, &hardclockrnd);
	rnd_attach_source(&hardclockrnd.source, "hardclock", RND_TYPE_SKEW,
	    RND_FLAG_COLLECT_TIME|RND_FLAG_HASCB);
	if (stathz) {
		rndsource_setcb(&statclockrnd.source, clockrnd_get,
		    &statclockrnd);
		rnd_attach_source(&statclockrnd.source, "statclock",
		    RND_TYPE_SKEW, RND_FLAG_COLLECT_TIME|RND_FLAG_HASCB);
	}
}
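
/*
 * Illustrative sketch (not part of this file): getticks() returns a
 * relaxed snapshot of hardclock_ticks, so elapsed time can be measured
 * in tick units of 1/hz seconds (10 ms at hz = 100; after the hz
 * recomputation above, tick = 1000000/hz microseconds).  Subtraction
 * is done in unsigned arithmetic so counter wraparound is harmless;
 * do_work() is a hypothetical stand-in:
 *
 *	unsigned start = (unsigned)getticks();
 *	do_work();
 *	unsigned elapsed = (unsigned)getticks() - start;
 *	unsigned ms = elapsed * 1000 / hz;	// ok for short intervals
 */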

/*
 * The real-time timer, interrupting hz times per second.
 */
void
hardclock(struct clockframe *frame)
{
	struct lwp *l;
	struct cpu_info *ci;

	clockrnd_sample(&hardclockrnd);

	ci = curcpu();
	l = ci->ci_onproc;

	ptimer_tick(l, CLKF_USERMODE(frame));

	/*
	 * If no separate statistics clock is available, run it from here.
	 */
	if (stathz == 0)
		statclock(frame);
	/*
	 * If no separate schedclock is provided, call it here
	 * at about 16 Hz.
	 */
	if (schedhz == 0) {
		if ((int)(--ci->ci_schedstate.spc_schedticks) <= 0) {
			schedclock(l);
			ci->ci_schedstate.spc_schedticks = hardscheddiv;
		}
	}
	if ((--ci->ci_schedstate.spc_ticks) <= 0)
		sched_tick(ci);

	if (CPU_IS_PRIMARY(ci)) {
		atomic_store_relaxed(&hardclock_ticks,
		    atomic_load_relaxed(&hardclock_ticks) + 1);
		tc_ticktock();
	}

	/*
	 * Update real-time timeout queue.
	 */
	callout_hardclock();
}

/*
 * Start profiling on a process.
 *
 * Kernel profiling passes proc0 which never exits and hence
 * keeps the profile clock running constantly.
 */
void
startprofclock(struct proc *p)
{

	KASSERT(mutex_owned(&p->p_stmutex));

	if ((p->p_stflag & PST_PROFIL) == 0) {
		p->p_stflag |= PST_PROFIL;
		/*
		 * This is only necessary if using the clock as the
		 * profiling source.
		 */
		if (++profprocs == 1 && stathz != 0)
			psdiv = psratio;
	}
}

/*
 * Stop profiling on a process.
 */
void
stopprofclock(struct proc *p)
{

	KASSERT(mutex_owned(&p->p_stmutex));

	if (p->p_stflag & PST_PROFIL) {
		p->p_stflag &= ~PST_PROFIL;
		/*
		 * This is only necessary if using the clock as the
		 * profiling source.
		 */
		if (--profprocs == 0 && stathz != 0)
			psdiv = 1;
	}
}

void
schedclock(struct lwp *l)
{
	if ((l->l_flag & LW_IDLE) != 0)
		return;

	sched_schedclock(l);
}
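
/*
 * Worked example (illustrative only): with hz = 100 and no dedicated
 * scheduler clock (schedhz == 0), initclocks() computes
 * hardscheddiv = 100 / 16 = 6, so the spc_schedticks countdown in
 * hardclock() above invokes schedclock() on every 6th tick -- roughly
 * 16.7 Hz, near the 16 Hz target.  At hz = 1000 the divider is 62,
 * giving about 16.1 Hz.
 */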

/*
 * Statistics clock.  Grab profile sample, and if divider reaches 0,
 * do process and kernel statistics.
 */
void
statclock(struct clockframe *frame)
{
#ifdef GPROF
	struct gmonparam *g;
	intptr_t i;
#endif
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	struct proc *p;
	struct lwp *l;

	if (stathz)
		clockrnd_sample(&statclockrnd);

	/*
	 * Notice changes in divisor frequency, and adjust clock
	 * frequency accordingly.
	 */
	if (spc->spc_psdiv != psdiv) {
		spc->spc_psdiv = psdiv;
		spc->spc_pscnt = psdiv;
		if (psdiv == 1) {
			setstatclockrate(stathz);
		} else {
			setstatclockrate(profhz);
		}
	}
	l = ci->ci_onproc;
	if ((l->l_flag & LW_IDLE) != 0) {
		/*
		 * don't account idle lwps as swapper.
		 */
		p = NULL;
	} else {
		p = l->l_proc;
		mutex_spin_enter(&p->p_stmutex);
	}

	if (CLKF_USERMODE(frame)) {
		KASSERT(p != NULL);
		if ((p->p_stflag & PST_PROFIL) && profsrc == PROFSRC_CLOCK)
			addupc_intr(l, CLKF_PC(frame));
		if (--spc->spc_pscnt > 0) {
			mutex_spin_exit(&p->p_stmutex);
			return;
		}

		/*
		 * Came from user mode; CPU was in user state.
		 * If this process is being profiled record the tick.
		 */
		p->p_uticks++;
		if (p->p_nice > NZERO)
			spc->spc_cp_time[CP_NICE]++;
		else
			spc->spc_cp_time[CP_USER]++;
	} else {
#ifdef GPROF
		/*
		 * Kernel statistics are just like addupc_intr, only easier.
		 */
		g = &_gmonparam;
		if (profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) {
			i = CLKF_PC(frame) - g->lowpc;
			if (i < g->textsize) {
				i /= HISTFRACTION * sizeof(*g->kcount);
				g->kcount[i]++;
			}
		}
#endif
#ifdef LWP_PC
		if (p != NULL && profsrc == PROFSRC_CLOCK &&
		    (p->p_stflag & PST_PROFIL)) {
			addupc_intr(l, LWP_PC(l));
		}
#endif
		if (--spc->spc_pscnt > 0) {
			if (p != NULL)
				mutex_spin_exit(&p->p_stmutex);
			return;
		}
		/*
		 * Came from kernel mode, so we were:
		 * - handling an interrupt,
		 * - doing syscall or trap work on behalf of the current
		 *   user process, or
		 * - spinning in the idle loop.
		 * Whichever it is, charge the time as appropriate.
		 * Note that we charge interrupts to the current process,
		 * regardless of whether they are ``for'' that process,
		 * so that we know how much of its real time was spent
		 * in ``non-process'' (i.e., interrupt) work.
		 */
		if (CLKF_INTR(frame) || (curlwp->l_pflag & LP_INTR) != 0) {
			if (p != NULL) {
				p->p_iticks++;
			}
			spc->spc_cp_time[CP_INTR]++;
		} else if (p != NULL) {
			p->p_sticks++;
			spc->spc_cp_time[CP_SYS]++;
		} else {
			spc->spc_cp_time[CP_IDLE]++;
		}
	}
	spc->spc_pscnt = psdiv;

	if (p != NULL) {
		atomic_inc_uint(&l->l_cpticks);
		mutex_spin_exit(&p->p_stmutex);
	}

#ifdef KDTRACE_HOOKS
	cyclic_clock_func_t func = cyclic_clock_func[cpu_index(ci)];
	if (func) {
		(*func)((struct clockframe *)frame);
	}
#endif
}

/*
 * sysctl helper routine for kern.clockrate.  Assembles a struct on
 * the fly to be returned to the caller.
 */
static int
sysctl_kern_clockrate(SYSCTLFN_ARGS)
{
	struct clockinfo clkinfo;
	struct sysctlnode node;

	clkinfo.tick = tick;
	clkinfo.tickadj = tickadj;
	clkinfo.hz = hz;
	clkinfo.profhz = profhz;
	clkinfo.stathz = stathz ? stathz : hz;

	node = *rnode;
	node.sysctl_data = &clkinfo;
	return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}
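
/*
 * Illustrative userland sketch (not part of this file): the
 * kern.clockrate node assembled above can be read with sysctl(3):
 *
 *	#include <sys/param.h>
 *	#include <sys/time.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct clockinfo ci;
 *		size_t len = sizeof(ci);
 *		int mib[2] = { CTL_KERN, KERN_CLOCKRATE };
 *
 *		if (sysctl(mib, 2, &ci, &len, NULL, 0) == -1)
 *			return 1;
 *		printf("hz=%d tick=%d stathz=%d profhz=%d\n",
 *		    ci.hz, ci.tick, ci.stathz, ci.profhz);
 *		return 0;
 *	}
 *
 * Note that stathz reads back as hz when no separate statistics clock
 * exists, matching the fallback in sysctl_kern_clockrate().
 */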