/* sys/kern/kern_clock.c */

/*	$NetBSD: kern_clock.c,v 1.151 2023/09/02 17:44:59 riastradh Exp $	*/

/*-
 * Copyright (c) 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_clock.c,v 1.151 2023/09/02 17:44:59 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_dtrace.h"
#include "opt_gprof.h"
#include "opt_multiprocessor.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/timex.h>
#include <sys/sched.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/rndsource.h>
#include <sys/heartbeat.h>

#ifdef GPROF
#include <sys/gmon.h>
#endif

#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
#include <sys/cpu.h>

/*
 * Per-CPU DTrace cyclic hooks.  statclock() looks up the entry for the
 * current CPU with cpu_index() and, when it is non-NULL, calls it with
 * the clock frame after the statistics for that tick have been charged.
 */
cyclic_clock_func_t	cyclic_clock_func[MAXCPUS];
#endif

/*
 * Handler for the kern.clockrate sysctl node; defined below and
 * registered by initclocks().
 */
static int sysctl_kern_clockrate(SYSCTLFN_PROTO);

/*
 * Clock handling routines.
 *
 * This code is written to operate with two timers that run independently of
 * each other.  The main clock, running hz times per second, is used to keep
 * track of real time.  The second timer handles kernel and user profiling,
 * and does resource use estimation.  If the second timer is programmable,
 * it is randomized to avoid aliasing between the two clocks.  For example,
 * the randomization prevents an adversary from always giving up the CPU
 * just before its quantum expires.  Otherwise, it would never accumulate
 * CPU ticks.  The mean frequency of the second timer is stathz.
 *
 * If no second timer exists, stathz will be zero; in this case we drive
 * profiling and statistics off the main clock.  This WILL NOT be accurate;
 * do not do it unless absolutely necessary.
 *
 * The statistics clock may (or may not) be run at a higher rate while
 * profiling.  This profile clock runs at profhz.  We require that profhz
 * be an integral multiple of stathz.
 *
 * If the statistics clock is running fast, it must be divided by the ratio
 * profhz/stathz for statistics.  (For profiling, every tick counts.)
 */

int	stathz;				/* mean stat clock rate; 0 => no second timer */
int	profhz;				/* profile clock rate; 0 => initclocks() picks */
int	profsrc;			/* profiling source, e.g. PROFSRC_CLOCK */
int	schedhz;			/* 0 => hardclock() drives schedclock() */
int	profprocs;			/* processes with PST_PROFIL set */
static int hardclock_ticks;		/* ticks counted by the primary CPU */
static int hardscheddiv; /* hard => sched divider (used if schedhz == 0) */
static int psdiv;			/* prof => stat divider */
int	psratio;			/* ratio: prof / stat */

/*
 * Entropy sampling state for one clock interrupt: the rndsource that
 * samples are entered into, plus a countdown of samples still wanted.
 * clockrnd_get() arms the countdown and clockrnd_sample() consumes it.
 */
struct clockrnd {
	struct krndsource source;	/* attached by initclocks() */
	unsigned needed;		/* samples outstanding; 0 => idle */
};

/*
 * One sampler per clock.  statclockrnd is only attached and sampled
 * when a separate statistics clock exists (stathz != 0).
 * NOTE(review): the COHERENCY_UNIT alignment presumably keeps the two
 * counters on separate cache lines -- confirm.
 */
static struct clockrnd hardclockrnd __aligned(COHERENCY_UNIT);
static struct clockrnd statclockrnd __aligned(COHERENCY_UNIT);

/*
 * rndsource callback, installed by initclocks() with rndsource_setcb().
 *
 * Arms the sampler passed as cookie (a struct clockrnd) by publishing
 * how many clock samples to collect: 2*NBBY for every unit of `needed'
 * (presumably bytes of entropy -- confirm against rndsource(9)).
 */
static void
clockrnd_get(size_t needed, void *cookie)
{
	struct clockrnd *const rnd = cookie;
	const size_t nsamples = 2*NBBY*needed;

	/* Picked up by clockrnd_sample() on a later tick. */
	atomic_store_relaxed(&rnd->needed, nsamples);
}

static void
clockrnd_sample(struct clockrnd *C)
{
	struct cpu_info *ci = curcpu();

	/* If there's nothing needed right now, stop here.  */
	if (__predict_true(atomic_load_relaxed(&C->needed) == 0))
		return;

	/*
	 * If we're not the primary core of a package, we're probably
	 * driven by the same clock as the primary core, so don't
	 * bother.
	 */
	if (ci != ci->ci_package1st)
		return;

	/* Take a sample and enter it into the pool.  */
	rnd_add_uint32(&C->source, 0);

	/*
	 * On the primary CPU, count down.  Using an atomic decrement
	 * here isn't really necessary -- on every platform we care
	 * about, stores to unsigned int are atomic, and the only other
	 * memory operation that could happen here is for another CPU
	 * to store a higher value for needed.  But using an atomic
	 * decrement avoids giving the impression of data races, and is
	 * unlikely to hurt because only one CPU will ever be writing
	 * to the location.
	 */
	if (CPU_IS_PRIMARY(curcpu())) {
		unsigned needed __diagused;

		needed = atomic_dec_uint_nv(&C->needed);
		KASSERT(needed != UINT_MAX);
	}
}

/* Forward declaration: referenced by intr_timecounter's initializer. */
static u_int get_intr_timecount(struct timecounter *);

/*
 * Fallback timecounter backed by the hardclock tick count.
 * initclocks() sets tc_frequency to hz and registers it with tc_init(),
 * so it only offers clock-interrupt resolution.
 */
static struct timecounter intr_timecounter = {
	.tc_get_timecount	= get_intr_timecount,
	.tc_poll_pps		= NULL,
	.tc_counter_mask	= ~0u,
	.tc_frequency		= 0,
	.tc_name		= "clockinterrupt",
	/* quality - minimum implementation level for a clock */
	.tc_quality		= 0,
	.tc_priv		= NULL,
};

static u_int
get_intr_timecount(struct timecounter *tc)
{

	return (u_int)getticks();
}

/*
 * Return a snapshot of the hardclock tick counter, which hardclock()
 * advances on the primary CPU.  The read is a relaxed atomic load.
 */
int
getticks(void)
{
	const int now = atomic_load_relaxed(&hardclock_ticks);

	return now;
}

/*
 * Initialize clock frequencies and start both clocks running.
 *
 * Lets the machine-dependent code start the timers, registers the
 * tick-based fallback timecounter, derives the profiling and scheduling
 * divisors, creates the kern.clockrate and kern.hardclock_ticks sysctl
 * nodes, and attaches the clock-skew entropy sources.
 */
void
initclocks(void)
{
	static struct sysctllog *clog;
	const uint32_t rndflags =
	    RND_FLAG_COLLECT_TIME|RND_FLAG_ESTIMATE_TIME|RND_FLAG_HASCB;
	const int hz_before = hz;
	int statrate;

	/*
	 * Start with a divisor of 1 (the normal, non-profiling case);
	 * the machine-specific code does its bit next.
	 */
	psdiv = 1;

	/*
	 * cpu_initclocks() must run before the default timecounter is
	 * registered because it may adjust hz.  If it did, recompute
	 * the values that were derived from the old hz.
	 */
	cpu_initclocks();
	if (hz != hz_before) {
		int adj = 240000 / (60 * hz);

		if (adj == 0)
			adj = 1;
		tick = 1000000 / hz;
		tickadj = adj;
	}

	/*
	 * Register the minimum default timecounter; it only has
	 * clock-interrupt resolution.
	 */
	intr_timecounter.tc_frequency = hz;
	tc_init(&intr_timecounter);

	/*
	 * Statistics run at stathz when a second timer exists and at hz
	 * otherwise.  profhz defaults to that rate, and psratio is the
	 * prof/stat ratio.
	 */
	statrate = (stathz != 0) ? stathz : hz;
	if (profhz == 0)
		profhz = statrate;
	psratio = profhz / statrate;

	/*
	 * Without a separate scheduler clock, hardclock() calls
	 * schedclock() once every hardscheddiv ticks.  16Hz is best.
	 */
	if (schedhz == 0) {
		hardscheddiv = hz / 16;
		if (hardscheddiv <= 0)
			panic("hardscheddiv");
	}

	sysctl_createv(&clog, 0, NULL, NULL, CTLFLAG_PERMANENT,
	    CTLTYPE_STRUCT, "clockrate",
	    SYSCTL_DESCR("Kernel clock rates"),
	    sysctl_kern_clockrate, 0, NULL, sizeof(struct clockinfo),
	    CTL_KERN, KERN_CLOCKRATE, CTL_EOL);
	sysctl_createv(&clog, 0, NULL, NULL, CTLFLAG_PERMANENT,
	    CTLTYPE_INT, "hardclock_ticks",
	    SYSCTL_DESCR("Number of hardclock ticks"),
	    NULL, 0, &hardclock_ticks, sizeof(hardclock_ticks),
	    CTL_KERN, KERN_HARDCLOCK_TICKS, CTL_EOL);

	/*
	 * The hardclock entropy source always exists; the statclock one
	 * only when there is a separate statistics timer.
	 */
	rndsource_setcb(&hardclockrnd.source, clockrnd_get, &hardclockrnd);
	rnd_attach_source(&hardclockrnd.source, "hardclock", RND_TYPE_SKEW,
	    rndflags);
	if (stathz != 0) {
		rndsource_setcb(&statclockrnd.source, clockrnd_get,
		    &statclockrnd);
		rnd_attach_source(&statclockrnd.source, "statclock",
		    RND_TYPE_SKEW, rndflags);
	}
}

/*
 * The real-time timer, interrupting hz times per second.
 */
void
hardclock(struct clockframe *frame)
{
	struct lwp *l;
	struct cpu_info *ci;

	clockrnd_sample(&hardclockrnd);

	ci = curcpu();
	l = ci->ci_onproc;

	ptimer_tick(l, CLKF_USERMODE(frame));

	/*
	 * If no separate statistics clock is available, run it from here.
	 */
	if (stathz == 0)
		statclock(frame);
	/*
	 * If no separate schedclock is provided, call it here
	 * at about 16 Hz.
	 */
	if (schedhz == 0) {
		if ((int)(--ci->ci_schedstate.spc_schedticks) <= 0) {
			schedclock(l);
			ci->ci_schedstate.spc_schedticks = hardscheddiv;
		}
	}
	if ((--ci->ci_schedstate.spc_ticks) <= 0)
		sched_tick(ci);

	if (CPU_IS_PRIMARY(ci)) {
		atomic_store_relaxed(&hardclock_ticks,
		    atomic_load_relaxed(&hardclock_ticks) + 1);
		tc_ticktock();
	}

	/*
	 * Make sure the CPUs and timecounter are making progress.
	 */
	heartbeat();

	/*
	 * Update real-time timeout queue.
	 */
	callout_hardclock();
}

/*
 * Start profiling on a process.
 *
 * Marks p as profiled (PST_PROFIL) if it is not already, and counts it
 * in profprocs.  The caller must hold p->p_stmutex.
 *
 * Kernel profiling passes proc0 which never exits and hence
 * keeps the profile clock running constantly.
 *
 * NOTE(review): profprocs and psdiv are global while the lock asserted
 * here is per-process -- confirm callers are otherwise serialized.
 */
void
startprofclock(struct proc *p)
{

	KASSERT(mutex_owned(&p->p_stmutex));

	/* Already being profiled: nothing to change. */
	if ((p->p_stflag & PST_PROFIL) != 0)
		return;

	p->p_stflag |= PST_PROFIL;
	profprocs++;

	/*
	 * First profiled process: switch the divider to the prof/stat
	 * ratio.  This is only necessary if using the clock as the
	 * profiling source.
	 */
	if (profprocs == 1 && stathz != 0)
		psdiv = psratio;
}

/*
 * Stop profiling on a process.
 *
 * Clears PST_PROFIL on p if it is set and drops it from profprocs.
 * The caller must hold p->p_stmutex.
 */
void
stopprofclock(struct proc *p)
{

	KASSERT(mutex_owned(&p->p_stmutex));

	/* Not being profiled: nothing to undo. */
	if ((p->p_stflag & PST_PROFIL) == 0)
		return;

	p->p_stflag &= ~PST_PROFIL;
	profprocs--;

	/*
	 * Last profiled process gone: return the divider to 1.  This
	 * is only necessary if using the clock as the profiling
	 * source.
	 */
	if (profprocs == 0 && stathz != 0)
		psdiv = 1;
}

/*
 * Scheduler clock tick for the LWP l: forward it to sched_schedclock()
 * unless l is an idle LWP (LW_IDLE), which is skipped.
 */
void
schedclock(struct lwp *l)
{
	const bool idle = (l->l_flag & LW_IDLE) != 0;

	if (!idle)
		sched_schedclock(l);
}

/*
 * Statistics clock.  Grab profile sample, and if divider reaches 0,
 * do process and kernel statistics.
 *
 * frame describes the interrupted context: whether it was user mode
 * (CLKF_USERMODE), the interrupted PC (CLKF_PC) and whether an
 * interrupt was being handled (CLKF_INTR).
 *
 * Called from its own timer when stathz != 0, otherwise from
 * hardclock().  When the LWP on this CPU is not idle, its process's
 * p_stmutex is held from just after the divisor check until each
 * return path below.
 */
void
statclock(struct clockframe *frame)
{
#ifdef GPROF
	struct gmonparam *g;
	intptr_t i;
#endif
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	struct proc *p;
	struct lwp *l;

	/*
	 * Sample the statclock entropy source only when statclock has
	 * its own timer; otherwise hardclock() called us and has
	 * already sampled its own source for this tick.
	 */
	if (stathz)
		clockrnd_sample(&statclockrnd);

	/*
	 * Notice changes in divisor frequency, and adjust clock
	 * frequency accordingly.
	 */
	if (spc->spc_psdiv != psdiv) {
		spc->spc_psdiv = psdiv;
		spc->spc_pscnt = psdiv;
		if (psdiv == 1) {
			setstatclockrate(stathz);
		} else {
			setstatclockrate(profhz);
		}
	}
	l = ci->ci_onproc;
	if ((l->l_flag & LW_IDLE) != 0) {
		/*
		 * don't account idle lwps as swapper.
		 */
		p = NULL;
	} else {
		p = l->l_proc;
		mutex_spin_enter(&p->p_stmutex);
	}

	if (CLKF_USERMODE(frame)) {
		KASSERT(p != NULL);
		/* Every tick counts for profiling, before the divider. */
		if ((p->p_stflag & PST_PROFIL) && profsrc == PROFSRC_CLOCK)
			addupc_intr(l, CLKF_PC(frame));
		/* Statistics are only charged once per psdiv ticks. */
		if (--spc->spc_pscnt > 0) {
			mutex_spin_exit(&p->p_stmutex);
			return;
		}

		/*
		 * Came from user mode; CPU was in user state.
		 * Charge a user tick to the process and to the per-CPU
		 * nice or user bucket, depending on its nice value.
		 */
		p->p_uticks++;
		if (p->p_nice > NZERO)
			spc->spc_cp_time[CP_NICE]++;
		else
			spc->spc_cp_time[CP_USER]++;
	} else {
#ifdef GPROF
		/*
		 * Kernel statistics are just like addupc_intr, only easier.
		 * i is the offset of the interrupted PC from lowpc; when it
		 * falls inside the profiled text it is scaled down to a
		 * kcount[] bucket index.
		 */
#if defined(MULTIPROCESSOR) && !defined(_RUMPKERNEL)
		g = curcpu()->ci_gmon;
		if (g != NULL &&
		    profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) {
#else
		g = &_gmonparam;
		if (profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) {
#endif
			i = CLKF_PC(frame) - g->lowpc;
			if (i < g->textsize) {
				i /= HISTFRACTION * sizeof(*g->kcount);
				g->kcount[i]++;
			}
		}
#endif
#ifdef LWP_PC
		/*
		 * Interrupted in the kernel: where the port provides
		 * LWP_PC, record a profile sample at that PC for a
		 * profiled process.
		 */
		if (p != NULL && profsrc == PROFSRC_CLOCK &&
		    (p->p_stflag & PST_PROFIL)) {
			addupc_intr(l, LWP_PC(l));
		}
#endif
		/* Statistics are only charged once per psdiv ticks. */
		if (--spc->spc_pscnt > 0) {
			if (p != NULL)
				mutex_spin_exit(&p->p_stmutex);
			return;
		}
		/*
		 * Came from kernel mode, so we were:
		 * - handling an interrupt,
		 * - doing syscall or trap work on behalf of the current
		 *   user process, or
		 * - spinning in the idle loop.
		 * Whichever it is, charge the time as appropriate.
		 * Note that we charge interrupts to the current process,
		 * regardless of whether they are ``for'' that process,
		 * so that we know how much of its real time was spent
		 * in ``non-process'' (i.e., interrupt) work.
		 */
		if (CLKF_INTR(frame) || (curlwp->l_pflag & LP_INTR) != 0) {
			if (p != NULL) {
				p->p_iticks++;
			}
			spc->spc_cp_time[CP_INTR]++;
		} else if (p != NULL) {
			p->p_sticks++;
			spc->spc_cp_time[CP_SYS]++;
		} else {
			spc->spc_cp_time[CP_IDLE]++;
		}
	}
	/* Reload the divider for the next statistics interval. */
	spc->spc_pscnt = psdiv;

	if (p != NULL) {
		atomic_inc_uint(&l->l_cpticks);
		mutex_spin_exit(&p->p_stmutex);
	}

#ifdef KDTRACE_HOOKS
	/*
	 * Run this CPU's DTrace cyclic hook, if one is installed.  The
	 * early returns above skip it, so it is only reached on ticks
	 * that charged statistics.
	 */
	cyclic_clock_func_t func = cyclic_clock_func[cpu_index(ci)];
	if (func) {
		(*func)((struct clockframe *)frame);
	}
#endif
}

/*
 * sysctl helper routine for kern.clockrate. Assembles a struct on
 * the fly to be returned to the caller.
 */
static int
sysctl_kern_clockrate(SYSCTLFN_ARGS)
{
	struct clockinfo clkinfo;
	struct sysctlnode node;

	clkinfo.tick = tick;
	clkinfo.tickadj = tickadj;
	clkinfo.hz = hz;
	clkinfo.profhz = profhz;
	clkinfo.stathz = stathz ? stathz : hz;

	node = *rnode;
	node.sysctl_data = &clkinfo;
	return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}