/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/task.h>
#include <sys/cmn_err.h>
#include <sys/class.h>
#include <sys/sdt.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/clock_tick.h>
#include <sys/clock_impl.h>
#include <sys/sysmacros.h>
#include <vm/rm.h>

/*
 * This file contains the implementation of clock tick accounting for threads.
 * Every tick, user threads running on various CPUs are located and charged
 * with a tick to account for their use of CPU time.
 *
 * Every tick, the clock() handler calls clock_tick_schedule() to perform tick
 * accounting for all the threads in the system. Tick accounting is done in
 * two phases:
 *
 * Tick scheduling	Done in clock_tick_schedule(). In this phase, cross
 *			calls are scheduled to multiple CPUs to perform
 *			multi-threaded tick accounting. The CPUs are chosen
 *			on a rotational basis so as to distribute the tick
 *			accounting load evenly across all CPUs.
 *
 * Tick execution	Done in clock_tick_execute(). In this phase, tick
 *			accounting is actually performed by softint handlers
 *			on multiple CPUs.
 *
 * This implementation gives us a multi-threaded tick processing facility that
 * is suitable for configurations with a large number of CPUs. On smaller
 * configurations it may be desirable to let the processing be single-threaded
 * and just allow clock() to do it as it has been done traditionally. To
 * facilitate this, a variable, clock_tick_threshold, is defined. Platforms
 * that desire multi-threading should set this variable to something
 * appropriate. A recommended value may be found in clock_tick.h. At boot time,
 * if the number of CPUs is greater than clock_tick_threshold, multi-threading
 * kicks in. Note that this is a decision made at boot time. If more CPUs
 * are dynamically added later on to exceed the threshold, no attempt is made
 * to switch to multi-threaded mode. Similarly, if CPUs are removed
 * dynamically, no attempt is made to switch to single-threaded mode. This is
 * to keep the implementation simple. Also note that the threshold can be
 * changed for a specific customer configuration via /etc/system.
 *
 * The boot time decision is reflected in clock_tick_single_threaded.
 */
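
/*
 * For example, an administrator who wants multi-threaded tick accounting
 * to engage on systems with more than 64 CPUs, using tick sets of 8 CPUs
 * each, might add the following to /etc/system (illustrative values, not
 * a recommendation):
 *
 *	set clock_tick_threshold = 64
 *	set clock_tick_ncpus = 8
 */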

/*
 * clock_tick_threshold
 *	If the number of CPUs at boot time exceeds this threshold,
 *	multi-threaded tick accounting kicks in.
 *
 * clock_tick_ncpus
 *	The number of CPUs in a set. Each set is scheduled for tick execution
 *	on a separate processor.
 *
 * clock_tick_single_threaded
 *	Indicates whether or not tick accounting is single-threaded.
 *
 * clock_tick_total_cpus
 *	Total number of online CPUs.
 *
 * clock_tick_cpus
 *	Array of online CPU pointers.
 *
 * clock_tick_cpu
 *	Per-CPU, cache-aligned data structures to facilitate multi-threading.
 *
 * clock_tick_active
 *	Counter that indicates the number of active tick processing softints
 *	in the system.
 *
 * clock_tick_pending
 *	Number of pending ticks that need to be accounted by the softint
 *	handlers.
 *
 * clock_tick_lock
 *	Mutex to synchronize between clock_tick_schedule() and
 *	CPU online/offline.
 *
 * clock_cpu_id
 *	CPU id of the clock() CPU. Used to detect when the clock CPU
 *	is offlined.
 *
 * clock_tick_online_cpuset
 *	CPU set of all online processors that can be X-called.
 *
 * clock_tick_proc_max
 *	Each process is allowed to accumulate a few ticks before checking
 *	for the task CPU time resource limit. We lower the number of calls
 *	to rctl_test() to make tick accounting more scalable. The tradeoff
 *	is that the limit may not get enforced in a timely manner. This is
 *	typically not a problem.
 *
 * clock_tick_set
 *	Per-set structures. Each structure contains the range of CPUs
 *	to be processed for the set.
 *
 * clock_tick_nsets
 *	Number of sets.
 *
 * clock_tick_scan
 *	Where to begin the scan for single-threaded mode. In multi-threaded
 *	mode, the clock_tick_set itself contains a field for this.
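 *
 * clock_tick_intr
 *	Handle for the tick execution softint, obtained from
 *	create_softint() (see clock_tick_init_pre()).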
 */
int			clock_tick_threshold;
int			clock_tick_ncpus;
int			clock_tick_single_threaded;
int			clock_tick_total_cpus;
cpu_t			*clock_tick_cpus[NCPU];
clock_tick_cpu_t	*clock_tick_cpu[NCPU];
ulong_t			clock_tick_active;
int			clock_tick_pending;
kmutex_t		clock_tick_lock;
processorid_t		clock_cpu_id;
cpuset_t		clock_tick_online_cpuset;
clock_t			clock_tick_proc_max;
clock_tick_set_t	*clock_tick_set;
int			clock_tick_nsets;
int			clock_tick_scan;
ulong_t			clock_tick_intr;

static uint_t	clock_tick_execute(caddr_t, caddr_t);
static void	clock_tick_execute_common(int, int, int, clock_t, int);

#define	CLOCK_TICK_ALIGN	64	/* cache alignment */

/*
 * Clock tick initialization is done in two phases:
 *
 * 1. Before clock_init() is called, clock_tick_init_pre() is called to set
 *    up single-threading so that clock() can begin to do its job.
 *
 * 2. After the slave CPUs are initialized at boot time, we know the number
 *    of CPUs. clock_tick_init_post() is called to set up multi-threading if
 *    required.
 */
void
clock_tick_init_pre(void)
{
	clock_tick_cpu_t	*ctp;
	int			i, n;
	clock_tick_set_t	*csp;
	uintptr_t		buf;
	size_t			size;

	clock_tick_single_threaded = 1;

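	/*
	 * Over-allocate by CLOCK_TICK_ALIGN bytes and round the base up so
	 * that each per-CPU structure starts on a cache-line boundary. This
	 * keeps CPUs that update adjacent structures from sharing lines.
	 */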
	size = P2ROUNDUP(sizeof (clock_tick_cpu_t), CLOCK_TICK_ALIGN);
	buf = (uintptr_t)kmem_zalloc(size * NCPU + CLOCK_TICK_ALIGN, KM_SLEEP);
	buf = P2ROUNDUP(buf, CLOCK_TICK_ALIGN);

	/*
	 * Perform initialization in case multi-threading is chosen later.
	 */
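	/*
	 * create_softint() is a weak symbol; a platform that does not
	 * provide it leaves its address NULL, and tick accounting then
	 * remains single-threaded (see clock_tick_init_post()).
	 */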
	if (&create_softint != NULL) {
		clock_tick_intr = create_softint(LOCK_LEVEL,
		    clock_tick_execute, (caddr_t)NULL);
	}
	for (i = 0; i < NCPU; i++, buf += size) {
		ctp = (clock_tick_cpu_t *)buf;
		clock_tick_cpu[i] = ctp;
		mutex_init(&ctp->ct_lock, NULL, MUTEX_DEFAULT, NULL);
		if (&create_softint != NULL) {
			ctp->ct_intr = clock_tick_intr;
		}
		ctp->ct_pending = 0;
	}

	mutex_init(&clock_tick_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Compute clock_tick_ncpus here. We need it to compute the
	 * maximum number of tick sets we need to support.
	 */
	ASSERT(clock_tick_ncpus >= 0);
	if (clock_tick_ncpus == 0)
		clock_tick_ncpus = CLOCK_TICK_NCPUS;
	if (clock_tick_ncpus > max_ncpus)
		clock_tick_ncpus = max_ncpus;

	/*
	 * Allocate and initialize the tick sets.
	 */
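	/* The number of sets is the ceiling of max_ncpus / clock_tick_ncpus. */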
	n = (max_ncpus + clock_tick_ncpus - 1) / clock_tick_ncpus;
	clock_tick_set = kmem_zalloc(sizeof (clock_tick_set_t) * n, KM_SLEEP);
	for (i = 0; i < n; i++) {
		csp = &clock_tick_set[i];
		csp->ct_start = i * clock_tick_ncpus;
		csp->ct_scan = csp->ct_start;
		csp->ct_end = csp->ct_start;
	}
}

void
clock_tick_init_post(void)
{
	/*
	 * If a platform does not provide create_softint() and
	 * invoke_softint(), then we assume single-threaded operation.
	 */
	if (&invoke_softint == NULL)
		clock_tick_threshold = 0;

	ASSERT(clock_tick_threshold >= 0);

	if (clock_tick_threshold == 0)
		clock_tick_threshold = max_ncpus;

	/*
	 * If a platform does not specify a threshold or if the number of CPUs
	 * at boot time does not exceed the threshold, tick accounting remains
	 * single-threaded.
	 */
	if (ncpus <= clock_tick_threshold) {
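		/*
		 * Single-threaded: one set covers all CPUs, and the task
		 * CPU time rctl is checked on every tick.
		 */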
		clock_tick_ncpus = max_ncpus;
		clock_tick_proc_max = 1;
		return;
	}

	/*
	 * OK. Multi-thread tick processing. If a platform has not specified
	 * the CPU set size for multi-threading, then use the default value.
	 * This value has been arrived at through measurements on large
	 * configuration systems.
	 */
	clock_tick_single_threaded = 0;
	if (clock_tick_proc_max == 0) {
		clock_tick_proc_max = CLOCK_TICK_PROC_MAX;
		if (hires_tick)
			clock_tick_proc_max *= 10;
	}
}

static void
clock_tick_schedule_one(clock_tick_set_t *csp, int pending, processorid_t cid)
{
	clock_tick_cpu_t	*ctp;

	ASSERT(&invoke_softint != NULL);

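	/*
	 * Account for the softint we are about to post;
	 * clock_tick_execute() decrements clock_tick_active when it
	 * completes.
	 */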
	atomic_inc_ulong(&clock_tick_active);

	/*
	 * Schedule tick accounting for a set of CPUs.
	 */
	ctp = clock_tick_cpu[cid];
	mutex_enter(&ctp->ct_lock);
	ctp->ct_lbolt = LBOLT_NO_ACCOUNT;
	ctp->ct_pending += pending;
	ctp->ct_start = csp->ct_start;
	ctp->ct_end = csp->ct_end;
	ctp->ct_scan = csp->ct_scan;
	mutex_exit(&ctp->ct_lock);

	invoke_softint(cid, ctp->ct_intr);
	/*
	 * Return without waiting for the softint to finish.
	 */
}

static void
clock_tick_process(cpu_t *cp, clock_t mylbolt, int pending)
{
	kthread_t	*t;
	kmutex_t	*plockp;
	int		notick, intr;
	klwp_id_t	lwp;

	/*
	 * The locking here is rather tricky. thread_free_prevent()
	 * prevents the thread returned from being freed while we
	 * are looking at it. We can then check if the thread
	 * is exiting and get the appropriate p_lock if it
	 * is not.  We have to be careful, though, because
	 * the _process_ can still be freed while we've
	 * prevented thread free.  To avoid touching the
	 * proc structure we put a pointer to the p_lock in the
	 * thread structure.  The p_lock is persistent so we
	 * can acquire it even if the process is gone.  At that
	 * point we can check (again) if the thread is exiting
	 * and either drop the lock or do the tick processing.
	 */
	t = cp->cpu_thread;	/* Current running thread */
	if (CPU == cp) {
		/*
		 * 't' will be the tick processing thread on this
		 * CPU.  Use the pinned thread (if any) on this CPU
		 * as the target of the clock tick.
		 */
		if (t->t_intr != NULL)
			t = t->t_intr;
	}

	/*
	 * We use thread_free_prevent to keep the currently running
	 * thread from being freed or recycled while we're
	 * looking at it.
	 */
	thread_free_prevent(t);
	/*
	 * We cannot hold the cpu_lock to prevent the
	 * cpu_active from changing in the clock interrupt.
	 * As long as we don't block (or don't get pre-empted)
	 * the cpu_list will not change (all threads are paused
	 * before list modification).
	 */
	if (CLOCK_TICK_CPU_OFFLINE(cp)) {
		thread_free_allow(t);
		return;
	}

	/*
	 * Make sure the thread is still on the CPU.
	 */
	if ((t != cp->cpu_thread) &&
	    ((cp != CPU) || (t != cp->cpu_thread->t_intr))) {
		/*
		 * We could not locate the thread. Skip this CPU. Race
		 * conditions while performing these checks are benign.
		 * These checks are not perfect and they don't need
		 * to be.
		 */
		thread_free_allow(t);
		return;
	}

	intr = t->t_flag & T_INTR_THREAD;
	lwp = ttolwp(t);
	if (lwp == NULL || (t->t_proc_flag & TP_LWPEXIT) || intr) {
		/*
		 * Thread is exiting (or uninteresting) so don't
		 * do tick processing.
		 */
		thread_free_allow(t);
		return;
	}

	/*
	 * OK, try to grab the process lock.  See
	 * comments above for why we're not using
	 * ttoproc(t)->p_lockp here.
	 */
	plockp = t->t_plockp;
	mutex_enter(plockp);
	/* See above comment. */
	if (CLOCK_TICK_CPU_OFFLINE(cp)) {
		mutex_exit(plockp);
		thread_free_allow(t);
		return;
	}

	/*
	 * The thread may have exited between when we
	 * checked above, and when we got the p_lock.
	 */
	if (t->t_proc_flag & TP_LWPEXIT) {
		mutex_exit(plockp);
		thread_free_allow(t);
		return;
	}

	/*
	 * Either we have the p_lock for the thread's process,
	 * or we don't care about the thread structure any more.
	 * Either way we can allow thread free.
	 */
	thread_free_allow(t);

	/*
	 * If we haven't done tick processing for this
	 * lwp, then do it now. Since we don't hold the
	 * lwp down on a CPU it can migrate and show up
	 * more than once, hence the lbolt check. mylbolt
	 * is copied at the time of tick scheduling to prevent
	 * lbolt mismatches.
	 *
	 * Also, make sure that it's okay to perform the
	 * tick processing before calling clock_tick.
	 * Setting notick to a TRUE value (i.e. not 0)
	 * results in tick processing not being performed for
	 * that thread.
	 */
	notick = ((cp->cpu_flags & CPU_QUIESCED) || CPU_ON_INTR(cp) ||
	    (cp->cpu_dispthread == cp->cpu_idle_thread));

	if ((!notick) && (t->t_lbolt < mylbolt)) {
		t->t_lbolt = mylbolt;
		clock_tick(t, pending);
	}

	mutex_exit(plockp);
}

void
clock_tick_schedule(int one_sec)
{
	ulong_t			active;
	int			i, end;
	clock_tick_set_t	*csp;
	cpu_t			*cp;

	if (clock_cpu_id != CPU->cpu_id)
		clock_cpu_id = CPU->cpu_id;

	if (clock_tick_single_threaded) {
		/*
		 * Each tick cycle, start the scan from a different
		 * CPU for the sake of fairness.
		 */
		end = clock_tick_total_cpus;
		clock_tick_scan++;
		if (clock_tick_scan >= end)
			clock_tick_scan = 0;

		clock_tick_execute_common(0, clock_tick_scan, end,
		    LBOLT_NO_ACCOUNT, 1);

		return;
	}

	/*
	 * If the previous invocation of handlers is not yet finished, then
	 * simply increment a pending count and return. Eventually when they
	 * finish, the pending count is passed down to the next set of
	 * handlers to process. This way, ticks that have already elapsed
	 * are handled as quickly as possible to minimize the chances of
	 * threads getting away before their pending ticks are accounted.
	 * The other benefit is that if the pending count is more than one,
	 * it can be handled by a single invocation of clock_tick(). This is
	 * a good optimization for large configuration busy systems where
	 * tick accounting can get backed up for various reasons.
	 */
	clock_tick_pending++;

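	/*
	 * Read clock_tick_active atomically. A compare-and-swap of the
	 * value with itself stores nothing new, but returns the current
	 * value, giving us a coherent read.
	 */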
	active = clock_tick_active;
	active = atomic_cas_ulong(&clock_tick_active, active, active);
	if (active)
		return;

	/*
	 * We want to handle the clock CPU here. If we
	 * scheduled the accounting for the clock CPU to another
	 * processor, that processor will find only the clock() thread
	 * running and not account for any user thread below it. Also,
	 * we want to handle this before we block on anything and allow
	 * the pinned thread below the current thread to escape.
	 */
	clock_tick_process(CPU, LBOLT_NO_ACCOUNT, clock_tick_pending);

	mutex_enter(&clock_tick_lock);

	/*
	 * Schedule each set on a separate processor.
	 */
	cp = clock_cpu_list;
	for (i = 0; i < clock_tick_nsets; i++) {
		csp = &clock_tick_set[i];

		/*
		 * Pick the next online CPU in list for scheduling tick
		 * accounting. We are holding clock_tick_lock, so CPU
		 * online/offline cannot muck with this while we are
		 * picking our CPU to X-call.
		 */
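		/* The clock CPU was handled directly above; skip it. */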
		if (cp == CPU)
			cp = cp->cpu_next_onln;

		/*
		 * Each tick cycle, start the scan from a different
		 * CPU for the sake of fairness.
		 */
		csp->ct_scan++;
		if (csp->ct_scan >= csp->ct_end)
			csp->ct_scan = csp->ct_start;

		clock_tick_schedule_one(csp, clock_tick_pending, cp->cpu_id);

		cp = cp->cpu_next_onln;
	}

	if (one_sec) {
		/*
		 * Move the CPU pointer around every second. This is so
		 * all the CPUs can be X-called in a round-robin fashion
		 * to evenly distribute the X-calls. We don't do this
		 * at a faster rate because we don't want to affect
		 * cache performance negatively.
		 */
		clock_cpu_list = clock_cpu_list->cpu_next_onln;
	}

	mutex_exit(&clock_tick_lock);

	clock_tick_pending = 0;
}

static void
clock_tick_execute_common(int start, int scan, int end, clock_t mylbolt,
	int pending)
{
	cpu_t		*cp;
	int		i;

	ASSERT((start <= scan) && (scan <= end));

	/*
	 * Handle the thread on the current CPU first. This is to prevent a
	 * pinned thread from escaping if we ever block on something.
	 * Note that in single-threaded mode, this handles the clock
	 * CPU.
	 */
	clock_tick_process(CPU, mylbolt, pending);

	/*
	 * Perform tick accounting for the threads running on
	 * the scheduled CPUs.
	 */
	for (i = scan; i < end; i++) {
		cp = clock_tick_cpus[i];
		if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id))
			continue;
		clock_tick_process(cp, mylbolt, pending);
	}

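	/*
	 * Wrap around and handle the CPUs between the start of the range
	 * and the scan point.
	 */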
	for (i = start; i < scan; i++) {
		cp = clock_tick_cpus[i];
		if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id))
			continue;
		clock_tick_process(cp, mylbolt, pending);
	}
}

/*ARGSUSED*/
static uint_t
clock_tick_execute(caddr_t arg1, caddr_t arg2)
{
	clock_tick_cpu_t	*ctp;
	int			start, scan, end, pending;
	clock_t			mylbolt;

	/*
	 * We could have raced with cpu offline. We don't want to
	 * process anything on an offlined CPU. If we got blocked
	 * on anything, we may not get scheduled when we wake up
	 * later on.
	 */
	if (!CLOCK_TICK_XCALL_SAFE(CPU))
		goto out;

	ctp = clock_tick_cpu[CPU->cpu_id];

	mutex_enter(&ctp->ct_lock);
	pending = ctp->ct_pending;
	if (pending == 0) {
		/*
		 * If a CPU is busy at LOCK_LEVEL, then an invocation
		 * of this softint may be queued for some time. In that case,
		 * clock_tick_active will not be incremented.
		 * clock_tick_schedule() will then assume that the previous
		 * invocation is done and post a new softint. The first one
		 * that gets in will reset the pending count so the
		 * second one is a noop.
		 */
		mutex_exit(&ctp->ct_lock);
		goto out;
	}
	ctp->ct_pending = 0;
	start = ctp->ct_start;
	end = ctp->ct_end;
	scan = ctp->ct_scan;
	mylbolt = ctp->ct_lbolt;
	mutex_exit(&ctp->ct_lock);

	clock_tick_execute_common(start, scan, end, mylbolt, pending);

out:
	/*
	 * Signal completion to the clock handler.
	 */
	atomic_dec_ulong(&clock_tick_active);

	return (1);
}

/*ARGSUSED*/
static int
clock_tick_cpu_setup(cpu_setup_t what, int cid, void *arg)
{
	cpu_t			*cp, *ncp;
	int			i, set;
	clock_tick_set_t	*csp;

	/*
	 * This function performs some computations at CPU offline/online
	 * time. The computed values are used during tick scheduling and
	 * execution phases. This avoids having to compute things on
	 * an every-tick basis. The other benefit is that we perform the
	 * computations only for onlined CPUs (not offlined ones). As a
	 * result, no tick processing is attempted for offlined CPUs.
	 *
	 * Also, cpu_offline() calls this function before checking for
	 * active interrupt threads. This allows us to avoid posting
	 * cross calls to CPUs that are being offlined.
	 */

	cp = cpu[cid];

	mutex_enter(&clock_tick_lock);

	switch (what) {
	case CPU_ON:
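		/*
		 * Append the new CPU to the online array and extend the
		 * last tick set to cover it.
		 */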
		clock_tick_cpus[clock_tick_total_cpus] = cp;
		set = clock_tick_total_cpus / clock_tick_ncpus;
		csp = &clock_tick_set[set];
		csp->ct_end++;
		clock_tick_total_cpus++;
		clock_tick_nsets =
		    (clock_tick_total_cpus + clock_tick_ncpus - 1) /
		    clock_tick_ncpus;
		CPUSET_ADD(clock_tick_online_cpuset, cp->cpu_id);
		membar_sync();
		break;

	case CPU_OFF:
		if (&sync_softint != NULL)
			sync_softint(clock_tick_online_cpuset);
		CPUSET_DEL(clock_tick_online_cpuset, cp->cpu_id);
		clock_tick_total_cpus--;
		clock_tick_cpus[clock_tick_total_cpus] = NULL;
		clock_tick_nsets =
		    (clock_tick_total_cpus + clock_tick_ncpus - 1) /
		    clock_tick_ncpus;
		set = clock_tick_total_cpus / clock_tick_ncpus;
		csp = &clock_tick_set[set];
		csp->ct_end--;

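		/*
		 * Rebuild the online CPU array, compacting out the CPU
		 * being offlined.
		 */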
		i = 0;
		ncp = cpu_active;
		do {
			if (cp == ncp)
				continue;
			clock_tick_cpus[i] = ncp;
			i++;
		} while ((ncp = ncp->cpu_next_onln) != cpu_active);
		ASSERT(i == clock_tick_total_cpus);
		membar_sync();
		break;

	default:
		break;
	}

	mutex_exit(&clock_tick_lock);

	return (0);
}


void
clock_tick_mp_init(void)
{
	cpu_t	*cp;

	mutex_enter(&cpu_lock);

	cp = cpu_active;
	do {
		(void) clock_tick_cpu_setup(CPU_ON, cp->cpu_id, NULL);
	} while ((cp = cp->cpu_next_onln) != cpu_active);

	register_cpu_setup_func(clock_tick_cpu_setup, NULL);

	mutex_exit(&cpu_lock);
}