xref: /onnv-gate/usr/src/uts/common/os/ddi_timer.c (revision 5343:a0b027e2a7c7)
15107Seota /*
25107Seota  * CDDL HEADER START
35107Seota  *
45107Seota  * The contents of this file are subject to the terms of the
55107Seota  * Common Development and Distribution License (the "License").
65107Seota  * You may not use this file except in compliance with the License.
75107Seota  *
85107Seota  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
95107Seota  * or http://www.opensolaris.org/os/licensing.
105107Seota  * See the License for the specific language governing permissions
115107Seota  * and limitations under the License.
125107Seota  *
135107Seota  * When distributing Covered Code, include this CDDL HEADER in each
145107Seota  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
155107Seota  * If applicable, add the following below this CDDL HEADER, with the
165107Seota  * fields enclosed by brackets "[]" replaced with your own identifying
175107Seota  * information: Portions Copyright [yyyy] [name of copyright owner]
185107Seota  *
195107Seota  * CDDL HEADER END
205107Seota  */
215107Seota 
225107Seota /*
235107Seota  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
245107Seota  * Use is subject to license terms.
255107Seota  */
265107Seota 
275107Seota #pragma ident	"%Z%%M%	%I%	%E% SMI"
285107Seota 
295107Seota #include <sys/atomic.h>
305107Seota #include <sys/callb.h>
315107Seota #include <sys/conf.h>
325107Seota #include <sys/cmn_err.h>
335107Seota #include <sys/taskq.h>
345107Seota #include <sys/dditypes.h>
355107Seota #include <sys/ddi_timer.h>
365107Seota #include <sys/disp.h>
375107Seota #include <sys/kobj.h>
385107Seota #include <sys/note.h>
395107Seota #include <sys/param.h>
405107Seota #include <sys/sysmacros.h>
415107Seota #include <sys/systm.h>
425107Seota #include <sys/time.h>
435107Seota #include <sys/types.h>
445107Seota 
455107Seota /*
465107Seota  * global variables for timeout request
475107Seota  */
485107Seota static kmem_cache_t *req_cache;		/* kmem cache for timeout request */
495107Seota 
505107Seota /*
51*5343Seota  * taskq parameters for cyclic_timer
52*5343Seota  *
53*5343Seota  * timer_taskq_num:
54*5343Seota  * timer_taskq_num represents the number of taskq threads.
55*5343Seota  * Currently 4 threads are pooled to handle periodic timeout requests.
56*5343Seota  * This number is chosen based on the fact that the callout (one-time
57*5343Seota  * timeout framework) uses 8 threads with TQ_NOSLEEP; the periodic timeout
58*5343Seota  * calls taskq_dispatch() with TQ_SLEEP instead, and in this case, 4 threads
59*5343Seota  * should be sufficient to handle periodic timeout requests. (see also
60*5343Seota  * timer_taskq_max_num below)
61*5343Seota  *
62*5343Seota  * timer_taskq_min_num:
63*5343Seota  * timer_taskq_min_num represents the number of pre-populated taskq_ent
64*5343Seota  * structures, and this variable holds the same value as timer_taskq_num does.
65*5343Seota  *
66*5343Seota  * timer_taskq_max_num:
67*5343Seota  * Since TQ_SLEEP is set when taskq_dispatch() is called, the framework waits
68*5343Seota  * for one second if more taskq_ent structures than timer_taskq_max_num are
69*5343Seota  * required. However, from the timeout point of view, one second is much longer
70*5343Seota  * than expected, and to prevent this occurrence, timer_taskq_max_num should
71*5343Seota  * hold a sufficiently-large value, which is 128 here. Note that since the size
72*5343Seota  * of taskq_ent_t is relatively small, this doesn't use up the resource so much.
73*5343Seota  * (Currently the size is less than 8k at most)
74*5343Seota  *
75*5343Seota  * About the detailed explanation of the taskq function arguments, please see
76*5343Seota  * usr/src/uts/common/os/taskq.c.
775107Seota  */
78*5343Seota int timer_taskq_num = 4;		/* taskq thread number */
79*5343Seota int timer_taskq_min_num = 4;		/* min. number of taskq_ent structs */
80*5343Seota int timer_taskq_max_num = 128;		/* max. number of taskq_ent structs */
815107Seota static taskq_t *tm_taskq;		/* taskq thread pool */
825107Seota static kthread_t *tm_work_thread;	/* work thread invoking taskq */
835107Seota 
845107Seota /*
855107Seota  * timer variables
865107Seota  */
875107Seota static cyc_timer_t *ddi_timer;		/* ddi timer based on the cyclic */
885107Seota static volatile hrtime_t timer_hrtime;	/* current tick time on the timer */
895107Seota 
905107Seota /*
915107Seota  * Variable used for the suspend/resume.
925107Seota  */
935107Seota static volatile boolean_t timer_suspended;
945107Seota 
955107Seota /*
965107Seota  * Kernel taskq queue to ddi timer
975107Seota  */
985107Seota static list_t kern_queue;	/* kernel thread request queue */
995107Seota static kcondvar_t kern_cv;	/* condition variable for taskq queue */
1005107Seota 
1015107Seota /*
1025107Seota  * Software interrupt queue dedicated to ddi timer
1035107Seota  */
1045107Seota static list_t intr_queue;	/* software interrupt request queue */
1055107Seota static uint_t intr_state;	/* software interrupt state */
1065107Seota 
1075107Seota /*
1085107Seota  * This lock is used to protect the intr_queue and kern_queue.
1095107Seota  * It's also used to protect the intr_state which represents the software
1105107Seota  * interrupt state for the timer.
1115107Seota  */
1125107Seota static kmutex_t	disp_req_lock;
1135107Seota 
/*
 * Interrupt priority levels at which a periodic timeout handler can run.
 * TM_IPL_0 means kernel context; TM_IPL_1 through TM_IPL_10 name the
 * software interrupt level with the same number.
 */
enum {
	TM_IPL_0 = 0,	/* kernel context */
	TM_IPL_1 = 1,	/* level 1 */
	TM_IPL_2 = 2,	/* level 2 */
	TM_IPL_3 = 3,	/* level 3 */
	TM_IPL_4 = 4,	/* level 4 */
	TM_IPL_5 = 5,	/* level 5 */
	TM_IPL_6 = 6,	/* level 6 */
	TM_IPL_7 = 7,	/* level 7 */
	TM_IPL_8 = 8,	/* level 8 */
	TM_IPL_9 = 9,	/* level 9 */
	TM_IPL_10 = 10	/* level 10 */
};
1245107Seota 
1255107Seota /*
1265107Seota  * A callback handler used by CPR to stop and resume callouts.
1275107Seota  * Since the taskq uses TASKQ_CPR_SAFE, the function just set the boolean
1285107Seota  * flag to timer_suspended here.
1295107Seota  */
1305107Seota /*ARGSUSED*/
1315107Seota static boolean_t
1325107Seota timer_cpr_callb(void *arg, int code)
1335107Seota {
1345107Seota 	timer_suspended = (code == CB_CODE_CPR_CHKPT);
1355107Seota 	return (B_TRUE);
1365107Seota }
1375107Seota 
1385107Seota /*
1395107Seota  * Return a proposed timeout request id. add_req() determines whether
1405107Seota  * or not the proposed one is used. If it's not suitable, add_req()
1415107Seota  * recalls get_req_cnt(). To reduce the lock contention between the
1425107Seota  * timer and i_untimeout(), the atomic instruction should be used here.
1435107Seota  */
1445107Seota static timeout_t
1455107Seota get_req_cnt(void)
1465107Seota {
1475107Seota 	static volatile ulong_t timeout_cnt = 0;
1485107Seota 	return ((timeout_t)atomic_inc_ulong_nv(&timeout_cnt));
1495107Seota }
1505107Seota 
1515107Seota /*
1525107Seota  * Get the system resolution.
1535107Seota  * Note. currently there is a restriction about the system resolution, and
1545107Seota  * the 10ms tick (the default clock resolution) is only supported now.
1555107Seota  */
1565107Seota static hrtime_t
1575107Seota i_get_res(void)
1585107Seota {
1595107Seota 	return ((hrtime_t)10000000); /* 10ms tick only */
1605107Seota }
1615107Seota 
1625107Seota /*
1635107Seota  * Return the value for the cog of the timing wheel.
1645107Seota  * TICK_FACTOR is used to gain a finer cog on the clock resolution.
1655107Seota  */
1665107Seota static hrtime_t
1675107Seota tw_tick(hrtime_t time)
1685107Seota {
1695107Seota 	return ((time << TICK_FACTOR) / ddi_timer->res);
1705107Seota }
1715107Seota 
1725107Seota /*
1735107Seota  * Calculate the expiration time for the timeout request.
1745107Seota  */
1755107Seota static hrtime_t
1765107Seota expire_tick(tm_req_t *req)
1775107Seota {
1785107Seota 	return (tw_tick(req->exp_time));
1795107Seota }
1805107Seota 
1815107Seota /*
1825107Seota  * Register a timeout request to the timer. This function is used
1835107Seota  * in i_timeout().
1845107Seota  */
1855107Seota static timeout_t
1865107Seota add_req(tm_req_t *req)
1875107Seota {
1885107Seota 	timer_tw_t *tid, *tw;
1895107Seota 	tm_req_t *next;
1905107Seota 	timeout_t id;
1915107Seota 
1925107Seota retry:
1935107Seota 	/*
1945107Seota 	 * Retrieve a timeout request id. Since i_timeout() needs to return
1955107Seota 	 * a non-zero value, re-try if the zero is gotten.
1965107Seota 	 */
1975107Seota 	if ((id = get_req_cnt()) == 0)
1985107Seota 		id = get_req_cnt();
1995107Seota 
2005107Seota 	/*
2015107Seota 	 * Check if the id is not used yet. Since the framework now deals
2025107Seota 	 * with the periodic timeout requests, we cannot assume the id
2035107Seota 	 * allocated (long) before doesn't exist any more when it will
2045107Seota 	 * be re-assigned again (especially on 32bit) but need to handle
2055107Seota 	 * this case to solve the conflicts. If it's used already, retry
2065107Seota 	 * another.
2075107Seota 	 */
2085107Seota 	tid = &ddi_timer->idhash[TM_HASH((uintptr_t)id)];
2095107Seota 	mutex_enter(&tid->lock);
2105107Seota 	for (next = list_head(&tid->req); next != NULL;
2115107Seota 	    next = list_next(&tid->req, next)) {
2125107Seota 		if (next->id == id) {
2135107Seota 			mutex_exit(&tid->lock);
2145107Seota 			goto retry;
2155107Seota 		}
2165107Seota 	}
2175107Seota 	/* Nobody uses this id yet */
2185107Seota 	req->id = id;
2195107Seota 
2205107Seota 	/*
2215107Seota 	 * Register this request to the timer.
2225107Seota 	 * The list operation must be list_insert_head().
2235107Seota 	 * Other operations can degrade performance.
2245107Seota 	 */
2255107Seota 	list_insert_head(&tid->req, req);
2265107Seota 	mutex_exit(&tid->lock);
2275107Seota 
2285107Seota 	tw = &ddi_timer->exhash[TM_HASH(expire_tick(req))];
2295107Seota 	mutex_enter(&tw->lock);
2305107Seota 	/*
2315107Seota 	 * Other operations than list_insert_head() can
2325107Seota 	 * degrade performance here.
2335107Seota 	 */
2345107Seota 	list_insert_head(&tw->req, req);
2355107Seota 	mutex_exit(&tw->lock);
2365107Seota 
2375107Seota 	return (id);
2385107Seota }
2395107Seota 
2405107Seota /*
2415107Seota  * Periodic timeout requests cannot be removed until they are canceled
2425107Seota  * explicitly. Until then, they need to be re-registerd after they are
2435107Seota  * fired. transfer_req() re-registers the requests for the next fires.
2445107Seota  * Note. transfer_req() sends the cv_signal to timeout_execute(), which
2455107Seota  * runs in interrupt context. Make sure this function will not be blocked,
2465107Seota  * otherwise the deadlock situation can occur.
2475107Seota  */
2485107Seota static void
2495107Seota transfer_req(tm_req_t *req, timer_tw_t *tw)
2505107Seota {
2515107Seota 	timer_tw_t *new_tw;
2525107Seota 	hrtime_t curr_time;
2535107Seota 	ASSERT(tw && MUTEX_HELD(&tw->lock));
2545107Seota 
2555107Seota 	/* Calculate the next expiration time by interval */
2565107Seota 	req->exp_time += req->interval;
2575107Seota 	curr_time = gethrtime();
2585107Seota 
2595107Seota 	/*
2605107Seota 	 * If a long time (more than 1 clock resolution) has already
2615107Seota 	 * passed for some reason (e.g. debugger or high interrupt),
2625107Seota 	 * round up the next expiration to the appropriate one
2635107Seota 	 * since this request is periodic and never catches with it.
2645107Seota 	 */
2655107Seota 	if (curr_time - req->exp_time >= ddi_timer->res) {
2665107Seota 		req->exp_time = roundup(curr_time + req->interval,
2675107Seota 		    ddi_timer->res);
2685107Seota 	}
2695107Seota 
2705107Seota 	/*
2715107Seota 	 * Re-register this request.
2725107Seota 	 * Note. since it is guaranteed that the timer is invoked on only
2735107Seota 	 * one CPU at any time (by the cyclic subsystem), a deadlock
2745107Seota 	 * cannot occur regardless of the lock order here.
2755107Seota 	 */
2765107Seota 	new_tw = &ddi_timer->exhash[TM_HASH(expire_tick(req))];
2775107Seota 
2785107Seota 	/*
2795107Seota 	 * If it's on the timer cog already, there is nothing
2805107Seota 	 * to do. Just return.
2815107Seota 	 */
2825107Seota 	if (new_tw == tw)
2835107Seota 		return;
2845107Seota 
2855107Seota 	/* Remove this request from the timer */
2865107Seota 	list_remove(&tw->req, req);
2875107Seota 
2885107Seota 	/* Re-register this request to the timer */
2895107Seota 	mutex_enter(&new_tw->lock);
2905107Seota 
2915107Seota 	/*
2925107Seota 	 * Other operations than list_insert_head() can
2935107Seota 	 * degrade performance here.
2945107Seota 	 */
2955107Seota 	list_insert_head(&new_tw->req, req);
2965107Seota 	mutex_exit(&new_tw->lock);
2975107Seota 
2985107Seota 	/*
2995107Seota 	 * Set the TM_TRANSFER flag and notify the request is transfered
3005107Seota 	 * completely. This prevents a race in the case that this request
3015107Seota 	 * is serviced on another CPU already.
3025107Seota 	 */
3035107Seota 	mutex_enter(&req->lock);
3045107Seota 	req->flags |= TM_TRANSFER;
3055107Seota 	cv_signal(&req->cv);
3065107Seota 	mutex_exit(&req->lock);
3075107Seota }
3085107Seota 
3095107Seota /*
3105107Seota  * Execute timeout requests.
3115107Seota  * Note. since timeout_execute() can run in interrupt context and block
3125107Seota  * on condition variables, there are restrictions on the timer code that
3135107Seota  * signals these condition variables (see i_untimeout(), transfer_req(),
3145107Seota  * and condvar(9F)). Functions that signal these cvs must ensure that
3155107Seota  * they will not be blocked (for memory allocations or any other reason)
3165107Seota  * since condition variables don't support priority inheritance.
3175107Seota  */
3185107Seota static void
3195107Seota timeout_execute(void *arg)
3205107Seota {
3215107Seota 	tm_req_t *req = (tm_req_t *)arg;
3225107Seota 	ASSERT(req->flags & TM_INVOKING && !(req->flags & TM_EXECUTING));
3235107Seota 
3245107Seota 	for (;;) {
3255107Seota 		/*
3265107Seota 		 * Check if this request is canceled. If it's canceled, do not
3275107Seota 		 * execute this request.
3285107Seota 		 */
3295107Seota 		mutex_enter(&req->lock);
3305107Seota 		if (!(req->flags & TM_CANCEL)) {
3315107Seota 			/*
3325107Seota 			 * Set the current thread to prevent a dead lock
3335107Seota 			 * situation in case that this timeout request is
3345107Seota 			 * canceled in the handler being invoked now.
3355107Seota 			 * (this doesn't violate the spec) Set TM_EXECUTING
3365107Seota 			 * to show this handler is invoked soon.
3375107Seota 			 */
3385107Seota 			req->h_thread = curthread;
3395107Seota 			req->flags |= TM_EXECUTING;
3405107Seota 			mutex_exit(&req->lock);
3415107Seota 
3425107Seota 			/* The handler is invoked without holding any locks */
3435107Seota 			(*req->handler)(req->arg);
3445107Seota 
3455107Seota 			/*
3465107Seota 			 * Set TM_COMPLETE and notify the request is complete
3475107Seota 			 * now.
3485107Seota 			 */
3495107Seota 			mutex_enter(&req->lock);
3505107Seota 			req->flags |= TM_COMPLETE;
3515107Seota 			if (req->flags & TM_COMPWAIT)
3525107Seota 				cv_signal(&req->cv);
3535107Seota 		}
3545107Seota 
3555107Seota 		/*
3565107Seota 		 * The handler is invoked at this point. If this request
3575107Seota 		 * is not canceled, prepare for the next fire.
3585107Seota 		 */
3595107Seota 		if (req->flags & TM_CANCEL) {
3605107Seota 			timer_tw_t *tw;
3615107Seota 			/*
3625107Seota 			 * Wait until the timer finishes all things for
3635107Seota 			 * this request.
3645107Seota 			 */
3655107Seota 			while (!(req->flags & TM_TRANSFER))
3665107Seota 				cv_wait(&req->cv, &req->lock);
3675107Seota 			mutex_exit(&req->lock);
3685107Seota 			ASSERT(req->flags & TM_TRANSFER);
3695107Seota 
3705107Seota 			/* Remove this request from the timer */
3715107Seota 			tw = &ddi_timer->exhash[TM_HASH(expire_tick(req))];
3725107Seota 			mutex_enter(&tw->lock);
3735107Seota 			list_remove(&tw->req, req);
3745107Seota 			mutex_exit(&tw->lock);
3755107Seota 
3765107Seota 			/*
3775107Seota 			 * Wait until i_untimeout() can go ahead.
3785107Seota 			 * This prevents the request from being freed before
3795107Seota 			 * i_untimeout() is complete.
3805107Seota 			 */
3815107Seota 			mutex_enter(&req->lock);
3825107Seota 			while (req->flags & TM_COMPWAIT)
3835107Seota 				cv_wait(&req->cv, &req->lock);
3845107Seota 			mutex_exit(&req->lock);
3855107Seota 			ASSERT(!(req->flags & TM_COMPWAIT));
3865107Seota 
3875107Seota 			/* Free this request */
3885107Seota 			kmem_cache_free(req_cache, req);
3895107Seota 			return;
3905107Seota 		}
3915107Seota 		ASSERT(req->flags & TM_EXECUTING);
3925107Seota 
3935107Seota 		/*
3945107Seota 		 * TM_EXECUTING must be set at this point.
3955107Seota 		 * Unset the flag.
3965107Seota 		 */
3975107Seota 		req->flags &= ~(TM_EXECUTING | TM_TRANSFER);
3985107Seota 
3995107Seota 		/*
4005107Seota 		 * Decrease the request cnt. The reqest cnt shows
4015107Seota 		 * how many times this request is executed now.
4025107Seota 		 * If this counter becomes the zero, drop TM_INVOKING
4035107Seota 		 * to show there is no requests to do now.
4045107Seota 		 */
4055107Seota 		req->cnt--;
4065107Seota 		if (req->cnt == 0) {
4075107Seota 			req->flags &= ~TM_INVOKING;
4085107Seota 			mutex_exit(&req->lock);
4095107Seota 			return;
4105107Seota 		}
4115107Seota 		mutex_exit(&req->lock);
4125107Seota 	}
4135107Seota }
4145107Seota 
4155107Seota /*
4165107Seota  * Timeout worker thread for processing task queue.
4175107Seota  */
4185107Seota static void
4195107Seota timeout_taskq_thread(void *arg)
4205107Seota {
4215107Seota 	_NOTE(ARGUNUSED(arg));
4225107Seota 	tm_req_t *kern_req;
4235107Seota 	callb_cpr_t cprinfo;
4245107Seota 
4255107Seota 	CALLB_CPR_INIT(&cprinfo, &disp_req_lock, callb_generic_cpr,
4265107Seota 	    "timeout_taskq_thread");
4275107Seota 
4285107Seota 	/*
4295107Seota 	 * This thread is wakened up when a new request is added to
4305107Seota 	 * the queue. Then pick up all requests and dispatch them
4315107Seota 	 * via taskq_dispatch().
4325107Seota 	 */
4335107Seota 	for (;;) {
4345107Seota 		/*
4355107Seota 		 * Check the queue and pick up a request if the queue
4365107Seota 		 * is not NULL.
4375107Seota 		 */
4385107Seota 		mutex_enter(&disp_req_lock);
4395107Seota 		while ((kern_req = list_head(&kern_queue)) == NULL) {
4405107Seota 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
4415107Seota 			cv_wait(&kern_cv, &disp_req_lock);
4425107Seota 			CALLB_CPR_SAFE_END(&cprinfo, &disp_req_lock);
4435107Seota 		}
4445107Seota 		list_remove(&kern_queue, kern_req);
4455107Seota 		mutex_exit(&disp_req_lock);
4465107Seota 
4475107Seota 		/* Execute the timeout request via the taskq thread */
4485107Seota 		(void) taskq_dispatch(tm_taskq, timeout_execute,
4495107Seota 		    (void *)kern_req, TQ_SLEEP);
4505107Seota 	}
4515107Seota }
4525107Seota 
4535107Seota /*
4545107Seota  * Dispatch the timeout request based on the level specified.
4555107Seota  * If the level is equal to zero, notify the worker thread to
4565107Seota  * call taskq_dispatch() in kernel context. If the level is bigger
4575107Seota  * than zero, add a software interrupt request to the queue and raise
4585107Seota  * the interrupt level to the specified one.
4595107Seota  */
4605107Seota static void
4615107Seota timeout_dispatch(tm_req_t *req)
4625107Seota {
4635107Seota 	int level = req->level;
4645107Seota 	extern void sir_on(int);
4655107Seota 
4665107Seota 	if (level == TM_IPL_0) {
4675107Seota 		/* Add a new request to the tail */
4685107Seota 		mutex_enter(&disp_req_lock);
4695107Seota 		list_insert_tail(&kern_queue, req);
4705107Seota 		mutex_exit(&disp_req_lock);
4715107Seota 
4725107Seota 		/*
4735107Seota 		 * notify the worker thread that this request
4745107Seota 		 * is newly added to the queue.
4755107Seota 		 * Note. this cv_signal() can be called after the
4765107Seota 		 * mutex_lock.
4775107Seota 		 */
4785107Seota 		cv_signal(&kern_cv);
4795107Seota 	} else {
4805107Seota 		/* Add a new request to the tail */
4815107Seota 		mutex_enter(&disp_req_lock);
4825107Seota 		list_insert_tail(&intr_queue, req);
4835107Seota 
4845107Seota 		/* Issue the software interrupt */
4855107Seota 		if (intr_state & TM_INTR_START(level)) {
4865107Seota 			/*
4875107Seota 			 * timer_softintr() is already running; no need to
4885107Seota 			 * raise a siron. Due to lock protection of
4895107Seota 			 * the intr_queue and intr_state, we know that
4905107Seota 			 * timer_softintr() will see the new addition to
4915107Seota 			 * the intr_queue.
4925107Seota 			 */
4935107Seota 			mutex_exit(&disp_req_lock);
4945107Seota 		} else {
4955107Seota 			intr_state |= TM_INTR_SET(level);
4965107Seota 			mutex_exit(&disp_req_lock);
4975107Seota 
4985107Seota 			/* Raise an interrupt to execute timeout requests */
4995107Seota 			sir_on(level);
5005107Seota 		}
5015107Seota 	}
5025107Seota }
5035107Seota 
5045107Seota /*
5055107Seota  * Check the software interrupt queue and invoke requests at the specified
5065107Seota  * interrupt level.
5075107Seota  * Note that the queue may change during call so that the disp_req_lock
5085107Seota  * and the intr_state are used to protect it.
5095107Seota  * The software interrupts supported here are up to the level 10. Higher
5105107Seota  * than 10 interrupts cannot be supported.
5115107Seota  */
5125107Seota void
5135107Seota timer_softintr(int level)
5145107Seota {
5155107Seota 	tm_req_t *intr_req;
5165107Seota 	ASSERT(level >= TM_IPL_1 && level <= TM_IPL_10);
5175107Seota 
5185107Seota 	/* Check if we are asked to process the softcall list */
5195107Seota 	mutex_enter(&disp_req_lock);
5205107Seota 	if (!(intr_state & TM_INTR_SET(level))) {
5215107Seota 		mutex_exit(&disp_req_lock);
5225107Seota 		return;
5235107Seota 	}
5245107Seota 
5255107Seota 	/* Notify this software interrupt request will be executed soon */
5265107Seota 	intr_state |= TM_INTR_START(level);
5275107Seota 	intr_state &= ~TM_INTR_SET(level);
5285107Seota 
5295107Seota 	/* loop the link until there is no requests */
5305107Seota 	for (intr_req = list_head(&intr_queue); intr_req != NULL;
5315107Seota 	    /* Nothing */) {
5325107Seota 
5335107Seota 		/* Check the interrupt level */
5345107Seota 		if (intr_req->level != level) {
5355107Seota 			intr_req = list_next(&intr_queue, intr_req);
5365107Seota 			continue;
5375107Seota 		}
5385107Seota 		list_remove(&intr_queue, intr_req);
5395107Seota 		mutex_exit(&disp_req_lock);
5405107Seota 
5415107Seota 		/* Execute the software interrupt request */
5425107Seota 		timeout_execute(intr_req);
5435107Seota 
5445107Seota 		mutex_enter(&disp_req_lock);
5455107Seota 		/* Restart the loop since new requests might be added */
5465107Seota 		intr_req = list_head(&intr_queue);
5475107Seota 	}
5485107Seota 
5495107Seota 	/* reset the interrupt state */
5505107Seota 	intr_state &= ~TM_INTR_START(level);
5515107Seota 	mutex_exit(&disp_req_lock);
5525107Seota }
5535107Seota 
5545107Seota /*
5555107Seota  *  void
5565107Seota  *  cyclic_timer(void)
5575107Seota  *
5585107Seota  *  Overview
5595107Seota  *   cyclic_timer() is a function invoked periodically by the cyclic
5605107Seota  *   subsystem.
5615107Seota  *
5625107Seota  *   The function calls timeout_invoke() with timeout requests whose
5635107Seota  *   expiration time is already reached.
5645107Seota  *
5655107Seota  *  Arguments
5665107Seota  *   Nothing
5675107Seota  *
5685107Seota  *  Return value
5695107Seota  *   Nothing
5705107Seota  */
5715107Seota void
5725107Seota cyclic_timer(void)
5735107Seota {
5745107Seota 	tm_req_t *req;
5755107Seota 	timer_tw_t *tw;
5765107Seota 	hrtime_t curr_tick, curr;
5775107Seota 
5785107Seota 	/* If the system is suspended, just return */
5795107Seota 	if (timer_suspended)
5805107Seota 		return;
5815107Seota 
5825107Seota 	/* Get the current time */
5835107Seota 	timer_hrtime = ddi_timer->tick_time = curr = gethrtime();
5845107Seota 	curr_tick = tw_tick(ddi_timer->tick_time);
5855107Seota 
5865107Seota restart:
5875107Seota 	/*
5885107Seota 	 * Check the timer cogs to see if there are timeout requests
5895107Seota 	 * who reach the expiration time. Call timeout_invoke() to execute
5905107Seota 	 * the requests, then.
5915107Seota 	 */
5925107Seota 	while (curr_tick >= ddi_timer->tick) {
5935107Seota 		tm_req_t *next;
5945107Seota 		tw = &ddi_timer->exhash[TM_HASH(ddi_timer->tick)];
5955107Seota 		mutex_enter(&tw->lock);
5965107Seota 		for (req = list_head(&tw->req); req != NULL; req = next) {
5975107Seota 			next = list_next(&tw->req, req);
5985107Seota 			/*
5995107Seota 			 * If this request is already obsolete, free
6005107Seota 			 * it here.
6015107Seota 			 */
6025107Seota 			if (req->flags & TM_UTMCOMP) {
6035107Seota 				/*
6045107Seota 				 * Remove this request from the timer,
6055107Seota 				 * then free it.
6065107Seota 				 */
6075107Seota 				list_remove(&tw->req, req);
6085107Seota 				kmem_cache_free(req_cache, req);
6095107Seota 			} else if (curr >= req->exp_time) {
6105107Seota 				mutex_enter(&req->lock);
6115107Seota 				/*
6125107Seota 				 * Check if this request is canceled, but not
6135107Seota 				 * being executed now.
6145107Seota 				 */
6155107Seota 				if (req->flags & TM_CANCEL &&
6165107Seota 				    !(req->flags & TM_INVOKING)) {
6175107Seota 					mutex_exit(&req->lock);
6185107Seota 					continue;
6195107Seota 				}
6205107Seota 				/*
6215107Seota 				 * Record how many times timeout_execute()
6225107Seota 				 * must be invoked.
6235107Seota 				 */
6245107Seota 				req->cnt++;
6255107Seota 				/*
6265107Seota 				 * Invoke timeout_execute() via taskq or
6275107Seota 				 * software interrupt.
6285107Seota 				 */
6295107Seota 				if (req->flags & TM_INVOKING) {
6305107Seota 					/*
6315107Seota 					 * If it's already invoked,
6325107Seota 					 * There is nothing to do.
6335107Seota 					 */
6345107Seota 					mutex_exit(&req->lock);
6355107Seota 				} else {
6365107Seota 					req->flags |= TM_INVOKING;
6375107Seota 					mutex_exit(&req->lock);
6385107Seota 					/*
6395107Seota 					 * Dispatch this timeout request.
6405107Seota 					 * timeout_dispatch() chooses either
6415107Seota 					 * a software interrupt or taskq thread
6425107Seota 					 * based on the level.
6435107Seota 					 */
6445107Seota 					timeout_dispatch(req);
6455107Seota 				}
6465107Seota 				/*
6475107Seota 				 * Periodic timeout requests must prepare for
6485107Seota 				 * the next fire.
6495107Seota 				 */
6505107Seota 				transfer_req(req, tw);
6515107Seota 			}
6525107Seota 		}
6535107Seota 		mutex_exit(&tw->lock);
6545107Seota 		ddi_timer->tick++;
6555107Seota 	}
6565107Seota 
6575107Seota 	/*
6585107Seota 	 * Check the current time. If we spend some amount of time,
6595107Seota 	 * double-check if some of the requests reaches the expiration
6605107Seota 	 * time during the work.
6615107Seota 	 */
6625107Seota 	curr = gethrtime();
6635107Seota 	curr_tick = tw_tick(curr);
6645107Seota 	if (curr_tick >= ddi_timer->tick) {
6655107Seota 		ddi_timer->tick -= 1;
6665107Seota 		goto restart;
6675107Seota 	}
6685107Seota 	/* Adjustment for the next rolling */
6695107Seota 	ddi_timer->tick -= 1;
6705107Seota }
6715107Seota 
6725107Seota /*
6735107Seota  *  void
6745107Seota  *  timer_init(void)
6755107Seota  *
6765107Seota  *  Overview
6775107Seota  *    timer_init() allocates the internal data structures used by
6785107Seota  *    i_timeout(), i_untimeout() and the timer.
6795107Seota  *
6805107Seota  *  Arguments
6815107Seota  *    Nothing
6825107Seota  *
6835107Seota  *  Return value
6845107Seota  *    Nothing
6855107Seota  *
6865107Seota  *  Caller's context
6875107Seota  *    timer_init() can be called in kernel context only.
6885107Seota  */
6895107Seota void
6905107Seota timer_init(void)
6915107Seota {
6925107Seota 	int i;
6935107Seota 
6945107Seota 	/* Create kmem_cache for timeout requests */
6955107Seota 	req_cache = kmem_cache_create("timeout_request", sizeof (tm_req_t),
6965107Seota 	    0, NULL, NULL, NULL, NULL, NULL, 0);
6975107Seota 
6985107Seota 	/* Initialize the timer which is invoked by the cyclic subsystem */
6995107Seota 	ddi_timer = kmem_alloc(sizeof (cyc_timer_t), KM_SLEEP);
7005107Seota 	ddi_timer->res = nsec_per_tick;
7015107Seota 	ddi_timer->tick = tw_tick(gethrtime());
7025107Seota 	ddi_timer->tick_time = 0;
7035107Seota 
7045107Seota 	/* Initialize the timing wheel */
7055107Seota 	bzero((char *)&ddi_timer->idhash[0], TM_HASH_SZ * sizeof (timer_tw_t));
7065107Seota 	bzero((char *)&ddi_timer->exhash[0], TM_HASH_SZ * sizeof (timer_tw_t));
7075107Seota 
7085107Seota 	for (i = 0; i < TM_HASH_SZ; i++) {
7095107Seota 		list_create(&ddi_timer->idhash[i].req, sizeof (tm_req_t),
7105107Seota 		    offsetof(tm_req_t, id_req));
7115107Seota 		mutex_init(&ddi_timer->idhash[i].lock, NULL, MUTEX_ADAPTIVE,
7125107Seota 		    NULL);
7135107Seota 
7145107Seota 		list_create(&ddi_timer->exhash[i].req, sizeof (tm_req_t),
7155107Seota 		    offsetof(tm_req_t, ex_req));
7165107Seota 		mutex_init(&ddi_timer->exhash[i].lock, NULL, MUTEX_ADAPTIVE,
7175107Seota 		    NULL);
7185107Seota 	}
7195107Seota 
7205107Seota 	/* Create a taskq thread pool */
7215107Seota 	tm_taskq = taskq_create_instance("timeout_taskq", 0,
7225107Seota 	    timer_taskq_num, MAXCLSYSPRI,
7235265Seota 	    timer_taskq_min_num, timer_taskq_max_num,
7245107Seota 	    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
7255107Seota 
7265107Seota 	/*
7275107Seota 	 * Initialize the taskq queue which is dedicated to this timeout
7285107Seota 	 * interface/timer.
7295107Seota 	 */
7305107Seota 	list_create(&kern_queue, sizeof (tm_req_t),
7315107Seota 	    offsetof(tm_req_t, disp_req));
7325107Seota 
7335107Seota 	/* Create a worker thread to dispatch the taskq thread */
7345107Seota 	tm_work_thread = thread_create(NULL, 0, timeout_taskq_thread, NULL,
7355107Seota 	    0, &p0, TS_RUN, MAXCLSYSPRI);
7365107Seota 
7375107Seota 	/*
7385107Seota 	 * Initialize the software interrupt queue which is dedicated to
7395107Seota 	 * this timeout interface/timer.
7405107Seota 	 */
7415107Seota 	list_create(&intr_queue, sizeof (tm_req_t),
7425107Seota 	    offsetof(tm_req_t, disp_req));
7435107Seota 
7445107Seota 	/*
7455107Seota 	 * Initialize the mutex lock used for both of kern_queue and
7465107Seota 	 * intr_queue.
7475107Seota 	 */
7485107Seota 	mutex_init(&disp_req_lock, NULL, MUTEX_ADAPTIVE, NULL);
7495107Seota 	cv_init(&kern_cv, NULL, CV_DEFAULT, NULL);
7505107Seota 
7515107Seota 	/* Register the callback handler for the system suspend/resume */
7525107Seota 	(void) callb_add(timer_cpr_callb, 0, CB_CL_CPR_CALLOUT, "cyclicTimer");
7535107Seota }
7545107Seota 
7555107Seota /*
7565107Seota  *  timeout_t
7575107Seota  *  i_timeout(void (*func)(void *), void *arg,  hrtime_t interval,
7585107Seota  *      int level, int flags)
7595107Seota  *
7605107Seota  *  Overview
7615107Seota  *    i_timeout() is an internal function scheduling the passed function
7625107Seota  *    to be invoked in the interval in nanoseconds. The callback function
7635107Seota  *    keeps invoked until the request is explicitly canceled by i_untimeout().
7645107Seota  *    This function is used for ddi_periodic_add(9F).
7655107Seota  *
7665107Seota  *  Arguments
7675107Seota  *
7685107Seota  *    func: the callback function
7695107Seota  *          the callback function will be invoked in kernel context if
7705107Seota  *          the level passed is the zero. Otherwise be invoked in interrupt
7715107Seota  *          context at the specified level by the argument "level".
7725107Seota  *
7735107Seota  *          Note that It's guaranteed by the cyclic subsystem that the
7745107Seota  *          function is invoked on the only one CPU and is never executed
7755107Seota  *          simultaneously even on MP system.
7765107Seota  *
7775107Seota  *     arg: the argument passed to the callback function
7785107Seota  *
7795107Seota  * interval: interval time in nanoseconds
7805107Seota  *          if the interval is the zero, the timer resolution is used.
7815107Seota  *
7825107Seota  *  level : callback interrupt level
7835107Seota  *          If the value is 0 (the zero), the callback function is invoked
7845107Seota  *          in kernel context. If the value is more than 0 (the zero), but
7855107Seota  *          less than or equal to 10, the callback function is invoked in
7865107Seota  *          interrupt context at the specified interrupt level.
7875107Seota  *          This value must be in range of 0-10.
7885107Seota  *
7895107Seota  *  Return value
7905107Seota  *    returns a non-zero opaque value (timeout_t) on success.
7915107Seota  *
7925107Seota  *  Caller's context
7935107Seota  *    i_timeout() can be called in user, kernel or interrupt context.
7945107Seota  *    It cannot be called in high interrupt context.
7955107Seota  *
7965107Seota  *  Note. This function is used by ddi_periodic_add(), which cannot
7975107Seota  *  be called in interrupt context. As a result, this function is called
7985107Seota  *  in user or kernel context only in practice.
7995107Seota  *
8005107Seota  */
8015107Seota timeout_t
8025107Seota i_timeout(void (*func)(void *), void *arg, hrtime_t interval, int level)
8035107Seota {
8045107Seota 	hrtime_t start_time = gethrtime(), res;
8055107Seota 	tm_req_t *req = NULL;
8065107Seota 
8075107Seota 	/* Allocate and initialize the timeout request */
8085107Seota 	req = kmem_cache_alloc(req_cache, KM_SLEEP);
8095107Seota 	req->handler = func;
8105107Seota 	req->arg = arg;
8115107Seota 	req->h_thread = NULL;
8125107Seota 	req->level = level;
8135107Seota 	req->flags = 0;
8145107Seota 	req->cnt = 0;
8155107Seota 	mutex_init(&req->lock, NULL, MUTEX_ADAPTIVE, NULL);
8165107Seota 	cv_init(&req->cv, NULL, CV_DEFAULT, NULL);
8175107Seota 
8185107Seota 	/*
8195107Seota 	 * The resolution must be finer than or equal to
8205107Seota 	 * the requested interval. If it's not, set the resolution
8215107Seota 	 * to the interval.
8225107Seota 	 * Note. There is a restriction currently. Regardless of the
8235107Seota 	 * clock resolution used here, 10ms is set as the timer resolution.
8245107Seota 	 * Even on the 1ms resolution timer, the minimum interval is 10ms.
8255107Seota 	 */
8265107Seota 	if ((res = i_get_res()) > interval) {
8275107Seota 		uintptr_t pc = (uintptr_t)req->handler;
8285107Seota 		ulong_t off;
8295107Seota 		cmn_err(CE_WARN,
8305107Seota 		    "The periodic timeout (handler=%s, interval=%lld) "
8315107Seota 		    "requests a finer interval than the supported resolution. "
8325107Seota 		    "It rounds up to %lld\n", kobj_getsymname(pc, &off),
8335107Seota 		    interval, res);
8345107Seota 		interval = res;
8355107Seota 	}
8365107Seota 
8375107Seota 	/*
8385107Seota 	 * If the specified interval is already multiples of
8395107Seota 	 * the resolution, use it as is. Otherwise, it rounds
8405107Seota 	 * up to multiples of the timer resolution.
8415107Seota 	 */
8425107Seota 	req->interval = roundup(interval, i_get_res());
8435107Seota 
8445107Seota 	/*
8455107Seota 	 * For the periodic timeout requests, the first expiration time will
8465107Seota 	 * be adjusted to the timer tick edge to take advantage of the cyclic
8475107Seota 	 * subsystem. In that case, the first fire is likely not an expected
8485107Seota 	 * one, but the fires later can be more accurate due to this.
8495107Seota 	 */
8505107Seota 	req->exp_time = roundup(start_time + req->interval, i_get_res());
8515107Seota 
8525107Seota 	/* Add the request to the timer */
8535107Seota 	return (add_req(req));
8545107Seota }
8555107Seota 
8565107Seota /*
8575107Seota  *  void
8585107Seota  *  i_untimeout(timeout_t req)
8595107Seota  *
8605107Seota  *  Overview
8615107Seota  *    i_untimeout() is an internal function canceling the i_timeout()
8625107Seota  *    request previously issued.
8635107Seota  *    This function is used for ddi_periodic_delete(9F).
8645107Seota  *
8655107Seota  *  Argument
8665107Seota  *      req: timeout_t opaque value i_timeout() returned previously.
8675107Seota  *
8685107Seota  *  Return value
8695107Seota  *      Nothing.
8705107Seota  *
8715107Seota  *  Caller's context
8725107Seota  *    i_untimeout() can be called in user, kernel or interrupt context.
8735107Seota  *    It cannot be called in high interrupt context.
8745107Seota  *
8755107Seota  *  Note. This function is used by ddi_periodic_delete(), which cannot
8765107Seota  *  be called in interrupt context. As a result, this function is called
8775107Seota  *  in user or kernel context only in practice. Also i_untimeout() sends
8785107Seota  *  the cv_signal to timeout_execute(), which runs in interrupt context.
8795107Seota  *  Make sure this function will not be blocked, otherwise the deadlock
8805107Seota  *  situation can occur. See timeout_execute().
8815107Seota  */
8825107Seota void
8835107Seota i_untimeout(timeout_t timeout_req)
8845107Seota {
8855107Seota 	timer_tw_t *tid;
8865107Seota 	tm_req_t *req;
8875107Seota 	timeout_t id;
8885107Seota 
8895107Seota 	/* Retrieve the id for this timeout request */
8905107Seota 	id = (timeout_t)timeout_req;
8915107Seota 	tid = &ddi_timer->idhash[TM_HASH((uintptr_t)id)];
8925107Seota 
8935107Seota 	mutex_enter(&tid->lock);
8945107Seota 	for (req = list_head(&tid->req); req != NULL;
8955107Seota 	    req = list_next(&tid->req, req)) {
8965107Seota 		if (req->id == id)
8975107Seota 			break;
8985107Seota 	}
8995107Seota 	if (req == NULL) {
9005107Seota 		/* There is no requests with this id after all */
9015107Seota 		mutex_exit(&tid->lock);
9025107Seota 		return;
9035107Seota 	}
9045107Seota 	mutex_enter(&req->lock);
9055107Seota 
9065107Seota 	/* Unregister this request first */
9075107Seota 	list_remove(&tid->req, req);
9085107Seota 
9095107Seota 	/* Notify that this request is canceled */
9105107Seota 	req->flags |= TM_CANCEL;
9115107Seota 
9125107Seota 	/* Check if the handler is invoked */
9135107Seota 	if (req->flags & TM_INVOKING) {
9145107Seota 		/*
9155107Seota 		 * If this request is not yet executed or is already finished
9165107Seota 		 * then there is nothing to do but just return. Otherwise
9175107Seota 		 * we'll have to wait for the callback execution being complete.
9185107Seota 		 */
9195107Seota 		if (!(req->flags & TM_EXECUTING) || req->flags & TM_COMPLETE) {
9205107Seota 			/* There is nothing to do any more */
9215107Seota 			mutex_exit(&req->lock);
9225107Seota 			mutex_exit(&tid->lock);
9235107Seota 			return;
9245107Seota 		}
9255107Seota 
9265107Seota 		/*
9275107Seota 		 * If this is the recursive call, there is nothing
9285107Seota 		 * to do any more. This is the case that i_untimeout()
9295107Seota 		 * is called in the handler.
9305107Seota 		 */
9315107Seota 		if (req->h_thread == curthread) {
9325107Seota 			mutex_exit(&req->lock);
9335107Seota 			mutex_exit(&tid->lock);
9345107Seota 			return;
9355107Seota 		}
9365107Seota 
9375107Seota 		/*
9385107Seota 		 * Notify that i_untimeout() is waiting until this request
9395107Seota 		 * is complete.
9405107Seota 		 */
9415107Seota 		req->flags |= TM_COMPWAIT;
9425107Seota 		mutex_exit(&tid->lock);
9435107Seota 
9445107Seota 		/*
9455107Seota 		 * Wait for this timeout request being complete before
9465107Seota 		 * the return.
9475107Seota 		 */
9485107Seota 		while (!(req->flags & TM_COMPLETE))
9495107Seota 			cv_wait(&req->cv, &req->lock);
9505107Seota 		req->flags &= ~TM_COMPWAIT;
9515107Seota 		cv_signal(&req->cv);
9525107Seota 		mutex_exit(&req->lock);
9535107Seota 		return;
9545107Seota 	}
9555107Seota 	mutex_exit(&req->lock);
9565107Seota 	mutex_exit(&tid->lock);
9575107Seota 
9585107Seota 	/*
9595107Seota 	 * Notify untimeout() is about to be finished, and this request
9605107Seota 	 * can be freed.
9615107Seota 	 */
9625107Seota 	atomic_or_uint(&req->flags, TM_UTMCOMP);
9635107Seota }
964