xref: /onnv-gate/usr/src/uts/sun4u/sunfire/io/sysctrl_quiesce.c (revision 1341:6d7c4f090a72)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This workaround inhibits prom_printf after the cpus are grabbed.
 * This can be removed when 4154263 is corrected.
 */
#define	Bug_4154263

/*
 * A CPR derivative specifically for sunfire
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/machparam.h>
#include <sys/machsystm.h>
#include <sys/ddi.h>
#define	SUNDDI_IMPL
#include <sys/sunddi.h>
#include <sys/time.h>
#include <sys/kmem.h>
#include <nfs/lm.h>
#include <sys/ddi_impldefs.h>
#include <sys/obpdefs.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/callb.h>
#include <sys/clock.h>
#include <sys/x_call.h>
#include <sys/cpuvar.h>
#include <sys/epm.h>
#include <sys/vfs.h>
#include <sys/fhc.h>
#include <sys/sysctrl.h>
#include <sys/promif.h>
#include <sys/conf.h>
#include <sys/modctl.h>
#include <sys/cyclic.h>
#include <sys/sunndi.h>

static enum sysctrl_suspend_state {
	SYSC_STATE_BEGIN = 0,
	SYSC_STATE_USER,
	SYSC_STATE_DAEMON,
	SYSC_STATE_DRIVER,
	SYSC_STATE_FULL } suspend_state;

static int	pstate_save;
static uint_t	sysctrl_gate[NCPU];
int	sysctrl_quiesce_debug = FALSE;
static int	sysctrl_skip_kernel_threads = TRUE;

/*
 * sysctrl_skip_user_threads controls whether user threads should be
 * suspended.  If sysctrl_skip_user_threads is true, the remaining
 * flags are not used; if it is false, sysctrl_check_user_stop_result
 * controls whether or not we need to check the suspend result, and
 * sysctrl_allow_blocked_threads controls whether or not we allow the
 * suspend to continue if there are blocked threads.  We allow all
 * combinations of sysctrl_check_user_stop_result and
 * sysctrl_allow_blocked_threads, even though it might not make much
 * sense to disallow blocked threads when we don't even check the
 * stop result.
 */
static int	sysctrl_skip_user_threads = 0;		/* default to FALSE */
static int	sysctrl_check_user_stop_result = 1;	/* default to TRUE */
static int	sysctrl_allow_blocked_threads = 1;	/* default to TRUE */
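
/*
 * Tuning sketch (illustrative, not from the original source): these
 * flags are plain module globals, so they could presumably be set
 * from /etc/system, e.g.
 *
 *	set sysctrl:sysctrl_skip_user_threads = 1
 *
 * The "sysctrl" module name here is an assumption based on this
 * file's location; verify against the actual module linkage.
 */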

static int	sysc_watchdog_suspended;

extern int	sysctrl_enable_detach_suspend;
static int	sysc_lastval;

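/*
 * Debug output is routed to the PROM: an illustrative call such as
 * DEBUGP(errp("froze cpu %d\n", i)) compiles to a prom_printf
 * guarded by the sysctrl_quiesce_debug flag.
 */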
#define	DEBUGP(p) { if (sysctrl_quiesce_debug) p; }
#define	errp	prom_printf

#define	SYSC_CPU_LOOP_MSEC	1000

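/*
 * Quiesce handshake (summary of the code below; the slave side lives
 * in sysctrl_freeze, external to this file): the master zeroes
 * sysctrl_gate[], cross-calls every other cpu, and spins until each
 * target sets its own gate entry, panicking if a cpu misses the
 * deadline.  The frozen cpus presumably spin on the master's gate
 * entry, which sysctrl_release_cpus() sets to let them go.
 */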
static void
sysctrl_grab_cpus(void)
{
	int		i;
	cpuset_t	others;
	extern cpuset_t	cpu_ready_set;
	extern void	sysctrl_freeze(void);
	uint64_t	sysc_tick_limit;
	uint64_t	sysc_current_tick;
	uint64_t	sysc_tick_deadline;

	extern u_longlong_t	gettick(void);

	for (i = 0; i < NCPU; i++)
		sysctrl_gate[i] = 0;

	/* tell other cpus to go quiet and wait for continue signal */
	others = cpu_ready_set;
	CPUSET_DEL(others, CPU->cpu_id);
	xt_some(others, (xcfunc_t *)sysctrl_freeze, (uint64_t)sysctrl_gate,
		(uint64_t)(&sysctrl_gate[CPU->cpu_id]));

	sysc_tick_limit =
		((uint64_t)sys_tick_freq * SYSC_CPU_LOOP_MSEC) / 1000;
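
	/*
	 * Illustrative arithmetic (the frequency is hypothetical): at
	 * a 100 MHz tick rate the limit above works out to
	 * (100000000 * 1000) / 1000 = 100 million ticks, i.e. one
	 * second's worth of ticks per cpu.
	 */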

	/* wait for each cpu to check in */
	for (i = 0; i < NCPU; i++) {
		if (!CPU_IN_SET(others, i))
			continue;

		/*
		 * Get current tick value and calculate the deadline tick
		 */
		sysc_current_tick = gettick();
		sysc_tick_deadline = sysc_current_tick + sysc_tick_limit;

		while (sysctrl_gate[i] == 0) {
			/* if the system is panicking, don't wait */
			if (panicstr)
				break;

			/* panic if the cpu has not responded by the deadline */
			sysc_current_tick = gettick();
			if (sysc_current_tick >= sysc_tick_deadline) {
			    cmn_err(CE_PANIC, "sysctrl: cpu %d not "
				"responding to quiesce command", i);
			}
		}
	}

	/* now even our interrupts are disabled -- really quiet now */
	pstate_save = disable_vec_intr();
}

static void
sysctrl_release_cpus(void)
{
	/* let the other cpus go */
	sysctrl_gate[CPU->cpu_id] = 1;

	/* restore our interrupts too */
	enable_vec_intr(pstate_save);
}
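
/*
 * Usage sketch (illustrative only): the quiesce sequence pairs the
 * two calls above around the window where no other cpu may run:
 *
 *	sysctrl_grab_cpus();
 *	... hardware manipulation while fully quiesced ...
 *	sysctrl_release_cpus();
 */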

static void
sysctrl_stop_intr(void)
{
	mutex_enter(&cpu_lock);
	kpreempt_disable();
	cyclic_suspend();
}

static void
sysctrl_enable_intr(void)
{
	cyclic_resume();
	(void) spl0();
	kpreempt_enable();
	mutex_exit(&cpu_lock);
}

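/*
 * Note that sysctrl_stop_intr()/sysctrl_enable_intr() bracket the
 * quiesced window: cpu_lock is held and cyclics stay suspended from
 * the suspend path until sysctrl_enable_intr() runs during resume.
 *
 * sysctrl_is_real_device() decides whether a node needs a real
 * suspend/resume cycle: nodes with no driver bound are skipped, the
 * PM flags can force the answer either way, and otherwise the
 * presence of a "reg" property is taken as the sign of real hardware.
 */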
static int
sysctrl_is_real_device(dev_info_t *dip)
{
	struct regspec *regbuf;
	int length;
	int rc;

	if (ddi_get_driver(dip) == NULL)
		return (FALSE);

	if (DEVI(dip)->devi_pm_flags & (PMC_NEEDS_SR|PMC_PARENTAL_SR))
		return (TRUE);
	if (DEVI(dip)->devi_pm_flags & PMC_NO_SR)
		return (FALSE);

	/*
	 * now the general case
	 */
	rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "reg",
		(caddr_t)&regbuf, &length);
	ASSERT(rc != DDI_PROP_NO_MEMORY);
	if (rc != DDI_PROP_SUCCESS) {
		return (FALSE);
	} else {
		kmem_free(regbuf, length);
		return (TRUE);
	}
}

static dev_info_t *failed_driver;
static char device_path[MAXPATHLEN];

static int
sysctrl_suspend_devices(dev_info_t *dip, sysc_cfga_pkt_t *pkt)
{
	int circ;

	ASSERT(dip == NULL || ddi_get_parent(dip) == NULL ||
	    DEVI_BUSY_OWNED(ddi_get_parent(dip)));

	failed_driver = NULL;
	for (; dip != NULL; dip = ddi_get_next_sibling(dip)) {
		/*
		 * Hold parent busy while walking child list
		 */
		ndi_devi_enter(dip, &circ);
		if (sysctrl_suspend_devices(ddi_get_child(dip), pkt)) {
			ndi_devi_exit(dip, circ);
			return (ENXIO);
		}
		ndi_devi_exit(dip, circ);

		if (!sysctrl_is_real_device(dip))
			continue;

		/*
		 * Safe to call ddi_pathname() as parent is held busy
		 */
		(void) ddi_pathname(dip, device_path);
		DEBUGP(errp(" suspending device %s\n", device_path));
		if (devi_detach(dip, DDI_SUSPEND) != DDI_SUCCESS) {
			DEBUGP(errp("  unable to suspend device %s\n",
				device_path));

			(void) strncpy(pkt->errbuf, device_path,
				SYSC_OUTPUT_LEN);
			SYSC_ERR_SET(pkt, SYSC_ERR_SUSPEND);
			ndi_hold_devi(dip);
			failed_driver = dip;
			return (ENXIO);
		}
	}

	return (DDI_SUCCESS);
}

static void
sysctrl_resume_devices(dev_info_t *start, sysc_cfga_pkt_t *pkt)
{
	int		circ;
	dev_info_t	*dip, *next, *last = NULL;

	ASSERT(start == NULL || ddi_get_parent(start) == NULL ||
	    DEVI_BUSY_OWNED(ddi_get_parent(start)));

	/* attach in reverse device tree order */
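	/*
	 * Each pass below walks the sibling list to find the node just
	 * before `last', so siblings are resumed from the tail of the
	 * list back to `start' -- the mirror image of the suspend walk
	 * above.  The node that failed to suspend is released rather
	 * than resumed, since it was never suspended; nodes after the
	 * failure point are skipped while failed_driver is still set.
	 */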
	while (last != start) {
		dip = start;
		next = ddi_get_next_sibling(dip);
		while (next != last && dip != failed_driver) {
			dip = next;
			next = ddi_get_next_sibling(dip);
		}
		if (dip == failed_driver) {
			failed_driver = NULL;
			ndi_rele_devi(dip);
		} else if (sysctrl_is_real_device(dip) &&
		    failed_driver == NULL) {
			/*
			 * Parent dip is held busy, so ddi_pathname() can
			 * be safely called.
			 */
			(void) ddi_pathname(dip, device_path);
			DEBUGP(errp(" resuming device %s\n", device_path));
			if (devi_attach(dip, DDI_RESUME) != DDI_SUCCESS) {
				/*
				 * XXX - if in the future we decide not to
				 * panic the system, we need to set the error
				 * SYSC_ERR_RESUME here and also change the
				 * cfgadm platform library.
				 */
				cmn_err(CE_PANIC, "Unable to resume device %s",
					device_path);
			}
		}
		ndi_devi_enter(dip, &circ);
		sysctrl_resume_devices(ddi_get_child(dip), pkt);
		ndi_devi_exit(dip, circ);

		last = dip;
	}
}

/*
 * True if the thread is virtually stopped.  Similar to CPR_VSTOPPED
 * but from the DR point of view.  These user threads are waiting in
 * the kernel.  Once they complete in the kernel, they will process
 * the stop signal and stop.
 */
#define	SYSCTRL_VSTOPPED(t)		\
	((t)->t_state == TS_SLEEP &&	\
	(t)->t_wchan != NULL &&		\
	(t)->t_astflag &&		\
	((t)->t_proc_flag & TP_CHKPT))
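
/*
 * Illustrative case: a thread sleeping interruptibly on a condition
 * variable inside a system call, with TP_CHKPT set and an AST
 * posted, counts as stopped here even though it has not yet reached
 * its stop point.
 */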

static int
sysctrl_stop_user_threads(sysc_cfga_pkt_t *pkt)
{
	int		count;
	char		cache_psargs[PSARGSZ];
	kthread_id_t	cache_tp;
	uint_t		cache_t_state;
	int		bailout;
	pid_t		pid;

	extern void add_one_utstop();
	extern void utstop_timedwait(clock_t);
	extern void utstop_init(void);

#define	SYSCTRL_UTSTOP_RETRY	4
#define	SYSCTRL_UTSTOP_WAIT	hz

	if (sysctrl_skip_user_threads)
		return (DDI_SUCCESS);

	utstop_init();

	/* we need to try a few times to get past fork, etc. */
	for (count = 0; count < SYSCTRL_UTSTOP_RETRY; count++) {
		kthread_id_t tp;

		/* walk the entire threadlist */
		mutex_enter(&pidlock);
		for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) {
			proc_t *p = ttoproc(tp);

			/* handle kernel threads separately */
			if (p->p_as == &kas || p->p_stat == SZOMB)
				continue;

			mutex_enter(&p->p_lock);
			thread_lock(tp);

			if (tp->t_state == TS_STOPPED) {
				/* add another reason to stop this thread */
				tp->t_schedflag &= ~TS_RESUME;
			} else {
				tp->t_proc_flag |= TP_CHKPT;

				thread_unlock(tp);
				mutex_exit(&p->p_lock);
				add_one_utstop();
				mutex_enter(&p->p_lock);
				thread_lock(tp);

				aston(tp);

				if (tp->t_state == TS_SLEEP &&
				    (tp->t_flag & T_WAKEABLE)) {
					setrun_locked(tp);
				}
			}

			/* grab thread if needed */
			if (tp->t_state == TS_ONPROC && tp->t_cpu != CPU)
				poke_cpu(tp->t_cpu->cpu_id);

			thread_unlock(tp);
			mutex_exit(&p->p_lock);
		}
		mutex_exit(&pidlock);

		/* let everything catch up */
		utstop_timedwait(count * count * SYSCTRL_UTSTOP_WAIT);
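		/*
		 * The quadratic backoff above waits 0, hz, 4*hz and
		 * 9*hz ticks over the SYSCTRL_UTSTOP_RETRY passes.
		 */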

		/* now, walk the threadlist again to see if we are done */
		mutex_enter(&pidlock);
		for (tp = curthread->t_next, bailout = 0;
		    bailout == 0 && tp != curthread; tp = tp->t_next) {
			proc_t *p = ttoproc(tp);

			/* handle kernel threads separately */
			if (p->p_as == &kas || p->p_stat == SZOMB)
				continue;

			/*
			 * If this thread didn't stop, and we don't allow
			 * unstopped blocked threads, bail.
			 */
			thread_lock(tp);
			if (!CPR_ISTOPPED(tp) &&
			    !(sysctrl_allow_blocked_threads &&
			    SYSCTRL_VSTOPPED(tp))) {

				/* nope, cache the details for later */
				bcopy(p->p_user.u_psargs, cache_psargs,
				    sizeof (cache_psargs));
				cache_tp = tp;
				cache_t_state = tp->t_state;
				bailout = 1;
				pid = p->p_pidp->pid_id;
			}
			thread_unlock(tp);
		}
		mutex_exit(&pidlock);

		/* were all the threads stopped? */
		if (!bailout)
			break;
	}

	/* were we unable to stop all threads after a few tries? */
	if (bailout) {
		(void) sprintf(pkt->errbuf, "process: %s id: %d state: %x"
		    " thread descriptor: %p",
		    cache_psargs, (int)pid, cache_t_state,
		    (void *)cache_tp);

		SYSC_ERR_SET(pkt, SYSC_ERR_UTHREAD);

		return (ESRCH);
	}

	return (DDI_SUCCESS);
}
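
/*
 * Checkpoint the CPR daemon callbacks, then verify that every kernel
 * thread (other than interrupt threads) is accounted for; any thread
 * that is not stopped reports its name in the error buffer.  A
 * failure return leaves the callback table locked; sysctrl_resume()
 * unlocks it.
 */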

static int
sysctrl_stop_kernel_threads(sysc_cfga_pkt_t *pkt)
{
	caddr_t		name;
	kthread_id_t	tp;

	if (sysctrl_skip_kernel_threads) {
		return (DDI_SUCCESS);
	}

	/*
	 * Note: we unlock the table in resume.
	 * We only need to lock the callback table if we are actually
	 * suspending kernel threads.
	 */
	callb_lock_table();
	if ((name = callb_execute_class(CB_CL_CPR_DAEMON,
	    CB_CODE_CPR_CHKPT)) != (caddr_t)NULL) {

		(void) strncpy(pkt->errbuf, name, SYSC_OUTPUT_LEN);
		SYSC_ERR_SET(pkt, SYSC_ERR_KTHREAD);
		return (EBUSY);
	}

	/*
	 * Verify that all threads are accounted for
	 */
	mutex_enter(&pidlock);
	for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) {
		proc_t	*p = ttoproc(tp);

		if (p->p_as != &kas)
			continue;

		if (tp->t_flag & T_INTR_THREAD)
			continue;

		if (!callb_is_stopped(tp, &name)) {
			mutex_exit(&pidlock);
			(void) strncpy(pkt->errbuf, name, SYSC_OUTPUT_LEN);
			SYSC_ERR_SET(pkt, SYSC_ERR_KTHREAD);
			return (EBUSY);
		}
	}

	mutex_exit(&pidlock);
	return (DDI_SUCCESS);
}

static void
sysctrl_start_user_threads(void)
{
	kthread_id_t tp;

	mutex_enter(&pidlock);

	/* walk all threads and release them */
	for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) {
		proc_t *p = ttoproc(tp);

		/* skip kernel threads */
		if (p->p_as == &kas)
			continue;

		mutex_enter(&p->p_lock);
		tp->t_proc_flag &= ~TP_CHKPT;
		mutex_exit(&p->p_lock);

		thread_lock(tp);
		if (CPR_ISTOPPED(tp)) {
			/* back on the runq */
			tp->t_schedflag |= TS_RESUME;
			setrun_locked(tp);
		}
		thread_unlock(tp);
	}

	mutex_exit(&pidlock);
}

static void
sysctrl_signal_user(int sig)
{
	struct proc *p;

	mutex_enter(&pidlock);

	for (p = practive; p != NULL; p = p->p_next) {
		/* only user processes */
		if (p->p_exec == NULL || p->p_stat == SZOMB ||
		    p == proc_init || p == ttoproc(curthread))
			continue;

		mutex_enter(&p->p_lock);
		sigtoproc(p, NULL, sig);
		mutex_exit(&p->p_lock);
	}

	mutex_exit(&pidlock);

	/* add a bit of delay */
	delay(hz);
}
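
/*
 * Undo the suspend in reverse order: the switch falls through from
 * whatever state sysctrl_suspend() reached, so a partial suspend is
 * unwound from exactly where it stopped.
 */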

void
sysctrl_resume(sysc_cfga_pkt_t *pkt)
{
#ifndef	Bug_4154263
	DEBUGP(errp("resume system...\n"));
#endif
	switch (suspend_state) {
	case SYSC_STATE_FULL:
		/*
		 * release all the other cpus
		 */
#ifndef	Bug_4154263
		DEBUGP(errp("release cpus..."));
#endif
		sysctrl_release_cpus();
		DEBUGP(errp("cpus resumed...\n"));

		/*
		 * If we suspended the hw watchdog at suspend time,
		 * re-enable it now.
		 */
		if (sysc_watchdog_suspended) {
			mutex_enter(&tod_lock);
			tod_ops.tod_set_watchdog_timer(
				watchdog_timeout_seconds);
			mutex_exit(&tod_lock);
		}

		/*
		 * resume callout
		 */
		(void) callb_execute_class(CB_CL_CPR_RPC, CB_CODE_CPR_RESUME);
		(void) callb_execute_class(CB_CL_CPR_CALLOUT,
			CB_CODE_CPR_RESUME);
		sysctrl_enable_intr();
		/* FALLTHROUGH */

	case SYSC_STATE_DRIVER:
		/*
		 * resume drivers
		 */
		DEBUGP(errp("resume drivers..."));
		sysctrl_resume_devices(ddi_root_node(), pkt);
		DEBUGP(errp("done\n"));

		/*
		 * resume the lock manager
		 */
		lm_cprresume();

		/* FALLTHROUGH */

	case SYSC_STATE_DAEMON:
		/*
		 * resume kernel daemons
		 */
		if (!sysctrl_skip_kernel_threads) {
			DEBUGP(errp("starting kernel daemons..."));
			(void) callb_execute_class(CB_CL_CPR_DAEMON,
				CB_CODE_CPR_RESUME);
			callb_unlock_table();
			DEBUGP(errp("done\n"));
		}

		/* FALLTHROUGH */

	case SYSC_STATE_USER:
		/*
		 * finally, resume user threads
		 */
		if (!sysctrl_skip_user_threads) {
			DEBUGP(errp("starting user threads..."));
			sysctrl_start_user_threads();
			DEBUGP(errp("done\n"));
		}
		/* FALLTHROUGH */

	case SYSC_STATE_BEGIN:
	default:
		/*
		 * let those who care know that we've just resumed
		 */
		DEBUGP(errp("sending SIGTHAW..."));
		sysctrl_signal_user(SIGTHAW);
		DEBUGP(errp("done\n"));
		break;
	}

	/* Restore sysctrl detach/suspend to its original value */
	sysctrl_enable_detach_suspend = sysc_lastval;

	DEBUGP(errp("system state restored\n"));
}

void
sysctrl_suspend_prepare(void)
{
	/*
	 * We use a function, lm_cprsuspend(), in the suspend flow that
	 * is redirected to a module through the modstubs mechanism.
	 * If the module is currently not loaded, modstubs attempts
	 * the modload.  The context this happens in below causes the
	 * module load to block forever, so this function must be called
	 * in the normal system call context ahead of time.
	 */
	(void) modload("misc", "klmmod");
}
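
/*
 * Suspend sequence (mirrors the states unwound above): stop user
 * threads, stop kernel daemons, sync and suspend the lock manager,
 * suspend drivers, checkpoint callouts and stop cyclics, quiet the
 * hardware watchdog, then freeze every other cpu.  Any failure
 * resumes from the state reached so far and returns the error.
 */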

int
sysctrl_suspend(sysc_cfga_pkt_t *pkt)
{
	int rc = DDI_SUCCESS;

	/* enable sysctrl detach/suspend function */
	sysc_lastval = sysctrl_enable_detach_suspend;
	sysctrl_enable_detach_suspend = 1;

	/*
	 * first, stop all user threads
	 */
	DEBUGP(errp("\nstopping user threads..."));
	suspend_state = SYSC_STATE_USER;
	if (((rc = sysctrl_stop_user_threads(pkt)) != DDI_SUCCESS) &&
	    sysctrl_check_user_stop_result) {
		sysctrl_resume(pkt);
		return (rc);
	}
	DEBUGP(errp("done\n"));

	/*
	 * now stop daemon activities
	 */
	DEBUGP(errp("stopping kernel daemons..."));
	suspend_state = SYSC_STATE_DAEMON;
	if (rc = sysctrl_stop_kernel_threads(pkt)) {
		sysctrl_resume(pkt);
		return (rc);
	}
	DEBUGP(errp("done\n"));

	/*
	 * This sync swaps out all user pages
	 */
	vfs_sync(SYNC_ALL);

	/*
	 * special treatment for lock manager
	 */
	lm_cprsuspend();

	/*
	 * sync the file system in case we never make it back
	 */
	sync();

	/*
	 * now suspend drivers
	 */
	DEBUGP(errp("suspending drivers..."));
	suspend_state = SYSC_STATE_DRIVER;
	if (rc = sysctrl_suspend_devices(ddi_root_node(), pkt)) {
		sysctrl_resume(pkt);
		return (rc);
	}
	DEBUGP(errp("done\n"));

	/*
	 * handle the callout table
	 */
	sysctrl_stop_intr();

	(void) callb_execute_class(CB_CL_CPR_CALLOUT, CB_CODE_CPR_CHKPT);

	/*
	 * if watchdog was activated, disable it
	 */
	if (watchdog_activated) {
		mutex_enter(&tod_lock);
		tod_ops.tod_clear_watchdog_timer();
		mutex_exit(&tod_lock);
		sysc_watchdog_suspended = 1;
	} else {
		sysc_watchdog_suspended = 0;
	}

	/*
	 * finally, grab all cpus
	 */
	DEBUGP(errp("freezing all cpus...\n"));
	suspend_state = SYSC_STATE_FULL;
	sysctrl_grab_cpus();
#ifndef	Bug_4154263
	DEBUGP(errp("done\n"));

	DEBUGP(errp("system is quiesced\n"));
#endif

	return (rc);
}