xref: /onnv-gate/usr/src/uts/common/disp/cmt.c (revision 8969:6dd9da355f1d)
13434Sesaxe /*
23434Sesaxe  * CDDL HEADER START
33434Sesaxe  *
43434Sesaxe  * The contents of this file are subject to the terms of the
53434Sesaxe  * Common Development and Distribution License (the "License").
63434Sesaxe  * You may not use this file except in compliance with the License.
73434Sesaxe  *
83434Sesaxe  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
93434Sesaxe  * or http://www.opensolaris.org/os/licensing.
103434Sesaxe  * See the License for the specific language governing permissions
113434Sesaxe  * and limitations under the License.
123434Sesaxe  *
133434Sesaxe  * When distributing Covered Code, include this CDDL HEADER in each
143434Sesaxe  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
153434Sesaxe  * If applicable, add the following below this CDDL HEADER, with the
163434Sesaxe  * fields enclosed by brackets "[]" replaced with your own identifying
173434Sesaxe  * information: Portions Copyright [yyyy] [name of copyright owner]
183434Sesaxe  *
193434Sesaxe  * CDDL HEADER END
203434Sesaxe  */
213434Sesaxe /*
228689SEric.Saxe@Sun.COM  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
233434Sesaxe  * Use is subject to license terms.
243434Sesaxe  */
253434Sesaxe 
263434Sesaxe #include <sys/systm.h>
273434Sesaxe #include <sys/types.h>
283434Sesaxe #include <sys/param.h>
293434Sesaxe #include <sys/thread.h>
303434Sesaxe #include <sys/cpuvar.h>
313434Sesaxe #include <sys/cpupart.h>
323434Sesaxe #include <sys/kmem.h>
333434Sesaxe #include <sys/cmn_err.h>
343434Sesaxe #include <sys/kstat.h>
353434Sesaxe #include <sys/processor.h>
363434Sesaxe #include <sys/disp.h>
373434Sesaxe #include <sys/group.h>
383434Sesaxe #include <sys/pghw.h>
393434Sesaxe #include <sys/bitset.h>
403434Sesaxe #include <sys/lgrp.h>
413434Sesaxe #include <sys/cmt.h>
428906SEric.Saxe@Sun.COM #include <sys/cpu_pm.h>
433434Sesaxe 
443434Sesaxe /*
453434Sesaxe  * CMT scheduler / dispatcher support
463434Sesaxe  *
473434Sesaxe  * This file implements CMT scheduler support using Processor Groups.
483434Sesaxe  * The CMT processor group class creates and maintains the CMT class
493434Sesaxe  * specific processor group pg_cmt_t.
503434Sesaxe  *
513434Sesaxe  * ---------------------------- <-- pg_cmt_t *
523434Sesaxe  * | pghw_t                   |
533434Sesaxe  * ----------------------------
543434Sesaxe  * | CMT class specific data  |
553434Sesaxe  * | - hierarchy linkage      |
563434Sesaxe  * | - CMT load balancing data|
573434Sesaxe  * | - active CPU group/bitset|
583434Sesaxe  * ----------------------------
593434Sesaxe  *
603434Sesaxe  * The scheduler/dispatcher leverages knowledge of the performance
613434Sesaxe  * relevant CMT sharing relationships existing between cpus to implement
628906SEric.Saxe@Sun.COM  * optimized affinity, load balancing, and coalescence policies.
633434Sesaxe  *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Affinity
 * policies seek to improve cache and TLB utilization. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
683434Sesaxe  *
693434Sesaxe  * The CMT PGs created by this class are already arranged into a
703434Sesaxe  * hierarchy (which is done in the pghw layer). To implement the top-down
713434Sesaxe  * CMT load balancing algorithm, the CMT PGs additionally maintain
723434Sesaxe  * parent, child and sibling hierarchy relationships.
733434Sesaxe  * Parent PGs always contain a superset of their children(s) resources,
743434Sesaxe  * each PG can have at most one parent, and siblings are the group of PGs
753434Sesaxe  * sharing the same parent.
763434Sesaxe  *
 * On NUMA systems, the CMT load balancing algorithm balances across the
 * CMT PGs within their respective lgroups. On UMA based systems, there
 * exists a top level group of PGs to balance across. On NUMA systems multiple
 * top level groups are instantiated, where the top level balancing begins by
 * balancing across the CMT PGs within their respective (per lgroup) top level
 * groups.
833434Sesaxe  */
843676Sesaxe static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
853676Sesaxe static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
863676Sesaxe 						/* used for null_proc_lpa */
878906SEric.Saxe@Sun.COM cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */
883434Sesaxe 
893676Sesaxe static int		is_cpu0 = 1; /* true if this is boot CPU context */
903676Sesaxe 
913676Sesaxe /*
928906SEric.Saxe@Sun.COM  * Array of hardware sharing relationships that are blacklisted.
938906SEric.Saxe@Sun.COM  * PGs won't be instantiated for blacklisted hardware sharing relationships.
948906SEric.Saxe@Sun.COM  */
958906SEric.Saxe@Sun.COM static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];
968906SEric.Saxe@Sun.COM 
/*
 * CMT scheduling is disabled when this is non-zero.
 * /etc/system will be too late, so this must be set via kmdb -d.
 */
int			cmt_sched_disabled = 0;
1023434Sesaxe 
1033434Sesaxe static pg_cid_t		pg_cmt_class_id;		/* PG class id */
1043434Sesaxe 
1053434Sesaxe static pg_t		*pg_cmt_alloc();
1063434Sesaxe static void		pg_cmt_free(pg_t *);
1073434Sesaxe static void		pg_cmt_cpu_init(cpu_t *);
1083434Sesaxe static void		pg_cmt_cpu_fini(cpu_t *);
1093434Sesaxe static void		pg_cmt_cpu_active(cpu_t *);
1103434Sesaxe static void		pg_cmt_cpu_inactive(cpu_t *);
1113434Sesaxe static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
1123434Sesaxe static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
1138906SEric.Saxe@Sun.COM static char		*pg_cmt_policy_name(pg_t *);
1148906SEric.Saxe@Sun.COM static void		pg_cmt_hier_sort(pg_cmt_t **, int);
1158906SEric.Saxe@Sun.COM static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
1163434Sesaxe static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
1173434Sesaxe static int		pg_cmt_hw(pghw_type_t);
1183434Sesaxe static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
1193676Sesaxe static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
1208906SEric.Saxe@Sun.COM static int		pg_cmt_lineage_validate(pg_cmt_t **, int *);
1218906SEric.Saxe@Sun.COM static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
1228906SEric.Saxe@Sun.COM 			    kthread_t *, kthread_t *);
1238906SEric.Saxe@Sun.COM static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
1248906SEric.Saxe@Sun.COM 			    kthread_t *, kthread_t *);
1258906SEric.Saxe@Sun.COM static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
1263434Sesaxe 
/*
 * Evaluates to non-zero when the PG's class id is the CMT class id,
 * i.e. when the PG is managed by the CMT PG class.
 */
#define	IS_CMT_PG(pg)	(pg_cmt_class_id == ((pg_t *)(pg))->pg_class->pgc_id)
1313434Sesaxe 
1323434Sesaxe /*
1338906SEric.Saxe@Sun.COM  * Status codes for CMT lineage validation
1348906SEric.Saxe@Sun.COM  * See cmt_lineage_validate() below
1358906SEric.Saxe@Sun.COM  */
1368906SEric.Saxe@Sun.COM typedef enum cmt_lineage_validation {
1378906SEric.Saxe@Sun.COM 	CMT_LINEAGE_VALID,
1388906SEric.Saxe@Sun.COM 	CMT_LINEAGE_NON_CONCENTRIC,
1398906SEric.Saxe@Sun.COM 	CMT_LINEAGE_REPAIRED,
1408906SEric.Saxe@Sun.COM 	CMT_LINEAGE_UNRECOVERABLE
1418906SEric.Saxe@Sun.COM } cmt_lineage_validation_t;
1428906SEric.Saxe@Sun.COM 
1438906SEric.Saxe@Sun.COM /*
1448906SEric.Saxe@Sun.COM  * Status of the current lineage under construction.
1458906SEric.Saxe@Sun.COM  * One must be holding cpu_lock to change this.
1468906SEric.Saxe@Sun.COM  */
1478906SEric.Saxe@Sun.COM static cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;
1488906SEric.Saxe@Sun.COM 
/*
 * A hardware sharing relationship is "suspect" if it is a power domain:
 * on x86 power domain definitions come from ACPI, and may therefore be
 * subject to BIOS bugs.
 */
#define	PG_CMT_HW_SUSPECT(hw)	(PGHW_IS_PM_DOMAIN(hw))
1548906SEric.Saxe@Sun.COM 
1558906SEric.Saxe@Sun.COM /*
1563434Sesaxe  * CMT PG ops
1573434Sesaxe  */
1583434Sesaxe struct pg_ops pg_ops_cmt = {
1593434Sesaxe 	pg_cmt_alloc,
1603434Sesaxe 	pg_cmt_free,
1613434Sesaxe 	pg_cmt_cpu_init,
1623434Sesaxe 	pg_cmt_cpu_fini,
1633434Sesaxe 	pg_cmt_cpu_active,
1643434Sesaxe 	pg_cmt_cpu_inactive,
1653434Sesaxe 	pg_cmt_cpupart_in,
1663434Sesaxe 	NULL,			/* cpupart_out */
1673434Sesaxe 	pg_cmt_cpupart_move,
1683434Sesaxe 	pg_cmt_cpu_belongs,
1698906SEric.Saxe@Sun.COM 	pg_cmt_policy_name,
1703434Sesaxe };
1713434Sesaxe 
1723434Sesaxe /*
1733434Sesaxe  * Initialize the CMT PG class
1743434Sesaxe  */
1753434Sesaxe void
1763434Sesaxe pg_cmt_class_init(void)
1773434Sesaxe {
1783434Sesaxe 	if (cmt_sched_disabled)
1793434Sesaxe 		return;
1803434Sesaxe 
1813434Sesaxe 	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
1823434Sesaxe }
1833434Sesaxe 
1843434Sesaxe /*
1853434Sesaxe  * Called to indicate a new CPU has started up so
1863434Sesaxe  * that either t0 or the slave startup thread can
1873434Sesaxe  * be accounted for.
1883434Sesaxe  */
1893434Sesaxe void
1903434Sesaxe pg_cmt_cpu_startup(cpu_t *cp)
1913434Sesaxe {
1928906SEric.Saxe@Sun.COM 	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
1938906SEric.Saxe@Sun.COM 	    cp->cpu_thread);
1943434Sesaxe }
1953434Sesaxe 
1963434Sesaxe /*
1973434Sesaxe  * Return non-zero if thread can migrate between "from" and "to"
1983434Sesaxe  * without a performance penalty
1993434Sesaxe  */
2003434Sesaxe int
2013434Sesaxe pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
2023434Sesaxe {
2033434Sesaxe 	if (from->cpu_physid->cpu_cacheid ==
2043434Sesaxe 	    to->cpu_physid->cpu_cacheid)
2053434Sesaxe 		return (1);
2063434Sesaxe 	return (0);
2073434Sesaxe }
2083434Sesaxe 
2093434Sesaxe /*
2103434Sesaxe  * CMT class specific PG allocation
2113434Sesaxe  */
2123434Sesaxe static pg_t *
2133434Sesaxe pg_cmt_alloc(void)
2143434Sesaxe {
2153434Sesaxe 	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
2163434Sesaxe }
2173434Sesaxe 
2183434Sesaxe /*
2193434Sesaxe  * Class specific PG de-allocation
2203434Sesaxe  */
2213434Sesaxe static void
2223434Sesaxe pg_cmt_free(pg_t *pg)
2233434Sesaxe {
2243434Sesaxe 	ASSERT(pg != NULL);
2253434Sesaxe 	ASSERT(IS_CMT_PG(pg));
2263434Sesaxe 
2273434Sesaxe 	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
2283434Sesaxe }
2293434Sesaxe 
2303434Sesaxe /*
2318906SEric.Saxe@Sun.COM  * Given a hardware sharing relationship, return which dispatcher
2328906SEric.Saxe@Sun.COM  * policies should be implemented to optimize performance and efficiency
2338906SEric.Saxe@Sun.COM  */
2348906SEric.Saxe@Sun.COM static pg_cmt_policy_t
2358906SEric.Saxe@Sun.COM pg_cmt_policy(pghw_type_t hw)
2368906SEric.Saxe@Sun.COM {
2378906SEric.Saxe@Sun.COM 	pg_cmt_policy_t p;
2388906SEric.Saxe@Sun.COM 
2398906SEric.Saxe@Sun.COM 	/*
2408906SEric.Saxe@Sun.COM 	 * Give the platform a chance to override the default
2418906SEric.Saxe@Sun.COM 	 */
2428906SEric.Saxe@Sun.COM 	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
2438906SEric.Saxe@Sun.COM 		return (p);
2448906SEric.Saxe@Sun.COM 
2458906SEric.Saxe@Sun.COM 	switch (hw) {
2468906SEric.Saxe@Sun.COM 	case PGHW_IPIPE:
2478906SEric.Saxe@Sun.COM 	case PGHW_FPU:
2488906SEric.Saxe@Sun.COM 	case PGHW_CHIP:
2498906SEric.Saxe@Sun.COM 		return (CMT_BALANCE);
2508906SEric.Saxe@Sun.COM 	case PGHW_CACHE:
2518906SEric.Saxe@Sun.COM 		return (CMT_AFFINITY);
2528906SEric.Saxe@Sun.COM 	case PGHW_POW_ACTIVE:
2538906SEric.Saxe@Sun.COM 	case PGHW_POW_IDLE:
2548906SEric.Saxe@Sun.COM 		return (CMT_BALANCE);
2558906SEric.Saxe@Sun.COM 	default:
2568906SEric.Saxe@Sun.COM 		return (CMT_NO_POLICY);
2578906SEric.Saxe@Sun.COM 	}
2588906SEric.Saxe@Sun.COM }
2598906SEric.Saxe@Sun.COM 
2608906SEric.Saxe@Sun.COM /*
2618906SEric.Saxe@Sun.COM  * Rank the importance of optimizing for the pg1 relationship vs.
2628906SEric.Saxe@Sun.COM  * the pg2 relationship.
2638906SEric.Saxe@Sun.COM  */
2648906SEric.Saxe@Sun.COM static pg_cmt_t *
2658906SEric.Saxe@Sun.COM pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
2668906SEric.Saxe@Sun.COM {
2678906SEric.Saxe@Sun.COM 	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
2688906SEric.Saxe@Sun.COM 	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;
2698906SEric.Saxe@Sun.COM 
2708906SEric.Saxe@Sun.COM 	/*
2718906SEric.Saxe@Sun.COM 	 * A power domain is only important if CPUPM is enabled.
2728906SEric.Saxe@Sun.COM 	 */
2738906SEric.Saxe@Sun.COM 	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
2748906SEric.Saxe@Sun.COM 		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
2758906SEric.Saxe@Sun.COM 			return (pg2);
2768906SEric.Saxe@Sun.COM 		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
2778906SEric.Saxe@Sun.COM 			return (pg1);
2788906SEric.Saxe@Sun.COM 	}
2798906SEric.Saxe@Sun.COM 
2808906SEric.Saxe@Sun.COM 	/*
2818906SEric.Saxe@Sun.COM 	 * Otherwise, ask the platform
2828906SEric.Saxe@Sun.COM 	 */
2838906SEric.Saxe@Sun.COM 	if (pg_plat_hw_rank(hw1, hw2) == hw1)
2848906SEric.Saxe@Sun.COM 		return (pg1);
2858906SEric.Saxe@Sun.COM 	else
2868906SEric.Saxe@Sun.COM 		return (pg2);
2878906SEric.Saxe@Sun.COM }
2888906SEric.Saxe@Sun.COM 
2898906SEric.Saxe@Sun.COM /*
2908906SEric.Saxe@Sun.COM  * Initialize CMT callbacks for the given PG
2918906SEric.Saxe@Sun.COM  */
2928906SEric.Saxe@Sun.COM static void
2938906SEric.Saxe@Sun.COM cmt_callback_init(pg_t *pg)
2948906SEric.Saxe@Sun.COM {
2958906SEric.Saxe@Sun.COM 	switch (((pghw_t *)pg)->pghw_hw) {
2968906SEric.Saxe@Sun.COM 	case PGHW_POW_ACTIVE:
2978906SEric.Saxe@Sun.COM 		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
2988906SEric.Saxe@Sun.COM 		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
2998906SEric.Saxe@Sun.COM 		break;
3008906SEric.Saxe@Sun.COM 	default:
3018906SEric.Saxe@Sun.COM 		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
3028906SEric.Saxe@Sun.COM 
3038906SEric.Saxe@Sun.COM 	}
3048906SEric.Saxe@Sun.COM }
3058906SEric.Saxe@Sun.COM 
3068906SEric.Saxe@Sun.COM /*
3078906SEric.Saxe@Sun.COM  * Promote PG above it's current parent.
3088906SEric.Saxe@Sun.COM  * This is only legal if PG has an equal or greater number of CPUs
3098906SEric.Saxe@Sun.COM  * than it's parent.
3103434Sesaxe  */
3118906SEric.Saxe@Sun.COM static void
3128906SEric.Saxe@Sun.COM cmt_hier_promote(pg_cmt_t *pg)
3133434Sesaxe {
3148906SEric.Saxe@Sun.COM 	pg_cmt_t	*parent;
3158906SEric.Saxe@Sun.COM 	group_t		*children;
3168906SEric.Saxe@Sun.COM 	cpu_t		*cpu;
3178906SEric.Saxe@Sun.COM 	group_iter_t	iter;
3188906SEric.Saxe@Sun.COM 	pg_cpu_itr_t	cpu_iter;
3198906SEric.Saxe@Sun.COM 	int		r;
3208906SEric.Saxe@Sun.COM 	int		err;
3218906SEric.Saxe@Sun.COM 
3228906SEric.Saxe@Sun.COM 	ASSERT(MUTEX_HELD(&cpu_lock));
3238906SEric.Saxe@Sun.COM 
3248906SEric.Saxe@Sun.COM 	parent = pg->cmt_parent;
3258906SEric.Saxe@Sun.COM 	if (parent == NULL) {
3268906SEric.Saxe@Sun.COM 		/*
3278906SEric.Saxe@Sun.COM 		 * Nothing to do
3288906SEric.Saxe@Sun.COM 		 */
3298906SEric.Saxe@Sun.COM 		return;
3308906SEric.Saxe@Sun.COM 	}
3318906SEric.Saxe@Sun.COM 
3328906SEric.Saxe@Sun.COM 	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));
3338906SEric.Saxe@Sun.COM 
3348906SEric.Saxe@Sun.COM 	/*
3358906SEric.Saxe@Sun.COM 	 * We're changing around the hierarchy, which is actively traversed
3368906SEric.Saxe@Sun.COM 	 * by the dispatcher. Pause CPUS to ensure exclusivity.
3378906SEric.Saxe@Sun.COM 	 */
3388906SEric.Saxe@Sun.COM 	pause_cpus(NULL);
3398906SEric.Saxe@Sun.COM 
3408906SEric.Saxe@Sun.COM 	/*
3418906SEric.Saxe@Sun.COM 	 * If necessary, update the parent's sibling set, replacing parent
3428906SEric.Saxe@Sun.COM 	 * with PG.
3438906SEric.Saxe@Sun.COM 	 */
3448906SEric.Saxe@Sun.COM 	if (parent->cmt_siblings) {
3458906SEric.Saxe@Sun.COM 		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
3468906SEric.Saxe@Sun.COM 		    != -1) {
3478906SEric.Saxe@Sun.COM 			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
3488906SEric.Saxe@Sun.COM 			ASSERT(r != -1);
3498906SEric.Saxe@Sun.COM 		}
3508906SEric.Saxe@Sun.COM 	}
3518906SEric.Saxe@Sun.COM 
3528906SEric.Saxe@Sun.COM 	/*
3538906SEric.Saxe@Sun.COM 	 * If the parent is at the top of the hierarchy, replace it's entry
3548906SEric.Saxe@Sun.COM 	 * in the root lgroup's group of top level PGs.
3558906SEric.Saxe@Sun.COM 	 */
3568906SEric.Saxe@Sun.COM 	if (parent->cmt_parent == NULL &&
3578906SEric.Saxe@Sun.COM 	    parent->cmt_siblings != &cmt_root->cl_pgs) {
3588906SEric.Saxe@Sun.COM 		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
3598906SEric.Saxe@Sun.COM 		    != -1) {
3608906SEric.Saxe@Sun.COM 			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
3618906SEric.Saxe@Sun.COM 			ASSERT(r != -1);
3628906SEric.Saxe@Sun.COM 		}
3638906SEric.Saxe@Sun.COM 	}
3648906SEric.Saxe@Sun.COM 
3658906SEric.Saxe@Sun.COM 	/*
3668906SEric.Saxe@Sun.COM 	 * We assume (and therefore assert) that the PG being promoted is an
3678906SEric.Saxe@Sun.COM 	 * only child of it's parent. Update the parent's children set
3688906SEric.Saxe@Sun.COM 	 * replacing PG's entry with the parent (since the parent is becoming
3698906SEric.Saxe@Sun.COM 	 * the child). Then have PG and the parent swap children sets.
3708906SEric.Saxe@Sun.COM 	 */
3718906SEric.Saxe@Sun.COM 	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
3728906SEric.Saxe@Sun.COM 	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
3738906SEric.Saxe@Sun.COM 		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
3748906SEric.Saxe@Sun.COM 		ASSERT(r != -1);
3758906SEric.Saxe@Sun.COM 	}
3768906SEric.Saxe@Sun.COM 
3778906SEric.Saxe@Sun.COM 	children = pg->cmt_children;
3788906SEric.Saxe@Sun.COM 	pg->cmt_children = parent->cmt_children;
3798906SEric.Saxe@Sun.COM 	parent->cmt_children = children;
3808906SEric.Saxe@Sun.COM 
3818906SEric.Saxe@Sun.COM 	/*
3828906SEric.Saxe@Sun.COM 	 * Update the sibling references for PG and it's parent
3838906SEric.Saxe@Sun.COM 	 */
3848906SEric.Saxe@Sun.COM 	pg->cmt_siblings = parent->cmt_siblings;
3858906SEric.Saxe@Sun.COM 	parent->cmt_siblings = pg->cmt_children;
3868906SEric.Saxe@Sun.COM 
3878906SEric.Saxe@Sun.COM 	/*
3888906SEric.Saxe@Sun.COM 	 * Update any cached lineages in the per CPU pg data.
3898906SEric.Saxe@Sun.COM 	 */
3908906SEric.Saxe@Sun.COM 	PG_CPU_ITR_INIT(pg, cpu_iter);
3918906SEric.Saxe@Sun.COM 	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
3928906SEric.Saxe@Sun.COM 		int		idx;
3938906SEric.Saxe@Sun.COM 		group_t		*pgs;
3948906SEric.Saxe@Sun.COM 		pg_cmt_t	*cpu_pg;
3958906SEric.Saxe@Sun.COM 
3968906SEric.Saxe@Sun.COM 		/*
3978906SEric.Saxe@Sun.COM 		 * Iterate over the CPU's PGs updating the children
3988906SEric.Saxe@Sun.COM 		 * of the PG being promoted, since they have a new parent.
3998906SEric.Saxe@Sun.COM 		 */
4008906SEric.Saxe@Sun.COM 		pgs = &cpu->cpu_pg->pgs;
4018906SEric.Saxe@Sun.COM 		group_iter_init(&iter);
4028906SEric.Saxe@Sun.COM 		while ((cpu_pg = group_iterate(pgs, &iter)) != NULL) {
4038906SEric.Saxe@Sun.COM 			if (cpu_pg->cmt_parent == pg) {
4048906SEric.Saxe@Sun.COM 				cpu_pg->cmt_parent = parent;
4058906SEric.Saxe@Sun.COM 			}
4068906SEric.Saxe@Sun.COM 		}
4078906SEric.Saxe@Sun.COM 
4088906SEric.Saxe@Sun.COM 		/*
4098906SEric.Saxe@Sun.COM 		 * Update the CMT load balancing lineage
4108906SEric.Saxe@Sun.COM 		 */
4118906SEric.Saxe@Sun.COM 		pgs = &cpu->cpu_pg->cmt_pgs;
4128906SEric.Saxe@Sun.COM 		if ((idx = group_find(pgs, (void *)pg)) == -1) {
4138906SEric.Saxe@Sun.COM 			/*
4148906SEric.Saxe@Sun.COM 			 * Unless this is the CPU who's lineage is being
4158906SEric.Saxe@Sun.COM 			 * constructed, the PG being promoted should be
4168906SEric.Saxe@Sun.COM 			 * in the lineage.
4178906SEric.Saxe@Sun.COM 			 */
4188906SEric.Saxe@Sun.COM 			ASSERT(GROUP_SIZE(pgs) == 0);
4198906SEric.Saxe@Sun.COM 			continue;
4208906SEric.Saxe@Sun.COM 		}
4218906SEric.Saxe@Sun.COM 
4228906SEric.Saxe@Sun.COM 		ASSERT(GROUP_ACCESS(pgs, idx - 1) == parent);
4238906SEric.Saxe@Sun.COM 		ASSERT(idx > 0);
4248906SEric.Saxe@Sun.COM 
4258906SEric.Saxe@Sun.COM 		/*
4268906SEric.Saxe@Sun.COM 		 * Have the child and the parent swap places in the CPU's
4278906SEric.Saxe@Sun.COM 		 * lineage
4288906SEric.Saxe@Sun.COM 		 */
4298906SEric.Saxe@Sun.COM 		group_remove_at(pgs, idx);
4308906SEric.Saxe@Sun.COM 		group_remove_at(pgs, idx - 1);
4318906SEric.Saxe@Sun.COM 		err = group_add_at(pgs, parent, idx);
4328906SEric.Saxe@Sun.COM 		ASSERT(err == 0);
4338906SEric.Saxe@Sun.COM 		err = group_add_at(pgs, pg, idx - 1);
4348906SEric.Saxe@Sun.COM 		ASSERT(err == 0);
4358906SEric.Saxe@Sun.COM 	}
4368906SEric.Saxe@Sun.COM 
4378906SEric.Saxe@Sun.COM 	/*
4388906SEric.Saxe@Sun.COM 	 * Update the parent references for PG and it's parent
4398906SEric.Saxe@Sun.COM 	 */
4408906SEric.Saxe@Sun.COM 	pg->cmt_parent = parent->cmt_parent;
4418906SEric.Saxe@Sun.COM 	parent->cmt_parent = pg;
4428906SEric.Saxe@Sun.COM 
4438906SEric.Saxe@Sun.COM 	start_cpus();
4443434Sesaxe }
4453434Sesaxe 
4463434Sesaxe /*
4473434Sesaxe  * CMT class callback for a new CPU entering the system
4483434Sesaxe  */
4493434Sesaxe static void
4503434Sesaxe pg_cmt_cpu_init(cpu_t *cp)
4513434Sesaxe {
4523434Sesaxe 	pg_cmt_t	*pg;
4533434Sesaxe 	group_t		*cmt_pgs;
4548906SEric.Saxe@Sun.COM 	int		levels, level;
4553434Sesaxe 	pghw_type_t	hw;
4563434Sesaxe 	pg_t		*pg_cache = NULL;
4573434Sesaxe 	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
4583434Sesaxe 	lgrp_handle_t	lgrp_handle;
4593434Sesaxe 	cmt_lgrp_t	*lgrp;
4603434Sesaxe 
4613434Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
4623434Sesaxe 
4638906SEric.Saxe@Sun.COM 	if (cmt_sched_disabled)
4648906SEric.Saxe@Sun.COM 		return;
4658906SEric.Saxe@Sun.COM 
4663434Sesaxe 	/*
4673434Sesaxe 	 * A new CPU is coming into the system.
4683434Sesaxe 	 * Interrogate the platform to see if the CPU
4698906SEric.Saxe@Sun.COM 	 * has any performance or efficiency relevant
4708906SEric.Saxe@Sun.COM 	 * sharing relationships
4713434Sesaxe 	 */
4723434Sesaxe 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
4733434Sesaxe 	cp->cpu_pg->cmt_lineage = NULL;
4743434Sesaxe 
4753434Sesaxe 	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
4768906SEric.Saxe@Sun.COM 	levels = 0;
4773434Sesaxe 	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {
4783434Sesaxe 
4798906SEric.Saxe@Sun.COM 		pg_cmt_policy_t	policy;
4808906SEric.Saxe@Sun.COM 
4813434Sesaxe 		/*
4828906SEric.Saxe@Sun.COM 		 * We're only interested in the hw sharing relationships
4838906SEric.Saxe@Sun.COM 		 * for which we know how to optimize.
4843434Sesaxe 		 */
4858906SEric.Saxe@Sun.COM 		policy = pg_cmt_policy(hw);
4868906SEric.Saxe@Sun.COM 		if (policy == CMT_NO_POLICY ||
4878906SEric.Saxe@Sun.COM 		    pg_plat_hw_shared(cp, hw) == 0)
4883434Sesaxe 			continue;
4893434Sesaxe 
4903434Sesaxe 		/*
4918906SEric.Saxe@Sun.COM 		 * Continue if the hardware sharing relationship has been
4928906SEric.Saxe@Sun.COM 		 * blacklisted.
4938906SEric.Saxe@Sun.COM 		 */
4948906SEric.Saxe@Sun.COM 		if (cmt_hw_blacklisted[hw]) {
4958906SEric.Saxe@Sun.COM 			continue;
4968906SEric.Saxe@Sun.COM 		}
4978906SEric.Saxe@Sun.COM 
4988906SEric.Saxe@Sun.COM 		/*
4993434Sesaxe 		 * Find (or create) the PG associated with
5003434Sesaxe 		 * the hw sharing relationship in which cp
5013434Sesaxe 		 * belongs.
5023434Sesaxe 		 *
5033434Sesaxe 		 * Determine if a suitable PG already
5043434Sesaxe 		 * exists, or if one needs to be created.
5053434Sesaxe 		 */
5063434Sesaxe 		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
5073434Sesaxe 		if (pg == NULL) {
5083434Sesaxe 			/*
5093434Sesaxe 			 * Create a new one.
5103434Sesaxe 			 * Initialize the common...
5113434Sesaxe 			 */
5123434Sesaxe 			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);
5133434Sesaxe 
5143434Sesaxe 			/* ... physical ... */
5153434Sesaxe 			pghw_init((pghw_t *)pg, cp, hw);
5163434Sesaxe 
5173434Sesaxe 			/*
5183434Sesaxe 			 * ... and CMT specific portions of the
5193434Sesaxe 			 * structure.
5203434Sesaxe 			 */
5218906SEric.Saxe@Sun.COM 			pg->cmt_policy = policy;
5228906SEric.Saxe@Sun.COM 
5238906SEric.Saxe@Sun.COM 			/* CMT event callbacks */
5248906SEric.Saxe@Sun.COM 			cmt_callback_init((pg_t *)pg);
5258906SEric.Saxe@Sun.COM 
5263434Sesaxe 			bitset_init(&pg->cmt_cpus_actv_set);
5273434Sesaxe 			group_create(&pg->cmt_cpus_actv);
5283434Sesaxe 		} else {
5293434Sesaxe 			ASSERT(IS_CMT_PG(pg));
5303434Sesaxe 		}
5313434Sesaxe 
5323434Sesaxe 		/* Add the CPU to the PG */
5333434Sesaxe 		pg_cpu_add((pg_t *)pg, cp);
5343434Sesaxe 
5353434Sesaxe 		/*
5368408SEric.Saxe@Sun.COM 		 * Ensure capacity of the active CPU group/bitset
5373434Sesaxe 		 */
5383434Sesaxe 		group_expand(&pg->cmt_cpus_actv,
5393434Sesaxe 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
5403434Sesaxe 
5413434Sesaxe 		if (cp->cpu_seqid >=
5423434Sesaxe 		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
5433434Sesaxe 			bitset_resize(&pg->cmt_cpus_actv_set,
5443434Sesaxe 			    cp->cpu_seqid + 1);
5453434Sesaxe 		}
5463434Sesaxe 
5473434Sesaxe 		/*
5488906SEric.Saxe@Sun.COM 		 * Build a lineage of CMT PGs for load balancing / coalescence
5493434Sesaxe 		 */
5508906SEric.Saxe@Sun.COM 		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
5518906SEric.Saxe@Sun.COM 			cpu_cmt_hier[levels++] = pg;
5523434Sesaxe 		}
5533434Sesaxe 
5543434Sesaxe 		/* Cache this for later */
5553434Sesaxe 		if (hw == PGHW_CACHE)
5563434Sesaxe 			pg_cache = (pg_t *)pg;
5573434Sesaxe 	}
5583434Sesaxe 
5598906SEric.Saxe@Sun.COM 	group_expand(cmt_pgs, levels);
5608408SEric.Saxe@Sun.COM 
5618408SEric.Saxe@Sun.COM 	if (cmt_root == NULL)
5628408SEric.Saxe@Sun.COM 		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());
5633434Sesaxe 
5643434Sesaxe 	/*
5658906SEric.Saxe@Sun.COM 	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
5668408SEric.Saxe@Sun.COM 	 */
5678408SEric.Saxe@Sun.COM 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
5688408SEric.Saxe@Sun.COM 	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
5698408SEric.Saxe@Sun.COM 		lgrp = pg_cmt_lgrp_create(lgrp_handle);
5708408SEric.Saxe@Sun.COM 
5718408SEric.Saxe@Sun.COM 	/*
5728906SEric.Saxe@Sun.COM 	 * Ascendingly sort the PGs in the lineage by number of CPUs
5738906SEric.Saxe@Sun.COM 	 */
5748906SEric.Saxe@Sun.COM 	pg_cmt_hier_sort(cpu_cmt_hier, levels);
5758906SEric.Saxe@Sun.COM 
5768906SEric.Saxe@Sun.COM 	/*
5778906SEric.Saxe@Sun.COM 	 * Examine the lineage and validate it.
5788906SEric.Saxe@Sun.COM 	 * This routine will also try to fix the lineage along with the
5798906SEric.Saxe@Sun.COM 	 * rest of the PG hierarchy should it detect an issue.
5808906SEric.Saxe@Sun.COM 	 *
5818906SEric.Saxe@Sun.COM 	 * If it returns -1, an unrecoverable error has happened and we
5828906SEric.Saxe@Sun.COM 	 * need to return.
5838906SEric.Saxe@Sun.COM 	 */
5848906SEric.Saxe@Sun.COM 	if (pg_cmt_lineage_validate(cpu_cmt_hier, &levels) < 0)
5858906SEric.Saxe@Sun.COM 		return;
5868906SEric.Saxe@Sun.COM 
5878906SEric.Saxe@Sun.COM 	/*
5888906SEric.Saxe@Sun.COM 	 * For existing PGs in the lineage, verify that the parent is
5898906SEric.Saxe@Sun.COM 	 * correct, as the generation in the lineage may have changed
5908906SEric.Saxe@Sun.COM 	 * as a result of the sorting. Start the traversal at the top
5918906SEric.Saxe@Sun.COM 	 * of the lineage, moving down.
5928906SEric.Saxe@Sun.COM 	 */
5938906SEric.Saxe@Sun.COM 	for (level = levels - 1; level >= 0; ) {
5948906SEric.Saxe@Sun.COM 		int reorg;
5958906SEric.Saxe@Sun.COM 
5968906SEric.Saxe@Sun.COM 		reorg = 0;
5978906SEric.Saxe@Sun.COM 		pg = cpu_cmt_hier[level];
5988906SEric.Saxe@Sun.COM 
5998906SEric.Saxe@Sun.COM 		/*
6008906SEric.Saxe@Sun.COM 		 * Promote PGs at an incorrect generation into place.
6018906SEric.Saxe@Sun.COM 		 */
6028906SEric.Saxe@Sun.COM 		while (pg->cmt_parent &&
6038906SEric.Saxe@Sun.COM 		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
6048906SEric.Saxe@Sun.COM 			cmt_hier_promote(pg);
6058906SEric.Saxe@Sun.COM 			reorg++;
6068906SEric.Saxe@Sun.COM 		}
6078906SEric.Saxe@Sun.COM 		if (reorg > 0)
6088906SEric.Saxe@Sun.COM 			level = levels - 1;
6098906SEric.Saxe@Sun.COM 		else
6108906SEric.Saxe@Sun.COM 			level--;
6118906SEric.Saxe@Sun.COM 	}
6128906SEric.Saxe@Sun.COM 
6138906SEric.Saxe@Sun.COM 	/*
6148408SEric.Saxe@Sun.COM 	 * For each of the PGs in the CPU's lineage:
6158906SEric.Saxe@Sun.COM 	 *	- Add an entry in the CPU sorted CMT PG group
6168906SEric.Saxe@Sun.COM 	 *	  which is used for top down CMT load balancing
6173434Sesaxe 	 *	- Tie the PG into the CMT hierarchy by connecting
6183434Sesaxe 	 *	  it to it's parent and siblings.
6193434Sesaxe 	 */
6208906SEric.Saxe@Sun.COM 	for (level = 0; level < levels; level++) {
6213434Sesaxe 		uint_t		children;
6223434Sesaxe 		int		err;
6233434Sesaxe 
6243434Sesaxe 		pg = cpu_cmt_hier[level];
6258906SEric.Saxe@Sun.COM 		err = group_add_at(cmt_pgs, pg, levels - level - 1);
6263434Sesaxe 		ASSERT(err == 0);
6273434Sesaxe 
6283434Sesaxe 		if (level == 0)
6293434Sesaxe 			cp->cpu_pg->cmt_lineage = (pg_t *)pg;
6303434Sesaxe 
6313434Sesaxe 		if (pg->cmt_siblings != NULL) {
6323434Sesaxe 			/* Already initialized */
6333434Sesaxe 			ASSERT(pg->cmt_parent == NULL ||
6343434Sesaxe 			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
6353434Sesaxe 			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
6365933Sjb145095 			    ((pg->cmt_parent != NULL) &&
6375933Sjb145095 			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
6383434Sesaxe 			continue;
6393434Sesaxe 		}
6403434Sesaxe 
6418906SEric.Saxe@Sun.COM 		if ((level + 1) == levels) {
6423434Sesaxe 			pg->cmt_parent = NULL;
6438408SEric.Saxe@Sun.COM 
6443434Sesaxe 			pg->cmt_siblings = &lgrp->cl_pgs;
6453434Sesaxe 			children = ++lgrp->cl_npgs;
6468906SEric.Saxe@Sun.COM 			if (cmt_root != lgrp)
6478906SEric.Saxe@Sun.COM 				cmt_root->cl_npgs++;
6483434Sesaxe 		} else {
6493434Sesaxe 			pg->cmt_parent = cpu_cmt_hier[level + 1];
6503434Sesaxe 
6513434Sesaxe 			/*
6523434Sesaxe 			 * A good parent keeps track of their children.
6533434Sesaxe 			 * The parent's children group is also the PG's
6543434Sesaxe 			 * siblings.
6553434Sesaxe 			 */
6563434Sesaxe 			if (pg->cmt_parent->cmt_children == NULL) {
6573434Sesaxe 				pg->cmt_parent->cmt_children =
6583434Sesaxe 				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
6593434Sesaxe 				group_create(pg->cmt_parent->cmt_children);
6603434Sesaxe 			}
6613434Sesaxe 			pg->cmt_siblings = pg->cmt_parent->cmt_children;
6623434Sesaxe 			children = ++pg->cmt_parent->cmt_nchildren;
6633434Sesaxe 		}
6648408SEric.Saxe@Sun.COM 
6653434Sesaxe 		group_expand(pg->cmt_siblings, children);
6668408SEric.Saxe@Sun.COM 		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
6673434Sesaxe 	}
6683434Sesaxe 
6693434Sesaxe 	/*
6703434Sesaxe 	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
6713434Sesaxe 	 * for fast lookups later.
6723434Sesaxe 	 */
6733434Sesaxe 	if (cp->cpu_physid) {
6743434Sesaxe 		cp->cpu_physid->cpu_chipid =
6753434Sesaxe 		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
6763434Sesaxe 		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);
6773434Sesaxe 
6783434Sesaxe 		/*
6793434Sesaxe 		 * If this cpu has a PG representing shared cache, then set
6803434Sesaxe 		 * cpu_cacheid to that PG's logical id
6813434Sesaxe 		 */
6823434Sesaxe 		if (pg_cache)
6833434Sesaxe 			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
6843434Sesaxe 	}
6853434Sesaxe 
6863434Sesaxe 	/* CPU0 only initialization */
6873434Sesaxe 	if (is_cpu0) {
6883434Sesaxe 		pg_cmt_cpu_startup(cp);
6893434Sesaxe 		is_cpu0 = 0;
6903676Sesaxe 		cpu0_lgrp = lgrp;
6913434Sesaxe 	}
6923434Sesaxe 
6933434Sesaxe }
6943434Sesaxe 
6953434Sesaxe /*
6963434Sesaxe  * Class callback when a CPU is leaving the system (deletion)
6973434Sesaxe  */
6983434Sesaxe static void
6993434Sesaxe pg_cmt_cpu_fini(cpu_t *cp)
7003434Sesaxe {
7013434Sesaxe 	group_iter_t	i;
7023434Sesaxe 	pg_cmt_t	*pg;
7033434Sesaxe 	group_t		*pgs, *cmt_pgs;
7043434Sesaxe 	lgrp_handle_t	lgrp_handle;
7053434Sesaxe 	cmt_lgrp_t	*lgrp;
7063434Sesaxe 
7078906SEric.Saxe@Sun.COM 	if (cmt_sched_disabled)
7088906SEric.Saxe@Sun.COM 		return;
7098906SEric.Saxe@Sun.COM 
7103434Sesaxe 	pgs = &cp->cpu_pg->pgs;
7113434Sesaxe 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
7123434Sesaxe 
7133434Sesaxe 	/*
7143434Sesaxe 	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
7153434Sesaxe 	 */
7163434Sesaxe 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
7173676Sesaxe 
7183434Sesaxe 	lgrp = pg_cmt_find_lgrp(lgrp_handle);
7198689SEric.Saxe@Sun.COM 	if (ncpus == 1 && lgrp != cpu0_lgrp) {
7203676Sesaxe 		/*
7218689SEric.Saxe@Sun.COM 		 * One might wonder how we could be deconfiguring the
7228689SEric.Saxe@Sun.COM 		 * only CPU in the system.
7233676Sesaxe 		 *
7248689SEric.Saxe@Sun.COM 		 * On Starcat systems when null_proc_lpa is detected,
7258689SEric.Saxe@Sun.COM 		 * the boot CPU (which is already configured into a leaf
7268689SEric.Saxe@Sun.COM 		 * lgroup), is moved into the root lgroup. This is done by
7278689SEric.Saxe@Sun.COM 		 * deconfiguring it from both lgroups and processor
7288689SEric.Saxe@Sun.COM 		 * groups), and then later reconfiguring it back in.  This
7298689SEric.Saxe@Sun.COM 		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
7308689SEric.Saxe@Sun.COM 		 *
7318689SEric.Saxe@Sun.COM 		 * This special case is detected by noting that the platform
7328689SEric.Saxe@Sun.COM 		 * has changed the CPU's lgrp affiliation (since it now
7338689SEric.Saxe@Sun.COM 		 * belongs in the root). In this case, use the cmt_lgrp_t
7348689SEric.Saxe@Sun.COM 		 * cached for the boot CPU, since this is what needs to be
7358689SEric.Saxe@Sun.COM 		 * torn down.
7363676Sesaxe 		 */
7373676Sesaxe 		lgrp = cpu0_lgrp;
7383676Sesaxe 	}
7393434Sesaxe 
7408689SEric.Saxe@Sun.COM 	ASSERT(lgrp != NULL);
7418689SEric.Saxe@Sun.COM 
7423434Sesaxe 	/*
7433434Sesaxe 	 * First, clean up anything load balancing specific for each of
7443434Sesaxe 	 * the CPU's PGs that participated in CMT load balancing
7453434Sesaxe 	 */
7463434Sesaxe 	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
7473434Sesaxe 	while (pg != NULL) {
7483434Sesaxe 
7493434Sesaxe 		/*
7503434Sesaxe 		 * Remove the PG from the CPU's load balancing lineage
7513434Sesaxe 		 */
7523434Sesaxe 		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);
7533434Sesaxe 
7543434Sesaxe 		/*
7553434Sesaxe 		 * If it's about to become empty, destroy it's children
7563434Sesaxe 		 * group, and remove it's reference from it's siblings.
7573434Sesaxe 		 * This is done here (rather than below) to avoid removing
7583434Sesaxe 		 * our reference from a PG that we just eliminated.
7593434Sesaxe 		 */
7603434Sesaxe 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
7613434Sesaxe 			if (pg->cmt_children != NULL)
7623434Sesaxe 				group_destroy(pg->cmt_children);
7633434Sesaxe 			if (pg->cmt_siblings != NULL) {
7643434Sesaxe 				if (pg->cmt_siblings == &lgrp->cl_pgs)
7653434Sesaxe 					lgrp->cl_npgs--;
7663434Sesaxe 				else
7673434Sesaxe 					pg->cmt_parent->cmt_nchildren--;
7683434Sesaxe 			}
7693434Sesaxe 		}
7703434Sesaxe 		pg = pg->cmt_parent;
7713434Sesaxe 	}
7723434Sesaxe 	ASSERT(GROUP_SIZE(cmt_pgs) == 0);
7733434Sesaxe 
7743434Sesaxe 	/*
7753434Sesaxe 	 * Now that the load balancing lineage updates have happened,
7763434Sesaxe 	 * remove the CPU from all it's PGs (destroying any that become
7773434Sesaxe 	 * empty).
7783434Sesaxe 	 */
7793434Sesaxe 	group_iter_init(&i);
7803434Sesaxe 	while ((pg = group_iterate(pgs, &i)) != NULL) {
7813434Sesaxe 		if (IS_CMT_PG(pg) == 0)
7823434Sesaxe 			continue;
7833434Sesaxe 
7843434Sesaxe 		pg_cpu_delete((pg_t *)pg, cp);
7853434Sesaxe 		/*
7863434Sesaxe 		 * Deleting the CPU from the PG changes the CPU's
7873434Sesaxe 		 * PG group over which we are actively iterating
7883434Sesaxe 		 * Re-initialize the iteration
7893434Sesaxe 		 */
7903434Sesaxe 		group_iter_init(&i);
7913434Sesaxe 
7923434Sesaxe 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {
7933434Sesaxe 
7943434Sesaxe 			/*
7953434Sesaxe 			 * The PG has become zero sized, so destroy it.
7963434Sesaxe 			 */
7973434Sesaxe 			group_destroy(&pg->cmt_cpus_actv);
7983434Sesaxe 			bitset_fini(&pg->cmt_cpus_actv_set);
7993434Sesaxe 			pghw_fini((pghw_t *)pg);
8003434Sesaxe 
8013434Sesaxe 			pg_destroy((pg_t *)pg);
8023434Sesaxe 		}
8033434Sesaxe 	}
8043434Sesaxe }
8053434Sesaxe 
8063434Sesaxe /*
8073434Sesaxe  * Class callback when a CPU is entering a cpu partition
8083434Sesaxe  */
8093434Sesaxe static void
8103434Sesaxe pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
8113434Sesaxe {
8123434Sesaxe 	group_t		*pgs;
8133434Sesaxe 	pg_t		*pg;
8143434Sesaxe 	group_iter_t	i;
8153434Sesaxe 
8163434Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
8173434Sesaxe 
8188906SEric.Saxe@Sun.COM 	if (cmt_sched_disabled)
8198906SEric.Saxe@Sun.COM 		return;
8208906SEric.Saxe@Sun.COM 
8213434Sesaxe 	pgs = &cp->cpu_pg->pgs;
8223434Sesaxe 
8233434Sesaxe 	/*
8243434Sesaxe 	 * Ensure that the new partition's PG bitset
8253434Sesaxe 	 * is large enough for all CMT PG's to which cp
8263434Sesaxe 	 * belongs
8273434Sesaxe 	 */
8283434Sesaxe 	group_iter_init(&i);
8293434Sesaxe 	while ((pg = group_iterate(pgs, &i)) != NULL) {
8303434Sesaxe 		if (IS_CMT_PG(pg) == 0)
8313434Sesaxe 			continue;
8323434Sesaxe 
8333434Sesaxe 		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
8343434Sesaxe 			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
8353434Sesaxe 	}
8363434Sesaxe }
8373434Sesaxe 
8383434Sesaxe /*
8393434Sesaxe  * Class callback when a CPU is actually moving partitions
8403434Sesaxe  */
8413434Sesaxe static void
8423434Sesaxe pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
8433434Sesaxe {
8443434Sesaxe 	cpu_t		*cpp;
8453434Sesaxe 	group_t		*pgs;
8463434Sesaxe 	pg_t		*pg;
8473434Sesaxe 	group_iter_t	pg_iter;
8483434Sesaxe 	pg_cpu_itr_t	cpu_iter;
8493434Sesaxe 	boolean_t	found;
8503434Sesaxe 
8513434Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
8523434Sesaxe 
8538906SEric.Saxe@Sun.COM 	if (cmt_sched_disabled)
8548906SEric.Saxe@Sun.COM 		return;
8558906SEric.Saxe@Sun.COM 
8563434Sesaxe 	pgs = &cp->cpu_pg->pgs;
8573434Sesaxe 	group_iter_init(&pg_iter);
8583434Sesaxe 
8593434Sesaxe 	/*
8603434Sesaxe 	 * Iterate over the CPUs CMT PGs
8613434Sesaxe 	 */
8623434Sesaxe 	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {
8633434Sesaxe 
8643434Sesaxe 		if (IS_CMT_PG(pg) == 0)
8653434Sesaxe 			continue;
8663434Sesaxe 
8673434Sesaxe 		/*
8683434Sesaxe 		 * Add the PG to the bitset in the new partition.
8693434Sesaxe 		 */
8703434Sesaxe 		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);
8713434Sesaxe 
8723434Sesaxe 		/*
8733434Sesaxe 		 * Remove the PG from the bitset in the old partition
8743434Sesaxe 		 * if the last of the PG's CPUs have left.
8753434Sesaxe 		 */
8763434Sesaxe 		found = B_FALSE;
8773434Sesaxe 		PG_CPU_ITR_INIT(pg, cpu_iter);
8783434Sesaxe 		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
8793434Sesaxe 			if (cpp == cp)
8803434Sesaxe 				continue;
8813676Sesaxe 			if (CPU_ACTIVE(cpp) &&
8823676Sesaxe 			    cpp->cpu_part->cp_id == oldpp->cp_id) {
8833434Sesaxe 				found = B_TRUE;
8843434Sesaxe 				break;
8853434Sesaxe 			}
8863434Sesaxe 		}
8873434Sesaxe 		if (!found)
8883434Sesaxe 			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
8893434Sesaxe 	}
8903434Sesaxe }
8913434Sesaxe 
8923434Sesaxe /*
8933434Sesaxe  * Class callback when a CPU becomes active (online)
8943434Sesaxe  *
8953434Sesaxe  * This is called in a context where CPUs are paused
8963434Sesaxe  */
8973434Sesaxe static void
8983434Sesaxe pg_cmt_cpu_active(cpu_t *cp)
8993434Sesaxe {
9003434Sesaxe 	int		err;
9013434Sesaxe 	group_iter_t	i;
9023434Sesaxe 	pg_cmt_t	*pg;
9033434Sesaxe 	group_t		*pgs;
9043434Sesaxe 
9053434Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
9063434Sesaxe 
9078906SEric.Saxe@Sun.COM 	if (cmt_sched_disabled)
9088906SEric.Saxe@Sun.COM 		return;
9098906SEric.Saxe@Sun.COM 
9103434Sesaxe 	pgs = &cp->cpu_pg->pgs;
9113434Sesaxe 	group_iter_init(&i);
9123434Sesaxe 
9133434Sesaxe 	/*
9143434Sesaxe 	 * Iterate over the CPU's PGs
9153434Sesaxe 	 */
9163434Sesaxe 	while ((pg = group_iterate(pgs, &i)) != NULL) {
9173434Sesaxe 
9183434Sesaxe 		if (IS_CMT_PG(pg) == 0)
9193434Sesaxe 			continue;
9203434Sesaxe 
9213434Sesaxe 		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
9223434Sesaxe 		ASSERT(err == 0);
9233434Sesaxe 
9243434Sesaxe 		/*
9253434Sesaxe 		 * If this is the first active CPU in the PG, and it
9263434Sesaxe 		 * represents a hardware sharing relationship over which
9273434Sesaxe 		 * CMT load balancing is performed, add it as a candidate
9283434Sesaxe 		 * for balancing with it's siblings.
9293434Sesaxe 		 */
9303434Sesaxe 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
9318906SEric.Saxe@Sun.COM 		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
9323434Sesaxe 			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
9333434Sesaxe 			ASSERT(err == 0);
9348408SEric.Saxe@Sun.COM 
9358408SEric.Saxe@Sun.COM 			/*
9368408SEric.Saxe@Sun.COM 			 * If this is a top level PG, add it as a balancing
9378906SEric.Saxe@Sun.COM 			 * candidate when balancing within the root lgroup.
9388408SEric.Saxe@Sun.COM 			 */
9398906SEric.Saxe@Sun.COM 			if (pg->cmt_parent == NULL &&
9408906SEric.Saxe@Sun.COM 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
9418408SEric.Saxe@Sun.COM 				err = group_add(&cmt_root->cl_pgs, pg,
9428408SEric.Saxe@Sun.COM 				    GRP_NORESIZE);
9438408SEric.Saxe@Sun.COM 				ASSERT(err == 0);
9448408SEric.Saxe@Sun.COM 			}
9453434Sesaxe 		}
9463434Sesaxe 
9473434Sesaxe 		/*
9483434Sesaxe 		 * Notate the CPU in the PGs active CPU bitset.
9493434Sesaxe 		 * Also notate the PG as being active in it's associated
9503434Sesaxe 		 * partition
9513434Sesaxe 		 */
9523434Sesaxe 		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
9533434Sesaxe 		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
9543434Sesaxe 	}
9553434Sesaxe }
9563434Sesaxe 
9573434Sesaxe /*
9583434Sesaxe  * Class callback when a CPU goes inactive (offline)
9593434Sesaxe  *
9603434Sesaxe  * This is called in a context where CPUs are paused
9613434Sesaxe  */
9623434Sesaxe static void
9633434Sesaxe pg_cmt_cpu_inactive(cpu_t *cp)
9643434Sesaxe {
9653434Sesaxe 	int		err;
9663434Sesaxe 	group_t		*pgs;
9673434Sesaxe 	pg_cmt_t	*pg;
9683434Sesaxe 	cpu_t		*cpp;
9693434Sesaxe 	group_iter_t	i;
9703434Sesaxe 	pg_cpu_itr_t	cpu_itr;
9713434Sesaxe 	boolean_t	found;
9723434Sesaxe 
9733434Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
9743434Sesaxe 
9758906SEric.Saxe@Sun.COM 	if (cmt_sched_disabled)
9768906SEric.Saxe@Sun.COM 		return;
9778906SEric.Saxe@Sun.COM 
9783434Sesaxe 	pgs = &cp->cpu_pg->pgs;
9793434Sesaxe 	group_iter_init(&i);
9803434Sesaxe 
9813434Sesaxe 	while ((pg = group_iterate(pgs, &i)) != NULL) {
9823434Sesaxe 
9833434Sesaxe 		if (IS_CMT_PG(pg) == 0)
9843434Sesaxe 			continue;
9853434Sesaxe 
9863434Sesaxe 		/*
9873434Sesaxe 		 * Remove the CPU from the CMT PGs active CPU group
9883434Sesaxe 		 * bitmap
9893434Sesaxe 		 */
9903434Sesaxe 		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
9913434Sesaxe 		ASSERT(err == 0);
9923434Sesaxe 
9933434Sesaxe 		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
9943434Sesaxe 
9953434Sesaxe 		/*
9963434Sesaxe 		 * If there are no more active CPUs in this PG over which
9973434Sesaxe 		 * load was balanced, remove it as a balancing candidate.
9983434Sesaxe 		 */
9993434Sesaxe 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
10008906SEric.Saxe@Sun.COM 		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
10013434Sesaxe 			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
10023434Sesaxe 			ASSERT(err == 0);
10038408SEric.Saxe@Sun.COM 
10048906SEric.Saxe@Sun.COM 			if (pg->cmt_parent == NULL &&
10058906SEric.Saxe@Sun.COM 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
10068408SEric.Saxe@Sun.COM 				err = group_remove(&cmt_root->cl_pgs, pg,
10078408SEric.Saxe@Sun.COM 				    GRP_NORESIZE);
10088408SEric.Saxe@Sun.COM 				ASSERT(err == 0);
10098408SEric.Saxe@Sun.COM 			}
10103434Sesaxe 		}
10113434Sesaxe 
10123434Sesaxe 		/*
10133434Sesaxe 		 * Assert the number of active CPUs does not exceed
10143434Sesaxe 		 * the total number of CPUs in the PG
10153434Sesaxe 		 */
10163434Sesaxe 		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
10173434Sesaxe 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
10183434Sesaxe 
10193434Sesaxe 		/*
10203434Sesaxe 		 * Update the PG bitset in the CPU's old partition
10213434Sesaxe 		 */
10223434Sesaxe 		found = B_FALSE;
10233434Sesaxe 		PG_CPU_ITR_INIT(pg, cpu_itr);
10243434Sesaxe 		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
10253434Sesaxe 			if (cpp == cp)
10263434Sesaxe 				continue;
10273676Sesaxe 			if (CPU_ACTIVE(cpp) &&
10283676Sesaxe 			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
10293434Sesaxe 				found = B_TRUE;
10303434Sesaxe 				break;
10313434Sesaxe 			}
10323434Sesaxe 		}
10333434Sesaxe 		if (!found) {
10343434Sesaxe 			bitset_del(&cp->cpu_part->cp_cmt_pgs,
10353434Sesaxe 			    ((pg_t *)pg)->pg_id);
10363434Sesaxe 		}
10373434Sesaxe 	}
10383434Sesaxe }
10393434Sesaxe 
10403434Sesaxe /*
10413434Sesaxe  * Return non-zero if the CPU belongs in the given PG
10423434Sesaxe  */
10433434Sesaxe static int
10443434Sesaxe pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
10453434Sesaxe {
10463434Sesaxe 	cpu_t	*pg_cpu;
10473434Sesaxe 
10483434Sesaxe 	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
10493434Sesaxe 
10503434Sesaxe 	ASSERT(pg_cpu != NULL);
10513434Sesaxe 
10523434Sesaxe 	/*
10533434Sesaxe 	 * The CPU belongs if, given the nature of the hardware sharing
10543434Sesaxe 	 * relationship represented by the PG, the CPU has that
10553434Sesaxe 	 * relationship with some other CPU already in the PG
10563434Sesaxe 	 */
10573434Sesaxe 	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
10583434Sesaxe 		return (1);
10593434Sesaxe 
10603434Sesaxe 	return (0);
10613434Sesaxe }
10623434Sesaxe 
10633434Sesaxe /*
10648906SEric.Saxe@Sun.COM  * Sort the CPUs CMT hierarchy, where "size" is the number of levels.
10653434Sesaxe  */
10663434Sesaxe static void
10678906SEric.Saxe@Sun.COM pg_cmt_hier_sort(pg_cmt_t **hier, int size)
10683434Sesaxe {
10698906SEric.Saxe@Sun.COM 	int		i, j, inc;
10708906SEric.Saxe@Sun.COM 	pg_t		*tmp;
10718906SEric.Saxe@Sun.COM 	pg_t		**h = (pg_t **)hier;
10723434Sesaxe 
10738906SEric.Saxe@Sun.COM 	/*
10748906SEric.Saxe@Sun.COM 	 * First sort by number of CPUs
10758906SEric.Saxe@Sun.COM 	 */
10768906SEric.Saxe@Sun.COM 	inc = size / 2;
10778906SEric.Saxe@Sun.COM 	while (inc > 0) {
10788906SEric.Saxe@Sun.COM 		for (i = inc; i < size; i++) {
10798906SEric.Saxe@Sun.COM 			j = i;
10808906SEric.Saxe@Sun.COM 			tmp = h[i];
10818906SEric.Saxe@Sun.COM 			while ((j >= inc) &&
10828906SEric.Saxe@Sun.COM 			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
10838906SEric.Saxe@Sun.COM 				h[j] = h[j - inc];
10848906SEric.Saxe@Sun.COM 				j = j - inc;
10853434Sesaxe 			}
10868906SEric.Saxe@Sun.COM 			h[j] = tmp;
10873434Sesaxe 		}
10888906SEric.Saxe@Sun.COM 		if (inc == 2)
10898906SEric.Saxe@Sun.COM 			inc = 1;
10908906SEric.Saxe@Sun.COM 		else
10918906SEric.Saxe@Sun.COM 			inc = (inc * 5) / 11;
10928906SEric.Saxe@Sun.COM 	}
10938906SEric.Saxe@Sun.COM 
10948906SEric.Saxe@Sun.COM 	/*
10958906SEric.Saxe@Sun.COM 	 * Break ties by asking the platform.
10968906SEric.Saxe@Sun.COM 	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
10978906SEric.Saxe@Sun.COM 	 */
10988906SEric.Saxe@Sun.COM 	for (i = 0; i < size - 1; i++) {
10998906SEric.Saxe@Sun.COM 		if ((PG_NUM_CPUS(h[i]) == PG_NUM_CPUS(h[i + 1])) &&
11008906SEric.Saxe@Sun.COM 		    pg_cmt_hier_rank(hier[i], hier[i + 1]) == hier[i]) {
11018906SEric.Saxe@Sun.COM 			tmp = h[i];
11028906SEric.Saxe@Sun.COM 			h[i] = h[i + 1];
11038906SEric.Saxe@Sun.COM 			h[i + 1] = tmp;
11048906SEric.Saxe@Sun.COM 		}
11053434Sesaxe 	}
11063434Sesaxe }
11073434Sesaxe 
11083434Sesaxe /*
11093434Sesaxe  * Return a cmt_lgrp_t * given an lgroup handle.
11103434Sesaxe  */
11113434Sesaxe static cmt_lgrp_t *
11123434Sesaxe pg_cmt_find_lgrp(lgrp_handle_t hand)
11133434Sesaxe {
11143434Sesaxe 	cmt_lgrp_t	*lgrp;
11153434Sesaxe 
11163434Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
11173434Sesaxe 
11183434Sesaxe 	lgrp = cmt_lgrps;
11193434Sesaxe 	while (lgrp != NULL) {
11203434Sesaxe 		if (lgrp->cl_hand == hand)
11213676Sesaxe 			break;
11223434Sesaxe 		lgrp = lgrp->cl_next;
11233434Sesaxe 	}
11243676Sesaxe 	return (lgrp);
11253676Sesaxe }
11263434Sesaxe 
11273676Sesaxe /*
11283676Sesaxe  * Create a cmt_lgrp_t with the specified handle.
11293676Sesaxe  */
11303676Sesaxe static cmt_lgrp_t *
11313676Sesaxe pg_cmt_lgrp_create(lgrp_handle_t hand)
11323676Sesaxe {
11333676Sesaxe 	cmt_lgrp_t	*lgrp;
11343676Sesaxe 
11353676Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
11363676Sesaxe 
11373434Sesaxe 	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
11383434Sesaxe 
11393434Sesaxe 	lgrp->cl_hand = hand;
11403434Sesaxe 	lgrp->cl_npgs = 0;
11413434Sesaxe 	lgrp->cl_next = cmt_lgrps;
11423434Sesaxe 	cmt_lgrps = lgrp;
11433434Sesaxe 	group_create(&lgrp->cl_pgs);
11443434Sesaxe 
11453434Sesaxe 	return (lgrp);
11463434Sesaxe }
11478408SEric.Saxe@Sun.COM 
11488408SEric.Saxe@Sun.COM /*
11498906SEric.Saxe@Sun.COM  * Interfaces to enable and disable power aware dispatching
11508906SEric.Saxe@Sun.COM  * The caller must be holding cpu_lock.
11518408SEric.Saxe@Sun.COM  *
11528906SEric.Saxe@Sun.COM  * Return 0 on success and -1 on failure.
11538408SEric.Saxe@Sun.COM  */
11548906SEric.Saxe@Sun.COM int
11558906SEric.Saxe@Sun.COM cmt_pad_enable(pghw_type_t type)
11568408SEric.Saxe@Sun.COM {
11578906SEric.Saxe@Sun.COM 	group_t		*hwset;
11588906SEric.Saxe@Sun.COM 	group_iter_t	iter;
11598906SEric.Saxe@Sun.COM 	pg_cmt_t	*pg;
11608906SEric.Saxe@Sun.COM 
11618906SEric.Saxe@Sun.COM 	ASSERT(PGHW_IS_PM_DOMAIN(type));
11628906SEric.Saxe@Sun.COM 	ASSERT(MUTEX_HELD(&cpu_lock));
11638408SEric.Saxe@Sun.COM 
11648906SEric.Saxe@Sun.COM 	if ((hwset = pghw_set_lookup(type)) == NULL ||
11658906SEric.Saxe@Sun.COM 	    cmt_hw_blacklisted[type]) {
11668906SEric.Saxe@Sun.COM 		/*
11678906SEric.Saxe@Sun.COM 		 * Unable to find any instances of the specified type
11688906SEric.Saxe@Sun.COM 		 * of power domain, or the power domains have been blacklisted.
11698906SEric.Saxe@Sun.COM 		 */
11708906SEric.Saxe@Sun.COM 		return (-1);
11718906SEric.Saxe@Sun.COM 	}
11728408SEric.Saxe@Sun.COM 
11738408SEric.Saxe@Sun.COM 	/*
11748906SEric.Saxe@Sun.COM 	 * Iterate over the power domains, setting the default dispatcher
11758906SEric.Saxe@Sun.COM 	 * policy for power/performance optimization.
11768906SEric.Saxe@Sun.COM 	 *
11778906SEric.Saxe@Sun.COM 	 * Simply setting the policy isn't enough in the case where the power
11788906SEric.Saxe@Sun.COM 	 * domain is an only child of another PG. Because the dispatcher walks
11798906SEric.Saxe@Sun.COM 	 * the PG hierarchy in a top down fashion, the higher up PG's policy
11808906SEric.Saxe@Sun.COM 	 * will dominate. So promote the power domain above it's parent if both
11818906SEric.Saxe@Sun.COM 	 * PG and it's parent have the same CPUs to ensure it's policy
11828906SEric.Saxe@Sun.COM 	 * dominates.
11838408SEric.Saxe@Sun.COM 	 */
11848906SEric.Saxe@Sun.COM 	group_iter_init(&iter);
11858906SEric.Saxe@Sun.COM 	while ((pg = group_iterate(hwset, &iter)) != NULL) {
11868906SEric.Saxe@Sun.COM 		/*
11878906SEric.Saxe@Sun.COM 		 * If the power domain is an only child to a parent
11888906SEric.Saxe@Sun.COM 		 * not implementing the same policy, promote the child
11898906SEric.Saxe@Sun.COM 		 * above the parent to activate the policy.
11908906SEric.Saxe@Sun.COM 		 */
11918906SEric.Saxe@Sun.COM 		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
11928906SEric.Saxe@Sun.COM 		while ((pg->cmt_parent != NULL) &&
11938906SEric.Saxe@Sun.COM 		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
11948906SEric.Saxe@Sun.COM 		    (PG_NUM_CPUS((pg_t *)pg) ==
11958906SEric.Saxe@Sun.COM 		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
11968906SEric.Saxe@Sun.COM 			cmt_hier_promote(pg);
11978906SEric.Saxe@Sun.COM 		}
11988906SEric.Saxe@Sun.COM 	}
11998906SEric.Saxe@Sun.COM 
12008906SEric.Saxe@Sun.COM 	return (0);
12018906SEric.Saxe@Sun.COM }
12028408SEric.Saxe@Sun.COM 
12038906SEric.Saxe@Sun.COM int
12048906SEric.Saxe@Sun.COM cmt_pad_disable(pghw_type_t type)
12058906SEric.Saxe@Sun.COM {
12068906SEric.Saxe@Sun.COM 	group_t		*hwset;
12078906SEric.Saxe@Sun.COM 	group_iter_t	iter;
12088906SEric.Saxe@Sun.COM 	pg_cmt_t	*pg;
12098906SEric.Saxe@Sun.COM 	pg_cmt_t	*child;
12108906SEric.Saxe@Sun.COM 
12118906SEric.Saxe@Sun.COM 	ASSERT(PGHW_IS_PM_DOMAIN(type));
12128906SEric.Saxe@Sun.COM 	ASSERT(MUTEX_HELD(&cpu_lock));
12138906SEric.Saxe@Sun.COM 
12148906SEric.Saxe@Sun.COM 	if ((hwset = pghw_set_lookup(type)) == NULL) {
12158906SEric.Saxe@Sun.COM 		/*
12168906SEric.Saxe@Sun.COM 		 * Unable to find any instances of the specified type of
12178906SEric.Saxe@Sun.COM 		 * power domain.
12188906SEric.Saxe@Sun.COM 		 */
12198906SEric.Saxe@Sun.COM 		return (-1);
12208906SEric.Saxe@Sun.COM 	}
12218408SEric.Saxe@Sun.COM 	/*
12228906SEric.Saxe@Sun.COM 	 * Iterate over the power domains, setting the default dispatcher
12238906SEric.Saxe@Sun.COM 	 * policy for performance optimization (load balancing).
12248408SEric.Saxe@Sun.COM 	 */
12258906SEric.Saxe@Sun.COM 	group_iter_init(&iter);
12268906SEric.Saxe@Sun.COM 	while ((pg = group_iterate(hwset, &iter)) != NULL) {
12278408SEric.Saxe@Sun.COM 
12288408SEric.Saxe@Sun.COM 		/*
12298906SEric.Saxe@Sun.COM 		 * If the power domain has an only child that implements
12308906SEric.Saxe@Sun.COM 		 * policy other than load balancing, promote the child
12318906SEric.Saxe@Sun.COM 		 * above the power domain to ensure it's policy dominates.
12328408SEric.Saxe@Sun.COM 		 */
1233*8969SEric.Saxe@Sun.COM 		if (pg->cmt_children != NULL &&
1234*8969SEric.Saxe@Sun.COM 		    GROUP_SIZE(pg->cmt_children) == 1) {
12358906SEric.Saxe@Sun.COM 			child = GROUP_ACCESS(pg->cmt_children, 0);
12368906SEric.Saxe@Sun.COM 			if ((child->cmt_policy & CMT_BALANCE) == 0) {
12378906SEric.Saxe@Sun.COM 				cmt_hier_promote(child);
12388906SEric.Saxe@Sun.COM 			}
12398906SEric.Saxe@Sun.COM 		}
12408906SEric.Saxe@Sun.COM 		pg->cmt_policy = CMT_BALANCE;
12418906SEric.Saxe@Sun.COM 	}
12428906SEric.Saxe@Sun.COM 	return (0);
12438906SEric.Saxe@Sun.COM }
12448906SEric.Saxe@Sun.COM 
12458906SEric.Saxe@Sun.COM /* ARGSUSED */
12468906SEric.Saxe@Sun.COM static void
12478906SEric.Saxe@Sun.COM cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
12488906SEric.Saxe@Sun.COM 		    kthread_t *new)
12498906SEric.Saxe@Sun.COM {
12508906SEric.Saxe@Sun.COM 	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;
12518906SEric.Saxe@Sun.COM 
12528906SEric.Saxe@Sun.COM 	if (old == cp->cpu_idle_thread) {
12538906SEric.Saxe@Sun.COM 		atomic_add_32(&cmt_pg->cmt_utilization, 1);
12548906SEric.Saxe@Sun.COM 	} else if (new == cp->cpu_idle_thread) {
12558906SEric.Saxe@Sun.COM 		atomic_add_32(&cmt_pg->cmt_utilization, -1);
12568906SEric.Saxe@Sun.COM 	}
12578906SEric.Saxe@Sun.COM }
12588906SEric.Saxe@Sun.COM 
/*
 * Evaluates to non-zero when thread "t" is in the TS_RUN state, its
 * dispatch queue is associated with a CPU, and that CPU is a member of
 * the active CPU set of "pg". Note that "t" is evaluated more than once.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	(((t)->t_state == TS_RUN) &&					\
	    ((t)->t_disp_queue->disp_cpu != NULL) &&			\
	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
	    (t)->t_disp_queue->disp_cpu->cpu_seqid))
12678906SEric.Saxe@Sun.COM 
12688906SEric.Saxe@Sun.COM static void
12698906SEric.Saxe@Sun.COM cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
12708906SEric.Saxe@Sun.COM     kthread_t *new)
12718906SEric.Saxe@Sun.COM {
12728906SEric.Saxe@Sun.COM 	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
12738906SEric.Saxe@Sun.COM 	cpupm_domain_t	*dom;
12748906SEric.Saxe@Sun.COM 	uint32_t	u;
12758906SEric.Saxe@Sun.COM 
12768906SEric.Saxe@Sun.COM 	if (old == cp->cpu_idle_thread) {
12778906SEric.Saxe@Sun.COM 		ASSERT(new != cp->cpu_idle_thread);
12788906SEric.Saxe@Sun.COM 		u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
12798906SEric.Saxe@Sun.COM 		if (u == 1) {
12808906SEric.Saxe@Sun.COM 			/*
12818906SEric.Saxe@Sun.COM 			 * Notify the CPU power manager that the domain
12828906SEric.Saxe@Sun.COM 			 * is non-idle.
12838906SEric.Saxe@Sun.COM 			 */
12848906SEric.Saxe@Sun.COM 			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
12858906SEric.Saxe@Sun.COM 			cpupm_utilization_event(cp, now, dom,
12868906SEric.Saxe@Sun.COM 			    CPUPM_DOM_BUSY_FROM_IDLE);
12878906SEric.Saxe@Sun.COM 		}
12888906SEric.Saxe@Sun.COM 	} else if (new == cp->cpu_idle_thread) {
12898906SEric.Saxe@Sun.COM 		ASSERT(old != cp->cpu_idle_thread);
12908906SEric.Saxe@Sun.COM 		u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
12918906SEric.Saxe@Sun.COM 		if (u == 0) {
12928906SEric.Saxe@Sun.COM 			/*
12938906SEric.Saxe@Sun.COM 			 * The domain is idle, notify the CPU power
12948906SEric.Saxe@Sun.COM 			 * manager.
12958906SEric.Saxe@Sun.COM 			 *
12968906SEric.Saxe@Sun.COM 			 * Avoid notifying if the thread is simply migrating
12978906SEric.Saxe@Sun.COM 			 * between CPUs in the domain.
12988906SEric.Saxe@Sun.COM 			 */
12998906SEric.Saxe@Sun.COM 			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
13008906SEric.Saxe@Sun.COM 				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
13018906SEric.Saxe@Sun.COM 				cpupm_utilization_event(cp, now, dom,
13028906SEric.Saxe@Sun.COM 				    CPUPM_DOM_IDLE_FROM_BUSY);
13038906SEric.Saxe@Sun.COM 			}
13048906SEric.Saxe@Sun.COM 		}
13058906SEric.Saxe@Sun.COM 	}
13068906SEric.Saxe@Sun.COM }
13078906SEric.Saxe@Sun.COM 
13088906SEric.Saxe@Sun.COM /* ARGSUSED */
13098906SEric.Saxe@Sun.COM static void
13108906SEric.Saxe@Sun.COM cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
13118906SEric.Saxe@Sun.COM {
13128906SEric.Saxe@Sun.COM 	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
13138906SEric.Saxe@Sun.COM 	cpupm_domain_t	*dom;
13148906SEric.Saxe@Sun.COM 
13158906SEric.Saxe@Sun.COM 	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
13168906SEric.Saxe@Sun.COM 	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
13178906SEric.Saxe@Sun.COM }
13188906SEric.Saxe@Sun.COM 
13198906SEric.Saxe@Sun.COM /*
13208906SEric.Saxe@Sun.COM  * Return the name of the CMT scheduling policy
13218906SEric.Saxe@Sun.COM  * being implemented across this PG
13228906SEric.Saxe@Sun.COM  */
13238906SEric.Saxe@Sun.COM static char *
13248906SEric.Saxe@Sun.COM pg_cmt_policy_name(pg_t *pg)
13258906SEric.Saxe@Sun.COM {
13268906SEric.Saxe@Sun.COM 	pg_cmt_policy_t policy;
13278906SEric.Saxe@Sun.COM 
13288906SEric.Saxe@Sun.COM 	policy = ((pg_cmt_t *)pg)->cmt_policy;
13298906SEric.Saxe@Sun.COM 
13308906SEric.Saxe@Sun.COM 	if (policy & CMT_AFFINITY) {
13318906SEric.Saxe@Sun.COM 		if (policy & CMT_BALANCE)
13328906SEric.Saxe@Sun.COM 			return ("Load Balancing & Affinity");
13338906SEric.Saxe@Sun.COM 		else if (policy & CMT_COALESCE)
13348906SEric.Saxe@Sun.COM 			return ("Load Coalescence & Affinity");
13358906SEric.Saxe@Sun.COM 		else
13368906SEric.Saxe@Sun.COM 			return ("Affinity");
13378906SEric.Saxe@Sun.COM 	} else {
13388906SEric.Saxe@Sun.COM 		if (policy & CMT_BALANCE)
13398906SEric.Saxe@Sun.COM 			return ("Load Balancing");
13408906SEric.Saxe@Sun.COM 		else if (policy & CMT_COALESCE)
13418906SEric.Saxe@Sun.COM 			return ("Load Coalescence");
13428906SEric.Saxe@Sun.COM 		else
13438906SEric.Saxe@Sun.COM 			return ("None");
13448906SEric.Saxe@Sun.COM 	}
13458906SEric.Saxe@Sun.COM }
13468906SEric.Saxe@Sun.COM 
13478906SEric.Saxe@Sun.COM /*
13488906SEric.Saxe@Sun.COM  * Prune PG, and all other instances of PG's hardware sharing relationship
13498906SEric.Saxe@Sun.COM  * from the PG hierarchy.
13508906SEric.Saxe@Sun.COM  */
13518906SEric.Saxe@Sun.COM static int
13528906SEric.Saxe@Sun.COM pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz)
13538906SEric.Saxe@Sun.COM {
13548906SEric.Saxe@Sun.COM 	group_t		*hwset, *children;
13558906SEric.Saxe@Sun.COM 	int		i, j, r, size = *sz;
13568906SEric.Saxe@Sun.COM 	group_iter_t	hw_iter, child_iter;
13578906SEric.Saxe@Sun.COM 	pg_cpu_itr_t	cpu_iter;
13588906SEric.Saxe@Sun.COM 	pg_cmt_t	*pg, *child;
13598906SEric.Saxe@Sun.COM 	cpu_t		*cpu;
13608906SEric.Saxe@Sun.COM 	int		cap_needed;
13618906SEric.Saxe@Sun.COM 	pghw_type_t	hw;
13628906SEric.Saxe@Sun.COM 
13638906SEric.Saxe@Sun.COM 	ASSERT(MUTEX_HELD(&cpu_lock));
13648906SEric.Saxe@Sun.COM 
13658906SEric.Saxe@Sun.COM 	hw = ((pghw_t *)pg_bad)->pghw_hw;
13668906SEric.Saxe@Sun.COM 
13678906SEric.Saxe@Sun.COM 	if (hw == PGHW_POW_ACTIVE) {
13688906SEric.Saxe@Sun.COM 		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
13698906SEric.Saxe@Sun.COM 		    "Event Based CPUPM Unavailable");
13708906SEric.Saxe@Sun.COM 	} else if (hw == PGHW_POW_IDLE) {
13718906SEric.Saxe@Sun.COM 		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
13728906SEric.Saxe@Sun.COM 		    "Dispatcher assisted CPUPM disabled.");
13738906SEric.Saxe@Sun.COM 	}
13748906SEric.Saxe@Sun.COM 
13758906SEric.Saxe@Sun.COM 	/*
13768906SEric.Saxe@Sun.COM 	 * Find and eliminate the PG from the lineage.
13778906SEric.Saxe@Sun.COM 	 */
13788906SEric.Saxe@Sun.COM 	for (i = 0; i < size; i++) {
13798906SEric.Saxe@Sun.COM 		if (lineage[i] == pg_bad) {
13808906SEric.Saxe@Sun.COM 			for (j = i; j < size - 1; j++)
13818906SEric.Saxe@Sun.COM 				lineage[j] = lineage[j + 1];
13828906SEric.Saxe@Sun.COM 			*sz = size - 1;
13838906SEric.Saxe@Sun.COM 			break;
13848906SEric.Saxe@Sun.COM 		}
13858906SEric.Saxe@Sun.COM 	}
13868906SEric.Saxe@Sun.COM 
13878906SEric.Saxe@Sun.COM 	/*
13888906SEric.Saxe@Sun.COM 	 * We'll prune all instances of the hardware sharing relationship
13898906SEric.Saxe@Sun.COM 	 * represented by pg. But before we do that (and pause CPUs) we need
13908906SEric.Saxe@Sun.COM 	 * to ensure the hierarchy's groups are properly sized.
13918906SEric.Saxe@Sun.COM 	 */
13928906SEric.Saxe@Sun.COM 	hwset = pghw_set_lookup(hw);
13938906SEric.Saxe@Sun.COM 
13948906SEric.Saxe@Sun.COM 	/*
13958906SEric.Saxe@Sun.COM 	 * Blacklist the hardware so that future groups won't be created.
13968906SEric.Saxe@Sun.COM 	 */
13978906SEric.Saxe@Sun.COM 	cmt_hw_blacklisted[hw] = 1;
13988906SEric.Saxe@Sun.COM 
13998906SEric.Saxe@Sun.COM 	/*
14008906SEric.Saxe@Sun.COM 	 * For each of the PGs being pruned, ensure sufficient capacity in
14018906SEric.Saxe@Sun.COM 	 * the siblings set for the PG's children
14028906SEric.Saxe@Sun.COM 	 */
14038906SEric.Saxe@Sun.COM 	group_iter_init(&hw_iter);
14048906SEric.Saxe@Sun.COM 	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
14058906SEric.Saxe@Sun.COM 		/*
14068906SEric.Saxe@Sun.COM 		 * PG is being pruned, but if it is bringing up more than
14078906SEric.Saxe@Sun.COM 		 * one child, ask for more capacity in the siblings group.
14088906SEric.Saxe@Sun.COM 		 */
14098906SEric.Saxe@Sun.COM 		cap_needed = 0;
14108906SEric.Saxe@Sun.COM 		if (pg->cmt_children &&
14118906SEric.Saxe@Sun.COM 		    GROUP_SIZE(pg->cmt_children) > 1) {
14128906SEric.Saxe@Sun.COM 			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;
14138906SEric.Saxe@Sun.COM 
14148906SEric.Saxe@Sun.COM 			group_expand(pg->cmt_siblings,
14158906SEric.Saxe@Sun.COM 			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);
14168408SEric.Saxe@Sun.COM 
14178408SEric.Saxe@Sun.COM 			/*
14188906SEric.Saxe@Sun.COM 			 * If this is a top level group, also ensure the
14198906SEric.Saxe@Sun.COM 			 * capacity in the root lgrp level CMT grouping.
14208408SEric.Saxe@Sun.COM 			 */
14218906SEric.Saxe@Sun.COM 			if (pg->cmt_parent == NULL &&
14228906SEric.Saxe@Sun.COM 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
14238906SEric.Saxe@Sun.COM 				group_expand(&cmt_root->cl_pgs,
14248906SEric.Saxe@Sun.COM 				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
14258408SEric.Saxe@Sun.COM 			}
14268906SEric.Saxe@Sun.COM 		}
14278906SEric.Saxe@Sun.COM 	}
14288408SEric.Saxe@Sun.COM 
14298906SEric.Saxe@Sun.COM 	/*
14308906SEric.Saxe@Sun.COM 	 * We're operating on the PG hierarchy. Pause CPUs to ensure
14318906SEric.Saxe@Sun.COM 	 * exclusivity with respect to the dispatcher.
14328906SEric.Saxe@Sun.COM 	 */
14338906SEric.Saxe@Sun.COM 	pause_cpus(NULL);
14348408SEric.Saxe@Sun.COM 
14358906SEric.Saxe@Sun.COM 	/*
14368906SEric.Saxe@Sun.COM 	 * Prune all PG instances of the hardware sharing relationship
14378906SEric.Saxe@Sun.COM 	 * represented by pg.
14388906SEric.Saxe@Sun.COM 	 */
14398906SEric.Saxe@Sun.COM 	group_iter_init(&hw_iter);
14408906SEric.Saxe@Sun.COM 	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
14418408SEric.Saxe@Sun.COM 
14428408SEric.Saxe@Sun.COM 		/*
14438906SEric.Saxe@Sun.COM 		 * Remove PG from its group of siblings, if it's there.
14448906SEric.Saxe@Sun.COM 		 */
14458906SEric.Saxe@Sun.COM 		if (pg->cmt_siblings) {
14468906SEric.Saxe@Sun.COM 			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
14478906SEric.Saxe@Sun.COM 		}
14488906SEric.Saxe@Sun.COM 		if (pg->cmt_parent == NULL &&
14498906SEric.Saxe@Sun.COM 		    pg->cmt_siblings != &cmt_root->cl_pgs) {
14508906SEric.Saxe@Sun.COM 			(void) group_remove(&cmt_root->cl_pgs, pg,
14518906SEric.Saxe@Sun.COM 			    GRP_NORESIZE);
14528906SEric.Saxe@Sun.COM 		}
14538906SEric.Saxe@Sun.COM 		/*
14548906SEric.Saxe@Sun.COM 		 * Add PG's children to its group of siblings.
14558906SEric.Saxe@Sun.COM 		 */
14568906SEric.Saxe@Sun.COM 		if (pg->cmt_children != NULL) {
14578906SEric.Saxe@Sun.COM 			children = pg->cmt_children;
14588906SEric.Saxe@Sun.COM 
14598906SEric.Saxe@Sun.COM 			group_iter_init(&child_iter);
14608906SEric.Saxe@Sun.COM 			while ((child = group_iterate(children, &child_iter))
14618906SEric.Saxe@Sun.COM 			    != NULL) {
14628906SEric.Saxe@Sun.COM 				/*
14638906SEric.Saxe@Sun.COM 				 * Transplant child from its siblings set to
14648906SEric.Saxe@Sun.COM 				 * PG's.
14658906SEric.Saxe@Sun.COM 				 */
14668906SEric.Saxe@Sun.COM 				if (pg->cmt_siblings != NULL &&
14678906SEric.Saxe@Sun.COM 				    child->cmt_siblings != NULL &&
14688906SEric.Saxe@Sun.COM 				    group_remove(child->cmt_siblings, child,
14698906SEric.Saxe@Sun.COM 				    GRP_NORESIZE) != -1) {
14708906SEric.Saxe@Sun.COM 					r = group_add(pg->cmt_siblings, child,
14718906SEric.Saxe@Sun.COM 					    GRP_NORESIZE);
14728906SEric.Saxe@Sun.COM 					ASSERT(r == 0);
14738906SEric.Saxe@Sun.COM 				}
14748906SEric.Saxe@Sun.COM 			}
14758906SEric.Saxe@Sun.COM 		}
14768906SEric.Saxe@Sun.COM 
14778906SEric.Saxe@Sun.COM 		/*
14788906SEric.Saxe@Sun.COM 		 * Reset the callbacks to the defaults
14798906SEric.Saxe@Sun.COM 		 */
14808906SEric.Saxe@Sun.COM 		pg_callback_set_defaults((pg_t *)pg);
14818906SEric.Saxe@Sun.COM 
14828906SEric.Saxe@Sun.COM 		/*
14838906SEric.Saxe@Sun.COM 		 * Update all the CPU lineages in each of PG's CPUs
14848408SEric.Saxe@Sun.COM 		 */
14858906SEric.Saxe@Sun.COM 		PG_CPU_ITR_INIT(pg, cpu_iter);
14868906SEric.Saxe@Sun.COM 		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
14878906SEric.Saxe@Sun.COM 			group_t		*pgs;
14888906SEric.Saxe@Sun.COM 			pg_cmt_t	*cpu_pg;
14898906SEric.Saxe@Sun.COM 			group_iter_t	liter;	/* Iterator for the lineage */
14908906SEric.Saxe@Sun.COM 
14918906SEric.Saxe@Sun.COM 			/*
14928906SEric.Saxe@Sun.COM 			 * Iterate over the CPU's PGs updating the children
14938906SEric.Saxe@Sun.COM 			 * of the PG being promoted, since they have a new
14948906SEric.Saxe@Sun.COM 			 * parent and siblings set.
14958906SEric.Saxe@Sun.COM 			 */
14968906SEric.Saxe@Sun.COM 			pgs = &cpu->cpu_pg->pgs;
14978906SEric.Saxe@Sun.COM 			group_iter_init(&liter);
14988906SEric.Saxe@Sun.COM 			while ((cpu_pg = group_iterate(pgs, &liter)) != NULL) {
14998906SEric.Saxe@Sun.COM 				if (cpu_pg->cmt_parent == pg) {
15008906SEric.Saxe@Sun.COM 					cpu_pg->cmt_parent = pg->cmt_parent;
15018906SEric.Saxe@Sun.COM 					cpu_pg->cmt_siblings = pg->cmt_siblings;
15028906SEric.Saxe@Sun.COM 				}
15038906SEric.Saxe@Sun.COM 			}
15048906SEric.Saxe@Sun.COM 
15058906SEric.Saxe@Sun.COM 			/*
15068906SEric.Saxe@Sun.COM 			 * Update the CPU's lineages
15078906SEric.Saxe@Sun.COM 			 */
15088906SEric.Saxe@Sun.COM 			pgs = &cpu->cpu_pg->cmt_pgs;
15098906SEric.Saxe@Sun.COM 			(void) group_remove(pgs, pg, GRP_NORESIZE);
15108906SEric.Saxe@Sun.COM 			pgs = &cpu->cpu_pg->pgs;
15118906SEric.Saxe@Sun.COM 			(void) group_remove(pgs, pg, GRP_NORESIZE);
15128408SEric.Saxe@Sun.COM 		}
15138906SEric.Saxe@Sun.COM 	}
15148906SEric.Saxe@Sun.COM 	start_cpus();
15158906SEric.Saxe@Sun.COM 	return (0);
15168906SEric.Saxe@Sun.COM }
15178906SEric.Saxe@Sun.COM 
15188906SEric.Saxe@Sun.COM /*
15198906SEric.Saxe@Sun.COM  * Disable CMT scheduling
15208906SEric.Saxe@Sun.COM  */
15218906SEric.Saxe@Sun.COM static void
15228906SEric.Saxe@Sun.COM pg_cmt_disable(void)
15238906SEric.Saxe@Sun.COM {
15248906SEric.Saxe@Sun.COM 	cpu_t	*cpu;
15258906SEric.Saxe@Sun.COM 
15268906SEric.Saxe@Sun.COM 	pause_cpus(NULL);
15278906SEric.Saxe@Sun.COM 	cpu = cpu_list;
15288906SEric.Saxe@Sun.COM 
15298906SEric.Saxe@Sun.COM 	do {
15308906SEric.Saxe@Sun.COM 		if (cpu->cpu_pg)
15318906SEric.Saxe@Sun.COM 			group_empty(&cpu->cpu_pg->cmt_pgs);
15328906SEric.Saxe@Sun.COM 	} while ((cpu = cpu->cpu_next) != cpu_list);
15338906SEric.Saxe@Sun.COM 
15348906SEric.Saxe@Sun.COM 	cmt_sched_disabled = 1;
15358906SEric.Saxe@Sun.COM 	start_cpus();
15368906SEric.Saxe@Sun.COM 	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
15378906SEric.Saxe@Sun.COM }
15388408SEric.Saxe@Sun.COM 
15398906SEric.Saxe@Sun.COM static int
15408906SEric.Saxe@Sun.COM pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz)
15418906SEric.Saxe@Sun.COM {
15428906SEric.Saxe@Sun.COM 	int		i, size;
15438906SEric.Saxe@Sun.COM 	pg_cmt_t	*pg, *parent, *pg_bad;
15448906SEric.Saxe@Sun.COM 	cpu_t		*cp;
15458906SEric.Saxe@Sun.COM 	pg_cpu_itr_t	cpu_iter;
15468906SEric.Saxe@Sun.COM 
15478906SEric.Saxe@Sun.COM 	ASSERT(MUTEX_HELD(&cpu_lock));
15488906SEric.Saxe@Sun.COM 
15498906SEric.Saxe@Sun.COM revalidate:
15508906SEric.Saxe@Sun.COM 	size = *sz;
15518906SEric.Saxe@Sun.COM 	pg_bad = NULL;
15528906SEric.Saxe@Sun.COM 	for (i = 0; i < size - 1; i++) {
15538906SEric.Saxe@Sun.COM 
15548906SEric.Saxe@Sun.COM 		pg = lineage[i];
15558906SEric.Saxe@Sun.COM 		parent = lineage[i + 1];
15568408SEric.Saxe@Sun.COM 
15578906SEric.Saxe@Sun.COM 		/*
15588906SEric.Saxe@Sun.COM 		 * We assume that the lineage has already been sorted
15598906SEric.Saxe@Sun.COM 		 * by the number of CPUs. In fact, we depend on it.
15608906SEric.Saxe@Sun.COM 		 */
15618906SEric.Saxe@Sun.COM 		ASSERT(PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)parent));
15628906SEric.Saxe@Sun.COM 
15638906SEric.Saxe@Sun.COM 		/*
15648906SEric.Saxe@Sun.COM 		 * Walk each of the CPUs in the PGs group, and verify that
15658906SEric.Saxe@Sun.COM 		 * the next larger PG contains at least the CPUs in this one.
15668906SEric.Saxe@Sun.COM 		 */
15678906SEric.Saxe@Sun.COM 		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
15688906SEric.Saxe@Sun.COM 		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
15698906SEric.Saxe@Sun.COM 			if (pg_cpu_find((pg_t *)parent, cp) == B_FALSE) {
15708906SEric.Saxe@Sun.COM 				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
15718906SEric.Saxe@Sun.COM 				goto handle_error;
15728906SEric.Saxe@Sun.COM 			}
15738906SEric.Saxe@Sun.COM 		}
15748408SEric.Saxe@Sun.COM 	}
15758408SEric.Saxe@Sun.COM 
15768906SEric.Saxe@Sun.COM handle_error:
15778906SEric.Saxe@Sun.COM 	switch (cmt_lineage_status) {
15788906SEric.Saxe@Sun.COM 	case CMT_LINEAGE_VALID:
15798906SEric.Saxe@Sun.COM 	case CMT_LINEAGE_REPAIRED:
15808906SEric.Saxe@Sun.COM 		break;
15818906SEric.Saxe@Sun.COM 	case CMT_LINEAGE_NON_CONCENTRIC:
15828408SEric.Saxe@Sun.COM 		/*
15838906SEric.Saxe@Sun.COM 		 * We've detected a non-concentric PG lineage.
15848906SEric.Saxe@Sun.COM 		 *
15858906SEric.Saxe@Sun.COM 		 * This can happen when some of the CPU grouping information
15868906SEric.Saxe@Sun.COM 		 * is derived from buggy sources (for example, incorrect ACPI
15878906SEric.Saxe@Sun.COM 		 * tables on x86 systems).
15888906SEric.Saxe@Sun.COM 		 *
15898906SEric.Saxe@Sun.COM 		 * We attempt to recover from this by pruning out the
15908906SEric.Saxe@Sun.COM 		 * illegal groupings from the PG hierarchy, which means that
15918906SEric.Saxe@Sun.COM 		 * we won't optimize for those levels, but we will for the
15928906SEric.Saxe@Sun.COM 		 * remaining ones.
15938906SEric.Saxe@Sun.COM 		 *
15948906SEric.Saxe@Sun.COM 		 * If a given level has CPUs not found in it's parent, then
15958906SEric.Saxe@Sun.COM 		 * we examine the PG and it's parent to see if either grouping
15968906SEric.Saxe@Sun.COM 		 * is enumerated from potentially buggy sources.
15978906SEric.Saxe@Sun.COM 		 *
15988906SEric.Saxe@Sun.COM 		 * If one has less CPUs than the other, and contains CPUs
15998906SEric.Saxe@Sun.COM 		 * not found in the parent, and it is an untrusted enumeration,
16008906SEric.Saxe@Sun.COM 		 * then prune it. If both have the same number of CPUs, then
16018906SEric.Saxe@Sun.COM 		 * prune the one that is untrusted.
16028906SEric.Saxe@Sun.COM 		 *
16038906SEric.Saxe@Sun.COM 		 * This process repeats until we have a concentric lineage,
16048906SEric.Saxe@Sun.COM 		 * or we would have to prune out level derived from what we
16058906SEric.Saxe@Sun.COM 		 * thought was a reliable source, in which case CMT scheduling
16068906SEric.Saxe@Sun.COM 		 * is disabled all together.
16078408SEric.Saxe@Sun.COM 		 */
16088906SEric.Saxe@Sun.COM 		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)parent)) &&
16098906SEric.Saxe@Sun.COM 		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
16108906SEric.Saxe@Sun.COM 			pg_bad = pg;
16118906SEric.Saxe@Sun.COM 		} else if (PG_NUM_CPUS((pg_t *)pg) ==
16128906SEric.Saxe@Sun.COM 		    PG_NUM_CPUS((pg_t *)parent)) {
16138906SEric.Saxe@Sun.COM 			if (PG_CMT_HW_SUSPECT(((pghw_t *)parent)->pghw_hw)) {
16148906SEric.Saxe@Sun.COM 				pg_bad = parent;
16158906SEric.Saxe@Sun.COM 			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
16168906SEric.Saxe@Sun.COM 				pg_bad = pg;
16178906SEric.Saxe@Sun.COM 			}
16188906SEric.Saxe@Sun.COM 		}
16198906SEric.Saxe@Sun.COM 		if (pg_bad) {
16208906SEric.Saxe@Sun.COM 			if (pg_cmt_prune(pg_bad, lineage, sz) == 0) {
16218906SEric.Saxe@Sun.COM 				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
16228906SEric.Saxe@Sun.COM 				goto revalidate;
16238408SEric.Saxe@Sun.COM 			}
16248906SEric.Saxe@Sun.COM 		}
16258906SEric.Saxe@Sun.COM 		/*FALLTHROUGH*/
16268906SEric.Saxe@Sun.COM 	default:
16278906SEric.Saxe@Sun.COM 		/*
16288906SEric.Saxe@Sun.COM 		 * If we're here, something has gone wrong in trying to
16298906SEric.Saxe@Sun.COM 		 * recover from a illegal PG hierarchy, or we've encountered
16308906SEric.Saxe@Sun.COM 		 * a validation error for which we don't know how to recover.
16318906SEric.Saxe@Sun.COM 		 * In this case, disable CMT scheduling all together.
16328906SEric.Saxe@Sun.COM 		 */
16338906SEric.Saxe@Sun.COM 		pg_cmt_disable();
16348906SEric.Saxe@Sun.COM 		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
16358906SEric.Saxe@Sun.COM 		return (-1);
16368408SEric.Saxe@Sun.COM 	}
16378906SEric.Saxe@Sun.COM 	return (0);
16388408SEric.Saxe@Sun.COM }
1639