xref: /onnv-gate/usr/src/uts/common/disp/cmt.c (revision 3676:4975133d76f2)
13434Sesaxe /*
23434Sesaxe  * CDDL HEADER START
33434Sesaxe  *
43434Sesaxe  * The contents of this file are subject to the terms of the
53434Sesaxe  * Common Development and Distribution License (the "License").
63434Sesaxe  * You may not use this file except in compliance with the License.
73434Sesaxe  *
83434Sesaxe  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
93434Sesaxe  * or http://www.opensolaris.org/os/licensing.
103434Sesaxe  * See the License for the specific language governing permissions
113434Sesaxe  * and limitations under the License.
123434Sesaxe  *
133434Sesaxe  * When distributing Covered Code, include this CDDL HEADER in each
143434Sesaxe  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
153434Sesaxe  * If applicable, add the following below this CDDL HEADER, with the
163434Sesaxe  * fields enclosed by brackets "[]" replaced with your own identifying
173434Sesaxe  * information: Portions Copyright [yyyy] [name of copyright owner]
183434Sesaxe  *
193434Sesaxe  * CDDL HEADER END
203434Sesaxe  */
213434Sesaxe /*
223434Sesaxe  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
233434Sesaxe  * Use is subject to license terms.
243434Sesaxe  */
253434Sesaxe 
263434Sesaxe #pragma ident	"%Z%%M%	%I%	%E% SMI"
273434Sesaxe 
283434Sesaxe #include <sys/systm.h>
293434Sesaxe #include <sys/types.h>
303434Sesaxe #include <sys/param.h>
313434Sesaxe #include <sys/thread.h>
323434Sesaxe #include <sys/cpuvar.h>
333434Sesaxe #include <sys/cpupart.h>
343434Sesaxe #include <sys/kmem.h>
353434Sesaxe #include <sys/cmn_err.h>
363434Sesaxe #include <sys/kstat.h>
373434Sesaxe #include <sys/processor.h>
383434Sesaxe #include <sys/disp.h>
393434Sesaxe #include <sys/group.h>
403434Sesaxe #include <sys/pghw.h>
413434Sesaxe #include <sys/bitset.h>
423434Sesaxe #include <sys/lgrp.h>
433434Sesaxe #include <sys/cmt.h>
443434Sesaxe 
453434Sesaxe /*
463434Sesaxe  * CMT scheduler / dispatcher support
473434Sesaxe  *
483434Sesaxe  * This file implements CMT scheduler support using Processor Groups.
493434Sesaxe  * The CMT processor group class creates and maintains the CMT class
503434Sesaxe  * specific processor group pg_cmt_t.
513434Sesaxe  *
523434Sesaxe  * ---------------------------- <-- pg_cmt_t *
533434Sesaxe  * | pghw_t                   |
543434Sesaxe  * ----------------------------
553434Sesaxe  * | CMT class specific data  |
563434Sesaxe  * | - hierarchy linkage      |
573434Sesaxe  * | - CMT load balancing data|
583434Sesaxe  * | - active CPU group/bitset|
593434Sesaxe  * ----------------------------
603434Sesaxe  *
613434Sesaxe  * The scheduler/dispatcher leverages knowledge of the performance
623434Sesaxe  * relevant CMT sharing relationships existing between cpus to implement
633434Sesaxe  * optimized affinity and load balancing policies.
643434Sesaxe  *
653434Sesaxe  * Load balancing policy seeks to improve performance by minimizing
663434Sesaxe  * contention over shared processor resources / facilities, while the
673434Sesaxe  * affinity policies seek to improve cache and TLB utilization.
683434Sesaxe  *
693434Sesaxe  * The CMT PGs created by this class are already arranged into a
703434Sesaxe  * hierarchy (which is done in the pghw layer). To implement the top-down
713434Sesaxe  * CMT load balancing algorithm, the CMT PGs additionally maintain
723434Sesaxe  * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
743434Sesaxe  * each PG can have at most one parent, and siblings are the group of PGs
753434Sesaxe  * sharing the same parent.
763434Sesaxe  *
773434Sesaxe  * On NUMA systems, the CMT load balancing algorithm balances across the
783434Sesaxe  * CMT PGs within their respective lgroups. On UMA based system, there
793434Sesaxe  * exists a top level group of PGs to balance across. On NUMA systems multiple
803434Sesaxe  * top level groups are instantiated, where the top level balancing begins by
 * balancing across the CMT PGs within their respective (per lgroup) top level
823434Sesaxe  * groups.
833434Sesaxe  */
843434Sesaxe 
853434Sesaxe typedef struct cmt_lgrp {
863434Sesaxe 	group_t		cl_pgs;		/* Top level group of active CMT PGs */
873434Sesaxe 	int		cl_npgs;	/* # of top level PGs in the lgroup */
883434Sesaxe 	lgrp_handle_t	cl_hand;	/* lgroup's platform handle */
893434Sesaxe 	struct cmt_lgrp *cl_next;	/* next cmt_lgrp */
903434Sesaxe } cmt_lgrp_t;
913434Sesaxe 
92*3676Sesaxe static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
93*3676Sesaxe static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
94*3676Sesaxe 						/* used for null_proc_lpa */
953434Sesaxe 
96*3676Sesaxe static int		is_cpu0 = 1; /* true if this is boot CPU context */
97*3676Sesaxe 
98*3676Sesaxe /*
99*3676Sesaxe  * Set this to non-zero to disable CMT scheduling
100*3676Sesaxe  * This must be done via kmdb -d, as /etc/system will be too late
101*3676Sesaxe  */
1023434Sesaxe static int		cmt_sched_disabled = 0;
1033434Sesaxe 
1043434Sesaxe static pg_cid_t		pg_cmt_class_id;		/* PG class id */
1053434Sesaxe 
1063434Sesaxe static pg_t		*pg_cmt_alloc();
1073434Sesaxe static void		pg_cmt_free(pg_t *);
1083434Sesaxe static void		pg_cmt_cpu_init(cpu_t *);
1093434Sesaxe static void		pg_cmt_cpu_fini(cpu_t *);
1103434Sesaxe static void		pg_cmt_cpu_active(cpu_t *);
1113434Sesaxe static void		pg_cmt_cpu_inactive(cpu_t *);
1123434Sesaxe static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
1133434Sesaxe static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
1143434Sesaxe static void		pg_cmt_hier_pack(pg_cmt_t **, int);
1153434Sesaxe static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
1163434Sesaxe static int		pg_cmt_hw(pghw_type_t);
1173434Sesaxe static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
118*3676Sesaxe static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
1193434Sesaxe 
1203434Sesaxe /*
1213434Sesaxe  * Macro to test if PG is managed by the CMT PG class
1223434Sesaxe  */
1233434Sesaxe #define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)
1243434Sesaxe 
1253434Sesaxe /*
1263434Sesaxe  * CMT PG ops
1273434Sesaxe  */
1283434Sesaxe struct pg_ops pg_ops_cmt = {
1293434Sesaxe 	pg_cmt_alloc,
1303434Sesaxe 	pg_cmt_free,
1313434Sesaxe 	pg_cmt_cpu_init,
1323434Sesaxe 	pg_cmt_cpu_fini,
1333434Sesaxe 	pg_cmt_cpu_active,
1343434Sesaxe 	pg_cmt_cpu_inactive,
1353434Sesaxe 	pg_cmt_cpupart_in,
1363434Sesaxe 	NULL,			/* cpupart_out */
1373434Sesaxe 	pg_cmt_cpupart_move,
1383434Sesaxe 	pg_cmt_cpu_belongs,
1393434Sesaxe };
1403434Sesaxe 
1413434Sesaxe /*
1423434Sesaxe  * Initialize the CMT PG class
1433434Sesaxe  */
1443434Sesaxe void
1453434Sesaxe pg_cmt_class_init(void)
1463434Sesaxe {
1473434Sesaxe 	if (cmt_sched_disabled)
1483434Sesaxe 		return;
1493434Sesaxe 
1503434Sesaxe 	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
1513434Sesaxe }
1523434Sesaxe 
1533434Sesaxe /*
1543434Sesaxe  * Called to indicate a new CPU has started up so
1553434Sesaxe  * that either t0 or the slave startup thread can
1563434Sesaxe  * be accounted for.
1573434Sesaxe  */
1583434Sesaxe void
1593434Sesaxe pg_cmt_cpu_startup(cpu_t *cp)
1603434Sesaxe {
1613434Sesaxe 	PG_NRUN_UPDATE(cp, 1);
1623434Sesaxe }
1633434Sesaxe 
1643434Sesaxe /*
1653434Sesaxe  * Adjust the CMT load in the CMT PGs in which the CPU belongs
1663434Sesaxe  * Note that "n" can be positive in the case of increasing
1673434Sesaxe  * load, or negative in the case of decreasing load.
1683434Sesaxe  */
1693434Sesaxe void
1703434Sesaxe pg_cmt_load(cpu_t *cp, int n)
1713434Sesaxe {
1723434Sesaxe 	pg_cmt_t	*pg;
1733434Sesaxe 
1743434Sesaxe 	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
1753434Sesaxe 	while (pg != NULL) {
1763434Sesaxe 		ASSERT(IS_CMT_PG(pg));
1773434Sesaxe 		atomic_add_32(&pg->cmt_nrunning, n);
1783434Sesaxe 		pg = pg->cmt_parent;
1793434Sesaxe 	}
1803434Sesaxe }
1813434Sesaxe 
1823434Sesaxe /*
1833434Sesaxe  * Return non-zero if thread can migrate between "from" and "to"
1843434Sesaxe  * without a performance penalty
1853434Sesaxe  */
1863434Sesaxe int
1873434Sesaxe pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
1883434Sesaxe {
1893434Sesaxe 	if (from->cpu_physid->cpu_cacheid ==
1903434Sesaxe 	    to->cpu_physid->cpu_cacheid)
1913434Sesaxe 		return (1);
1923434Sesaxe 	return (0);
1933434Sesaxe }
1943434Sesaxe 
1953434Sesaxe /*
1963434Sesaxe  * CMT class specific PG allocation
1973434Sesaxe  */
1983434Sesaxe static pg_t *
1993434Sesaxe pg_cmt_alloc(void)
2003434Sesaxe {
2013434Sesaxe 	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
2023434Sesaxe }
2033434Sesaxe 
2043434Sesaxe /*
2053434Sesaxe  * Class specific PG de-allocation
2063434Sesaxe  */
2073434Sesaxe static void
2083434Sesaxe pg_cmt_free(pg_t *pg)
2093434Sesaxe {
2103434Sesaxe 	ASSERT(pg != NULL);
2113434Sesaxe 	ASSERT(IS_CMT_PG(pg));
2123434Sesaxe 
2133434Sesaxe 	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
2143434Sesaxe }
2153434Sesaxe 
2163434Sesaxe /*
2173434Sesaxe  * Return 1 if CMT load balancing policies should be
2183434Sesaxe  * implemented across instances of the specified hardware
2193434Sesaxe  * sharing relationship.
2203434Sesaxe  */
2213434Sesaxe static int
2223434Sesaxe pg_cmt_load_bal_hw(pghw_type_t hw)
2233434Sesaxe {
2243434Sesaxe 	if (hw == PGHW_IPIPE ||
2253434Sesaxe 	    hw == PGHW_FPU ||
2263434Sesaxe 	    hw == PGHW_CHIP)
2273434Sesaxe 		return (1);
2283434Sesaxe 	else
2293434Sesaxe 		return (0);
2303434Sesaxe }
2313434Sesaxe 
2323434Sesaxe /*
2333434Sesaxe  * Return 1 if thread affinity polices should be implemented
2343434Sesaxe  * for instances of the specifed hardware sharing relationship.
2353434Sesaxe  */
2363434Sesaxe static int
2373434Sesaxe pg_cmt_affinity_hw(pghw_type_t hw)
2383434Sesaxe {
2393434Sesaxe 	if (hw == PGHW_CACHE)
2403434Sesaxe 		return (1);
2413434Sesaxe 	else
2423434Sesaxe 		return (0);
2433434Sesaxe }
2443434Sesaxe 
2453434Sesaxe /*
2463434Sesaxe  * Return 1 if CMT scheduling policies should be impelmented
2473434Sesaxe  * for the specified hardware sharing relationship.
2483434Sesaxe  */
2493434Sesaxe static int
2503434Sesaxe pg_cmt_hw(pghw_type_t hw)
2513434Sesaxe {
2523434Sesaxe 	return (pg_cmt_load_bal_hw(hw) ||
2533434Sesaxe 	    pg_cmt_affinity_hw(hw));
2543434Sesaxe }
2553434Sesaxe 
2563434Sesaxe /*
2573434Sesaxe  * CMT class callback for a new CPU entering the system
2583434Sesaxe  */
2593434Sesaxe static void
2603434Sesaxe pg_cmt_cpu_init(cpu_t *cp)
2613434Sesaxe {
2623434Sesaxe 	pg_cmt_t	*pg;
2633434Sesaxe 	group_t		*cmt_pgs;
2643434Sesaxe 	int		level, max_level, nlevels;
2653434Sesaxe 	pghw_type_t	hw;
2663434Sesaxe 	pg_t		*pg_cache = NULL;
2673434Sesaxe 	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
2683434Sesaxe 	lgrp_handle_t	lgrp_handle;
2693434Sesaxe 	cmt_lgrp_t	*lgrp;
2703434Sesaxe 
2713434Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
2723434Sesaxe 
2733434Sesaxe 	/*
2743434Sesaxe 	 * A new CPU is coming into the system.
2753434Sesaxe 	 * Interrogate the platform to see if the CPU
2763434Sesaxe 	 * has any performance relevant CMT sharing
2773434Sesaxe 	 * relationships
2783434Sesaxe 	 */
2793434Sesaxe 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
2803434Sesaxe 	cp->cpu_pg->cmt_lineage = NULL;
2813434Sesaxe 
2823434Sesaxe 	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
2833434Sesaxe 	max_level = nlevels = 0;
2843434Sesaxe 	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {
2853434Sesaxe 
2863434Sesaxe 		/*
2873434Sesaxe 		 * We're only interested in CMT hw sharing relationships
2883434Sesaxe 		 */
2893434Sesaxe 		if (pg_cmt_hw(hw) == 0 || pg_plat_hw_shared(cp, hw) == 0)
2903434Sesaxe 			continue;
2913434Sesaxe 
2923434Sesaxe 		/*
2933434Sesaxe 		 * Find (or create) the PG associated with
2943434Sesaxe 		 * the hw sharing relationship in which cp
2953434Sesaxe 		 * belongs.
2963434Sesaxe 		 *
2973434Sesaxe 		 * Determine if a suitable PG already
2983434Sesaxe 		 * exists, or if one needs to be created.
2993434Sesaxe 		 */
3003434Sesaxe 		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
3013434Sesaxe 		if (pg == NULL) {
3023434Sesaxe 			/*
3033434Sesaxe 			 * Create a new one.
3043434Sesaxe 			 * Initialize the common...
3053434Sesaxe 			 */
3063434Sesaxe 			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);
3073434Sesaxe 
3083434Sesaxe 			/* ... physical ... */
3093434Sesaxe 			pghw_init((pghw_t *)pg, cp, hw);
3103434Sesaxe 
3113434Sesaxe 			/*
3123434Sesaxe 			 * ... and CMT specific portions of the
3133434Sesaxe 			 * structure.
3143434Sesaxe 			 */
3153434Sesaxe 			bitset_init(&pg->cmt_cpus_actv_set);
3163434Sesaxe 			group_create(&pg->cmt_cpus_actv);
3173434Sesaxe 		} else {
3183434Sesaxe 			ASSERT(IS_CMT_PG(pg));
3193434Sesaxe 		}
3203434Sesaxe 
3213434Sesaxe 		/* Add the CPU to the PG */
3223434Sesaxe 		pg_cpu_add((pg_t *)pg, cp);
3233434Sesaxe 
3243434Sesaxe 		/*
3253434Sesaxe 		 * Ensure capacity of the active CPUs group/bitset
3263434Sesaxe 		 */
3273434Sesaxe 		group_expand(&pg->cmt_cpus_actv,
3283434Sesaxe 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
3293434Sesaxe 
3303434Sesaxe 		if (cp->cpu_seqid >=
3313434Sesaxe 		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
3323434Sesaxe 			bitset_resize(&pg->cmt_cpus_actv_set,
3333434Sesaxe 			    cp->cpu_seqid + 1);
3343434Sesaxe 		}
3353434Sesaxe 
3363434Sesaxe 		/*
3373434Sesaxe 		 * Build a lineage of CMT PGs for load balancing
3383434Sesaxe 		 */
3393434Sesaxe 		if (pg_cmt_load_bal_hw(hw)) {
3403434Sesaxe 			level = pghw_level(hw);
3413434Sesaxe 			cpu_cmt_hier[level] = pg;
3423434Sesaxe 			if (level > max_level)
3433434Sesaxe 				max_level = level;
3443434Sesaxe 			nlevels++;
3453434Sesaxe 		}
3463434Sesaxe 
3473434Sesaxe 		/* Cache this for later */
3483434Sesaxe 		if (hw == PGHW_CACHE)
3493434Sesaxe 			pg_cache = (pg_t *)pg;
3503434Sesaxe 	}
3513434Sesaxe 
3523434Sesaxe 	/*
3533434Sesaxe 	 * Pack out any gaps in the constructed lineage.
3543434Sesaxe 	 * Gaps may exist where the architecture knows
3553434Sesaxe 	 * about a hardware sharing relationship, but such a
3563434Sesaxe 	 * relationship either isn't relevant for load
3573434Sesaxe 	 * balancing or doesn't exist between CPUs on the system.
3583434Sesaxe 	 */
3593434Sesaxe 	pg_cmt_hier_pack(cpu_cmt_hier, max_level + 1);
3603434Sesaxe 
3613434Sesaxe 	/*
3623434Sesaxe 	 * For each of the PGs int the CPU's lineage:
3633434Sesaxe 	 *	- Add an entry in the CPU sorted CMT PG group
3643434Sesaxe 	 *	  which is used for top down CMT load balancing
3653434Sesaxe 	 *	- Tie the PG into the CMT hierarchy by connecting
3663434Sesaxe 	 *	  it to it's parent and siblings.
3673434Sesaxe 	 */
3683434Sesaxe 	group_expand(cmt_pgs, nlevels);
3693434Sesaxe 
3703434Sesaxe 	/*
3713434Sesaxe 	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
3723434Sesaxe 	 */
3733434Sesaxe 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
3743434Sesaxe 	lgrp = pg_cmt_find_lgrp(lgrp_handle);
375*3676Sesaxe 	if (lgrp == NULL)
376*3676Sesaxe 		lgrp = pg_cmt_lgrp_create(lgrp_handle);
3773434Sesaxe 
3783434Sesaxe 	for (level = 0; level < nlevels; level++) {
3793434Sesaxe 		uint_t		children;
3803434Sesaxe 		int		err;
3813434Sesaxe 
3823434Sesaxe 		pg = cpu_cmt_hier[level];
3833434Sesaxe 		err = group_add_at(cmt_pgs, pg, nlevels - level - 1);
3843434Sesaxe 		ASSERT(err == 0);
3853434Sesaxe 
3863434Sesaxe 		if (level == 0)
3873434Sesaxe 			cp->cpu_pg->cmt_lineage = (pg_t *)pg;
3883434Sesaxe 
3893434Sesaxe 		if (pg->cmt_siblings != NULL) {
3903434Sesaxe 			/* Already initialized */
3913434Sesaxe 			ASSERT(pg->cmt_parent == NULL ||
3923434Sesaxe 			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
3933434Sesaxe 			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
3943434Sesaxe 			    pg->cmt_siblings == pg->cmt_parent->cmt_children);
3953434Sesaxe 			continue;
3963434Sesaxe 		}
3973434Sesaxe 
3983434Sesaxe 		if ((level + 1) == nlevels) {
3993434Sesaxe 			pg->cmt_parent = NULL;
4003434Sesaxe 			pg->cmt_siblings = &lgrp->cl_pgs;
4013434Sesaxe 			children = ++lgrp->cl_npgs;
4023434Sesaxe 		} else {
4033434Sesaxe 			pg->cmt_parent = cpu_cmt_hier[level + 1];
4043434Sesaxe 
4053434Sesaxe 			/*
4063434Sesaxe 			 * A good parent keeps track of their children.
4073434Sesaxe 			 * The parent's children group is also the PG's
4083434Sesaxe 			 * siblings.
4093434Sesaxe 			 */
4103434Sesaxe 			if (pg->cmt_parent->cmt_children == NULL) {
4113434Sesaxe 				pg->cmt_parent->cmt_children =
4123434Sesaxe 				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
4133434Sesaxe 				group_create(pg->cmt_parent->cmt_children);
4143434Sesaxe 			}
4153434Sesaxe 			pg->cmt_siblings = pg->cmt_parent->cmt_children;
4163434Sesaxe 			children = ++pg->cmt_parent->cmt_nchildren;
4173434Sesaxe 		}
4183434Sesaxe 		pg->cmt_hint = 0;
4193434Sesaxe 		group_expand(pg->cmt_siblings, children);
4203434Sesaxe 	}
4213434Sesaxe 
4223434Sesaxe 	/*
4233434Sesaxe 	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
4243434Sesaxe 	 * for fast lookups later.
4253434Sesaxe 	 */
4263434Sesaxe 	if (cp->cpu_physid) {
4273434Sesaxe 		cp->cpu_physid->cpu_chipid =
4283434Sesaxe 		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
4293434Sesaxe 		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);
4303434Sesaxe 
4313434Sesaxe 		/*
4323434Sesaxe 		 * If this cpu has a PG representing shared cache, then set
4333434Sesaxe 		 * cpu_cacheid to that PG's logical id
4343434Sesaxe 		 */
4353434Sesaxe 		if (pg_cache)
4363434Sesaxe 			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
4373434Sesaxe 	}
4383434Sesaxe 
4393434Sesaxe 	/* CPU0 only initialization */
4403434Sesaxe 	if (is_cpu0) {
4413434Sesaxe 		pg_cmt_cpu_startup(cp);
4423434Sesaxe 		is_cpu0 = 0;
443*3676Sesaxe 		cpu0_lgrp = lgrp;
4443434Sesaxe 	}
4453434Sesaxe 
4463434Sesaxe }
4473434Sesaxe 
4483434Sesaxe /*
4493434Sesaxe  * Class callback when a CPU is leaving the system (deletion)
4503434Sesaxe  */
4513434Sesaxe static void
4523434Sesaxe pg_cmt_cpu_fini(cpu_t *cp)
4533434Sesaxe {
4543434Sesaxe 	group_iter_t	i;
4553434Sesaxe 	pg_cmt_t	*pg;
4563434Sesaxe 	group_t		*pgs, *cmt_pgs;
4573434Sesaxe 	lgrp_handle_t	lgrp_handle;
4583434Sesaxe 	cmt_lgrp_t	*lgrp;
4593434Sesaxe 
4603434Sesaxe 	pgs = &cp->cpu_pg->pgs;
4613434Sesaxe 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
4623434Sesaxe 
4633434Sesaxe 	/*
4643434Sesaxe 	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
4653434Sesaxe 	 */
4663434Sesaxe 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
467*3676Sesaxe 
4683434Sesaxe 	lgrp = pg_cmt_find_lgrp(lgrp_handle);
469*3676Sesaxe 	if (lgrp == NULL) {
470*3676Sesaxe 		/*
471*3676Sesaxe 		 * This is a bit of a special case.
472*3676Sesaxe 		 * The only way this can happen is if the CPU's lgrp
473*3676Sesaxe 		 * handle changed out from underneath us, which is what
474*3676Sesaxe 		 * happens with null_proc_lpa on starcat systems.
475*3676Sesaxe 		 *
476*3676Sesaxe 		 * Use the initial boot CPU lgrp, since this is what
477*3676Sesaxe 		 * we need to tear down.
478*3676Sesaxe 		 */
479*3676Sesaxe 		lgrp = cpu0_lgrp;
480*3676Sesaxe 	}
4813434Sesaxe 
4823434Sesaxe 	/*
4833434Sesaxe 	 * First, clean up anything load balancing specific for each of
4843434Sesaxe 	 * the CPU's PGs that participated in CMT load balancing
4853434Sesaxe 	 */
4863434Sesaxe 	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
4873434Sesaxe 	while (pg != NULL) {
4883434Sesaxe 
4893434Sesaxe 		/*
4903434Sesaxe 		 * Remove the PG from the CPU's load balancing lineage
4913434Sesaxe 		 */
4923434Sesaxe 		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);
4933434Sesaxe 
4943434Sesaxe 		/*
4953434Sesaxe 		 * If it's about to become empty, destroy it's children
4963434Sesaxe 		 * group, and remove it's reference from it's siblings.
4973434Sesaxe 		 * This is done here (rather than below) to avoid removing
4983434Sesaxe 		 * our reference from a PG that we just eliminated.
4993434Sesaxe 		 */
5003434Sesaxe 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
5013434Sesaxe 			if (pg->cmt_children != NULL)
5023434Sesaxe 				group_destroy(pg->cmt_children);
5033434Sesaxe 			if (pg->cmt_siblings != NULL) {
5043434Sesaxe 				if (pg->cmt_siblings == &lgrp->cl_pgs)
5053434Sesaxe 					lgrp->cl_npgs--;
5063434Sesaxe 				else
5073434Sesaxe 					pg->cmt_parent->cmt_nchildren--;
5083434Sesaxe 			}
5093434Sesaxe 		}
5103434Sesaxe 		pg = pg->cmt_parent;
5113434Sesaxe 	}
5123434Sesaxe 
5133434Sesaxe 	ASSERT(GROUP_SIZE(cmt_pgs) == 0);
5143434Sesaxe 
5153434Sesaxe 	/*
5163434Sesaxe 	 * Now that the load balancing lineage updates have happened,
5173434Sesaxe 	 * remove the CPU from all it's PGs (destroying any that become
5183434Sesaxe 	 * empty).
5193434Sesaxe 	 */
5203434Sesaxe 	group_iter_init(&i);
5213434Sesaxe 	while ((pg = group_iterate(pgs, &i)) != NULL) {
5223434Sesaxe 		if (IS_CMT_PG(pg) == 0)
5233434Sesaxe 			continue;
5243434Sesaxe 
5253434Sesaxe 		pg_cpu_delete((pg_t *)pg, cp);
5263434Sesaxe 		/*
5273434Sesaxe 		 * Deleting the CPU from the PG changes the CPU's
5283434Sesaxe 		 * PG group over which we are actively iterating
5293434Sesaxe 		 * Re-initialize the iteration
5303434Sesaxe 		 */
5313434Sesaxe 		group_iter_init(&i);
5323434Sesaxe 
5333434Sesaxe 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {
5343434Sesaxe 
5353434Sesaxe 			/*
5363434Sesaxe 			 * The PG has become zero sized, so destroy it.
5373434Sesaxe 			 */
5383434Sesaxe 			group_destroy(&pg->cmt_cpus_actv);
5393434Sesaxe 			bitset_fini(&pg->cmt_cpus_actv_set);
5403434Sesaxe 			pghw_fini((pghw_t *)pg);
5413434Sesaxe 
5423434Sesaxe 			pg_destroy((pg_t *)pg);
5433434Sesaxe 		}
5443434Sesaxe 	}
5453434Sesaxe }
5463434Sesaxe 
5473434Sesaxe /*
5483434Sesaxe  * Class callback when a CPU is entering a cpu partition
5493434Sesaxe  */
5503434Sesaxe static void
5513434Sesaxe pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
5523434Sesaxe {
5533434Sesaxe 	group_t		*pgs;
5543434Sesaxe 	pg_t		*pg;
5553434Sesaxe 	group_iter_t	i;
5563434Sesaxe 
5573434Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
5583434Sesaxe 
5593434Sesaxe 	pgs = &cp->cpu_pg->pgs;
5603434Sesaxe 
5613434Sesaxe 	/*
5623434Sesaxe 	 * Ensure that the new partition's PG bitset
5633434Sesaxe 	 * is large enough for all CMT PG's to which cp
5643434Sesaxe 	 * belongs
5653434Sesaxe 	 */
5663434Sesaxe 	group_iter_init(&i);
5673434Sesaxe 	while ((pg = group_iterate(pgs, &i)) != NULL) {
5683434Sesaxe 		if (IS_CMT_PG(pg) == 0)
5693434Sesaxe 			continue;
5703434Sesaxe 
5713434Sesaxe 		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
5723434Sesaxe 			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
5733434Sesaxe 	}
5743434Sesaxe }
5753434Sesaxe 
5763434Sesaxe /*
5773434Sesaxe  * Class callback when a CPU is actually moving partitions
5783434Sesaxe  */
5793434Sesaxe static void
5803434Sesaxe pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
5813434Sesaxe {
5823434Sesaxe 	cpu_t		*cpp;
5833434Sesaxe 	group_t		*pgs;
5843434Sesaxe 	pg_t		*pg;
5853434Sesaxe 	group_iter_t	pg_iter;
5863434Sesaxe 	pg_cpu_itr_t	cpu_iter;
5873434Sesaxe 	boolean_t	found;
5883434Sesaxe 
5893434Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
5903434Sesaxe 
5913434Sesaxe 	pgs = &cp->cpu_pg->pgs;
5923434Sesaxe 	group_iter_init(&pg_iter);
5933434Sesaxe 
5943434Sesaxe 	/*
5953434Sesaxe 	 * Iterate over the CPUs CMT PGs
5963434Sesaxe 	 */
5973434Sesaxe 	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {
5983434Sesaxe 
5993434Sesaxe 		if (IS_CMT_PG(pg) == 0)
6003434Sesaxe 			continue;
6013434Sesaxe 
6023434Sesaxe 		/*
6033434Sesaxe 		 * Add the PG to the bitset in the new partition.
6043434Sesaxe 		 */
6053434Sesaxe 		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);
6063434Sesaxe 
6073434Sesaxe 		/*
6083434Sesaxe 		 * Remove the PG from the bitset in the old partition
6093434Sesaxe 		 * if the last of the PG's CPUs have left.
6103434Sesaxe 		 */
6113434Sesaxe 		found = B_FALSE;
6123434Sesaxe 		PG_CPU_ITR_INIT(pg, cpu_iter);
6133434Sesaxe 		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
6143434Sesaxe 			if (cpp == cp)
6153434Sesaxe 				continue;
616*3676Sesaxe 			if (CPU_ACTIVE(cpp) &&
617*3676Sesaxe 			    cpp->cpu_part->cp_id == oldpp->cp_id) {
6183434Sesaxe 				found = B_TRUE;
6193434Sesaxe 				break;
6203434Sesaxe 			}
6213434Sesaxe 		}
6223434Sesaxe 		if (!found)
6233434Sesaxe 			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
6243434Sesaxe 	}
6253434Sesaxe }
6263434Sesaxe 
6273434Sesaxe /*
6283434Sesaxe  * Class callback when a CPU becomes active (online)
6293434Sesaxe  *
6303434Sesaxe  * This is called in a context where CPUs are paused
6313434Sesaxe  */
6323434Sesaxe static void
6333434Sesaxe pg_cmt_cpu_active(cpu_t *cp)
6343434Sesaxe {
6353434Sesaxe 	int		err;
6363434Sesaxe 	group_iter_t	i;
6373434Sesaxe 	pg_cmt_t	*pg;
6383434Sesaxe 	group_t		*pgs;
6393434Sesaxe 
6403434Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
6413434Sesaxe 
6423434Sesaxe 	pgs = &cp->cpu_pg->pgs;
6433434Sesaxe 	group_iter_init(&i);
6443434Sesaxe 
6453434Sesaxe 	/*
6463434Sesaxe 	 * Iterate over the CPU's PGs
6473434Sesaxe 	 */
6483434Sesaxe 	while ((pg = group_iterate(pgs, &i)) != NULL) {
6493434Sesaxe 
6503434Sesaxe 		if (IS_CMT_PG(pg) == 0)
6513434Sesaxe 			continue;
6523434Sesaxe 
6533434Sesaxe 		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
6543434Sesaxe 		ASSERT(err == 0);
6553434Sesaxe 
6563434Sesaxe 		/*
6573434Sesaxe 		 * If this is the first active CPU in the PG, and it
6583434Sesaxe 		 * represents a hardware sharing relationship over which
6593434Sesaxe 		 * CMT load balancing is performed, add it as a candidate
6603434Sesaxe 		 * for balancing with it's siblings.
6613434Sesaxe 		 */
6623434Sesaxe 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
6633434Sesaxe 		    pg_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
6643434Sesaxe 			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
6653434Sesaxe 			ASSERT(err == 0);
6663434Sesaxe 		}
6673434Sesaxe 
6683434Sesaxe 		/*
6693434Sesaxe 		 * Notate the CPU in the PGs active CPU bitset.
6703434Sesaxe 		 * Also notate the PG as being active in it's associated
6713434Sesaxe 		 * partition
6723434Sesaxe 		 */
6733434Sesaxe 		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
6743434Sesaxe 		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
6753434Sesaxe 	}
6763434Sesaxe }
6773434Sesaxe 
6783434Sesaxe /*
6793434Sesaxe  * Class callback when a CPU goes inactive (offline)
6803434Sesaxe  *
6813434Sesaxe  * This is called in a context where CPUs are paused
6823434Sesaxe  */
6833434Sesaxe static void
6843434Sesaxe pg_cmt_cpu_inactive(cpu_t *cp)
6853434Sesaxe {
6863434Sesaxe 	int		err;
6873434Sesaxe 	group_t		*pgs;
6883434Sesaxe 	pg_cmt_t	*pg;
6893434Sesaxe 	cpu_t		*cpp;
6903434Sesaxe 	group_iter_t	i;
6913434Sesaxe 	pg_cpu_itr_t	cpu_itr;
6923434Sesaxe 	boolean_t	found;
6933434Sesaxe 
6943434Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
6953434Sesaxe 
6963434Sesaxe 	pgs = &cp->cpu_pg->pgs;
6973434Sesaxe 	group_iter_init(&i);
6983434Sesaxe 
6993434Sesaxe 	while ((pg = group_iterate(pgs, &i)) != NULL) {
7003434Sesaxe 
7013434Sesaxe 		if (IS_CMT_PG(pg) == 0)
7023434Sesaxe 			continue;
7033434Sesaxe 
7043434Sesaxe 		/*
7053434Sesaxe 		 * Remove the CPU from the CMT PGs active CPU group
7063434Sesaxe 		 * bitmap
7073434Sesaxe 		 */
7083434Sesaxe 		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
7093434Sesaxe 		ASSERT(err == 0);
7103434Sesaxe 
7113434Sesaxe 		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
7123434Sesaxe 
7133434Sesaxe 		/*
7143434Sesaxe 		 * If there are no more active CPUs in this PG over which
7153434Sesaxe 		 * load was balanced, remove it as a balancing candidate.
7163434Sesaxe 		 */
7173434Sesaxe 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
7183434Sesaxe 		    pg_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
7193434Sesaxe 			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
7203434Sesaxe 			ASSERT(err == 0);
7213434Sesaxe 		}
7223434Sesaxe 
7233434Sesaxe 		/*
7243434Sesaxe 		 * Assert the number of active CPUs does not exceed
7253434Sesaxe 		 * the total number of CPUs in the PG
7263434Sesaxe 		 */
7273434Sesaxe 		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
7283434Sesaxe 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
7293434Sesaxe 
7303434Sesaxe 		/*
7313434Sesaxe 		 * Update the PG bitset in the CPU's old partition
7323434Sesaxe 		 */
7333434Sesaxe 		found = B_FALSE;
7343434Sesaxe 		PG_CPU_ITR_INIT(pg, cpu_itr);
7353434Sesaxe 		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
7363434Sesaxe 			if (cpp == cp)
7373434Sesaxe 				continue;
738*3676Sesaxe 			if (CPU_ACTIVE(cpp) &&
739*3676Sesaxe 			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
7403434Sesaxe 				found = B_TRUE;
7413434Sesaxe 				break;
7423434Sesaxe 			}
7433434Sesaxe 		}
7443434Sesaxe 		if (!found) {
7453434Sesaxe 			bitset_del(&cp->cpu_part->cp_cmt_pgs,
7463434Sesaxe 			    ((pg_t *)pg)->pg_id);
7473434Sesaxe 		}
7483434Sesaxe 	}
7493434Sesaxe }
7503434Sesaxe 
7513434Sesaxe /*
7523434Sesaxe  * Return non-zero if the CPU belongs in the given PG
7533434Sesaxe  */
7543434Sesaxe static int
7553434Sesaxe pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
7563434Sesaxe {
7573434Sesaxe 	cpu_t	*pg_cpu;
7583434Sesaxe 
7593434Sesaxe 	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
7603434Sesaxe 
7613434Sesaxe 	ASSERT(pg_cpu != NULL);
7623434Sesaxe 
7633434Sesaxe 	/*
7643434Sesaxe 	 * The CPU belongs if, given the nature of the hardware sharing
7653434Sesaxe 	 * relationship represented by the PG, the CPU has that
7663434Sesaxe 	 * relationship with some other CPU already in the PG
7673434Sesaxe 	 */
7683434Sesaxe 	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
7693434Sesaxe 		return (1);
7703434Sesaxe 
7713434Sesaxe 	return (0);
7723434Sesaxe }
7733434Sesaxe 
7743434Sesaxe /*
7753434Sesaxe  * Pack the CPUs CMT hierarchy
7763434Sesaxe  * The hierarchy order is preserved
7773434Sesaxe  */
7783434Sesaxe static void
7793434Sesaxe pg_cmt_hier_pack(pg_cmt_t *hier[], int sz)
7803434Sesaxe {
7813434Sesaxe 	int	i, j;
7823434Sesaxe 
7833434Sesaxe 	for (i = 0; i < sz; i++) {
7843434Sesaxe 		if (hier[i] != NULL)
7853434Sesaxe 			continue;
7863434Sesaxe 
7873434Sesaxe 		for (j = i; j < sz; j++) {
7883434Sesaxe 			if (hier[j] != NULL) {
7893434Sesaxe 				hier[i] = hier[j];
7903434Sesaxe 				hier[j] = NULL;
7913434Sesaxe 				break;
7923434Sesaxe 			}
7933434Sesaxe 		}
7943434Sesaxe 		if (j == sz)
7953434Sesaxe 			break;
7963434Sesaxe 	}
7973434Sesaxe }
7983434Sesaxe 
7993434Sesaxe /*
8003434Sesaxe  * Return a cmt_lgrp_t * given an lgroup handle.
8013434Sesaxe  */
8023434Sesaxe static cmt_lgrp_t *
8033434Sesaxe pg_cmt_find_lgrp(lgrp_handle_t hand)
8043434Sesaxe {
8053434Sesaxe 	cmt_lgrp_t	*lgrp;
8063434Sesaxe 
8073434Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
8083434Sesaxe 
8093434Sesaxe 	lgrp = cmt_lgrps;
8103434Sesaxe 	while (lgrp != NULL) {
8113434Sesaxe 		if (lgrp->cl_hand == hand)
812*3676Sesaxe 			break;
8133434Sesaxe 		lgrp = lgrp->cl_next;
8143434Sesaxe 	}
815*3676Sesaxe 	return (lgrp);
816*3676Sesaxe }
8173434Sesaxe 
818*3676Sesaxe /*
819*3676Sesaxe  * Create a cmt_lgrp_t with the specified handle.
820*3676Sesaxe  */
821*3676Sesaxe static cmt_lgrp_t *
822*3676Sesaxe pg_cmt_lgrp_create(lgrp_handle_t hand)
823*3676Sesaxe {
824*3676Sesaxe 	cmt_lgrp_t	*lgrp;
825*3676Sesaxe 
826*3676Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
827*3676Sesaxe 
8283434Sesaxe 	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
8293434Sesaxe 
8303434Sesaxe 	lgrp->cl_hand = hand;
8313434Sesaxe 	lgrp->cl_npgs = 0;
8323434Sesaxe 	lgrp->cl_next = cmt_lgrps;
8333434Sesaxe 	cmt_lgrps = lgrp;
8343434Sesaxe 	group_create(&lgrp->cl_pgs);
8353434Sesaxe 
8363434Sesaxe 	return (lgrp);
8373434Sesaxe }
838