13434Sesaxe /* 23434Sesaxe * CDDL HEADER START 33434Sesaxe * 43434Sesaxe * The contents of this file are subject to the terms of the 53434Sesaxe * Common Development and Distribution License (the "License"). 63434Sesaxe * You may not use this file except in compliance with the License. 73434Sesaxe * 83434Sesaxe * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 93434Sesaxe * or http://www.opensolaris.org/os/licensing. 103434Sesaxe * See the License for the specific language governing permissions 113434Sesaxe * and limitations under the License. 123434Sesaxe * 133434Sesaxe * When distributing Covered Code, include this CDDL HEADER in each 143434Sesaxe * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 153434Sesaxe * If applicable, add the following below this CDDL HEADER, with the 163434Sesaxe * fields enclosed by brackets "[]" replaced with your own identifying 173434Sesaxe * information: Portions Copyright [yyyy] [name of copyright owner] 183434Sesaxe * 193434Sesaxe * CDDL HEADER END 203434Sesaxe */ 213434Sesaxe /* 223434Sesaxe * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 233434Sesaxe * Use is subject to license terms. 
243434Sesaxe */ 253434Sesaxe 263434Sesaxe #pragma ident "%Z%%M% %I% %E% SMI" 273434Sesaxe 283434Sesaxe #include <sys/systm.h> 293434Sesaxe #include <sys/types.h> 303434Sesaxe #include <sys/param.h> 313434Sesaxe #include <sys/thread.h> 323434Sesaxe #include <sys/cpuvar.h> 333434Sesaxe #include <sys/cpupart.h> 343434Sesaxe #include <sys/kmem.h> 353434Sesaxe #include <sys/cmn_err.h> 363434Sesaxe #include <sys/kstat.h> 373434Sesaxe #include <sys/processor.h> 383434Sesaxe #include <sys/disp.h> 393434Sesaxe #include <sys/group.h> 403434Sesaxe #include <sys/pghw.h> 413434Sesaxe #include <sys/bitset.h> 423434Sesaxe #include <sys/lgrp.h> 433434Sesaxe #include <sys/cmt.h> 443434Sesaxe 453434Sesaxe /* 463434Sesaxe * CMT scheduler / dispatcher support 473434Sesaxe * 483434Sesaxe * This file implements CMT scheduler support using Processor Groups. 493434Sesaxe * The CMT processor group class creates and maintains the CMT class 503434Sesaxe * specific processor group pg_cmt_t. 513434Sesaxe * 523434Sesaxe * ---------------------------- <-- pg_cmt_t * 533434Sesaxe * | pghw_t | 543434Sesaxe * ---------------------------- 553434Sesaxe * | CMT class specific data | 563434Sesaxe * | - hierarchy linkage | 573434Sesaxe * | - CMT load balancing data| 583434Sesaxe * | - active CPU group/bitset| 593434Sesaxe * ---------------------------- 603434Sesaxe * 613434Sesaxe * The scheduler/dispatcher leverages knowledge of the performance 623434Sesaxe * relevant CMT sharing relationships existing between cpus to implement 633434Sesaxe * optimized affinity and load balancing policies. 643434Sesaxe * 653434Sesaxe * Load balancing policy seeks to improve performance by minimizing 663434Sesaxe * contention over shared processor resources / facilities, while the 673434Sesaxe * affinity policies seek to improve cache and TLB utilization. 683434Sesaxe * 693434Sesaxe * The CMT PGs created by this class are already arranged into a 703434Sesaxe * hierarchy (which is done in the pghw layer). 
To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On NUMA systems, the CMT load balancing algorithm balances across the
 * CMT PGs within their respective lgroups. On UMA based system, there
 * exists a top level group of PGs to balance across. On NUMA systems multiple
 * top level groups are instantiated, where the top level balancing begins by
 * balancing across the CMT PGs within their respective (per lgroup) top level
 * groups.
 */

/*
 * Per-lgroup bookkeeping: the set of top level CMT PGs that belong to
 * one locality group. These are linked into the global cmt_lgrps list.
 */
typedef struct cmt_lgrp {
	group_t		cl_pgs;		/* Top level group of active CMT PGs */
	int		cl_npgs;	/* # of top level PGs in the lgroup */
	lgrp_handle_t	cl_hand;	/* lgroup's platform handle */
	struct cmt_lgrp *cl_next;	/* next cmt_lgrp */
} cmt_lgrp_t;

static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */

static int		is_cpu0 = 1;	/* true if this is boot CPU context */

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 */
static int		cmt_sched_disabled = 0;

static pg_cid_t		pg_cmt_class_id;	/* PG class id */

/* CMT PG class callbacks (see pg_ops_cmt below) */
static pg_t		*pg_cmt_alloc();
static void		pg_cmt_free(pg_t *);
static void		pg_cmt_cpu_init(cpu_t *);
static void		pg_cmt_cpu_fini(cpu_t *);
static void		pg_cmt_cpu_active(cpu_t *);
static void		pg_cmt_cpu_inactive(cpu_t *);
static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static void		pg_cmt_hier_pack(pg_cmt_t **, int);
static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int		pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

/*
 * CMT PG ops
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,
	pg_cmt_free,
	pg_cmt_cpu_init,
	pg_cmt_cpu_fini,
	pg_cmt_cpu_active,
	pg_cmt_cpu_inactive,
	pg_cmt_cpupart_in,
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,
	pg_cmt_cpu_belongs,
};

/*
 * Initialize the CMT PG class
 */
void
pg_cmt_class_init(void)
{
	if (cmt_sched_disabled)
		return;

	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}

/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
	PG_NRUN_UPDATE(cp, 1);
}

/*
 * Adjust the CMT load in the CMT PGs in which the CPU belongs
 * Note that "n" can be positive in the case of increasing
 * load, or negative in the case of decreasing load.
 *
 * The adjustment is applied to every PG in the CPU's lineage,
 * walking from the leaf up through each cmt_parent link.
 */
void
pg_cmt_load(cpu_t *cp, int n)
{
	pg_cmt_t	*pg;

	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
	while (pg != NULL) {
		ASSERT(IS_CMT_PG(pg));
		atomic_add_32(&pg->cmt_nrunning, n);
		pg = pg->cmt_parent;
	}
}

/*
 * Return non-zero if thread can migrate between "from" and "to"
 * without a performance penalty
 */
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
	if (from->cpu_physid->cpu_cacheid ==
	    to->cpu_physid->cpu_cacheid)
		return (1);
	return (0);
}

/*
 * CMT class specific PG allocation
 *
 * NOTE(review): KM_NOSLEEP means this can return NULL under memory
 * pressure — presumably the pg framework caller tolerates that; confirm.
 */
static pg_t *
pg_cmt_alloc(void)
{
	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}

/*
 * Class specific PG de-allocation
 */
static void
pg_cmt_free(pg_t *pg)
{
	ASSERT(pg != NULL);
	ASSERT(IS_CMT_PG(pg));

	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
}

/*
 * Return 1 if CMT load balancing policies should be
 * implemented across instances of the specified hardware
 * sharing
relationship.
 */
static int
pg_cmt_load_bal_hw(pghw_type_t hw)
{
	if (hw == PGHW_IPIPE ||
	    hw == PGHW_FPU ||
	    hw == PGHW_CHIP)
		return (1);
	else
		return (0);
}

/*
 * Return 1 if thread affinity policies should be implemented
 * for instances of the specified hardware sharing relationship.
 */
static int
pg_cmt_affinity_hw(pghw_type_t hw)
{
	if (hw == PGHW_CACHE)
		return (1);
	else
		return (0);
}

/*
 * Return 1 if CMT scheduling policies should be implemented
 * for the specified hardware sharing relationship.
 */
static int
pg_cmt_hw(pghw_type_t hw)
{
	return (pg_cmt_load_bal_hw(hw) ||
	    pg_cmt_affinity_hw(hw));
}

/*
 * CMT class callback for a new CPU entering the system
 *
 * Places the CPU in (or creates) the CMT PG for each relevant hardware
 * sharing relationship, builds the CPU's load balancing lineage, and
 * ties newly created PGs into the parent/sibling hierarchy.
 * Caller must hold cpu_lock.
 */
static void
pg_cmt_cpu_init(cpu_t *cp)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		level, max_level, nlevels;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance relevant CMT sharing
	 * relationships
	 */
	cmt_pgs = &cp->cpu_pg->cmt_pgs;
	cp->cpu_pg->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	max_level = nlevels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		/*
		 * We're only interested in CMT hw sharing relationships
		 */
		if (pg_cmt_hw(hw) == 0 || pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp);

		/*
		 * Ensure capacity of the active CPUs group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing
		 */
		if (pg_cmt_load_bal_hw(hw)) {
			level = pghw_level(hw);
			cpu_cmt_hier[level] = pg;
			if (level > max_level)
				max_level = level;
			nlevels++;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	/*
	 * Pack out any gaps in the constructed lineage.
	 * Gaps may exist where the architecture knows
	 * about a hardware sharing relationship, but such a
	 * relationship either isn't relevant for load
	 * balancing or doesn't exist between CPUs on the system.
	 */
	pg_cmt_hier_pack(cpu_cmt_hier, max_level + 1);

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	group_expand(cmt_pgs, nlevels);

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (lgrp == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	for (level = 0; level < nlevels; level++) {
		uint_t		children;
		int		err;

		pg = cpu_cmt_hier[level];
		/* cmt_pgs is ordered top (index 0) down to the leaf */
		err = group_add_at(cmt_pgs, pg, nlevels - level - 1);
		ASSERT(err == 0);

		if (level == 0)
			cp->cpu_pg->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    pg->cmt_siblings == pg->cmt_parent->cmt_children);
			continue;
		}

		if ((level + 1) == nlevels) {
			/* Topmost PG: its siblings are the lgroup's PGs */
			pg->cmt_parent = NULL;
			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}
		pg->cmt_hint = 0;
		group_expand(pg->cmt_siblings, children);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		pg_cmt_cpu_startup(cp);
		is_cpu0 = 0;
		/* Remember boot CPU's lgrp for teardown (null_proc_lpa) */
		cpu0_lgrp = lgrp;
	}

}

/*
 * Class callback when a CPU is leaving the system (deletion)
 */
static void
pg_cmt_cpu_fini(cpu_t *cp)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	pgs = &cp->cpu_pg->pgs;
	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle =
lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (lgrp == NULL) {
		/*
		 * This is a bit of a special case.
		 * The only way this can happen is if the CPU's lgrp
		 * handle changed out from underneath us, which is what
		 * happens with null_proc_lpa on starcat systems.
		 *
		 * Use the initial boot CPU lgrp, since this is what
		 * we need to tear down.
		 */
		lgrp = cpu0_lgrp;
	}

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
	while (pg != NULL) {

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}

	/* The lineage must be fully drained before PGs are destroyed */
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating
		 * Re-initialize the iteration
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}

/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;

	/*
	 * Ensure that the new partition's PG bitset
	 * is large enough for all CMT PG's to which cp
	 * belongs
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
	}
}

/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t		*cpp;
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	pg_iter;
	pg_cpu_itr_t	cpu_iter;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPUs CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		/*
		 * NOTE(review): this deletes from cp->cpu_part's bitset
		 * rather than oldpp's — presumably cp still belongs to
		 * oldpp at this callback point; confirm with the caller.
		 */
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with its siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    pg_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);
		}

		/*
		 * Notate the CPU in the PGs active CPU bitset.
		 * Also notate the PG as being active in its associated
		 * partition
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Remove the CPU from the CMT PGs active CPU group
		 * bitmap
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    pg_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}

/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
	cpu_t	*pg_cpu;

	/* Compare against any existing member; index 0 always exists here */
	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

	ASSERT(pg_cpu != NULL);

	/*
	 * The CPU belongs if, given the nature of the hardware sharing
	 * relationship represented by the PG, the CPU has that
	 * relationship with some other CPU already in the PG
	 */
	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
		return (1);

	return (0);
}

/*
 *
Pack the CPUs CMT hierarchy 7763434Sesaxe * The hierarchy order is preserved 7773434Sesaxe */ 7783434Sesaxe static void 7793434Sesaxe pg_cmt_hier_pack(pg_cmt_t *hier[], int sz) 7803434Sesaxe { 7813434Sesaxe int i, j; 7823434Sesaxe 7833434Sesaxe for (i = 0; i < sz; i++) { 7843434Sesaxe if (hier[i] != NULL) 7853434Sesaxe continue; 7863434Sesaxe 7873434Sesaxe for (j = i; j < sz; j++) { 7883434Sesaxe if (hier[j] != NULL) { 7893434Sesaxe hier[i] = hier[j]; 7903434Sesaxe hier[j] = NULL; 7913434Sesaxe break; 7923434Sesaxe } 7933434Sesaxe } 7943434Sesaxe if (j == sz) 7953434Sesaxe break; 7963434Sesaxe } 7973434Sesaxe } 7983434Sesaxe 7993434Sesaxe /* 8003434Sesaxe * Return a cmt_lgrp_t * given an lgroup handle. 8013434Sesaxe */ 8023434Sesaxe static cmt_lgrp_t * 8033434Sesaxe pg_cmt_find_lgrp(lgrp_handle_t hand) 8043434Sesaxe { 8053434Sesaxe cmt_lgrp_t *lgrp; 8063434Sesaxe 8073434Sesaxe ASSERT(MUTEX_HELD(&cpu_lock)); 8083434Sesaxe 8093434Sesaxe lgrp = cmt_lgrps; 8103434Sesaxe while (lgrp != NULL) { 8113434Sesaxe if (lgrp->cl_hand == hand) 812*3676Sesaxe break; 8133434Sesaxe lgrp = lgrp->cl_next; 8143434Sesaxe } 815*3676Sesaxe return (lgrp); 816*3676Sesaxe } 8173434Sesaxe 818*3676Sesaxe /* 819*3676Sesaxe * Create a cmt_lgrp_t with the specified handle. 820*3676Sesaxe */ 821*3676Sesaxe static cmt_lgrp_t * 822*3676Sesaxe pg_cmt_lgrp_create(lgrp_handle_t hand) 823*3676Sesaxe { 824*3676Sesaxe cmt_lgrp_t *lgrp; 825*3676Sesaxe 826*3676Sesaxe ASSERT(MUTEX_HELD(&cpu_lock)); 827*3676Sesaxe 8283434Sesaxe lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP); 8293434Sesaxe 8303434Sesaxe lgrp->cl_hand = hand; 8313434Sesaxe lgrp->cl_npgs = 0; 8323434Sesaxe lgrp->cl_next = cmt_lgrps; 8333434Sesaxe cmt_lgrps = lgrp; 8343434Sesaxe group_create(&lgrp->cl_pgs); 8353434Sesaxe 8363434Sesaxe return (lgrp); 8373434Sesaxe } 838