/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Basic NUMA support in terms of locality groups
 *
 * Solaris needs to know which CPUs, memory, etc. are near each other to
 * provide good performance on NUMA machines by optimizing for locality.
 * In order to do this, a new abstraction called a "locality group (lgroup)"
 * has been introduced to keep track of which CPU-like and memory-like hardware
 * resources are close to each other.  Currently, latency is the only measure
 * used to determine how to group hardware resources into lgroups, but this
 * does not limit the groupings to be based solely on latency.  Other factors
 * may be used to determine the groupings in the future.
 *
 * Lgroups are organized into a hierarchy or topology that represents the
 * latency topology of the machine.  There is always at least a root lgroup in
 * the system.  It represents all the hardware resources in the machine at a
 * latency big enough that any hardware resource can at least access any other
 * hardware resource within that latency.  A Uniform Memory Access (UMA)
 * machine is represented with one lgroup (the root).  In contrast, a NUMA
 * machine is represented at least by the root lgroup and some number of leaf
 * lgroups where the leaf lgroups contain the hardware resources within the
 * least latency of each other and the root lgroup still contains all the
 * resources in the machine.  Some number of intermediate lgroups may exist
 * which represent more levels of locality than just the local latency of the
 * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
 * (eg. root and intermediate lgroups) contain the next nearest resources to
 * their children lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
 * to the root lgroup shows the hardware resources from closest to farthest
 * from the leaf lgroup such that each successive ancestor lgroup contains
 * the next nearest resources at the next level of locality from the previous.
 *
 * The kernel uses the lgroup abstraction to know how to allocate resources
 * near a given process/thread.  At fork() and lwp/thread_create() time, a
 * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
 * with the lowest load average.  Binding to a processor or processor set will
 * change the home lgroup for a thread.  The scheduler has been modified to try
 * to dispatch a thread on a CPU in its home lgroup.  Physical memory
 * allocation is lgroup aware too, so memory will be allocated from the current
 * thread's home lgroup if possible.  If the desired resources are not
 * available, the kernel traverses the lgroup hierarchy going to the parent
 * lgroup to find resources at the next level of locality until it reaches the
 * root lgroup.
 */
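/*
 * For illustration only (a hypothetical machine, not anything produced
 * by this code): a two-socket NUMA box might yield the topology
 *
 *			 root lgroup
 *		  (all CPUs and all memory)
 *		  /			  \
 *	leaf lgroup 1			leaf lgroup 2
 *	(socket 0 CPUs			(socket 1 CPUs
 *	 and memory)			 and memory)
 *
 * with intermediate lgroups appearing between the leaves and the root
 * on machines that have more than two levels of locality.
 */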

#include <sys/lgrp.h>
#include <sys/lgrp_user.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/var.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/as.h>
#include <sys/atomic.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/sysmacros.h>
#include <sys/pg.h>
#include <sys/promif.h>
#include <sys/sdt.h>

lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX];	/* table of all initialized lgrp_t structs */
				/* indexed by lgrp_id */
int	nlgrps;			/* number of lgroups in machine */
int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */

/*
 * Kstat data for lgroups.
 *
 * Actual kstat data is collected in lgrp_stats array.
 * The lgrp_kstat_data array of named kstats is used to extract data from
 * lgrp_stats and present it to kstat framework.
 * It is protected from parallel modifications by lgrp_kstat_mutex.  This may
 * cause some contention when several kstat commands run in parallel but this
 * is not the performance-critical path.
 */
extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */

/*
 * Declare kstat names statically for enums as defined in the header file.
 */
LGRP_KSTAT_NAMES;

static void	lgrp_kstat_init(void);
static int	lgrp_kstat_extract(kstat_t *, int);
static void	lgrp_kstat_reset(lgrp_id_t);

static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
static kmutex_t lgrp_kstat_mutex;


/*
 * max number of lgroups supported by the platform
 */
int	nlgrpsmax = 0;

/*
 * The root lgroup. Represents the set of resources at the system wide
 * level of locality.
 */
lgrp_t		*lgrp_root = NULL;

/*
 * During system bootstrap cp_default does not contain the list of lgrp load
 * averages (cp_lgrploads). The list is allocated after the first CPU is brought
 * on-line when cp_default is initialized by cpupart_initialize_default().
 * Configuring CPU0 may create a two-level topology with root and one leaf node
 * containing CPU0. This topology is initially constructed in a special
 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
 * for all lpl operations until cp_default is fully constructed.
 *
 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
 * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
 * the first element of lpl_bootstrap_list.
 *
 * CPUs that are added to the system, but have not yet been assigned to an
 * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
 * on some architectures (x86) it's possible for the slave CPU startup thread
 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
 */
#define	LPL_BOOTSTRAP_SIZE 2
static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
lpl_t		*lpl_bootstrap;

/*
 * If cp still references the bootstrap lpl, it has not yet been added to
 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
 * a thread is trying to allocate memory close to a CPU that has no lgrp.
 */
#define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)

static lgrp_t	lroot;

/*
 * Size, in bytes, beyond which random memory allocation policy is applied
 * to non-shared memory.  Default is the maximum size, so random memory
 * allocation won't be used for non-shared memory by default.
 */
size_t	lgrp_privm_random_thresh = (size_t)(-1);

/* the maximum effect that a single thread can have on its lgroup's load */
#define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
	((lgrp_loadavg_max_effect) / (ncpu))
uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
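/*
 * Illustrative arithmetic (not part of the original source): with
 * lgrp_loadavg_max_effect at its default of LGRP_LOADAVG_THREAD_MAX and
 * an lgroup owning 4 CPUs, LGRP_LOADAVG_MAX_EFFECT(4) caps a single
 * thread's contribution at LGRP_LOADAVG_THREAD_MAX / 4, i.e. each
 * additional CPU in an lgroup dilutes the load any one thread can add.
 */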

/*
 * Size, in bytes, beyond which random memory allocation policy is applied to
 * shared memory.  Default is 8MB (2 ISM pages).
 */
size_t	lgrp_shm_random_thresh = 8*1024*1024;

/*
 * Whether to do processor set aware memory allocation by default
 */
int	lgrp_mem_pset_aware = 0;

/*
 * Set the default memory allocation policy for root lgroup
 */
lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;

/*
 * Set the default memory allocation policy.  For most platforms,
 * next touch is sufficient, but some platforms may wish to override
 * this.
 */
lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;


/*
 * lgroup CPU event handlers
 */
static void	lgrp_cpu_init(struct cpu *);
static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);

/*
 * lgroup memory event handlers
 */
static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);

/*
 * lgroup CPU partition event handlers
 */
static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
static void	lgrp_part_del_cpu(struct cpu *);

static void	lgrp_root_init(void);

/*
 * lpl topology
 */
static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
static void	lpl_clear(lpl_t *);
static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
static void	lpl_rset_add(lpl_t *, lpl_t *);
static void	lpl_rset_del(lpl_t *, lpl_t *);
static int	lpl_rset_contains(lpl_t *, lpl_t *);
static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
static void	lpl_child_update(lpl_t *, struct cpupart *);
static int	lpl_pick(lpl_t *, lpl_t *);
static void	lpl_verify_wrapper(struct cpupart *);

/*
 * defines for lpl topology verifier return codes
 */

#define	LPL_TOPO_CORRECT		0
#define	LPL_TOPO_PART_HAS_NO_LPL	-1
#define	LPL_TOPO_CPUS_NOT_EMPTY		-2
#define	LPL_TOPO_LGRP_MISMATCH		-3
#define	LPL_TOPO_MISSING_PARENT		-4
#define	LPL_TOPO_PARENT_MISMATCH	-5
#define	LPL_TOPO_BAD_CPUCNT		-6
#define	LPL_TOPO_RSET_MISMATCH		-7
#define	LPL_TOPO_LPL_ORPHANED		-8
#define	LPL_TOPO_LPL_BAD_NCPU		-9
#define	LPL_TOPO_RSET_MSSNG_LF		-10
#define	LPL_TOPO_CPU_HAS_BAD_LPL	-11
#define	LPL_TOPO_BOGUS_HINT		-12
#define	LPL_TOPO_NONLEAF_HAS_CPUS	-13
#define	LPL_TOPO_LGRP_NOT_LEAF		-14
#define	LPL_TOPO_BAD_RSETCNT		-15
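/*
 * These codes are returned by the lpl topology verifier; the
 * LGRP_CONFIG_CPU_* and LGRP_CONFIG_CPUPART_* cases in lgrp_config()
 * below panic if lpl_topo_verify() reports anything other than
 * LPL_TOPO_CORRECT.
 */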

/*
 * Return whether lgroup optimizations should be enabled on this system
 */
int
lgrp_optimizations(void)
{
	/*
	 * System must have more than 2 lgroups to enable lgroup optimizations
	 *
	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
	 * with one child lgroup containing all the resources. A 2 lgroup
	 * system with a root lgroup directly containing CPUs or memory might
	 * need lgroup optimizations with its child lgroup, but there
	 * isn't such a machine for now....
	 */
	if (nlgrps > 2)
		return (1);

	return (0);
}

/*
 * Build full lgroup topology
 */
static void
lgrp_root_init(void)
{
	lgrp_handle_t	hand;
	int		i;
	lgrp_id_t	id;

	/*
	 * Create the "root" lgroup
	 */
	ASSERT(nlgrps == 0);
	id = nlgrps++;

	lgrp_root = &lroot;

	lgrp_root->lgrp_cpu = NULL;
	lgrp_root->lgrp_mnodes = 0;
	lgrp_root->lgrp_nmnodes = 0;
	hand = lgrp_plat_root_hand();
	lgrp_root->lgrp_plathand = hand;

	lgrp_root->lgrp_id = id;
	lgrp_root->lgrp_cpucnt = 0;
	lgrp_root->lgrp_childcnt = 0;
	klgrpset_clear(lgrp_root->lgrp_children);
	klgrpset_clear(lgrp_root->lgrp_leaves);
	lgrp_root->lgrp_parent = NULL;
	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);

	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(lgrp_root->lgrp_set[i]);

	lgrp_root->lgrp_kstat = NULL;

	lgrp_table[id] = lgrp_root;

	/*
	 * Setup initial lpl list for CPU0 and initial t0 home.
	 * The only lpl space we have so far is lpl_bootstrap. It is used for
	 * all topology operations until cp_default is initialized at which
	 * point t0.t_lpl will be updated.
	 */
	lpl_bootstrap = lpl_bootstrap_list;
	t0.t_lpl = lpl_bootstrap;
	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
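	/*
	 * Pre-set the lgroup ID of the second bootstrap lpl to 1, the ID
	 * the leaf lgroup will get if configuring CPU0 produces the
	 * two-level (root plus one leaf) bootstrap topology described in
	 * the lpl_bootstrap_list comment above.
	 */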
	lpl_bootstrap_list[1].lpl_lgrpid = 1;
	cp_default.cp_lgrploads = lpl_bootstrap;
}

/*
 * Initialize the lgroup framework and allow the platform to do the same
 */
void
lgrp_init(void)
{
	/*
	 * Initialize the platform
	 */
	lgrp_plat_init();

	/*
	 * Set max number of lgroups supported on this platform which must be
	 * less than the max number of lgroups supported by the common lgroup
	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.)
	 */
	nlgrpsmax = lgrp_plat_max_lgrps();
	ASSERT(nlgrpsmax <= NLGRPS_MAX);
}

/*
 * Create the root and cpu0's lgroup, and set t0's home.
 */
void
lgrp_setup(void)
{
	/*
	 * Setup the root lgroup
	 */
	lgrp_root_init();

	/*
	 * Add cpu0 to an lgroup
	 */
	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
}

/*
 * Lgroup initialization is split in two parts. The first part
 * (lgrp_main_init()) is called right before start_other_cpus() in main. The
 * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
 * when all CPUs are brought online and all distance information is available.
 *
 * When lgrp_main_init() is complete it sets lgrp_initialized. The
 * lgrp_main_mp_init() sets lgrp_topo_initialized.
 */

/*
 * true when lgrp initialization has been completed.
 */
int	lgrp_initialized = 0;

/*
 * True when lgrp topology is constructed.
 */
int	lgrp_topo_initialized = 0;

/*
 * Init routine called after startup(), /etc/system has been processed,
 * and cpu0 has been added to an lgroup.
 */
void
lgrp_main_init(void)
{
	cpu_t		*cp = CPU;
	lgrp_id_t	lgrpid;
	int		i;
	extern void	pg_cpu0_reinit();

	/*
	 * Enforce a valid lgrp_mem_default_policy
	 */
	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES))
		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;

	/*
	 * See if mpo should be disabled.
	 * This may happen in the case of null proc LPA on Starcat.
	 * The platform won't be able to detect null proc LPA until after
	 * cpu0 and memory have already been added to lgroups.
	 * When and if it is detected, the Starcat platform will return
	 * a different platform handle for cpu0 which is what we check for
	 * here. If mpo should be disabled move cpu0 to its rightful place
	 * (the root), and destroy the remaining lgroups. This effectively
	 * provides a UMA lgroup topology.
	 */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;
	if (lgrp_table[lgrpid]->lgrp_plathand !=
	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
		lgrp_part_del_cpu(cp);
		lgrp_cpu_fini(cp, lgrpid);

		lgrp_cpu_init(cp);
		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);

		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);

		/*
		 * Notify the PG subsystem that the CPU's lgrp
		 * association has changed
		 */
		pg_cpu0_reinit();

		/*
		 * Destroy all lgroups except for root
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (LGRP_EXISTS(lgrp_table[i]) &&
			    lgrp_table[i] != lgrp_root)
				lgrp_destroy(lgrp_table[i]);
		}

		/*
		 * Fix up root to point at itself for leaves and resources
		 * and not have any children
		 */
		lgrp_root->lgrp_childcnt = 0;
		klgrpset_clear(lgrp_root->lgrp_children);
		klgrpset_clear(lgrp_root->lgrp_leaves);
		klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
	}

	/*
	 * Initialize kstats framework.
	 */
	lgrp_kstat_init();
	/*
	 * cpu0 is finally where it should be, so create its lgroup's kstats
	 */
	mutex_enter(&cpu_lock);
	lgrp_kstat_create(cp);
	mutex_exit(&cpu_lock);

	lgrp_plat_main_init();
	lgrp_initialized = 1;
}

/*
 * Finish lgrp initialization after all CPUS are brought on-line.
 * This routine is called after start_other_cpus().
 */
void
lgrp_main_mp_init(void)
{
	klgrpset_t changed;

	/*
	 * Update lgroup topology (if necessary)
	 */
	klgrpset_clear(changed);
	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
	lgrp_topo_initialized = 1;
}

/*
 * Change latency of lgroup with specified lgroup platform handle (if one is
 * given) or change all lgroups with old latency to new latency
 */
void
lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime,
    u_longlong_t newtime)
{
	lgrp_t		*lgrp;
	int		i;

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp = lgrp_table[i];

		if (!LGRP_EXISTS(lgrp))
			continue;

		if ((hand == LGRP_NULL_HANDLE &&
		    lgrp->lgrp_latency == oldtime) ||
		    (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand))
			lgrp->lgrp_latency = (int)newtime;
	}
}

/*
 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
 */
void
lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
{
	klgrpset_t	changed;
	cpu_t		*cp;
	lgrp_id_t	id;
	int		rc;

	switch (event) {
	/*
	 * The following (re)configuration events are common code
	 * initiated. lgrp_plat_config() is called here to inform the
	 * platform of the reconfiguration event.
	 */
	case LGRP_CONFIG_CPU_ADD:
		cp = (cpu_t *)resource;

		/*
		 * Initialize the new CPU's lgrp related next/prev
		 * links, and give it a bootstrap lpl so that it can
		 * survive should it need to enter the dispatcher.
		 */
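		/*
		 * Each link initially points back at the CPU itself,
		 * forming singleton circular lists; lgrp_cpu_init()
		 * splices the CPU into its real lgroup's list later.
		 */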
		cp->cpu_next_lpl = cp;
		cp->cpu_prev_lpl = cp;
		cp->cpu_next_lgrp = cp;
		cp->cpu_prev_lgrp = cp;
		cp->cpu_lpl = lpl_bootstrap;

		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPU_DEL:
		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPU_ONLINE:
		cp = (cpu_t *)resource;
		lgrp_cpu_init(cp);
		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPU_OFFLINE:
		cp = (cpu_t *)resource;
		id = cp->cpu_lpl->lpl_lgrpid;
		lgrp_part_del_cpu(cp);
		lgrp_cpu_fini(cp, id);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPUPART_ADD:
		cp = (cpu_t *)resource;
		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);

		break;
	case LGRP_CONFIG_CPUPART_DEL:
		cp = (cpu_t *)resource;
		lgrp_part_del_cpu((cpu_t *)resource);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);

		break;
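	/*
	 * Note that the partition add/delete events above do not bump
	 * lgrp_gen, presumably because moving CPUs between partitions
	 * does not change the lgroup hierarchy itself.
	 */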
	/*
	 * The following events are initiated by the memnode
	 * subsystem.
	 */
	case LGRP_CONFIG_MEM_ADD:
		lgrp_mem_init((int)resource, where, B_FALSE);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_MEM_DEL:
		lgrp_mem_fini((int)resource, where, B_FALSE);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_MEM_RENAME: {
		lgrp_config_mem_rename_t *ren_arg =
		    (lgrp_config_mem_rename_t *)where;

		lgrp_mem_rename((int)resource,
		    ren_arg->lmem_rename_from,
		    ren_arg->lmem_rename_to);
		atomic_add_32(&lgrp_gen, 1);

		break;
	}
	case LGRP_CONFIG_GEN_UPDATE:
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_FLATTEN:
		if (where == 0)
			lgrp_topo_levels = (int)resource;
		else
			(void) lgrp_topo_flatten(resource,
			    lgrp_table, lgrp_alloc_max, &changed);

		break;
	/*
	 * Update any lgroups with old latency to new latency
	 */
	case LGRP_CONFIG_LAT_CHANGE_ALL:
		lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource,
		    (u_longlong_t)where);

		break;
	/*
	 * Update lgroup with specified lgroup platform handle to have
	 * new latency
	 */
	case LGRP_CONFIG_LAT_CHANGE:
		lgrp_latency_change((lgrp_handle_t)resource, 0,
		    (u_longlong_t)where);

		break;
	case LGRP_CONFIG_NOP:

		break;
	default:
		break;
	}

}

/*
 * Called to add lgrp info into cpu structure from cpu_add_unit;
 * do not assume cpu is in cpu[] yet!
 *
 * CPUs are brought online with all other CPUs paused so we can't
 * allocate memory or we could deadlock the system, so we rely on
 * the platform to statically allocate as much space as we need
 * for the lgrp structs and stats.
 */
static void
lgrp_cpu_init(struct cpu *cp)
{
	klgrpset_t	changed;
	int		count;
	lgrp_handle_t	hand;
	int		first_cpu;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	struct cpu	*cptr;

	/*
	 * This is the first time through if the resource set
	 * for the root lgroup is empty.
	 * After cpu0 has been initially added to an lgroup, the root's CPU
	 * resource set can never be empty, since the system's last CPU
	 * cannot be offlined.
	 */
	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
		/*
		 * First time through.
		 */
		first_cpu = 1;
	} else {
		/*
		 * If cpu0 needs to move lgroups, we may come
		 * through here again, at which time cpu_lock won't
		 * be held, and lgrp_initialized will be false.
		 */
		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
		ASSERT(cp->cpu_part != NULL);
		first_cpu = 0;
	}

	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
	my_lgrp = lgrp_hand_to_lgrp(hand);

	if (my_lgrp == NULL) {
		/*
		 * Create new lgrp and add it to lgroup topology
		 */
		my_lgrp = lgrp_create();
		my_lgrp->lgrp_plathand = hand;
		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);

		count = 0;
		klgrpset_clear(changed);
		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);
		/*
		 * May have added new intermediate lgroups, so need to add
		 * resources other than CPUs which are added below
		 */
		(void) lgrp_mnode_update(changed, NULL);
	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
	    > 0) {
		/*
		 * Leaf lgroup was created, but latency wasn't available
		 * then.  So, set latency for it and fill in rest of lgroup
		 * topology now that we know how far it is from other leaf
		 * lgroups.
		 */
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_clear(changed);
		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
		    lgrpid))
			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);

		/*
		 * May have added new intermediate lgroups, so need to add
		 * resources other than CPUs which are added below
		 */
		(void) lgrp_mnode_update(changed, NULL);
	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
	    my_lgrp->lgrp_id)) {
		int	i;

		/*
		 * Update existing lgroup and lgroups containing it with CPU
		 * resource
		 */
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
				continue;

			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		}
	}

	lgrpid = my_lgrp->lgrp_id;
	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];

	/*
	 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
	 * end up in lpl for lgroup 0 whether it is supposed to be in there or
	 * not since none of lgroup IDs in the lpl's have been set yet.
	 */
	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
		cp->cpu_lpl->lpl_lgrpid = lgrpid;

	/*
	 * link the CPU into the lgrp's CPU list
	 */
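	/*
	 * The list is circular and doubly linked via cpu_next_lgrp and
	 * cpu_prev_lgrp, so a new CPU is spliced in just before the
	 * current head, i.e. at the tail of the list.
	 */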
	if (my_lgrp->lgrp_cpucnt == 0) {
		my_lgrp->lgrp_cpu = cp;
		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
	} else {
		cptr = my_lgrp->lgrp_cpu;
		cp->cpu_next_lgrp = cptr;
		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
		cptr->cpu_prev_lgrp = cp;
	}
	my_lgrp->lgrp_cpucnt++;
}

lgrp_t *
lgrp_create(void)
{
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	int		i;

	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));

	/*
	 * Find an open slot in the lgroup table and recycle unused lgroup
	 * left there if any
	 */
	my_lgrp = NULL;
	if (lgrp_alloc_hint == -1)
		/*
		 * Allocate from end when hint not set yet because no lgroups
		 * have been deleted yet
		 */
		lgrpid = nlgrps++;
	else {
		/*
		 * Start looking for next open slot from hint and leave hint
		 * at slot allocated
		 */
		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
			my_lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(my_lgrp)) {
				lgrpid = i;
				nlgrps++;
				break;
			}
		}
		lgrp_alloc_hint = lgrpid;
	}

	/*
	 * Keep track of max lgroup ID allocated so far to cut down on searches
	 */
	if (lgrpid > lgrp_alloc_max)
		lgrp_alloc_max = lgrpid;

	/*
	 * Need to allocate new lgroup if next open slot didn't have one
	 * for recycling
	 */
	if (my_lgrp == NULL)
		my_lgrp = lgrp_plat_alloc(lgrpid);

	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
		panic("Too many lgrps for platform (%d)", nlgrps);
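	/*
	 * At this point my_lgrp is either a freshly allocated lgrp_t or a
	 * recycled entry left in lgrp_table[] by lgrp_destroy(); either
	 * way, every field is (re)initialized below.
	 */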
	my_lgrp->lgrp_id = lgrpid;
	my_lgrp->lgrp_latency = 0;
	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
	my_lgrp->lgrp_parent = NULL;
	my_lgrp->lgrp_childcnt = 0;
	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
	my_lgrp->lgrp_nmnodes = 0;
	klgrpset_clear(my_lgrp->lgrp_children);
	klgrpset_clear(my_lgrp->lgrp_leaves);
	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(my_lgrp->lgrp_set[i]);

	my_lgrp->lgrp_cpu = NULL;
	my_lgrp->lgrp_cpucnt = 0;

	if (my_lgrp->lgrp_kstat != NULL)
		lgrp_kstat_reset(lgrpid);

	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;

	return (my_lgrp);
}

void
lgrp_destroy(lgrp_t *lgrp)
{
	int	i;

	/*
	 * Unless this lgroup is being destroyed on behalf of
	 * the boot CPU, cpu_lock must be held
	 */
	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));

	if (nlgrps == 1)
		cmn_err(CE_PANIC, "Can't destroy only lgroup!");

	if (!LGRP_EXISTS(lgrp))
		return;

	/*
	 * Set hint to lgroup being deleted and try to keep lower numbered
	 * hints to facilitate finding empty slots
	 */
	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
		lgrp_alloc_hint = lgrp->lgrp_id;

	/*
	 * Mark this lgroup to be recycled by setting its lgroup ID to
	 * LGRP_NONE and clear relevant fields
	 */
	lgrp->lgrp_id = LGRP_NONE;
	lgrp->lgrp_latency = 0;
	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
	lgrp->lgrp_parent = NULL;
	lgrp->lgrp_childcnt = 0;

	klgrpset_clear(lgrp->lgrp_children);
	klgrpset_clear(lgrp->lgrp_leaves);
	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(lgrp->lgrp_set[i]);

	lgrp->lgrp_mnodes = (mnodeset_t)0;
	lgrp->lgrp_nmnodes = 0;

	lgrp->lgrp_cpu = NULL;
	lgrp->lgrp_cpucnt = 0;

	nlgrps--;
}

/*
 * Initialize kstat data. Called from lgrp initialization code.
 */
static void
lgrp_kstat_init(void)
{
	lgrp_stat_t	stat;

	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);

	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
		kstat_named_init(&lgrp_kstat_data[stat],
		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
}

/*
 * initialize an lgrp's kstats if needed
 * called with cpu_lock held but not with cpus paused.
 * we don't tear these down now because we don't know about
 * memory leaving the lgrp yet...
 */

void
lgrp_kstat_create(cpu_t *cp)
{
	kstat_t		*lgrp_kstat;
	lgrp_id_t	lgrpid;
	lgrp_t		*my_lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrpid = cp->cpu_lpl->lpl_lgrpid;
	my_lgrp = lgrp_table[lgrpid];

	if (my_lgrp->lgrp_kstat != NULL)
		return;			/* already initialized */

	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);

	if (lgrp_kstat != NULL) {
		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
		lgrp_kstat->ks_private = my_lgrp;
		lgrp_kstat->ks_data = &lgrp_kstat_data;
		lgrp_kstat->ks_update = lgrp_kstat_extract;
		my_lgrp->lgrp_kstat = lgrp_kstat;
		kstat_install(lgrp_kstat);
	}
}

/*
 * this will do something when we manage to remove now unused lgrps
 */

/* ARGSUSED */
void
lgrp_kstat_destroy(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
}

/*
 * Called when a CPU is off-lined.
 */
static void
lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
{
	lgrp_t *my_lgrp;
	struct cpu *prev;
	struct cpu *next;

	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

	prev = cp->cpu_prev_lgrp;
	next = cp->cpu_next_lgrp;

	prev->cpu_next_lgrp = next;
	next->cpu_prev_lgrp = prev;

	/*
	 * just because I'm paranoid doesn't mean...
	 */

	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;

	my_lgrp = lgrp_table[lgrpid];
	my_lgrp->lgrp_cpucnt--;

	/*
	 * Removing last CPU in lgroup, so update lgroup topology
	 */
	if (my_lgrp->lgrp_cpucnt == 0) {
		klgrpset_t	changed;
		int		count;
		int		i;

		my_lgrp->lgrp_cpu = NULL;

		/*
		 * Remove this lgroup from its lgroup CPU resources and remove
		 * lgroup from lgroup topology if it doesn't have any more
		 * resources in it now
		 */
		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
			count = 0;
			klgrpset_clear(changed);
			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
			    lgrp_alloc_max + 1, &changed);
			return;
		}

		/*
		 * This lgroup isn't empty, so just remove it from CPU
		 * resources of any lgroups that contain it as such
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
			    lgrpid))
				continue;

			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		}
		return;
	}

	if (my_lgrp->lgrp_cpu == cp)
		my_lgrp->lgrp_cpu = next;

}

/*
 * Update memory nodes in target lgroups and return ones that get changed
 */
int
lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
{
	int	count;
	int	i;
	int	j;
	lgrp_t	*lgrp;
	lgrp_t	*lgrp_rsrc;

	count = 0;
	if (changed)
		klgrpset_clear(*changed);

	if (klgrpset_isempty(target))
		return (0);

	/*
	 * Find each lgroup in target lgroups
	 */
	for (i = 0; i <= lgrp_alloc_max; i++) {
		/*
		 * Skip any lgroups that don't exist or aren't in target group
		 */
		lgrp = lgrp_table[i];
		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
			continue;
		}

		/*
		 * Initialize memnodes for intermediate lgroups to 0
		 * and update them from scratch since they may have completely
		 * changed
		 */
		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
			lgrp->lgrp_mnodes = (mnodeset_t)0;
			lgrp->lgrp_nmnodes = 0;
		}

		/*
		 * Update memory nodes of target lgroup with memory nodes
		 * from each lgroup in its lgroup memory resource set
		 */
		for (j = 0; j <= lgrp_alloc_max; j++) {
			int	k;

			/*
			 * Skip any lgroups that don't exist or aren't in
			 * memory resources of target lgroup
			 */
			lgrp_rsrc = lgrp_table[j];
			if (!LGRP_EXISTS(lgrp_rsrc) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
			    j))
				continue;

			/*
			 * Update target lgroup's memnodes to include memnodes
			 * of this lgroup
			 */
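			/*
			 * Illustrative example: if lgrp_rsrc->lgrp_mnodes
			 * is 0x5 (mnodes 0 and 2) and lgrp->lgrp_mnodes is
			 * 0x1, this scan ORs in mnode 2 and bumps
			 * lgrp_nmnodes from 1 to 2.
			 */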
			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
				mnodeset_t	mnode_mask;

				mnode_mask = (mnodeset_t)1 << k;
				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
				    !(lgrp->lgrp_mnodes & mnode_mask)) {
					lgrp->lgrp_mnodes |= mnode_mask;
					lgrp->lgrp_nmnodes++;
				}
			}
			count++;
			if (changed)
				klgrpset_add(*changed, lgrp->lgrp_id);
		}
	}

	return (count);
}

/*
 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
 * is moved from one board to another. The "from" and "to" arguments specify the
 * source and the destination of the move.
 *
 * See plat_lgrp_config() for a detailed description of the copy-rename
 * semantics.
 *
 * The lgrp_mem_rename() is called by the platform copy-rename code to update
 * the lgroup topology which is changing as memory moves from one lgroup to
 * another. It removes the mnode from the source lgroup and re-inserts it in the
 * target lgroup.
 *
 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 * lgrp_mem_fini() telling that the insertion and deletion are part of a DR
 * copy-rename operation.
 *
 * There is one case which requires special handling.
 * If the system contains
 * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
 * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
 * lgrp_mem_init(), but there is a window when the system has no memory in the
 * lgroup hierarchy. If another thread tries to allocate memory during this
 * window, the allocation will fail, although the system has physical memory.
 * This may cause a system panic or a deadlock (some sleeping memory allocations
 * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
 * the mnode back).
 *
 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
 * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
 * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
 * but it updates the rest of the lgroup topology as if the mnode was actually
 * removed. The lgrp_mem_init() function recognizes that the mnode being
 * inserted represents such a special case and updates the topology
 * appropriately.
 */
void
lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Remove the memory from the source node and add it to the destination
	 * node.
	 */
	lgrp_mem_fini(mnode, from, B_TRUE);
	lgrp_mem_init(mnode, to, B_TRUE);
}

/*
 * Called to indicate that the lgrp with platform handle "hand" now
 * contains the memory identified by "mnode".
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the insertion is part of the DR copy-rename and the inserted mnode (and
 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
 * dealing with the special case of DR copy-rename described in
 * lgrp_mem_rename().
 */
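/*
 * The drop_lock and need_synch locals below remember whether this call
 * acquired cpu_lock itself and whether CPUs still need to be paused
 * around the topology update, so the exit paths undo exactly what was
 * done on entry.
 */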
void
lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
	klgrpset_t	changed;
	int		count;
	int		i;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
	boolean_t	drop_lock = B_FALSE;
	boolean_t	need_synch = B_FALSE;

	/*
	 * Grab CPU lock (if we haven't already)
	 */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		drop_lock = B_TRUE;
	}

	/*
	 * This routine may be called from a context where we already
	 * hold cpu_lock, and have already paused cpus.
	 */
	if (!cpus_paused())
		need_synch = B_TRUE;

	/*
	 * Check if this mnode is already configured and return immediately if
	 * it is.
	 *
	 * NOTE: in special case of copy-rename of the only remaining mnode,
	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so
	 * we recognize this case and continue as usual, but skip the update
	 * to the lgrp_mnodes and the lgrp_nmnodes. This repairs the
	 * inconsistency in topology temporarily introduced by
	 * lgrp_mem_fini().
	 */
	if (!(is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
	    lgrp_root->lgrp_mnodes & mnodes_mask) {
		if (drop_lock)
			mutex_exit(&cpu_lock);
		return;
	}

	/*
	 * Update lgroup topology with new memory resources, keeping track of
	 * which lgroups change
	 */
	count = 0;
	klgrpset_clear(changed);
	my_lgrp = lgrp_hand_to_lgrp(hand);
	if (my_lgrp == NULL) {
		/* new lgrp */
		my_lgrp = lgrp_create();
		lgrpid = my_lgrp->lgrp_id;
		my_lgrp->lgrp_plathand = hand;
		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);

		if (need_synch)
			pause_cpus(NULL);
		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);
		if (need_synch)
			start_cpus();
	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
	    > 0) {
		/*
		 * Leaf lgroup was created, but latency wasn't available
		 * then. So, set latency for it and fill in rest of lgroup
		 * topology now that we know how far it is from other leaf
		 * lgroups.
		 */
		klgrpset_clear(changed);
		lgrpid = my_lgrp->lgrp_id;
		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
		    lgrpid))
			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		if (need_synch)
			pause_cpus(NULL);
		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);
		if (need_synch)
			start_cpus();
	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
	    my_lgrp->lgrp_id)) {
		/*
		 * Add new lgroup memory resource to existing lgroup
		 */
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		klgrpset_add(changed, lgrpid);
		count++;
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
				continue;

			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
			klgrpset_add(changed, lgrp->lgrp_id);
			count++;
		}
	}

	/*
	 * Add memory node to lgroup and remove lgroup from ones that need
	 * to be updated
	 */
	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
		my_lgrp->lgrp_mnodes |= mnodes_mask;
		my_lgrp->lgrp_nmnodes++;
	}
	klgrpset_del(changed, lgrpid);

	/*
	 * Update memory node information for all lgroups that changed and
	 * contain new memory node as a resource
	 */
	if (count)
		(void) lgrp_mnode_update(changed, NULL);

	if (drop_lock)
		mutex_exit(&cpu_lock);
}
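
/*
 * Illustrative sketch (not part of the original source): the conditional
 * "grab the lock only if the caller does not already hold it" idiom used
 * by lgrp_mem_init() and lgrp_mem_fini(), modeled in user space with
 * pthreads. The caller_holds_lock flag stands in for the kernel's
 * MUTEX_HELD() ownership test.
 */
#if 0
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t cpu_lock_model = PTHREAD_MUTEX_INITIALIZER;

static void
update_topology(bool caller_holds_lock)
{
	bool drop_lock = false;

	if (!caller_holds_lock) {	/* models !MUTEX_HELD(&cpu_lock) */
		pthread_mutex_lock(&cpu_lock_model);
		drop_lock = true;
	}

	/* ... mutate shared topology state under the lock here ... */

	if (drop_lock)			/* release only what we acquired */
		pthread_mutex_unlock(&cpu_lock_model);
}

int
main(void)
{
	update_topology(false);		/* ordinary caller: lock/unlock */

	pthread_mutex_lock(&cpu_lock_model);
	update_topology(true);		/* DR-style caller already holds it */
	pthread_mutex_unlock(&cpu_lock_model);
	return (0);
}
#endif
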
/*
 * Called to indicate that the lgroup associated with the platform
 * handle "hand" no longer contains given memory node
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be
 * called with cpu_lock already held and CPUs paused.
 *
 * If the deletion is part of the DR copy-rename and the deleted mnode is the
 * only one present in the lgrp_root->lgrp_mnodes, all the topology is
 * updated, but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init()
 * will insert the same mnode back into the topology. See lgrp_mem_rename()
 * and lgrp_mem_init() for additional details.
 */
void
lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
	klgrpset_t	changed;
	int		count;
	int		i;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	mnodeset_t	mnodes_mask;
	boolean_t	drop_lock = B_FALSE;
	boolean_t	need_synch = B_FALSE;

	/*
	 * Grab CPU lock (if we haven't already)
	 */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		drop_lock = B_TRUE;
	}

	/*
	 * This routine may be called from a context where we already
	 * hold cpu_lock and have already paused cpus.
	 */
	if (!cpus_paused())
		need_synch = B_TRUE;

	my_lgrp = lgrp_hand_to_lgrp(hand);

	/*
	 * The lgrp *must* be pre-existing
	 */
	ASSERT(my_lgrp != NULL);

	/*
	 * Delete memory node from lgroups which contain it
	 */
	mnodes_mask = ((mnodeset_t)1 << mnode);
	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_t *lgrp = lgrp_table[i];
		/*
		 * Skip any non-existent lgroups and any lgroups that don't
		 * contain leaf lgroup of memory as a memory resource
		 */
		if (!LGRP_EXISTS(lgrp) ||
		    !(lgrp->lgrp_mnodes & mnodes_mask))
			continue;

		/*
		 * Avoid removing the last mnode from the root in the DR
		 * copy-rename case. See lgrp_mem_rename() for details.
		 */
		if (is_copy_rename &&
		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
			continue;

		/*
		 * Remove memory node from lgroup.
		 */
		lgrp->lgrp_mnodes &= ~mnodes_mask;
		lgrp->lgrp_nmnodes--;
		ASSERT(lgrp->lgrp_nmnodes >= 0);
	}
	ASSERT(lgrp_root->lgrp_nmnodes > 0);

	/*
	 * Don't need to update lgroup topology if this lgroup still has
	 * memory.
	 *
	 * In the special case of DR copy-rename with the only mnode being
	 * removed, the lgrp_mnodes for the root is always non-zero, but we
	 * still need to update the lgroup topology.
	 */
	if ((my_lgrp->lgrp_nmnodes > 0) &&
	    !(is_copy_rename && (my_lgrp == lgrp_root) &&
	    (my_lgrp->lgrp_mnodes == mnodes_mask))) {
		if (drop_lock)
			mutex_exit(&cpu_lock);
		return;
	}

	/*
	 * This lgroup does not contain any memory now
	 */
	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);

	/*
	 * Remove this lgroup from lgroup topology if it does not contain any
	 * resources now
	 */
	lgrpid = my_lgrp->lgrp_id;
	count = 0;
	klgrpset_clear(changed);
	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
		/*
		 * Delete lgroup when no more resources
		 */
		if (need_synch)
			pause_cpus(NULL);
		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
		    lgrp_alloc_max + 1, &changed);
		ASSERT(count > 0);
		if (need_synch)
			start_cpus();
	} else {
		/*
		 * Remove lgroup from memory resources of any lgroups that
		 * contain it as such
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
			    lgrpid))
				continue;

			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		}
	}
	if (drop_lock)
		mutex_exit(&cpu_lock);
}

/*
 * Return lgroup with given platform handle
 */
lgrp_t *
lgrp_hand_to_lgrp(lgrp_handle_t hand)
{
	int	i;
	lgrp_t	*lgrp;

	if (hand == LGRP_NULL_HANDLE)
		return (NULL);

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp = lgrp_table[i];
		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
			return (lgrp);
	}
	return (NULL);
}

/*
 * Return the home lgroup of the current thread.
 * We must do this with kernel preemption disabled, since we don't want our
 * thread to be re-homed while we're poking around with its lpl, and the lpl
 * should never be NULL.
 *
 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
 * is enabled because of DR. Callers can disable kernel preemption
 * around this call to guarantee that the lgroup will be valid beyond this
 * routine, since kernel preemption can be recursive.
 */
lgrp_t *
lgrp_home_lgrp(void)
{
	lgrp_t	*lgrp;
	lpl_t	*lpl;

	kpreempt_disable();

	lpl = curthread->t_lpl;
	ASSERT(lpl != NULL);
	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
	lgrp = lgrp_table[lpl->lpl_lgrpid];

	kpreempt_enable();

	return (lgrp);
}
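
/*
 * Illustrative usage fragment (not part of the original source, kernel
 * context assumed): a caller that needs the returned lgroup to stay valid
 * after the call can bracket it with its own preemption-disable pair,
 * relying on kernel preemption being recursive as noted above.
 */
#if 0
	lgrp_t	*home;

	kpreempt_disable();
	home = lgrp_home_lgrp();
	/* ... use "home" here; DR cannot tear it down yet ... */
	kpreempt_enable();
#endif
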
/*
 * Return ID of home lgroup for given thread
 * (See comments for lgrp_home_lgrp() for special care and handling
 * instructions)
 */
lgrp_id_t
lgrp_home_id(kthread_t *t)
{
	lgrp_id_t	lgrp;
	lpl_t		*lpl;

	ASSERT(t != NULL);
	/*
	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
	 * cannot since the HAT layer can call into this routine to
	 * determine the locality for its data structures in the context
	 * of a page fault.
	 */

	kpreempt_disable();

	lpl = t->t_lpl;
	ASSERT(lpl != NULL);
	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
	lgrp = lpl->lpl_lgrpid;

	kpreempt_enable();

	return (lgrp);
}

/*
 * Return lgroup containing the physical memory for the given page frame
 * number
 */
lgrp_t *
lgrp_pfn_to_lgrp(pfn_t pfn)
{
	lgrp_handle_t	hand;
	int		i;
	lgrp_t		*lgrp;

	hand = lgrp_plat_pfn_to_hand(pfn);
	if (hand != LGRP_NULL_HANDLE)
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp = lgrp_table[i];
			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
				return (lgrp);
		}
	return (NULL);
}

/*
 * Return lgroup containing the physical memory for the given physical
 * address
 */
lgrp_t *
lgrp_phys_to_lgrp(u_longlong_t physaddr)
{
	lgrp_handle_t	hand;
	int		i;
	lgrp_t		*lgrp;
	pfn_t		pfn;

	pfn = btop(physaddr);
	hand = lgrp_plat_pfn_to_hand(pfn);
	if (hand != LGRP_NULL_HANDLE)
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp = lgrp_table[i];
			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
				return (lgrp);
		}
	return (NULL);
}

/*
 * Return the leaf lgroup containing the given CPU
 *
 * The caller needs to take precautions necessary to prevent
 * "cpu", and its lpl from going away across a call to this function.
 * hint: kpreempt_disable()/kpreempt_enable()
 */
static lgrp_t *
lgrp_cpu_to_lgrp(cpu_t *cpu)
{
	return (cpu->cpu_lpl->lpl_lgrp);
}

/*
 * Return the sum of the partition loads in an lgrp divided by
 * the number of CPUs in the lgrp. This is our best approximation
 * of an 'lgroup load average' for a useful per-lgroup kstat.
 */
static uint64_t
lgrp_sum_loadavgs(lgrp_t *lgrp)
{
	cpu_t		*cpu;
	int		ncpu;
	uint64_t	loads = 0;

	mutex_enter(&cpu_lock);

	cpu = lgrp->lgrp_cpu;
	ncpu = lgrp->lgrp_cpucnt;

	if (cpu == NULL || ncpu == 0) {
		mutex_exit(&cpu_lock);
		return (0ull);
	}

	do {
		loads += cpu->cpu_lpl->lpl_loadavg;
		cpu = cpu->cpu_next_lgrp;
	} while (cpu != lgrp->lgrp_cpu);

	mutex_exit(&cpu_lock);

	return (loads / ncpu);
}
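
/*
 * Illustrative sketch (not part of the original source): averaging a value
 * over a circular, singly linked ring, the same traversal shape that
 * lgrp_sum_loadavgs() uses on the per-lgroup CPU list via cpu_next_lgrp.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

struct cpu_model {
	uint64_t		load;	/* models lpl_loadavg */
	struct cpu_model	*next;	/* models cpu_next_lgrp */
};

static uint64_t
ring_avg(struct cpu_model *head, int ncpu)
{
	struct cpu_model *c = head;
	uint64_t loads = 0;

	if (head == NULL || ncpu == 0)
		return (0);
	do {
		loads += c->load;
		c = c->next;
	} while (c != head);		/* stop once the ring wraps */
	return (loads / ncpu);
}

int
main(void)
{
	struct cpu_model c2 = { 30, NULL };
	struct cpu_model c1 = { 10, &c2 };

	c2.next = &c1;			/* close the ring */
	printf("avg load: %llu\n", (unsigned long long)ring_avg(&c1, 2));
	return (0);
}
#endif
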
void
lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
{
	struct lgrp_stats *pstats;

	/*
	 * Verify that the caller isn't trying to add to
	 * a statistic for an lgroup that has gone away
	 */
	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
		return;

	pstats = &lgrp_stats[lgrpid];
	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
}

int64_t
lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
{
	uint64_t val;
	struct lgrp_stats *pstats;

	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
		return ((int64_t)0);

	pstats = &lgrp_stats[lgrpid];
	LGRP_STAT_READ(pstats, stat, val);
	return (val);
}

/*
 * Reset all kstats for lgrp specified by its lgrpid.
 */
static void
lgrp_kstat_reset(lgrp_id_t lgrpid)
{
	lgrp_stat_t stat;

	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
		return;

	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
	}
}

/*
 * Collect all per-lgrp statistics for the lgrp associated with this
 * kstat, and store them in the ks_data array.
 *
 * The superuser can reset all the running counter statistics for an
 * lgrp by writing to any of the lgrp's stats.
 */
static int
lgrp_kstat_extract(kstat_t *ksp, int rw)
{
	lgrp_stat_t		stat;
	struct kstat_named	*ksd;
	lgrp_t			*lgrp;
	lgrp_id_t		lgrpid;

	lgrp = (lgrp_t *)ksp->ks_private;

	ksd = (struct kstat_named *)ksp->ks_data;
	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);

	lgrpid = lgrp->lgrp_id;

	if (lgrpid == LGRP_NONE) {
		/*
		 * Return all zeroes as stats for freed lgrp.
		 */
		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
			ksd[stat].value.i64 = 0;
		}
		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
	} else if (rw != KSTAT_WRITE) {
		/*
		 * Handle counter stats
		 */
		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
		}

		/*
		 * Handle kernel data snapshot stats
		 */
		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
		ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
		    lgrp_loadavg_max_effect;
	} else {
		lgrp_kstat_reset(lgrpid);
	}

	return (0);
}
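
/*
 * Illustrative user-level sketch (not part of the original source):
 * reading one of these per-lgroup kstats through libkstat. The module
 * name "lgrp" and the statistic name "load average" are assumptions
 * based on the naming used by this file's kstat framework; verify them
 * with kstat(1M) on a live system.
 */
#if 0
#include <kstat.h>
#include <stdio.h>

int
main(void)
{
	kstat_ctl_t	*kc;
	kstat_t		*ksp;
	kstat_named_t	*kn;

	if ((kc = kstat_open()) == NULL)
		return (1);
	/* the kstat instance number is the lgroup ID; 0 is the root */
	if ((ksp = kstat_lookup(kc, "lgrp", 0, NULL)) != NULL &&
	    kstat_read(kc, ksp, NULL) != -1 &&
	    (kn = kstat_data_lookup(ksp, "load average")) != NULL)
		printf("lgrp 0 load average: %lld\n",
		    (long long)kn->value.i64);
	(void) kstat_close(kc);
	return (0);
}
#endif
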
int
lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
{
	cpu_t	*cp;

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	ASSERT(cp->cpu_lpl != NULL);

	*lp = cp->cpu_lpl->lpl_lgrpid;

	mutex_exit(&cpu_lock);

	return (0);
}

int
lgrp_query_load(processorid_t id, lgrp_load_t *lp)
{
	cpu_t	*cp;

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	ASSERT(cp->cpu_lpl != NULL);

	*lp = cp->cpu_lpl->lpl_loadavg;

	mutex_exit(&cpu_lock);

	return (0);
}

/*
 * Add a resource named by lpl_leaf to rset of lpl_target
 *
 * This routine also adjusts ncpu and nrset if the call succeeds in adding a
 * resource. They are adjusted here, as this is presently the only place that
 * we can be certain a resource addition has succeeded.
 *
 * We keep the list of rsets sorted so that the dispatcher can quickly walk
 * the list in order until it reaches a NULL. (This list is required to be
 * NULL terminated, too). This is done so that we can mark start pos + 1, so
 * that each lpl is traversed sequentially, but in a different order. We hope
 * this will improve performance a bit. (Hopefully, less read-to-own
 * traffic...)
 */

void
lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int		i;
	int		entry_slot = 0;

	/* return if leaf is already present */
	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf) {
			return;
		}

		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
		    lpl_leaf->lpl_lgrpid) {
			break;
		}
	}

	/* insert leaf, update counts */
	entry_slot = i;
	i = lpl_target->lpl_nrset++;
	if (lpl_target->lpl_nrset >= LPL_RSET_MAX) {
		panic("More leaf lgrps in system than are supported!\n");
	}

	/*
	 * Start at the end of the rset array and work backwards towards the
	 * slot into which the new lpl will be inserted. This effectively
	 * preserves the current ordering by scooting everybody over one
	 * entry, and placing the new entry into the space created.
	 */
	while (i-- > entry_slot) {
		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
	}

	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
}
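
/*
 * Illustrative sketch (not part of the original source): insertion into a
 * sorted, NULL-terminated pointer array by scooting entries right, the
 * same shape as lpl_rset_add() above. RSET_MAX and struct elem are
 * hypothetical stand-ins for LPL_RSET_MAX and lpl_t.
 */
#if 0
#include <stdio.h>

#define	RSET_MAX	8

struct elem {
	int id;			/* models lpl_lgrpid, the sort key */
};

static struct elem	*rset[RSET_MAX];	/* stays NULL-terminated */
static int		nrset;

static void
rset_add(struct elem *e)
{
	int i, slot;

	for (i = 0; i < nrset; i++) {
		if (rset[i] == e)
			return;			/* already present */
		if (rset[i]->id > e->id)
			break;			/* found insertion slot */
	}
	slot = i;
	i = nrset++;
	while (i-- > slot)			/* scoot entries right */
		rset[i + 1] = rset[i];
	rset[slot] = e;
}

int
main(void)
{
	struct elem a = { 3 }, b = { 1 }, c = { 2 };
	int i;

	rset_add(&a);
	rset_add(&b);
	rset_add(&c);
	for (i = 0; i < nrset; i++)
		printf("%d ", rset[i]->id);	/* prints: 1 2 3 */
	printf("\n");
	return (0);
}
#endif
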
/*
 * Update each of lpl_parent's children with a proper hint and
 * a reference to their parent.
 * The lgrp topology is used as the reference since it is fully
 * consistent and correct at this point.
 *
 * Each child's hint will reference an element in lpl_parent's
 * rset that designates where the child should start searching
 * for CPU resources. The hint selected is the highest order leaf present
 * in the child's lineage.
 *
 * This should be called after any potential change in lpl_parent's
 * rset.
 */
static void
lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
{
	klgrpset_t	children, leaves;
	lpl_t		*lpl;
	int		hint;
	int		i, j;

	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
	if (klgrpset_isempty(children))
		return;		/* nothing to do */

	for (i = 0; i <= lgrp_alloc_max; i++) {
		if (klgrpset_ismember(children, i)) {

			/*
			 * Given the set of leaves in this child's lineage,
			 * find the highest order leaf present in the
			 * parent's rset. Select this as the hint for the
			 * child.
			 */
			leaves = lgrp_table[i]->lgrp_leaves;
			hint = 0;
			for (j = 0; j < lpl_parent->lpl_nrset; j++) {
				lpl = lpl_parent->lpl_rset[j];
				if (klgrpset_ismember(leaves, lpl->lpl_lgrpid))
					hint = j;
			}
			cp->cp_lgrploads[i].lpl_hint = hint;

			/*
			 * (Re)set the parent. It may be incorrect if
			 * lpl_parent is new in the topology.
			 */
			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
		}
	}
}

/*
 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
 *
 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
 * resource. The values are adjusted here, as this is the only place that we
 * can be certain a resource was successfully deleted.
 */
void
lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int		i;

	/* find leaf in intermediate node */
	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf)
			break;
	}

	/* return if leaf not found */
	if (lpl_target->lpl_rset[i] != lpl_leaf)
		return;

	/* prune leaf, compress array */
	ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX);
	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
	lpl_target->lpl_ncpu--;
	do {
		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
	} while (i++ < lpl_target->lpl_nrset);
}

/*
 * Check to see if the resource set of the target lpl contains the
 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not.
 */

int
lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int i;

	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf)
			return (1);
	}

	return (0);
}

/*
 * Called when we change cpu lpl membership. This increments or decrements the
 * per-cpu counter in every lpl in which our leaf appears.
 */
void
lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
{
	cpupart_t	*cpupart;
	lgrp_t		*lgrp_leaf;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_leaf;
	lpl_t		*lpl_cur;
	int		i;

	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);

	cpupart = cp->cpu_part;
	lpl_leaf = cp->cpu_lpl;
	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_cur = lgrp_table[i];

		/*
		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
		 * for the cpu in question, or if the current lgrp and leaf
		 * don't share the same resources.
		 */
		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
			continue;

		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];

		if (lpl_cur->lpl_nrset > 0) {
			if (act == LPL_INCREMENT) {
				lpl_cur->lpl_ncpu++;
			} else if (act == LPL_DECREMENT) {
				lpl_cur->lpl_ncpu--;
			}
		}
	}
}

/*
 * Initialize lpl with given resources and specified lgrp
 */

void
lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
{
	lpl->lpl_lgrpid = lgrp->lgrp_id;
	lpl->lpl_loadavg = 0;
	if (lpl == lpl_leaf)
		lpl->lpl_ncpu = 1;
	else
		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
	lpl->lpl_nrset = 1;
	lpl->lpl_rset[0] = lpl_leaf;
	lpl->lpl_lgrp = lgrp;
	lpl->lpl_parent = NULL;		/* set by lpl_leaf_insert() */
	lpl->lpl_cpus = NULL;		/* set by lgrp_part_add_cpu() */
}

/*
 * Clear an unused lpl
 */

void
lpl_clear(lpl_t *lpl)
{
	lgrp_id_t	lid;

	/* save lid for debugging purposes */
	lid = lpl->lpl_lgrpid;
	bzero(lpl, sizeof (lpl_t));
	lpl->lpl_lgrpid = lid;
}

/*
 * Given a CPU-partition, verify that the lpl topology in the CPU-partition
 * is in sync with the lgroup topology in the system. The lpl topology may not
 * make full use of all of the lgroup topology, but this checks to make sure
 * that for the parts that it does use, it has correctly understood the
 * relationships that exist. This function returns 0 if the topology is
 * correct, and a non-zero error code, for non-debug kernels, if incorrect.
 * Asserts are spread throughout the code to aid in debugging on a DEBUG
 * kernel.
 */
int
lpl_topo_verify(cpupart_t *cpupart)
{
	lgrp_t		*lgrp;
	lpl_t		*lpl;
	klgrpset_t	rset;
	klgrpset_t	cset;
	cpu_t		*cpu;
	cpu_t		*cp_start;
	int		i;
	int		j;
	int		sum;

	/* topology can't be incorrect if it doesn't exist */
	if (!lgrp_topo_initialized || !lgrp_initialized)
		return (LPL_TOPO_CORRECT);

	ASSERT(cpupart != NULL);

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp = lgrp_table[i];
		lpl = NULL;
		/* make sure lpls are allocated */
		ASSERT(cpupart->cp_lgrploads);
		if (!cpupart->cp_lgrploads)
			return (LPL_TOPO_PART_HAS_NO_LPL);

		lpl = &cpupart->cp_lgrploads[i];
		/* make sure our index is good */
		ASSERT(i < cpupart->cp_nlgrploads);

		/* if lgroup doesn't exist, make sure lpl is empty */
		if (!LGRP_EXISTS(lgrp)) {
			ASSERT(lpl->lpl_ncpu == 0);
			if (lpl->lpl_ncpu > 0) {
				return (LPL_TOPO_CPUS_NOT_EMPTY);
			} else {
				continue;
			}
		}

		/* verify that lgroup and lpl are identically numbered */
		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);

		/* if lgroup isn't in our partition, make sure lpl is empty */
		if (!klgrpset_intersects(lgrp->lgrp_leaves,
		    cpupart->cp_lgrpset)) {
			ASSERT(lpl->lpl_ncpu == 0);
			if (lpl->lpl_ncpu > 0) {
				return (LPL_TOPO_CPUS_NOT_EMPTY);
			}
			/*
			 * lpl is empty, and lgroup isn't in partition. verify
			 * that lpl doesn't show up in anyone else's rsets (in
			 * this partition, anyway)
			 */
			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
				lpl_t *i_lpl;	/* lpl we're iterating over */

				i_lpl = &cpupart->cp_lgrploads[j];

				ASSERT(!lpl_rset_contains(i_lpl, lpl));
				if (lpl_rset_contains(i_lpl, lpl)) {
					return (LPL_TOPO_LPL_ORPHANED);
				}
			}
			/* lgroup is empty, and everything is ok. continue */
			continue;
		}

		/* lgroup is in this partition, now check it against lpl */

		/* do both have matching lgrps? */
		ASSERT(lgrp == lpl->lpl_lgrp);
		if (lgrp != lpl->lpl_lgrp) {
			return (LPL_TOPO_LGRP_MISMATCH);
		}

		/* do the parent lgroups exist and do they match? */
		if (lgrp->lgrp_parent) {
			ASSERT(lpl->lpl_parent);
			ASSERT(lgrp->lgrp_parent->lgrp_id ==
			    lpl->lpl_parent->lpl_lgrpid);

			if (!lpl->lpl_parent) {
				return (LPL_TOPO_MISSING_PARENT);
			} else if (lgrp->lgrp_parent->lgrp_id !=
			    lpl->lpl_parent->lpl_lgrpid) {
				return (LPL_TOPO_PARENT_MISMATCH);
			}
		}

		/* only leaf lgroups keep a cpucnt, only check leaves */
		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {

			/* verify that lgrp is also a leaf */
			ASSERT((lgrp->lgrp_childcnt == 0) &&
			    (klgrpset_ismember(lgrp->lgrp_leaves,
			    lpl->lpl_lgrpid)));

			if ((lgrp->lgrp_childcnt > 0) ||
			    (!klgrpset_ismember(lgrp->lgrp_leaves,
			    lpl->lpl_lgrpid))) {
				return (LPL_TOPO_LGRP_NOT_LEAF);
			}

			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
			    (lpl->lpl_ncpu > 0));
			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
			    (lpl->lpl_ncpu <= 0)) {
				return (LPL_TOPO_BAD_CPUCNT);
			}

			/*
			 * Check that lpl_ncpu also matches the number of
			 * cpus in the lpl's linked list. This only exists in
			 * leaves, but they should always match.
			 */
			j = 0;
			cpu = cp_start = lpl->lpl_cpus;
			while (cpu != NULL) {
				j++;

				/* check to make sure cpu's lpl is leaf lpl */
				ASSERT(cpu->cpu_lpl == lpl);
				if (cpu->cpu_lpl != lpl) {
					return (LPL_TOPO_CPU_HAS_BAD_LPL);
				}

				/* check next cpu */
				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
					continue;
				} else {
					cpu = NULL;
				}
			}

			ASSERT(j == lpl->lpl_ncpu);
			if (j != lpl->lpl_ncpu) {
				return (LPL_TOPO_LPL_BAD_NCPU);
			}

			/*
			 * Also, check that leaf lpl is contained in all
			 * intermediate lpls that name the leaf as a
			 * descendant
			 */
			for (j = 0; j <= lgrp_alloc_max; j++) {
				klgrpset_t	intersect;
				lgrp_t		*lgrp_cand;
				lpl_t		*lpl_cand;

				lgrp_cand = lgrp_table[j];
				intersect = klgrpset_intersects(
				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
				    cpupart->cp_lgrpset);

				if (!LGRP_EXISTS(lgrp_cand) ||
				    !klgrpset_intersects(
				    lgrp_cand->lgrp_leaves,
				    cpupart->cp_lgrpset) ||
				    (intersect == 0))
					continue;

				lpl_cand =
				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];

				if (klgrpset_ismember(intersect,
				    lgrp->lgrp_id)) {
					ASSERT(lpl_rset_contains(lpl_cand,
					    lpl));

					if (!lpl_rset_contains(lpl_cand,
					    lpl)) {
						return (LPL_TOPO_RSET_MSSNG_LF);
					}
				}
			}

		} else {	/* non-leaf specific checks */

			/*
			 * Non-leaf lpls should have lpl_cpus == NULL
			 * verify that this is so
			 */
			ASSERT(lpl->lpl_cpus == NULL);
			if (lpl->lpl_cpus != NULL) {
				return (LPL_TOPO_NONLEAF_HAS_CPUS);
			}

			/*
			 * verify that the sum of the cpus in the leaf
			 * resources is equal to the total ncpu in the
			 * intermediate
			 */
			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
				sum += lpl->lpl_rset[j]->lpl_ncpu;
			}

			ASSERT(sum == lpl->lpl_ncpu);
			if (sum != lpl->lpl_ncpu) {
				return (LPL_TOPO_LPL_BAD_NCPU);
			}
		}

		/*
		 * check on lpl_hint. Don't check root, since it has no
		 * parent.
		 */
		if (lpl->lpl_parent != NULL) {
			int	hint;
			lpl_t	*hint_lpl;

			/* make sure hint is within limits of nrset */
			hint = lpl->lpl_hint;
			ASSERT(lpl->lpl_parent->lpl_nrset >= hint);
			if (lpl->lpl_parent->lpl_nrset < hint) {
				return (LPL_TOPO_BOGUS_HINT);
			}

			/* make sure hint points to valid lpl */
			hint_lpl = lpl->lpl_parent->lpl_rset[hint];
			ASSERT(hint_lpl->lpl_ncpu > 0);
			if (hint_lpl->lpl_ncpu <= 0) {
				return (LPL_TOPO_BOGUS_HINT);
			}
		}

		/*
		 * Check the rset of the lpl in question. Make sure that each
		 * rset contains a subset of the resources in
		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes
		 * sure that each rset doesn't include resources that are
		 * outside of that set. (Which would be resources somehow not
		 * accounted for).
		 */
		klgrpset_clear(rset);
		for (j = 0; j < lpl->lpl_nrset; j++) {
			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
		}
		klgrpset_copy(cset, rset);
		/* make sure lpl rset matches lgrp rset */
		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
		/* make sure rset is contained within partition, too */
		klgrpset_diff(cset, cpupart->cp_lgrpset);

		ASSERT(klgrpset_isempty(rset) &&
		    klgrpset_isempty(cset));
		if (!klgrpset_isempty(rset) ||
		    !klgrpset_isempty(cset)) {
			return (LPL_TOPO_RSET_MISMATCH);
		}

		/*
		 * check to make sure lpl_nrset matches the number of rsets
		 * contained in the lpl
		 */
		for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX);
		    j++)
			;

		ASSERT(j == lpl->lpl_nrset);
		if (j != lpl->lpl_nrset) {
			return (LPL_TOPO_BAD_RSETCNT);
		}
	}
	return (LPL_TOPO_CORRECT);
}
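
/*
 * Illustrative usage fragment (not part of the original source, kernel
 * context assumed): code that rearranges lpl state can re-verify the
 * partition afterwards on DEBUG kernels along these lines.
 */
#if 0
	/* e.g., after lpl_leaf_insert()/lpl_leaf_remove() on cpupart */
	ASSERT(lpl_topo_verify(cpupart) == LPL_TOPO_CORRECT);
#endif
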
23170Sstevel@tonic-gate */ 23180Sstevel@tonic-gate int 23190Sstevel@tonic-gate lpl_topo_flatten(int levels) 23200Sstevel@tonic-gate { 23210Sstevel@tonic-gate int i; 23220Sstevel@tonic-gate uint_t sum; 23230Sstevel@tonic-gate lgrp_t *lgrp_cur; 23240Sstevel@tonic-gate lpl_t *lpl_cur; 23250Sstevel@tonic-gate lpl_t *lpl_root; 23260Sstevel@tonic-gate cpupart_t *cp; 23270Sstevel@tonic-gate 23280Sstevel@tonic-gate if (levels != 2) 23290Sstevel@tonic-gate return (0); 23300Sstevel@tonic-gate 23310Sstevel@tonic-gate /* called w/ cpus paused - grab no locks! */ 23320Sstevel@tonic-gate ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 23330Sstevel@tonic-gate !lgrp_initialized); 23340Sstevel@tonic-gate 23350Sstevel@tonic-gate cp = cp_list_head; 23360Sstevel@tonic-gate do { 23370Sstevel@tonic-gate lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id]; 23380Sstevel@tonic-gate ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0)); 23390Sstevel@tonic-gate 23400Sstevel@tonic-gate for (i = 0; i <= lgrp_alloc_max; i++) { 23410Sstevel@tonic-gate lgrp_cur = lgrp_table[i]; 23420Sstevel@tonic-gate lpl_cur = &cp->cp_lgrploads[i]; 23430Sstevel@tonic-gate 23440Sstevel@tonic-gate if ((lgrp_cur == lgrp_root) || 23450Sstevel@tonic-gate (!LGRP_EXISTS(lgrp_cur) && 23460Sstevel@tonic-gate (lpl_cur->lpl_ncpu == 0))) 23470Sstevel@tonic-gate continue; 23480Sstevel@tonic-gate 23490Sstevel@tonic-gate if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) { 23500Sstevel@tonic-gate /* 23510Sstevel@tonic-gate * this should be a deleted intermediate, so 23520Sstevel@tonic-gate * clear it 23530Sstevel@tonic-gate */ 23540Sstevel@tonic-gate lpl_clear(lpl_cur); 23550Sstevel@tonic-gate } else if ((lpl_cur->lpl_nrset == 1) && 23560Sstevel@tonic-gate (lpl_cur->lpl_rset[0] == lpl_cur) && 23570Sstevel@tonic-gate ((lpl_cur->lpl_parent->lpl_ncpu == 0) || 23580Sstevel@tonic-gate (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) { 23590Sstevel@tonic-gate /* 23600Sstevel@tonic-gate * this is a leaf whose parent was deleted, or 23610Sstevel@tonic-gate * whose parent had their lgrp deleted. (And 23620Sstevel@tonic-gate * whose parent will soon be deleted). Point 23630Sstevel@tonic-gate * this guy back to the root lpl. 23640Sstevel@tonic-gate */ 23650Sstevel@tonic-gate lpl_cur->lpl_parent = lpl_root; 23660Sstevel@tonic-gate lpl_rset_add(lpl_root, lpl_cur); 23670Sstevel@tonic-gate } 23680Sstevel@tonic-gate 23690Sstevel@tonic-gate } 23700Sstevel@tonic-gate 23710Sstevel@tonic-gate /* 23720Sstevel@tonic-gate * Now that we're done, make sure the count on the root lpl is 23730Sstevel@tonic-gate * correct, and update the hints of the children for the sake of 23740Sstevel@tonic-gate * thoroughness 23750Sstevel@tonic-gate */ 23760Sstevel@tonic-gate for (i = sum = 0; i < lpl_root->lpl_nrset; i++) { 23770Sstevel@tonic-gate sum += lpl_root->lpl_rset[i]->lpl_ncpu; 23780Sstevel@tonic-gate } 23790Sstevel@tonic-gate lpl_root->lpl_ncpu = sum; 23800Sstevel@tonic-gate lpl_child_update(lpl_root, cp); 23810Sstevel@tonic-gate 23820Sstevel@tonic-gate cp = cp->cp_next; 23830Sstevel@tonic-gate } while (cp != cp_list_head); 23840Sstevel@tonic-gate 23850Sstevel@tonic-gate return (levels); 23860Sstevel@tonic-gate } 23870Sstevel@tonic-gate 23880Sstevel@tonic-gate /* 23890Sstevel@tonic-gate * Insert a lpl into the resource hierarchy and create any additional lpls that 23900Sstevel@tonic-gate * are necessary to represent the varying states of locality for the cpu 23910Sstevel@tonic-gate * resoruces newly added to the partition. 
23920Sstevel@tonic-gate * 23930Sstevel@tonic-gate * This routine is clever enough that it can correctly add resources from the 23940Sstevel@tonic-gate * new leaf into both direct and indirect resource sets in the hierarchy. (Ie, 23950Sstevel@tonic-gate * those for which the lpl is a leaf as opposed to simply a named equally local 23960Sstevel@tonic-gate * resource). The one special case that needs additional processing is when a 23970Sstevel@tonic-gate * new intermediate lpl is introduced. Since the main loop only traverses 23980Sstevel@tonic-gate * looking to add the leaf resource where it does not yet exist, additional work 23990Sstevel@tonic-gate * is necessary to add other leaf resources that may need to exist in the newly 24000Sstevel@tonic-gate * created intermediate. This is performed by the second inner loop, and is 24010Sstevel@tonic-gate * only done when the check for more than one overlapping resource succeeds. 24020Sstevel@tonic-gate */ 24030Sstevel@tonic-gate 24040Sstevel@tonic-gate void 24050Sstevel@tonic-gate lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart) 24060Sstevel@tonic-gate { 24070Sstevel@tonic-gate int i; 24080Sstevel@tonic-gate int j; 24090Sstevel@tonic-gate int hint; 24100Sstevel@tonic-gate int rset_num_intersect; 24110Sstevel@tonic-gate lgrp_t *lgrp_cur; 24120Sstevel@tonic-gate lpl_t *lpl_cur; 24130Sstevel@tonic-gate lpl_t *lpl_parent; 24141892Sesaxe lgrp_id_t parent_id; 24150Sstevel@tonic-gate klgrpset_t rset_intersect; /* resources in cpupart and lgrp */ 24160Sstevel@tonic-gate 24170Sstevel@tonic-gate for (i = 0; i <= lgrp_alloc_max; i++) { 24180Sstevel@tonic-gate lgrp_cur = lgrp_table[i]; 24190Sstevel@tonic-gate 24200Sstevel@tonic-gate /* 24210Sstevel@tonic-gate * Don't insert if the lgrp isn't there, if the leaf isn't 24220Sstevel@tonic-gate * contained within the current lgrp, or if the current lgrp has 24230Sstevel@tonic-gate * no leaves in this partition 24240Sstevel@tonic-gate */ 24250Sstevel@tonic-gate 24260Sstevel@tonic-gate if (!LGRP_EXISTS(lgrp_cur) || 24270Sstevel@tonic-gate !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 24280Sstevel@tonic-gate lpl_leaf->lpl_lgrpid) || 24290Sstevel@tonic-gate !klgrpset_intersects(lgrp_cur->lgrp_leaves, 24300Sstevel@tonic-gate cpupart->cp_lgrpset)) 24310Sstevel@tonic-gate continue; 24320Sstevel@tonic-gate 24330Sstevel@tonic-gate lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 24340Sstevel@tonic-gate if (lgrp_cur->lgrp_parent != NULL) { 24350Sstevel@tonic-gate /* if lgrp has a parent, assign it properly */ 24360Sstevel@tonic-gate parent_id = lgrp_cur->lgrp_parent->lgrp_id; 24370Sstevel@tonic-gate lpl_parent = &cpupart->cp_lgrploads[parent_id]; 24380Sstevel@tonic-gate } else { 24390Sstevel@tonic-gate /* if not, make sure parent ptr gets set to null */ 24400Sstevel@tonic-gate lpl_parent = NULL; 24410Sstevel@tonic-gate } 24420Sstevel@tonic-gate 24430Sstevel@tonic-gate if (lpl_cur == lpl_leaf) { 24440Sstevel@tonic-gate /* 24450Sstevel@tonic-gate * Almost all leaf state was initialized elsewhere. The 24460Sstevel@tonic-gate * only thing left to do is to set the parent. 24470Sstevel@tonic-gate */ 24480Sstevel@tonic-gate lpl_cur->lpl_parent = lpl_parent; 24490Sstevel@tonic-gate continue; 24500Sstevel@tonic-gate } 24510Sstevel@tonic-gate 24520Sstevel@tonic-gate /* 24530Sstevel@tonic-gate * Initialize intermediate lpl 24540Sstevel@tonic-gate * Save this lpl's hint though. 
Since we're changing this 24550Sstevel@tonic-gate * lpl's resources, we need to update the hint in this lpl's 24560Sstevel@tonic-gate * children, but the hint in this lpl is unaffected and 24570Sstevel@tonic-gate * should be preserved. 24580Sstevel@tonic-gate */ 24590Sstevel@tonic-gate hint = lpl_cur->lpl_hint; 24600Sstevel@tonic-gate 24610Sstevel@tonic-gate lpl_clear(lpl_cur); 24620Sstevel@tonic-gate lpl_init(lpl_cur, lpl_leaf, lgrp_cur); 24630Sstevel@tonic-gate 24640Sstevel@tonic-gate lpl_cur->lpl_hint = hint; 24650Sstevel@tonic-gate lpl_cur->lpl_parent = lpl_parent; 24660Sstevel@tonic-gate 24670Sstevel@tonic-gate /* does new lpl need to be populated with other resources? */ 24680Sstevel@tonic-gate rset_intersect = 24690Sstevel@tonic-gate klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 24700Sstevel@tonic-gate cpupart->cp_lgrpset); 24710Sstevel@tonic-gate klgrpset_nlgrps(rset_intersect, rset_num_intersect); 24720Sstevel@tonic-gate 24730Sstevel@tonic-gate if (rset_num_intersect > 1) { 24740Sstevel@tonic-gate /* 24750Sstevel@tonic-gate * If so, figure out what lpls have resources that 24760Sstevel@tonic-gate * intersect this one, and add them. 24770Sstevel@tonic-gate */ 24780Sstevel@tonic-gate for (j = 0; j <= lgrp_alloc_max; j++) { 24790Sstevel@tonic-gate lgrp_t *lgrp_cand; /* candidate lgrp */ 24800Sstevel@tonic-gate lpl_t *lpl_cand; /* candidate lpl */ 24810Sstevel@tonic-gate 24820Sstevel@tonic-gate lgrp_cand = lgrp_table[j]; 24830Sstevel@tonic-gate if (!LGRP_EXISTS(lgrp_cand) || 24840Sstevel@tonic-gate !klgrpset_ismember(rset_intersect, 24850Sstevel@tonic-gate lgrp_cand->lgrp_id)) 24860Sstevel@tonic-gate continue; 24870Sstevel@tonic-gate lpl_cand = 24880Sstevel@tonic-gate &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 24890Sstevel@tonic-gate lpl_rset_add(lpl_cur, lpl_cand); 24900Sstevel@tonic-gate } 24910Sstevel@tonic-gate } 24920Sstevel@tonic-gate /* 24930Sstevel@tonic-gate * This lpl's rset has changed. Update the hint in its 24940Sstevel@tonic-gate * children. 24950Sstevel@tonic-gate */ 24960Sstevel@tonic-gate lpl_child_update(lpl_cur, cpupart); 24970Sstevel@tonic-gate } 24980Sstevel@tonic-gate } 24990Sstevel@tonic-gate 25000Sstevel@tonic-gate /* 25010Sstevel@tonic-gate * remove a lpl from the hierarchy of resources, clearing its state when 25020Sstevel@tonic-gate * finished. If the lpls at the intermediate levels of the hierarchy have no 25030Sstevel@tonic-gate * remaining resources, or no longer name a leaf resource in the cpu-partition, 25040Sstevel@tonic-gate * delete them as well. 25050Sstevel@tonic-gate */ 25060Sstevel@tonic-gate 25070Sstevel@tonic-gate void 25080Sstevel@tonic-gate lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart) 25090Sstevel@tonic-gate { 25100Sstevel@tonic-gate int i; 25110Sstevel@tonic-gate lgrp_t *lgrp_cur; 25120Sstevel@tonic-gate lpl_t *lpl_cur; 25130Sstevel@tonic-gate klgrpset_t leaf_intersect; /* intersection of leaves */ 25140Sstevel@tonic-gate 25150Sstevel@tonic-gate for (i = 0; i <= lgrp_alloc_max; i++) { 25160Sstevel@tonic-gate lgrp_cur = lgrp_table[i]; 25170Sstevel@tonic-gate 25180Sstevel@tonic-gate /* 25190Sstevel@tonic-gate * Don't attempt to remove from lgrps that aren't there, that 25200Sstevel@tonic-gate * don't contain our leaf, or from the leaf itself.
(We do that 25210Sstevel@tonic-gate * later) 25220Sstevel@tonic-gate */ 25230Sstevel@tonic-gate 25240Sstevel@tonic-gate if (!LGRP_EXISTS(lgrp_cur)) 25250Sstevel@tonic-gate continue; 25260Sstevel@tonic-gate 25270Sstevel@tonic-gate lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 25280Sstevel@tonic-gate 25290Sstevel@tonic-gate if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 25300Sstevel@tonic-gate lpl_leaf->lpl_lgrpid) || 25310Sstevel@tonic-gate (lpl_cur == lpl_leaf)) { 25320Sstevel@tonic-gate continue; 25330Sstevel@tonic-gate } 25340Sstevel@tonic-gate 25350Sstevel@tonic-gate /* 25360Sstevel@tonic-gate * This is a slightly sleazy simplification in that we have 25370Sstevel@tonic-gate * already marked the cp_lgrpset as no longer containing the 25380Sstevel@tonic-gate * leaf we've deleted. Any lpls that pass the above checks 25390Sstevel@tonic-gate * based upon lgrp membership but not necessarily cpu-part 25400Sstevel@tonic-gate * membership also get cleared by the checks below. Currently 25410Sstevel@tonic-gate * this is harmless, as the lpls should be empty anyway. 25420Sstevel@tonic-gate * 25430Sstevel@tonic-gate * In particular, we want to preserve lpls that have additional 25440Sstevel@tonic-gate * leaf resources, even though we don't yet have a processor 25450Sstevel@tonic-gate * architecture that represents resources this way. 25460Sstevel@tonic-gate */ 25470Sstevel@tonic-gate 25480Sstevel@tonic-gate leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves, 25490Sstevel@tonic-gate cpupart->cp_lgrpset); 25500Sstevel@tonic-gate 25510Sstevel@tonic-gate lpl_rset_del(lpl_cur, lpl_leaf); 25520Sstevel@tonic-gate if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) { 25530Sstevel@tonic-gate lpl_clear(lpl_cur); 25540Sstevel@tonic-gate } else { 25550Sstevel@tonic-gate /* 25560Sstevel@tonic-gate * Update this lpl's children 25570Sstevel@tonic-gate */ 25580Sstevel@tonic-gate lpl_child_update(lpl_cur, cpupart); 25590Sstevel@tonic-gate } 25600Sstevel@tonic-gate } 25610Sstevel@tonic-gate lpl_clear(lpl_leaf); 25620Sstevel@tonic-gate } 25630Sstevel@tonic-gate 25640Sstevel@tonic-gate /* 25650Sstevel@tonic-gate * add a cpu to a partition in terms of lgrp load avg bookkeeping 25660Sstevel@tonic-gate * 25670Sstevel@tonic-gate * The lpl (cpu partition load average information) is now arranged in a 25680Sstevel@tonic-gate * hierarchical fashion whereby resources that are closest, ie. most local, to 25690Sstevel@tonic-gate * the cpu in question are considered to be leaves in a tree of resources. 25700Sstevel@tonic-gate * There are two general cases for cpu addition: 25710Sstevel@tonic-gate * 25720Sstevel@tonic-gate * 1. A lpl structure that contains resources already in the hierarchy tree. 25730Sstevel@tonic-gate * In this case, all of the associated lpl relationships have been defined, and 25740Sstevel@tonic-gate * all that is necessary is that we link the new cpu into the per-lpl list of 25750Sstevel@tonic-gate * cpus, and increment the ncpu count of all places where this cpu resource will 25760Sstevel@tonic-gate * be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer 25770Sstevel@tonic-gate * pushing is accomplished by this routine. 25780Sstevel@tonic-gate * 25790Sstevel@tonic-gate * 2. The lpl to contain the resources in this cpu-partition for this lgrp does 25800Sstevel@tonic-gate * not exist yet.
In this case, it is necessary to build the leaf lpl, and 25810Sstevel@tonic-gate * construct the hierarchy of state necessary to name its more distant 25820Sstevel@tonic-gate * resources, if they should exist. The leaf structure is initialized by this 25830Sstevel@tonic-gate * routine, as is the cpu-partition state for the lgrp membership. This routine 25840Sstevel@tonic-gate * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy 25850Sstevel@tonic-gate * and builds all of the "ancestral" state necessary to identify resources at 25860Sstevel@tonic-gate * differing levels of locality. 25870Sstevel@tonic-gate */ 25880Sstevel@tonic-gate void 25890Sstevel@tonic-gate lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid) 25900Sstevel@tonic-gate { 25910Sstevel@tonic-gate cpupart_t *cpupart; 25920Sstevel@tonic-gate lgrp_t *lgrp_leaf; 25930Sstevel@tonic-gate lpl_t *lpl_leaf; 25940Sstevel@tonic-gate 25950Sstevel@tonic-gate /* called sometimes w/ cpus paused - grab no locks */ 25960Sstevel@tonic-gate ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 25970Sstevel@tonic-gate 25980Sstevel@tonic-gate cpupart = cp->cpu_part; 25990Sstevel@tonic-gate lgrp_leaf = lgrp_table[lgrpid]; 26000Sstevel@tonic-gate 26010Sstevel@tonic-gate /* don't add non-existent lgrp */ 26020Sstevel@tonic-gate ASSERT(LGRP_EXISTS(lgrp_leaf)); 26030Sstevel@tonic-gate lpl_leaf = &cpupart->cp_lgrploads[lgrpid]; 26040Sstevel@tonic-gate cp->cpu_lpl = lpl_leaf; 26050Sstevel@tonic-gate 26060Sstevel@tonic-gate /* only leaf lpls contain cpus */ 26070Sstevel@tonic-gate 26080Sstevel@tonic-gate if (lpl_leaf->lpl_ncpu++ == 0) { 26090Sstevel@tonic-gate lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf); 26100Sstevel@tonic-gate klgrpset_add(cpupart->cp_lgrpset, lgrpid); 26110Sstevel@tonic-gate lpl_leaf_insert(lpl_leaf, cpupart); 26120Sstevel@tonic-gate } else { 26130Sstevel@tonic-gate /* 26140Sstevel@tonic-gate * the lpl should already exist in the parent, so just update 26150Sstevel@tonic-gate * the count of available CPUs 26160Sstevel@tonic-gate */ 26170Sstevel@tonic-gate lpl_cpu_adjcnt(LPL_INCREMENT, cp); 26180Sstevel@tonic-gate } 26190Sstevel@tonic-gate 26200Sstevel@tonic-gate /* link cpu into list of cpus in lpl */ 26210Sstevel@tonic-gate 26220Sstevel@tonic-gate if (lpl_leaf->lpl_cpus) { 26230Sstevel@tonic-gate cp->cpu_next_lpl = lpl_leaf->lpl_cpus; 26240Sstevel@tonic-gate cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl; 26250Sstevel@tonic-gate lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp; 26260Sstevel@tonic-gate lpl_leaf->lpl_cpus->cpu_prev_lpl = cp; 26270Sstevel@tonic-gate } else { 26280Sstevel@tonic-gate /* 26290Sstevel@tonic-gate * We increment ncpu immediately after we create a new leaf 26300Sstevel@tonic-gate * lpl, so assert that ncpu == 1 for the case where we don't 26310Sstevel@tonic-gate * have any cpu pointers yet. 26320Sstevel@tonic-gate */ 26330Sstevel@tonic-gate ASSERT(lpl_leaf->lpl_ncpu == 1); 26340Sstevel@tonic-gate lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp; 26350Sstevel@tonic-gate } 26360Sstevel@tonic-gate 26370Sstevel@tonic-gate } 26380Sstevel@tonic-gate 26390Sstevel@tonic-gate 26400Sstevel@tonic-gate /* 26410Sstevel@tonic-gate * remove a cpu from a partition in terms of lgrp load avg bookkeeping 26420Sstevel@tonic-gate * 26430Sstevel@tonic-gate * The lpl (cpu partition load average information) is now arranged in a 26440Sstevel@tonic-gate * hierarchical fashion whereby resources that are closest, ie.
most local, to 26450Sstevel@tonic-gate * the cpu in question are considered to be leaves in a tree of resources. 26460Sstevel@tonic-gate * There are two removal cases in question: 26470Sstevel@tonic-gate * 26480Sstevel@tonic-gate * 1. Removal of the resource in the leaf leaves other resources remaining in 26490Sstevel@tonic-gate * that leaf. (Another cpu still exists at this level of locality). In this 26500Sstevel@tonic-gate * case, the count of available cpus is decremented in all associated lpls by 26510Sstevel@tonic-gate * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned 26520Sstevel@tonic-gate * from the per-cpu lpl list. 26530Sstevel@tonic-gate * 26540Sstevel@tonic-gate * 2. Removal of the resource results in the lpl containing no resources. (It's 26550Sstevel@tonic-gate * empty) In this case, all of what has occurred for the first step must take 26560Sstevel@tonic-gate * place; however, additionally we must remove the lpl structure itself, prune 26570Sstevel@tonic-gate * out any stranded lpls that do not directly name a leaf resource, and mark the 26580Sstevel@tonic-gate * cpu partition in question as no longer containing resources from the lgrp of 26590Sstevel@tonic-gate * the lpl that has been deleted. Cpu-partition changes are handled by this 26600Sstevel@tonic-gate * method, but the lpl_leaf_remove function deals with the details of pruning 26610Sstevel@tonic-gate * out the empty lpl and any of its orphaned direct ancestors. 26620Sstevel@tonic-gate */ 26630Sstevel@tonic-gate void 26640Sstevel@tonic-gate lgrp_part_del_cpu(cpu_t *cp) 26650Sstevel@tonic-gate { 26660Sstevel@tonic-gate lpl_t *lpl; 26670Sstevel@tonic-gate lpl_t *leaf_lpl; 26680Sstevel@tonic-gate lgrp_t *lgrp_leaf; 26690Sstevel@tonic-gate 26700Sstevel@tonic-gate /* called sometimes w/ cpus paused - grab no locks */ 26710Sstevel@tonic-gate 26720Sstevel@tonic-gate ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 26730Sstevel@tonic-gate 26740Sstevel@tonic-gate lpl = leaf_lpl = cp->cpu_lpl; 26750Sstevel@tonic-gate lgrp_leaf = leaf_lpl->lpl_lgrp; 26760Sstevel@tonic-gate 26770Sstevel@tonic-gate /* don't delete a leaf that isn't there */ 26780Sstevel@tonic-gate ASSERT(LGRP_EXISTS(lgrp_leaf)); 26790Sstevel@tonic-gate 26800Sstevel@tonic-gate /* no double-deletes */ 26810Sstevel@tonic-gate ASSERT(lpl->lpl_ncpu); 26820Sstevel@tonic-gate if (--lpl->lpl_ncpu == 0) { 26830Sstevel@tonic-gate /* 26840Sstevel@tonic-gate * This was the last cpu in this lgroup for this partition, 26850Sstevel@tonic-gate * clear its bit in the partition's lgroup bitmask 26860Sstevel@tonic-gate */ 26870Sstevel@tonic-gate klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid); 26880Sstevel@tonic-gate 26890Sstevel@tonic-gate /* eliminate remaining lpl link pointers in cpu, lpl */ 26900Sstevel@tonic-gate lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL; 26910Sstevel@tonic-gate 26920Sstevel@tonic-gate lpl_leaf_remove(leaf_lpl, cp->cpu_part); 26930Sstevel@tonic-gate } else { 26940Sstevel@tonic-gate 26950Sstevel@tonic-gate /* unlink cpu from lists of cpus in lpl */ 26960Sstevel@tonic-gate cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl; 26970Sstevel@tonic-gate cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl; 26980Sstevel@tonic-gate if (lpl->lpl_cpus == cp) { 26990Sstevel@tonic-gate lpl->lpl_cpus = cp->cpu_next_lpl; 27000Sstevel@tonic-gate } 27010Sstevel@tonic-gate 27020Sstevel@tonic-gate /* 27030Sstevel@tonic-gate * Update the cpu count in the lpls associated with parent 27040Sstevel@tonic-gate * lgroups.
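 * (lpl_cpu_adjcnt() is expected to walk from the leaf up through its
 * ancestors, adjusting lpl_ncpu at each level; here it is asked to
 * decrement.)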
27050Sstevel@tonic-gate */ 27060Sstevel@tonic-gate lpl_cpu_adjcnt(LPL_DECREMENT, cp); 27070Sstevel@tonic-gate 27080Sstevel@tonic-gate } 27090Sstevel@tonic-gate /* clear cpu's lpl ptr when we're all done */ 27100Sstevel@tonic-gate cp->cpu_lpl = NULL; 27110Sstevel@tonic-gate } 27120Sstevel@tonic-gate 27130Sstevel@tonic-gate /* 27140Sstevel@tonic-gate * Recompute load average for the specified partition/lgrp fragment. 27150Sstevel@tonic-gate * 27160Sstevel@tonic-gate * We rely on the fact that this routine is called from the clock thread 27170Sstevel@tonic-gate * at a point before the clock thread can block (i.e. before its first 27180Sstevel@tonic-gate * lock request). Since the clock thread cannot be preempted (since it 27190Sstevel@tonic-gate * runs at highest priority), we know that cpu partitions cannot change 27200Sstevel@tonic-gate * (since doing so would require either the repartition requester or the 27210Sstevel@tonic-gate * cpu_pause thread to run on this cpu), so we can update the cpu's load 27220Sstevel@tonic-gate * without grabbing cpu_lock. 27230Sstevel@tonic-gate */ 27240Sstevel@tonic-gate void 27250Sstevel@tonic-gate lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag) 27260Sstevel@tonic-gate { 27270Sstevel@tonic-gate uint_t ncpu; 27280Sstevel@tonic-gate int64_t old, new, f; 27290Sstevel@tonic-gate 27300Sstevel@tonic-gate /* 27310Sstevel@tonic-gate * 1 - exp(-1/(20 * ncpu)) << 16 = 3196 for 1 cpu... 27320Sstevel@tonic-gate */ 27330Sstevel@tonic-gate static short expval[] = { 27340Sstevel@tonic-gate 0, 3196, 1618, 1083, 27350Sstevel@tonic-gate 814, 652, 543, 466, 27360Sstevel@tonic-gate 408, 363, 326, 297, 27370Sstevel@tonic-gate 272, 251, 233, 218, 27380Sstevel@tonic-gate 204, 192, 181, 172, 27390Sstevel@tonic-gate 163, 155, 148, 142, 27400Sstevel@tonic-gate 136, 130, 125, 121, 27410Sstevel@tonic-gate 116, 112, 109, 105 27420Sstevel@tonic-gate }; 27430Sstevel@tonic-gate 27440Sstevel@tonic-gate /* ASSERT (called from clock level) */ 27450Sstevel@tonic-gate 27460Sstevel@tonic-gate if ((lpl == NULL) || /* we're booting - this is easiest for now */ 27470Sstevel@tonic-gate ((ncpu = lpl->lpl_ncpu) == 0)) { 27480Sstevel@tonic-gate return; 27490Sstevel@tonic-gate } 27500Sstevel@tonic-gate 27510Sstevel@tonic-gate for (;;) { 27520Sstevel@tonic-gate 27530Sstevel@tonic-gate if (ncpu >= sizeof (expval) / sizeof (expval[0])) 27540Sstevel@tonic-gate f = expval[1]/ncpu; /* good approx. for large ncpu */ 27550Sstevel@tonic-gate else 27560Sstevel@tonic-gate f = expval[ncpu]; 27570Sstevel@tonic-gate 27580Sstevel@tonic-gate /* 27590Sstevel@tonic-gate * Modify the load average atomically to avoid losing 27600Sstevel@tonic-gate * anticipatory load updates (see lgrp_move_thread()). 27610Sstevel@tonic-gate */ 27620Sstevel@tonic-gate if (ageflag) { 27630Sstevel@tonic-gate /* 27640Sstevel@tonic-gate * We're supposed to both update and age the load. 27650Sstevel@tonic-gate * This happens 10 times/sec. per cpu. We do a 27660Sstevel@tonic-gate * little hoop-jumping to avoid integer overflow.
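 *
 * As an illustrative sketch of the arithmetic below: q + r/2^16
 * works out to old/2^9, so the update amounts to
 *
 *	new = old + (f/2^16) * (nrcpus * 2^9 - old)
 *
 * i.e. the load decays geometrically toward nrcpus << 9, with a
 * per-tick coefficient of f/2^16 = 1 - exp(-1/(20 * ncpu)).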
27670Sstevel@tonic-gate */ 27680Sstevel@tonic-gate int64_t q, r; 27690Sstevel@tonic-gate 27700Sstevel@tonic-gate do { 27710Sstevel@tonic-gate old = new = lpl->lpl_loadavg; 27720Sstevel@tonic-gate q = (old >> 16) << 7; 27730Sstevel@tonic-gate r = (old & 0xffff) << 7; 27740Sstevel@tonic-gate new += ((long long)(nrcpus - q) * f - 27750Sstevel@tonic-gate ((r * f) >> 16)) >> 7; 27760Sstevel@tonic-gate 27770Sstevel@tonic-gate /* 27780Sstevel@tonic-gate * Check for overflow 27790Sstevel@tonic-gate */ 27800Sstevel@tonic-gate if (new > LGRP_LOADAVG_MAX) 27810Sstevel@tonic-gate new = LGRP_LOADAVG_MAX; 27820Sstevel@tonic-gate else if (new < 0) 27830Sstevel@tonic-gate new = 0; 27840Sstevel@tonic-gate } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 27850Sstevel@tonic-gate new) != old); 27860Sstevel@tonic-gate } else { 27870Sstevel@tonic-gate /* 27880Sstevel@tonic-gate * We're supposed to update the load, but not age it. 27890Sstevel@tonic-gate * This option is used to update the load (which either 27900Sstevel@tonic-gate * has already been aged in this 1/10 sec. interval or 27910Sstevel@tonic-gate * soon will be) to account for a remotely executing 27920Sstevel@tonic-gate * thread. 27930Sstevel@tonic-gate */ 27940Sstevel@tonic-gate do { 27950Sstevel@tonic-gate old = new = lpl->lpl_loadavg; 27960Sstevel@tonic-gate new += f; 27970Sstevel@tonic-gate /* 27980Sstevel@tonic-gate * Check for overflow 27990Sstevel@tonic-gate * Underflow not possible here 28000Sstevel@tonic-gate */ 28010Sstevel@tonic-gate if (new < old) 28020Sstevel@tonic-gate new = LGRP_LOADAVG_MAX; 28030Sstevel@tonic-gate } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 28040Sstevel@tonic-gate new) != old); 28050Sstevel@tonic-gate } 28060Sstevel@tonic-gate 28070Sstevel@tonic-gate /* 28080Sstevel@tonic-gate * Do the same for this lpl's parent 28090Sstevel@tonic-gate */ 28100Sstevel@tonic-gate if ((lpl = lpl->lpl_parent) == NULL) 28110Sstevel@tonic-gate break; 28120Sstevel@tonic-gate ncpu = lpl->lpl_ncpu; 28130Sstevel@tonic-gate } 28140Sstevel@tonic-gate } 28150Sstevel@tonic-gate 28160Sstevel@tonic-gate /* 28170Sstevel@tonic-gate * Initialize lpl topology in the target based on topology currently present in 28180Sstevel@tonic-gate * lpl_bootstrap. 28190Sstevel@tonic-gate * 28200Sstevel@tonic-gate * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to 28210Sstevel@tonic-gate * initialize cp_default list of lpls. Up to this point all topology operations 28220Sstevel@tonic-gate * were performed using lpl_bootstrap. Now cp_default has its own list of lpls 28230Sstevel@tonic-gate * and all subsequent lpl operations should use it instead of lpl_bootstrap. The 28240Sstevel@tonic-gate * `target' points to the list of lpls in cp_default and `size' is the size of 28250Sstevel@tonic-gate * this list. 28260Sstevel@tonic-gate * 28270Sstevel@tonic-gate * This function walks the lpl topology in lpl_bootstrap and does four things: 28280Sstevel@tonic-gate * 28290Sstevel@tonic-gate * 1) Copies all fields from lpl_bootstrap to the target. 28300Sstevel@tonic-gate * 28310Sstevel@tonic-gate * 2) Sets CPU0 lpl pointer to the correct element of the target list. 28320Sstevel@tonic-gate * 28330Sstevel@tonic-gate * 3) Updates lpl_parent pointers to point to the lpls in the target list 28340Sstevel@tonic-gate * instead of lpl_bootstrap. 28350Sstevel@tonic-gate * 28360Sstevel@tonic-gate * 4) Updates pointers in the resource list of the target to point to the lpls 28370Sstevel@tonic-gate * in the target list instead of lpl_bootstrap.
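 *
 * (Steps 3 and 4 are both instances of rebasing a pointer from one
 * array into the other; illustratively,
 *
 *	new = (lpl_t *)((uintptr_t)old - (uintptr_t)lpl_bootstrap +
 *	    (uintptr_t)target);
 *
 * which is the arithmetic the loop body below performs.)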
28380Sstevel@tonic-gate * 28390Sstevel@tonic-gate * After lpl_topo_bootstrap() completes, target contains the same information 28400Sstevel@tonic-gate * that would be present there if it were used during boot instead of 28410Sstevel@tonic-gate * lpl_bootstrap. There is no need for the information in lpl_bootstrap after 28420Sstevel@tonic-gate * this, and it is bzeroed. 28430Sstevel@tonic-gate */ 28440Sstevel@tonic-gate void 28450Sstevel@tonic-gate lpl_topo_bootstrap(lpl_t *target, int size) 28460Sstevel@tonic-gate { 28470Sstevel@tonic-gate lpl_t *lpl = lpl_bootstrap; 28480Sstevel@tonic-gate lpl_t *target_lpl = target; 28490Sstevel@tonic-gate int howmany; 28500Sstevel@tonic-gate int id; 28510Sstevel@tonic-gate int i; 28520Sstevel@tonic-gate 28530Sstevel@tonic-gate /* 28540Sstevel@tonic-gate * The only target that should be passed here is the cp_default lpl list. 28550Sstevel@tonic-gate */ 28560Sstevel@tonic-gate ASSERT(target == cp_default.cp_lgrploads); 28570Sstevel@tonic-gate ASSERT(size == cp_default.cp_nlgrploads); 28580Sstevel@tonic-gate ASSERT(!lgrp_topo_initialized); 28590Sstevel@tonic-gate ASSERT(ncpus == 1); 28600Sstevel@tonic-gate 28610Sstevel@tonic-gate howmany = MIN(LPL_BOOTSTRAP_SIZE, size); 28620Sstevel@tonic-gate for (i = 0; i < howmany; i++, lpl++, target_lpl++) { 28630Sstevel@tonic-gate /* 28640Sstevel@tonic-gate * Copy all fields from lpl. 28650Sstevel@tonic-gate */ 28660Sstevel@tonic-gate 28670Sstevel@tonic-gate *target_lpl = *lpl; 28680Sstevel@tonic-gate 28690Sstevel@tonic-gate /* 28700Sstevel@tonic-gate * Substitute CPU0 lpl pointer with one relative to target. 28710Sstevel@tonic-gate */ 28720Sstevel@tonic-gate if (lpl->lpl_cpus == CPU) { 28730Sstevel@tonic-gate ASSERT(CPU->cpu_lpl == lpl); 28740Sstevel@tonic-gate CPU->cpu_lpl = target_lpl; 28750Sstevel@tonic-gate } 28760Sstevel@tonic-gate 28770Sstevel@tonic-gate /* 28780Sstevel@tonic-gate * Substitute parent information with parent relative to target. 28790Sstevel@tonic-gate */ 28800Sstevel@tonic-gate if (lpl->lpl_parent != NULL) 28810Sstevel@tonic-gate target_lpl->lpl_parent = (lpl_t *) 28820Sstevel@tonic-gate (((uintptr_t)lpl->lpl_parent - 28830Sstevel@tonic-gate (uintptr_t)lpl_bootstrap) + 28840Sstevel@tonic-gate (uintptr_t)target); 28850Sstevel@tonic-gate 28860Sstevel@tonic-gate /* 28870Sstevel@tonic-gate * Walk over resource set substituting pointers relative to 28880Sstevel@tonic-gate * lpl_bootstrap with pointers relative to target. 28890Sstevel@tonic-gate */ 28900Sstevel@tonic-gate ASSERT(lpl->lpl_nrset <= 1); 28910Sstevel@tonic-gate 28920Sstevel@tonic-gate for (id = 0; id < lpl->lpl_nrset; id++) { 28930Sstevel@tonic-gate if (lpl->lpl_rset[id] != NULL) { 28940Sstevel@tonic-gate target_lpl->lpl_rset[id] = 28950Sstevel@tonic-gate (lpl_t *) 28960Sstevel@tonic-gate (((uintptr_t)lpl->lpl_rset[id] - 28970Sstevel@tonic-gate (uintptr_t)lpl_bootstrap) + 28980Sstevel@tonic-gate (uintptr_t)target); 28990Sstevel@tonic-gate } 29000Sstevel@tonic-gate } 29010Sstevel@tonic-gate } 29020Sstevel@tonic-gate 29030Sstevel@tonic-gate /* 29040Sstevel@tonic-gate * Topology information in lpl_bootstrap is no longer needed.
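 * Zeroing it means that any stale reference into lpl_bootstrap will
 * see NULL parents and empty rsets rather than dangling pointers into
 * the old topology.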
29050Sstevel@tonic-gate */ 29060Sstevel@tonic-gate bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list)); 29070Sstevel@tonic-gate } 29080Sstevel@tonic-gate 29090Sstevel@tonic-gate /* 29100Sstevel@tonic-gate * If the lowest load among the lgroups a process' threads are currently 29110Sstevel@tonic-gate * spread across is greater than lgrp_expand_proc_thresh, we'll consider 29120Sstevel@tonic-gate * expanding the process to a new lgroup. 29130Sstevel@tonic-gate */ 29140Sstevel@tonic-gate #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250 29150Sstevel@tonic-gate lgrp_load_t lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT; 29160Sstevel@tonic-gate 29170Sstevel@tonic-gate #define LGRP_EXPAND_PROC_THRESH(ncpu) \ 29180Sstevel@tonic-gate ((lgrp_expand_proc_thresh) / (ncpu)) 29190Sstevel@tonic-gate 29200Sstevel@tonic-gate /* 29210Sstevel@tonic-gate * A process will be expanded to a new lgroup only if the difference between 29220Sstevel@tonic-gate * the lowest load on the lgroups the process' threads are currently spread 29230Sstevel@tonic-gate * across and the lowest load on the other lgroups in the process' partition 29240Sstevel@tonic-gate * is greater than lgrp_expand_proc_diff. 29250Sstevel@tonic-gate */ 29260Sstevel@tonic-gate #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000 29270Sstevel@tonic-gate lgrp_load_t lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT; 29280Sstevel@tonic-gate 29290Sstevel@tonic-gate #define LGRP_EXPAND_PROC_DIFF(ncpu) \ 29300Sstevel@tonic-gate ((lgrp_expand_proc_diff) / (ncpu)) 29310Sstevel@tonic-gate 29320Sstevel@tonic-gate /* 29330Sstevel@tonic-gate * The loadavg tolerance accounts for "noise" inherent in the load, which may 29340Sstevel@tonic-gate * be present due to impreciseness of the load average decay algorithm. 29350Sstevel@tonic-gate * 29360Sstevel@tonic-gate * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable 29370Sstevel@tonic-gate * tolerance is scaled by the number of cpus in the lgroup just like 29380Sstevel@tonic-gate * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000, 29390Sstevel@tonic-gate * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads 29400Sstevel@tonic-gate * of: 0x10000 / 4 => 0x4000 or greater to be significant.
29410Sstevel@tonic-gate */ 29420Sstevel@tonic-gate uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX; 29430Sstevel@tonic-gate #define LGRP_LOADAVG_TOLERANCE(ncpu) \ 29440Sstevel@tonic-gate ((lgrp_loadavg_tolerance) / ncpu) 29450Sstevel@tonic-gate 29460Sstevel@tonic-gate /* 29470Sstevel@tonic-gate * lgrp_choose() will choose root lgroup as home when lowest lgroup load 29480Sstevel@tonic-gate * average is above this threshold 29490Sstevel@tonic-gate */ 29500Sstevel@tonic-gate uint32_t lgrp_load_thresh = UINT32_MAX; 29510Sstevel@tonic-gate 29520Sstevel@tonic-gate /* 29530Sstevel@tonic-gate * lgrp_choose() will try to skip any lgroups with less free memory 29540Sstevel@tonic-gate * than this when choosing a home lgroup 29550Sstevel@tonic-gate */ 29560Sstevel@tonic-gate pgcnt_t lgrp_mem_free_thresh = 0; 29570Sstevel@tonic-gate 29580Sstevel@tonic-gate /* 29590Sstevel@tonic-gate * When choosing between similarly loaded lgroups, lgrp_choose() will pick 29600Sstevel@tonic-gate * one based on one of the following policies: 29610Sstevel@tonic-gate * - Random selection 29620Sstevel@tonic-gate * - Pseudo round robin placement 29630Sstevel@tonic-gate * - Longest time since a thread was last placed 29640Sstevel@tonic-gate */ 29650Sstevel@tonic-gate #define LGRP_CHOOSE_RANDOM 1 29660Sstevel@tonic-gate #define LGRP_CHOOSE_RR 2 29670Sstevel@tonic-gate #define LGRP_CHOOSE_TIME 3 29680Sstevel@tonic-gate 29690Sstevel@tonic-gate int lgrp_choose_policy = LGRP_CHOOSE_TIME; 29700Sstevel@tonic-gate 29710Sstevel@tonic-gate /* 29720Sstevel@tonic-gate * Choose a suitable leaf lgroup for a kthread. The kthread is assumed not to 29730Sstevel@tonic-gate * be bound to a CPU or processor set. 29740Sstevel@tonic-gate * 29750Sstevel@tonic-gate * Arguments: 29760Sstevel@tonic-gate * t The thread 29770Sstevel@tonic-gate * cpupart The partition the thread belongs to. 29780Sstevel@tonic-gate * 29790Sstevel@tonic-gate * NOTE: Should at least be called with the cpu_lock held, kernel preemption 29800Sstevel@tonic-gate * disabled, or thread_lock held (at splhigh) to protect against the CPU 29810Sstevel@tonic-gate * partitions changing out from under us and assumes that given thread is 29820Sstevel@tonic-gate * protected. Also, called sometimes w/ cpus paused or kernel preemption 29830Sstevel@tonic-gate * disabled, so don't grab any locks because we should never block under 29840Sstevel@tonic-gate * those conditions.
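 *
 * An illustrative (hypothetical) call sequence, for exposition only:
 *
 *	thread_lock(t);
 *	lpl = lgrp_choose(t, t->t_cpupart);
 *	lgrp_move_thread(t, lpl, 1);
 *	thread_unlock(t);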
29850Sstevel@tonic-gate */ 29860Sstevel@tonic-gate lpl_t * 29870Sstevel@tonic-gate lgrp_choose(kthread_t *t, cpupart_t *cpupart) 29880Sstevel@tonic-gate { 29890Sstevel@tonic-gate lgrp_load_t bestload, bestrload; 29900Sstevel@tonic-gate int lgrpid_offset, lgrp_count; 29910Sstevel@tonic-gate lgrp_id_t lgrpid, lgrpid_start; 29920Sstevel@tonic-gate lpl_t *lpl, *bestlpl, *bestrlpl; 29930Sstevel@tonic-gate klgrpset_t lgrpset; 29940Sstevel@tonic-gate proc_t *p; 29950Sstevel@tonic-gate 29960Sstevel@tonic-gate ASSERT(t != NULL); 29970Sstevel@tonic-gate ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 29980Sstevel@tonic-gate THREAD_LOCK_HELD(t)); 29990Sstevel@tonic-gate ASSERT(cpupart != NULL); 30000Sstevel@tonic-gate 30010Sstevel@tonic-gate p = t->t_procp; 30020Sstevel@tonic-gate 30030Sstevel@tonic-gate /* A process should always be in an active partition */ 30040Sstevel@tonic-gate ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset)); 30050Sstevel@tonic-gate 30060Sstevel@tonic-gate bestlpl = bestrlpl = NULL; 30070Sstevel@tonic-gate bestload = bestrload = LGRP_LOADAVG_MAX; 30080Sstevel@tonic-gate lgrpset = cpupart->cp_lgrpset; 30090Sstevel@tonic-gate 30100Sstevel@tonic-gate switch (lgrp_choose_policy) { 30110Sstevel@tonic-gate case LGRP_CHOOSE_RR: 30120Sstevel@tonic-gate lgrpid = cpupart->cp_lgrp_hint; 30130Sstevel@tonic-gate do { 30140Sstevel@tonic-gate if (++lgrpid > lgrp_alloc_max) 30150Sstevel@tonic-gate lgrpid = 0; 30160Sstevel@tonic-gate } while (!klgrpset_ismember(lgrpset, lgrpid)); 30170Sstevel@tonic-gate 30180Sstevel@tonic-gate break; 30190Sstevel@tonic-gate default: 30200Sstevel@tonic-gate case LGRP_CHOOSE_TIME: 30210Sstevel@tonic-gate case LGRP_CHOOSE_RANDOM: 30220Sstevel@tonic-gate klgrpset_nlgrps(lgrpset, lgrp_count); 30230Sstevel@tonic-gate lgrpid_offset = 30240Sstevel@tonic-gate (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1; 30250Sstevel@tonic-gate for (lgrpid = 0; ; lgrpid++) { 30260Sstevel@tonic-gate if (klgrpset_ismember(lgrpset, lgrpid)) { 30270Sstevel@tonic-gate if (--lgrpid_offset == 0) 30280Sstevel@tonic-gate break; 30290Sstevel@tonic-gate } 30300Sstevel@tonic-gate } 30310Sstevel@tonic-gate break; 30320Sstevel@tonic-gate } 30330Sstevel@tonic-gate 30340Sstevel@tonic-gate lgrpid_start = lgrpid; 30350Sstevel@tonic-gate 30360Sstevel@tonic-gate DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start, 30370Sstevel@tonic-gate lgrp_id_t, cpupart->cp_lgrp_hint); 30380Sstevel@tonic-gate 30390Sstevel@tonic-gate /* 30400Sstevel@tonic-gate * Use lgroup affinities (if any) to choose best lgroup 30410Sstevel@tonic-gate * 30420Sstevel@tonic-gate * NOTE: Assumes that thread is protected from going away and its 30430Sstevel@tonic-gate * lgroup affinities won't change (ie. 
p_lock, or 30440Sstevel@tonic-gate * thread_lock() being held and/or CPUs paused) 30450Sstevel@tonic-gate */ 30460Sstevel@tonic-gate if (t->t_lgrp_affinity) { 30472988Sjjc lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE); 30480Sstevel@tonic-gate if (lpl != NULL) 30490Sstevel@tonic-gate return (lpl); 30500Sstevel@tonic-gate } 30510Sstevel@tonic-gate 30520Sstevel@tonic-gate ASSERT(klgrpset_ismember(lgrpset, lgrpid_start)); 30530Sstevel@tonic-gate 30540Sstevel@tonic-gate do { 30550Sstevel@tonic-gate pgcnt_t npgs; 30560Sstevel@tonic-gate 30570Sstevel@tonic-gate /* 30580Sstevel@tonic-gate * Skip any lgroups outside of thread's pset 30590Sstevel@tonic-gate */ 30600Sstevel@tonic-gate if (!klgrpset_ismember(lgrpset, lgrpid)) { 30610Sstevel@tonic-gate if (++lgrpid > lgrp_alloc_max) 30620Sstevel@tonic-gate lgrpid = 0; /* wrap the search */ 30630Sstevel@tonic-gate continue; 30640Sstevel@tonic-gate } 30650Sstevel@tonic-gate 30660Sstevel@tonic-gate /* 30670Sstevel@tonic-gate * Skip any non-leaf lgroups 30680Sstevel@tonic-gate */ 30690Sstevel@tonic-gate if (lgrp_table[lgrpid]->lgrp_childcnt != 0) 30700Sstevel@tonic-gate continue; 30710Sstevel@tonic-gate 30720Sstevel@tonic-gate /* 30730Sstevel@tonic-gate * Skip any lgroups without enough free memory 30740Sstevel@tonic-gate * (when threshold set to nonzero positive value) 30750Sstevel@tonic-gate */ 30760Sstevel@tonic-gate if (lgrp_mem_free_thresh > 0) { 30770Sstevel@tonic-gate npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 30780Sstevel@tonic-gate if (npgs < lgrp_mem_free_thresh) { 30790Sstevel@tonic-gate if (++lgrpid > lgrp_alloc_max) 30800Sstevel@tonic-gate lgrpid = 0; /* wrap the search */ 30810Sstevel@tonic-gate continue; 30820Sstevel@tonic-gate } 30830Sstevel@tonic-gate } 30840Sstevel@tonic-gate 30850Sstevel@tonic-gate lpl = &cpupart->cp_lgrploads[lgrpid]; 30860Sstevel@tonic-gate if (klgrpset_isempty(p->p_lgrpset) || 30870Sstevel@tonic-gate klgrpset_ismember(p->p_lgrpset, lgrpid)) { 30880Sstevel@tonic-gate /* 30890Sstevel@tonic-gate * Either this is a new process or the process already 30900Sstevel@tonic-gate * has threads on this lgrp, so this is a preferred 30910Sstevel@tonic-gate * lgroup for the thread. 30920Sstevel@tonic-gate */ 30931892Sesaxe if (bestlpl == NULL || 30941892Sesaxe lpl_pick(lpl, bestlpl)) { 30950Sstevel@tonic-gate bestload = lpl->lpl_loadavg; 30960Sstevel@tonic-gate bestlpl = lpl; 30970Sstevel@tonic-gate } 30980Sstevel@tonic-gate } else { 30990Sstevel@tonic-gate /* 31000Sstevel@tonic-gate * The process doesn't have any threads on this lgrp, 31010Sstevel@tonic-gate * but we're willing to consider this lgrp if the load 31020Sstevel@tonic-gate * difference is big enough to justify splitting up 31030Sstevel@tonic-gate * the process' threads. 
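 * (What counts as a big enough difference is decided after the scan,
 * using the LGRP_EXPAND_PROC_THRESH() and LGRP_EXPAND_PROC_DIFF()
 * macros defined above.)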
31040Sstevel@tonic-gate */ 31051892Sesaxe if (bestrlpl == NULL || 31061892Sesaxe lpl_pick(lpl, bestrlpl)) { 31070Sstevel@tonic-gate bestrload = lpl->lpl_loadavg; 31080Sstevel@tonic-gate bestrlpl = lpl; 31090Sstevel@tonic-gate } 31100Sstevel@tonic-gate } 31110Sstevel@tonic-gate if (++lgrpid > lgrp_alloc_max) 31120Sstevel@tonic-gate lgrpid = 0; /* wrap the search */ 31130Sstevel@tonic-gate } while (lgrpid != lgrpid_start); 31140Sstevel@tonic-gate 31150Sstevel@tonic-gate /* 31160Sstevel@tonic-gate * Return root lgroup if threshold isn't set to maximum value and 31170Sstevel@tonic-gate * lowest lgroup load average is more than a certain threshold 31180Sstevel@tonic-gate */ 31190Sstevel@tonic-gate if (lgrp_load_thresh != UINT32_MAX && 31200Sstevel@tonic-gate bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh) 31210Sstevel@tonic-gate return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]); 31220Sstevel@tonic-gate 31230Sstevel@tonic-gate /* 31240Sstevel@tonic-gate * If all the lgroups over which the thread's process is spread are 31251892Sesaxe * heavily loaded, or otherwise undesirable, we'll consider placing 31261892Sesaxe * the thread on one of the other leaf lgroups in the thread's 31271892Sesaxe * partition. 31280Sstevel@tonic-gate */ 31291892Sesaxe if ((bestlpl == NULL) || 31301892Sesaxe ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) && 31310Sstevel@tonic-gate (bestrload < bestload) && /* paranoid about wraparound */ 31320Sstevel@tonic-gate (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) < 31331892Sesaxe bestload))) { 31340Sstevel@tonic-gate bestlpl = bestrlpl; 31350Sstevel@tonic-gate } 31360Sstevel@tonic-gate 31371892Sesaxe if (bestlpl == NULL) { 31381892Sesaxe /* 31391892Sesaxe * No lgroup looked particularly good, but we still 31401892Sesaxe * have to pick something. Go with the randomly selected 31411892Sesaxe * legal lgroup we started with above. 31421892Sesaxe */ 31431892Sesaxe bestlpl = &cpupart->cp_lgrploads[lgrpid_start]; 31441892Sesaxe } 31451892Sesaxe 31460Sstevel@tonic-gate cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid; 31470Sstevel@tonic-gate bestlpl->lpl_homed_time = gethrtime_unscaled(); 31480Sstevel@tonic-gate 31490Sstevel@tonic-gate ASSERT(bestlpl->lpl_ncpu > 0); 31500Sstevel@tonic-gate return (bestlpl); 31510Sstevel@tonic-gate } 31520Sstevel@tonic-gate 31530Sstevel@tonic-gate /* 31541892Sesaxe * Decide if lpl1 is a better candidate than lpl2 for lgrp homing. 31551892Sesaxe * Returns non-zero if lpl1 is a better candidate, and 0 otherwise. 31560Sstevel@tonic-gate */ 31570Sstevel@tonic-gate static int 31580Sstevel@tonic-gate lpl_pick(lpl_t *lpl1, lpl_t *lpl2) 31590Sstevel@tonic-gate { 31600Sstevel@tonic-gate lgrp_load_t l1, l2; 31610Sstevel@tonic-gate lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu); 31620Sstevel@tonic-gate 31630Sstevel@tonic-gate l1 = lpl1->lpl_loadavg; 31640Sstevel@tonic-gate l2 = lpl2->lpl_loadavg; 31650Sstevel@tonic-gate 31660Sstevel@tonic-gate if ((l1 + tolerance < l2) && (l1 < l2)) { 31670Sstevel@tonic-gate /* lpl1 is significantly less loaded than lpl2 */ 31680Sstevel@tonic-gate return (1); 31690Sstevel@tonic-gate } 31700Sstevel@tonic-gate 31710Sstevel@tonic-gate if (lgrp_choose_policy == LGRP_CHOOSE_TIME && 31720Sstevel@tonic-gate l1 + tolerance >= l2 && l1 < l2 && 31730Sstevel@tonic-gate lpl1->lpl_homed_time < lpl2->lpl_homed_time) { 31740Sstevel@tonic-gate /* 31750Sstevel@tonic-gate * lpl1's load is within the tolerance of lpl2.
We're 31760Sstevel@tonic-gate * willing to consider it to be better, however, if 31770Sstevel@tonic-gate * it has been longer since we last homed a thread there 31780Sstevel@tonic-gate */ 31790Sstevel@tonic-gate return (1); 31800Sstevel@tonic-gate } 31810Sstevel@tonic-gate 31820Sstevel@tonic-gate return (0); 31830Sstevel@tonic-gate } 31840Sstevel@tonic-gate 31850Sstevel@tonic-gate /* 31860Sstevel@tonic-gate * An LWP is expected to be assigned to an lgroup for at least this long 31870Sstevel@tonic-gate * for its anticipatory load to be justified. NOTE that this value should 31880Sstevel@tonic-gate * not be set extremely huge (say, larger than 100 years), to avoid problems 31890Sstevel@tonic-gate * with overflow in the calculation that uses it. 31900Sstevel@tonic-gate */ 31910Sstevel@tonic-gate #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */ 31920Sstevel@tonic-gate hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC; 31930Sstevel@tonic-gate 31940Sstevel@tonic-gate /* 31950Sstevel@tonic-gate * Routine to change a thread's lgroup affiliation. This routine updates 31960Sstevel@tonic-gate * the thread's kthread_t struct and its process' proc_t struct to note the 31970Sstevel@tonic-gate * thread's new lgroup affiliation, and its lgroup affinities. 31980Sstevel@tonic-gate * 31990Sstevel@tonic-gate * Note that this is the only routine that modifies a thread's t_lpl field, 32000Sstevel@tonic-gate * and that adds in or removes anticipatory load. 32010Sstevel@tonic-gate * 32020Sstevel@tonic-gate * If the thread is exiting, newlpl is NULL. 32030Sstevel@tonic-gate * 32040Sstevel@tonic-gate * Locking: 32050Sstevel@tonic-gate * The following lock must be held on entry: 32060Sstevel@tonic-gate * cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp 32070Sstevel@tonic-gate * doesn't get removed from t's partition 32080Sstevel@tonic-gate * 32090Sstevel@tonic-gate * This routine is not allowed to grab any locks, since it may be called 32100Sstevel@tonic-gate * with cpus paused (such as from cpu_offline). 32110Sstevel@tonic-gate */ 32120Sstevel@tonic-gate void 32130Sstevel@tonic-gate lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete) 32140Sstevel@tonic-gate { 32150Sstevel@tonic-gate proc_t *p; 32160Sstevel@tonic-gate lpl_t *lpl, *oldlpl; 32170Sstevel@tonic-gate lgrp_id_t oldid; 32180Sstevel@tonic-gate kthread_t *tp; 32190Sstevel@tonic-gate uint_t ncpu; 32200Sstevel@tonic-gate lgrp_load_t old, new; 32210Sstevel@tonic-gate 32220Sstevel@tonic-gate ASSERT(t); 32230Sstevel@tonic-gate ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 32240Sstevel@tonic-gate THREAD_LOCK_HELD(t)); 32250Sstevel@tonic-gate 32260Sstevel@tonic-gate /* 32270Sstevel@tonic-gate * If not changing lpls, just return 32280Sstevel@tonic-gate */ 32290Sstevel@tonic-gate if ((oldlpl = t->t_lpl) == newlpl) 32300Sstevel@tonic-gate return; 32310Sstevel@tonic-gate 32320Sstevel@tonic-gate /* 32330Sstevel@tonic-gate * Make sure the thread's lwp hasn't exited (if so, this thread is now 32340Sstevel@tonic-gate * associated with process 0 rather than with its original process).
32350Sstevel@tonic-gate */ 32360Sstevel@tonic-gate if (t->t_proc_flag & TP_LWPEXIT) { 32370Sstevel@tonic-gate if (newlpl != NULL) { 32380Sstevel@tonic-gate t->t_lpl = newlpl; 32390Sstevel@tonic-gate } 32400Sstevel@tonic-gate return; 32410Sstevel@tonic-gate } 32420Sstevel@tonic-gate 32430Sstevel@tonic-gate p = ttoproc(t); 32440Sstevel@tonic-gate 32450Sstevel@tonic-gate /* 32460Sstevel@tonic-gate * If the thread had a previous lgroup, update its process' p_lgrpset 32470Sstevel@tonic-gate * to account for it being moved from its old lgroup. 32480Sstevel@tonic-gate */ 32490Sstevel@tonic-gate if ((oldlpl != NULL) && /* thread had a previous lgroup */ 32500Sstevel@tonic-gate (p->p_tlist != NULL)) { 32510Sstevel@tonic-gate oldid = oldlpl->lpl_lgrpid; 32520Sstevel@tonic-gate 32530Sstevel@tonic-gate if (newlpl != NULL) 32540Sstevel@tonic-gate lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1); 32550Sstevel@tonic-gate 32560Sstevel@tonic-gate if ((do_lgrpset_delete) && 32570Sstevel@tonic-gate (klgrpset_ismember(p->p_lgrpset, oldid))) { 32580Sstevel@tonic-gate for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) { 32590Sstevel@tonic-gate /* 32600Sstevel@tonic-gate * Check if a thread other than the thread 32610Sstevel@tonic-gate * that's moving is assigned to the same 32620Sstevel@tonic-gate * lgroup as the thread that's moving. Note 32630Sstevel@tonic-gate * that we have to compare lgroup IDs, rather 32640Sstevel@tonic-gate * than simply comparing t_lpl's, since the 32650Sstevel@tonic-gate * threads may belong to different partitions 32660Sstevel@tonic-gate * but be assigned to the same lgroup. 32670Sstevel@tonic-gate */ 32680Sstevel@tonic-gate ASSERT(tp->t_lpl != NULL); 32690Sstevel@tonic-gate 32700Sstevel@tonic-gate if ((tp != t) && 32710Sstevel@tonic-gate (tp->t_lpl->lpl_lgrpid == oldid)) { 32720Sstevel@tonic-gate /* 32730Sstevel@tonic-gate * Another thread is assigned to the 32740Sstevel@tonic-gate * same lgroup as the thread that's 32750Sstevel@tonic-gate * moving, p_lgrpset doesn't change. 32760Sstevel@tonic-gate */ 32770Sstevel@tonic-gate break; 32780Sstevel@tonic-gate } else if (tp == p->p_tlist) { 32790Sstevel@tonic-gate /* 32800Sstevel@tonic-gate * No other thread is assigned to the 32810Sstevel@tonic-gate * same lgroup as the exiting thread, 32820Sstevel@tonic-gate * clear the lgroup's bit in p_lgrpset. 32830Sstevel@tonic-gate */ 32840Sstevel@tonic-gate klgrpset_del(p->p_lgrpset, oldid); 32850Sstevel@tonic-gate break; 32860Sstevel@tonic-gate } 32870Sstevel@tonic-gate } 32880Sstevel@tonic-gate } 32890Sstevel@tonic-gate 32900Sstevel@tonic-gate /* 32910Sstevel@tonic-gate * If this thread was assigned to its old lgroup for such a 32920Sstevel@tonic-gate * short amount of time that the anticipatory load that was 32930Sstevel@tonic-gate * added on its behalf has aged very little, remove that 32940Sstevel@tonic-gate * anticipatory load. 
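 * (The loop below does this by walking from the old leaf lpl up to the
 * root, subtracting LGRP_LOADAVG_MAX_EFFECT(ncpu) at each level and
 * clamping the load at zero if it has already decayed below the
 * anticipated amount.)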
32950Sstevel@tonic-gate */ 32960Sstevel@tonic-gate if ((t->t_anttime + lgrp_min_nsec > gethrtime()) && 32970Sstevel@tonic-gate ((ncpu = oldlpl->lpl_ncpu) > 0)) { 32980Sstevel@tonic-gate lpl = oldlpl; 32990Sstevel@tonic-gate for (;;) { 33000Sstevel@tonic-gate do { 33010Sstevel@tonic-gate old = new = lpl->lpl_loadavg; 33020Sstevel@tonic-gate new -= LGRP_LOADAVG_MAX_EFFECT(ncpu); 33030Sstevel@tonic-gate if (new > old) { 33040Sstevel@tonic-gate /* 33050Sstevel@tonic-gate * this can happen if the load 33060Sstevel@tonic-gate * average was aged since we 33070Sstevel@tonic-gate * added in the anticipatory 33080Sstevel@tonic-gate * load 33090Sstevel@tonic-gate */ 33100Sstevel@tonic-gate new = 0; 33110Sstevel@tonic-gate } 33120Sstevel@tonic-gate } while (cas32( 33130Sstevel@tonic-gate (lgrp_load_t *)&lpl->lpl_loadavg, old, 33140Sstevel@tonic-gate new) != old); 33150Sstevel@tonic-gate 33160Sstevel@tonic-gate lpl = lpl->lpl_parent; 33170Sstevel@tonic-gate if (lpl == NULL) 33180Sstevel@tonic-gate break; 33190Sstevel@tonic-gate 33200Sstevel@tonic-gate ncpu = lpl->lpl_ncpu; 33210Sstevel@tonic-gate ASSERT(ncpu > 0); 33220Sstevel@tonic-gate } 33230Sstevel@tonic-gate } 33240Sstevel@tonic-gate } 33250Sstevel@tonic-gate /* 33260Sstevel@tonic-gate * If the thread has a new lgroup (i.e. it's not exiting), update its 33270Sstevel@tonic-gate * t_lpl and its process' p_lgrpset, and apply an anticipatory load 33280Sstevel@tonic-gate * to its new lgroup to account for its move to its new lgroup. 33290Sstevel@tonic-gate */ 33300Sstevel@tonic-gate if (newlpl != NULL) { 33310Sstevel@tonic-gate /* 33320Sstevel@tonic-gate * This thread is moving to a new lgroup 33330Sstevel@tonic-gate */ 33340Sstevel@tonic-gate t->t_lpl = newlpl; 33350Sstevel@tonic-gate 33360Sstevel@tonic-gate /* 33370Sstevel@tonic-gate * Reflect move in load average of new lgroup 33380Sstevel@tonic-gate * unless it is root lgroup 33390Sstevel@tonic-gate */ 33400Sstevel@tonic-gate if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root) 33410Sstevel@tonic-gate return; 33420Sstevel@tonic-gate 33430Sstevel@tonic-gate if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) { 33440Sstevel@tonic-gate klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid); 33450Sstevel@tonic-gate } 33460Sstevel@tonic-gate 33470Sstevel@tonic-gate /* 33480Sstevel@tonic-gate * It'll take some time for the load on the new lgroup 33490Sstevel@tonic-gate * to reflect this thread's placement on it. We'd 33500Sstevel@tonic-gate * rather not, however, have all threads between now 33510Sstevel@tonic-gate * and then also pile on to this lgroup. To avoid 33520Sstevel@tonic-gate * this pileup, we anticipate the load this thread 33530Sstevel@tonic-gate * will generate on its new lgroup. The goal is to 33540Sstevel@tonic-gate * make the lgroup's load appear as though the thread 33550Sstevel@tonic-gate * had been there all along. We're very conservative 33560Sstevel@tonic-gate * in calculating this anticipatory load; we assume 33570Sstevel@tonic-gate * the worst case (a 100% CPU-bound thread). This 33580Sstevel@tonic-gate * may be modified in the future to be more accurate.
33590Sstevel@tonic-gate */ 33600Sstevel@tonic-gate lpl = newlpl; 33610Sstevel@tonic-gate for (;;) { 33620Sstevel@tonic-gate ncpu = lpl->lpl_ncpu; 33630Sstevel@tonic-gate ASSERT(ncpu > 0); 33640Sstevel@tonic-gate do { 33650Sstevel@tonic-gate old = new = lpl->lpl_loadavg; 33660Sstevel@tonic-gate new += LGRP_LOADAVG_MAX_EFFECT(ncpu); 33670Sstevel@tonic-gate /* 33680Sstevel@tonic-gate * Check for overflow 33690Sstevel@tonic-gate * Underflow not possible here 33700Sstevel@tonic-gate */ 33710Sstevel@tonic-gate if (new < old) 33720Sstevel@tonic-gate new = UINT32_MAX; 33730Sstevel@tonic-gate } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 33740Sstevel@tonic-gate new) != old); 33750Sstevel@tonic-gate 33760Sstevel@tonic-gate lpl = lpl->lpl_parent; 33770Sstevel@tonic-gate if (lpl == NULL) 33780Sstevel@tonic-gate break; 33790Sstevel@tonic-gate } 33800Sstevel@tonic-gate t->t_anttime = gethrtime(); 33810Sstevel@tonic-gate } 33820Sstevel@tonic-gate } 33830Sstevel@tonic-gate 33840Sstevel@tonic-gate /* 33850Sstevel@tonic-gate * Return lgroup memory allocation policy given advice from madvise(3C) 33860Sstevel@tonic-gate */ 33870Sstevel@tonic-gate lgrp_mem_policy_t 33880Sstevel@tonic-gate lgrp_madv_to_policy(uchar_t advice, size_t size, int type) 33890Sstevel@tonic-gate { 33900Sstevel@tonic-gate switch (advice) { 33910Sstevel@tonic-gate case MADV_ACCESS_LWP: 33920Sstevel@tonic-gate return (LGRP_MEM_POLICY_NEXT); 33930Sstevel@tonic-gate case MADV_ACCESS_MANY: 33940Sstevel@tonic-gate return (LGRP_MEM_POLICY_RANDOM); 33950Sstevel@tonic-gate default: 33960Sstevel@tonic-gate return (lgrp_mem_policy_default(size, type)); 33970Sstevel@tonic-gate } 33980Sstevel@tonic-gate } 33990Sstevel@tonic-gate 34000Sstevel@tonic-gate /* 34010Sstevel@tonic-gate * Figure out default policy 34020Sstevel@tonic-gate */ 34030Sstevel@tonic-gate lgrp_mem_policy_t 34040Sstevel@tonic-gate lgrp_mem_policy_default(size_t size, int type) 34050Sstevel@tonic-gate { 34060Sstevel@tonic-gate cpupart_t *cp; 34070Sstevel@tonic-gate lgrp_mem_policy_t policy; 34080Sstevel@tonic-gate size_t pset_mem_size; 34090Sstevel@tonic-gate 34100Sstevel@tonic-gate /* 34110Sstevel@tonic-gate * Randomly allocate memory across lgroups for private and shared memory 34120Sstevel@tonic-gate * beyond their respective thresholds 34130Sstevel@tonic-gate */ 34140Sstevel@tonic-gate if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) || 34150Sstevel@tonic-gate (type == MAP_SHARED && size > lgrp_shm_random_thresh)) { 34160Sstevel@tonic-gate /* 34170Sstevel@tonic-gate * Get total memory size of current thread's pset 34180Sstevel@tonic-gate */ 34190Sstevel@tonic-gate kpreempt_disable(); 34200Sstevel@tonic-gate cp = curthread->t_cpupart; 34210Sstevel@tonic-gate klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size); 34220Sstevel@tonic-gate kpreempt_enable(); 34230Sstevel@tonic-gate 34240Sstevel@tonic-gate /* 34250Sstevel@tonic-gate * Choose policy to randomly allocate memory across 34260Sstevel@tonic-gate * lgroups in the pset if it will fit and this is not the default 34270Sstevel@tonic-gate * partition. Otherwise, allocate memory randomly 34280Sstevel@tonic-gate * across the machine.
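 *
 * Summarizing the resulting choice (an illustrative restatement of
 * the code below):
 *
 *	size under its random threshold		-> lgrp_mem_default_policy
 *	over threshold, pset-aware and fits	-> LGRP_MEM_POLICY_RANDOM_PSET
 *	over threshold, otherwise		-> LGRP_MEM_POLICY_RANDOM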
34290Sstevel@tonic-gate */ 34300Sstevel@tonic-gate if (lgrp_mem_pset_aware && size < pset_mem_size) 34310Sstevel@tonic-gate policy = LGRP_MEM_POLICY_RANDOM_PSET; 34320Sstevel@tonic-gate else 34330Sstevel@tonic-gate policy = LGRP_MEM_POLICY_RANDOM; 34340Sstevel@tonic-gate } else 34350Sstevel@tonic-gate /* 34360Sstevel@tonic-gate * Apply default policy for private memory and 34370Sstevel@tonic-gate * shared memory under the respective random 34380Sstevel@tonic-gate * threshold. 34390Sstevel@tonic-gate */ 34400Sstevel@tonic-gate policy = lgrp_mem_default_policy; 34410Sstevel@tonic-gate 34420Sstevel@tonic-gate return (policy); 34430Sstevel@tonic-gate } 34440Sstevel@tonic-gate 34450Sstevel@tonic-gate /* 34460Sstevel@tonic-gate * Get memory allocation policy for this segment 34470Sstevel@tonic-gate */ 34480Sstevel@tonic-gate lgrp_mem_policy_info_t * 34490Sstevel@tonic-gate lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr) 34500Sstevel@tonic-gate { 34510Sstevel@tonic-gate lgrp_mem_policy_info_t *policy_info; 34520Sstevel@tonic-gate extern struct seg_ops segspt_ops; 34530Sstevel@tonic-gate extern struct seg_ops segspt_shmops; 34540Sstevel@tonic-gate 34550Sstevel@tonic-gate /* 34560Sstevel@tonic-gate * This is for binary compatibility to protect against third party 34570Sstevel@tonic-gate * segment drivers which haven't been recompiled to allow for 34580Sstevel@tonic-gate * SEGOP_GETPOLICY() 34590Sstevel@tonic-gate */ 34600Sstevel@tonic-gate if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops && 34610Sstevel@tonic-gate seg->s_ops != &segspt_shmops) 34620Sstevel@tonic-gate return (NULL); 34630Sstevel@tonic-gate 34640Sstevel@tonic-gate policy_info = NULL; 34650Sstevel@tonic-gate if (seg->s_ops->getpolicy != NULL) 34660Sstevel@tonic-gate policy_info = SEGOP_GETPOLICY(seg, vaddr); 34670Sstevel@tonic-gate 34680Sstevel@tonic-gate return (policy_info); 34690Sstevel@tonic-gate } 34700Sstevel@tonic-gate 34710Sstevel@tonic-gate /* 34720Sstevel@tonic-gate * Set policy for allocating private memory given desired policy, policy info, 34730Sstevel@tonic-gate * and size in bytes of the memory to which the policy is being applied. 34740Sstevel@tonic-gate * Return 0 if policy wasn't set already and 1 if policy was set already 34750Sstevel@tonic-gate */ 34760Sstevel@tonic-gate int 34770Sstevel@tonic-gate lgrp_privm_policy_set(lgrp_mem_policy_t policy, 34780Sstevel@tonic-gate lgrp_mem_policy_info_t *policy_info, size_t size) 34790Sstevel@tonic-gate { 34800Sstevel@tonic-gate 34810Sstevel@tonic-gate ASSERT(policy_info != NULL); 34820Sstevel@tonic-gate 34830Sstevel@tonic-gate if (policy == LGRP_MEM_POLICY_DEFAULT) 34840Sstevel@tonic-gate policy = lgrp_mem_policy_default(size, MAP_PRIVATE); 34850Sstevel@tonic-gate 34860Sstevel@tonic-gate /* 34870Sstevel@tonic-gate * Policy set already?
34880Sstevel@tonic-gate */ 34890Sstevel@tonic-gate if (policy == policy_info->mem_policy) 34900Sstevel@tonic-gate return (1); 34910Sstevel@tonic-gate 34920Sstevel@tonic-gate /* 34930Sstevel@tonic-gate * Set policy 34940Sstevel@tonic-gate */ 34950Sstevel@tonic-gate policy_info->mem_policy = policy; 34960Sstevel@tonic-gate policy_info->mem_reserved = 0; 34970Sstevel@tonic-gate 34980Sstevel@tonic-gate return (0); 34990Sstevel@tonic-gate } 35000Sstevel@tonic-gate 35010Sstevel@tonic-gate 35020Sstevel@tonic-gate /* 35030Sstevel@tonic-gate * Get shared memory allocation policy with given tree and offset 35040Sstevel@tonic-gate */ 35050Sstevel@tonic-gate lgrp_mem_policy_info_t * 35060Sstevel@tonic-gate lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 35070Sstevel@tonic-gate u_offset_t vn_off) 35080Sstevel@tonic-gate { 35090Sstevel@tonic-gate u_offset_t off; 35100Sstevel@tonic-gate lgrp_mem_policy_info_t *policy_info; 35110Sstevel@tonic-gate lgrp_shm_policy_seg_t *policy_seg; 35120Sstevel@tonic-gate lgrp_shm_locality_t *shm_locality; 35130Sstevel@tonic-gate avl_tree_t *tree; 35140Sstevel@tonic-gate avl_index_t where; 35150Sstevel@tonic-gate 35160Sstevel@tonic-gate /* 35170Sstevel@tonic-gate * Get policy segment tree from anon_map or vnode and use specified 35180Sstevel@tonic-gate * anon index or vnode offset as offset 35190Sstevel@tonic-gate * 35200Sstevel@tonic-gate * Assume that no lock needs to be held on anon_map or vnode, since 35210Sstevel@tonic-gate * they should be protected by their reference count which must be 35220Sstevel@tonic-gate * nonzero for an existing segment 35230Sstevel@tonic-gate */ 35240Sstevel@tonic-gate if (amp) { 35250Sstevel@tonic-gate ASSERT(amp->refcnt != 0); 35260Sstevel@tonic-gate shm_locality = amp->locality; 35270Sstevel@tonic-gate if (shm_locality == NULL) 35280Sstevel@tonic-gate return (NULL); 35290Sstevel@tonic-gate tree = shm_locality->loc_tree; 35300Sstevel@tonic-gate off = ptob(anon_index); 35310Sstevel@tonic-gate } else if (vp) { 35320Sstevel@tonic-gate shm_locality = vp->v_locality; 35330Sstevel@tonic-gate if (shm_locality == NULL) 35340Sstevel@tonic-gate return (NULL); 35350Sstevel@tonic-gate ASSERT(shm_locality->loc_count != 0); 35360Sstevel@tonic-gate tree = shm_locality->loc_tree; 35370Sstevel@tonic-gate off = vn_off; 35380Sstevel@tonic-gate } 35390Sstevel@tonic-gate 35400Sstevel@tonic-gate if (tree == NULL) 35410Sstevel@tonic-gate return (NULL); 35420Sstevel@tonic-gate 35430Sstevel@tonic-gate /* 35440Sstevel@tonic-gate * Lookup policy segment for offset into shared object and return 35450Sstevel@tonic-gate * policy info 35460Sstevel@tonic-gate */ 35470Sstevel@tonic-gate rw_enter(&shm_locality->loc_lock, RW_READER); 35480Sstevel@tonic-gate policy_info = NULL; 35490Sstevel@tonic-gate policy_seg = avl_find(tree, &off, &where); 35500Sstevel@tonic-gate if (policy_seg) 35510Sstevel@tonic-gate policy_info = &policy_seg->shm_policy; 35520Sstevel@tonic-gate rw_exit(&shm_locality->loc_lock); 35530Sstevel@tonic-gate 35540Sstevel@tonic-gate return (policy_info); 35550Sstevel@tonic-gate } 35560Sstevel@tonic-gate 35570Sstevel@tonic-gate /* 35582480Sesaxe * Default memory allocation policy for kernel segmap pages 35592480Sesaxe */ 35602480Sesaxe lgrp_mem_policy_t lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM; 35612480Sesaxe 35622480Sesaxe /* 35630Sstevel@tonic-gate * Return lgroup to use for allocating memory 35640Sstevel@tonic-gate * given the segment and address 35650Sstevel@tonic-gate * 35660Sstevel@tonic-gate * There isn't any 
mutual exclusion between calls 35670Sstevel@tonic-gate * to this routine and DR, so this routine and whoever calls it 35680Sstevel@tonic-gate * should be mindful of the possibility that the lgrp returned 35690Sstevel@tonic-gate * may be deleted. If this happens, dereferences of the lgrp 35700Sstevel@tonic-gate * pointer will still be safe, but the resources in the lgrp will 35710Sstevel@tonic-gate * be gone, and LGRP_EXISTS() will no longer be true. 35720Sstevel@tonic-gate */ 35730Sstevel@tonic-gate lgrp_t * 35740Sstevel@tonic-gate lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz) 35750Sstevel@tonic-gate { 35760Sstevel@tonic-gate int i; 35770Sstevel@tonic-gate lgrp_t *lgrp; 35780Sstevel@tonic-gate klgrpset_t lgrpset; 35790Sstevel@tonic-gate int lgrps_spanned; 35800Sstevel@tonic-gate unsigned long off; 35810Sstevel@tonic-gate lgrp_mem_policy_t policy; 35820Sstevel@tonic-gate lgrp_mem_policy_info_t *policy_info; 35830Sstevel@tonic-gate ushort_t random; 35840Sstevel@tonic-gate int stat = 0; 35852480Sesaxe extern struct seg *segkmap; 35860Sstevel@tonic-gate 35870Sstevel@tonic-gate /* 35880Sstevel@tonic-gate * Just return the root lgroup if the lgrp framework hasn't finished 35890Sstevel@tonic-gate * initializing or if this is a UMA machine. 35900Sstevel@tonic-gate */ 35910Sstevel@tonic-gate if (nlgrps == 1 || !lgrp_initialized) 35920Sstevel@tonic-gate return (lgrp_root); 35930Sstevel@tonic-gate 35940Sstevel@tonic-gate /* 35950Sstevel@tonic-gate * Get memory allocation policy for this segment 35960Sstevel@tonic-gate */ 35970Sstevel@tonic-gate policy = lgrp_mem_default_policy; 35980Sstevel@tonic-gate if (seg != NULL) { 35990Sstevel@tonic-gate if (seg->s_as == &kas) { 36002480Sesaxe if (seg == segkmap) 36012480Sesaxe policy = lgrp_segmap_default_policy; 36020Sstevel@tonic-gate if (policy == LGRP_MEM_POLICY_RANDOM_PROC || 36030Sstevel@tonic-gate policy == LGRP_MEM_POLICY_RANDOM_PSET) 36040Sstevel@tonic-gate policy = LGRP_MEM_POLICY_RANDOM; 36050Sstevel@tonic-gate } else { 36060Sstevel@tonic-gate policy_info = lgrp_mem_policy_get(seg, vaddr); 36070Sstevel@tonic-gate if (policy_info != NULL) 36080Sstevel@tonic-gate policy = policy_info->mem_policy; 36090Sstevel@tonic-gate } 36100Sstevel@tonic-gate } 36110Sstevel@tonic-gate lgrpset = 0; 36120Sstevel@tonic-gate 36130Sstevel@tonic-gate /* 36140Sstevel@tonic-gate * Initialize lgroup to home by default 36150Sstevel@tonic-gate */ 36160Sstevel@tonic-gate lgrp = lgrp_home_lgrp(); 36170Sstevel@tonic-gate 36180Sstevel@tonic-gate /* 36190Sstevel@tonic-gate * When homing threads on root lgrp, override default memory 36200Sstevel@tonic-gate * allocation policies with root lgroup memory allocation policy 36210Sstevel@tonic-gate */ 36220Sstevel@tonic-gate if (lgrp == lgrp_root) 36230Sstevel@tonic-gate policy = lgrp_mem_policy_root; 36240Sstevel@tonic-gate 36250Sstevel@tonic-gate /* 36260Sstevel@tonic-gate * Implement policy 36270Sstevel@tonic-gate */ 36280Sstevel@tonic-gate switch (policy) { 36290Sstevel@tonic-gate case LGRP_MEM_POLICY_NEXT_CPU: 36300Sstevel@tonic-gate 36310Sstevel@tonic-gate /* 36320Sstevel@tonic-gate * Return lgroup of current CPU which faulted on memory 363360Sesaxe * If the CPU isn't currently in an lgrp, then opt to 363460Sesaxe * allocate from the root. 363560Sesaxe * 363660Sesaxe * Kernel preemption needs to be disabled here to prevent 363760Sesaxe * the current CPU from going away before lgrp is found.
	/*
	 * Implement policy
	 */
	switch (policy) {
	case LGRP_MEM_POLICY_NEXT_CPU:

		/*
		 * Return lgroup of current CPU which faulted on memory
		 * If the CPU isn't currently in an lgrp, then opt to
		 * allocate from the root.
		 *
		 * Kernel preemption needs to be disabled here to prevent
		 * the current CPU from going away before lgrp is found.
		 */
		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
			lgrp = lgrp_root;
		} else {
			kpreempt_disable();
			lgrp = lgrp_cpu_to_lgrp(CPU);
			kpreempt_enable();
		}
		break;

	case LGRP_MEM_POLICY_NEXT:
	case LGRP_MEM_POLICY_DEFAULT:
	default:

		/*
		 * Just return current thread's home lgroup
		 * for default policy (next touch)
		 * If the thread is homed to the root,
		 * then the default policy is random across lgroups.
		 * Fallthrough to the random case.
		 */
		if (lgrp != lgrp_root) {
			if (policy == LGRP_MEM_POLICY_NEXT)
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
			else
				lgrp_stat_add(lgrp->lgrp_id,
				    LGRP_NUM_DEFAULT, 1);
			break;
		}
		/* LINTED fallthrough on case statement */
	case LGRP_MEM_POLICY_RANDOM:

		/*
		 * Return a random leaf lgroup with memory
		 */
		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
		/*
		 * Count how many lgroups are spanned
		 */
		klgrpset_nlgrps(lgrpset, lgrps_spanned);
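		/*
		 * The loop below picks the off'th member of the set by
		 * walking lgrp_table[] and counting down, e.g. for a set
		 * spanning lgroups {1, 3, 4} and off == 1, the walk skips
		 * lgroup 1 and picks lgroup 3.  A minimal sketch of the
		 * technique, assuming only klgrpset_ismember() and
		 * lgrp_alloc_max (comment-only example, not compiled):
		 *
		 *	for (i = 0; i <= lgrp_alloc_max; i++) {
		 *		if (!klgrpset_ismember(set, i))
		 *			continue;
		 *		if (off-- == 0)
		 *			break;
		 *	}
		 *
		 * On exit, i indexes the chosen lgroup in lgrp_table[].
		 */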
		/*
		 * There may be no memnodes in the root lgroup during DR copy
		 * rename on a system with only two boards (memnodes)
		 * configured. In this case just return the root lgrp.
		 */
		if (lgrps_spanned == 0) {
			lgrp = lgrp_root;
			break;
		}

		/*
		 * Pick a random offset within lgroups spanned
		 * and return lgroup at that offset
		 */
		random = (ushort_t)gethrtime() >> 4;
		off = random % lgrps_spanned;
		ASSERT(off <= lgrp_alloc_max);

		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (!klgrpset_ismember(lgrpset, i))
				continue;
			if (off)
				off--;
			else {
				lgrp = lgrp_table[i];
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
				    1);
				break;
			}
		}
		break;

	case LGRP_MEM_POLICY_RANDOM_PROC:

		/*
		 * Grab copy of bitmask of lgroups spanned by
		 * this process
		 */
		klgrpset_copy(lgrpset, curproc->p_lgrpset);
		stat = LGRP_NUM_RANDOM_PROC;

		/* LINTED fallthrough on case statement */
	case LGRP_MEM_POLICY_RANDOM_PSET:

		if (!stat)
			stat = LGRP_NUM_RANDOM_PSET;

		if (klgrpset_isempty(lgrpset)) {
			/*
			 * Grab copy of bitmask of lgroups spanned by
			 * this processor set
			 */
			kpreempt_disable();
			klgrpset_copy(lgrpset,
			    curthread->t_cpupart->cp_lgrpset);
			kpreempt_enable();
		}

		/*
		 * Count how many lgroups are spanned
		 */
		klgrpset_nlgrps(lgrpset, lgrps_spanned);
		ASSERT(lgrps_spanned <= nlgrps);
		/*
		 * lgrps_spanned should always be non-zero here, but to be
		 * on the safe side return lgrp_root if the set is empty.
		 */
		if (lgrps_spanned == 0) {
			lgrp = lgrp_root;
			break;
		}

		/*
		 * Pick a random offset within lgroups spanned
		 * and return lgroup at that offset
		 */
		random = (ushort_t)gethrtime() >> 4;
		off = random % lgrps_spanned;
		ASSERT(off <= lgrp_alloc_max);

		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (!klgrpset_ismember(lgrpset, i))
				continue;
			if (off)
				off--;
			else {
				lgrp = lgrp_table[i];
				/* stat is LGRP_NUM_RANDOM_PROC or _PSET */
				lgrp_stat_add(lgrp->lgrp_id, stat, 1);
				break;
			}
		}
		break;

	case LGRP_MEM_POLICY_ROUNDROBIN:

		/*
		 * Use the offset within the segment to determine how far
		 * from the home lgroup to advance for the next lgroup to
		 * allocate memory from
		 */
		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
		    (lgrp_alloc_max + 1);

		kpreempt_disable();
		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
		i = lgrp->lgrp_id;
		kpreempt_enable();

		while (off > 0) {
			i = (i + 1) % (lgrp_alloc_max + 1);
			lgrp = lgrp_table[i];
			if (klgrpset_ismember(lgrpset, i))
				off--;
		}
		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);

		break;
	}

	ASSERT(lgrp != NULL);
	return (lgrp);
}
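/*
 * Typical use (illustrative sketch, not part of this file): a physical
 * memory allocator resolving a fault picks a target lgroup and then
 * tries to take pages from it.  "fault_seg", "fault_addr", and
 * page_get_from_lgrp() are hypothetical names.
 *
 *	lgrp_t *lgrp;
 *	page_t *pp;
 *
 *	lgrp = lgrp_mem_choose(fault_seg, fault_addr, PAGESIZE);
 *	pp = page_get_from_lgrp(lgrp, fault_addr, PAGESIZE);
 */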
/*
 * Return the number of pages in an lgroup
 *
 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
 * could cause tests that rely on the numat driver to fail.
 */
pgcnt_t
lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
{
	lgrp_t *lgrp;

	lgrp = lgrp_table[lgrpid];
	if (!LGRP_EXISTS(lgrp) ||
	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
		return (0);

	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
}
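/*
 * Example (sketch, assuming the LGRP_MEM_SIZE_FREE and
 * LGRP_MEM_SIZE_INSTALL queries from lgrp.h): summing free and
 * installed pages across all allocated lgroups.  Nonexistent lgroup
 * IDs simply contribute zero.
 *
 *	pgcnt_t free = 0, installed = 0;
 *	int i;
 *
 *	for (i = 0; i <= lgrp_alloc_max; i++) {
 *		free += lgrp_mem_size(i, LGRP_MEM_SIZE_FREE);
 *		installed += lgrp_mem_size(i, LGRP_MEM_SIZE_INSTALL);
 *	}
 */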
/*
 * Initialize lgroup shared memory allocation policy support
 */
void
lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
{
	lgrp_shm_locality_t	*shm_locality;

	/*
	 * Initialize locality field in anon_map
	 * Don't need any locks because this is called when anon_map is
	 * allocated, but not used anywhere yet.
	 */
	if (amp) {
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		if (amp->locality == NULL) {
			/*
			 * Allocate and initialize shared memory locality info
			 * and set anon_map locality pointer to it
			 * Drop lock across kmem_alloc(KM_SLEEP)
			 */
			ANON_LOCK_EXIT(&amp->a_rwlock);
			shm_locality = kmem_alloc(sizeof (*shm_locality),
			    KM_SLEEP);
			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
			    NULL);
			shm_locality->loc_count = 1;	/* not used for amp */
			shm_locality->loc_tree = NULL;

			/*
			 * Reacquire lock and check to see whether anyone beat
			 * us to initializing the locality info
			 */
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			if (amp->locality != NULL) {
				rw_destroy(&shm_locality->loc_lock);
				kmem_free(shm_locality,
				    sizeof (*shm_locality));
			} else
				amp->locality = shm_locality;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);
		return;
	}

	/*
	 * Allocate shared vnode policy info if vnode is not locality aware yet
	 */
	mutex_enter(&vp->v_lock);
	if ((vp->v_flag & V_LOCALITY) == 0) {
		/*
		 * Allocate and initialize shared memory locality info
		 */
		mutex_exit(&vp->v_lock);
		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
		shm_locality->loc_count = 1;
		shm_locality->loc_tree = NULL;

		/*
		 * Point vnode locality field at shared vnode policy info
		 * and set locality aware flag in vnode
		 */
		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & V_LOCALITY) == 0) {
			vp->v_locality = shm_locality;
			vp->v_flag |= V_LOCALITY;
		} else {
			/*
			 * Lost race so free locality info and increment count.
			 */
			rw_destroy(&shm_locality->loc_lock);
			kmem_free(shm_locality, sizeof (*shm_locality));
			shm_locality = vp->v_locality;
			shm_locality->loc_count++;
		}
		mutex_exit(&vp->v_lock);

		return;
	}

	/*
	 * Increment reference count of number of segments mapping this vnode
	 * shared
	 */
	shm_locality = vp->v_locality;
	shm_locality->loc_count++;
	mutex_exit(&vp->v_lock);
}

/*
 * Destroy the given shared memory policy segment tree
 */
void
lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
{
	lgrp_shm_policy_seg_t	*cur;
	lgrp_shm_policy_seg_t	*next;

	if (tree == NULL)
		return;

	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
	while (cur != NULL) {
		next = AVL_NEXT(tree, cur);
		avl_remove(tree, cur);
		kmem_free(cur, sizeof (*cur));
		cur = next;
	}
	kmem_free(tree, sizeof (avl_tree_t));
}
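/*
 * Reference counting example: each shared mapping of a vnode calls
 * lgrp_shm_policy_init(NULL, vp), so after three such mappings
 * vp->v_locality->loc_count == 3.  Each unmap calls
 * lgrp_shm_policy_fini(NULL, vp) below, and the locality info (and
 * any policy tree hanging off it) is freed when the count drops back
 * to zero.
 */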
/*
 * Uninitialize lgroup shared memory allocation policy support
 */
void
lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
{
	lgrp_shm_locality_t	*shm_locality;

	/*
	 * For anon_map, deallocate shared memory policy tree and
	 * zero locality field
	 * Don't need any locks because anon_map is being freed
	 */
	if (amp) {
		if (amp->locality == NULL)
			return;
		shm_locality = amp->locality;
		shm_locality->loc_count = 0;	/* not really used for amp */
		rw_destroy(&shm_locality->loc_lock);
		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
		kmem_free(shm_locality, sizeof (*shm_locality));
		amp->locality = NULL;
		return;
	}

	/*
	 * For vnode, decrement reference count of segments mapping this vnode
	 * shared and delete locality info if reference count drops to 0
	 */
	mutex_enter(&vp->v_lock);
	shm_locality = vp->v_locality;
	shm_locality->loc_count--;

	if (shm_locality->loc_count == 0) {
		rw_destroy(&shm_locality->loc_lock);
		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
		kmem_free(shm_locality, sizeof (*shm_locality));
		vp->v_locality = NULL;
		vp->v_flag &= ~V_LOCALITY;
	}
	mutex_exit(&vp->v_lock);
}

/*
 * Compare two shared memory policy segments
 * Used by AVL tree code for searching
 *
 * NOTE: Lookups pass a pointer to a bare u_offset_t as the search key
 * (see the avl_find() calls in this file), which relies on shm_off
 * being the first member of lgrp_shm_policy_seg_t.
 */
int
lgrp_shm_policy_compar(const void *x, const void *y)
{
	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;

	if (a->shm_off < b->shm_off)
		return (-1);
	if (a->shm_off >= b->shm_off + b->shm_size)
		return (1);
	return (0);
}

/*
 * Concatenate seg1 with seg2 and remove seg2 if the two segments are
 * adjacent and have the same policy; return -1 (and change nothing)
 * otherwise
 */
static int
lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
    lgrp_shm_policy_seg_t *seg2)
{
	if (!seg1 || !seg2 ||
	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
		return (-1);

	seg1->shm_size += seg2->shm_size;
	avl_remove(tree, seg2);
	kmem_free(seg2, sizeof (*seg2));
	return (0);
}
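/*
 * For example: two segments [0, 8K) and [8K, 16K) that both have
 * LGRP_MEM_POLICY_RANDOM become a single segment [0, 16K) after
 * lgrp_shm_policy_concat(tree, seg1, seg2), with seg2 freed; if the
 * segments aren't adjacent or their policies differ, the call returns
 * -1 and the tree is untouched.
 */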
/*
 * Split segment at given offset and return rightmost (uppermost) segment
 * Assumes that there are no overlapping segments
 */
static lgrp_shm_policy_seg_t *
lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
    u_offset_t off)
{
	lgrp_shm_policy_seg_t	*newseg;
	avl_index_t		where;

	ASSERT(seg != NULL);
	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);

	if (!seg || off < seg->shm_off || off > seg->shm_off +
	    seg->shm_size)
		return (NULL);

	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
		return (seg);

	/*
	 * Adjust size of left segment and allocate new (right) segment
	 */
	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
	newseg->shm_policy = seg->shm_policy;
	newseg->shm_off = off;
	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
	seg->shm_size = off - seg->shm_off;

	/*
	 * Find where to insert new segment in AVL tree and insert it
	 */
	(void) avl_find(tree, &off, &where);
	avl_insert(tree, newseg, where);

	return (newseg);
}
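/*
 * Split example: splitting a segment [0, 16K) with policy P at offset
 * 8K shrinks the original to [0, 8K) and returns a new segment
 * [8K, 16K) that inherits P; splitting at either end (offset 0 or
 * 16K) allocates nothing and returns the original segment.
 */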
/*
 * Set shared memory allocation policy on specified shared object at given
 * offset and length
 *
 * Return 0 if policy was newly set, 1 if policy was already set to the
 * given value, and -1 if the policy can't be set.
 */
int
lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
    ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
{
	u_offset_t		eoff;
	lgrp_shm_policy_seg_t	*next;
	lgrp_shm_policy_seg_t	*newseg;
	u_offset_t		off;
	u_offset_t		oldeoff;
	lgrp_shm_policy_seg_t	*prev;
	int			retval;
	lgrp_shm_policy_seg_t	*seg;
	lgrp_shm_locality_t	*shm_locality;
	avl_tree_t		*tree;
	avl_index_t		where;

	ASSERT(amp || vp);
	ASSERT((len & PAGEOFFSET) == 0);

	if (len == 0)
		return (-1);

	retval = 0;

	/*
	 * Get locality info and starting offset into shared object
	 * Try anon map first and then vnode
	 * Assume that no locks need to be held on anon_map or vnode, since
	 * they should be protected by their reference count which must be
	 * nonzero for an existing segment.
	 */
	if (amp) {
		/*
		 * Get policy info from anon_map
		 */
		ASSERT(amp->refcnt != 0);
		if (amp->locality == NULL)
			lgrp_shm_policy_init(amp, NULL);
		shm_locality = amp->locality;
		off = ptob(anon_index);
	} else if (vp) {
		/*
		 * Get policy info from vnode
		 */
		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
			lgrp_shm_policy_init(NULL, vp);
		shm_locality = vp->v_locality;
		ASSERT(shm_locality->loc_count != 0);
		off = vn_off;
	} else
		return (-1);

	ASSERT((off & PAGEOFFSET) == 0);

	/*
	 * Figure out default policy
	 */
	if (policy == LGRP_MEM_POLICY_DEFAULT)
		policy = lgrp_mem_policy_default(len, MAP_SHARED);

	/*
	 * Create AVL tree if there isn't one yet
	 * and set locality field to point at it
	 */
	rw_enter(&shm_locality->loc_lock, RW_WRITER);
	tree = shm_locality->loc_tree;
	if (!tree) {
		rw_exit(&shm_locality->loc_lock);

		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

		rw_enter(&shm_locality->loc_lock, RW_WRITER);
		if (shm_locality->loc_tree == NULL) {
			avl_create(tree, lgrp_shm_policy_compar,
			    sizeof (lgrp_shm_policy_seg_t),
			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
			shm_locality->loc_tree = tree;
		} else {
			/*
			 * Another thread managed to set up the tree
			 * before we could. Free the tree we allocated
			 * and use the one that's already there.
			 */
			kmem_free(tree, sizeof (*tree));
			tree = shm_locality->loc_tree;
		}
	}

	/*
	 * Set policy
	 *
	 * Need to maintain hold on writer's lock to keep tree from
	 * changing out from under us
	 */
	while (len != 0) {
		/*
		 * Find policy segment for specified offset into shared object
		 */
		seg = avl_find(tree, &off, &where);

		/*
		 * Didn't find any existing segment that contains specified
		 * offset, so allocate new segment, insert it, and concatenate
		 * with adjacent segments if possible
		 */
		if (seg == NULL) {
			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
			    KM_SLEEP);
			newseg->shm_policy.mem_policy = policy;
			newseg->shm_policy.mem_reserved = 0;
			newseg->shm_off = off;
			avl_insert(tree, newseg, where);

			/*
			 * Check to see whether new segment overlaps with next
			 * one, set length of new segment accordingly, and
			 * calculate remaining length and next offset
			 */
			seg = AVL_NEXT(tree, newseg);
			if (seg == NULL || off + len <= seg->shm_off) {
				newseg->shm_size = len;
				len = 0;
			} else {
				newseg->shm_size = seg->shm_off - off;
				off = seg->shm_off;
				len -= newseg->shm_size;
			}

			/*
			 * Try to concatenate new segment with next and
			 * previous ones, since they might have the same policy
			 * now. Grab previous and next segments first because
			 * they will change on concatenation.
			 */
			prev = AVL_PREV(tree, newseg);
			next = AVL_NEXT(tree, newseg);
			(void) lgrp_shm_policy_concat(tree, newseg, next);
			(void) lgrp_shm_policy_concat(tree, prev, newseg);

			continue;
		}

		eoff = off + len;
		oldeoff = seg->shm_off + seg->shm_size;
		/*
		 * Policy set already?
		 */
		if (policy == seg->shm_policy.mem_policy) {
			/*
			 * Nothing left to do if offset and length
			 * fall within this segment
			 */
			if (eoff <= oldeoff) {
				retval = 1;
				break;
			} else {
				len = eoff - oldeoff;
				off = oldeoff;
				continue;
			}
		}

		/*
		 * Specified offset and length match existing segment exactly
		 */
		if (off == seg->shm_off && len == seg->shm_size) {
			/*
			 * Set policy and update current length
			 */
			seg->shm_policy.mem_policy = policy;
			seg->shm_policy.mem_reserved = 0;
			len = 0;

			/*
			 * Try concatenating new segment with previous and next
			 * segments, since they might have the same policy now.
			 * Grab previous and next segments first because they
			 * will change on concatenation.
			 */
			prev = AVL_PREV(tree, seg);
			next = AVL_NEXT(tree, seg);
			(void) lgrp_shm_policy_concat(tree, seg, next);
			(void) lgrp_shm_policy_concat(tree, prev, seg);
		} else {
			/*
			 * Specified offset and length only apply to part of
			 * existing segment
			 */

			/*
			 * New segment starts in middle of old one, so split
			 * new one off near beginning of old one
			 */
			newseg = NULL;
			if (off > seg->shm_off) {
				newseg = lgrp_shm_policy_split(tree, seg, off);

				/*
				 * New segment ends where old one did, so try
				 * to concatenate with next segment
				 */
				if (eoff == oldeoff) {
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_reserved = 0;
					(void) lgrp_shm_policy_concat(tree,
					    newseg, AVL_NEXT(tree, newseg));
					break;
				}
			}

			/*
			 * New segment ends before old one, so split off end of
			 * old one
			 */
			if (eoff < oldeoff) {
				if (newseg) {
					(void) lgrp_shm_policy_split(tree,
					    newseg, eoff);
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_reserved = 0;
				} else {
					(void) lgrp_shm_policy_split(tree, seg,
					    eoff);
					seg->shm_policy.mem_policy = policy;
					seg->shm_policy.mem_reserved = 0;
				}

				if (off == seg->shm_off)
					(void) lgrp_shm_policy_concat(tree,
					    AVL_PREV(tree, seg), seg);
				break;
			}

			/*
			 * Calculate remaining length and next offset
			 */
			len = eoff - oldeoff;
			off = oldeoff;
		}
	}

	rw_exit(&shm_locality->loc_lock);
	return (retval);
}
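/*
 * Worked example: starting from a single segment [0, 16K) with policy
 * A, lgrp_shm_policy_set(B, amp, btop(4K), NULL, 0, 8K) splits it into
 * [0, 4K) A, [4K, 12K) B, and [12K, 16K) A and returns 0.  Calling it
 * again with the same arguments finds policy B already set on
 * [4K, 12K) and returns 1.
 */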
/*
 * Return the best memnode from which to allocate memory given
 * an lgroup.
 *
 * "c" is for cookie, which is good enough for me.
 * It references a cookie struct that should be zeroed to initialize.
 * The cookie should live on the caller's stack.
 *
 * The routine returns -1 when:
 *	- the scope is LGRP_SRCH_LOCAL, and all the memnodes in "lgrp"
 *	  have been returned.
 *	- the scope allows traversal, and all the memnodes in the
 *	  system have been returned.
 */
int
lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
{
	lgrp_t		*lp = c->lmc_lgrp;
	mnodeset_t	nodes = c->lmc_nodes;
	int		cnt = c->lmc_cnt;
	int		offset, mnode;

	extern int	max_mem_nodes;

	/*
	 * If the set is empty, and the caller is willing, traverse
	 * up the hierarchy until we find a non-empty set.
	 */
	while (nodes == (mnodeset_t)0 || cnt <= 0) {
		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
		    ((lp = lp->lgrp_parent) == NULL))
			return (-1);

		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
	}
	/*
	 * Select a memnode by picking one at a "random" offset.
	 * Because of DR, memnodes can come and go at any time.
	 * This code must be able to cope with the possibility
	 * that the nodes count "cnt" is inconsistent with respect
	 * to the number of elements actually in "nodes", and
	 * therefore that the offset chosen could be greater than
	 * the number of elements in the set (some memnodes may
	 * have disappeared just before cnt was read).
	 * If this happens, the search simply wraps back to the
	 * beginning of the set.
	 */
	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
	offset = c->lmc_rand % cnt;
	do {
		for (mnode = 0; mnode < max_mem_nodes; mnode++)
			if (nodes & ((mnodeset_t)1 << mnode))
				if (!offset--)
					break;
	} while (mnode >= max_mem_nodes);

	/* Found a node. Store state before returning. */
	c->lmc_lgrp = lp;
	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
	c->lmc_cnt = cnt - 1;
	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
	c->lmc_ntried++;

	return (mnode);
}
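/*
 * Usage sketch (illustrative, assuming the LGRP_MNODE_COOKIE_INIT
 * macro and the LGRP_SRCH_HIER scope from sys/lgrp.h): initialize a
 * cookie on the caller's stack, bind it to an lgroup and search scope,
 * then call lgrp_memnode_choose() until it returns -1.
 *
 *	lgrp_mnode_cookie_t	c;
 *	int			mnode;
 *
 *	LGRP_MNODE_COOKIE_INIT(c, lgrp, LGRP_SRCH_HIER);
 *	while ((mnode = lgrp_memnode_choose(&c)) != -1) {
 *		... try to allocate pages from memnode "mnode" ...
 *	}
 */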