/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Basic NUMA support in terms of locality groups
 *
 * Solaris needs to know which CPUs, memory, etc. are near each other to
 * provide good performance on NUMA machines by optimizing for locality.
 * In order to do this, a new abstraction called a "locality group (lgroup)"
 * has been introduced to keep track of which CPU-like and memory-like hardware
 * resources are close to each other.  Currently, latency is the only measure
 * used to determine how to group hardware resources into lgroups, but this
 * does not limit the groupings to be based solely on latency.  Other factors
 * may be used to determine the groupings in the future.
 *
 * Lgroups are organized into a hierarchy or topology that represents the
 * latency topology of the machine.  There is always at least a root lgroup in
 * the system.  It represents all the hardware resources in the machine at a
 * latency big enough that any hardware resource can at least access any other
 * hardware resource within that latency.  A Uniform Memory Access (UMA)
 * machine is represented with one lgroup (the root).  In contrast, a NUMA
 * machine is represented at least by the root lgroup and some number of leaf
 * lgroups, where the leaf lgroups contain the hardware resources within the
 * least latency of each other and the root lgroup still contains all the
 * resources in the machine.  Some number of intermediate lgroups may exist
 * which represent more levels of locality than just the local latency of the
 * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
 * (eg. root and intermediate lgroups) contain the next nearest resources to
 * their child lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
 * to the root lgroup shows the hardware resources from closest to farthest
 * from the leaf lgroup such that each successive ancestor lgroup contains
 * the next nearest resources at the next level of locality from the previous.
 *
 * The kernel uses the lgroup abstraction to know how to allocate resources
 * near a given process/thread.  At fork() and lwp/thread_create() time, a
 * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
 * with the lowest load average.  Binding to a processor or processor set will
 * change the home lgroup for a thread.  The scheduler has been modified to try
 * to dispatch a thread on a CPU in its home lgroup.  Physical memory
 * allocation is lgroup aware too, so memory will be allocated from the current
 * thread's home lgroup if possible.  If the desired resources are not
 * available, the kernel traverses the lgroup hierarchy going to the parent
 * lgroup to find resources at the next level of locality until it reaches the
 * root lgroup.
 */
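
/*
 * As an illustration only (not any particular platform), a NUMA machine
 * with four leaf lgroups and one level of intermediate lgroups might have
 * the following latency topology:
 *
 *			root (whole machine)
 *			/		   \
 *		intermediate A		intermediate B
 *		 /	   \		 /	   \
 *	     leaf 0	 leaf 1	     leaf 2	 leaf 3
 *
 * A thread homed in leaf 0 has its memory allocated from leaf 0 when
 * possible; failing that, the allocation walks up through intermediate A
 * to the root, as described above.
 */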

#include <sys/lgrp.h>
#include <sys/lgrp_user.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/var.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/as.h>
#include <sys/atomic.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/sysmacros.h>
#include <sys/chip.h>
#include <sys/promif.h>
#include <sys/sdt.h>

lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX];	/* table of all initialized lgrp_t structs */
				/* indexed by lgrp_id */
int	nlgrps;			/* number of lgroups in machine */
int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */

/*
 * Kstat data for lgroups.
 *
 * Actual kstat data is collected in lgrp_stats array.
 * The lgrp_kstat_data array of named kstats is used to extract data from
 * lgrp_stats and present it to kstat framework.  It is protected from
 * parallel modifications by lgrp_kstat_mutex.  This may cause some contention
 * when several kstat commands run in parallel but this is not the
 * performance-critical path.
 */
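
/*
 * Each lgroup's kstats are created below under module "lgrp" with the
 * lgroup ID as the kstat instance (see lgrp_kstat_create()), so, for
 * example, they can be examined from userland with kstat(1M):
 *
 *	kstat -m lgrp
 */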

extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */

/*
 * Declare kstat names statically for enums as defined in the header file.
 */
LGRP_KSTAT_NAMES;

static void	lgrp_kstat_init(void);
static int	lgrp_kstat_extract(kstat_t *, int);
static void	lgrp_kstat_reset(lgrp_id_t);

static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
static kmutex_t lgrp_kstat_mutex;


/*
 * max number of lgroups supported by the platform
 */
int	nlgrpsmax = 0;

/*
 * The root lgroup.  Represents the set of resources at the system wide
 * level of locality.
 */
lgrp_t		*lgrp_root = NULL;

/*
 * During system bootstrap cp_default does not contain the list of lgrp load
 * averages (cp_lgrploads).  The list is allocated after the first CPU is
 * brought on-line when cp_default is initialized by
 * cpupart_initialize_default().  Configuring CPU0 may create a two-level
 * topology with root and one leaf node containing CPU0.  This topology is
 * initially constructed in a special statically allocated 2-element lpl list
 * lpl_bootstrap_list and later cloned to cp_default when cp_default is
 * initialized.  The lpl_bootstrap_list is used for all lpl operations until
 * cp_default is fully constructed.
 *
 * The lpl_bootstrap_list is maintained by the code in lgrp.c.  Every other
 * consumer who needs a default lpl should use lpl_bootstrap, which is a
 * pointer to the first element of lpl_bootstrap_list.
 *
 * CPUs that are added to the system, but have not yet been assigned to an
 * lgrp will use lpl_bootstrap as a default lpl.  This is necessary because
 * on some architectures (x86) it's possible for the slave CPU startup thread
 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
 */
#define	LPL_BOOTSTRAP_SIZE 2
static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
lpl_t		*lpl_bootstrap;
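
/*
 * For illustration, once lgrp_root_init() below has run on a NUMA machine,
 * the bootstrap list typically looks like:
 *
 *	lpl_bootstrap_list[0]	lpl for the root lgroup (ID 0)
 *	lpl_bootstrap_list[1]	lpl for the leaf lgroup (ID 1) holding CPU0
 *
 * and is later cloned into cp_default's cp_lgrploads.
 */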

/*
 * If cp still references the bootstrap lpl, it has not yet been added to
 * an lgrp.  lgrp_mem_choose() uses this macro to detect the case where
 * a thread is trying to allocate memory close to a CPU that has no lgrp.
 */
#define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)

static lgrp_t	lroot;


/*
 * Size, in bytes, beyond which random memory allocation policy is applied
 * to non-shared memory.  Default is the maximum size, so random memory
 * allocation won't be used for non-shared memory by default.
 */
size_t	lgrp_privm_random_thresh = (size_t)(-1);

/*
 * Size, in bytes, beyond which random memory allocation policy is applied to
 * shared memory.  Default is 8MB (2 ISM pages).
 */
size_t	lgrp_shm_random_thresh = 8*1024*1024;

/*
 * Whether to do processor set aware memory allocation by default
 */
int	lgrp_mem_pset_aware = 0;

/*
 * Set the default memory allocation policy for root lgroup
 */
lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;

/*
 * Set the default memory allocation policy.  For most platforms,
 * next touch is sufficient, but some platforms may wish to override
 * this.
 */
lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
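
/*
 * The policy variables and thresholds above are plain kernel variables, so
 * (as an illustration, not a recommendation) an administrator could enable
 * processor set aware memory allocation from /etc/system with:
 *
 *	set lgrp_mem_pset_aware = 1
 */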


/*
 * lgroup CPU event handlers
 */
static void	lgrp_cpu_init(struct cpu *);
static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);

static void	lgrp_latency_change(u_longlong_t, u_longlong_t);

/*
 * lgroup memory event handlers
 */
static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);

/*
 * lgroup CPU partition event handlers
 */
static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
static void	lgrp_part_del_cpu(struct cpu *);

static void	lgrp_root_init(void);

/*
 * lpl topology
 */
static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
static void	lpl_clear(lpl_t *);
static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
static void	lpl_rset_add(lpl_t *, lpl_t *);
static void	lpl_rset_del(lpl_t *, lpl_t *);
static int	lpl_rset_contains(lpl_t *, lpl_t *);
static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
static void	lpl_child_update(lpl_t *, struct cpupart *);
static int	lpl_pick(lpl_t *, lpl_t *);
static void	lpl_verify_wrapper(struct cpupart *);

/*
 * defines for lpl topology verifier return codes
 */

#define	LPL_TOPO_CORRECT		0
#define	LPL_TOPO_PART_HAS_NO_LPL	-1
#define	LPL_TOPO_CPUS_NOT_EMPTY		-2
#define	LPL_TOPO_LGRP_MISMATCH		-3
#define	LPL_TOPO_MISSING_PARENT		-4
#define	LPL_TOPO_PARENT_MISMATCH	-5
#define	LPL_TOPO_BAD_CPUCNT		-6
#define	LPL_TOPO_RSET_MISMATCH		-7
#define	LPL_TOPO_LPL_ORPHANED		-8
#define	LPL_TOPO_LPL_BAD_NCPU		-9
#define	LPL_TOPO_RSET_MSSNG_LF		-10
#define	LPL_TOPO_CPU_HAS_BAD_LPL	-11
#define	LPL_TOPO_BOGUS_HINT		-12
#define	LPL_TOPO_NONLEAF_HAS_CPUS	-13
#define	LPL_TOPO_LGRP_NOT_LEAF		-14
#define	LPL_TOPO_BAD_RSETCNT		-15

/*
 * Return whether lgroup optimizations should be enabled on this system
 */
int
lgrp_optimizations(void)
{
	/*
	 * System must have more than 2 lgroups to enable lgroup optimizations
	 *
	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
	 * with one child lgroup containing all the resources.  A 2 lgroup
	 * system with a root lgroup directly containing CPUs or memory might
	 * need lgroup optimizations with its child lgroup, but there
	 * isn't such a machine for now....
	 */
	if (nlgrps > 2)
		return (1);

	return (0);
}

/*
 * Build full lgroup topology
 */
static void
lgrp_root_init(void)
{
	lgrp_handle_t	hand;
	int		i;
	lgrp_id_t	id;

	/*
	 * Create the "root" lgroup
	 */
	ASSERT(nlgrps == 0);
	id = nlgrps++;

	lgrp_root = &lroot;

	lgrp_root->lgrp_cpu = NULL;
	lgrp_root->lgrp_mnodes = 0;
	lgrp_root->lgrp_nmnodes = 0;
	hand = lgrp_plat_root_hand();
	lgrp_root->lgrp_plathand = hand;

	lgrp_root->lgrp_id = id;
	lgrp_root->lgrp_cpucnt = 0;
	lgrp_root->lgrp_childcnt = 0;
	klgrpset_clear(lgrp_root->lgrp_children);
	klgrpset_clear(lgrp_root->lgrp_leaves);
	lgrp_root->lgrp_parent = NULL;
	lgrp_root->lgrp_chips = NULL;
	lgrp_root->lgrp_chipcnt = 0;
	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);

	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(lgrp_root->lgrp_set[i]);

	lgrp_root->lgrp_kstat = NULL;

	lgrp_table[id] = lgrp_root;

	/*
	 * Setup initial lpl list for CPU0 and initial t0 home.
	 * The only lpl space we have so far is lpl_bootstrap.  It is used for
	 * all topology operations until cp_default is initialized, at which
	 * point t0.t_lpl will be updated.
	 */
	lpl_bootstrap = lpl_bootstrap_list;
	t0.t_lpl = lpl_bootstrap;
	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
	lpl_bootstrap_list[1].lpl_lgrpid = 1;
	cp_default.cp_lgrploads = lpl_bootstrap;
}

/*
 * Initialize the lgroup framework and allow the platform to do the same
 */
void
lgrp_init(void)
{
	/*
	 * Initialize the platform
	 */
	lgrp_plat_init();

	/*
	 * Set max number of lgroups supported on this platform which must be
	 * less than the max number of lgroups supported by the common lgroup
	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.)
	 */
	nlgrpsmax = lgrp_plat_max_lgrps();
	ASSERT(nlgrpsmax <= NLGRPS_MAX);
}

/*
 * Create the root and cpu0's lgroup, and set t0's home.
 */
void
lgrp_setup(void)
{
	/*
	 * Setup the root lgroup
	 */
	lgrp_root_init();

	/*
	 * Add cpu0 to an lgroup
	 */
	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
}

/*
 * Lgroup initialization is split in two parts.  The first part
 * (lgrp_main_init()) is called right before start_other_cpus() in main.  The
 * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
 * when all CPUs are brought online and all distance information is available.
 *
 * When lgrp_main_init() is complete it sets lgrp_initialized.  The
 * lgrp_main_mp_init() sets lgrp_topo_initialized.
 */
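
/*
 * Putting the pieces above and below together, the boot-time sequence of
 * the entry points in this file is roughly (a sketch based on the comments
 * here, not a complete call graph):
 *
 *	lgrp_init()		platform and framework initialization
 *	lgrp_setup()		root lgroup built, cpu0 added
 *	lgrp_main_init()	just before start_other_cpus(),
 *				sets lgrp_initialized
 *	lgrp_main_mp_init()	just after start_other_cpus(),
 *				sets lgrp_topo_initialized
 */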

/*
 * true when lgrp initialization has been completed.
 */
int	lgrp_initialized = 0;

/*
 * True when lgrp topology is constructed.
 */
int	lgrp_topo_initialized = 0;

/*
 * Init routine called after startup(), /etc/system has been processed,
 * and cpu0 has been added to an lgroup.
 */
void
lgrp_main_init(void)
{
	cpu_t		*cp = CPU;
	lgrp_id_t	lgrpid;
	int		i;
	/*
	 * Enforce a valid lgrp_mem_default_policy
	 */
	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES))
		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;

	/*
	 * See if mpo should be disabled.
	 * This may happen in the case of null proc LPA on Starcat.
	 * The platform won't be able to detect null proc LPA until after
	 * cpu0 and memory have already been added to lgroups.
	 * When and if it is detected, the Starcat platform will return
	 * a different platform handle for cpu0 which is what we check for
	 * here.  If mpo should be disabled move cpu0 to its rightful place
	 * (the root), and destroy the remaining lgroups.  This effectively
	 * provides a UMA lgroup topology.
	 */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;
	if (lgrp_table[lgrpid]->lgrp_plathand !=
	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
		lgrp_part_del_cpu(cp);
		lgrp_cpu_fini(cp, lgrpid);

		lgrp_cpu_init(cp);
		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);

		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);

		/*
		 * Destroy all lgroups except for root
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (LGRP_EXISTS(lgrp_table[i]) &&
			    lgrp_table[i] != lgrp_root)
				lgrp_destroy(lgrp_table[i]);
		}

		/*
		 * Fix up root to point at itself for leaves and resources
		 * and not have any children
		 */
		lgrp_root->lgrp_childcnt = 0;
		klgrpset_clear(lgrp_root->lgrp_children);
		klgrpset_clear(lgrp_root->lgrp_leaves);
		klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
	}

	/*
	 * Initialize kstats framework.
	 */
	lgrp_kstat_init();
	/*
	 * cpu0 is finally where it should be, so create its lgroup's kstats
	 */
	mutex_enter(&cpu_lock);
	lgrp_kstat_create(cp);
	mutex_exit(&cpu_lock);

	lgrp_plat_main_init();
	lgrp_initialized = 1;
}

/*
 * Finish lgrp initialization after all CPUS are brought on-line.
 * This routine is called after start_other_cpus().
 */
void
lgrp_main_mp_init(void)
{
	klgrpset_t changed;

	/*
	 * Update lgroup topology (if necessary)
	 */
	klgrpset_clear(changed);
	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
	lgrp_topo_initialized = 1;
}

/*
 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
 */
void
lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
{
	klgrpset_t	changed;
	cpu_t		*cp;
	lgrp_id_t	id;
	int		rc;

	switch (event) {
	/*
	 * The following (re)configuration events are common code
	 * initiated.  lgrp_plat_config() is called here to inform the
	 * platform of the reconfiguration event.
	 */
	case LGRP_CONFIG_CPU_ADD:
		cp = (cpu_t *)resource;

		/*
		 * Initialize the new CPU's lgrp related next/prev
		 * links, and give it a bootstrap lpl so that it can
		 * survive should it need to enter the dispatcher.
		 */
		cp->cpu_next_lpl = cp;
		cp->cpu_prev_lpl = cp;
		cp->cpu_next_lgrp = cp;
		cp->cpu_prev_lgrp = cp;
		cp->cpu_lpl = lpl_bootstrap;

		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPU_DEL:
		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPU_ONLINE:
		cp = (cpu_t *)resource;
		lgrp_cpu_init(cp);
		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPU_OFFLINE:
		cp = (cpu_t *)resource;
		id = cp->cpu_lpl->lpl_lgrpid;
		lgrp_part_del_cpu(cp);
		lgrp_cpu_fini(cp, id);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPUPART_ADD:
		cp = (cpu_t *)resource;
		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);

		break;
	case LGRP_CONFIG_CPUPART_DEL:
		cp = (cpu_t *)resource;
		lgrp_part_del_cpu((cpu_t *)resource);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);

		break;
	/*
	 * The following events are initiated by the memnode
	 * subsystem.
	 */
	case LGRP_CONFIG_MEM_ADD:
		lgrp_mem_init((int)resource, where, B_FALSE);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_MEM_DEL:
		lgrp_mem_fini((int)resource, where, B_FALSE);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_MEM_RENAME: {
		lgrp_config_mem_rename_t *ren_arg =
		    (lgrp_config_mem_rename_t *)where;

		lgrp_mem_rename((int)resource,
		    ren_arg->lmem_rename_from,
		    ren_arg->lmem_rename_to);
		atomic_add_32(&lgrp_gen, 1);

		break;
	}
	case LGRP_CONFIG_GEN_UPDATE:
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_FLATTEN:
		if (where == 0)
			lgrp_topo_levels = (int)resource;
		else
			(void) lgrp_topo_flatten(resource,
			    lgrp_table, lgrp_alloc_max, &changed);

		break;
	/*
	 * Initiated by platform latency probing code
	 */
	case LGRP_CONFIG_LATENCY_CHANGE:
		lgrp_latency_change((u_longlong_t)resource,
		    (u_longlong_t)where);

		break;
	case LGRP_CONFIG_NOP:

		break;
	default:
		break;
	}

}
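
/*
 * For example, lgrp_setup() above uses this interface to add cpu0 to the
 * topology and bring it online:
 *
 *	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
 *	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
 */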

/*
 * Called to add lgrp info into cpu structure from cpu_add_unit;
 * do not assume cpu is in cpu[] yet!
 *
 * CPUs are brought online with all other CPUs paused so we can't
 * allocate memory or we could deadlock the system, so we rely on
 * the platform to statically allocate as much space as we need
 * for the lgrp structs and stats.
 */
static void
lgrp_cpu_init(struct cpu *cp)
{
	klgrpset_t	changed;
	int		count;
	lgrp_handle_t	hand;
	int		first_cpu;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	struct cpu	*cptr;
	struct chip	*chp;

	/*
	 * This is the first time through if the resource set
	 * for the root lgroup is empty.  After cpu0 has been
	 * initially added to an lgroup, the root's CPU resource
	 * set can never be empty, since the system's last CPU
	 * cannot be offlined.
	 */
	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
		/*
		 * First time through.
		 */
		first_cpu = 1;
	} else {
		/*
		 * If cpu0 needs to move lgroups, we may come
		 * through here again, at which time cpu_lock won't
		 * be held, and lgrp_initialized will be false.
		 */
		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
		ASSERT(cp->cpu_part != NULL);
		first_cpu = 0;
	}

	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
	my_lgrp = lgrp_hand_to_lgrp(hand);

	if (my_lgrp == NULL) {
		/*
		 * Create new lgrp and add it to lgroup topology
		 */
		my_lgrp = lgrp_create();
		my_lgrp->lgrp_plathand = hand;
		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);

		count = 0;
		klgrpset_clear(changed);
		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);
		/*
		 * May have added new intermediate lgroups, so need to add
		 * resources other than CPUs, which are added below
		 */
		(void) lgrp_mnode_update(changed, NULL);
	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
	    > 0) {
		/*
		 * Leaf lgroup was created, but latency wasn't available
		 * then.  So, set latency for it and fill in rest of lgroup
		 * topology now that we know how far it is from other leaf
		 * lgroups.
		 */
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_clear(changed);
		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
		    lgrpid))
			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);

		/*
		 * May have added new intermediate lgroups, so need to add
		 * resources other than CPUs, which are added below
		 */
		(void) lgrp_mnode_update(changed, NULL);
	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
	    my_lgrp->lgrp_id)) {
		int	i;

		/*
		 * Update existing lgroup and lgroups containing it with CPU
		 * resource
		 */
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
				continue;

			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		}
	}

	lgrpid = my_lgrp->lgrp_id;
	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];

	/*
	 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
	 * end up in lpl for lgroup 0 whether it is supposed to be in there or
	 * not since none of the lgroup IDs in the lpl's have been set yet.
	 */
	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
		cp->cpu_lpl->lpl_lgrpid = lgrpid;

	/*
	 * link the CPU into the lgrp's CPU list
	 */
	if (my_lgrp->lgrp_cpucnt == 0) {
		my_lgrp->lgrp_cpu = cp;
		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
	} else {
		cptr = my_lgrp->lgrp_cpu;
		cp->cpu_next_lgrp = cptr;
		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
		cptr->cpu_prev_lgrp = cp;
	}
	my_lgrp->lgrp_cpucnt++;

	/*
	 * Add this cpu's chip to the per lgroup list
	 * if necessary
	 */
	if (cp->cpu_chip->chip_lgrp == NULL) {
		struct chip	*lcpr;

		chp = cp->cpu_chip;

		if (my_lgrp->lgrp_chipcnt == 0) {
			my_lgrp->lgrp_chips = chp;
			chp->chip_next_lgrp =
			    chp->chip_prev_lgrp = chp;
		} else {
			lcpr = my_lgrp->lgrp_chips;
			chp->chip_next_lgrp = lcpr;
			chp->chip_prev_lgrp =
			    lcpr->chip_prev_lgrp;
			lcpr->chip_prev_lgrp->chip_next_lgrp =
			    chp;
			lcpr->chip_prev_lgrp = chp;
		}
		chp->chip_lgrp = my_lgrp;
		chp->chip_balance = chp->chip_next_lgrp;
		my_lgrp->lgrp_chipcnt++;
	}
}
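
/*
 * Illustration of the slot recycling implemented by lgrp_create() and
 * lgrp_destroy() below: if lgroups 3 and 5 are destroyed, lgrp_alloc_hint
 * is left at 3 (the lowest freed ID), so the next lgrp_create() starts its
 * search there and recycles slot 3 instead of growing the table.
 */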

lgrp_t *
lgrp_create(void)
{
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	int		i;

	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));

	/*
	 * Find an open slot in the lgroup table and recycle unused lgroup
	 * left there if any
	 */
	my_lgrp = NULL;
	if (lgrp_alloc_hint == -1)
		/*
		 * Allocate from end when hint not set yet because no lgroups
		 * have been deleted yet
		 */
		lgrpid = nlgrps++;
	else {
		/*
		 * Start looking for next open slot from hint and leave hint
		 * at slot allocated
		 */
		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
			my_lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(my_lgrp)) {
				lgrpid = i;
				nlgrps++;
				break;
			}
		}
		lgrp_alloc_hint = lgrpid;
	}

	/*
	 * Keep track of max lgroup ID allocated so far to cut down on searches
	 */
	if (lgrpid > lgrp_alloc_max)
		lgrp_alloc_max = lgrpid;

	/*
	 * Need to allocate new lgroup if next open slot didn't have one
	 * for recycling
	 */
	if (my_lgrp == NULL)
		my_lgrp = lgrp_plat_alloc(lgrpid);

	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
		panic("Too many lgrps for platform (%d)", nlgrps);

	my_lgrp->lgrp_id = lgrpid;
	my_lgrp->lgrp_latency = 0;
	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
	my_lgrp->lgrp_parent = NULL;
	my_lgrp->lgrp_childcnt = 0;
	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
	my_lgrp->lgrp_nmnodes = 0;
	klgrpset_clear(my_lgrp->lgrp_children);
	klgrpset_clear(my_lgrp->lgrp_leaves);
	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(my_lgrp->lgrp_set[i]);

	my_lgrp->lgrp_cpu = NULL;
	my_lgrp->lgrp_cpucnt = 0;
	my_lgrp->lgrp_chips = NULL;
	my_lgrp->lgrp_chipcnt = 0;

	if (my_lgrp->lgrp_kstat != NULL)
		lgrp_kstat_reset(lgrpid);

	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;

	return (my_lgrp);
}

void
lgrp_destroy(lgrp_t *lgrp)
{
	int	i;

	/*
	 * Unless this lgroup is being destroyed on behalf of
	 * the boot CPU, cpu_lock must be held
	 */
	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));

	if (nlgrps == 1)
		cmn_err(CE_PANIC, "Can't destroy only lgroup!");

	if (!LGRP_EXISTS(lgrp))
		return;

	/*
	 * Set hint to lgroup being deleted and try to keep lower numbered
	 * hints to facilitate finding empty slots
	 */
	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
		lgrp_alloc_hint = lgrp->lgrp_id;

	/*
	 * Mark this lgroup to be recycled by setting its lgroup ID to
	 * LGRP_NONE and clear relevant fields
	 */
	lgrp->lgrp_id = LGRP_NONE;
	lgrp->lgrp_latency = 0;
	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
	lgrp->lgrp_parent = NULL;
	lgrp->lgrp_childcnt = 0;

	klgrpset_clear(lgrp->lgrp_children);
	klgrpset_clear(lgrp->lgrp_leaves);
	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(lgrp->lgrp_set[i]);

	lgrp->lgrp_mnodes = (mnodeset_t)0;
	lgrp->lgrp_nmnodes = 0;

	lgrp->lgrp_cpu = NULL;
	lgrp->lgrp_cpucnt = 0;
	lgrp->lgrp_chipcnt = 0;
	lgrp->lgrp_chips = NULL;

	nlgrps--;
}

/*
 * Initialize kstat data.  Called from lgrp initialization code.
 */
static void
lgrp_kstat_init(void)
{
	lgrp_stat_t	stat;

	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);

	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
		kstat_named_init(&lgrp_kstat_data[stat],
		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
}

/*
 * initialize an lgrp's kstats if needed
 * called with cpu_lock held but not with cpus paused.
 * we don't tear these down now because we don't know about
 * memory leaving the lgrp yet...
 */

void
lgrp_kstat_create(cpu_t *cp)
{
	kstat_t		*lgrp_kstat;
	lgrp_id_t	lgrpid;
	lgrp_t		*my_lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrpid = cp->cpu_lpl->lpl_lgrpid;
	my_lgrp = lgrp_table[lgrpid];

	if (my_lgrp->lgrp_kstat != NULL)
		return;		/* already initialized */

	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);

	if (lgrp_kstat != NULL) {
		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
		lgrp_kstat->ks_private = my_lgrp;
		lgrp_kstat->ks_data = &lgrp_kstat_data;
		lgrp_kstat->ks_update = lgrp_kstat_extract;
		my_lgrp->lgrp_kstat = lgrp_kstat;
		kstat_install(lgrp_kstat);
	}
}

/*
 * this will do something when we manage to remove now unused lgrps
 */

/* ARGSUSED */
void
lgrp_kstat_destroy(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
}

/*
 * Called when a CPU is off-lined.
 */
static void
lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
{
	lgrp_t		*my_lgrp;
	struct cpu	*prev;
	struct cpu	*next;
	chip_t		*chp;

	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

	prev = cp->cpu_prev_lgrp;
	next = cp->cpu_next_lgrp;

	prev->cpu_next_lgrp = next;
	next->cpu_prev_lgrp = prev;

	/*
	 * just because I'm paranoid doesn't mean...
	 */

	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;

	my_lgrp = lgrp_table[lgrpid];
	my_lgrp->lgrp_cpucnt--;

	/*
	 * If the last CPU on its chip is being offlined
	 * then remove this chip from the per lgroup list.
	 *
	 * This is also done for the boot CPU when it needs
	 * to move between lgroups as a consequence of
	 * null proc lpa.
	 */
	chp = cp->cpu_chip;
	if (chp->chip_ncpu == 0 || !lgrp_initialized) {

		chip_t	*chpp;

		if (--my_lgrp->lgrp_chipcnt == 0)
			my_lgrp->lgrp_chips = NULL;
		else if (my_lgrp->lgrp_chips == chp)
			my_lgrp->lgrp_chips = chp->chip_next_lgrp;

		/*
		 * Walk this lgroup's chip list looking for chips that
		 * may try to balance against the one that's leaving
		 */
		for (chpp = chp->chip_next_lgrp; chpp != chp;
		    chpp = chpp->chip_next_lgrp) {
			if (chpp->chip_balance == chp)
				chpp->chip_balance = chp->chip_next_lgrp;
		}

		chp->chip_prev_lgrp->chip_next_lgrp = chp->chip_next_lgrp;
		chp->chip_next_lgrp->chip_prev_lgrp = chp->chip_prev_lgrp;

		chp->chip_next_lgrp = chp->chip_prev_lgrp = NULL;
		chp->chip_lgrp = NULL;
		chp->chip_balance = NULL;
	}

	/*
	 * Removing last CPU in lgroup, so update lgroup topology
	 */
	if (my_lgrp->lgrp_cpucnt == 0) {
		klgrpset_t	changed;
		int		count;
		int		i;

		my_lgrp->lgrp_cpu = NULL;

		/*
		 * Remove this lgroup from its lgroup CPU resources and remove
		 * lgroup from lgroup topology if it doesn't have any more
		 * resources in it now
		 */
		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
			count = 0;
			klgrpset_clear(changed);
			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
			    lgrp_alloc_max + 1, &changed);
			return;
		}

		/*
		 * This lgroup isn't empty, so just remove it from CPU
		 * resources of any lgroups that contain it as such
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
			    lgrpid))
				continue;

			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		}
		return;
	}

	if (my_lgrp->lgrp_cpu == cp)
		my_lgrp->lgrp_cpu = next;

}

/*
 * Update memory nodes in target lgroups and return ones that get changed
 */
int
lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
{
	int	count;
	int	i;
	int	j;
	lgrp_t	*lgrp;
	lgrp_t	*lgrp_rsrc;

	count = 0;
	if (changed)
		klgrpset_clear(*changed);

	if (klgrpset_isempty(target))
		return (0);

	/*
	 * Find each lgroup in target lgroups
	 */
	for (i = 0; i <= lgrp_alloc_max; i++) {
		/*
		 * Skip any lgroups that don't exist or aren't in target group
		 */
		lgrp = lgrp_table[i];
		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
			continue;
		}

		/*
		 * Initialize memnodes for intermediate lgroups to 0
		 * and update them from scratch since they may have completely
		 * changed
		 */
		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
			lgrp->lgrp_mnodes = (mnodeset_t)0;
			lgrp->lgrp_nmnodes = 0;
		}

		/*
		 * Update memory nodes of target lgroup with memory nodes
		 * from each lgroup in its lgroup memory resource set
		 */
		for (j = 0; j <= lgrp_alloc_max; j++) {
			int	k;

			/*
			 * Skip any lgroups that don't exist or
			 * aren't in memory resources of target lgroup
			 */
			lgrp_rsrc = lgrp_table[j];
			if (!LGRP_EXISTS(lgrp_rsrc) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
			    j))
				continue;

			/*
			 * Update target lgroup's memnodes to include memnodes
			 * of this lgroup
			 */
			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
				mnodeset_t	mnode_mask;

				mnode_mask = (mnodeset_t)1 << k;
				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
				    !(lgrp->lgrp_mnodes & mnode_mask)) {
					lgrp->lgrp_mnodes |= mnode_mask;
					lgrp->lgrp_nmnodes++;
				}
			}
			count++;
			if (changed)
				klgrpset_add(*changed, lgrp->lgrp_id);
		}
	}

	return (count);
}
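
/*
 * Illustration: lgrp_mnodes is a bitmask of memory nodes, so an lgroup
 * whose memory resources span mnodes 0 and 2 ends up after
 * lgrp_mnode_update() with lgrp_mnodes == 0x5 and lgrp_nmnodes == 2.
 */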
/*
 * Called to indicate that the lgrp with platform handle "hand" now
 * contains the memory identified by "mnode".
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the insertion is part of the DR copy-rename and the inserted mnode (and
 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
 * dealing with the special case of DR copy-rename described in
 * lgrp_mem_rename().
 */
void
lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
	klgrpset_t	changed;
	int		count;
	int		i;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
	boolean_t	drop_lock = B_FALSE;
	boolean_t	need_synch = B_FALSE;

	/*
	 * Grab CPU lock (if we haven't already)
	 */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		drop_lock = B_TRUE;
	}

	/*
	 * This routine may be called from a context where we already
	 * hold cpu_lock, and have already paused cpus.
	 */
	if (!cpus_paused())
		need_synch = B_TRUE;

	/*
	 * Check if this mnode is already configured and return immediately if
	 * it is.
	 *
	 * NOTE: in special case of copy-rename of the only remaining mnode,
	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
	 * recognize this case and continue as usual, but skip the update to
	 * the lgrp_mnodes and the lgrp_nmnodes. This resolves the inconsistency
	 * in topology that was temporarily introduced by lgrp_mem_fini().
	 */
	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
	    lgrp_root->lgrp_mnodes & mnodes_mask) {
		if (drop_lock)
			mutex_exit(&cpu_lock);
		return;
	}

	/*
	 * Update lgroup topology with new memory resources, keeping track of
	 * which lgroups change
	 */
	count = 0;
	klgrpset_clear(changed);
	my_lgrp = lgrp_hand_to_lgrp(hand);
	if (my_lgrp == NULL) {
		/* new lgrp */
		my_lgrp = lgrp_create();
		lgrpid = my_lgrp->lgrp_id;
		my_lgrp->lgrp_plathand = hand;
		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);

		if (need_synch)
			pause_cpus(NULL);
		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);
		if (need_synch)
			start_cpus();
	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
	    > 0) {
		/*
		 * Leaf lgroup was created, but latency wasn't available
		 * then. So, set latency for it and fill in rest of lgroup
		 * topology now that we know how far it is from other leaf
		 * lgroups.
		 */
		klgrpset_clear(changed);
		lgrpid = my_lgrp->lgrp_id;
		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
		    lgrpid))
			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		if (need_synch)
			pause_cpus(NULL);
		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);
		if (need_synch)
			start_cpus();
	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
	    my_lgrp->lgrp_id)) {
		/*
		 * Add new lgroup memory resource to existing lgroup
		 */
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		klgrpset_add(changed, lgrpid);
		count++;
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
				continue;

			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
			klgrpset_add(changed, lgrp->lgrp_id);
			count++;
		}
	}

	/*
	 * Add memory node to lgroup and remove lgroup from ones that need
	 * to be updated
	 */
	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
		my_lgrp->lgrp_mnodes |= mnodes_mask;
		my_lgrp->lgrp_nmnodes++;
	}
	klgrpset_del(changed, lgrpid);

	/*
	 * Update memory node information for all lgroups that changed and
	 * contain new memory node as a resource
	 */
	if (count)
		(void) lgrp_mnode_update(changed, NULL);

	if (drop_lock)
		mutex_exit(&cpu_lock);
}
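
/*
 * Worked example (illustrative only): mnodeset_t is a bit mask with one
 * bit per memory node, so for mnode 3,
 *
 *	mnodes_mask = (mnodeset_t)1 << 3;	(i.e. 0x8)
 *
 * and the "only remaining mnode" special case tested above reduces to
 * (lgrp_root->lgrp_mnodes == 0x8): the root holds exactly this one
 * memory node and nothing else, which is the DR copy-rename window
 * described in lgrp_mem_rename().
 */
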
/*
 * Called to indicate that the lgroup associated with the platform
 * handle "hand" no longer contains given memory node
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the deletion is part of the DR copy-rename and the deleted mnode is the
 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
 * the same mnode back into the topology. See lgrp_mem_rename() and
 * lgrp_mem_init() for additional details.
 */
void
lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
	klgrpset_t	changed;
	int		count;
	int		i;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	mnodeset_t	mnodes_mask;
	boolean_t	drop_lock = B_FALSE;
	boolean_t	need_synch = B_FALSE;

	/*
	 * Grab CPU lock (if we haven't already)
	 */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		drop_lock = B_TRUE;
	}

	/*
	 * This routine may be called from a context where we already
	 * hold cpu_lock and have already paused cpus.
	 */
	if (!cpus_paused())
		need_synch = B_TRUE;

	my_lgrp = lgrp_hand_to_lgrp(hand);

	/*
	 * The lgrp *must* be pre-existing
	 */
	ASSERT(my_lgrp != NULL);

	/*
	 * Delete memory node from lgroups which contain it
	 */
	mnodes_mask = ((mnodeset_t)1 << mnode);
	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_t *lgrp = lgrp_table[i];
		/*
		 * Skip any non-existent lgroups and any lgroups that don't
		 * contain this memory node as a memory resource
		 */
		if (!LGRP_EXISTS(lgrp) ||
		    !(lgrp->lgrp_mnodes & mnodes_mask))
			continue;

		/*
		 * Avoid removing the last mnode from the root in the DR
		 * copy-rename case. See lgrp_mem_rename() for details.
		 */
		if (is_copy_rename &&
		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
			continue;

		/*
		 * Remove memory node from lgroup.
		 */
		lgrp->lgrp_mnodes &= ~mnodes_mask;
		lgrp->lgrp_nmnodes--;
		ASSERT(lgrp->lgrp_nmnodes >= 0);
	}
	ASSERT(lgrp_root->lgrp_nmnodes > 0);

	/*
	 * Don't need to update lgroup topology if this lgroup still has memory.
	 *
	 * In the special case of DR copy-rename with the only mnode being
	 * removed, the lgrp_mnodes for the root is always non-zero, but we
	 * still need to update the lgroup topology.
	 */
	if ((my_lgrp->lgrp_nmnodes > 0) &&
	    !(is_copy_rename &&
	    (my_lgrp == lgrp_root) &&
	    (my_lgrp->lgrp_mnodes == mnodes_mask))) {
		if (drop_lock)
			mutex_exit(&cpu_lock);
		return;
	}

	/*
	 * This lgroup does not contain any memory now
	 */
	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);

	/*
	 * Remove this lgroup from lgroup topology if it does not contain any
	 * resources now
	 */
	lgrpid = my_lgrp->lgrp_id;
	count = 0;
	klgrpset_clear(changed);
	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
		/*
		 * Delete lgroup when no more resources
		 */
		if (need_synch)
			pause_cpus(NULL);
		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
		    lgrp_alloc_max + 1, &changed);
		ASSERT(count > 0);
		if (need_synch)
			start_cpus();
	} else {
		/*
		 * Remove lgroup from memory resources of any lgroups that
		 * contain it as such
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
			    lgrpid))
				continue;

			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		}
	}
	if (drop_lock)
		mutex_exit(&cpu_lock);
}

/*
 * Return lgroup with given platform handle
 */
lgrp_t *
lgrp_hand_to_lgrp(lgrp_handle_t hand)
{
	int	i;
	lgrp_t	*lgrp;

	if (hand == LGRP_NULL_HANDLE)
		return (NULL);

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp = lgrp_table[i];
		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
			return (lgrp);
	}
	return (NULL);
}
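
/*
 * Illustrative sketch (not part of the original source): platform code
 * holding a handle can map it back to the kernel's lgroup with
 * lgrp_hand_to_lgrp(), checking for the case where no lgroup has been
 * created for that handle yet:
 *
 *	lgrp_t *lgrp = lgrp_hand_to_lgrp(hand);
 *	if (lgrp == NULL) {
 *		(handle not yet represented in the topology)
 *	}
 */
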
/*
 * Return the home lgroup of the current thread.
 * We must do this with kernel preemption disabled, since we don't want our
 * thread to be re-homed while we're poking around with its lpl, and the lpl
 * should never be NULL.
 *
 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
 * is enabled because of DR. Callers can disable kernel preemption
 * around this call to guarantee that the lgroup will be valid beyond this
 * routine, since kernel preemption can be recursive.
 */
lgrp_t *
lgrp_home_lgrp(void)
{
	lgrp_t	*lgrp;
	lpl_t	*lpl;

	kpreempt_disable();

	lpl = curthread->t_lpl;
	ASSERT(lpl != NULL);
	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
	lgrp = lgrp_table[lpl->lpl_lgrpid];

	kpreempt_enable();

	return (lgrp);
}
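
/*
 * Illustrative sketch (not part of the original source): a caller that
 * must keep the returned lgroup valid while using it can wrap the call
 * in its own preemption-disabled section, which works because kernel
 * preemption can be recursive, as noted above:
 *
 *	kpreempt_disable();
 *	lgrp = lgrp_home_lgrp();
 *	(use lgrp; DR cannot tear it down while preemption is disabled)
 *	kpreempt_enable();
 */
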
/*
 * Return ID of home lgroup for given thread
 * (See comments for lgrp_home_lgrp() for special care and handling
 * instructions)
 */
lgrp_id_t
lgrp_home_id(kthread_t *t)
{
	lgrp_id_t	lgrp;
	lpl_t		*lpl;

	ASSERT(t != NULL);
	/*
	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
	 * cannot since the HAT layer can call into this routine to
	 * determine the locality for its data structures in the context
	 * of a page fault.
	 */

	kpreempt_disable();

	lpl = t->t_lpl;
	ASSERT(lpl != NULL);
	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
	lgrp = lpl->lpl_lgrpid;

	kpreempt_enable();

	return (lgrp);
}

/*
 * Return lgroup containing the physical memory for the given page frame number
 */
lgrp_t *
lgrp_pfn_to_lgrp(pfn_t pfn)
{
	lgrp_handle_t	hand;
	int		i;
	lgrp_t		*lgrp;

	hand = lgrp_plat_pfn_to_hand(pfn);
	if (hand != LGRP_NULL_HANDLE)
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp = lgrp_table[i];
			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
				return (lgrp);
		}
	return (NULL);
}

/*
 * Return lgroup containing the physical memory for the given physical address
 */
lgrp_t *
lgrp_phys_to_lgrp(u_longlong_t physaddr)
{
	lgrp_handle_t	hand;
	int		i;
	lgrp_t		*lgrp;
	pfn_t		pfn;

	pfn = btop(physaddr);
	hand = lgrp_plat_pfn_to_hand(pfn);
	if (hand != LGRP_NULL_HANDLE)
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp = lgrp_table[i];
			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
				return (lgrp);
		}
	return (NULL);
}

/*
 * Return the leaf lgroup containing the given CPU
 *
 * The caller needs to take precautions necessary to prevent
 * "cpu" from going away across a call to this function.
 * hint: kpreempt_disable()/kpreempt_enable()
 */
static lgrp_t *
lgrp_cpu_to_lgrp(cpu_t *cpu)
{
	return (cpu->cpu_lpl->lpl_lgrp);
}

/*
 * Return the sum of the partition loads in an lgrp divided by
 * the number of CPUs in the lgrp. This is our best approximation
 * of an 'lgroup load average' for a useful per-lgroup kstat.
 */
static uint64_t
lgrp_sum_loadavgs(lgrp_t *lgrp)
{
	cpu_t		*cpu;
	int		ncpu;
	uint64_t	loads = 0;

	mutex_enter(&cpu_lock);

	cpu = lgrp->lgrp_cpu;
	ncpu = lgrp->lgrp_cpucnt;

	if (cpu == NULL || ncpu == 0) {
		mutex_exit(&cpu_lock);
		return (0ull);
	}

	do {
		loads += cpu->cpu_lpl->lpl_loadavg;
		cpu = cpu->cpu_next_lgrp;
	} while (cpu != lgrp->lgrp_cpu);

	mutex_exit(&cpu_lock);

	return (loads / ncpu);
}

void
lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
{
	struct lgrp_stats *pstats;

	/*
	 * Verify that the caller isn't trying to add to
	 * a statistic for an lgroup that has gone away
	 */
	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
		return;

	pstats = &lgrp_stats[lgrpid];
	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
}

int64_t
lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
{
	uint64_t	val;
	struct lgrp_stats *pstats;

	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
		return ((int64_t)0);

	pstats = &lgrp_stats[lgrpid];
	LGRP_STAT_READ(pstats, stat, val);
	return (val);
}

/*
 * Reset all kstats for lgrp specified by its lgrpid.
 */
static void
lgrp_kstat_reset(lgrp_id_t lgrpid)
{
	lgrp_stat_t	stat;

	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
		return;

	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
	}
}
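
/*
 * Illustrative sketch (not part of the original source): kernel code
 * accounts an event against a thread's home lgroup by bumping the
 * matching counter, e.g. for a hypothetical stat LGRP_NUM_FOO:
 *
 *	lgrp_stat_add(lgrp_home_id(curthread), LGRP_NUM_FOO, 1);
 *
 * The update is a single atomic_add_64(), so no lock is required, and
 * out-of-range lgroup IDs are silently ignored by lgrp_stat_add().
 */
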
/*
 * Collect all per-lgrp statistics for the lgrp associated with this
 * kstat, and store them in the ks_data array.
 *
 * The superuser can reset all the running counter statistics for an
 * lgrp by writing to any of the lgrp's stats.
 */
static int
lgrp_kstat_extract(kstat_t *ksp, int rw)
{
	lgrp_stat_t		stat;
	struct kstat_named	*ksd;
	lgrp_t			*lgrp;
	lgrp_id_t		lgrpid;

	lgrp = (lgrp_t *)ksp->ks_private;

	ksd = (struct kstat_named *)ksp->ks_data;
	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);

	lgrpid = lgrp->lgrp_id;

	if (lgrpid == LGRP_NONE) {
		/*
		 * Return all zeroes as stats for freed lgrp.
		 */
		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
			ksd[stat].value.i64 = 0;
		}
		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
	} else if (rw != KSTAT_WRITE) {
		/*
		 * Handle counter stats
		 */
		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
		}

		/*
		 * Handle kernel data snapshot stats
		 */
		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
	} else {
		lgrp_kstat_reset(lgrpid);
	}

	return (0);
}

int
lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
{
	cpu_t	*cp;

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	ASSERT(cp->cpu_lpl != NULL);

	*lp = cp->cpu_lpl->lpl_lgrpid;

	mutex_exit(&cpu_lock);

	return (0);
}
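
/*
 * Illustrative sketch (not part of the original source): a caller can
 * look up the lgroup of a CPU by processor ID and tolerate offline or
 * nonexistent CPUs, since lgrp_query_cpu() returns EINVAL for those:
 *
 *	lgrp_id_t home;
 *
 *	if (lgrp_query_cpu(id, &home) == 0) {
 *		("home" is now the lgroup ID of CPU "id")
 *	}
 */
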
int
lgrp_query_load(processorid_t id, lgrp_load_t *lp)
{
	cpu_t	*cp;

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	ASSERT(cp->cpu_lpl != NULL);

	*lp = cp->cpu_lpl->lpl_loadavg;

	mutex_exit(&cpu_lock);

	return (0);
}

void
lgrp_latency_change(u_longlong_t oldtime, u_longlong_t newtime)
{
	lgrp_t		*lgrp;
	int		i;

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp = lgrp_table[i];

		if (LGRP_EXISTS(lgrp) && (lgrp->lgrp_latency == oldtime))
			lgrp->lgrp_latency = (int)newtime;
	}
}
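
/*
 * Worked example (illustrative only, with hypothetical latency values):
 * if the platform remeasures latencies and calls
 *
 *	lgrp_latency_change(90, 120);
 *
 * then every existing lgroup whose lgrp_latency is exactly 90 is
 * rewritten to 120; lgroups at other latencies are left untouched.
 */
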
/*
 * Add a resource named by lpl_leaf to rset of lpl_target
 *
 * This routine also adjusts ncpu and nrset if the call succeeds in adding a
 * resource. They are adjusted here, as this is presently the only place that
 * we can be certain a resource addition has succeeded.
 *
 * We keep the list of rsets sorted so that the dispatcher can quickly walk the
 * list in order until it reaches a NULL. (This list is required to be NULL
 * terminated, too.) This is done so that we can mark start pos + 1, so that
 * each lpl is traversed sequentially, but in a different order. We hope this
 * will improve performance a bit. (Hopefully, less read-to-own traffic...)
 */
void
lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int		i;
	int		entry_slot = 0;

	/* return if leaf is already present */
	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf) {
			return;
		}

		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
		    lpl_leaf->lpl_lgrpid) {
			break;
		}
	}

	/* insert leaf, update counts */
	entry_slot = i;
	i = lpl_target->lpl_nrset++;
	if (lpl_target->lpl_nrset >= LPL_RSET_MAX) {
		panic("More leaf lgrps in system than are supported!\n");
	}

	/*
	 * Start at the end of the rset array and work backwards towards the
	 * slot into which the new lpl will be inserted. This effectively
	 * preserves the current ordering by scooting everybody over one entry,
	 * and placing the new entry into the space created.
	 */
	while (i-- > entry_slot) {
		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
	}

	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
}

/*
 * Update each of lpl_parent's children with a proper hint and
 * a reference to their parent.
 * The lgrp topology is used as the reference since it is fully
 * consistent and correct at this point.
 *
 * Each child's hint will reference an element in lpl_parent's
 * rset that designates where the child should start searching
 * for CPU resources. The hint selected is the highest order leaf present
 * in the child's lineage.
 *
 * This should be called after any potential change in lpl_parent's
 * rset.
 */
static void
lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
{
	klgrpset_t	children, leaves;
	lpl_t		*lpl;
	int		hint;
	int		i, j;

	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
	if (klgrpset_isempty(children))
		return;	/* nothing to do */

	for (i = 0; i <= lgrp_alloc_max; i++) {
		if (klgrpset_ismember(children, i)) {
			/*
			 * Given the set of leaves in this child's lineage,
			 * find the highest order leaf present in the parent's
			 * rset. Select this as the hint for the child.
			 */
			leaves = lgrp_table[i]->lgrp_leaves;
			hint = 0;
			for (j = 0; j < lpl_parent->lpl_nrset; j++) {
				lpl = lpl_parent->lpl_rset[j];
				if (klgrpset_ismember(leaves, lpl->lpl_lgrpid))
					hint = j;
			}
			cp->cp_lgrploads[i].lpl_hint = hint;

			/*
			 * (Re)set the parent. It may be incorrect if
			 * lpl_parent is new in the topology.
			 */
			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
		}
	}
}

/*
 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
 *
 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
 * resource. The values are adjusted here, as this is the only place that we can
 * be certain a resource was successfully deleted.
 */
void
lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int		i;

	/* find leaf in intermediate node */
	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf)
			break;
	}

	/* return if leaf not found */
	if (lpl_target->lpl_rset[i] != lpl_leaf)
		return;

	/* prune leaf, compress array */
	ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX);
	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
	lpl_target->lpl_ncpu--;
	do {
		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
	} while (i++ < lpl_target->lpl_nrset);
}
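
/*
 * Worked example (illustrative only): with leaves homed to lgroups 1, 3
 * and 5, an intermediate lpl's rset is kept sorted and NULL terminated:
 *
 *	lpl_rset[] = { lpl_1, lpl_3, lpl_5, NULL, ... }
 *
 * lpl_rset_add() of a leaf for lgroup 2 shifts lpl_3 and lpl_5 up one
 * slot and stores the new leaf at index 1; lpl_rset_del() of the same
 * leaf compresses the array back down, preserving both the ordering and
 * the NULL terminator that the dispatcher depends on.
 */
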
/*
 * Check to see if the resource set of the target lpl contains the
 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not.
 */
int
lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int	i;

	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf)
			return (1);
	}

	return (0);
}

/*
 * Called when we change cpu lpl membership. This increments or decrements the
 * per-cpu counter in every lpl in which our leaf appears.
 */
void
lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
{
	cpupart_t	*cpupart;
	lgrp_t		*lgrp_leaf;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_leaf;
	lpl_t		*lpl_cur;
	int		i;

	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);

	cpupart = cp->cpu_part;
	lpl_leaf = cp->cpu_lpl;
	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_cur = lgrp_table[i];

		/*
		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
		 * for the cpu in question, or if the current lgrp and leaf
		 * don't share the same resources.
		 */
		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
			continue;

		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];

		if (lpl_cur->lpl_nrset > 0) {
			if (act == LPL_INCREMENT) {
				lpl_cur->lpl_ncpu++;
			} else if (act == LPL_DECREMENT) {
				lpl_cur->lpl_ncpu--;
			}
		}
	}
}

/*
 * Initialize lpl with given resources and specified lgrp
 */
void
lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
{
	lpl->lpl_lgrpid = lgrp->lgrp_id;
	lpl->lpl_loadavg = 0;
	if (lpl == lpl_leaf)
		lpl->lpl_ncpu = 1;
	else
		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
	lpl->lpl_nrset = 1;
	lpl->lpl_rset[0] = lpl_leaf;
	lpl->lpl_lgrp = lgrp;
	lpl->lpl_parent = NULL;		/* set by lpl_leaf_insert() */
	lpl->lpl_cpus = NULL;		/* set by lgrp_part_add_cpu() */
}

/*
 * Clear an unused lpl
 */
void
lpl_clear(lpl_t *lpl)
{
	lgrp_id_t	lid;

	/* save lid for debugging purposes */
	lid = lpl->lpl_lgrpid;
	bzero(lpl, sizeof (lpl_t));
	lpl->lpl_lgrpid = lid;
}
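
/*
 * Illustrative sketch (not part of the original source): a reused
 * intermediate lpl typically starts life scrubbed and seeded with a
 * single leaf, then gets wired into the hierarchy, mirroring what
 * lpl_leaf_insert() below does:
 *
 *	lpl_clear(lpl);			(scrub stale state, keep lgrpid)
 *	lpl_init(lpl, lpl_leaf, lgrp);	(one rset entry: the leaf)
 *	lpl->lpl_parent = lpl_parent;	(set by the caller)
 *
 * lpl_leaf_insert() additionally preserves lpl_hint across the
 * reinitialization.
 */
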
/*
 * Given a CPU-partition, verify that the lpl topology in the CPU-partition
 * is in sync with the lgroup topology in the system. The lpl topology may not
 * make full use of all of the lgroup topology, but this checks to make sure
 * that for the parts that it does use, it has correctly understood the
 * relationships that exist. This function returns 0 if the topology is
 * correct, and a non-zero error code if it is incorrect, even on non-debug
 * kernels. Asserts are spread throughout the code to aid in debugging on a
 * DEBUG kernel.
 */
int
lpl_topo_verify(cpupart_t *cpupart)
{
	lgrp_t		*lgrp;
	lpl_t		*lpl;
	klgrpset_t	rset;
	klgrpset_t	cset;
	cpu_t		*cpu;
	cpu_t		*cp_start;
	int		i;
	int		j;
	int		sum;

	/* topology can't be incorrect if it doesn't exist */
	if (!lgrp_topo_initialized || !lgrp_initialized)
		return (LPL_TOPO_CORRECT);

	ASSERT(cpupart != NULL);

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp = lgrp_table[i];
		lpl = NULL;
		/* make sure lpls are allocated */
		ASSERT(cpupart->cp_lgrploads);
		if (!cpupart->cp_lgrploads)
			return (LPL_TOPO_PART_HAS_NO_LPL);

		lpl = &cpupart->cp_lgrploads[i];
		/* make sure our index is good */
		ASSERT(i < cpupart->cp_nlgrploads);

		/* if lgroup doesn't exist, make sure lpl is empty */
		if (!LGRP_EXISTS(lgrp)) {
			ASSERT(lpl->lpl_ncpu == 0);
			if (lpl->lpl_ncpu > 0) {
				return (LPL_TOPO_CPUS_NOT_EMPTY);
			} else {
				continue;
			}
		}

		/* verify that lgroup and lpl are identically numbered */
		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);

		/* if lgroup isn't in our partition, make sure lpl is empty */
		if (!klgrpset_intersects(lgrp->lgrp_leaves,
		    cpupart->cp_lgrpset)) {
			ASSERT(lpl->lpl_ncpu == 0);
			if (lpl->lpl_ncpu > 0) {
				return (LPL_TOPO_CPUS_NOT_EMPTY);
			}
			/*
			 * lpl is empty, and lgroup isn't in partition. verify
			 * that lpl doesn't show up in anyone else's rsets (in
			 * this partition, anyway)
			 */
			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
				lpl_t *i_lpl;	/* lpl we're iterating over */

				i_lpl = &cpupart->cp_lgrploads[j];

				ASSERT(!lpl_rset_contains(i_lpl, lpl));
				if (lpl_rset_contains(i_lpl, lpl)) {
					return (LPL_TOPO_LPL_ORPHANED);
				}
			}
			/* lgroup is empty, and everything is ok. continue */
			continue;
		}

		/* lgroup is in this partition, now check it against lpl */

		/* do both have matching lgrps? */
		ASSERT(lgrp == lpl->lpl_lgrp);
		if (lgrp != lpl->lpl_lgrp) {
			return (LPL_TOPO_LGRP_MISMATCH);
		}

		/* do the parent lgroups exist and do they match? */
		if (lgrp->lgrp_parent) {
			ASSERT(lpl->lpl_parent);
			ASSERT(lgrp->lgrp_parent->lgrp_id ==
			    lpl->lpl_parent->lpl_lgrpid);

			if (!lpl->lpl_parent) {
				return (LPL_TOPO_MISSING_PARENT);
			} else if (lgrp->lgrp_parent->lgrp_id !=
			    lpl->lpl_parent->lpl_lgrpid) {
				return (LPL_TOPO_PARENT_MISMATCH);
			}
		}

		/* only leaf lgroups keep a cpucnt, only check leaves */
		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {

			/* verify that lgrp is also a leaf */
			ASSERT((lgrp->lgrp_childcnt == 0) &&
			    (klgrpset_ismember(lgrp->lgrp_leaves,
			    lpl->lpl_lgrpid)));

			if ((lgrp->lgrp_childcnt > 0) ||
			    (!klgrpset_ismember(lgrp->lgrp_leaves,
			    lpl->lpl_lgrpid))) {
				return (LPL_TOPO_LGRP_NOT_LEAF);
			}

			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
			    (lpl->lpl_ncpu > 0));
			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
			    (lpl->lpl_ncpu <= 0)) {
				return (LPL_TOPO_BAD_CPUCNT);
			}

			/*
			 * Check that lpl_ncpu also matches the number of
			 * cpus in the lpl's linked list. This only exists in
			 * leaves, but they should always match.
			 */
			j = 0;
			cpu = cp_start = lpl->lpl_cpus;
			while (cpu != NULL) {
				j++;

				/* check to make sure cpu's lpl is leaf lpl */
				ASSERT(cpu->cpu_lpl == lpl);
				if (cpu->cpu_lpl != lpl) {
					return (LPL_TOPO_CPU_HAS_BAD_LPL);
				}

				/* check next cpu */
				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
					continue;
				} else {
					cpu = NULL;
				}
			}

			ASSERT(j == lpl->lpl_ncpu);
			if (j != lpl->lpl_ncpu) {
				return (LPL_TOPO_LPL_BAD_NCPU);
			}

			/*
			 * Also, check that leaf lpl is contained in all
			 * intermediate lpls that name the leaf as a descendant
			 */
			for (j = 0; j <= lgrp_alloc_max; j++) {
				klgrpset_t	intersect;
				lgrp_t		*lgrp_cand;
				lpl_t		*lpl_cand;

				lgrp_cand = lgrp_table[j];
				intersect = klgrpset_intersects(
				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
				    cpupart->cp_lgrpset);

				if (!LGRP_EXISTS(lgrp_cand) ||
				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
				    cpupart->cp_lgrpset) ||
				    (intersect == 0))
					continue;

				lpl_cand =
				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];

				if (klgrpset_ismember(intersect,
				    lgrp->lgrp_id)) {
					ASSERT(lpl_rset_contains(lpl_cand,
					    lpl));

					if (!lpl_rset_contains(lpl_cand, lpl)) {
						return (LPL_TOPO_RSET_MSSNG_LF);
					}
				}
			}
		} else {	/* non-leaf specific checks */
			/*
			 * Non-leaf lpls should have lpl_cpus == NULL;
			 * verify that this is so
			 */
			ASSERT(lpl->lpl_cpus == NULL);
			if (lpl->lpl_cpus != NULL) {
				return (LPL_TOPO_NONLEAF_HAS_CPUS);
			}

			/*
			 * verify that the sum of the cpus in the leaf resources
			 * is equal to the total ncpu in the intermediate
			 */
			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
				sum += lpl->lpl_rset[j]->lpl_ncpu;
			}

			ASSERT(sum == lpl->lpl_ncpu);
			if (sum != lpl->lpl_ncpu) {
				return (LPL_TOPO_LPL_BAD_NCPU);
			}
		}
		/*
		 * check on lpl_hint. Don't check root, since it has no parent.
		 */
		if (lpl->lpl_parent != NULL) {
			int	hint;
			lpl_t	*hint_lpl;

			/* make sure hint is within limits of nrset */
			hint = lpl->lpl_hint;
			ASSERT(lpl->lpl_parent->lpl_nrset >= hint);
			if (lpl->lpl_parent->lpl_nrset < hint) {
				return (LPL_TOPO_BOGUS_HINT);
			}

			/* make sure hint points to valid lpl */
			hint_lpl = lpl->lpl_parent->lpl_rset[hint];
			ASSERT(hint_lpl->lpl_ncpu > 0);
			if (hint_lpl->lpl_ncpu <= 0) {
				return (LPL_TOPO_BOGUS_HINT);
			}
		}

		/*
		 * Check the rset of the lpl in question. Make sure that each
		 * rset contains a subset of the resources in
		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes
		 * sure that each rset doesn't include resources that are
		 * outside of that set. (Which would be resources somehow not
		 * accounted for.)
		 */
		klgrpset_clear(rset);
		for (j = 0; j < lpl->lpl_nrset; j++) {
			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
		}
		klgrpset_copy(cset, rset);
		/* make sure lpl rset matches lgrp rset */
		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
		/* make sure rset is contained within the partition, too */
		klgrpset_diff(cset, cpupart->cp_lgrpset);

		ASSERT(klgrpset_isempty(rset) &&
		    klgrpset_isempty(cset));
		if (!klgrpset_isempty(rset) ||
		    !klgrpset_isempty(cset)) {
			return (LPL_TOPO_RSET_MISMATCH);
		}

		/*
		 * check to make sure lpl_nrset matches the number of rsets
		 * contained in the lpl
		 */
		for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX);
		    j++)
			;

		ASSERT(j == lpl->lpl_nrset);
		if (j != lpl->lpl_nrset) {
			return (LPL_TOPO_BAD_RSETCNT);
		}
	}
	return (LPL_TOPO_CORRECT);
}
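
/*
 * Illustrative sketch (not part of the original source): callers that
 * rearrange the lpl hierarchy would typically re-verify it afterwards.
 * On a DEBUG kernel the internal ASSERTs fire first; non-DEBUG kernels
 * rely on the return code:
 *
 *	ASSERT(lpl_topo_verify(cpupart) == LPL_TOPO_CORRECT);
 */
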
/*
 * Flatten lpl topology to given number of levels. This is presently only
 * implemented for a flatten to 2 levels, which will prune out the intermediates
 * and home the leaf lpls to the root lpl.
 */
int
lpl_topo_flatten(int levels)
{
	int		i;
	uint_t		sum;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_cur;
	lpl_t		*lpl_root;
	cpupart_t	*cp;

	if (levels != 2)
		return (0);

	/* called w/ cpus paused - grab no locks! */
	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
	    !lgrp_initialized);

	cp = cp_list_head;
	do {
		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));

		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_cur = lgrp_table[i];
			lpl_cur = &cp->cp_lgrploads[i];

			if ((lgrp_cur == lgrp_root) ||
			    (!LGRP_EXISTS(lgrp_cur) &&
			    (lpl_cur->lpl_ncpu == 0)))
				continue;

			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
				/*
				 * this should be a deleted intermediate, so
				 * clear it
				 */
				lpl_clear(lpl_cur);
			} else if ((lpl_cur->lpl_nrset == 1) &&
			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
				/*
				 * this is a leaf whose parent was deleted, or
				 * whose parent had their lgrp deleted. (And
				 * whose parent will soon be deleted). Point
				 * this guy back to the root lpl.
				 */
				lpl_cur->lpl_parent = lpl_root;
				lpl_rset_add(lpl_root, lpl_cur);
			}
		}

		/*
		 * Now that we're done, make sure the count on the root lpl is
		 * correct, and update the hints of the children for the sake of
		 * thoroughness
		 */
		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
		}
		lpl_root->lpl_ncpu = sum;
		lpl_child_update(lpl_root, cp);

		cp = cp->cp_next;
	} while (cp != cp_list_head);

	return (levels);
}
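
/*
 * Worked example (illustrative only): flattening a three-level hierarchy
 *
 *	root -> intermediate -> { leaf_a, leaf_b }
 *
 * with lpl_topo_flatten(2) clears the intermediate lpls and re-parents
 * leaf_a and leaf_b directly to the root lpl, after which the root's
 * lpl_ncpu is recomputed as the sum over its (now leaf-only) rset.
 */
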
24310Sstevel@tonic-gate * 24320Sstevel@tonic-gate * This routine is clever enough that it can correctly add resources from the 24330Sstevel@tonic-gate * new leaf into both direct and indirect resource sets in the hierarchy. (Ie, 24340Sstevel@tonic-gate * those for which the lpl is a leaf as opposed to simply a named equally local 24350Sstevel@tonic-gate * resource). The one special case that needs additional processing is when a 24360Sstevel@tonic-gate * new intermediate lpl is introduced. Since the main loop only traverses 24370Sstevel@tonic-gate * looking to add the leaf resource where it does not yet exist, additional work 24380Sstevel@tonic-gate * is necessary to add other leaf resources that may need to exist in the newly 24390Sstevel@tonic-gate * created intermediate. This is performed by the second inner loop, and is 24400Sstevel@tonic-gate * only done when the check for more than one overlapping resource succeeds. 24410Sstevel@tonic-gate */ 24420Sstevel@tonic-gate 24430Sstevel@tonic-gate void 24440Sstevel@tonic-gate lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart) 24450Sstevel@tonic-gate { 24460Sstevel@tonic-gate int i; 24470Sstevel@tonic-gate int j; 24480Sstevel@tonic-gate int hint; 24490Sstevel@tonic-gate int rset_num_intersect; 24500Sstevel@tonic-gate lgrp_t *lgrp_cur; 24510Sstevel@tonic-gate lpl_t *lpl_cur; 24520Sstevel@tonic-gate lpl_t *lpl_parent; 24531892Sesaxe lgrp_id_t parent_id; 24540Sstevel@tonic-gate klgrpset_t rset_intersect; /* resources in cpupart and lgrp */ 24550Sstevel@tonic-gate 24560Sstevel@tonic-gate for (i = 0; i <= lgrp_alloc_max; i++) { 24570Sstevel@tonic-gate lgrp_cur = lgrp_table[i]; 24580Sstevel@tonic-gate 24590Sstevel@tonic-gate /* 24600Sstevel@tonic-gate * Don't insert if the lgrp isn't there, if the leaf isn't 24610Sstevel@tonic-gate * contained within the current lgrp, or if the current lgrp has 24620Sstevel@tonic-gate * no leaves in this partition 24630Sstevel@tonic-gate */ 24640Sstevel@tonic-gate 24650Sstevel@tonic-gate if (!LGRP_EXISTS(lgrp_cur) || 24660Sstevel@tonic-gate !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 24670Sstevel@tonic-gate lpl_leaf->lpl_lgrpid) || 24680Sstevel@tonic-gate !klgrpset_intersects(lgrp_cur->lgrp_leaves, 24690Sstevel@tonic-gate cpupart->cp_lgrpset)) 24700Sstevel@tonic-gate continue; 24710Sstevel@tonic-gate 24720Sstevel@tonic-gate lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 24730Sstevel@tonic-gate if (lgrp_cur->lgrp_parent != NULL) { 24740Sstevel@tonic-gate /* if lgrp has a parent, assign it properly */ 24750Sstevel@tonic-gate parent_id = lgrp_cur->lgrp_parent->lgrp_id; 24760Sstevel@tonic-gate lpl_parent = &cpupart->cp_lgrploads[parent_id]; 24770Sstevel@tonic-gate } else { 24780Sstevel@tonic-gate /* if not, make sure parent ptr gets set to null */ 24790Sstevel@tonic-gate lpl_parent = NULL; 24800Sstevel@tonic-gate } 24810Sstevel@tonic-gate 24820Sstevel@tonic-gate if (lpl_cur == lpl_leaf) { 24830Sstevel@tonic-gate /* 24840Sstevel@tonic-gate * Almost all leaf state was initialized elsewhere. The 24850Sstevel@tonic-gate * only thing left to do is to set the parent. 24860Sstevel@tonic-gate */ 24870Sstevel@tonic-gate lpl_cur->lpl_parent = lpl_parent; 24880Sstevel@tonic-gate continue; 24890Sstevel@tonic-gate } 24900Sstevel@tonic-gate 24910Sstevel@tonic-gate /* 24920Sstevel@tonic-gate * Initialize intermediate lpl 24930Sstevel@tonic-gate * Save this lpl's hint though. 
Since we're changing this 24940Sstevel@tonic-gate * lpl's resources, we need to update the hint in this lpl's 24950Sstevel@tonic-gate * children, but the hint in this lpl is unaffected and 24960Sstevel@tonic-gate * should be preserved. 24970Sstevel@tonic-gate */ 24980Sstevel@tonic-gate hint = lpl_cur->lpl_hint; 24990Sstevel@tonic-gate 25000Sstevel@tonic-gate lpl_clear(lpl_cur); 25010Sstevel@tonic-gate lpl_init(lpl_cur, lpl_leaf, lgrp_cur); 25020Sstevel@tonic-gate 25030Sstevel@tonic-gate lpl_cur->lpl_hint = hint; 25040Sstevel@tonic-gate lpl_cur->lpl_parent = lpl_parent; 25050Sstevel@tonic-gate 25060Sstevel@tonic-gate /* does new lpl need to be populated with other resources? */ 25070Sstevel@tonic-gate rset_intersect = 25080Sstevel@tonic-gate klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 25090Sstevel@tonic-gate cpupart->cp_lgrpset); 25100Sstevel@tonic-gate klgrpset_nlgrps(rset_intersect, rset_num_intersect); 25110Sstevel@tonic-gate 25120Sstevel@tonic-gate if (rset_num_intersect > 1) { 25130Sstevel@tonic-gate /* 25140Sstevel@tonic-gate * If so, figure out what lpls have resources that 25150Sstevel@tonic-gate * intersect this one, and add them. 25160Sstevel@tonic-gate */ 25170Sstevel@tonic-gate for (j = 0; j <= lgrp_alloc_max; j++) { 25180Sstevel@tonic-gate lgrp_t *lgrp_cand; /* candidate lgrp */ 25190Sstevel@tonic-gate lpl_t *lpl_cand; /* candidate lpl */ 25200Sstevel@tonic-gate 25210Sstevel@tonic-gate lgrp_cand = lgrp_table[j]; 25220Sstevel@tonic-gate if (!LGRP_EXISTS(lgrp_cand) || 25230Sstevel@tonic-gate !klgrpset_ismember(rset_intersect, 25240Sstevel@tonic-gate lgrp_cand->lgrp_id)) 25250Sstevel@tonic-gate continue; 25260Sstevel@tonic-gate lpl_cand = 25270Sstevel@tonic-gate &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 25280Sstevel@tonic-gate lpl_rset_add(lpl_cur, lpl_cand); 25290Sstevel@tonic-gate } 25300Sstevel@tonic-gate } 25310Sstevel@tonic-gate /* 25320Sstevel@tonic-gate * This lpl's rset has changed. Update the hint in its 25330Sstevel@tonic-gate * children. 25340Sstevel@tonic-gate */ 25350Sstevel@tonic-gate lpl_child_update(lpl_cur, cpupart); 25360Sstevel@tonic-gate } 25370Sstevel@tonic-gate } 25380Sstevel@tonic-gate 25390Sstevel@tonic-gate /* 25400Sstevel@tonic-gate * remove a lpl from the hierarchy of resources, clearing its state when 25410Sstevel@tonic-gate * finished. If the lpls at the intermediate levels of the hierarchy have no 25420Sstevel@tonic-gate * remaining resources, or no longer name a leaf resource in the cpu-partition, 25430Sstevel@tonic-gate * delete them as well. 25440Sstevel@tonic-gate */ 25450Sstevel@tonic-gate 25460Sstevel@tonic-gate void 25470Sstevel@tonic-gate lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart) 25480Sstevel@tonic-gate { 25490Sstevel@tonic-gate int i; 25500Sstevel@tonic-gate lgrp_t *lgrp_cur; 25510Sstevel@tonic-gate lpl_t *lpl_cur; 25520Sstevel@tonic-gate klgrpset_t leaf_intersect; /* intersection of leaves */ 25530Sstevel@tonic-gate 25540Sstevel@tonic-gate for (i = 0; i <= lgrp_alloc_max; i++) { 25550Sstevel@tonic-gate lgrp_cur = lgrp_table[i]; 25560Sstevel@tonic-gate 25570Sstevel@tonic-gate /* 25580Sstevel@tonic-gate * Don't attempt to remove from lgrps that aren't there, that 25590Sstevel@tonic-gate * don't contain our leaf, or from the leaf itself.
(We do that 25600Sstevel@tonic-gate * later) 25610Sstevel@tonic-gate */ 25620Sstevel@tonic-gate 25630Sstevel@tonic-gate if (!LGRP_EXISTS(lgrp_cur)) 25640Sstevel@tonic-gate continue; 25650Sstevel@tonic-gate 25660Sstevel@tonic-gate lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 25670Sstevel@tonic-gate 25680Sstevel@tonic-gate if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 25690Sstevel@tonic-gate lpl_leaf->lpl_lgrpid) || 25700Sstevel@tonic-gate (lpl_cur == lpl_leaf)) { 25710Sstevel@tonic-gate continue; 25720Sstevel@tonic-gate } 25730Sstevel@tonic-gate 25740Sstevel@tonic-gate /* 25750Sstevel@tonic-gate * This is a slightly sleazy simplification in that we have 25760Sstevel@tonic-gate * already marked the cp_lgrpset as no longer containing the 25770Sstevel@tonic-gate * leaf we've deleted. Any lpls that pass the above checks 25780Sstevel@tonic-gate * based upon lgrp membership but not necessarily cpu-part 25790Sstevel@tonic-gate * membership also get cleared by the checks below. Currently 25800Sstevel@tonic-gate * this is harmless, as the lpls should be empty anyway. 25810Sstevel@tonic-gate * 25820Sstevel@tonic-gate * In particular, we want to preserve lpls that have additional 25830Sstevel@tonic-gate * leaf resources, even though we don't yet have a processor 25840Sstevel@tonic-gate * architecture that represents resources this way. 25850Sstevel@tonic-gate */ 25860Sstevel@tonic-gate 25870Sstevel@tonic-gate leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves, 25880Sstevel@tonic-gate cpupart->cp_lgrpset); 25890Sstevel@tonic-gate 25900Sstevel@tonic-gate lpl_rset_del(lpl_cur, lpl_leaf); 25910Sstevel@tonic-gate if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) { 25920Sstevel@tonic-gate lpl_clear(lpl_cur); 25930Sstevel@tonic-gate } else { 25940Sstevel@tonic-gate /* 25950Sstevel@tonic-gate * Update this lpl's children 25960Sstevel@tonic-gate */ 25970Sstevel@tonic-gate lpl_child_update(lpl_cur, cpupart); 25980Sstevel@tonic-gate } 25990Sstevel@tonic-gate } 26000Sstevel@tonic-gate lpl_clear(lpl_leaf); 26010Sstevel@tonic-gate } 26020Sstevel@tonic-gate 26030Sstevel@tonic-gate /* 26040Sstevel@tonic-gate * add a cpu to a partition in terms of lgrp load avg bookkeeping 26050Sstevel@tonic-gate * 26060Sstevel@tonic-gate * The lpl (cpu partition load average information) is now arranged in a 26070Sstevel@tonic-gate * hierarchical fashion whereby resources that are closest, ie. most local, to 26080Sstevel@tonic-gate * the cpu in question are considered to be leaves in a tree of resources. 26090Sstevel@tonic-gate * There are two general cases for cpu addition: 26100Sstevel@tonic-gate * 26110Sstevel@tonic-gate * 1. A lpl structure that contains resources already in the hierarchy tree. 26120Sstevel@tonic-gate * In this case, all of the associated lpl relationships have been defined, and 26130Sstevel@tonic-gate * all that is necessary is that we link the new cpu into the per-lpl list of 26140Sstevel@tonic-gate * cpus, and increment the ncpu count of all places where this cpu resource will 26150Sstevel@tonic-gate * be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer 26160Sstevel@tonic-gate * pushing is accomplished by this routine. 26170Sstevel@tonic-gate * 26180Sstevel@tonic-gate * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
In this case, it is necessary to build the leaf lpl, and 26200Sstevel@tonic-gate * construct the hierarchy of state necessary to name its more distant 26210Sstevel@tonic-gate * resources, if they should exist. The leaf structure is initialized by this 26220Sstevel@tonic-gate * routine, as is the cpu-partition state for the lgrp membership. This routine 26230Sstevel@tonic-gate * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy 26240Sstevel@tonic-gate * and builds all of the "ancestral" state necessary to identify resources at 26250Sstevel@tonic-gate * differing levels of locality. 26260Sstevel@tonic-gate */ 26270Sstevel@tonic-gate void 26280Sstevel@tonic-gate lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid) 26290Sstevel@tonic-gate { 26300Sstevel@tonic-gate cpupart_t *cpupart; 26310Sstevel@tonic-gate lgrp_t *lgrp_leaf; 26320Sstevel@tonic-gate lpl_t *lpl_leaf; 26330Sstevel@tonic-gate 26340Sstevel@tonic-gate /* called sometimes w/ cpus paused - grab no locks */ 26350Sstevel@tonic-gate ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 26360Sstevel@tonic-gate 26370Sstevel@tonic-gate cpupart = cp->cpu_part; 26380Sstevel@tonic-gate lgrp_leaf = lgrp_table[lgrpid]; 26390Sstevel@tonic-gate 26400Sstevel@tonic-gate /* don't add non-existent lgrp */ 26410Sstevel@tonic-gate ASSERT(LGRP_EXISTS(lgrp_leaf)); 26420Sstevel@tonic-gate lpl_leaf = &cpupart->cp_lgrploads[lgrpid]; 26430Sstevel@tonic-gate cp->cpu_lpl = lpl_leaf; 26440Sstevel@tonic-gate 26450Sstevel@tonic-gate /* only leaf lpls contain cpus */ 26460Sstevel@tonic-gate 26470Sstevel@tonic-gate if (lpl_leaf->lpl_ncpu++ == 0) { 26480Sstevel@tonic-gate lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf); 26490Sstevel@tonic-gate klgrpset_add(cpupart->cp_lgrpset, lgrpid); 26500Sstevel@tonic-gate lpl_leaf_insert(lpl_leaf, cpupart); 26510Sstevel@tonic-gate } else { 26520Sstevel@tonic-gate /* 26530Sstevel@tonic-gate * the lpl should already exist in the parent, so just update 26540Sstevel@tonic-gate * the count of available CPUs 26550Sstevel@tonic-gate */ 26560Sstevel@tonic-gate lpl_cpu_adjcnt(LPL_INCREMENT, cp); 26570Sstevel@tonic-gate } 26580Sstevel@tonic-gate 26590Sstevel@tonic-gate /* link cpu into list of cpus in lpl */ 26600Sstevel@tonic-gate 26610Sstevel@tonic-gate if (lpl_leaf->lpl_cpus) { 26620Sstevel@tonic-gate cp->cpu_next_lpl = lpl_leaf->lpl_cpus; 26630Sstevel@tonic-gate cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl; 26640Sstevel@tonic-gate lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp; 26650Sstevel@tonic-gate lpl_leaf->lpl_cpus->cpu_prev_lpl = cp; 26660Sstevel@tonic-gate } else { 26670Sstevel@tonic-gate /* 26680Sstevel@tonic-gate * We increment ncpu immediately after we create a new leaf 26690Sstevel@tonic-gate * lpl, so assert that ncpu == 1 for the case where we don't 26700Sstevel@tonic-gate * have any cpu pointers yet. 26710Sstevel@tonic-gate */ 26720Sstevel@tonic-gate ASSERT(lpl_leaf->lpl_ncpu == 1); 26730Sstevel@tonic-gate lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp; 26740Sstevel@tonic-gate } 26750Sstevel@tonic-gate 26760Sstevel@tonic-gate } 26770Sstevel@tonic-gate 26780Sstevel@tonic-gate 26790Sstevel@tonic-gate /* 26800Sstevel@tonic-gate * remove a cpu from a partition in terms of lgrp load avg bookkeeping 26810Sstevel@tonic-gate * 26820Sstevel@tonic-gate * The lpl (cpu partition load average information) is now arranged in a 26830Sstevel@tonic-gate * hierarchical fashion whereby resources that are closest, ie.
most local, to 26840Sstevel@tonic-gate * the cpu in question are considered to be leaves in a tree of resources. 26850Sstevel@tonic-gate * There are two removal cases in question: 26860Sstevel@tonic-gate * 26870Sstevel@tonic-gate * 1. Removal of the resource in the leaf leaves other resources remaining in 26880Sstevel@tonic-gate * that leaf. (Another cpu still exists at this level of locality). In this 26890Sstevel@tonic-gate * case, the count of available cpus is decremented in all associated lpls by 26900Sstevel@tonic-gate * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned 26910Sstevel@tonic-gate * from the per-cpu lpl list. 26920Sstevel@tonic-gate * 26930Sstevel@tonic-gate * 2. Removal of the resource results in the lpl containing no resources. (It's 26940Sstevel@tonic-gate * empty.) In this case, all of what has occurred for the first step must take 26950Sstevel@tonic-gate * place; however, additionally we must remove the lpl structure itself, prune 26960Sstevel@tonic-gate * out any stranded lpls that do not directly name a leaf resource, and mark the 26970Sstevel@tonic-gate * cpu partition in question as no longer containing resources from the lgrp of 26980Sstevel@tonic-gate * the lpl that has been deleted. Cpu-partition changes are handled by this 26990Sstevel@tonic-gate * routine, but the lpl_leaf_remove function deals with the details of pruning 27000Sstevel@tonic-gate * out the empty lpl and any of its orphaned direct ancestors. 27010Sstevel@tonic-gate */ 27020Sstevel@tonic-gate void 27030Sstevel@tonic-gate lgrp_part_del_cpu(cpu_t *cp) 27040Sstevel@tonic-gate { 27050Sstevel@tonic-gate lpl_t *lpl; 27060Sstevel@tonic-gate lpl_t *leaf_lpl; 27070Sstevel@tonic-gate lgrp_t *lgrp_leaf; 27080Sstevel@tonic-gate 27090Sstevel@tonic-gate /* called sometimes w/ cpus paused - grab no locks */ 27100Sstevel@tonic-gate 27110Sstevel@tonic-gate ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 27120Sstevel@tonic-gate 27130Sstevel@tonic-gate lpl = leaf_lpl = cp->cpu_lpl; 27140Sstevel@tonic-gate lgrp_leaf = leaf_lpl->lpl_lgrp; 27150Sstevel@tonic-gate 27160Sstevel@tonic-gate /* don't delete a leaf that isn't there */ 27170Sstevel@tonic-gate ASSERT(LGRP_EXISTS(lgrp_leaf)); 27180Sstevel@tonic-gate 27190Sstevel@tonic-gate /* no double-deletes */ 27200Sstevel@tonic-gate ASSERT(lpl->lpl_ncpu); 27210Sstevel@tonic-gate if (--lpl->lpl_ncpu == 0) { 27220Sstevel@tonic-gate /* 27230Sstevel@tonic-gate * This was the last cpu in this lgroup for this partition, 27240Sstevel@tonic-gate * clear its bit in the partition's lgroup bitmask 27250Sstevel@tonic-gate */ 27260Sstevel@tonic-gate klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid); 27270Sstevel@tonic-gate 27280Sstevel@tonic-gate /* eliminate remaining lpl link pointers in cpu, lpl */ 27290Sstevel@tonic-gate lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL; 27300Sstevel@tonic-gate 27310Sstevel@tonic-gate lpl_leaf_remove(leaf_lpl, cp->cpu_part); 27320Sstevel@tonic-gate } else { 27330Sstevel@tonic-gate 27340Sstevel@tonic-gate /* unlink cpu from lists of cpus in lpl */ 27350Sstevel@tonic-gate cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl; 27360Sstevel@tonic-gate cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl; 27370Sstevel@tonic-gate if (lpl->lpl_cpus == cp) { 27380Sstevel@tonic-gate lpl->lpl_cpus = cp->cpu_next_lpl; 27390Sstevel@tonic-gate } 27400Sstevel@tonic-gate 27410Sstevel@tonic-gate /* 27420Sstevel@tonic-gate * Update the cpu count in the lpls associated with parent 27430Sstevel@tonic-gate * lgroups.
27440Sstevel@tonic-gate */ 27450Sstevel@tonic-gate lpl_cpu_adjcnt(LPL_DECREMENT, cp); 27460Sstevel@tonic-gate 27470Sstevel@tonic-gate } 27480Sstevel@tonic-gate /* clear cpu's lpl ptr when we're all done */ 27490Sstevel@tonic-gate cp->cpu_lpl = NULL; 27500Sstevel@tonic-gate } 27510Sstevel@tonic-gate 27520Sstevel@tonic-gate /* 27530Sstevel@tonic-gate * Recompute load average for the specified partition/lgrp fragment. 27540Sstevel@tonic-gate * 27550Sstevel@tonic-gate * We rely on the fact that this routine is called from the clock thread 27560Sstevel@tonic-gate * at a point before the clock thread can block (i.e. before its first 27570Sstevel@tonic-gate * lock request). Since the clock thread cannot be preempted (since it 27580Sstevel@tonic-gate * runs at highest priority), we know that cpu partitions cannot change 27590Sstevel@tonic-gate * (since doing so would require either the repartition requester or the 27600Sstevel@tonic-gate * cpu_pause thread to run on this cpu), so we can update the cpu's load 27610Sstevel@tonic-gate * without grabbing cpu_lock. 27620Sstevel@tonic-gate */ 27630Sstevel@tonic-gate void 27640Sstevel@tonic-gate lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag) 27650Sstevel@tonic-gate { 27660Sstevel@tonic-gate uint_t ncpu; 27670Sstevel@tonic-gate int64_t old, new, f; 27680Sstevel@tonic-gate 27690Sstevel@tonic-gate /* 27700Sstevel@tonic-gate * (1 - exp(-1/(20 * ncpu))) << 16 = 3196 for 1 cpu... 27710Sstevel@tonic-gate */ 27720Sstevel@tonic-gate static short expval[] = { 27730Sstevel@tonic-gate 0, 3196, 1618, 1083, 27740Sstevel@tonic-gate 814, 652, 543, 466, 27750Sstevel@tonic-gate 408, 363, 326, 297, 27760Sstevel@tonic-gate 272, 251, 233, 218, 27770Sstevel@tonic-gate 204, 192, 181, 172, 27780Sstevel@tonic-gate 163, 155, 148, 142, 27790Sstevel@tonic-gate 136, 130, 125, 121, 27800Sstevel@tonic-gate 116, 112, 109, 105 27810Sstevel@tonic-gate }; 27820Sstevel@tonic-gate 27830Sstevel@tonic-gate /* ASSERT (called from clock level) */ 27840Sstevel@tonic-gate 27850Sstevel@tonic-gate if ((lpl == NULL) || /* we're booting - this is easiest for now */ 27860Sstevel@tonic-gate ((ncpu = lpl->lpl_ncpu) == 0)) { 27870Sstevel@tonic-gate return; 27880Sstevel@tonic-gate } 27890Sstevel@tonic-gate 27900Sstevel@tonic-gate for (;;) { 27910Sstevel@tonic-gate 27920Sstevel@tonic-gate if (ncpu >= sizeof (expval) / sizeof (expval[0])) 27930Sstevel@tonic-gate f = expval[1]/ncpu; /* good approx. for large ncpu */ 27940Sstevel@tonic-gate else 27950Sstevel@tonic-gate f = expval[ncpu]; 27960Sstevel@tonic-gate 27970Sstevel@tonic-gate /* 27980Sstevel@tonic-gate * Modify the load average atomically to avoid losing 27990Sstevel@tonic-gate * anticipatory load updates (see lgrp_move_thread()). 28000Sstevel@tonic-gate */ 28010Sstevel@tonic-gate if (ageflag) { 28020Sstevel@tonic-gate /* 28030Sstevel@tonic-gate * We're supposed to both update and age the load. 28040Sstevel@tonic-gate * This happens 10 times/sec. per cpu. We do a 28050Sstevel@tonic-gate * little hoop-jumping to avoid integer overflow.
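 *
 * The fixed-point arithmetic below amounts to (a sketch; f is a 16-bit
 * fraction from expval[], approximately (1 - exp(-1/(20 * ncpu))) << 16):
 *
 *	new = old * (1 - f/65536) + nrcpus * (f/128)
 *
 * i.e. the previous average decays exponentially while the instantaneous
 * load supplied by the caller (nrcpus) is mixed in.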
28060Sstevel@tonic-gate */ 28070Sstevel@tonic-gate int64_t q, r; 28080Sstevel@tonic-gate 28090Sstevel@tonic-gate do { 28100Sstevel@tonic-gate old = new = lpl->lpl_loadavg; 28110Sstevel@tonic-gate q = (old >> 16) << 7; 28120Sstevel@tonic-gate r = (old & 0xffff) << 7; 28130Sstevel@tonic-gate new += ((long long)(nrcpus - q) * f - 28140Sstevel@tonic-gate ((r * f) >> 16)) >> 7; 28150Sstevel@tonic-gate 28160Sstevel@tonic-gate /* 28170Sstevel@tonic-gate * Check for overflow 28180Sstevel@tonic-gate */ 28190Sstevel@tonic-gate if (new > LGRP_LOADAVG_MAX) 28200Sstevel@tonic-gate new = LGRP_LOADAVG_MAX; 28210Sstevel@tonic-gate else if (new < 0) 28220Sstevel@tonic-gate new = 0; 28230Sstevel@tonic-gate } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 28240Sstevel@tonic-gate new) != old); 28250Sstevel@tonic-gate } else { 28260Sstevel@tonic-gate /* 28270Sstevel@tonic-gate * We're supposed to update the load, but not age it. 28280Sstevel@tonic-gate * This option is used to update the load (which either 28290Sstevel@tonic-gate * has already been aged in this 1/10 sec. interval or 28300Sstevel@tonic-gate * soon will be) to account for a remotely executing 28310Sstevel@tonic-gate * thread. 28320Sstevel@tonic-gate */ 28330Sstevel@tonic-gate do { 28340Sstevel@tonic-gate old = new = lpl->lpl_loadavg; 28350Sstevel@tonic-gate new += f; 28360Sstevel@tonic-gate /* 28370Sstevel@tonic-gate * Check for overflow 28380Sstevel@tonic-gate * Underflow not possible here 28390Sstevel@tonic-gate */ 28400Sstevel@tonic-gate if (new < old) 28410Sstevel@tonic-gate new = LGRP_LOADAVG_MAX; 28420Sstevel@tonic-gate } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 28430Sstevel@tonic-gate new) != old); 28440Sstevel@tonic-gate } 28450Sstevel@tonic-gate 28460Sstevel@tonic-gate /* 28470Sstevel@tonic-gate * Do the same for this lpl's parent 28480Sstevel@tonic-gate */ 28490Sstevel@tonic-gate if ((lpl = lpl->lpl_parent) == NULL) 28500Sstevel@tonic-gate break; 28510Sstevel@tonic-gate ncpu = lpl->lpl_ncpu; 28520Sstevel@tonic-gate } 28530Sstevel@tonic-gate } 28540Sstevel@tonic-gate 28550Sstevel@tonic-gate /* 28560Sstevel@tonic-gate * Initialize lpl topology in the target based on topology currently present in 28570Sstevel@tonic-gate * lpl_bootstrap. 28580Sstevel@tonic-gate * 28590Sstevel@tonic-gate * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to 28600Sstevel@tonic-gate * initialize cp_default list of lpls. Up to this point all topology operations 28610Sstevel@tonic-gate * were performed using lpl_bootstrap. Now cp_default has its own list of lpls 28620Sstevel@tonic-gate * and all subsequent lpl operations should use it instead of lpl_bootstrap. The 28630Sstevel@tonic-gate * `target' points to the list of lpls in cp_default and `size' is the size of 28640Sstevel@tonic-gate * this list. 28650Sstevel@tonic-gate * 28660Sstevel@tonic-gate * This function walks the lpl topology in lpl_bootstrap and does four things: 28670Sstevel@tonic-gate * 28680Sstevel@tonic-gate * 1) Copies all fields from lpl_bootstrap to the target. 28690Sstevel@tonic-gate * 28700Sstevel@tonic-gate * 2) Sets CPU0 lpl pointer to the correct element of the target list. 28710Sstevel@tonic-gate * 28720Sstevel@tonic-gate * 3) Updates lpl_parent pointers to point to the lpls in the target list 28730Sstevel@tonic-gate * instead of lpl_bootstrap. 28740Sstevel@tonic-gate * 28750Sstevel@tonic-gate * 4) Updates pointers in the resource list of the target to point to the lpls 28760Sstevel@tonic-gate * in the target list instead of lpl_bootstrap.
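 *
 * The parent and rset updates in steps 3 and 4 are plain pointer rebasing;
 * for example, for the parent pointer the code below computes:
 *
 *	target_lpl->lpl_parent = (lpl_t *)((uintptr_t)lpl->lpl_parent -
 *	    (uintptr_t)lpl_bootstrap + (uintptr_t)target);
 *
 * which preserves each pointer's offset within the array while moving its
 * base from lpl_bootstrap to target.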
28770Sstevel@tonic-gate * 28780Sstevel@tonic-gate * After lpl_topo_bootstrap() completes, target contains the same information 28790Sstevel@tonic-gate * that would be present there if it were used during boot instead of 28800Sstevel@tonic-gate * lpl_bootstrap. The information in lpl_bootstrap is not needed after this, 28810Sstevel@tonic-gate * so it is bzeroed. 28820Sstevel@tonic-gate */ 28830Sstevel@tonic-gate void 28840Sstevel@tonic-gate lpl_topo_bootstrap(lpl_t *target, int size) 28850Sstevel@tonic-gate { 28860Sstevel@tonic-gate lpl_t *lpl = lpl_bootstrap; 28870Sstevel@tonic-gate lpl_t *target_lpl = target; 28880Sstevel@tonic-gate int howmany; 28890Sstevel@tonic-gate int id; 28900Sstevel@tonic-gate int i; 28910Sstevel@tonic-gate 28920Sstevel@tonic-gate /* 28930Sstevel@tonic-gate * The only target that should be passed here is cp_default lpl list. 28940Sstevel@tonic-gate */ 28950Sstevel@tonic-gate ASSERT(target == cp_default.cp_lgrploads); 28960Sstevel@tonic-gate ASSERT(size == cp_default.cp_nlgrploads); 28970Sstevel@tonic-gate ASSERT(!lgrp_topo_initialized); 28980Sstevel@tonic-gate ASSERT(ncpus == 1); 28990Sstevel@tonic-gate 29000Sstevel@tonic-gate howmany = MIN(LPL_BOOTSTRAP_SIZE, size); 29010Sstevel@tonic-gate for (i = 0; i < howmany; i++, lpl++, target_lpl++) { 29020Sstevel@tonic-gate /* 29030Sstevel@tonic-gate * Copy all fields from lpl. 29040Sstevel@tonic-gate */ 29050Sstevel@tonic-gate 29060Sstevel@tonic-gate *target_lpl = *lpl; 29070Sstevel@tonic-gate 29080Sstevel@tonic-gate /* 29090Sstevel@tonic-gate * Substitute CPU0 lpl pointer with one relative to target. 29100Sstevel@tonic-gate */ 29110Sstevel@tonic-gate if (lpl->lpl_cpus == CPU) { 29120Sstevel@tonic-gate ASSERT(CPU->cpu_lpl == lpl); 29130Sstevel@tonic-gate CPU->cpu_lpl = target_lpl; 29140Sstevel@tonic-gate } 29150Sstevel@tonic-gate 29160Sstevel@tonic-gate /* 29170Sstevel@tonic-gate * Substitute parent information with parent relative to target. 29180Sstevel@tonic-gate */ 29190Sstevel@tonic-gate if (lpl->lpl_parent != NULL) 29200Sstevel@tonic-gate target_lpl->lpl_parent = (lpl_t *) 29210Sstevel@tonic-gate (((uintptr_t)lpl->lpl_parent - 29220Sstevel@tonic-gate (uintptr_t)lpl_bootstrap) + 29230Sstevel@tonic-gate (uintptr_t)target); 29240Sstevel@tonic-gate 29250Sstevel@tonic-gate /* 29260Sstevel@tonic-gate * Walk over resource set substituting pointers relative to 29270Sstevel@tonic-gate * lpl_bootstrap with pointers relative to target. 29280Sstevel@tonic-gate */ 29290Sstevel@tonic-gate ASSERT(lpl->lpl_nrset <= 1); 29300Sstevel@tonic-gate 29310Sstevel@tonic-gate for (id = 0; id < lpl->lpl_nrset; id++) { 29320Sstevel@tonic-gate if (lpl->lpl_rset[id] != NULL) { 29330Sstevel@tonic-gate target_lpl->lpl_rset[id] = 29340Sstevel@tonic-gate (lpl_t *) 29350Sstevel@tonic-gate (((uintptr_t)lpl->lpl_rset[id] - 29360Sstevel@tonic-gate (uintptr_t)lpl_bootstrap) + 29370Sstevel@tonic-gate (uintptr_t)target); 29380Sstevel@tonic-gate } 29390Sstevel@tonic-gate } 29400Sstevel@tonic-gate } 29410Sstevel@tonic-gate 29420Sstevel@tonic-gate /* 29430Sstevel@tonic-gate * Topology information in lpl_bootstrap is no longer needed.
29440Sstevel@tonic-gate */ 29450Sstevel@tonic-gate bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list)); 29460Sstevel@tonic-gate } 29470Sstevel@tonic-gate 29480Sstevel@tonic-gate /* the maximum effect that a single thread can have on its lgroup's load */ 29490Sstevel@tonic-gate #define LGRP_LOADAVG_MAX_EFFECT(ncpu) \ 29500Sstevel@tonic-gate ((lgrp_loadavg_max_effect) / (ncpu)) 29510Sstevel@tonic-gate uint32_t lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX; 29520Sstevel@tonic-gate 29530Sstevel@tonic-gate /* 29540Sstevel@tonic-gate * If the lowest load among the lgroups a process' threads are currently 29550Sstevel@tonic-gate * spread across is greater than lgrp_expand_proc_thresh, we'll consider 29560Sstevel@tonic-gate * expanding the process to a new lgroup. 29570Sstevel@tonic-gate */ 29580Sstevel@tonic-gate #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250 29590Sstevel@tonic-gate lgrp_load_t lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT; 29600Sstevel@tonic-gate 29610Sstevel@tonic-gate #define LGRP_EXPAND_PROC_THRESH(ncpu) \ 29620Sstevel@tonic-gate ((lgrp_expand_proc_thresh) / (ncpu)) 29630Sstevel@tonic-gate 29640Sstevel@tonic-gate /* 29650Sstevel@tonic-gate * A process will be expanded to a new lgroup only if the difference between 29660Sstevel@tonic-gate * the lowest load on the lgroups the process' threads are currently spread 29670Sstevel@tonic-gate * across and the lowest load on the other lgroups in the process' partition 29680Sstevel@tonic-gate * is greater than lgrp_expand_proc_diff. 29690Sstevel@tonic-gate */ 29700Sstevel@tonic-gate #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000 29710Sstevel@tonic-gate lgrp_load_t lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT; 29720Sstevel@tonic-gate 29730Sstevel@tonic-gate #define LGRP_EXPAND_PROC_DIFF(ncpu) \ 29740Sstevel@tonic-gate ((lgrp_expand_proc_diff) / (ncpu)) 29750Sstevel@tonic-gate 29760Sstevel@tonic-gate /* 29770Sstevel@tonic-gate * The loadavg tolerance accounts for "noise" inherent in the load, which may 29780Sstevel@tonic-gate * be present due to impreciseness of the load average decay algorithm. 29790Sstevel@tonic-gate * 29800Sstevel@tonic-gate * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable 29810Sstevel@tonic-gate * tolerance is scaled by the number of cpus in the lgroup just like 29820Sstevel@tonic-gate * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000, 29830Sstevel@tonic-gate * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads 29840Sstevel@tonic-gate * of: 0x10000 / 4 => 0x4000 or greater to be significant.
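 *
 * lpl_pick() below applies this tolerance directly: lpl1 is treated as
 * significantly less loaded than lpl2 only when
 * lpl1->lpl_loadavg + LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu) is still below
 * lpl2->lpl_loadavg.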
29850Sstevel@tonic-gate */ 29860Sstevel@tonic-gate uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX; 29870Sstevel@tonic-gate #define LGRP_LOADAVG_TOLERANCE(ncpu) \ 29880Sstevel@tonic-gate ((lgrp_loadavg_tolerance) / ncpu) 29890Sstevel@tonic-gate 29900Sstevel@tonic-gate /* 29910Sstevel@tonic-gate * lgrp_choose() will choose root lgroup as home when lowest lgroup load 29920Sstevel@tonic-gate * average is above this threshold 29930Sstevel@tonic-gate */ 29940Sstevel@tonic-gate uint32_t lgrp_load_thresh = UINT32_MAX; 29950Sstevel@tonic-gate 29960Sstevel@tonic-gate /* 29970Sstevel@tonic-gate * lgrp_choose() will try to skip any lgroups with less memory 29980Sstevel@tonic-gate * than this free when choosing a home lgroup 29990Sstevel@tonic-gate */ 30000Sstevel@tonic-gate pgcnt_t lgrp_mem_free_thresh = 0; 30010Sstevel@tonic-gate 30020Sstevel@tonic-gate /* 30030Sstevel@tonic-gate * When choosing between similarly loaded lgroups, lgrp_choose() will pick 30040Sstevel@tonic-gate * one based on one of the following policies: 30050Sstevel@tonic-gate * - Random selection 30060Sstevel@tonic-gate * - Pseudo round robin placement 30070Sstevel@tonic-gate * - Longest time since a thread was last placed 30080Sstevel@tonic-gate */ 30090Sstevel@tonic-gate #define LGRP_CHOOSE_RANDOM 1 30100Sstevel@tonic-gate #define LGRP_CHOOSE_RR 2 30110Sstevel@tonic-gate #define LGRP_CHOOSE_TIME 3 30120Sstevel@tonic-gate 30130Sstevel@tonic-gate int lgrp_choose_policy = LGRP_CHOOSE_TIME; 30140Sstevel@tonic-gate 30150Sstevel@tonic-gate /* 30160Sstevel@tonic-gate * Choose a suitable leaf lgroup for a kthread. The kthread is assumed not to 30170Sstevel@tonic-gate * be bound to a CPU or processor set. 30180Sstevel@tonic-gate * 30190Sstevel@tonic-gate * Arguments: 30200Sstevel@tonic-gate * t The thread 30210Sstevel@tonic-gate * cpupart The partition the thread belongs to. 30220Sstevel@tonic-gate * 30230Sstevel@tonic-gate * NOTE: Should at least be called with the cpu_lock held, kernel preemption 30240Sstevel@tonic-gate * disabled, or thread_lock held (at splhigh) to protect against the CPU 30250Sstevel@tonic-gate * partitions changing out from under us and assumes that given thread is 30260Sstevel@tonic-gate * protected. Also, called sometimes w/ cpus paused or kernel preemption 30270Sstevel@tonic-gate * disabled, so don't grab any locks because we should never block under 30280Sstevel@tonic-gate * those conditions. 
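 *
 * A hypothetical usage sketch (not a call site in this file; assumes the
 * caller holds thread_lock(t) so the partition cannot change underneath it):
 *
 *	thread_lock(t);
 *	lgrp_move_thread(t, lgrp_choose(t, t->t_cpupart), 1);
 *	thread_unlock(t);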
30290Sstevel@tonic-gate */ 30300Sstevel@tonic-gate lpl_t * 30310Sstevel@tonic-gate lgrp_choose(kthread_t *t, cpupart_t *cpupart) 30320Sstevel@tonic-gate { 30330Sstevel@tonic-gate lgrp_load_t bestload, bestrload; 30340Sstevel@tonic-gate int lgrpid_offset, lgrp_count; 30350Sstevel@tonic-gate lgrp_id_t lgrpid, lgrpid_start; 30360Sstevel@tonic-gate lpl_t *lpl, *bestlpl, *bestrlpl; 30370Sstevel@tonic-gate klgrpset_t lgrpset; 30380Sstevel@tonic-gate proc_t *p; 30390Sstevel@tonic-gate 30400Sstevel@tonic-gate ASSERT(t != NULL); 30410Sstevel@tonic-gate ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 30420Sstevel@tonic-gate THREAD_LOCK_HELD(t)); 30430Sstevel@tonic-gate ASSERT(cpupart != NULL); 30440Sstevel@tonic-gate 30450Sstevel@tonic-gate p = t->t_procp; 30460Sstevel@tonic-gate 30470Sstevel@tonic-gate /* A process should always be in an active partition */ 30480Sstevel@tonic-gate ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset)); 30490Sstevel@tonic-gate 30500Sstevel@tonic-gate bestlpl = bestrlpl = NULL; 30510Sstevel@tonic-gate bestload = bestrload = LGRP_LOADAVG_MAX; 30520Sstevel@tonic-gate lgrpset = cpupart->cp_lgrpset; 30530Sstevel@tonic-gate 30540Sstevel@tonic-gate switch (lgrp_choose_policy) { 30550Sstevel@tonic-gate case LGRP_CHOOSE_RR: 30560Sstevel@tonic-gate lgrpid = cpupart->cp_lgrp_hint; 30570Sstevel@tonic-gate do { 30580Sstevel@tonic-gate if (++lgrpid > lgrp_alloc_max) 30590Sstevel@tonic-gate lgrpid = 0; 30600Sstevel@tonic-gate } while (!klgrpset_ismember(lgrpset, lgrpid)); 30610Sstevel@tonic-gate 30620Sstevel@tonic-gate break; 30630Sstevel@tonic-gate default: 30640Sstevel@tonic-gate case LGRP_CHOOSE_TIME: 30650Sstevel@tonic-gate case LGRP_CHOOSE_RANDOM: 30660Sstevel@tonic-gate klgrpset_nlgrps(lgrpset, lgrp_count); 30670Sstevel@tonic-gate lgrpid_offset = 30680Sstevel@tonic-gate (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1; 30690Sstevel@tonic-gate for (lgrpid = 0; ; lgrpid++) { 30700Sstevel@tonic-gate if (klgrpset_ismember(lgrpset, lgrpid)) { 30710Sstevel@tonic-gate if (--lgrpid_offset == 0) 30720Sstevel@tonic-gate break; 30730Sstevel@tonic-gate } 30740Sstevel@tonic-gate } 30750Sstevel@tonic-gate break; 30760Sstevel@tonic-gate } 30770Sstevel@tonic-gate 30780Sstevel@tonic-gate lgrpid_start = lgrpid; 30790Sstevel@tonic-gate 30800Sstevel@tonic-gate DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start, 30810Sstevel@tonic-gate lgrp_id_t, cpupart->cp_lgrp_hint); 30820Sstevel@tonic-gate 30830Sstevel@tonic-gate /* 30840Sstevel@tonic-gate * Use lgroup affinities (if any) to choose best lgroup 30850Sstevel@tonic-gate * 30860Sstevel@tonic-gate * NOTE: Assumes that thread is protected from going away and its 30870Sstevel@tonic-gate * lgroup affinities won't change (ie. 
p_lock, or 30880Sstevel@tonic-gate * thread_lock() being held and/or CPUs paused) 30890Sstevel@tonic-gate */ 30900Sstevel@tonic-gate if (t->t_lgrp_affinity) { 30910Sstevel@tonic-gate lpl = lgrp_affinity_best(t, cpupart, lgrpid_start); 30920Sstevel@tonic-gate if (lpl != NULL) 30930Sstevel@tonic-gate return (lpl); 30940Sstevel@tonic-gate } 30950Sstevel@tonic-gate 30960Sstevel@tonic-gate ASSERT(klgrpset_ismember(lgrpset, lgrpid_start)); 30970Sstevel@tonic-gate 30980Sstevel@tonic-gate do { 30990Sstevel@tonic-gate pgcnt_t npgs; 31000Sstevel@tonic-gate 31010Sstevel@tonic-gate /* 31020Sstevel@tonic-gate * Skip any lgroups outside of thread's pset 31030Sstevel@tonic-gate */ 31040Sstevel@tonic-gate if (!klgrpset_ismember(lgrpset, lgrpid)) { 31050Sstevel@tonic-gate if (++lgrpid > lgrp_alloc_max) 31060Sstevel@tonic-gate lgrpid = 0; /* wrap the search */ 31070Sstevel@tonic-gate continue; 31080Sstevel@tonic-gate } 31090Sstevel@tonic-gate 31100Sstevel@tonic-gate /* 31110Sstevel@tonic-gate * Skip any non-leaf lgroups 31120Sstevel@tonic-gate */ 31130Sstevel@tonic-gate if (lgrp_table[lgrpid]->lgrp_childcnt != 0) 31140Sstevel@tonic-gate continue; 31150Sstevel@tonic-gate 31160Sstevel@tonic-gate /* 31170Sstevel@tonic-gate * Skip any lgroups without enough free memory 31180Sstevel@tonic-gate * (when threshold set to nonzero positive value) 31190Sstevel@tonic-gate */ 31200Sstevel@tonic-gate if (lgrp_mem_free_thresh > 0) { 31210Sstevel@tonic-gate npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 31220Sstevel@tonic-gate if (npgs < lgrp_mem_free_thresh) { 31230Sstevel@tonic-gate if (++lgrpid > lgrp_alloc_max) 31240Sstevel@tonic-gate lgrpid = 0; /* wrap the search */ 31250Sstevel@tonic-gate continue; 31260Sstevel@tonic-gate } 31270Sstevel@tonic-gate } 31280Sstevel@tonic-gate 31290Sstevel@tonic-gate lpl = &cpupart->cp_lgrploads[lgrpid]; 31300Sstevel@tonic-gate if (klgrpset_isempty(p->p_lgrpset) || 31310Sstevel@tonic-gate klgrpset_ismember(p->p_lgrpset, lgrpid)) { 31320Sstevel@tonic-gate /* 31330Sstevel@tonic-gate * Either this is a new process or the process already 31340Sstevel@tonic-gate * has threads on this lgrp, so this is a preferred 31350Sstevel@tonic-gate * lgroup for the thread. 31360Sstevel@tonic-gate */ 31371892Sesaxe if (bestlpl == NULL || 31381892Sesaxe lpl_pick(lpl, bestlpl)) { 31390Sstevel@tonic-gate bestload = lpl->lpl_loadavg; 31400Sstevel@tonic-gate bestlpl = lpl; 31410Sstevel@tonic-gate } 31420Sstevel@tonic-gate } else { 31430Sstevel@tonic-gate /* 31440Sstevel@tonic-gate * The process doesn't have any threads on this lgrp, 31450Sstevel@tonic-gate * but we're willing to consider this lgrp if the load 31460Sstevel@tonic-gate * difference is big enough to justify splitting up 31470Sstevel@tonic-gate * the process' threads. 
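 *
 * For example, with the defaults (lgrp_expand_proc_thresh = 62250,
 * lgrp_expand_proc_diff = 60000) and 4-CPU leaves, a remote leaf is
 * preferred over the best "preferred" leaf only if the preferred load
 * exceeds 62250 / 4 and the remote load is more than 60000 / 4 below it;
 * the actual comparison appears after this loop.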
31480Sstevel@tonic-gate */ 31491892Sesaxe if (bestrlpl == NULL || 31501892Sesaxe lpl_pick(lpl, bestrlpl)) { 31510Sstevel@tonic-gate bestrload = lpl->lpl_loadavg; 31520Sstevel@tonic-gate bestrlpl = lpl; 31530Sstevel@tonic-gate } 31540Sstevel@tonic-gate } 31550Sstevel@tonic-gate if (++lgrpid > lgrp_alloc_max) 31560Sstevel@tonic-gate lgrpid = 0; /* wrap the search */ 31570Sstevel@tonic-gate } while (lgrpid != lgrpid_start); 31580Sstevel@tonic-gate 31590Sstevel@tonic-gate /* 31600Sstevel@tonic-gate * Return root lgroup if threshold isn't set to maximum value and 31610Sstevel@tonic-gate * lowest lgroup load average more than a certain threshold 31620Sstevel@tonic-gate */ 31630Sstevel@tonic-gate if (lgrp_load_thresh != UINT32_MAX && 31640Sstevel@tonic-gate bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh) 31650Sstevel@tonic-gate return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]); 31660Sstevel@tonic-gate 31670Sstevel@tonic-gate /* 31680Sstevel@tonic-gate * If all the lgroups over which the thread's process is spread are 31691892Sesaxe * heavily loaded, or otherwise undesirable, we'll consider placing 31701892Sesaxe * the thread on one of the other leaf lgroups in the thread's 31711892Sesaxe * partition. 31720Sstevel@tonic-gate */ 31731892Sesaxe if ((bestlpl == NULL) || 31741892Sesaxe ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) && 31750Sstevel@tonic-gate (bestrload < bestload) && /* paranoid about wraparound */ 31760Sstevel@tonic-gate (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) < 31771892Sesaxe bestload))) { 31780Sstevel@tonic-gate bestlpl = bestrlpl; 31790Sstevel@tonic-gate } 31800Sstevel@tonic-gate 31811892Sesaxe if (bestlpl == NULL) { 31821892Sesaxe /* 31831892Sesaxe * No lgroup looked particularly good, but we still 31841892Sesaxe * have to pick something. Go with the randomly selected 31851892Sesaxe * legal lgroup we started with above. 31861892Sesaxe */ 31871892Sesaxe bestlpl = &cpupart->cp_lgrploads[lgrpid_start]; 31881892Sesaxe } 31891892Sesaxe 31900Sstevel@tonic-gate cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid; 31910Sstevel@tonic-gate bestlpl->lpl_homed_time = gethrtime_unscaled(); 31920Sstevel@tonic-gate 31930Sstevel@tonic-gate ASSERT(bestlpl->lpl_ncpu > 0); 31940Sstevel@tonic-gate return (bestlpl); 31950Sstevel@tonic-gate } 31960Sstevel@tonic-gate 31970Sstevel@tonic-gate /* 31981892Sesaxe * Decide if lpl1 is a better candidate than lpl2 for lgrp homing. 31991892Sesaxe * Returns non-zero if lpl1 is a better candidate, and 0 otherwise. 32000Sstevel@tonic-gate */ 32010Sstevel@tonic-gate static int 32020Sstevel@tonic-gate lpl_pick(lpl_t *lpl1, lpl_t *lpl2) 32030Sstevel@tonic-gate { 32040Sstevel@tonic-gate lgrp_load_t l1, l2; 32050Sstevel@tonic-gate lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu); 32060Sstevel@tonic-gate 32070Sstevel@tonic-gate l1 = lpl1->lpl_loadavg; 32080Sstevel@tonic-gate l2 = lpl2->lpl_loadavg; 32090Sstevel@tonic-gate 32100Sstevel@tonic-gate if ((l1 + tolerance < l2) && (l1 < l2)) { 32110Sstevel@tonic-gate /* lpl1 is significantly less loaded than lpl2 */ 32120Sstevel@tonic-gate return (1); 32130Sstevel@tonic-gate } 32140Sstevel@tonic-gate 32150Sstevel@tonic-gate if (lgrp_choose_policy == LGRP_CHOOSE_TIME && 32160Sstevel@tonic-gate l1 + tolerance >= l2 && l1 < l2 && 32170Sstevel@tonic-gate lpl1->lpl_homed_time < lpl2->lpl_homed_time) { 32180Sstevel@tonic-gate /* 32190Sstevel@tonic-gate * lpl1's load is within the tolerance of lpl2. 
We're 32200Sstevel@tonic-gate * willing to consider it to be better, however, if 32210Sstevel@tonic-gate * it has been longer since we last homed a thread there 32220Sstevel@tonic-gate */ 32230Sstevel@tonic-gate return (1); 32240Sstevel@tonic-gate } 32250Sstevel@tonic-gate 32260Sstevel@tonic-gate return (0); 32270Sstevel@tonic-gate } 32280Sstevel@tonic-gate 32290Sstevel@tonic-gate /* 32300Sstevel@tonic-gate * An LWP is expected to be assigned to an lgroup for at least this long 32310Sstevel@tonic-gate * for its anticipatory load to be justified. NOTE that this value should 32320Sstevel@tonic-gate * not be set extremely huge (say, larger than 100 years), to avoid problems 32330Sstevel@tonic-gate * with overflow in the calculation that uses it. 32340Sstevel@tonic-gate */ 32350Sstevel@tonic-gate #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */ 32360Sstevel@tonic-gate hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC; 32370Sstevel@tonic-gate 32380Sstevel@tonic-gate /* 32390Sstevel@tonic-gate * Routine to change a thread's lgroup affiliation. This routine updates 32400Sstevel@tonic-gate * the thread's kthread_t struct and its process' proc_t struct to note the 32410Sstevel@tonic-gate * thread's new lgroup affiliation, and its lgroup affinities. 32420Sstevel@tonic-gate * 32430Sstevel@tonic-gate * Note that this is the only routine that modifies a thread's t_lpl field, 32440Sstevel@tonic-gate * and that adds in or removes anticipatory load. 32450Sstevel@tonic-gate * 32460Sstevel@tonic-gate * If the thread is exiting, newlpl is NULL. 32470Sstevel@tonic-gate * 32480Sstevel@tonic-gate * Locking: 32490Sstevel@tonic-gate * The following lock must be held on entry: 32500Sstevel@tonic-gate * cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp 32510Sstevel@tonic-gate * doesn't get removed from t's partition 32520Sstevel@tonic-gate * 32530Sstevel@tonic-gate * This routine is not allowed to grab any locks, since it may be called 32540Sstevel@tonic-gate * with cpus paused (such as from cpu_offline). 32550Sstevel@tonic-gate */ 32560Sstevel@tonic-gate void 32570Sstevel@tonic-gate lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete) 32580Sstevel@tonic-gate { 32590Sstevel@tonic-gate proc_t *p; 32600Sstevel@tonic-gate lpl_t *lpl, *oldlpl; 32610Sstevel@tonic-gate lgrp_id_t oldid; 32620Sstevel@tonic-gate kthread_t *tp; 32630Sstevel@tonic-gate uint_t ncpu; 32640Sstevel@tonic-gate lgrp_load_t old, new; 32650Sstevel@tonic-gate 32660Sstevel@tonic-gate ASSERT(t); 32670Sstevel@tonic-gate ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 32680Sstevel@tonic-gate THREAD_LOCK_HELD(t)); 32690Sstevel@tonic-gate 32700Sstevel@tonic-gate /* 32710Sstevel@tonic-gate * If not changing lpls, just return 32720Sstevel@tonic-gate */ 32730Sstevel@tonic-gate if ((oldlpl = t->t_lpl) == newlpl) 32740Sstevel@tonic-gate return; 32750Sstevel@tonic-gate 32760Sstevel@tonic-gate /* 32770Sstevel@tonic-gate * Make sure the thread's lwp hasn't exited (if so, this thread is now 32780Sstevel@tonic-gate * associated with process 0 rather than with its original process).
32790Sstevel@tonic-gate */ 32800Sstevel@tonic-gate if (t->t_proc_flag & TP_LWPEXIT) { 32810Sstevel@tonic-gate if (newlpl != NULL) { 32820Sstevel@tonic-gate t->t_lpl = newlpl; 32830Sstevel@tonic-gate } 32840Sstevel@tonic-gate return; 32850Sstevel@tonic-gate } 32860Sstevel@tonic-gate 32870Sstevel@tonic-gate p = ttoproc(t); 32880Sstevel@tonic-gate 32890Sstevel@tonic-gate /* 32900Sstevel@tonic-gate * If the thread had a previous lgroup, update its process' p_lgrpset 32910Sstevel@tonic-gate * to account for it being moved from its old lgroup. 32920Sstevel@tonic-gate */ 32930Sstevel@tonic-gate if ((oldlpl != NULL) && /* thread had a previous lgroup */ 32940Sstevel@tonic-gate (p->p_tlist != NULL)) { 32950Sstevel@tonic-gate oldid = oldlpl->lpl_lgrpid; 32960Sstevel@tonic-gate 32970Sstevel@tonic-gate if (newlpl != NULL) 32980Sstevel@tonic-gate lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1); 32990Sstevel@tonic-gate 33000Sstevel@tonic-gate if ((do_lgrpset_delete) && 33010Sstevel@tonic-gate (klgrpset_ismember(p->p_lgrpset, oldid))) { 33020Sstevel@tonic-gate for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) { 33030Sstevel@tonic-gate /* 33040Sstevel@tonic-gate * Check if a thread other than the thread 33050Sstevel@tonic-gate * that's moving is assigned to the same 33060Sstevel@tonic-gate * lgroup as the thread that's moving. Note 33070Sstevel@tonic-gate * that we have to compare lgroup IDs, rather 33080Sstevel@tonic-gate * than simply comparing t_lpl's, since the 33090Sstevel@tonic-gate * threads may belong to different partitions 33100Sstevel@tonic-gate * but be assigned to the same lgroup. 33110Sstevel@tonic-gate */ 33120Sstevel@tonic-gate ASSERT(tp->t_lpl != NULL); 33130Sstevel@tonic-gate 33140Sstevel@tonic-gate if ((tp != t) && 33150Sstevel@tonic-gate (tp->t_lpl->lpl_lgrpid == oldid)) { 33160Sstevel@tonic-gate /* 33170Sstevel@tonic-gate * Another thread is assigned to the 33180Sstevel@tonic-gate * same lgroup as the thread that's 33190Sstevel@tonic-gate * moving, p_lgrpset doesn't change. 33200Sstevel@tonic-gate */ 33210Sstevel@tonic-gate break; 33220Sstevel@tonic-gate } else if (tp == p->p_tlist) { 33230Sstevel@tonic-gate /* 33240Sstevel@tonic-gate * No other thread is assigned to the 33250Sstevel@tonic-gate * same lgroup as the exiting thread, 33260Sstevel@tonic-gate * clear the lgroup's bit in p_lgrpset. 33270Sstevel@tonic-gate */ 33280Sstevel@tonic-gate klgrpset_del(p->p_lgrpset, oldid); 33290Sstevel@tonic-gate break; 33300Sstevel@tonic-gate } 33310Sstevel@tonic-gate } 33320Sstevel@tonic-gate } 33330Sstevel@tonic-gate 33340Sstevel@tonic-gate /* 33350Sstevel@tonic-gate * If this thread was assigned to its old lgroup for such a 33360Sstevel@tonic-gate * short amount of time that the anticipatory load that was 33370Sstevel@tonic-gate * added on its behalf has aged very little, remove that 33380Sstevel@tonic-gate * anticipatory load. 
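 *
 * The loop below does this by walking from the old leaf to the root,
 * subtracting LGRP_LOADAVG_MAX_EFFECT(ncpu) at each level; if aging has
 * already shrunk the load below that amount, the load is simply clamped
 * to zero.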
33390Sstevel@tonic-gate */ 33400Sstevel@tonic-gate if ((t->t_anttime + lgrp_min_nsec > gethrtime()) && 33410Sstevel@tonic-gate ((ncpu = oldlpl->lpl_ncpu) > 0)) { 33420Sstevel@tonic-gate lpl = oldlpl; 33430Sstevel@tonic-gate for (;;) { 33440Sstevel@tonic-gate do { 33450Sstevel@tonic-gate old = new = lpl->lpl_loadavg; 33460Sstevel@tonic-gate new -= LGRP_LOADAVG_MAX_EFFECT(ncpu); 33470Sstevel@tonic-gate if (new > old) { 33480Sstevel@tonic-gate /* 33490Sstevel@tonic-gate * this can happen if the load 33500Sstevel@tonic-gate * average was aged since we 33510Sstevel@tonic-gate * added in the anticipatory 33520Sstevel@tonic-gate * load 33530Sstevel@tonic-gate */ 33540Sstevel@tonic-gate new = 0; 33550Sstevel@tonic-gate } 33560Sstevel@tonic-gate } while (cas32( 33570Sstevel@tonic-gate (lgrp_load_t *)&lpl->lpl_loadavg, old, 33580Sstevel@tonic-gate new) != old); 33590Sstevel@tonic-gate 33600Sstevel@tonic-gate lpl = lpl->lpl_parent; 33610Sstevel@tonic-gate if (lpl == NULL) 33620Sstevel@tonic-gate break; 33630Sstevel@tonic-gate 33640Sstevel@tonic-gate ncpu = lpl->lpl_ncpu; 33650Sstevel@tonic-gate ASSERT(ncpu > 0); 33660Sstevel@tonic-gate } 33670Sstevel@tonic-gate } 33680Sstevel@tonic-gate } 33690Sstevel@tonic-gate /* 33700Sstevel@tonic-gate * If the thread has a new lgroup (i.e. it's not exiting), update its 33710Sstevel@tonic-gate * t_lpl and its process' p_lgrpset, and apply an anticipatory load 33720Sstevel@tonic-gate * to its new lgroup to account for its move to its new lgroup. 33730Sstevel@tonic-gate */ 33740Sstevel@tonic-gate if (newlpl != NULL) { 33750Sstevel@tonic-gate /* 33760Sstevel@tonic-gate * This thread is moving to a new lgroup 33770Sstevel@tonic-gate */ 33780Sstevel@tonic-gate t->t_lpl = newlpl; 33790Sstevel@tonic-gate 33800Sstevel@tonic-gate /* 33810Sstevel@tonic-gate * Reflect move in load average of new lgroup 33820Sstevel@tonic-gate * unless it is root lgroup 33830Sstevel@tonic-gate */ 33840Sstevel@tonic-gate if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root) 33850Sstevel@tonic-gate return; 33860Sstevel@tonic-gate 33870Sstevel@tonic-gate if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) { 33880Sstevel@tonic-gate klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid); 33890Sstevel@tonic-gate } 33900Sstevel@tonic-gate 33910Sstevel@tonic-gate /* 33920Sstevel@tonic-gate * It'll take some time for the load on the new lgroup 33930Sstevel@tonic-gate * to reflect this thread's placement on it. We'd 33940Sstevel@tonic-gate * rather not, however, have all threads between now 33950Sstevel@tonic-gate * and then also piling on to this lgroup. To avoid 33960Sstevel@tonic-gate * this pileup, we anticipate the load this thread 33970Sstevel@tonic-gate * will generate on its new lgroup. The goal is to 33980Sstevel@tonic-gate * make the lgroup's load appear as though the thread 33990Sstevel@tonic-gate * had been there all along. We're very conservative 34000Sstevel@tonic-gate * in calculating this anticipatory load; we assume 34010Sstevel@tonic-gate * the worst case (a 100% CPU-bound thread). This 34020Sstevel@tonic-gate * may be modified in the future to be more accurate.
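 *
 * For example, a thread moved to a 4-CPU leaf beneath an 8-CPU root has
 * LGRP_LOADAVG_MAX_EFFECT(4) added to the leaf's load and
 * LGRP_LOADAVG_MAX_EFFECT(8) to the root's, with each sum saturating at
 * UINT32_MAX rather than wrapping.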
34030Sstevel@tonic-gate */ 34040Sstevel@tonic-gate lpl = newlpl; 34050Sstevel@tonic-gate for (;;) { 34060Sstevel@tonic-gate ncpu = lpl->lpl_ncpu; 34070Sstevel@tonic-gate ASSERT(ncpu > 0); 34080Sstevel@tonic-gate do { 34090Sstevel@tonic-gate old = new = lpl->lpl_loadavg; 34100Sstevel@tonic-gate new += LGRP_LOADAVG_MAX_EFFECT(ncpu); 34110Sstevel@tonic-gate /* 34120Sstevel@tonic-gate * Check for overflow 34130Sstevel@tonic-gate * Underflow not possible here 34140Sstevel@tonic-gate */ 34150Sstevel@tonic-gate if (new < old) 34160Sstevel@tonic-gate new = UINT32_MAX; 34170Sstevel@tonic-gate } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 34180Sstevel@tonic-gate new) != old); 34190Sstevel@tonic-gate 34200Sstevel@tonic-gate lpl = lpl->lpl_parent; 34210Sstevel@tonic-gate if (lpl == NULL) 34220Sstevel@tonic-gate break; 34230Sstevel@tonic-gate } 34240Sstevel@tonic-gate t->t_anttime = gethrtime(); 34250Sstevel@tonic-gate } 34260Sstevel@tonic-gate } 34270Sstevel@tonic-gate 34280Sstevel@tonic-gate /* 34290Sstevel@tonic-gate * Return lgroup memory allocation policy given advice from madvise(3C) 34300Sstevel@tonic-gate */ 34310Sstevel@tonic-gate lgrp_mem_policy_t 34320Sstevel@tonic-gate lgrp_madv_to_policy(uchar_t advice, size_t size, int type) 34330Sstevel@tonic-gate { 34340Sstevel@tonic-gate switch (advice) { 34350Sstevel@tonic-gate case MADV_ACCESS_LWP: 34360Sstevel@tonic-gate return (LGRP_MEM_POLICY_NEXT); 34370Sstevel@tonic-gate case MADV_ACCESS_MANY: 34380Sstevel@tonic-gate return (LGRP_MEM_POLICY_RANDOM); 34390Sstevel@tonic-gate default: 34400Sstevel@tonic-gate return (lgrp_mem_policy_default(size, type)); 34410Sstevel@tonic-gate } 34420Sstevel@tonic-gate } 34430Sstevel@tonic-gate 34440Sstevel@tonic-gate /* 34450Sstevel@tonic-gate * Figure out default policy 34460Sstevel@tonic-gate */ 34470Sstevel@tonic-gate lgrp_mem_policy_t 34480Sstevel@tonic-gate lgrp_mem_policy_default(size_t size, int type) 34490Sstevel@tonic-gate { 34500Sstevel@tonic-gate cpupart_t *cp; 34510Sstevel@tonic-gate lgrp_mem_policy_t policy; 34520Sstevel@tonic-gate size_t pset_mem_size; 34530Sstevel@tonic-gate 34540Sstevel@tonic-gate /* 34550Sstevel@tonic-gate * Randomly allocate memory across lgroups for private or shared memory 34560Sstevel@tonic-gate * beyond the respective threshold 34570Sstevel@tonic-gate */ 34580Sstevel@tonic-gate if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) || 34590Sstevel@tonic-gate (type == MAP_SHARED && size > lgrp_shm_random_thresh)) { 34600Sstevel@tonic-gate /* 34610Sstevel@tonic-gate * Get total memory size of current thread's pset 34620Sstevel@tonic-gate */ 34630Sstevel@tonic-gate kpreempt_disable(); 34640Sstevel@tonic-gate cp = curthread->t_cpupart; 34650Sstevel@tonic-gate klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size); 34660Sstevel@tonic-gate kpreempt_enable(); 34670Sstevel@tonic-gate 34680Sstevel@tonic-gate /* 34690Sstevel@tonic-gate * Choose policy to randomly allocate memory across 34700Sstevel@tonic-gate * lgroups in the pset if it will fit and this is not the 34710Sstevel@tonic-gate * default partition. Otherwise, allocate memory randomly 34720Sstevel@tonic-gate * across the machine.
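 *
 * For example (sizes are illustrative): with lgrp_mem_pset_aware set, a
 * MAP_SHARED segment larger than lgrp_shm_random_thresh but smaller than
 * the pset's total memory gets LGRP_MEM_POLICY_RANDOM_PSET; if it does not
 * fit in the pset, it falls back to LGRP_MEM_POLICY_RANDOM across the
 * whole machine.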
34730Sstevel@tonic-gate */ 34740Sstevel@tonic-gate if (lgrp_mem_pset_aware && size < pset_mem_size) 34750Sstevel@tonic-gate policy = LGRP_MEM_POLICY_RANDOM_PSET; 34760Sstevel@tonic-gate else 34770Sstevel@tonic-gate policy = LGRP_MEM_POLICY_RANDOM; 34780Sstevel@tonic-gate } else 34790Sstevel@tonic-gate /* 34800Sstevel@tonic-gate * Apply default policy for private memory and 34810Sstevel@tonic-gate * shared memory under the respective random 34820Sstevel@tonic-gate * threshold. 34830Sstevel@tonic-gate */ 34840Sstevel@tonic-gate policy = lgrp_mem_default_policy; 34850Sstevel@tonic-gate 34860Sstevel@tonic-gate return (policy); 34870Sstevel@tonic-gate } 34880Sstevel@tonic-gate 34890Sstevel@tonic-gate /* 34900Sstevel@tonic-gate * Get memory allocation policy for this segment 34910Sstevel@tonic-gate */ 34920Sstevel@tonic-gate lgrp_mem_policy_info_t * 34930Sstevel@tonic-gate lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr) 34940Sstevel@tonic-gate { 34950Sstevel@tonic-gate lgrp_mem_policy_info_t *policy_info; 34960Sstevel@tonic-gate extern struct seg_ops segspt_ops; 34970Sstevel@tonic-gate extern struct seg_ops segspt_shmops; 34980Sstevel@tonic-gate 34990Sstevel@tonic-gate /* 35000Sstevel@tonic-gate * This is for binary compatibility to protect against third party 35010Sstevel@tonic-gate * segment drivers which haven't recompiled to allow for 35020Sstevel@tonic-gate * SEGOP_GETPOLICY() 35030Sstevel@tonic-gate */ 35040Sstevel@tonic-gate if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops && 35050Sstevel@tonic-gate seg->s_ops != &segspt_shmops) 35060Sstevel@tonic-gate return (NULL); 35070Sstevel@tonic-gate 35080Sstevel@tonic-gate policy_info = NULL; 35090Sstevel@tonic-gate if (seg->s_ops->getpolicy != NULL) 35100Sstevel@tonic-gate policy_info = SEGOP_GETPOLICY(seg, vaddr); 35110Sstevel@tonic-gate 35120Sstevel@tonic-gate return (policy_info); 35130Sstevel@tonic-gate } 35140Sstevel@tonic-gate 35150Sstevel@tonic-gate /* 35160Sstevel@tonic-gate * Set policy for allocating private memory given the desired policy, the 35170Sstevel@tonic-gate * policy info, and the size in bytes of the memory the policy is applied to. 35180Sstevel@tonic-gate * Return 0 if policy wasn't set already and 1 if policy was set already 35190Sstevel@tonic-gate */ 35200Sstevel@tonic-gate int 35210Sstevel@tonic-gate lgrp_privm_policy_set(lgrp_mem_policy_t policy, 35220Sstevel@tonic-gate lgrp_mem_policy_info_t *policy_info, size_t size) 35230Sstevel@tonic-gate { 35240Sstevel@tonic-gate 35250Sstevel@tonic-gate ASSERT(policy_info != NULL); 35260Sstevel@tonic-gate 35270Sstevel@tonic-gate if (policy == LGRP_MEM_POLICY_DEFAULT) 35280Sstevel@tonic-gate policy = lgrp_mem_policy_default(size, MAP_PRIVATE); 35290Sstevel@tonic-gate 35300Sstevel@tonic-gate /* 35310Sstevel@tonic-gate * Policy set already?
35320Sstevel@tonic-gate */ 35330Sstevel@tonic-gate if (policy == policy_info->mem_policy) 35340Sstevel@tonic-gate return (1); 35350Sstevel@tonic-gate 35360Sstevel@tonic-gate /* 35370Sstevel@tonic-gate * Set policy 35380Sstevel@tonic-gate */ 35390Sstevel@tonic-gate policy_info->mem_policy = policy; 35400Sstevel@tonic-gate policy_info->mem_reserved = 0; 35410Sstevel@tonic-gate 35420Sstevel@tonic-gate return (0); 35430Sstevel@tonic-gate } 35440Sstevel@tonic-gate 35450Sstevel@tonic-gate 35460Sstevel@tonic-gate /* 35470Sstevel@tonic-gate * Get shared memory allocation policy with given tree and offset 35480Sstevel@tonic-gate */ 35490Sstevel@tonic-gate lgrp_mem_policy_info_t * 35500Sstevel@tonic-gate lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 35510Sstevel@tonic-gate u_offset_t vn_off) 35520Sstevel@tonic-gate { 35530Sstevel@tonic-gate u_offset_t off; 35540Sstevel@tonic-gate lgrp_mem_policy_info_t *policy_info; 35550Sstevel@tonic-gate lgrp_shm_policy_seg_t *policy_seg; 35560Sstevel@tonic-gate lgrp_shm_locality_t *shm_locality; 35570Sstevel@tonic-gate avl_tree_t *tree; 35580Sstevel@tonic-gate avl_index_t where; 35590Sstevel@tonic-gate 35600Sstevel@tonic-gate /* 35610Sstevel@tonic-gate * Get policy segment tree from anon_map or vnode and use specified 35620Sstevel@tonic-gate * anon index or vnode offset as offset 35630Sstevel@tonic-gate * 35640Sstevel@tonic-gate * Assume that no lock needs to be held on anon_map or vnode, since 35650Sstevel@tonic-gate * they should be protected by their reference count which must be 35660Sstevel@tonic-gate * nonzero for an existing segment 35670Sstevel@tonic-gate */ 35680Sstevel@tonic-gate if (amp) { 35690Sstevel@tonic-gate ASSERT(amp->refcnt != 0); 35700Sstevel@tonic-gate shm_locality = amp->locality; 35710Sstevel@tonic-gate if (shm_locality == NULL) 35720Sstevel@tonic-gate return (NULL); 35730Sstevel@tonic-gate tree = shm_locality->loc_tree; 35740Sstevel@tonic-gate off = ptob(anon_index); 35750Sstevel@tonic-gate } else if (vp) { 35760Sstevel@tonic-gate shm_locality = vp->v_locality; 35770Sstevel@tonic-gate if (shm_locality == NULL) 35780Sstevel@tonic-gate return (NULL); 35790Sstevel@tonic-gate ASSERT(shm_locality->loc_count != 0); 35800Sstevel@tonic-gate tree = shm_locality->loc_tree; 35810Sstevel@tonic-gate off = vn_off; 35820Sstevel@tonic-gate } 35830Sstevel@tonic-gate 35840Sstevel@tonic-gate if (tree == NULL) 35850Sstevel@tonic-gate return (NULL); 35860Sstevel@tonic-gate 35870Sstevel@tonic-gate /* 35880Sstevel@tonic-gate * Lookup policy segment for offset into shared object and return 35890Sstevel@tonic-gate * policy info 35900Sstevel@tonic-gate */ 35910Sstevel@tonic-gate rw_enter(&shm_locality->loc_lock, RW_READER); 35920Sstevel@tonic-gate policy_info = NULL; 35930Sstevel@tonic-gate policy_seg = avl_find(tree, &off, &where); 35940Sstevel@tonic-gate if (policy_seg) 35950Sstevel@tonic-gate policy_info = &policy_seg->shm_policy; 35960Sstevel@tonic-gate rw_exit(&shm_locality->loc_lock); 35970Sstevel@tonic-gate 35980Sstevel@tonic-gate return (policy_info); 35990Sstevel@tonic-gate } 36000Sstevel@tonic-gate 36010Sstevel@tonic-gate /* 3602*2480Sesaxe * Default memory allocation policy for kernel segmap pages 3603*2480Sesaxe */ 3604*2480Sesaxe lgrp_mem_policy_t lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM; 3605*2480Sesaxe 3606*2480Sesaxe /* 36070Sstevel@tonic-gate * Return lgroup to use for allocating memory 36080Sstevel@tonic-gate * given the segment and address 36090Sstevel@tonic-gate * 36100Sstevel@tonic-gate * There isn't any 
/*
 * Default memory allocation policy for kernel segmap pages
 */
lgrp_mem_policy_t	lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;

/*
 * Return lgroup to use for allocating memory
 * given the segment and address
 *
 * There isn't any mutual exclusion between calls
 * to this routine and DR, so this routine and whomever calls it
 * should be mindful of the possibility that the lgrp returned
 * may be deleted. If this happens, dereferences of the lgrp
 * pointer will still be safe, but the resources in the lgrp will
 * be gone, and LGRP_EXISTS() will no longer be true.
 */
lgrp_t *
lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
{
	int			i;
	lgrp_t			*lgrp;
	klgrpset_t		lgrpset;
	int			lgrps_spanned;
	unsigned long		off;
	lgrp_mem_policy_t	policy;
	lgrp_mem_policy_info_t	*policy_info;
	ushort_t		random;
	int			stat = 0;
	extern struct seg	*segkmap;

	/*
	 * Just return the root lgroup if the lgrp framework hasn't finished
	 * initializing or if this is a UMA machine.
	 */
	if (nlgrps == 1 || !lgrp_initialized)
		return (lgrp_root);

	/*
	 * Get memory allocation policy for this segment
	 */
	policy = lgrp_mem_default_policy;
	if (seg != NULL) {
		if (seg->s_as == &kas) {
			if (seg == segkmap)
				policy = lgrp_segmap_default_policy;
			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
				policy = LGRP_MEM_POLICY_RANDOM;
		} else {
			policy_info = lgrp_mem_policy_get(seg, vaddr);
			if (policy_info != NULL)
				policy = policy_info->mem_policy;
		}
	}
	lgrpset = 0;

	/*
	 * Initialize lgroup to home by default
	 */
	lgrp = lgrp_home_lgrp();

	/*
	 * When homing threads on root lgrp, override default memory
	 * allocation policies with root lgroup memory allocation policy
	 */
	if (lgrp == lgrp_root)
		policy = lgrp_mem_policy_root;

	/*
	 * Implement policy
	 */
	switch (policy) {
	case LGRP_MEM_POLICY_NEXT_CPU:

		/*
		 * Return lgroup of current CPU which faulted on memory
		 * If the CPU isn't currently in an lgrp, then opt to
		 * allocate from the root.
		 *
		 * Kernel preemption needs to be disabled here to prevent
		 * the current CPU from going away before lgrp is found.
		 */
		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
			lgrp = lgrp_root;
		} else {
			kpreempt_disable();
			lgrp = lgrp_cpu_to_lgrp(CPU);
			kpreempt_enable();
		}
		break;

	case LGRP_MEM_POLICY_NEXT:
	case LGRP_MEM_POLICY_DEFAULT:
	default:

		/*
		 * Just return current thread's home lgroup
		 * for default policy (next touch)
		 * If the thread is homed to the root,
		 * then the default policy is random across lgroups.
		 * Fallthrough to the random case.
		 */
		if (lgrp != lgrp_root) {
			if (policy == LGRP_MEM_POLICY_NEXT)
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
			else
				lgrp_stat_add(lgrp->lgrp_id,
				    LGRP_NUM_DEFAULT, 1);
			break;
		}
		/* LINTED fallthrough on case statement */
	case LGRP_MEM_POLICY_RANDOM:

		/*
		 * Return a random leaf lgroup with memory
		 */
		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
		/*
		 * Count how many lgroups are spanned
		 */
		klgrpset_nlgrps(lgrpset, lgrps_spanned);

		/*
		 * There may be no memnodes in the root lgroup during DR copy
		 * rename on a system with only two boards (memnodes)
		 * configured. In this case just return the root lgrp.
		 */
		if (lgrps_spanned == 0) {
			lgrp = lgrp_root;
			break;
		}

		/*
		 * Pick a random offset within lgroups spanned
		 * and return lgroup at that offset
		 */
		random = (ushort_t)gethrtime() >> 4;
		off = random % lgrps_spanned;
		ASSERT(off <= lgrp_alloc_max);
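		/*
		 * For example (illustrative numbers only): if lgrpset
		 * spans lgroup IDs {1, 3, 4}, then lgrps_spanned is 3.
		 * With off == 2, the loop below skips IDs 0 and 2 (not
		 * in the set), decrements off at IDs 1 and 3, and
		 * selects lgrp_table[4], the third member of the set.
		 */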
		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (!klgrpset_ismember(lgrpset, i))
				continue;
			if (off)
				off--;
			else {
				lgrp = lgrp_table[i];
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
				    1);
				break;
			}
		}
		break;

	case LGRP_MEM_POLICY_RANDOM_PROC:

		/*
		 * Grab copy of bitmask of lgroups spanned by
		 * this process
		 */
		klgrpset_copy(lgrpset, curproc->p_lgrpset);
		stat = LGRP_NUM_RANDOM_PROC;

		/* LINTED fallthrough on case statement */
	case LGRP_MEM_POLICY_RANDOM_PSET:

		if (!stat)
			stat = LGRP_NUM_RANDOM_PSET;

		if (klgrpset_isempty(lgrpset)) {
			/*
			 * Grab copy of bitmask of lgroups spanned by
			 * this processor set
			 */
			kpreempt_disable();
			klgrpset_copy(lgrpset,
			    curthread->t_cpupart->cp_lgrpset);
			kpreempt_enable();
		}

		/*
		 * Count how many lgroups are spanned
		 */
		klgrpset_nlgrps(lgrpset, lgrps_spanned);
		ASSERT(lgrps_spanned <= nlgrps);

		/*
		 * lgrps_spanned should always be non-zero, but to be
		 * on the safe side we return lgrp_root if it is zero.
		 */
		if (lgrps_spanned == 0) {
			lgrp = lgrp_root;
			break;
		}

		/*
		 * Pick a random offset within lgroups spanned
		 * and return lgroup at that offset
		 */
		random = (ushort_t)gethrtime() >> 4;
		off = random % lgrps_spanned;
		ASSERT(off <= lgrp_alloc_max);

		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (!klgrpset_ismember(lgrpset, i))
				continue;
			if (off)
				off--;
			else {
				lgrp = lgrp_table[i];
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
				    1);
				break;
			}
		}
		break;

	case LGRP_MEM_POLICY_ROUNDROBIN:

		/*
		 * Use offset within segment to determine
		 * offset from home lgroup to choose for
		 * next lgroup to allocate memory from
		 */
		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
		    (lgrp_alloc_max + 1);

		kpreempt_disable();
		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
		i = lgrp->lgrp_id;
		kpreempt_enable();
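		/*
		 * For example (illustrative numbers only): with
		 * pgsz == 8K, lgrp_alloc_max == 3, and a fault 24K into
		 * the segment, off is (24K / 8K) % 4 == 3, so the walk
		 * below advances three lgroups with memory past the
		 * home lgroup, wrapping lgroup IDs modulo
		 * lgrp_alloc_max + 1 and skipping IDs that have no
		 * memory resources.
		 */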
		while (off > 0) {
			i = (i + 1) % (lgrp_alloc_max + 1);
			lgrp = lgrp_table[i];
			if (klgrpset_ismember(lgrpset, i))
				off--;
		}
		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);

		break;
	}

	ASSERT(lgrp != NULL);
	return (lgrp);
}

/*
 * Return the number of pages in an lgroup
 *
 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
 * could cause tests that rely on the numat driver to fail....
 */
pgcnt_t
lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
{
	lgrp_t *lgrp;

	lgrp = lgrp_table[lgrpid];
	if (!LGRP_EXISTS(lgrp) ||
	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
		return (0);

	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
}

/*
 * Initialize lgroup shared memory allocation policy support
 */
void
lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
{
	lgrp_shm_locality_t	*shm_locality;

	/*
	 * Initialize locality field in anon_map
	 * Don't need any locks because this is called when anon_map is
	 * allocated, but not used anywhere yet.
	 */
	if (amp) {
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		if (amp->locality == NULL) {
			/*
			 * Allocate and initialize shared memory locality info
			 * and set anon_map locality pointer to it
			 * Drop lock across kmem_alloc(KM_SLEEP)
			 */
			ANON_LOCK_EXIT(&amp->a_rwlock);
			shm_locality = kmem_alloc(sizeof (*shm_locality),
			    KM_SLEEP);
			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
			    NULL);
			shm_locality->loc_count = 1;	/* not used for amp */
			shm_locality->loc_tree = NULL;

			/*
			 * Reacquire lock and check to see whether anyone beat
			 * us to initializing the locality info
			 */
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			if (amp->locality != NULL) {
				rw_destroy(&shm_locality->loc_lock);
				kmem_free(shm_locality,
				    sizeof (*shm_locality));
			} else
				amp->locality = shm_locality;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);
		return;
	}

	/*
	 * Allocate shared vnode policy info if vnode is not locality aware yet
	 */
	mutex_enter(&vp->v_lock);
	if ((vp->v_flag & V_LOCALITY) == 0) {
		/*
		 * Allocate and initialize shared memory locality info
		 */
		mutex_exit(&vp->v_lock);
		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
		shm_locality->loc_count = 1;
		shm_locality->loc_tree = NULL;

		/*
		 * Point vnode locality field at shared vnode policy info
		 * and set locality aware flag in vnode
		 */
		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & V_LOCALITY) == 0) {
			vp->v_locality = shm_locality;
			vp->v_flag |= V_LOCALITY;
		} else {
			/*
			 * Lost race so free locality info and increment count.
			 */
			rw_destroy(&shm_locality->loc_lock);
			kmem_free(shm_locality, sizeof (*shm_locality));
			shm_locality = vp->v_locality;
			shm_locality->loc_count++;
		}
		mutex_exit(&vp->v_lock);

		return;
	}

	/*
	 * Increment reference count of number of segments mapping this vnode
	 * shared
	 */
	shm_locality = vp->v_locality;
	shm_locality->loc_count++;
	mutex_exit(&vp->v_lock);
}

/*
 * Destroy the given shared memory policy segment tree
 */
void
lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
{
	lgrp_shm_policy_seg_t	*cur;
	lgrp_shm_policy_seg_t	*next;

	if (tree == NULL)
		return;

	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
	while (cur != NULL) {
		next = AVL_NEXT(tree, cur);
		avl_remove(tree, cur);
		kmem_free(cur, sizeof (*cur));
		cur = next;
	}
	kmem_free(tree, sizeof (avl_tree_t));
}

/*
 * Uninitialize lgroup shared memory allocation policy support
 */
void
lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
{
	lgrp_shm_locality_t	*shm_locality;

	/*
	 * For anon_map, deallocate shared memory policy tree and
	 * zero locality field
	 * Don't need any locks because anon_map is being freed
	 */
	if (amp) {
		if (amp->locality == NULL)
			return;
		shm_locality = amp->locality;
		shm_locality->loc_count = 0;	/* not really used for amp */
		rw_destroy(&shm_locality->loc_lock);
		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
		kmem_free(shm_locality, sizeof (*shm_locality));
		amp->locality = 0;
		return;
	}

	/*
	 * For vnode, decrement reference count of segments mapping this vnode
	 * shared and delete locality info if reference count drops to 0
	 */
	mutex_enter(&vp->v_lock);
	shm_locality = vp->v_locality;
	shm_locality->loc_count--;

	if (shm_locality->loc_count == 0) {
		rw_destroy(&shm_locality->loc_lock);
		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
		kmem_free(shm_locality, sizeof (*shm_locality));
		vp->v_locality = 0;
		vp->v_flag &= ~V_LOCALITY;
	}
	mutex_exit(&vp->v_lock);
}

/*
 * Compare two shared memory policy segments
 * Used by AVL tree code for searching
 */
int
lgrp_shm_policy_compar(const void *x, const void *y)
{
	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;

	if (a->shm_off < b->shm_off)
		return (-1);
	if (a->shm_off >= b->shm_off + b->shm_size)
		return (1);
	return (0);
}
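/*
 * Note (added commentary): lookups pass a bare u_offset_t pointer as
 * the search key (see the avl_find() calls in lgrp_shm_policy_get()
 * and lgrp_shm_policy_set()). This relies on shm_off being the first
 * member of lgrp_shm_policy_seg_t, so the cast above reads the key
 * offset correctly. The comparator also deliberately reports "equal"
 * for any offset falling within [shm_off, shm_off + shm_size), e.g.
 * a segment at offset 8K with size 16K matches lookups for any offset
 * in [8K, 24K).
 */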
/*
 * Concatenate seg1 with seg2 and remove seg2
 */
static int
lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
    lgrp_shm_policy_seg_t *seg2)
{
	if (!seg1 || !seg2 ||
	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
		return (-1);

	seg1->shm_size += seg2->shm_size;
	avl_remove(tree, seg2);
	kmem_free(seg2, sizeof (*seg2));
	return (0);
}

/*
 * Split segment at given offset and return rightmost (uppermost) segment
 * Assumes that there are no overlapping segments
 */
static lgrp_shm_policy_seg_t *
lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
    u_offset_t off)
{
	lgrp_shm_policy_seg_t	*newseg;
	avl_index_t		where;

	ASSERT(seg != NULL);
	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);

	if (!seg || off < seg->shm_off || off > seg->shm_off +
	    seg->shm_size)
		return (NULL);

	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
		return (seg);

	/*
	 * Adjust size of left segment and allocate new (right) segment
	 */
	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
	newseg->shm_policy = seg->shm_policy;
	newseg->shm_off = off;
	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
	seg->shm_size = off - seg->shm_off;

	/*
	 * Find where to insert new segment in AVL tree and insert it
	 */
	(void) avl_find(tree, &off, &where);
	avl_insert(tree, newseg, where);

	return (newseg);
}
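/*
 * For example (illustrative numbers only): splitting a segment that
 * covers [0, 32K) at off == 8K shrinks the original segment to
 * [0, 8K) and returns a new segment covering [8K, 32K); both sides
 * initially keep the original policy. A caller typically retags one
 * side and relies on lgrp_shm_policy_concat() to re-merge neighbors
 * whose policies end up matching again.
 */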
/*
 * Set shared memory allocation policy on specified shared object at given
 * offset and length
 *
 * Return 0 if policy wasn't set already, 1 if policy was set already, and
 * -1 if can't set policy.
 */
int
lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
    ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
{
	u_offset_t		eoff;
	lgrp_shm_policy_seg_t	*next;
	lgrp_shm_policy_seg_t	*newseg;
	u_offset_t		off;
	u_offset_t		oldeoff;
	lgrp_shm_policy_seg_t	*prev;
	int			retval;
	lgrp_shm_policy_seg_t	*seg;
	lgrp_shm_locality_t	*shm_locality;
	avl_tree_t		*tree;
	avl_index_t		where;

	ASSERT(amp || vp);
	ASSERT((len & PAGEOFFSET) == 0);

	if (len == 0)
		return (-1);

	retval = 0;

	/*
	 * Get locality info and starting offset into shared object
	 * Try anon map first and then vnode
	 * Assume that no locks need to be held on anon_map or vnode, since
	 * it should be protected by its reference count which must be nonzero
	 * for an existing segment.
	 */
	if (amp) {
		/*
		 * Get policy info from anon_map
		 */
		ASSERT(amp->refcnt != 0);
		if (amp->locality == NULL)
			lgrp_shm_policy_init(amp, NULL);
		shm_locality = amp->locality;
		off = ptob(anon_index);
	} else if (vp) {
		/*
		 * Get policy info from vnode
		 */
		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
			lgrp_shm_policy_init(NULL, vp);
		shm_locality = vp->v_locality;
		ASSERT(shm_locality->loc_count != 0);
		off = vn_off;
	} else
		return (-1);

	ASSERT((off & PAGEOFFSET) == 0);

	/*
	 * Figure out default policy
	 */
	if (policy == LGRP_MEM_POLICY_DEFAULT)
		policy = lgrp_mem_policy_default(len, MAP_SHARED);

	/*
	 * Create AVL tree if there isn't one yet
	 * and set locality field to point at it
	 */
	rw_enter(&shm_locality->loc_lock, RW_WRITER);
	tree = shm_locality->loc_tree;
	if (!tree) {
		rw_exit(&shm_locality->loc_lock);

		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

		rw_enter(&shm_locality->loc_lock, RW_WRITER);
		if (shm_locality->loc_tree == NULL) {
			avl_create(tree, lgrp_shm_policy_compar,
			    sizeof (lgrp_shm_policy_seg_t),
			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
			shm_locality->loc_tree = tree;
		} else {
			/*
			 * Another thread managed to set up the tree
			 * before we could. Free the tree we allocated
			 * and use the one that's already there.
			 */
			kmem_free(tree, sizeof (*tree));
			tree = shm_locality->loc_tree;
		}
	}

	/*
	 * Set policy
	 *
	 * Need to maintain hold on writer's lock to keep tree from
	 * changing out from under us
	 */
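	/*
	 * For example (illustrative numbers only): if the tree holds a
	 * single segment covering [0, 16K) with policy A and the caller
	 * sets policy B on [8K, 16K), the loop below finds that segment,
	 * splits it at 8K, retags the right-hand piece [8K, 16K) with B,
	 * and attempts to merge it with the segment after it, leaving
	 * [0, 8K) with policy A and [8K, 16K) with policy B. Repeating
	 * the same call would then return 1 without changing the tree.
	 */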
	while (len != 0) {
		/*
		 * Find policy segment for specified offset into shared object
		 */
		seg = avl_find(tree, &off, &where);

		/*
		 * Didn't find any existing segment that contains specified
		 * offset, so allocate new segment, insert it, and concatenate
		 * with adjacent segments if possible
		 */
		if (seg == NULL) {
			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
			    KM_SLEEP);
			newseg->shm_policy.mem_policy = policy;
			newseg->shm_policy.mem_reserved = 0;
			newseg->shm_off = off;
			avl_insert(tree, newseg, where);

			/*
			 * Check to see whether new segment overlaps with next
			 * one, set length of new segment accordingly, and
			 * calculate remaining length and next offset
			 */
			seg = AVL_NEXT(tree, newseg);
			if (seg == NULL || off + len <= seg->shm_off) {
				newseg->shm_size = len;
				len = 0;
			} else {
				newseg->shm_size = seg->shm_off - off;
				off = seg->shm_off;
				len -= newseg->shm_size;
			}

			/*
			 * Try to concatenate new segment with next and
			 * previous ones, since they might have the same policy
			 * now. Grab previous and next segments first because
			 * they will change on concatenation.
			 */
			prev = AVL_PREV(tree, newseg);
			next = AVL_NEXT(tree, newseg);
			(void) lgrp_shm_policy_concat(tree, newseg, next);
			(void) lgrp_shm_policy_concat(tree, prev, newseg);

			continue;
		}

		eoff = off + len;
		oldeoff = seg->shm_off + seg->shm_size;

		/*
		 * Policy set already?
		 */
		if (policy == seg->shm_policy.mem_policy) {
			/*
			 * Nothing left to do if offset and length
			 * fall within this segment
			 */
			if (eoff <= oldeoff) {
				retval = 1;
				break;
			} else {
				len = eoff - oldeoff;
				off = oldeoff;
				continue;
			}
		}

		/*
		 * Specified offset and length match existing segment exactly
		 */
		if (off == seg->shm_off && len == seg->shm_size) {
			/*
			 * Set policy and update current length
			 */
			seg->shm_policy.mem_policy = policy;
			seg->shm_policy.mem_reserved = 0;
			len = 0;

			/*
			 * Try concatenating new segment with previous and next
			 * segments, since they might have the same policy now.
			 * Grab previous and next segments first because they
			 * will change on concatenation.
			 */
			prev = AVL_PREV(tree, seg);
			next = AVL_NEXT(tree, seg);
			(void) lgrp_shm_policy_concat(tree, seg, next);
			(void) lgrp_shm_policy_concat(tree, prev, seg);
		} else {
			/*
			 * Specified offset and length only apply to part of
			 * existing segment
			 */

			/*
			 * New segment starts in middle of old one, so split
			 * new one off near beginning of old one
			 */
			newseg = NULL;
			if (off > seg->shm_off) {
				newseg = lgrp_shm_policy_split(tree, seg, off);

				/*
				 * New segment ends where old one did, so try
				 * to concatenate with next segment
				 */
				if (eoff == oldeoff) {
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_reserved = 0;
					(void) lgrp_shm_policy_concat(tree,
					    newseg, AVL_NEXT(tree, newseg));
					break;
				}
			}

			/*
			 * New segment ends before old one, so split off end of
			 * old one
			 */
			if (eoff < oldeoff) {
				if (newseg) {
					(void) lgrp_shm_policy_split(tree,
					    newseg, eoff);
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_reserved = 0;
				} else {
					(void) lgrp_shm_policy_split(tree, seg,
					    eoff);
					seg->shm_policy.mem_policy = policy;
					seg->shm_policy.mem_reserved = 0;
				}

				if (off == seg->shm_off)
					(void) lgrp_shm_policy_concat(tree,
					    AVL_PREV(tree, seg), seg);
				break;
			}

			/*
			 * Calculate remaining length and next offset
			 */
			len = eoff - oldeoff;
			off = oldeoff;
		}
	}

	rw_exit(&shm_locality->loc_lock);
	return (retval);
}

/*
 * Return the best memnode from which to allocate memory given
 * an lgroup.
 *
 * "c" is for cookie, which is good enough for me.
 * It references a cookie struct that should be zero'ed to initialize.
 * The cookie should live on the caller's stack.
 *
 * The routine returns -1 when:
 *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
 *	- traverse is 1, and all the memnodes in the system have been
 *	  returned.
 */
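/*
 * Illustrative usage sketch (added commentary, not part of the
 * original source), assuming the LGRP_MNODE_COOKIE_INIT() initializer
 * from <sys/lgrp.h>; "pick_lgrp" is a hypothetical lgroup already
 * chosen by the caller, e.g. via lgrp_mem_choose():
 *
 *	lgrp_mnode_cookie_t	c;
 *	int			mnode;
 *
 *	LGRP_MNODE_COOKIE_INIT(c, pick_lgrp, LGRP_SRCH_HIER);
 *	while ((mnode = lgrp_memnode_choose(&c)) != -1) {
 *		(try to allocate a page from "mnode"; stop on success,
 *		otherwise loop to get the next closest memnode)
 *	}
 */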
int
lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
{
	lgrp_t		*lp = c->lmc_lgrp;
	mnodeset_t	nodes = c->lmc_nodes;
	int		cnt = c->lmc_cnt;
	int		offset, mnode;

	extern int	max_mem_nodes;

	/*
	 * If the set is empty, and the caller is willing, traverse
	 * up the hierarchy until we find a non-empty set.
	 */
	while (nodes == (mnodeset_t)0 || cnt <= 0) {
		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
		    ((lp = lp->lgrp_parent) == NULL))
			return (-1);

		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
	}

	/*
	 * Select a memnode by picking one at a "random" offset.
	 * Because of DR, memnodes can come and go at any time.
	 * This code must be able to cope with the possibility
	 * that the nodes count "cnt" is inconsistent with respect
	 * to the number of elements actually in "nodes", and
	 * therefore that the offset chosen could be greater than
	 * the number of elements in the set (some memnodes may
	 * have disappeared just before cnt was read).
	 * If this happens, the search simply wraps back to the
	 * beginning of the set.
	 */
	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
	offset = c->lmc_rand % cnt;
	do {
		for (mnode = 0; mnode < max_mem_nodes; mnode++)
			if (nodes & ((mnodeset_t)1 << mnode))
				if (!offset--)
					break;
	} while (mnode >= max_mem_nodes);

	/* Found a node. Store state before returning. */
	c->lmc_lgrp = lp;
	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
	c->lmc_cnt = cnt - 1;
	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
	c->lmc_ntried++;

	return (mnode);
}