/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Basic NUMA support in terms of locality groups
 *
 * Solaris needs to know which CPUs, memory, etc. are near each other to
 * provide good performance on NUMA machines by optimizing for locality.
 * In order to do this, a new abstraction called a "locality group (lgroup)"
 * has been introduced to keep track of which CPU-like and memory-like hardware
 * resources are close to each other.  Currently, latency is the only measure
 * used to determine how to group hardware resources into lgroups, but this
 * does not limit the groupings to be based solely on latency.  Other factors
 * may be used to determine the groupings in the future.
 *
 * Lgroups are organized into a hierarchy or topology that represents the
 * latency topology of the machine.  There is always at least a root lgroup in
 * the system.  It represents all the hardware resources in the machine at a
 * latency big enough that any hardware resource can at least access any other
 * hardware resource within that latency.  A Uniform Memory Access (UMA)
 * machine is represented with one lgroup (the root).  In contrast, a NUMA
 * machine is represented at least by the root lgroup and some number of leaf
 * lgroups where the leaf lgroups contain the hardware resources within the
 * least latency of each other and the root lgroup still contains all the
 * resources in the machine.  Some number of intermediate lgroups may exist
 * which represent more levels of locality than just the local latency of the
 * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
 * (eg. root and intermediate lgroups) contain the next nearest resources to
 * their children lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
 * to the root lgroup shows the hardware resources from closest to farthest
 * from the leaf lgroup such that each successive ancestor lgroup contains
 * the next nearest resources at the next level of locality from the previous.
 *
 * The kernel uses the lgroup abstraction to know how to allocate resources
 * near a given process/thread.  At fork() and lwp/thread_create() time, a
 * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
 * with the lowest load average.  Binding to a processor or processor set will
 * change the home lgroup for a thread.  The scheduler has been modified to try
 * to dispatch a thread on a CPU in its home lgroup.  Physical memory
 * allocation is lgroup aware too, so memory will be allocated from the current
 * thread's home lgroup if possible.  If the desired resources are not
 * available, the kernel traverses the lgroup hierarchy going to the parent
 * lgroup to find resources at the next level of locality until it reaches the
 * root lgroup.
 */
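
/*
 * Illustrative sketch (not part of this file's code): the upward traversal
 * described above amounts to following lgrp_parent links from a thread's
 * home lgroup until something can be allocated or the root is reached.
 * try_alloc_from() is a hypothetical helper; lpl_lgrpid, lgrp_table[] and
 * lgrp_parent are the real fields involved.
 *
 *	lgrp_t *lgrp = lgrp_table[curthread->t_lpl->lpl_lgrpid];
 *	while (lgrp != NULL) {
 *		if (try_alloc_from(lgrp))	-- hypothetical helper
 *			break;
 *		lgrp = lgrp->lgrp_parent;	-- NULL once past the root
 *	}
 */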

#include <sys/lgrp.h>
#include <sys/lgrp_user.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/var.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/as.h>
#include <sys/atomic.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/sysmacros.h>
#include <sys/pg.h>
#include <sys/promif.h>
#include <sys/sdt.h>

lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
				/* indexed by lgrp_id */
int	nlgrps;			/* number of lgroups in machine */
int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */

/*
 * Kstat data for lgroups.
 *
 * Actual kstat data is collected in lgrp_stats array.
 * The lgrp_kstat_data array of named kstats is used to extract data from
 * lgrp_stats and present it to the kstat framework. It is protected from
 * parallel modifications by lgrp_kstat_mutex. This may cause some contention
 * when several kstat commands run in parallel, but this is not the
 * performance-critical path.
 */
extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */

/*
 * Declare kstat names statically for enums as defined in the header file.
 */
LGRP_KSTAT_NAMES;

static void	lgrp_kstat_init(void);
static int	lgrp_kstat_extract(kstat_t *, int);
static void	lgrp_kstat_reset(lgrp_id_t);

static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
static kmutex_t lgrp_kstat_mutex;


/*
 * max number of lgroups supported by the platform
 */
int	nlgrpsmax = 0;

/*
 * The root lgroup. Represents the set of resources at the system wide
 * level of locality.
 */
lgrp_t		*lgrp_root = NULL;

/*
 * During system bootstrap cp_default does not contain the list of lgrp load
 * averages (cp_lgrploads). The list is allocated after the first CPU is brought
 * on-line when cp_default is initialized by cpupart_initialize_default().
 * Configuring CPU0 may create a two-level topology with root and one leaf node
 * containing CPU0. This topology is initially constructed in a special
 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
 * for all lpl operations until cp_default is fully constructed.
 *
 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
 * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
 * the first element of lpl_bootstrap_list.
 *
 * CPUs that are added to the system, but have not yet been assigned to an
 * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
 * on some architectures (x86) it's possible for the slave CPU startup thread
 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
 */
#define	LPL_BOOTSTRAP_SIZE 2
static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
lpl_t		*lpl_bootstrap;
static lpl_t	*lpl_bootstrap_rset[LPL_BOOTSTRAP_SIZE];
static int	lpl_bootstrap_id2rset[LPL_BOOTSTRAP_SIZE];

/*
 * If cp still references the bootstrap lpl, it has not yet been added to
 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
 * a thread is trying to allocate memory close to a CPU that has no lgrp.
 */
#define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)
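
/*
 * Illustrative sketch only: a memory allocation path can use the macro
 * above to fall back to the root lgroup when the current CPU has not been
 * assigned to an lgrp yet (e.g. early in slave CPU startup):
 *
 *	if (LGRP_CPU_HAS_NO_LGRP(CPU))
 *		lgrp = lgrp_root;	-- no locality info yet, use root
 */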

static lgrp_t	lroot;

/*
 * Size, in bytes, beyond which random memory allocation policy is applied
 * to non-shared memory.  Default is the maximum size, so random memory
 * allocation won't be used for non-shared memory by default.
 */
size_t	lgrp_privm_random_thresh = (size_t)(-1);

/* the maximum effect that a single thread can have on its lgroup's load */
#define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
	((lgrp_loadavg_max_effect) / (ncpu))
uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
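
/*
 * For example, in an lgroup with 4 CPUs, one thread can contribute at most
 * LGRP_LOADAVG_MAX_EFFECT(4) == lgrp_loadavg_max_effect / 4 to that
 * lgroup's load average.
 */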


/*
 * Size, in bytes, beyond which random memory allocation policy is applied to
 * shared memory.  Default is 8MB (2 ISM pages).
 */
size_t	lgrp_shm_random_thresh = 8*1024*1024;
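
/*
 * Illustrative sketch only: policy selection code compares a mapping's size
 * against the appropriate threshold roughly along these lines, where
 * "shared" and "size" stand for the properties of the mapping in question
 * (hypothetical inputs, not variables defined here):
 *
 *	lgrp_mem_policy_t policy;
 *	size_t		thresh;
 *
 *	thresh = shared ? lgrp_shm_random_thresh : lgrp_privm_random_thresh;
 *	policy = (size >= thresh) ? LGRP_MEM_POLICY_RANDOM :
 *	    lgrp_mem_default_policy;
 */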

/*
 * Whether to do processor set aware memory allocation by default
 */
int	lgrp_mem_pset_aware = 0;

/*
 * Set the default memory allocation policy for root lgroup
 */
lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;

/*
 * Set the default memory allocation policy.  For most platforms,
 * next touch is sufficient, but some platforms may wish to override
 * this.
 */
lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;


/*
 * lgroup CPU event handlers
 */
static void	lgrp_cpu_init(struct cpu *);
static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);

/*
 * lgroup memory event handlers
 */
static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);

/*
 * lgroup CPU partition event handlers
 */
static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
static void	lgrp_part_del_cpu(struct cpu *);

/*
 * lgroup framework initialization
 */
static void	lgrp_main_init(void);
static void	lgrp_main_mp_init(void);
static void	lgrp_root_init(void);
static void	lgrp_setup(void);

/*
 * lpl topology
 */
static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
static void	lpl_clear(lpl_t *);
static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
static void	lpl_rset_add(lpl_t *, lpl_t *);
static void	lpl_rset_del(lpl_t *, lpl_t *);
static int	lpl_rset_contains(lpl_t *, lpl_t *);
static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
static void	lpl_child_update(lpl_t *, struct cpupart *);
static int	lpl_pick(lpl_t *, lpl_t *);
static void	lpl_verify_wrapper(struct cpupart *);

/*
 * defines for lpl topology verifier return codes
 */

#define	LPL_TOPO_CORRECT			0
#define	LPL_TOPO_PART_HAS_NO_LPL		-1
#define	LPL_TOPO_CPUS_NOT_EMPTY			-2
#define	LPL_TOPO_LGRP_MISMATCH			-3
#define	LPL_TOPO_MISSING_PARENT			-4
#define	LPL_TOPO_PARENT_MISMATCH		-5
#define	LPL_TOPO_BAD_CPUCNT			-6
#define	LPL_TOPO_RSET_MISMATCH			-7
#define	LPL_TOPO_LPL_ORPHANED			-8
#define	LPL_TOPO_LPL_BAD_NCPU			-9
#define	LPL_TOPO_RSET_MSSNG_LF			-10
#define	LPL_TOPO_CPU_HAS_BAD_LPL		-11
#define	LPL_TOPO_NONLEAF_HAS_CPUS		-12
#define	LPL_TOPO_LGRP_NOT_LEAF			-13
#define	LPL_TOPO_BAD_RSETCNT			-14

/*
 * Return whether lgroup optimizations should be enabled on this system
 */
int
lgrp_optimizations(void)
{
	/*
	 * System must have more than 2 lgroups to enable lgroup optimizations
	 *
	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
	 * with one child lgroup containing all the resources. A 2 lgroup
	 * system with a root lgroup directly containing CPUs or memory might
	 * need lgroup optimizations with its child lgroup, but there
	 * isn't such a machine for now....
	 */
	if (nlgrps > 2)
		return (1);

	return (0);
}
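
/*
 * Illustrative sketch only (hypothetical caller): locality-aware code can
 * use the routine above as an early-out gate, e.g.
 *
 *	if (!lgrp_optimizations())
 *		return;		-- one effective locality, nothing to do
 */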

/*
 * Setup root lgroup
 */
static void
lgrp_root_init(void)
{
	lgrp_handle_t	hand;
	int		i;
	lgrp_id_t	id;

	/*
	 * Create the "root" lgroup
	 */
	ASSERT(nlgrps == 0);
	id = nlgrps++;

	lgrp_root = &lroot;

	lgrp_root->lgrp_cpu = NULL;
	lgrp_root->lgrp_mnodes = 0;
	lgrp_root->lgrp_nmnodes = 0;
	hand = lgrp_plat_root_hand();
	lgrp_root->lgrp_plathand = hand;

	lgrp_root->lgrp_id = id;
	lgrp_root->lgrp_cpucnt = 0;
	lgrp_root->lgrp_childcnt = 0;
	klgrpset_clear(lgrp_root->lgrp_children);
	klgrpset_clear(lgrp_root->lgrp_leaves);
	lgrp_root->lgrp_parent = NULL;
	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);

	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(lgrp_root->lgrp_set[i]);

	lgrp_root->lgrp_kstat = NULL;

	lgrp_table[id] = lgrp_root;

	/*
	 * Setup initial lpl list for CPU0 and initial t0 home.
	 * The only lpl space we have so far is lpl_bootstrap. It is used for
	 * all topology operations until cp_default is initialized at which
	 * point t0.t_lpl will be updated.
	 */
	lpl_bootstrap = lpl_bootstrap_list;
	t0.t_lpl = lpl_bootstrap;
	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
	lpl_bootstrap_list[1].lpl_lgrpid = 1;

	/*
	 * Set up the bootstrap rset
	 * Since the bootstrap topology has just the root and a leaf,
	 * the rset contains just the leaf, and both lpls can use the same rset
	 */
	lpl_bootstrap_rset[0] = &lpl_bootstrap_list[1];
	lpl_bootstrap_list[0].lpl_rset_sz = 1;
	lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
	lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;

	lpl_bootstrap_list[1].lpl_rset_sz = 1;
	lpl_bootstrap_list[1].lpl_rset = lpl_bootstrap_rset;
	lpl_bootstrap_list[1].lpl_id2rset = lpl_bootstrap_id2rset;

	cp_default.cp_lgrploads = lpl_bootstrap;
}

/*
 * Initialize the lgroup framework and allow the platform to do the same
 *
 * This happens in stages during boot and is all funnelled through this routine
 * (see definition of lgrp_init_stages_t to see what happens at each stage and
 * when)
 */
void
lgrp_init(lgrp_init_stages_t stage)
{
	/*
	 * Initialize the platform
	 */
	lgrp_plat_init(stage);

	switch (stage) {
	case LGRP_INIT_STAGE1:
		/*
		 * Set max number of lgroups supported on this platform which
		 * must be less than the max number of lgroups supported by the
		 * common lgroup framework (eg. NLGRPS_MAX is max elements in
		 * lgrp_table[], etc.)
		 */
		nlgrpsmax = lgrp_plat_max_lgrps();
		ASSERT(nlgrpsmax <= NLGRPS_MAX);
		break;

	case LGRP_INIT_STAGE2:
		lgrp_setup();
		break;

	case LGRP_INIT_STAGE4:
		lgrp_main_init();
		break;

	case LGRP_INIT_STAGE5:
		lgrp_main_mp_init();
		break;

	default:
		break;
	}
}
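
/*
 * Illustrative sketch only: boot code is expected to invoke the stages in
 * order as the system comes up (exact call sites are platform-specific):
 *
 *	lgrp_init(LGRP_INIT_STAGE1);	-- establish nlgrpsmax
 *	lgrp_init(LGRP_INIT_STAGE2);	-- lgrp_setup(): root lgroup + cpu0
 *	...
 *	lgrp_init(LGRP_INIT_STAGE4);	-- lgrp_main_init(), after startup()
 *	lgrp_init(LGRP_INIT_STAGE5);	-- lgrp_main_mp_init(), after
 *					   start_other_cpus()
 */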

/*
 * Create the root and cpu0's lgroup, and set t0's home.
 */
static void
lgrp_setup(void)
{
	/*
	 * Setup the root lgroup
	 */
	lgrp_root_init();

	/*
	 * Add cpu0 to an lgroup
	 */
	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
}

/*
 * True when lgrp initialization has been completed.
 */
int	lgrp_initialized = 0;

/*
 * True when lgrp topology is constructed.
 */
int	lgrp_topo_initialized = 0;

/*
 * Init routine called after startup(), /etc/system has been processed,
 * and cpu0 has been added to an lgroup.
 */
static void
lgrp_main_init(void)
{
	cpu_t		*cp = CPU;
	lgrp_id_t	lgrpid;
	int		i;
	extern void	pg_cpu0_reinit();

	/*
	 * Enforce a valid lgrp_mem_default_policy
	 */
	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES) ||
	    (lgrp_mem_default_policy == LGRP_MEM_POLICY_NEXT_SEG))
		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;

	/*
	 * See if mpo should be disabled.
	 * This may happen in the case of null proc LPA on Starcat.
	 * The platform won't be able to detect null proc LPA until after
	 * cpu0 and memory have already been added to lgroups.
	 * When and if it is detected, the Starcat platform will return
	 * a different platform handle for cpu0 which is what we check for
	 * here. If mpo should be disabled, move cpu0 to its rightful place
	 * (the root), and destroy the remaining lgroups. This effectively
	 * provides a UMA lgroup topology.
	 */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;
	if (lgrp_table[lgrpid]->lgrp_plathand !=
	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
		lgrp_part_del_cpu(cp);
		lgrp_cpu_fini(cp, lgrpid);

		lgrp_cpu_init(cp);
		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);

		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);

		/*
		 * Notify the PG subsystem that the CPU's lgrp
		 * association has changed
		 */
		pg_cpu0_reinit();

		/*
		 * Destroy all lgroups except for root
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (LGRP_EXISTS(lgrp_table[i]) &&
			    lgrp_table[i] != lgrp_root)
				lgrp_destroy(lgrp_table[i]);
		}

		/*
		 * Fix up root to point at itself for leaves and resources
		 * and not have any children
		 */
		lgrp_root->lgrp_childcnt = 0;
		klgrpset_clear(lgrp_root->lgrp_children);
		klgrpset_clear(lgrp_root->lgrp_leaves);
		klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
	}

	/*
	 * Initialize kstats framework.
	 */
	lgrp_kstat_init();
	/*
	 * cpu0 is finally where it should be, so create its lgroup's kstats
	 */
	mutex_enter(&cpu_lock);
	lgrp_kstat_create(cp);
	mutex_exit(&cpu_lock);

	lgrp_initialized = 1;
}

/*
 * Finish lgrp initialization after all CPUS are brought on-line.
 * This routine is called after start_other_cpus().
 */
static void
lgrp_main_mp_init(void)
{
	klgrpset_t changed;

	/*
	 * Update lgroup topology (if necessary)
	 */
	klgrpset_clear(changed);
	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
	lgrp_topo_initialized = 1;
}

/*
 * Change latency of lgroup with specified lgroup platform handle (if one is
 * given) or change all lgroups with old latency to new latency
 */
void
lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime,
    u_longlong_t newtime)
{
	lgrp_t		*lgrp;
	int		i;

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp = lgrp_table[i];

		if (!LGRP_EXISTS(lgrp))
			continue;

		if ((hand == LGRP_NULL_HANDLE &&
		    lgrp->lgrp_latency == oldtime) ||
		    (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand))
			lgrp->lgrp_latency = (int)newtime;
	}
}

/*
 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
 */
void
lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
{
	klgrpset_t	changed;
	cpu_t		*cp;
	lgrp_id_t	id;
	int		rc;

	switch (event) {
	/*
	 * The following (re)configuration events are common code
	 * initiated. lgrp_plat_config() is called here to inform the
	 * platform of the reconfiguration event.
	 */
	case LGRP_CONFIG_CPU_ADD:
		cp = (cpu_t *)resource;

		/*
		 * Initialize the new CPU's lgrp related next/prev
		 * links, and give it a bootstrap lpl so that it can
		 * survive should it need to enter the dispatcher.
		 */
		cp->cpu_next_lpl = cp;
		cp->cpu_prev_lpl = cp;
		cp->cpu_next_lgrp = cp;
		cp->cpu_prev_lgrp = cp;
		cp->cpu_lpl = lpl_bootstrap;

		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPU_DEL:
		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPU_ONLINE:
		cp = (cpu_t *)resource;
		lgrp_cpu_init(cp);
		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPU_OFFLINE:
		cp = (cpu_t *)resource;
		id = cp->cpu_lpl->lpl_lgrpid;
		lgrp_part_del_cpu(cp);
		lgrp_cpu_fini(cp, id);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_CPUPART_ADD:
		cp = (cpu_t *)resource;
		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);

		break;
	case LGRP_CONFIG_CPUPART_DEL:
		cp = (cpu_t *)resource;
		lgrp_part_del_cpu((cpu_t *)resource);
		rc = lpl_topo_verify(cp->cpu_part);
		if (rc != LPL_TOPO_CORRECT) {
			panic("lpl_topo_verify failed: %d", rc);
		}
		lgrp_plat_config(event, resource);

		break;
	/*
	 * The following events are initiated by the memnode
	 * subsystem.
	 */
	case LGRP_CONFIG_MEM_ADD:
		lgrp_mem_init((int)resource, where, B_FALSE);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_MEM_DEL:
		lgrp_mem_fini((int)resource, where, B_FALSE);
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_MEM_RENAME: {
		lgrp_config_mem_rename_t *ren_arg =
		    (lgrp_config_mem_rename_t *)where;

		lgrp_mem_rename((int)resource,
		    ren_arg->lmem_rename_from,
		    ren_arg->lmem_rename_to);
		atomic_add_32(&lgrp_gen, 1);

		break;
	}
	case LGRP_CONFIG_GEN_UPDATE:
		atomic_add_32(&lgrp_gen, 1);

		break;
	case LGRP_CONFIG_FLATTEN:
		if (where == 0)
			lgrp_topo_levels = (int)resource;
		else
			(void) lgrp_topo_flatten(resource,
			    lgrp_table, lgrp_alloc_max, &changed);

		break;
	/*
	 * Update any lgroups with old latency to new latency
	 */
	case LGRP_CONFIG_LAT_CHANGE_ALL:
		lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource,
		    (u_longlong_t)where);

		break;
	/*
	 * Update lgroup with specified lgroup platform handle to have
	 * new latency
	 */
	case LGRP_CONFIG_LAT_CHANGE:
		lgrp_latency_change((lgrp_handle_t)resource, 0,
		    (u_longlong_t)where);

		break;
	case LGRP_CONFIG_NOP:

		break;
	default:
		break;
	}

}

/*
 * Called to add lgrp info into cpu structure from cpu_add_unit;
 * do not assume cpu is in cpu[] yet!
 *
 * CPUs are brought online with all other CPUs paused so we can't
 * allocate memory or we could deadlock the system, so we rely on
 * the platform to statically allocate as much space as we need
 * for the lgrp structs and stats.
 */
static void
lgrp_cpu_init(struct cpu *cp)
{
	klgrpset_t	changed;
	int		count;
	lgrp_handle_t	hand;
	int		first_cpu;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	struct cpu	*cptr;

	/*
	 * This is the first time through if the resource set
	 * for the root lgroup is empty. After cpu0 has been
	 * initially added to an lgroup, the root's CPU resource
	 * set can never be empty, since the system's last CPU
	 * cannot be offlined.
	 */
	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
		/*
		 * First time through.
		 */
		first_cpu = 1;
	} else {
		/*
		 * If cpu0 needs to move lgroups, we may come
		 * through here again, at which time cpu_lock won't
		 * be held, and lgrp_initialized will be false.
		 */
		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
		ASSERT(cp->cpu_part != NULL);
		first_cpu = 0;
	}

	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
	my_lgrp = lgrp_hand_to_lgrp(hand);

	if (my_lgrp == NULL) {
		/*
		 * Create new lgrp and add it to lgroup topology
		 */
		my_lgrp = lgrp_create();
		my_lgrp->lgrp_plathand = hand;
		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);

		count = 0;
		klgrpset_clear(changed);
		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);
		/*
		 * May have added new intermediate lgroups, so need to add
		 * resources other than CPUs which are added below
		 */
		(void) lgrp_mnode_update(changed, NULL);
	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
	    > 0) {
		/*
		 * Leaf lgroup was created, but latency wasn't available
		 * then.  So, set latency for it and fill in rest of lgroup
		 * topology now that we know how far it is from other leaf
		 * lgroups.
		 */
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_clear(changed);
		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
		    lgrpid))
			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);

		/*
		 * May have added new intermediate lgroups, so need to add
		 * resources other than CPUs which are added below
		 */
		(void) lgrp_mnode_update(changed, NULL);
	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
	    my_lgrp->lgrp_id)) {
		int	i;

		/*
		 * Update existing lgroup and lgroups containing it with CPU
		 * resource
		 */
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
				continue;

			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		}
	}

	lgrpid = my_lgrp->lgrp_id;
	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];

	/*
	 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
	 * end up in lpl for lgroup 0 whether it is supposed to be in there or
	 * not since none of the lgroup IDs in the lpls have been set yet.
	 */
	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
		cp->cpu_lpl->lpl_lgrpid = lgrpid;

	/*
	 * link the CPU into the lgrp's CPU list
	 */
	if (my_lgrp->lgrp_cpucnt == 0) {
		my_lgrp->lgrp_cpu = cp;
		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
	} else {
		cptr = my_lgrp->lgrp_cpu;
		cp->cpu_next_lgrp = cptr;
		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
		cptr->cpu_prev_lgrp = cp;
	}
	my_lgrp->lgrp_cpucnt++;
}

lgrp_t *
lgrp_create(void)
{
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	int		i;

	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));

	/*
	 * Find an open slot in the lgroup table and recycle unused lgroup
	 * left there if any
	 */
	my_lgrp = NULL;
	if (lgrp_alloc_hint == -1)
		/*
		 * Allocate from end when hint not set yet because no lgroups
		 * have been deleted yet
		 */
		lgrpid = nlgrps++;
	else {
		/*
		 * Start looking for next open slot from hint and leave hint
		 * at slot allocated
		 */
		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
			my_lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(my_lgrp)) {
				lgrpid = i;
				nlgrps++;
				break;
			}
		}
		lgrp_alloc_hint = lgrpid;
	}

	/*
	 * Keep track of max lgroup ID allocated so far to cut down on searches
	 */
	if (lgrpid > lgrp_alloc_max)
		lgrp_alloc_max = lgrpid;

	/*
	 * Need to allocate new lgroup if next open slot didn't have one
	 * for recycling
	 */
	if (my_lgrp == NULL)
		my_lgrp = lgrp_plat_alloc(lgrpid);

	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
		panic("Too many lgrps for platform (%d)", nlgrps);

	my_lgrp->lgrp_id = lgrpid;
	my_lgrp->lgrp_latency = 0;
	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
	my_lgrp->lgrp_parent = NULL;
	my_lgrp->lgrp_childcnt = 0;
	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
	my_lgrp->lgrp_nmnodes = 0;
	klgrpset_clear(my_lgrp->lgrp_children);
	klgrpset_clear(my_lgrp->lgrp_leaves);
	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(my_lgrp->lgrp_set[i]);

	my_lgrp->lgrp_cpu = NULL;
	my_lgrp->lgrp_cpucnt = 0;

	if (my_lgrp->lgrp_kstat != NULL)
		lgrp_kstat_reset(lgrpid);

	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;

	return (my_lgrp);
}

void
lgrp_destroy(lgrp_t *lgrp)
{
	int		i;

	/*
	 * Unless this lgroup is being destroyed on behalf of
	 * the boot CPU, cpu_lock must be held
	 */
	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));

	if (nlgrps == 1)
		cmn_err(CE_PANIC, "Can't destroy only lgroup!");

	if (!LGRP_EXISTS(lgrp))
		return;

	/*
	 * Set hint to lgroup being deleted and try to keep lower numbered
	 * hints to facilitate finding empty slots
	 */
	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
		lgrp_alloc_hint = lgrp->lgrp_id;

	/*
	 * Mark this lgroup to be recycled by setting its lgroup ID to
	 * LGRP_NONE and clear relevant fields
	 */
	lgrp->lgrp_id = LGRP_NONE;
	lgrp->lgrp_latency = 0;
	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
	lgrp->lgrp_parent = NULL;
	lgrp->lgrp_childcnt = 0;

	klgrpset_clear(lgrp->lgrp_children);
	klgrpset_clear(lgrp->lgrp_leaves);
	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(lgrp->lgrp_set[i]);

	lgrp->lgrp_mnodes = (mnodeset_t)0;
	lgrp->lgrp_nmnodes = 0;

	lgrp->lgrp_cpu = NULL;
	lgrp->lgrp_cpucnt = 0;

	nlgrps--;
}

/*
 * Initialize kstat data. Called from lgrp initialization code.
 */
static void
lgrp_kstat_init(void)
{
	lgrp_stat_t	stat;

	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);

	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
		kstat_named_init(&lgrp_kstat_data[stat],
		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
}

/*
 * initialize an lgrp's kstats if needed
 * called with cpu_lock held but not with cpus paused.
 * we don't tear these down now because we don't know about
 * memory leaving the lgrp yet...
 */

void
lgrp_kstat_create(cpu_t *cp)
{
	kstat_t		*lgrp_kstat;
	lgrp_id_t	lgrpid;
	lgrp_t		*my_lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrpid = cp->cpu_lpl->lpl_lgrpid;
	my_lgrp = lgrp_table[lgrpid];

	if (my_lgrp->lgrp_kstat != NULL)
		return; /* already initialized */

	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);

	if (lgrp_kstat != NULL) {
		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
		lgrp_kstat->ks_private = my_lgrp;
		lgrp_kstat->ks_data = &lgrp_kstat_data;
		lgrp_kstat->ks_update = lgrp_kstat_extract;
		my_lgrp->lgrp_kstat = lgrp_kstat;
		kstat_install(lgrp_kstat);
	}
}

/*
 * this will do something when we manage to remove now unused lgrps
 */

/* ARGSUSED */
void
lgrp_kstat_destroy(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
}

/*
 * Called when a CPU is off-lined.
 */
static void
lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
{
	lgrp_t *my_lgrp;
	struct cpu *prev;
	struct cpu *next;

	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

	prev = cp->cpu_prev_lgrp;
	next = cp->cpu_next_lgrp;

	prev->cpu_next_lgrp = next;
	next->cpu_prev_lgrp = prev;

	/*
	 * just because I'm paranoid doesn't mean...
	 */

	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;

	my_lgrp = lgrp_table[lgrpid];
	my_lgrp->lgrp_cpucnt--;

	/*
	 * Removing last CPU in lgroup, so update lgroup topology
	 */
	if (my_lgrp->lgrp_cpucnt == 0) {
		klgrpset_t	changed;
		int		count;
		int		i;

		my_lgrp->lgrp_cpu = NULL;

		/*
		 * Remove this lgroup from its lgroup CPU resources and remove
		 * lgroup from lgroup topology if it doesn't have any more
		 * resources in it now
		 */
		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
			count = 0;
			klgrpset_clear(changed);
			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
			    lgrp_alloc_max + 1, &changed);
			return;
		}

		/*
		 * This lgroup isn't empty, so just remove it from CPU
		 * resources of any lgroups that contain it as such
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
			    lgrpid))
				continue;

			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		}
		return;
	}

	if (my_lgrp->lgrp_cpu == cp)
		my_lgrp->lgrp_cpu = next;

}

/*
 * Update memory nodes in target lgroups and return ones that get changed
 */
int
lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
{
	int	count;
	int	i;
	int	j;
	lgrp_t	*lgrp;
	lgrp_t	*lgrp_rsrc;

	count = 0;
	if (changed)
		klgrpset_clear(*changed);

	if (klgrpset_isempty(target))
		return (0);

	/*
	 * Find each lgroup in target lgroups
	 */
	for (i = 0; i <= lgrp_alloc_max; i++) {
		/*
		 * Skip any lgroups that don't exist or aren't in target group
		 */
		lgrp = lgrp_table[i];
		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
			continue;
		}

		/*
		 * Initialize memnodes for intermediate lgroups to 0
		 * and update them from scratch since they may have completely
		 * changed
		 */
		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
			lgrp->lgrp_mnodes = (mnodeset_t)0;
			lgrp->lgrp_nmnodes = 0;
		}

		/*
11400Sstevel@tonic-gate 		 * Update memory nodes of target lgroup with memory nodes
11410Sstevel@tonic-gate 		 * from each lgroup in its lgroup memory resource set
11420Sstevel@tonic-gate 		 */
11430Sstevel@tonic-gate 		for (j = 0; j <= lgrp_alloc_max; j++) {
11440Sstevel@tonic-gate 			int	k;
11450Sstevel@tonic-gate 
11460Sstevel@tonic-gate 			/*
11470Sstevel@tonic-gate 			 * Skip any lgroups that don't exist or aren't in
11480Sstevel@tonic-gate 			 * memory resources of target lgroup
11490Sstevel@tonic-gate 			 */
11500Sstevel@tonic-gate 			lgrp_rsrc = lgrp_table[j];
11510Sstevel@tonic-gate 			if (!LGRP_EXISTS(lgrp_rsrc) ||
11520Sstevel@tonic-gate 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
11530Sstevel@tonic-gate 			    j))
11540Sstevel@tonic-gate 				continue;
11550Sstevel@tonic-gate 
11560Sstevel@tonic-gate 			/*
11570Sstevel@tonic-gate 			 * Update target lgroup's memnodes to include memnodes
11580Sstevel@tonic-gate 			 * of this lgroup
11590Sstevel@tonic-gate 			 */
11600Sstevel@tonic-gate 			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
11610Sstevel@tonic-gate 				mnodeset_t	mnode_mask;
11620Sstevel@tonic-gate 
11630Sstevel@tonic-gate 				mnode_mask = (mnodeset_t)1 << k;
11640Sstevel@tonic-gate 				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
11650Sstevel@tonic-gate 				    !(lgrp->lgrp_mnodes & mnode_mask)) {
11660Sstevel@tonic-gate 					lgrp->lgrp_mnodes |= mnode_mask;
11670Sstevel@tonic-gate 					lgrp->lgrp_nmnodes++;
11680Sstevel@tonic-gate 				}
11690Sstevel@tonic-gate 			}
11700Sstevel@tonic-gate 			count++;
11710Sstevel@tonic-gate 			if (changed)
11720Sstevel@tonic-gate 				klgrpset_add(*changed, lgrp->lgrp_id);
11730Sstevel@tonic-gate 		}
11740Sstevel@tonic-gate 	}
11750Sstevel@tonic-gate 
11760Sstevel@tonic-gate 	return (count);
11770Sstevel@tonic-gate }
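
/*
 * The inner loop above merges one lgroup's memory-node bitmask into
 * another, bit by bit, keeping lgrp_nmnodes in sync with lgrp_mnodes.
 * The following is a minimal, standalone sketch of that merge step;
 * mnodeset64_t and merge_mnodes() are hypothetical user-level stand-ins
 * for mnodeset_t and the k-loop in lgrp_mnode_update(), not part of
 * this file.
 */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t mnodeset64_t;

/* Merge src into *dst; return how many new memory nodes were added. */
static int
merge_mnodes(mnodeset64_t *dst, mnodeset64_t src, int *nmnodes)
{
	int	k;
	int	added = 0;

	for (k = 0; k < 64; k++) {
		mnodeset64_t	mnode_mask = (mnodeset64_t)1 << k;

		if ((src & mnode_mask) && !(*dst & mnode_mask)) {
			*dst |= mnode_mask;
			(*nmnodes)++;
			added++;
		}
	}
	return (added);
}

int
main(void)
{
	mnodeset64_t	target = 0x5;	/* memory nodes 0 and 2 */
	int		nmnodes = 2;

	(void) merge_mnodes(&target, 0x6, &nmnodes);	/* adds node 1 */
	(void) printf("mask=0x%llx nmnodes=%d\n",
	    (unsigned long long)target, nmnodes);
	return (0);
}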
11780Sstevel@tonic-gate 
11790Sstevel@tonic-gate /*
11800Sstevel@tonic-gate  * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
11810Sstevel@tonic-gate  * is moved from one board to another. The "from" and "to" arguments specify the
11820Sstevel@tonic-gate  * source and the destination of the move.
11830Sstevel@tonic-gate  *
11840Sstevel@tonic-gate  * See plat_lgrp_config() for a detailed description of the copy-rename
11850Sstevel@tonic-gate  * semantics.
11860Sstevel@tonic-gate  *
11870Sstevel@tonic-gate  * lgrp_mem_rename() is called by the platform copy-rename code to update
11880Sstevel@tonic-gate  * the lgroup topology which is changing as memory moves from one lgroup to
11890Sstevel@tonic-gate  * another. It removes the mnode from the source lgroup and re-inserts it in the
11900Sstevel@tonic-gate  * target lgroup.
11910Sstevel@tonic-gate  *
11920Sstevel@tonic-gate  * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
11930Sstevel@tonic-gate  * lgrp_mem_fini() indicating that the insertion and deletion are part of a DR
11940Sstevel@tonic-gate  * copy-rename operation.
11950Sstevel@tonic-gate  *
11960Sstevel@tonic-gate  * There is one case which requires special handling. If the system contains
11970Sstevel@tonic-gate  * only two boards (mnodes), lgrp_mem_fini() removes the only mnode from the
11980Sstevel@tonic-gate  * lgroup hierarchy. This mnode is soon re-inserted into the hierarchy by
11990Sstevel@tonic-gate  * lgrp_mem_init(), but there is a window when the system has no memory in the
12000Sstevel@tonic-gate  * lgroup hierarchy. If another thread tries to allocate memory during this
12010Sstevel@tonic-gate  * window, the allocation will fail, even though the system has physical memory.
12020Sstevel@tonic-gate  * This may cause a system panic or a deadlock (some sleeping memory allocations
12030Sstevel@tonic-gate  * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
12040Sstevel@tonic-gate  * the mnode back).
12050Sstevel@tonic-gate  *
12060Sstevel@tonic-gate  * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
12070Sstevel@tonic-gate  * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
12080Sstevel@tonic-gate  * lgrp_mem_fini() does not remove the last mnode from lgrp_root->lgrp_mnodes,
12090Sstevel@tonic-gate  * but it updates the rest of the lgroup topology as if the mnode was actually
12100Sstevel@tonic-gate  * removed. The lgrp_mem_init() function recognizes that the mnode being
12110Sstevel@tonic-gate  * inserted represents such a special case and updates the topology
12120Sstevel@tonic-gate  * appropriately.
12130Sstevel@tonic-gate  */
12140Sstevel@tonic-gate void
12150Sstevel@tonic-gate lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
12160Sstevel@tonic-gate {
12170Sstevel@tonic-gate 	/*
12180Sstevel@tonic-gate 	 * Remove the memory from the source node and add it to the destination
12190Sstevel@tonic-gate 	 * node.
12200Sstevel@tonic-gate 	 */
12210Sstevel@tonic-gate 	lgrp_mem_fini(mnode, from, B_TRUE);
12220Sstevel@tonic-gate 	lgrp_mem_init(mnode, to, B_TRUE);
12230Sstevel@tonic-gate }
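
/*
 * A toy, user-level model of the two-mnode copy-rename hazard described
 * above: if lgrp_mem_fini() removed the only mnode before lgrp_mem_init()
 * re-inserted it, an allocator scanning the root's mask in that window
 * would see a memoryless system.  toy_fini()/toy_init() below are
 * hypothetical stand-ins for lgrp_mem_fini()/lgrp_mem_init(), shown only
 * to illustrate the keep-the-last-mnode rule; not part of this file.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t root_mnodes = 0x1;	/* only mnode 0 is present */

static void
toy_fini(int mnode, int is_copy_rename)
{
	uint64_t mask = (uint64_t)1 << mnode;

	/* Never remove the last mnode from the root during copy-rename */
	if (is_copy_rename && root_mnodes == mask)
		return;
	root_mnodes &= ~mask;
}

static void
toy_init(int mnode)
{
	root_mnodes |= (uint64_t)1 << mnode;
}

int
main(void)
{
	toy_fini(0, 1);		/* copy-rename: root keeps mnode 0 */
	(void) printf("window mask=0x%llx (never empty)\n",
	    (unsigned long long)root_mnodes);
	toy_init(0);		/* re-insertion completes the rename */
	return (0);
}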
12240Sstevel@tonic-gate 
12250Sstevel@tonic-gate /*
12260Sstevel@tonic-gate  * Called to indicate that the lgrp with platform handle "hand" now
12270Sstevel@tonic-gate  * contains the memory identified by "mnode".
12280Sstevel@tonic-gate  *
12290Sstevel@tonic-gate  * LOCKING for this routine is a bit tricky. Usually it is called without
12300Sstevel@tonic-gate  * cpu_lock and must grab cpu_lock here to prevent racing with other
12310Sstevel@tonic-gate  * callers. During DR of the board containing the caged memory it may be called
12320Sstevel@tonic-gate  * with cpu_lock already held and CPUs paused.
12330Sstevel@tonic-gate  *
12340Sstevel@tonic-gate  * If the insertion is part of the DR copy-rename and the inserted mnode (and
12350Sstevel@tonic-gate  * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
12360Sstevel@tonic-gate  * dealing with the special case of DR copy-rename described in
12370Sstevel@tonic-gate  * lgrp_mem_rename().
12380Sstevel@tonic-gate  */
12390Sstevel@tonic-gate void
12400Sstevel@tonic-gate lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
12410Sstevel@tonic-gate {
12420Sstevel@tonic-gate 	klgrpset_t	changed;
12430Sstevel@tonic-gate 	int		count;
12440Sstevel@tonic-gate 	int		i;
12450Sstevel@tonic-gate 	lgrp_t		*my_lgrp;
12460Sstevel@tonic-gate 	lgrp_id_t	lgrpid;
12470Sstevel@tonic-gate 	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
12480Sstevel@tonic-gate 	boolean_t	drop_lock = B_FALSE;
12490Sstevel@tonic-gate 	boolean_t	need_synch = B_FALSE;
12500Sstevel@tonic-gate 
12510Sstevel@tonic-gate 	/*
12520Sstevel@tonic-gate 	 * Grab CPU lock (if we haven't already)
12530Sstevel@tonic-gate 	 */
12540Sstevel@tonic-gate 	if (!MUTEX_HELD(&cpu_lock)) {
12550Sstevel@tonic-gate 		mutex_enter(&cpu_lock);
12560Sstevel@tonic-gate 		drop_lock = B_TRUE;
12570Sstevel@tonic-gate 	}
12580Sstevel@tonic-gate 
12590Sstevel@tonic-gate 	/*
12600Sstevel@tonic-gate 	 * This routine may be called from a context where we already
12610Sstevel@tonic-gate 	 * hold cpu_lock, and have already paused cpus.
12620Sstevel@tonic-gate 	 */
12630Sstevel@tonic-gate 	if (!cpus_paused())
12640Sstevel@tonic-gate 		need_synch = B_TRUE;
12650Sstevel@tonic-gate 
12660Sstevel@tonic-gate 	/*
12670Sstevel@tonic-gate 	 * Check if this mnode is already configured and return immediately if
12680Sstevel@tonic-gate 	 * it is.
12690Sstevel@tonic-gate 	 *
12700Sstevel@tonic-gate 	 * NOTE: in the special case of copy-rename of the only remaining mnode,
12710Sstevel@tonic-gate 	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
12720Sstevel@tonic-gate 	 * recognize this case and continue as usual, but skip the update to
12730Sstevel@tonic-gate 	 * lgrp_mnodes and lgrp_nmnodes. This resolves the inconsistency in the
12740Sstevel@tonic-gate 	 * topology temporarily introduced by lgrp_mem_fini().
12750Sstevel@tonic-gate 	 */
12760Sstevel@tonic-gate 	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
12770Sstevel@tonic-gate 	    lgrp_root->lgrp_mnodes & mnodes_mask) {
12780Sstevel@tonic-gate 		if (drop_lock)
12790Sstevel@tonic-gate 			mutex_exit(&cpu_lock);
12800Sstevel@tonic-gate 		return;
12810Sstevel@tonic-gate 	}
12820Sstevel@tonic-gate 
12830Sstevel@tonic-gate 	/*
12840Sstevel@tonic-gate 	 * Update lgroup topology with new memory resources, keeping track of
12850Sstevel@tonic-gate 	 * which lgroups change
12860Sstevel@tonic-gate 	 */
12870Sstevel@tonic-gate 	count = 0;
12880Sstevel@tonic-gate 	klgrpset_clear(changed);
12890Sstevel@tonic-gate 	my_lgrp = lgrp_hand_to_lgrp(hand);
12900Sstevel@tonic-gate 	if (my_lgrp == NULL) {
12910Sstevel@tonic-gate 		/* new lgrp */
12920Sstevel@tonic-gate 		my_lgrp = lgrp_create();
12930Sstevel@tonic-gate 		lgrpid = my_lgrp->lgrp_id;
12940Sstevel@tonic-gate 		my_lgrp->lgrp_plathand = hand;
12950Sstevel@tonic-gate 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
12960Sstevel@tonic-gate 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
12970Sstevel@tonic-gate 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
12980Sstevel@tonic-gate 
12990Sstevel@tonic-gate 		if (need_synch)
13000Sstevel@tonic-gate 			pause_cpus(NULL);
13010Sstevel@tonic-gate 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
13020Sstevel@tonic-gate 		    &changed);
13030Sstevel@tonic-gate 		if (need_synch)
13040Sstevel@tonic-gate 			start_cpus();
13050Sstevel@tonic-gate 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
13060Sstevel@tonic-gate 	    > 0) {
13070Sstevel@tonic-gate 		/*
13080Sstevel@tonic-gate 		 * Leaf lgroup was created, but latency wasn't available
13090Sstevel@tonic-gate 		 * then.  So, set latency for it and fill in rest of lgroup
13100Sstevel@tonic-gate 		 * topology  now that we know how far it is from other leaf
13110Sstevel@tonic-gate 		 * lgroups.
13120Sstevel@tonic-gate 		 */
13130Sstevel@tonic-gate 		klgrpset_clear(changed);
13140Sstevel@tonic-gate 		lgrpid = my_lgrp->lgrp_id;
13150Sstevel@tonic-gate 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
13160Sstevel@tonic-gate 		    lgrpid))
13170Sstevel@tonic-gate 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
13180Sstevel@tonic-gate 		if (need_synch)
13190Sstevel@tonic-gate 			pause_cpus(NULL);
13200Sstevel@tonic-gate 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
13210Sstevel@tonic-gate 		    &changed);
13220Sstevel@tonic-gate 		if (need_synch)
13230Sstevel@tonic-gate 			start_cpus();
13240Sstevel@tonic-gate 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
13250Sstevel@tonic-gate 	    my_lgrp->lgrp_id)) {
132650Sjjc 		/*
132750Sjjc 		 * Add new lgroup memory resource to existing lgroup
132850Sjjc 		 */
13290Sstevel@tonic-gate 		lgrpid = my_lgrp->lgrp_id;
13300Sstevel@tonic-gate 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
13310Sstevel@tonic-gate 		klgrpset_add(changed, lgrpid);
13320Sstevel@tonic-gate 		count++;
13330Sstevel@tonic-gate 		for (i = 0; i <= lgrp_alloc_max; i++) {
13340Sstevel@tonic-gate 			lgrp_t		*lgrp;
13350Sstevel@tonic-gate 
13360Sstevel@tonic-gate 			lgrp = lgrp_table[i];
13370Sstevel@tonic-gate 			if (!LGRP_EXISTS(lgrp) ||
13380Sstevel@tonic-gate 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
13390Sstevel@tonic-gate 				continue;
13400Sstevel@tonic-gate 
13410Sstevel@tonic-gate 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
13420Sstevel@tonic-gate 			klgrpset_add(changed, lgrp->lgrp_id);
13430Sstevel@tonic-gate 			count++;
13440Sstevel@tonic-gate 		}
13450Sstevel@tonic-gate 	}
13460Sstevel@tonic-gate 
13470Sstevel@tonic-gate 	/*
13480Sstevel@tonic-gate 	 * Add memory node to lgroup and remove lgroup from ones that need
13490Sstevel@tonic-gate 	 * to be updated
13500Sstevel@tonic-gate 	 */
13510Sstevel@tonic-gate 	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
13520Sstevel@tonic-gate 		my_lgrp->lgrp_mnodes |= mnodes_mask;
13530Sstevel@tonic-gate 		my_lgrp->lgrp_nmnodes++;
13540Sstevel@tonic-gate 	}
13550Sstevel@tonic-gate 	klgrpset_del(changed, lgrpid);
13560Sstevel@tonic-gate 
13570Sstevel@tonic-gate 	/*
13580Sstevel@tonic-gate 	 * Update memory node information for all lgroups that changed and
13590Sstevel@tonic-gate 	 * contain new memory node as a resource
13600Sstevel@tonic-gate 	 */
13610Sstevel@tonic-gate 	if (count)
13620Sstevel@tonic-gate 		(void) lgrp_mnode_update(changed, NULL);
13630Sstevel@tonic-gate 
13640Sstevel@tonic-gate 	if (drop_lock)
13650Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
13660Sstevel@tonic-gate }
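
/*
 * lgrp_mem_init() and lgrp_mem_fini() both use the conditional-lock
 * pattern: take cpu_lock only if the caller does not already hold it,
 * and drop it only if it was taken here.  Below is a standalone,
 * pthread-based sketch of that pattern; cond_lock_demo() and the
 * caller_holds flag are hypothetical user-level analogs of
 * MUTEX_HELD(&cpu_lock) and drop_lock, for illustration only.
 */
#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

static void
cond_lock_demo(int caller_holds)
{
	int drop_lock = 0;

	if (!caller_holds) {
		(void) pthread_mutex_lock(&demo_lock);
		drop_lock = 1;
	}

	/* ... critical section: update shared topology state ... */
	(void) printf("in critical section (drop_lock=%d)\n", drop_lock);

	if (drop_lock)
		(void) pthread_mutex_unlock(&demo_lock);
}

int
main(void)
{
	cond_lock_demo(0);	/* we take and drop the lock ourselves */

	(void) pthread_mutex_lock(&demo_lock);
	cond_lock_demo(1);	/* caller already holds it; don't re-lock */
	(void) pthread_mutex_unlock(&demo_lock);
	return (0);
}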
13670Sstevel@tonic-gate 
13680Sstevel@tonic-gate /*
13690Sstevel@tonic-gate  * Called to indicate that the lgroup associated with the platform
13700Sstevel@tonic-gate  * handle "hand" no longer contains the given memory node
13710Sstevel@tonic-gate  *
13720Sstevel@tonic-gate  * LOCKING for this routine is a bit tricky. Usually it is called without
13730Sstevel@tonic-gate  * cpu_lock and must grab cpu_lock here to prevent racing with other
13740Sstevel@tonic-gate  * callers. During DR of the board containing the caged memory it may be called
13750Sstevel@tonic-gate  * with cpu_lock already held and CPUs paused.
13760Sstevel@tonic-gate  *
13770Sstevel@tonic-gate  * If the deletion is part of the DR copy-rename and the deleted mnode is the
13780Sstevel@tonic-gate  * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
13790Sstevel@tonic-gate  * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
13800Sstevel@tonic-gate  * the same mnode back into the topology. See lgrp_mem_rename() and
13810Sstevel@tonic-gate  * lgrp_mem_init() for additional details.
13820Sstevel@tonic-gate  */
13830Sstevel@tonic-gate void
13840Sstevel@tonic-gate lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
13850Sstevel@tonic-gate {
13860Sstevel@tonic-gate 	klgrpset_t	changed;
13870Sstevel@tonic-gate 	int		count;
13880Sstevel@tonic-gate 	int		i;
13890Sstevel@tonic-gate 	lgrp_t		*my_lgrp;
13900Sstevel@tonic-gate 	lgrp_id_t	lgrpid;
13910Sstevel@tonic-gate 	mnodeset_t	mnodes_mask;
13920Sstevel@tonic-gate 	boolean_t	drop_lock = B_FALSE;
13930Sstevel@tonic-gate 	boolean_t	need_synch = B_FALSE;
13940Sstevel@tonic-gate 
13950Sstevel@tonic-gate 	/*
13960Sstevel@tonic-gate 	 * Grab CPU lock (if we haven't already)
13970Sstevel@tonic-gate 	 */
13980Sstevel@tonic-gate 	if (!MUTEX_HELD(&cpu_lock)) {
13990Sstevel@tonic-gate 		mutex_enter(&cpu_lock);
14000Sstevel@tonic-gate 		drop_lock = B_TRUE;
14010Sstevel@tonic-gate 	}
14020Sstevel@tonic-gate 
14030Sstevel@tonic-gate 	/*
14040Sstevel@tonic-gate 	 * This routine may be called from a context where we already
14050Sstevel@tonic-gate 	 * hold cpu_lock and have already paused cpus.
14060Sstevel@tonic-gate 	 */
14070Sstevel@tonic-gate 	if (!cpus_paused())
14080Sstevel@tonic-gate 		need_synch = B_TRUE;
14090Sstevel@tonic-gate 
14100Sstevel@tonic-gate 	my_lgrp = lgrp_hand_to_lgrp(hand);
14110Sstevel@tonic-gate 
14120Sstevel@tonic-gate 	/*
14130Sstevel@tonic-gate 	 * The lgrp *must* be pre-existing
14140Sstevel@tonic-gate 	 */
14150Sstevel@tonic-gate 	ASSERT(my_lgrp != NULL);
14160Sstevel@tonic-gate 
14170Sstevel@tonic-gate 	/*
14180Sstevel@tonic-gate 	 * Delete memory node from lgroups which contain it
14190Sstevel@tonic-gate 	 */
14200Sstevel@tonic-gate 	mnodes_mask = ((mnodeset_t)1 << mnode);
14210Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
14220Sstevel@tonic-gate 		lgrp_t *lgrp = lgrp_table[i];
14230Sstevel@tonic-gate 		/*
14240Sstevel@tonic-gate 		 * Skip any non-existent lgroups and any lgroups that don't
14250Sstevel@tonic-gate 		 * contain leaf lgroup of memory as a memory resource
14250Sstevel@tonic-gate 		 * contain the memory node being deleted
14270Sstevel@tonic-gate 		if (!LGRP_EXISTS(lgrp) ||
14280Sstevel@tonic-gate 		    !(lgrp->lgrp_mnodes & mnodes_mask))
14290Sstevel@tonic-gate 			continue;
14300Sstevel@tonic-gate 
14310Sstevel@tonic-gate 		/*
14320Sstevel@tonic-gate 		 * Avoid removing the last mnode from the root in the DR
14330Sstevel@tonic-gate 		 * copy-rename case. See lgrp_mem_rename() for details.
14340Sstevel@tonic-gate 		 */
14350Sstevel@tonic-gate 		if (is_copy_rename &&
14360Sstevel@tonic-gate 		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
14370Sstevel@tonic-gate 			continue;
14380Sstevel@tonic-gate 
14390Sstevel@tonic-gate 		/*
14400Sstevel@tonic-gate 		 * Remove memory node from lgroup.
14410Sstevel@tonic-gate 		 */
14420Sstevel@tonic-gate 		lgrp->lgrp_mnodes &= ~mnodes_mask;
14430Sstevel@tonic-gate 		lgrp->lgrp_nmnodes--;
14440Sstevel@tonic-gate 		ASSERT(lgrp->lgrp_nmnodes >= 0);
14450Sstevel@tonic-gate 	}
14460Sstevel@tonic-gate 	ASSERT(lgrp_root->lgrp_nmnodes > 0);
14470Sstevel@tonic-gate 
14480Sstevel@tonic-gate 	/*
14490Sstevel@tonic-gate 	 * Don't need to update lgroup topology if this lgroup still has memory.
14500Sstevel@tonic-gate 	 *
14510Sstevel@tonic-gate 	 * In the special case of DR copy-rename with the only mnode being
14520Sstevel@tonic-gate 	 * removed, the lgrp_mnodes for the root is always non-zero, but we
14530Sstevel@tonic-gate 	 * still need to update the lgroup topology.
14540Sstevel@tonic-gate 	 */
14550Sstevel@tonic-gate 	if ((my_lgrp->lgrp_nmnodes > 0) &&
14568408SEric.Saxe@Sun.COM 	    !(is_copy_rename && (my_lgrp == lgrp_root) &&
14578408SEric.Saxe@Sun.COM 	    (my_lgrp->lgrp_mnodes == mnodes_mask))) {
14580Sstevel@tonic-gate 		if (drop_lock)
14590Sstevel@tonic-gate 			mutex_exit(&cpu_lock);
14600Sstevel@tonic-gate 		return;
14610Sstevel@tonic-gate 	}
14620Sstevel@tonic-gate 
14630Sstevel@tonic-gate 	/*
14640Sstevel@tonic-gate 	 * This lgroup does not contain any memory now
14650Sstevel@tonic-gate 	 */
14660Sstevel@tonic-gate 	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);
14670Sstevel@tonic-gate 
14680Sstevel@tonic-gate 	/*
14690Sstevel@tonic-gate 	 * Remove this lgroup from lgroup topology if it does not contain any
14700Sstevel@tonic-gate 	 * resources now
14710Sstevel@tonic-gate 	 */
14720Sstevel@tonic-gate 	lgrpid = my_lgrp->lgrp_id;
14730Sstevel@tonic-gate 	count = 0;
14740Sstevel@tonic-gate 	klgrpset_clear(changed);
14750Sstevel@tonic-gate 	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
14760Sstevel@tonic-gate 		/*
14770Sstevel@tonic-gate 		 * Delete lgroup when no more resources
14780Sstevel@tonic-gate 		 */
14790Sstevel@tonic-gate 		if (need_synch)
14800Sstevel@tonic-gate 			pause_cpus(NULL);
14810Sstevel@tonic-gate 		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
14820Sstevel@tonic-gate 		    lgrp_alloc_max + 1, &changed);
14830Sstevel@tonic-gate 		ASSERT(count > 0);
14840Sstevel@tonic-gate 		if (need_synch)
14850Sstevel@tonic-gate 			start_cpus();
14860Sstevel@tonic-gate 	} else {
14870Sstevel@tonic-gate 		/*
14880Sstevel@tonic-gate 		 * Remove lgroup from memory resources of any lgroups that
14890Sstevel@tonic-gate 		 * contain it as such
14900Sstevel@tonic-gate 		 */
14910Sstevel@tonic-gate 		for (i = 0; i <= lgrp_alloc_max; i++) {
14920Sstevel@tonic-gate 			lgrp_t		*lgrp;
14930Sstevel@tonic-gate 
14940Sstevel@tonic-gate 			lgrp = lgrp_table[i];
14950Sstevel@tonic-gate 			if (!LGRP_EXISTS(lgrp) ||
14960Sstevel@tonic-gate 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
14970Sstevel@tonic-gate 			    lgrpid))
14980Sstevel@tonic-gate 				continue;
14990Sstevel@tonic-gate 
15000Sstevel@tonic-gate 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
15010Sstevel@tonic-gate 		}
15020Sstevel@tonic-gate 	}
15030Sstevel@tonic-gate 	if (drop_lock)
15040Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
15050Sstevel@tonic-gate }
15060Sstevel@tonic-gate 
15070Sstevel@tonic-gate /*
15080Sstevel@tonic-gate  * Return lgroup with given platform handle
15090Sstevel@tonic-gate  */
15100Sstevel@tonic-gate lgrp_t *
15110Sstevel@tonic-gate lgrp_hand_to_lgrp(lgrp_handle_t hand)
15120Sstevel@tonic-gate {
15130Sstevel@tonic-gate 	int	i;
15140Sstevel@tonic-gate 	lgrp_t	*lgrp;
15150Sstevel@tonic-gate 
15160Sstevel@tonic-gate 	if (hand == LGRP_NULL_HANDLE)
15170Sstevel@tonic-gate 		return (NULL);
15180Sstevel@tonic-gate 
15190Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
15200Sstevel@tonic-gate 		lgrp = lgrp_table[i];
15210Sstevel@tonic-gate 		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
15220Sstevel@tonic-gate 			return (lgrp);
15230Sstevel@tonic-gate 	}
15240Sstevel@tonic-gate 	return (NULL);
15250Sstevel@tonic-gate }
15260Sstevel@tonic-gate 
15270Sstevel@tonic-gate /*
15280Sstevel@tonic-gate  * Return the home lgroup of the current thread.
15290Sstevel@tonic-gate  * We must do this with kernel preemption disabled, since we don't want our
15300Sstevel@tonic-gate  * thread to be re-homed while we're poking around with its lpl, and the lpl
15310Sstevel@tonic-gate  * should never be NULL.
15320Sstevel@tonic-gate  *
15330Sstevel@tonic-gate  * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
15340Sstevel@tonic-gate  * is enabled because of DR.  Callers can disable kernel preemption
15350Sstevel@tonic-gate  * around this call to guarantee that the lgroup will be valid beyond this
15360Sstevel@tonic-gate  * routine, since kernel preemption can be recursive.
15370Sstevel@tonic-gate  */
15380Sstevel@tonic-gate lgrp_t *
15390Sstevel@tonic-gate lgrp_home_lgrp(void)
15400Sstevel@tonic-gate {
15410Sstevel@tonic-gate 	lgrp_t	*lgrp;
15420Sstevel@tonic-gate 	lpl_t	*lpl;
15430Sstevel@tonic-gate 
15440Sstevel@tonic-gate 	kpreempt_disable();
15450Sstevel@tonic-gate 
15460Sstevel@tonic-gate 	lpl = curthread->t_lpl;
15470Sstevel@tonic-gate 	ASSERT(lpl != NULL);
15480Sstevel@tonic-gate 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
15490Sstevel@tonic-gate 	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
15500Sstevel@tonic-gate 	lgrp = lgrp_table[lpl->lpl_lgrpid];
15510Sstevel@tonic-gate 
15520Sstevel@tonic-gate 	kpreempt_enable();
15530Sstevel@tonic-gate 
15540Sstevel@tonic-gate 	return (lgrp);
15550Sstevel@tonic-gate }
15560Sstevel@tonic-gate 
15570Sstevel@tonic-gate /*
15580Sstevel@tonic-gate  * Return ID of home lgroup for given thread
15590Sstevel@tonic-gate  * (See comments for lgrp_home_lgrp() for special care and handling
15600Sstevel@tonic-gate  * instructions)
15610Sstevel@tonic-gate  */
15620Sstevel@tonic-gate lgrp_id_t
15630Sstevel@tonic-gate lgrp_home_id(kthread_t *t)
15640Sstevel@tonic-gate {
15650Sstevel@tonic-gate 	lgrp_id_t	lgrp;
15660Sstevel@tonic-gate 	lpl_t		*lpl;
15670Sstevel@tonic-gate 
15680Sstevel@tonic-gate 	ASSERT(t != NULL);
15690Sstevel@tonic-gate 	/*
15700Sstevel@tonic-gate 	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
15710Sstevel@tonic-gate 	 * cannot since the HAT layer can call into this routine to
15720Sstevel@tonic-gate 	 * determine the locality for its data structures in the context
15730Sstevel@tonic-gate 	 * of a page fault.
15740Sstevel@tonic-gate 	 */
15750Sstevel@tonic-gate 
15760Sstevel@tonic-gate 	kpreempt_disable();
15770Sstevel@tonic-gate 
15780Sstevel@tonic-gate 	lpl = t->t_lpl;
15790Sstevel@tonic-gate 	ASSERT(lpl != NULL);
15800Sstevel@tonic-gate 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
15810Sstevel@tonic-gate 	lgrp = lpl->lpl_lgrpid;
15820Sstevel@tonic-gate 
15830Sstevel@tonic-gate 	kpreempt_enable();
15840Sstevel@tonic-gate 
15850Sstevel@tonic-gate 	return (lgrp);
15860Sstevel@tonic-gate }
15870Sstevel@tonic-gate 
15880Sstevel@tonic-gate /*
15890Sstevel@tonic-gate  * Return lgroup containing the physical memory for the given page frame number
15900Sstevel@tonic-gate  */
15910Sstevel@tonic-gate lgrp_t *
15920Sstevel@tonic-gate lgrp_pfn_to_lgrp(pfn_t pfn)
15930Sstevel@tonic-gate {
15940Sstevel@tonic-gate 	lgrp_handle_t	hand;
15950Sstevel@tonic-gate 	int		i;
15960Sstevel@tonic-gate 	lgrp_t		*lgrp;
15970Sstevel@tonic-gate 
15980Sstevel@tonic-gate 	hand = lgrp_plat_pfn_to_hand(pfn);
15990Sstevel@tonic-gate 	if (hand != LGRP_NULL_HANDLE)
16000Sstevel@tonic-gate 		for (i = 0; i <= lgrp_alloc_max; i++) {
16010Sstevel@tonic-gate 			lgrp = lgrp_table[i];
16020Sstevel@tonic-gate 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
16030Sstevel@tonic-gate 				return (lgrp);
16040Sstevel@tonic-gate 		}
16050Sstevel@tonic-gate 	return (NULL);
16060Sstevel@tonic-gate }
16070Sstevel@tonic-gate 
16080Sstevel@tonic-gate /*
16090Sstevel@tonic-gate  * Return lgroup containing the physical memory at the given physical address
16100Sstevel@tonic-gate  */
16110Sstevel@tonic-gate lgrp_t *
16120Sstevel@tonic-gate lgrp_phys_to_lgrp(u_longlong_t physaddr)
16130Sstevel@tonic-gate {
16140Sstevel@tonic-gate 	lgrp_handle_t	hand;
16150Sstevel@tonic-gate 	int		i;
16160Sstevel@tonic-gate 	lgrp_t		*lgrp;
16170Sstevel@tonic-gate 	pfn_t		pfn;
16180Sstevel@tonic-gate 
16190Sstevel@tonic-gate 	pfn = btop(physaddr);
16200Sstevel@tonic-gate 	hand = lgrp_plat_pfn_to_hand(pfn);
16210Sstevel@tonic-gate 	if (hand != LGRP_NULL_HANDLE)
16220Sstevel@tonic-gate 		for (i = 0; i <= lgrp_alloc_max; i++) {
16230Sstevel@tonic-gate 			lgrp = lgrp_table[i];
16240Sstevel@tonic-gate 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
16250Sstevel@tonic-gate 				return (lgrp);
16260Sstevel@tonic-gate 		}
16270Sstevel@tonic-gate 	return (NULL);
16280Sstevel@tonic-gate }
16290Sstevel@tonic-gate 
16300Sstevel@tonic-gate /*
16310Sstevel@tonic-gate  * Return the leaf lgroup containing the given CPU
163260Sesaxe  *
163360Sesaxe  * The caller needs to take precautions necessary to prevent
16343434Sesaxe 	 * "cpu" and its lpl from going away across a call to this function.
163560Sesaxe  * hint: kpreempt_disable()/kpreempt_enable()
16360Sstevel@tonic-gate  */
16370Sstevel@tonic-gate static lgrp_t *
16380Sstevel@tonic-gate lgrp_cpu_to_lgrp(cpu_t *cpu)
16390Sstevel@tonic-gate {
16401892Sesaxe 	return (cpu->cpu_lpl->lpl_lgrp);
16410Sstevel@tonic-gate }
16420Sstevel@tonic-gate 
16430Sstevel@tonic-gate /*
16440Sstevel@tonic-gate  * Return the sum of the partition loads in an lgrp divided by
16450Sstevel@tonic-gate  * the number of CPUs in the lgrp.  This is our best approximation
16460Sstevel@tonic-gate  * of an 'lgroup load average' for a useful per-lgroup kstat.
16470Sstevel@tonic-gate  */
16480Sstevel@tonic-gate static uint64_t
16490Sstevel@tonic-gate lgrp_sum_loadavgs(lgrp_t *lgrp)
16500Sstevel@tonic-gate {
16510Sstevel@tonic-gate 	cpu_t *cpu;
16520Sstevel@tonic-gate 	int ncpu;
16530Sstevel@tonic-gate 	uint64_t loads = 0;
16540Sstevel@tonic-gate 
16550Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
16560Sstevel@tonic-gate 
16570Sstevel@tonic-gate 	cpu = lgrp->lgrp_cpu;
16580Sstevel@tonic-gate 	ncpu = lgrp->lgrp_cpucnt;
16590Sstevel@tonic-gate 
16600Sstevel@tonic-gate 	if (cpu == NULL || ncpu == 0) {
16610Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
16620Sstevel@tonic-gate 		return (0ull);
16630Sstevel@tonic-gate 	}
16640Sstevel@tonic-gate 
16650Sstevel@tonic-gate 	do {
16660Sstevel@tonic-gate 		loads += cpu->cpu_lpl->lpl_loadavg;
16670Sstevel@tonic-gate 		cpu = cpu->cpu_next_lgrp;
16680Sstevel@tonic-gate 	} while (cpu != lgrp->lgrp_cpu);
16690Sstevel@tonic-gate 
16700Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
16710Sstevel@tonic-gate 
16720Sstevel@tonic-gate 	return (loads / ncpu);
16730Sstevel@tonic-gate }
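
/*
 * The load average above is computed by walking the lgroup's circular
 * list of CPUs (linked through cpu_next_lgrp) with a do/while loop that
 * stops when it returns to the head.  Below is a minimal standalone
 * sketch of that traversal; struct toy_cpu and ring_loadavg() are
 * hypothetical user-level stand-ins, for illustration only.
 */
#include <stdio.h>
#include <stdint.h>

struct toy_cpu {
	uint64_t	loadavg;
	struct toy_cpu	*next;		/* circular, like cpu_next_lgrp */
};

static uint64_t
ring_loadavg(struct toy_cpu *head, int ncpu)
{
	struct toy_cpu	*cpu = head;
	uint64_t	loads = 0;

	if (cpu == NULL || ncpu == 0)
		return (0);

	do {
		loads += cpu->loadavg;
		cpu = cpu->next;
	} while (cpu != head);

	return (loads / ncpu);
}

int
main(void)
{
	struct toy_cpu	c0, c1, c2;

	c0.loadavg = 100; c0.next = &c1;
	c1.loadavg = 200; c1.next = &c2;
	c2.loadavg = 600; c2.next = &c0;	/* close the ring */

	(void) printf("lgrp loadavg=%llu\n",
	    (unsigned long long)ring_loadavg(&c0, 3));
	return (0);
}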
16740Sstevel@tonic-gate 
16750Sstevel@tonic-gate void
16760Sstevel@tonic-gate lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
16770Sstevel@tonic-gate {
16780Sstevel@tonic-gate 	struct lgrp_stats *pstats;
16790Sstevel@tonic-gate 
16800Sstevel@tonic-gate 	/*
16810Sstevel@tonic-gate 	 * Verify that the caller isn't trying to add to
16820Sstevel@tonic-gate 	 * a statistic for an lgroup that has gone away
16830Sstevel@tonic-gate 	 */
16840Sstevel@tonic-gate 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
16850Sstevel@tonic-gate 		return;
16860Sstevel@tonic-gate 
16870Sstevel@tonic-gate 	pstats = &lgrp_stats[lgrpid];
16880Sstevel@tonic-gate 	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
16890Sstevel@tonic-gate }
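
/*
 * lgrp_stat_add() bounds-checks the lgroup id and then updates the
 * counter with an atomic add, so writers never take a lock.  Below is a
 * minimal standalone sketch of the same pattern using C11 atomics in
 * place of atomic_add_64(); toy_stat_add() and toy_stats[] are
 * hypothetical user-level stand-ins, for illustration only.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdatomic.h>

#define	TOY_MAX_LGRPS	4

static _Atomic int64_t toy_stats[TOY_MAX_LGRPS];

static void
toy_stat_add(int lgrpid, int64_t val)
{
	/* Ignore updates for lgroups that have gone away */
	if (lgrpid < 0 || lgrpid >= TOY_MAX_LGRPS)
		return;
	(void) atomic_fetch_add(&toy_stats[lgrpid], val);
}

int
main(void)
{
	toy_stat_add(1, 42);
	toy_stat_add(99, 7);	/* out of range: silently dropped */
	(void) printf("stat[1]=%lld\n",
	    (long long)atomic_load(&toy_stats[1]));
	return (0);
}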
16900Sstevel@tonic-gate 
16910Sstevel@tonic-gate int64_t
16920Sstevel@tonic-gate lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
16930Sstevel@tonic-gate {
16940Sstevel@tonic-gate 	uint64_t val;
16950Sstevel@tonic-gate 	struct lgrp_stats *pstats;
16960Sstevel@tonic-gate 
16970Sstevel@tonic-gate 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
16980Sstevel@tonic-gate 		return ((int64_t)0);
16990Sstevel@tonic-gate 
17000Sstevel@tonic-gate 	pstats = &lgrp_stats[lgrpid];
17010Sstevel@tonic-gate 	LGRP_STAT_READ(pstats, stat, val);
17020Sstevel@tonic-gate 	return (val);
17030Sstevel@tonic-gate }
17040Sstevel@tonic-gate 
17050Sstevel@tonic-gate /*
17060Sstevel@tonic-gate  * Reset all kstats for lgrp specified by its lgrpid.
17070Sstevel@tonic-gate  */
17080Sstevel@tonic-gate static void
17090Sstevel@tonic-gate lgrp_kstat_reset(lgrp_id_t lgrpid)
17100Sstevel@tonic-gate {
17110Sstevel@tonic-gate 	lgrp_stat_t stat;
17120Sstevel@tonic-gate 
17130Sstevel@tonic-gate 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
17140Sstevel@tonic-gate 		return;
17150Sstevel@tonic-gate 
17160Sstevel@tonic-gate 	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
17170Sstevel@tonic-gate 		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
17180Sstevel@tonic-gate 	}
17190Sstevel@tonic-gate }
17200Sstevel@tonic-gate 
17210Sstevel@tonic-gate /*
17220Sstevel@tonic-gate  * Collect all per-lgrp statistics for the lgrp associated with this
17230Sstevel@tonic-gate  * kstat, and store them in the ks_data array.
17240Sstevel@tonic-gate  *
17250Sstevel@tonic-gate  * The superuser can reset all the running counter statistics for an
17260Sstevel@tonic-gate  * lgrp by writing to any of the lgrp's stats.
17270Sstevel@tonic-gate  */
17280Sstevel@tonic-gate static int
17290Sstevel@tonic-gate lgrp_kstat_extract(kstat_t *ksp, int rw)
17300Sstevel@tonic-gate {
17310Sstevel@tonic-gate 	lgrp_stat_t		stat;
17320Sstevel@tonic-gate 	struct kstat_named	*ksd;
17330Sstevel@tonic-gate 	lgrp_t			*lgrp;
17340Sstevel@tonic-gate 	lgrp_id_t		lgrpid;
17350Sstevel@tonic-gate 
17360Sstevel@tonic-gate 	lgrp = (lgrp_t *)ksp->ks_private;
17370Sstevel@tonic-gate 
17380Sstevel@tonic-gate 	ksd = (struct kstat_named *)ksp->ks_data;
17390Sstevel@tonic-gate 	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
17400Sstevel@tonic-gate 
17410Sstevel@tonic-gate 	lgrpid = lgrp->lgrp_id;
17420Sstevel@tonic-gate 
17430Sstevel@tonic-gate 	if (lgrpid == LGRP_NONE) {
17440Sstevel@tonic-gate 		/*
17450Sstevel@tonic-gate 		 * Return all zeroes as stats for freed lgrp.
17460Sstevel@tonic-gate 		 */
17470Sstevel@tonic-gate 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
17480Sstevel@tonic-gate 			ksd[stat].value.i64 = 0;
17490Sstevel@tonic-gate 		}
17500Sstevel@tonic-gate 		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
17510Sstevel@tonic-gate 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
17520Sstevel@tonic-gate 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
17530Sstevel@tonic-gate 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
17540Sstevel@tonic-gate 		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
17550Sstevel@tonic-gate 	} else if (rw != KSTAT_WRITE) {
17560Sstevel@tonic-gate 		/*
17570Sstevel@tonic-gate 		 * Handle counter stats
17580Sstevel@tonic-gate 		 */
17590Sstevel@tonic-gate 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
17600Sstevel@tonic-gate 			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
17610Sstevel@tonic-gate 		}
17620Sstevel@tonic-gate 
17630Sstevel@tonic-gate 		/*
17640Sstevel@tonic-gate 		 * Handle kernel data snapshot stats
17650Sstevel@tonic-gate 		 */
17660Sstevel@tonic-gate 		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
17670Sstevel@tonic-gate 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
17680Sstevel@tonic-gate 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
17690Sstevel@tonic-gate 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
17700Sstevel@tonic-gate 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
17710Sstevel@tonic-gate 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
17720Sstevel@tonic-gate 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
17730Sstevel@tonic-gate 		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
17742685Sakolb 		ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
17752685Sakolb 		    lgrp_loadavg_max_effect;
17760Sstevel@tonic-gate 	} else {
17770Sstevel@tonic-gate 		lgrp_kstat_reset(lgrpid);
17780Sstevel@tonic-gate 	}
17790Sstevel@tonic-gate 
17800Sstevel@tonic-gate 	return (0);
17810Sstevel@tonic-gate }
17820Sstevel@tonic-gate 
17830Sstevel@tonic-gate int
17840Sstevel@tonic-gate lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
17850Sstevel@tonic-gate {
17860Sstevel@tonic-gate 	cpu_t	*cp;
17870Sstevel@tonic-gate 
17880Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
17890Sstevel@tonic-gate 
17900Sstevel@tonic-gate 	if ((cp = cpu_get(id)) == NULL) {
17910Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
17920Sstevel@tonic-gate 		return (EINVAL);
17930Sstevel@tonic-gate 	}
17940Sstevel@tonic-gate 
17950Sstevel@tonic-gate 	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
17960Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
17970Sstevel@tonic-gate 		return (EINVAL);
17980Sstevel@tonic-gate 	}
17990Sstevel@tonic-gate 
18000Sstevel@tonic-gate 	ASSERT(cp->cpu_lpl != NULL);
18010Sstevel@tonic-gate 
18020Sstevel@tonic-gate 	*lp = cp->cpu_lpl->lpl_lgrpid;
18030Sstevel@tonic-gate 
18040Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
18050Sstevel@tonic-gate 
18060Sstevel@tonic-gate 	return (0);
18070Sstevel@tonic-gate }
18080Sstevel@tonic-gate 
18090Sstevel@tonic-gate int
18100Sstevel@tonic-gate lgrp_query_load(processorid_t id, lgrp_load_t *lp)
18110Sstevel@tonic-gate {
18120Sstevel@tonic-gate 	cpu_t *cp;
18130Sstevel@tonic-gate 
18140Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
18150Sstevel@tonic-gate 
18160Sstevel@tonic-gate 	if ((cp = cpu_get(id)) == NULL) {
18170Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
18180Sstevel@tonic-gate 		return (EINVAL);
18190Sstevel@tonic-gate 	}
18200Sstevel@tonic-gate 
18210Sstevel@tonic-gate 	ASSERT(cp->cpu_lpl != NULL);
18220Sstevel@tonic-gate 
18230Sstevel@tonic-gate 	*lp = cp->cpu_lpl->lpl_loadavg;
18240Sstevel@tonic-gate 
18250Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
18260Sstevel@tonic-gate 
18270Sstevel@tonic-gate 	return (0);
18280Sstevel@tonic-gate }
18290Sstevel@tonic-gate 
18300Sstevel@tonic-gate /*
18310Sstevel@tonic-gate  * Add a resource named by lpl_leaf to rset of lpl_target
18320Sstevel@tonic-gate  *
18330Sstevel@tonic-gate  * This routine also adjusts ncpu and nrset if the call succeeds in adding a
18340Sstevel@tonic-gate  * resource. It is adjusted here, as this is presently the only place that we
18350Sstevel@tonic-gate  * can be certain a resource addition has succeeded.
18360Sstevel@tonic-gate  *
18370Sstevel@tonic-gate  * We keep the list of rsets sorted so that the dispatcher can quickly walk the
18380Sstevel@tonic-gate  * list in order until it reaches a NULL.  (The list is required to be
18390Sstevel@tonic-gate  * NULL-terminated.)  Keeping it sorted lets us mark start pos + 1, so that
18400Sstevel@tonic-gate  * each lpl is traversed sequentially, but in a different order.  We hope this
18410Sstevel@tonic-gate  * will improve performance a bit (hopefully, less read-to-own traffic...).
18420Sstevel@tonic-gate  */
18430Sstevel@tonic-gate 
18440Sstevel@tonic-gate void
18450Sstevel@tonic-gate lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
18460Sstevel@tonic-gate {
18470Sstevel@tonic-gate 	int		i;
18480Sstevel@tonic-gate 	int		entry_slot = 0;
18490Sstevel@tonic-gate 
18500Sstevel@tonic-gate 	/* return if leaf is already present */
18510Sstevel@tonic-gate 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
18520Sstevel@tonic-gate 		if (lpl_target->lpl_rset[i] == lpl_leaf) {
18530Sstevel@tonic-gate 			return;
18540Sstevel@tonic-gate 		}
18550Sstevel@tonic-gate 
18560Sstevel@tonic-gate 		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
18570Sstevel@tonic-gate 		    lpl_leaf->lpl_lgrpid) {
18580Sstevel@tonic-gate 			break;
18590Sstevel@tonic-gate 		}
18600Sstevel@tonic-gate 	}
18610Sstevel@tonic-gate 
18620Sstevel@tonic-gate 	/* insert leaf, update counts */
18630Sstevel@tonic-gate 	entry_slot = i;
18640Sstevel@tonic-gate 	i = lpl_target->lpl_nrset++;
18650Sstevel@tonic-gate 
18660Sstevel@tonic-gate 	/*
18670Sstevel@tonic-gate 	 * Start at the end of the rset array and work backwards towards the
18680Sstevel@tonic-gate 	 * slot into which the new lpl will be inserted. This effectively
18690Sstevel@tonic-gate 	 * preserves the current ordering by scooting everybody over one entry,
18700Sstevel@tonic-gate 	 * and placing the new entry into the space created.
18710Sstevel@tonic-gate 	 */
18720Sstevel@tonic-gate 	while (i-- > entry_slot) {
18730Sstevel@tonic-gate 		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
18748408SEric.Saxe@Sun.COM 		lpl_target->lpl_id2rset[lpl_target->lpl_rset[i]->lpl_lgrpid] =
18758408SEric.Saxe@Sun.COM 		    i + 1;
18760Sstevel@tonic-gate 	}
18770Sstevel@tonic-gate 
18780Sstevel@tonic-gate 	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
18798408SEric.Saxe@Sun.COM 	lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = entry_slot;
18808408SEric.Saxe@Sun.COM 
18810Sstevel@tonic-gate 	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
18820Sstevel@tonic-gate }
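
/*
 * A standalone sketch of the sorted insert performed by lpl_rset_add():
 * find the slot that keeps the array ordered by lgroup id, scoot the
 * tail up one entry, and place the new element, leaving the array
 * NULL-terminated for the dispatcher-style walk described above.
 * struct toy_lpl and sorted_insert() are hypothetical user-level
 * stand-ins, for illustration only.
 */
#include <stdio.h>
#include <stddef.h>

#define	TOY_MAX	8

struct toy_lpl {
	int	id;
};

/* rset[] must have room for nrset + 1 entries plus the NULL terminator */
static int
sorted_insert(struct toy_lpl **rset, int nrset, struct toy_lpl *leaf)
{
	int	i;
	int	entry_slot;

	for (i = 0; i < nrset; i++) {
		if (rset[i] == leaf)
			return (nrset);		/* already present */
		if (rset[i]->id > leaf->id)
			break;
	}
	entry_slot = i;
	i = nrset++;

	while (i-- > entry_slot)		/* scoot the tail up */
		rset[i + 1] = rset[i];
	rset[entry_slot] = leaf;
	rset[nrset] = NULL;			/* keep NULL-terminated */
	return (nrset);
}

int
main(void)
{
	struct toy_lpl	a = { 1 }, b = { 3 }, c = { 2 };
	struct toy_lpl	*rset[TOY_MAX] = { NULL };
	int		nrset = 0;
	int		i;

	nrset = sorted_insert(rset, nrset, &a);
	nrset = sorted_insert(rset, nrset, &b);
	nrset = sorted_insert(rset, nrset, &c);	/* lands between a and b */

	for (i = 0; i < nrset; i++)
		(void) printf("%d ", rset[i]->id);
	(void) printf("\n");
	return (0);
}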
18830Sstevel@tonic-gate 
18840Sstevel@tonic-gate /*
18858408SEric.Saxe@Sun.COM  * Update each of lpl_parent's children with a reference to their parent.
18860Sstevel@tonic-gate  * The lgrp topology is used as the reference since it is fully
18870Sstevel@tonic-gate  * consistent and correct at this point.
18880Sstevel@tonic-gate  * This should be called after any potential change in lpl_parent's
18890Sstevel@tonic-gate  * rset.
18900Sstevel@tonic-gate  */
18910Sstevel@tonic-gate static void
18920Sstevel@tonic-gate lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
18930Sstevel@tonic-gate {
18948408SEric.Saxe@Sun.COM 	klgrpset_t	children;
18958408SEric.Saxe@Sun.COM 	int		i;
18960Sstevel@tonic-gate 
18970Sstevel@tonic-gate 	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
18980Sstevel@tonic-gate 	if (klgrpset_isempty(children))
18990Sstevel@tonic-gate 		return; /* nothing to do */
19000Sstevel@tonic-gate 
19010Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
19020Sstevel@tonic-gate 		if (klgrpset_ismember(children, i)) {
19030Sstevel@tonic-gate 			/*
19040Sstevel@tonic-gate 			 * (Re)set the parent. It may be incorrect if
19050Sstevel@tonic-gate 			 * lpl_parent is new in the topology.
19060Sstevel@tonic-gate 			 */
19070Sstevel@tonic-gate 			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
19080Sstevel@tonic-gate 		}
19090Sstevel@tonic-gate 	}
19100Sstevel@tonic-gate }
19110Sstevel@tonic-gate 
19120Sstevel@tonic-gate /*
19130Sstevel@tonic-gate  * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
19140Sstevel@tonic-gate  *
19150Sstevel@tonic-gate  * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
19160Sstevel@tonic-gate  * resource. The values are adjusted here, as this is the only place that we can
19170Sstevel@tonic-gate  * be certain a resource was successfully deleted.
19180Sstevel@tonic-gate  */
19190Sstevel@tonic-gate void
19200Sstevel@tonic-gate lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
19210Sstevel@tonic-gate {
19220Sstevel@tonic-gate 	int i;
19238408SEric.Saxe@Sun.COM 	lpl_t *leaf;
19248408SEric.Saxe@Sun.COM 
19258408SEric.Saxe@Sun.COM 	if (lpl_target->lpl_nrset == 0)
19268408SEric.Saxe@Sun.COM 		return;
19270Sstevel@tonic-gate 
19280Sstevel@tonic-gate 	/* find leaf in intermediate node */
19290Sstevel@tonic-gate 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
19300Sstevel@tonic-gate 		if (lpl_target->lpl_rset[i] == lpl_leaf)
19310Sstevel@tonic-gate 			break;
19320Sstevel@tonic-gate 	}
19330Sstevel@tonic-gate 
19340Sstevel@tonic-gate 	/* return if leaf not found */
19350Sstevel@tonic-gate 	if (lpl_target->lpl_rset[i] != lpl_leaf)
19360Sstevel@tonic-gate 		return;
19370Sstevel@tonic-gate 
19380Sstevel@tonic-gate 	/* prune leaf, compress array */
19390Sstevel@tonic-gate 	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
19408408SEric.Saxe@Sun.COM 	lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = -1;
19410Sstevel@tonic-gate 	lpl_target->lpl_ncpu--;
19420Sstevel@tonic-gate 	do {
19430Sstevel@tonic-gate 		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
19448408SEric.Saxe@Sun.COM 		/*
19458408SEric.Saxe@Sun.COM 		 * Update the lgrp id <=> rset mapping
19468408SEric.Saxe@Sun.COM 		 */
19478408SEric.Saxe@Sun.COM 		if ((leaf = lpl_target->lpl_rset[i]) != NULL) {
19488408SEric.Saxe@Sun.COM 			lpl_target->lpl_id2rset[leaf->lpl_lgrpid] = i;
19498408SEric.Saxe@Sun.COM 		}
19500Sstevel@tonic-gate 	} while (i++ < lpl_target->lpl_nrset);
19510Sstevel@tonic-gate }
19520Sstevel@tonic-gate 
19530Sstevel@tonic-gate /*
19540Sstevel@tonic-gate  * Check to see if the resource set of the target lpl contains the
19550Sstevel@tonic-gate  * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
19560Sstevel@tonic-gate  */
19570Sstevel@tonic-gate 
19580Sstevel@tonic-gate int
19590Sstevel@tonic-gate lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
19600Sstevel@tonic-gate {
19610Sstevel@tonic-gate 	int i;
19620Sstevel@tonic-gate 
19630Sstevel@tonic-gate 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
19640Sstevel@tonic-gate 		if (lpl_target->lpl_rset[i] == lpl_leaf)
19650Sstevel@tonic-gate 			return (1);
19660Sstevel@tonic-gate 	}
19670Sstevel@tonic-gate 
19680Sstevel@tonic-gate 	return (0);
19690Sstevel@tonic-gate }
19700Sstevel@tonic-gate 
19710Sstevel@tonic-gate /*
19720Sstevel@tonic-gate  * Called when we change cpu lpl membership.  This increments or decrements the
19730Sstevel@tonic-gate  * per-cpu counter in every lpl in which our leaf appears.
19740Sstevel@tonic-gate  */
19750Sstevel@tonic-gate void
19760Sstevel@tonic-gate lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
19770Sstevel@tonic-gate {
19780Sstevel@tonic-gate 	cpupart_t	*cpupart;
19790Sstevel@tonic-gate 	lgrp_t		*lgrp_leaf;
19800Sstevel@tonic-gate 	lgrp_t		*lgrp_cur;
19810Sstevel@tonic-gate 	lpl_t		*lpl_leaf;
19820Sstevel@tonic-gate 	lpl_t		*lpl_cur;
19830Sstevel@tonic-gate 	int		i;
19840Sstevel@tonic-gate 
19850Sstevel@tonic-gate 	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
19860Sstevel@tonic-gate 
19870Sstevel@tonic-gate 	cpupart = cp->cpu_part;
19880Sstevel@tonic-gate 	lpl_leaf = cp->cpu_lpl;
19890Sstevel@tonic-gate 	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
19900Sstevel@tonic-gate 
19910Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
19920Sstevel@tonic-gate 		lgrp_cur = lgrp_table[i];
19930Sstevel@tonic-gate 
19940Sstevel@tonic-gate 		/*
19950Sstevel@tonic-gate 		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
19960Sstevel@tonic-gate 		 * for the cpu in question, or if the current lgrp and leaf
19970Sstevel@tonic-gate 		 * don't share the same resources.
19980Sstevel@tonic-gate 		 */
19990Sstevel@tonic-gate 
20000Sstevel@tonic-gate 		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
20010Sstevel@tonic-gate 		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
20020Sstevel@tonic-gate 		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
20030Sstevel@tonic-gate 			continue;
20040Sstevel@tonic-gate 
20050Sstevel@tonic-gate 
20060Sstevel@tonic-gate 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
20070Sstevel@tonic-gate 
20080Sstevel@tonic-gate 		if (lpl_cur->lpl_nrset > 0) {
20090Sstevel@tonic-gate 			if (act == LPL_INCREMENT) {
20100Sstevel@tonic-gate 				lpl_cur->lpl_ncpu++;
20110Sstevel@tonic-gate 			} else if (act == LPL_DECREMENT) {
20120Sstevel@tonic-gate 				lpl_cur->lpl_ncpu--;
20130Sstevel@tonic-gate 			}
20140Sstevel@tonic-gate 		}
20150Sstevel@tonic-gate 	}
20160Sstevel@tonic-gate }
20170Sstevel@tonic-gate 
20180Sstevel@tonic-gate /*
20190Sstevel@tonic-gate  * Initialize lpl with given resources and specified lgrp
20200Sstevel@tonic-gate  */
20210Sstevel@tonic-gate void
20220Sstevel@tonic-gate lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
20230Sstevel@tonic-gate {
20240Sstevel@tonic-gate 	lpl->lpl_lgrpid = lgrp->lgrp_id;
20250Sstevel@tonic-gate 	lpl->lpl_loadavg = 0;
20260Sstevel@tonic-gate 	if (lpl == lpl_leaf)
20270Sstevel@tonic-gate 		lpl->lpl_ncpu = 1;
20280Sstevel@tonic-gate 	else
20290Sstevel@tonic-gate 		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
20300Sstevel@tonic-gate 	lpl->lpl_nrset = 1;
20310Sstevel@tonic-gate 	lpl->lpl_rset[0] = lpl_leaf;
20328408SEric.Saxe@Sun.COM 	lpl->lpl_id2rset[lpl_leaf->lpl_lgrpid] = 0;
20330Sstevel@tonic-gate 	lpl->lpl_lgrp = lgrp;
20340Sstevel@tonic-gate 	lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
20350Sstevel@tonic-gate 	lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
20360Sstevel@tonic-gate }
20370Sstevel@tonic-gate 
20380Sstevel@tonic-gate /*
20390Sstevel@tonic-gate  * Clear an unused lpl
20400Sstevel@tonic-gate  */
20410Sstevel@tonic-gate void
20420Sstevel@tonic-gate lpl_clear(lpl_t *lpl)
20430Sstevel@tonic-gate {
20448408SEric.Saxe@Sun.COM 	/*
20458408SEric.Saxe@Sun.COM 	 * Clear out all fields in the lpl except:
20468408SEric.Saxe@Sun.COM 	 *    lpl_lgrpid - to facilitate debugging
20478408SEric.Saxe@Sun.COM 	 *    lpl_rset, lpl_rset_sz, lpl_id2rset - rset array references / size
20488408SEric.Saxe@Sun.COM 	 *
20498408SEric.Saxe@Sun.COM 	 * Note that the lpl's rset and id2rset mapping are cleared as well.
20508408SEric.Saxe@Sun.COM 	 */
20518408SEric.Saxe@Sun.COM 	lpl->lpl_loadavg = 0;
20528408SEric.Saxe@Sun.COM 	lpl->lpl_ncpu = 0;
20538408SEric.Saxe@Sun.COM 	lpl->lpl_lgrp = NULL;
20548408SEric.Saxe@Sun.COM 	lpl->lpl_parent = NULL;
20558408SEric.Saxe@Sun.COM 	lpl->lpl_cpus = NULL;
20568408SEric.Saxe@Sun.COM 	lpl->lpl_nrset = 0;
20578408SEric.Saxe@Sun.COM 	lpl->lpl_homed_time = 0;
20588408SEric.Saxe@Sun.COM 	bzero(lpl->lpl_rset, sizeof (lpl->lpl_rset[0]) * lpl->lpl_rset_sz);
20598408SEric.Saxe@Sun.COM 	bzero(lpl->lpl_id2rset,
20608408SEric.Saxe@Sun.COM 	    sizeof (lpl->lpl_id2rset[0]) * lpl->lpl_rset_sz);
20610Sstevel@tonic-gate }
20620Sstevel@tonic-gate 
20630Sstevel@tonic-gate /*
20640Sstevel@tonic-gate  * Given a CPU-partition, verify that the lpl topology in the CPU-partition
20650Sstevel@tonic-gate  * is in sync with the lgroup topology in the system.  The lpl topology may not
20660Sstevel@tonic-gate  * make full use of all of the lgroup topology, but this checks to make sure
20670Sstevel@tonic-gate  * that for the parts that it does use, it has correctly understood the
20680Sstevel@tonic-gate  * relationships that exist.  This function returns
20690Sstevel@tonic-gate  * 0 (LPL_TOPO_CORRECT) if the topology is correct, and a non-zero error
20700Sstevel@tonic-gate  * code if it is incorrect.  Asserts are spread throughout the code to aid in
20710Sstevel@tonic-gate  * debugging on a DEBUG kernel.
20720Sstevel@tonic-gate  */
20730Sstevel@tonic-gate int
20740Sstevel@tonic-gate lpl_topo_verify(cpupart_t *cpupart)
20750Sstevel@tonic-gate {
20760Sstevel@tonic-gate 	lgrp_t		*lgrp;
20770Sstevel@tonic-gate 	lpl_t		*lpl;
20780Sstevel@tonic-gate 	klgrpset_t	rset;
20790Sstevel@tonic-gate 	klgrpset_t	cset;
20800Sstevel@tonic-gate 	cpu_t		*cpu;
20810Sstevel@tonic-gate 	cpu_t		*cp_start;
20820Sstevel@tonic-gate 	int		i;
20830Sstevel@tonic-gate 	int		j;
20840Sstevel@tonic-gate 	int		sum;
20850Sstevel@tonic-gate 
20860Sstevel@tonic-gate 	/* topology can't be incorrect if it doesn't exist */
20870Sstevel@tonic-gate 	if (!lgrp_topo_initialized || !lgrp_initialized)
20880Sstevel@tonic-gate 		return (LPL_TOPO_CORRECT);
20890Sstevel@tonic-gate 
20900Sstevel@tonic-gate 	ASSERT(cpupart != NULL);
20910Sstevel@tonic-gate 
20920Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
20930Sstevel@tonic-gate 		lgrp = lgrp_table[i];
20940Sstevel@tonic-gate 		lpl = NULL;
20950Sstevel@tonic-gate 		/* make sure lpls are allocated */
20960Sstevel@tonic-gate 		ASSERT(cpupart->cp_lgrploads);
20970Sstevel@tonic-gate 		if (!cpupart->cp_lgrploads)
20980Sstevel@tonic-gate 			return (LPL_TOPO_PART_HAS_NO_LPL);
20990Sstevel@tonic-gate 
21000Sstevel@tonic-gate 		lpl = &cpupart->cp_lgrploads[i];
21010Sstevel@tonic-gate 		/* make sure our index is good */
21020Sstevel@tonic-gate 		ASSERT(i < cpupart->cp_nlgrploads);
21030Sstevel@tonic-gate 
21040Sstevel@tonic-gate 		/* if lgroup doesn't exist, make sure lpl is empty */
21050Sstevel@tonic-gate 		if (!LGRP_EXISTS(lgrp)) {
21060Sstevel@tonic-gate 			ASSERT(lpl->lpl_ncpu == 0);
21070Sstevel@tonic-gate 			if (lpl->lpl_ncpu > 0) {
21080Sstevel@tonic-gate 				return (LPL_TOPO_CPUS_NOT_EMPTY);
21090Sstevel@tonic-gate 			} else {
21100Sstevel@tonic-gate 				continue;
21110Sstevel@tonic-gate 			}
21120Sstevel@tonic-gate 		}
21130Sstevel@tonic-gate 
21140Sstevel@tonic-gate 		/* verify that lgroup and lpl are identically numbered */
21150Sstevel@tonic-gate 		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
21160Sstevel@tonic-gate 
21170Sstevel@tonic-gate 		/* if lgroup isn't in our partition, make sure lpl is empty */
21180Sstevel@tonic-gate 		if (!klgrpset_intersects(lgrp->lgrp_leaves,
21190Sstevel@tonic-gate 		    cpupart->cp_lgrpset)) {
21200Sstevel@tonic-gate 			ASSERT(lpl->lpl_ncpu == 0);
21210Sstevel@tonic-gate 			if (lpl->lpl_ncpu > 0) {
21220Sstevel@tonic-gate 				return (LPL_TOPO_CPUS_NOT_EMPTY);
21230Sstevel@tonic-gate 			}
21240Sstevel@tonic-gate 			/*
21250Sstevel@tonic-gate 			 * lpl is empty, and lgroup isn't in partition.  verify
21260Sstevel@tonic-gate 			 * that lpl doesn't show up in anyone else's rsets (in
21270Sstevel@tonic-gate 			 * this partition, anyway)
21280Sstevel@tonic-gate 			 */
21290Sstevel@tonic-gate 			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
21300Sstevel@tonic-gate 				lpl_t *i_lpl; /* lpl we're iterating over */
21310Sstevel@tonic-gate 
21320Sstevel@tonic-gate 				i_lpl = &cpupart->cp_lgrploads[j];
21330Sstevel@tonic-gate 
21340Sstevel@tonic-gate 				ASSERT(!lpl_rset_contains(i_lpl, lpl));
21350Sstevel@tonic-gate 				if (lpl_rset_contains(i_lpl, lpl)) {
21360Sstevel@tonic-gate 					return (LPL_TOPO_LPL_ORPHANED);
21370Sstevel@tonic-gate 				}
21380Sstevel@tonic-gate 			}
21390Sstevel@tonic-gate 			/* lgroup is empty, and everything is ok. continue */
21400Sstevel@tonic-gate 			continue;
21410Sstevel@tonic-gate 		}
21420Sstevel@tonic-gate 
21430Sstevel@tonic-gate 
21440Sstevel@tonic-gate 		/* lgroup is in this partition, now check it against lpl */
21450Sstevel@tonic-gate 
21460Sstevel@tonic-gate 		/* do both have matching lgrps? */
21470Sstevel@tonic-gate 		ASSERT(lgrp == lpl->lpl_lgrp);
21480Sstevel@tonic-gate 		if (lgrp != lpl->lpl_lgrp) {
21490Sstevel@tonic-gate 			return (LPL_TOPO_LGRP_MISMATCH);
21500Sstevel@tonic-gate 		}
21510Sstevel@tonic-gate 
21520Sstevel@tonic-gate 		/* do the parent lgroups exist and do they match? */
21530Sstevel@tonic-gate 		if (lgrp->lgrp_parent) {
21540Sstevel@tonic-gate 			ASSERT(lpl->lpl_parent);
21550Sstevel@tonic-gate 			ASSERT(lgrp->lgrp_parent->lgrp_id ==
21568408SEric.Saxe@Sun.COM 			    lpl->lpl_parent->lpl_lgrpid);
21570Sstevel@tonic-gate 
21580Sstevel@tonic-gate 			if (!lpl->lpl_parent) {
21590Sstevel@tonic-gate 				return (LPL_TOPO_MISSING_PARENT);
21600Sstevel@tonic-gate 			} else if (lgrp->lgrp_parent->lgrp_id !=
21610Sstevel@tonic-gate 			    lpl->lpl_parent->lpl_lgrpid) {
21620Sstevel@tonic-gate 				return (LPL_TOPO_PARENT_MISMATCH);
21630Sstevel@tonic-gate 			}
21640Sstevel@tonic-gate 		}
21650Sstevel@tonic-gate 
21660Sstevel@tonic-gate 		/* only leaf lgroups keep a cpucnt, only check leaves */
21670Sstevel@tonic-gate 		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
21680Sstevel@tonic-gate 
21690Sstevel@tonic-gate 			/* verify that lgrp is also a leaf */
21700Sstevel@tonic-gate 			ASSERT((lgrp->lgrp_childcnt == 0) &&
21710Sstevel@tonic-gate 			    (klgrpset_ismember(lgrp->lgrp_leaves,
21720Sstevel@tonic-gate 			    lpl->lpl_lgrpid)));
21730Sstevel@tonic-gate 
21740Sstevel@tonic-gate 			if ((lgrp->lgrp_childcnt > 0) ||
21750Sstevel@tonic-gate 			    (!klgrpset_ismember(lgrp->lgrp_leaves,
21760Sstevel@tonic-gate 			    lpl->lpl_lgrpid))) {
21770Sstevel@tonic-gate 				return (LPL_TOPO_LGRP_NOT_LEAF);
21780Sstevel@tonic-gate 			}
21790Sstevel@tonic-gate 
21800Sstevel@tonic-gate 			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
21810Sstevel@tonic-gate 			    (lpl->lpl_ncpu > 0));
21820Sstevel@tonic-gate 			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
21838408SEric.Saxe@Sun.COM 			    (lpl->lpl_ncpu <= 0)) {
21840Sstevel@tonic-gate 				return (LPL_TOPO_BAD_CPUCNT);
21850Sstevel@tonic-gate 			}
21860Sstevel@tonic-gate 
21870Sstevel@tonic-gate 			/*
21880Sstevel@tonic-gate 			 * Check that lpl_ncpu also matches the number of
21890Sstevel@tonic-gate 			 * cpus in the lpl's linked list.  This only exists in
21900Sstevel@tonic-gate 			 * leaves, but they should always match.
21910Sstevel@tonic-gate 			 */
21920Sstevel@tonic-gate 			j = 0;
21930Sstevel@tonic-gate 			cpu = cp_start = lpl->lpl_cpus;
21940Sstevel@tonic-gate 			while (cpu != NULL) {
21950Sstevel@tonic-gate 				j++;
21960Sstevel@tonic-gate 
21970Sstevel@tonic-gate 				/* check to make sure cpu's lpl is leaf lpl */
21980Sstevel@tonic-gate 				ASSERT(cpu->cpu_lpl == lpl);
21990Sstevel@tonic-gate 				if (cpu->cpu_lpl != lpl) {
22000Sstevel@tonic-gate 					return (LPL_TOPO_CPU_HAS_BAD_LPL);
22010Sstevel@tonic-gate 				}
22020Sstevel@tonic-gate 
22030Sstevel@tonic-gate 				/* check next cpu */
22040Sstevel@tonic-gate 				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
22050Sstevel@tonic-gate 					continue;
22060Sstevel@tonic-gate 				} else {
22070Sstevel@tonic-gate 					cpu = NULL;
22080Sstevel@tonic-gate 				}
22090Sstevel@tonic-gate 			}
22100Sstevel@tonic-gate 
22110Sstevel@tonic-gate 			ASSERT(j == lpl->lpl_ncpu);
22120Sstevel@tonic-gate 			if (j != lpl->lpl_ncpu) {
22130Sstevel@tonic-gate 				return (LPL_TOPO_LPL_BAD_NCPU);
22140Sstevel@tonic-gate 			}
22150Sstevel@tonic-gate 
22160Sstevel@tonic-gate 			/*
22170Sstevel@tonic-gate 			 * Also, check that leaf lpl is contained in all
22180Sstevel@tonic-gate 			 * intermediate lpls that name the leaf as a descendant
22190Sstevel@tonic-gate 			 */
22200Sstevel@tonic-gate 			for (j = 0; j <= lgrp_alloc_max; j++) {
22210Sstevel@tonic-gate 				klgrpset_t intersect;
22220Sstevel@tonic-gate 				lgrp_t *lgrp_cand;
22230Sstevel@tonic-gate 				lpl_t *lpl_cand;
22240Sstevel@tonic-gate 
22250Sstevel@tonic-gate 				lgrp_cand = lgrp_table[j];
22260Sstevel@tonic-gate 				intersect = klgrpset_intersects(
22270Sstevel@tonic-gate 				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
22280Sstevel@tonic-gate 				    cpupart->cp_lgrpset);
22290Sstevel@tonic-gate 
22300Sstevel@tonic-gate 				if (!LGRP_EXISTS(lgrp_cand) ||
22310Sstevel@tonic-gate 				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
22320Sstevel@tonic-gate 				    cpupart->cp_lgrpset) ||
22330Sstevel@tonic-gate 				    (intersect == 0))
22340Sstevel@tonic-gate 					continue;
22350Sstevel@tonic-gate 
22360Sstevel@tonic-gate 				lpl_cand =
22370Sstevel@tonic-gate 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
22380Sstevel@tonic-gate 
22390Sstevel@tonic-gate 				if (klgrpset_ismember(intersect,
22400Sstevel@tonic-gate 				    lgrp->lgrp_id)) {
22410Sstevel@tonic-gate 					ASSERT(lpl_rset_contains(lpl_cand,
22420Sstevel@tonic-gate 					    lpl));
22430Sstevel@tonic-gate 
22440Sstevel@tonic-gate 					if (!lpl_rset_contains(lpl_cand, lpl)) {
22450Sstevel@tonic-gate 						return (LPL_TOPO_RSET_MSSNG_LF);
22460Sstevel@tonic-gate 					}
22470Sstevel@tonic-gate 				}
22480Sstevel@tonic-gate 			}
22490Sstevel@tonic-gate 
22500Sstevel@tonic-gate 		} else { /* non-leaf specific checks */
22510Sstevel@tonic-gate 
22520Sstevel@tonic-gate 			/*
22530Sstevel@tonic-gate 			 * Non-leaf lpls should have lpl_cpus == NULL;
22540Sstevel@tonic-gate 			 * verify that this is so
22550Sstevel@tonic-gate 			 */
22560Sstevel@tonic-gate 			ASSERT(lpl->lpl_cpus == NULL);
22570Sstevel@tonic-gate 			if (lpl->lpl_cpus != NULL) {
22580Sstevel@tonic-gate 				return (LPL_TOPO_NONLEAF_HAS_CPUS);
22590Sstevel@tonic-gate 			}
22600Sstevel@tonic-gate 
22610Sstevel@tonic-gate 			/*
22620Sstevel@tonic-gate 			 * verify that the sum of the cpus in the leaf resources
22630Sstevel@tonic-gate 			 * is equal to the total ncpu in the intermediate
22640Sstevel@tonic-gate 			 */
22650Sstevel@tonic-gate 			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
22660Sstevel@tonic-gate 				sum += lpl->lpl_rset[j]->lpl_ncpu;
22670Sstevel@tonic-gate 			}
22680Sstevel@tonic-gate 
22690Sstevel@tonic-gate 			ASSERT(sum == lpl->lpl_ncpu);
22700Sstevel@tonic-gate 			if (sum != lpl->lpl_ncpu) {
22710Sstevel@tonic-gate 				return (LPL_TOPO_LPL_BAD_NCPU);
22720Sstevel@tonic-gate 			}
22730Sstevel@tonic-gate 		}
22740Sstevel@tonic-gate 
22750Sstevel@tonic-gate 		/*
22760Sstevel@tonic-gate 		 * Check the rset of the lpl in question.  Make sure that each
22770Sstevel@tonic-gate 		 * rset contains a subset of the resources in
22780Sstevel@tonic-gate 		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
22790Sstevel@tonic-gate 		 * sure that each rset doesn't include resources that are
22800Sstevel@tonic-gate 		 * outside of that set.  (Which would be resources somehow not
22810Sstevel@tonic-gate 		 * accounted for).
22820Sstevel@tonic-gate 		 */
22830Sstevel@tonic-gate 		klgrpset_clear(rset);
22840Sstevel@tonic-gate 		for (j = 0; j < lpl->lpl_nrset; j++) {
22850Sstevel@tonic-gate 			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
22860Sstevel@tonic-gate 		}
22870Sstevel@tonic-gate 		klgrpset_copy(cset, rset);
22880Sstevel@tonic-gate 		/* make sure lpl rset matches lgrp rset */
22890Sstevel@tonic-gate 		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
22900Sstevel@tonic-gate 		/* make sure rset is contained within the partition, too */
22910Sstevel@tonic-gate 		klgrpset_diff(cset, cpupart->cp_lgrpset);
22920Sstevel@tonic-gate 
22938408SEric.Saxe@Sun.COM 		ASSERT(klgrpset_isempty(rset) && klgrpset_isempty(cset));
22948408SEric.Saxe@Sun.COM 		if (!klgrpset_isempty(rset) || !klgrpset_isempty(cset)) {
22950Sstevel@tonic-gate 			return (LPL_TOPO_RSET_MISMATCH);
22960Sstevel@tonic-gate 		}
22970Sstevel@tonic-gate 
22980Sstevel@tonic-gate 		/*
22990Sstevel@tonic-gate 		 * check to make sure lpl_nrset matches the number of rsets
23000Sstevel@tonic-gate 		 * contained in the lpl
23010Sstevel@tonic-gate 		 */
23028408SEric.Saxe@Sun.COM 		for (j = 0; j < lpl->lpl_nrset; j++) {
23038408SEric.Saxe@Sun.COM 			if (lpl->lpl_rset[j] == NULL)
23048408SEric.Saxe@Sun.COM 				break;
23058408SEric.Saxe@Sun.COM 		}
23060Sstevel@tonic-gate 
23070Sstevel@tonic-gate 		ASSERT(j == lpl->lpl_nrset);
23080Sstevel@tonic-gate 		if (j != lpl->lpl_nrset) {
23090Sstevel@tonic-gate 			return (LPL_TOPO_BAD_RSETCNT);
23100Sstevel@tonic-gate 		}
23110Sstevel@tonic-gate 
23120Sstevel@tonic-gate 	}
23130Sstevel@tonic-gate 	return (LPL_TOPO_CORRECT);
23140Sstevel@tonic-gate }
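
/*
 * Illustrative sketch (not original source): how a debug caller might act
 * on the verifier's return codes.  It assumes the verifier above takes the
 * cpu partition to check, per its use of cpupart; the wrapper name and
 * message text are hypothetical.
 */
#if 0
static void
lpl_topo_verify_example(cpupart_t *cpupart)
{
	int	ret;

	ret = lpl_topo_verify(cpupart);
	if (ret != LPL_TOPO_CORRECT)
		cmn_err(CE_WARN, "lpl topology check failed: %d", ret);
}
#endif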
23150Sstevel@tonic-gate 
23160Sstevel@tonic-gate /*
23170Sstevel@tonic-gate  * Flatten lpl topology to given number of levels.  This is presently only
23180Sstevel@tonic-gate  * implemented for a flatten to 2 levels, which will prune out the intermediates
23190Sstevel@tonic-gate  * and home the leaf lpls to the root lpl.
23200Sstevel@tonic-gate  */
23210Sstevel@tonic-gate int
23220Sstevel@tonic-gate lpl_topo_flatten(int levels)
23230Sstevel@tonic-gate {
23240Sstevel@tonic-gate 	int		i;
23250Sstevel@tonic-gate 	uint_t		sum;
23260Sstevel@tonic-gate 	lgrp_t		*lgrp_cur;
23270Sstevel@tonic-gate 	lpl_t		*lpl_cur;
23280Sstevel@tonic-gate 	lpl_t		*lpl_root;
23290Sstevel@tonic-gate 	cpupart_t	*cp;
23300Sstevel@tonic-gate 
23310Sstevel@tonic-gate 	if (levels != 2)
23320Sstevel@tonic-gate 		return (0);
23330Sstevel@tonic-gate 
23340Sstevel@tonic-gate 	/* called w/ cpus paused - grab no locks! */
23350Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
23360Sstevel@tonic-gate 	    !lgrp_initialized);
23370Sstevel@tonic-gate 
23380Sstevel@tonic-gate 	cp = cp_list_head;
23390Sstevel@tonic-gate 	do {
23400Sstevel@tonic-gate 		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
23410Sstevel@tonic-gate 		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
23420Sstevel@tonic-gate 
23430Sstevel@tonic-gate 		for (i = 0; i <= lgrp_alloc_max; i++) {
23440Sstevel@tonic-gate 			lgrp_cur = lgrp_table[i];
23450Sstevel@tonic-gate 			lpl_cur = &cp->cp_lgrploads[i];
23460Sstevel@tonic-gate 
23470Sstevel@tonic-gate 			if ((lgrp_cur == lgrp_root) ||
23480Sstevel@tonic-gate 			    (!LGRP_EXISTS(lgrp_cur) &&
23490Sstevel@tonic-gate 			    (lpl_cur->lpl_ncpu == 0)))
23500Sstevel@tonic-gate 				continue;
23510Sstevel@tonic-gate 
23520Sstevel@tonic-gate 			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
23530Sstevel@tonic-gate 				/*
23540Sstevel@tonic-gate 				 * this should be a deleted intermediate, so
23550Sstevel@tonic-gate 				 * clear it
23560Sstevel@tonic-gate 				 */
23570Sstevel@tonic-gate 				lpl_clear(lpl_cur);
23580Sstevel@tonic-gate 			} else if ((lpl_cur->lpl_nrset == 1) &&
23590Sstevel@tonic-gate 			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
23600Sstevel@tonic-gate 			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
23610Sstevel@tonic-gate 			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
23620Sstevel@tonic-gate 				/*
23630Sstevel@tonic-gate 				 * this is a leaf whose parent was deleted, or
23640Sstevel@tonic-gate 				 * whose parent had its lgrp deleted.  (And
23650Sstevel@tonic-gate 				 * whose parent will soon be deleted).  Point
23660Sstevel@tonic-gate 				 * this guy back to the root lpl.
23670Sstevel@tonic-gate 				 */
23680Sstevel@tonic-gate 				lpl_cur->lpl_parent = lpl_root;
23690Sstevel@tonic-gate 				lpl_rset_add(lpl_root, lpl_cur);
23700Sstevel@tonic-gate 			}
23710Sstevel@tonic-gate 
23720Sstevel@tonic-gate 		}
23730Sstevel@tonic-gate 
23740Sstevel@tonic-gate 		/*
23750Sstevel@tonic-gate 		 * Now that we're done, make sure the count on the root lpl is
23760Sstevel@tonic-gate 		 * correct, and update the hints of the children for the sake of
23770Sstevel@tonic-gate 		 * thoroughness
23780Sstevel@tonic-gate 		 */
23790Sstevel@tonic-gate 		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
23800Sstevel@tonic-gate 			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
23810Sstevel@tonic-gate 		}
23820Sstevel@tonic-gate 		lpl_root->lpl_ncpu = sum;
23830Sstevel@tonic-gate 		lpl_child_update(lpl_root, cp);
23840Sstevel@tonic-gate 
23850Sstevel@tonic-gate 		cp = cp->cp_next;
23860Sstevel@tonic-gate 	} while (cp != cp_list_head);
23870Sstevel@tonic-gate 
23880Sstevel@tonic-gate 	return (levels);
23890Sstevel@tonic-gate }
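
/*
 * Illustrative sketch (not original source): a hypothetical caller reacting
 * to the return value, which is 0 when the requested flatten is not
 * supported and the number of levels flattened otherwise.
 */
#if 0
static void
lpl_flatten_example(int levels)
{
	if (lpl_topo_flatten(levels) == 0)
		cmn_err(CE_NOTE, "flatten to %d levels not supported", levels);
}
#endif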
23900Sstevel@tonic-gate 
23910Sstevel@tonic-gate /*
23920Sstevel@tonic-gate  * Insert a lpl into the resource hierarchy and create any additional lpls that
23930Sstevel@tonic-gate  * are necessary to represent the varying states of locality for the cpu
23940Sstevel@tonic-gate  * resources newly added to the partition.
23950Sstevel@tonic-gate  *
23960Sstevel@tonic-gate  * This routine is clever enough that it can correctly add resources from the
23970Sstevel@tonic-gate  * new leaf into both direct and indirect resource sets in the hierarchy.  (I.e.,
23980Sstevel@tonic-gate  * those for which the lpl is a leaf as opposed to simply a named equally local
23990Sstevel@tonic-gate  * resource).  The one special case that needs additional processing is when a
24000Sstevel@tonic-gate  * new intermediate lpl is introduced.  Since the main loop only traverses
24010Sstevel@tonic-gate  * looking to add the leaf resource where it does not yet exist, additional work
24020Sstevel@tonic-gate  * is necessary to add other leaf resources that may need to exist in the newly
24030Sstevel@tonic-gate  * created intermediate.  This is performed by the second inner loop, and is
24040Sstevel@tonic-gate  * only done when the check for more than one overlapping resource succeeds.
24050Sstevel@tonic-gate  */
24060Sstevel@tonic-gate 
24070Sstevel@tonic-gate void
24080Sstevel@tonic-gate lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
24090Sstevel@tonic-gate {
24100Sstevel@tonic-gate 	int		i;
24110Sstevel@tonic-gate 	int		j;
24120Sstevel@tonic-gate 	int		rset_num_intersect;
24130Sstevel@tonic-gate 	lgrp_t		*lgrp_cur;
24140Sstevel@tonic-gate 	lpl_t		*lpl_cur;
24150Sstevel@tonic-gate 	lpl_t		*lpl_parent;
24161892Sesaxe 	lgrp_id_t	parent_id;
24170Sstevel@tonic-gate 	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */
24180Sstevel@tonic-gate 
24190Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
24200Sstevel@tonic-gate 		lgrp_cur = lgrp_table[i];
24210Sstevel@tonic-gate 
24220Sstevel@tonic-gate 		/*
24230Sstevel@tonic-gate 		 * Don't insert if the lgrp isn't there, if the leaf isn't
24240Sstevel@tonic-gate 		 * contained within the current lgrp, or if the current lgrp has
24250Sstevel@tonic-gate 		 * no leaves in this partition
24260Sstevel@tonic-gate 		 */
24270Sstevel@tonic-gate 
24280Sstevel@tonic-gate 		if (!LGRP_EXISTS(lgrp_cur) ||
24290Sstevel@tonic-gate 		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
24300Sstevel@tonic-gate 		    lpl_leaf->lpl_lgrpid) ||
24310Sstevel@tonic-gate 		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
24320Sstevel@tonic-gate 		    cpupart->cp_lgrpset))
24330Sstevel@tonic-gate 			continue;
24340Sstevel@tonic-gate 
24350Sstevel@tonic-gate 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
24360Sstevel@tonic-gate 		if (lgrp_cur->lgrp_parent != NULL) {
24370Sstevel@tonic-gate 			/* if lgrp has a parent, assign it properly */
24380Sstevel@tonic-gate 			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
24390Sstevel@tonic-gate 			lpl_parent = &cpupart->cp_lgrploads[parent_id];
24400Sstevel@tonic-gate 		} else {
24410Sstevel@tonic-gate 			/* if not, make sure parent ptr gets set to null */
24420Sstevel@tonic-gate 			lpl_parent = NULL;
24430Sstevel@tonic-gate 		}
24440Sstevel@tonic-gate 
24450Sstevel@tonic-gate 		if (lpl_cur == lpl_leaf) {
24460Sstevel@tonic-gate 			/*
24470Sstevel@tonic-gate 			 * Almost all leaf state was initialized elsewhere.  The
24480Sstevel@tonic-gate 			 * only thing left to do is to set the parent.
24490Sstevel@tonic-gate 			 */
24500Sstevel@tonic-gate 			lpl_cur->lpl_parent = lpl_parent;
24510Sstevel@tonic-gate 			continue;
24520Sstevel@tonic-gate 		}
24530Sstevel@tonic-gate 
24540Sstevel@tonic-gate 		lpl_clear(lpl_cur);
24550Sstevel@tonic-gate 		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
24560Sstevel@tonic-gate 
24570Sstevel@tonic-gate 		lpl_cur->lpl_parent = lpl_parent;
24580Sstevel@tonic-gate 
24590Sstevel@tonic-gate 		/* does new lpl need to be populated with other resources? */
24600Sstevel@tonic-gate 		rset_intersect =
24610Sstevel@tonic-gate 		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
24628408SEric.Saxe@Sun.COM 		    cpupart->cp_lgrpset);
24630Sstevel@tonic-gate 		klgrpset_nlgrps(rset_intersect, rset_num_intersect);
24640Sstevel@tonic-gate 
24650Sstevel@tonic-gate 		if (rset_num_intersect > 1) {
24660Sstevel@tonic-gate 			/*
24670Sstevel@tonic-gate 			 * If so, figure out what lpls have resources that
24680Sstevel@tonic-gate 			 * intersect this one, and add them.
24690Sstevel@tonic-gate 			 */
24700Sstevel@tonic-gate 			for (j = 0; j <= lgrp_alloc_max; j++) {
24710Sstevel@tonic-gate 				lgrp_t	*lgrp_cand;	/* candidate lgrp */
24720Sstevel@tonic-gate 				lpl_t	*lpl_cand;	/* candidate lpl */
24730Sstevel@tonic-gate 
24740Sstevel@tonic-gate 				lgrp_cand = lgrp_table[j];
24750Sstevel@tonic-gate 				if (!LGRP_EXISTS(lgrp_cand) ||
24760Sstevel@tonic-gate 				    !klgrpset_ismember(rset_intersect,
24778408SEric.Saxe@Sun.COM 				    lgrp_cand->lgrp_id))
24780Sstevel@tonic-gate 					continue;
24790Sstevel@tonic-gate 				lpl_cand =
24800Sstevel@tonic-gate 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
24810Sstevel@tonic-gate 				lpl_rset_add(lpl_cur, lpl_cand);
24820Sstevel@tonic-gate 			}
24830Sstevel@tonic-gate 		}
24840Sstevel@tonic-gate 		/*
24850Sstevel@tonic-gate 		 * This lpl's rset has changed. Update the hint in its
24860Sstevel@tonic-gate 		 * children.
24870Sstevel@tonic-gate 		 */
24880Sstevel@tonic-gate 		lpl_child_update(lpl_cur, cpupart);
24890Sstevel@tonic-gate 	}
24900Sstevel@tonic-gate }
24910Sstevel@tonic-gate 
24920Sstevel@tonic-gate /*
24930Sstevel@tonic-gate  * remove a lpl from the hierarchy of resources, clearing its state when
24940Sstevel@tonic-gate  * finished.  If the lpls at the intermediate levels of the hierarchy have no
24950Sstevel@tonic-gate  * remaining resources, or no longer name a leaf resource in the cpu-partition,
24960Sstevel@tonic-gate  * delete them as well.
24970Sstevel@tonic-gate  */
24980Sstevel@tonic-gate 
24990Sstevel@tonic-gate void
25000Sstevel@tonic-gate lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
25010Sstevel@tonic-gate {
25020Sstevel@tonic-gate 	int		i;
25030Sstevel@tonic-gate 	lgrp_t		*lgrp_cur;
25040Sstevel@tonic-gate 	lpl_t		*lpl_cur;
25050Sstevel@tonic-gate 	klgrpset_t	leaf_intersect;	/* intersection of leaves */
25060Sstevel@tonic-gate 
25070Sstevel@tonic-gate 	for (i = 0; i <= lgrp_alloc_max; i++) {
25080Sstevel@tonic-gate 		lgrp_cur = lgrp_table[i];
25090Sstevel@tonic-gate 
25100Sstevel@tonic-gate 		/*
25110Sstevel@tonic-gate 		 * Don't attempt to remove from lgrps that aren't there, that
25120Sstevel@tonic-gate 		 * don't contain our leaf, or from the leaf itself. (We do that
25130Sstevel@tonic-gate 		 * later)
25140Sstevel@tonic-gate 		 */
25150Sstevel@tonic-gate 
25160Sstevel@tonic-gate 		if (!LGRP_EXISTS(lgrp_cur))
25170Sstevel@tonic-gate 			continue;
25180Sstevel@tonic-gate 
25190Sstevel@tonic-gate 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
25200Sstevel@tonic-gate 
25210Sstevel@tonic-gate 		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
25220Sstevel@tonic-gate 		    lpl_leaf->lpl_lgrpid) ||
25230Sstevel@tonic-gate 		    (lpl_cur == lpl_leaf)) {
25240Sstevel@tonic-gate 			continue;
25250Sstevel@tonic-gate 		}
25260Sstevel@tonic-gate 
25270Sstevel@tonic-gate 		/*
25280Sstevel@tonic-gate 		 * This is a slightly sleazy simplification in that we have
25290Sstevel@tonic-gate 		 * already marked the cp_lgrpset as no longer containing the
25300Sstevel@tonic-gate 		 * leaf we've deleted.  Any lpls that pass the above checks
25310Sstevel@tonic-gate 		 * based upon lgrp membership but not necessarily cpu-part
25320Sstevel@tonic-gate 		 * membership also get cleared by the checks below.  Currently
25330Sstevel@tonic-gate 		 * this is harmless, as the lpls should be empty anyway.
25340Sstevel@tonic-gate 		 *
25350Sstevel@tonic-gate 		 * In particular, we want to preserve lpls that have additional
25360Sstevel@tonic-gate 		 * leaf resources, even though we don't yet have a processor
25370Sstevel@tonic-gate 		 * architecture that represents resources this way.
25380Sstevel@tonic-gate 		 */
25390Sstevel@tonic-gate 
25400Sstevel@tonic-gate 		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
25410Sstevel@tonic-gate 		    cpupart->cp_lgrpset);
25420Sstevel@tonic-gate 
25430Sstevel@tonic-gate 		lpl_rset_del(lpl_cur, lpl_leaf);
25440Sstevel@tonic-gate 		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
25450Sstevel@tonic-gate 			lpl_clear(lpl_cur);
25460Sstevel@tonic-gate 		} else {
25470Sstevel@tonic-gate 			/*
25480Sstevel@tonic-gate 			 * Update this lpl's children
25490Sstevel@tonic-gate 			 */
25500Sstevel@tonic-gate 			lpl_child_update(lpl_cur, cpupart);
25510Sstevel@tonic-gate 		}
25520Sstevel@tonic-gate 	}
25530Sstevel@tonic-gate 	lpl_clear(lpl_leaf);
25540Sstevel@tonic-gate }
25550Sstevel@tonic-gate 
25560Sstevel@tonic-gate /*
25570Sstevel@tonic-gate  * add a cpu to a partition in terms of lgrp load avg bookkeeping
25580Sstevel@tonic-gate  *
25590Sstevel@tonic-gate  * The lpl (cpu partition load average information) is now arranged in a
25600Sstevel@tonic-gate  * hierarchical fashion whereby resources that are closest, ie. most local, to
25610Sstevel@tonic-gate  * the cpu in question are considered to be leaves in a tree of resources.
25620Sstevel@tonic-gate  * There are two general cases for cpu addition:
25630Sstevel@tonic-gate  *
25640Sstevel@tonic-gate  * 1. A lpl structure that contains resources already in the hierarchy tree.
25650Sstevel@tonic-gate  * In this case, all of the associated lpl relationships have been defined, and
25660Sstevel@tonic-gate  * all that is necessary is that we link the new cpu into the per-lpl list of
25670Sstevel@tonic-gate  * cpus, and increment the ncpu count of all places where this cpu resource will
25680Sstevel@tonic-gate  * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
25690Sstevel@tonic-gate  * pushing is accomplished by this routine.
25700Sstevel@tonic-gate  *
25710Sstevel@tonic-gate  * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
25720Sstevel@tonic-gate  * not exist yet.  In this case, it is necessary to build the leaf lpl, and
25730Sstevel@tonic-gate  * construct the hierarchy of state necessary to name its more distant
25740Sstevel@tonic-gate  * resources, if they should exist.  The leaf structure is initialized by this
25750Sstevel@tonic-gate  * routine, as is the cpu-partition state for the lgrp membership.  This routine
25760Sstevel@tonic-gate  * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
25770Sstevel@tonic-gate  * and builds all of the "ancestral" state necessary to identify resources at
25780Sstevel@tonic-gate  * differing levels of locality.
25790Sstevel@tonic-gate  */
25800Sstevel@tonic-gate void
25810Sstevel@tonic-gate lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
25820Sstevel@tonic-gate {
25830Sstevel@tonic-gate 	cpupart_t	*cpupart;
25840Sstevel@tonic-gate 	lgrp_t		*lgrp_leaf;
25850Sstevel@tonic-gate 	lpl_t		*lpl_leaf;
25860Sstevel@tonic-gate 
25870Sstevel@tonic-gate 	/* called sometimes w/ cpus paused - grab no locks */
25880Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
25890Sstevel@tonic-gate 
25900Sstevel@tonic-gate 	cpupart = cp->cpu_part;
25910Sstevel@tonic-gate 	lgrp_leaf = lgrp_table[lgrpid];
25920Sstevel@tonic-gate 
25930Sstevel@tonic-gate 	/* don't add non-existent lgrp */
25940Sstevel@tonic-gate 	ASSERT(LGRP_EXISTS(lgrp_leaf));
25950Sstevel@tonic-gate 	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
25960Sstevel@tonic-gate 	cp->cpu_lpl = lpl_leaf;
25970Sstevel@tonic-gate 
25980Sstevel@tonic-gate 	/* only leaf lpls contain cpus */
25990Sstevel@tonic-gate 
26000Sstevel@tonic-gate 	if (lpl_leaf->lpl_ncpu++ == 0) {
26010Sstevel@tonic-gate 		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
26020Sstevel@tonic-gate 		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
26030Sstevel@tonic-gate 		lpl_leaf_insert(lpl_leaf, cpupart);
26040Sstevel@tonic-gate 	} else {
26050Sstevel@tonic-gate 		/*
26060Sstevel@tonic-gate 		 * the lpl should already exist in the parent, so just update
26070Sstevel@tonic-gate 		 * the count of available CPUs
26080Sstevel@tonic-gate 		 */
26090Sstevel@tonic-gate 		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
26100Sstevel@tonic-gate 	}
26110Sstevel@tonic-gate 
26120Sstevel@tonic-gate 	/* link cpu into list of cpus in lpl */
26130Sstevel@tonic-gate 
26140Sstevel@tonic-gate 	if (lpl_leaf->lpl_cpus) {
26150Sstevel@tonic-gate 		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
26160Sstevel@tonic-gate 		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
26170Sstevel@tonic-gate 		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
26180Sstevel@tonic-gate 		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
26190Sstevel@tonic-gate 	} else {
26200Sstevel@tonic-gate 		/*
26210Sstevel@tonic-gate 		 * We increment ncpu immediately after we create a new leaf
26220Sstevel@tonic-gate 		 * lpl, so assert that ncpu == 1 for the case where we don't
26230Sstevel@tonic-gate 		 * have any cpu pointers yet.
26240Sstevel@tonic-gate 		 */
26250Sstevel@tonic-gate 		ASSERT(lpl_leaf->lpl_ncpu == 1);
26260Sstevel@tonic-gate 		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
26270Sstevel@tonic-gate 	}
26280Sstevel@tonic-gate 
26290Sstevel@tonic-gate }
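
/*
 * Illustrative sketch (not original source): the lpl_cpus list built above
 * is a circular, doubly-linked list threaded through cpu_next_lpl and
 * cpu_prev_lpl, so a walk ends when it returns to its starting point.  The
 * helper name is hypothetical.
 */
#if 0
static int
lpl_count_cpus_example(lpl_t *lpl)
{
	cpu_t	*cp, *start;
	int	n = 0;

	if ((cp = start = lpl->lpl_cpus) == NULL)
		return (0);
	do {
		n++;
		cp = cp->cpu_next_lpl;
	} while (cp != start);
	return (n);	/* should equal lpl->lpl_ncpu */
}
#endif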
26300Sstevel@tonic-gate 
26310Sstevel@tonic-gate 
26320Sstevel@tonic-gate /*
26330Sstevel@tonic-gate  * remove a cpu from a partition in terms of lgrp load avg bookkeeping
26340Sstevel@tonic-gate  *
26350Sstevel@tonic-gate  * The lpl (cpu partition load average information) is now arranged in a
26360Sstevel@tonic-gate  * hierarchical fashion whereby resources that are closest, ie. most local, to
26370Sstevel@tonic-gate  * the cpu in question are considered to be leaves in a tree of resources.
26380Sstevel@tonic-gate  * There are two removal cases in question:
26390Sstevel@tonic-gate  *
26400Sstevel@tonic-gate  * 1. Removal of the resource in the leaf leaves other resources remaining in
26410Sstevel@tonic-gate  * that leaf.  (Another cpu still exists at this level of locality).  In this
26420Sstevel@tonic-gate  * case, the count of available cpus is decremented in all associated lpls by
26430Sstevel@tonic-gate  * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned
26440Sstevel@tonic-gate  * from the lpl's list of cpus.
26450Sstevel@tonic-gate  *
26460Sstevel@tonic-gate  * 2. Removal of the resource results in the lpl containing no resources.  (It's
26470Sstevel@tonic-gate  * empty)  In this case, all of what has occurred for the first step must take
26480Sstevel@tonic-gate  * place; however, additionally we must remove the lpl structure itself, prune
26490Sstevel@tonic-gate  * out any stranded lpls that do not directly name a leaf resource, and mark the
26500Sstevel@tonic-gate  * cpu partition in question as no longer containing resources from the lgrp of
26510Sstevel@tonic-gate  * the lpl that has been deleted.  Cpu-partition changes are handled by this
26520Sstevel@tonic-gate  * method, but the lpl_leaf_remove function deals with the details of pruning
26530Sstevel@tonic-gate  * out the empty lpl and any of its orphaned direct ancestors.
26540Sstevel@tonic-gate  */
26550Sstevel@tonic-gate void
26560Sstevel@tonic-gate lgrp_part_del_cpu(cpu_t *cp)
26570Sstevel@tonic-gate {
26580Sstevel@tonic-gate 	lpl_t		*lpl;
26590Sstevel@tonic-gate 	lpl_t		*leaf_lpl;
26600Sstevel@tonic-gate 	lgrp_t		*lgrp_leaf;
26610Sstevel@tonic-gate 
26620Sstevel@tonic-gate 	/* called sometimes w/ cpus paused - grab no locks */
26630Sstevel@tonic-gate 
26640Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
26650Sstevel@tonic-gate 
26660Sstevel@tonic-gate 	lpl = leaf_lpl = cp->cpu_lpl;
26670Sstevel@tonic-gate 	lgrp_leaf = leaf_lpl->lpl_lgrp;
26680Sstevel@tonic-gate 
26690Sstevel@tonic-gate 	/* don't delete a leaf that isn't there */
26700Sstevel@tonic-gate 	ASSERT(LGRP_EXISTS(lgrp_leaf));
26710Sstevel@tonic-gate 
26720Sstevel@tonic-gate 	/* no double-deletes */
26730Sstevel@tonic-gate 	ASSERT(lpl->lpl_ncpu);
26740Sstevel@tonic-gate 	if (--lpl->lpl_ncpu == 0) {
26750Sstevel@tonic-gate 		/*
26760Sstevel@tonic-gate 		 * This was the last cpu in this lgroup for this partition,
26770Sstevel@tonic-gate 		 * clear its bit in the partition's lgroup bitmask
26780Sstevel@tonic-gate 		 */
26790Sstevel@tonic-gate 		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
26800Sstevel@tonic-gate 
26810Sstevel@tonic-gate 		/* eliminate remaining lpl link pointers in cpu, lpl */
26820Sstevel@tonic-gate 		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
26830Sstevel@tonic-gate 
26840Sstevel@tonic-gate 		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
26850Sstevel@tonic-gate 	} else {
26860Sstevel@tonic-gate 
26870Sstevel@tonic-gate 		/* unlink cpu from lists of cpus in lpl */
26880Sstevel@tonic-gate 		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
26890Sstevel@tonic-gate 		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
26900Sstevel@tonic-gate 		if (lpl->lpl_cpus == cp) {
26910Sstevel@tonic-gate 			lpl->lpl_cpus = cp->cpu_next_lpl;
26920Sstevel@tonic-gate 		}
26930Sstevel@tonic-gate 
26940Sstevel@tonic-gate 		/*
26950Sstevel@tonic-gate 		 * Update the cpu count in the lpls associated with parent
26960Sstevel@tonic-gate 		 * lgroups.
26970Sstevel@tonic-gate 		 */
26980Sstevel@tonic-gate 		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
26990Sstevel@tonic-gate 
27000Sstevel@tonic-gate 	}
27010Sstevel@tonic-gate 	/* clear cpu's lpl ptr when we're all done */
27020Sstevel@tonic-gate 	cp->cpu_lpl = NULL;
27030Sstevel@tonic-gate }
27040Sstevel@tonic-gate 
27050Sstevel@tonic-gate /*
27060Sstevel@tonic-gate  * Recompute load average for the specified partition/lgrp fragment.
27070Sstevel@tonic-gate  *
27080Sstevel@tonic-gate  * We rely on the fact that this routine is called from the clock thread
27090Sstevel@tonic-gate  * at a point before the clock thread can block (i.e. before its first
27100Sstevel@tonic-gate  * lock request).  Since the clock thread can not be preempted (since it
27110Sstevel@tonic-gate  * runs at highest priority), we know that cpu partitions can not change
27120Sstevel@tonic-gate  * (since doing so would require either the repartition requester or the
27130Sstevel@tonic-gate  * cpu_pause thread to run on this cpu), so we can update the cpu's load
27140Sstevel@tonic-gate  * without grabbing cpu_lock.
27150Sstevel@tonic-gate  */
27160Sstevel@tonic-gate void
27170Sstevel@tonic-gate lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
27180Sstevel@tonic-gate {
27190Sstevel@tonic-gate 	uint_t		ncpu;
27200Sstevel@tonic-gate 	int64_t		old, new, f;
27210Sstevel@tonic-gate 
27220Sstevel@tonic-gate 	/*
27230Sstevel@tonic-gate 	 * (1 - exp(-1/(20 * ncpu))) << 16 ~= 3196 for 1 cpu...
27240Sstevel@tonic-gate 	 */
27250Sstevel@tonic-gate 	static short expval[] = {
27260Sstevel@tonic-gate 	    0, 3196, 1618, 1083,
27270Sstevel@tonic-gate 	    814, 652, 543, 466,
27280Sstevel@tonic-gate 	    408, 363, 326, 297,
27290Sstevel@tonic-gate 	    272, 251, 233, 218,
27300Sstevel@tonic-gate 	    204, 192, 181, 172,
27310Sstevel@tonic-gate 	    163, 155, 148, 142,
27320Sstevel@tonic-gate 	    136, 130, 125, 121,
27330Sstevel@tonic-gate 	    116, 112, 109, 105
27340Sstevel@tonic-gate 	};
27350Sstevel@tonic-gate 
27360Sstevel@tonic-gate 	/* ASSERT (called from clock level) */
27370Sstevel@tonic-gate 
27380Sstevel@tonic-gate 	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
27390Sstevel@tonic-gate 	    ((ncpu = lpl->lpl_ncpu) == 0)) {
27400Sstevel@tonic-gate 		return;
27410Sstevel@tonic-gate 	}
27420Sstevel@tonic-gate 
27430Sstevel@tonic-gate 	for (;;) {
27440Sstevel@tonic-gate 
27450Sstevel@tonic-gate 		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
27460Sstevel@tonic-gate 			f = expval[1]/ncpu; /* good approx. for large ncpu */
27470Sstevel@tonic-gate 		else
27480Sstevel@tonic-gate 			f = expval[ncpu];
27490Sstevel@tonic-gate 
27500Sstevel@tonic-gate 		/*
27510Sstevel@tonic-gate 		 * Modify the load average atomically to avoid losing
27520Sstevel@tonic-gate 		 * anticipatory load updates (see lgrp_move_thread()).
27530Sstevel@tonic-gate 		 */
27540Sstevel@tonic-gate 		if (ageflag) {
27550Sstevel@tonic-gate 			/*
27560Sstevel@tonic-gate 			 * We're supposed to both update and age the load.
27570Sstevel@tonic-gate 			 * This happens 10 times/sec. per cpu.  We do a
27580Sstevel@tonic-gate 			 * little hoop-jumping to avoid integer overflow.
27590Sstevel@tonic-gate 			 */
27600Sstevel@tonic-gate 			int64_t		q, r;
27610Sstevel@tonic-gate 
27620Sstevel@tonic-gate 			do {
27630Sstevel@tonic-gate 				old = new = lpl->lpl_loadavg;
27640Sstevel@tonic-gate 				q = (old  >> 16) << 7;
27650Sstevel@tonic-gate 				r = (old  & 0xffff) << 7;
27660Sstevel@tonic-gate 				new += ((long long)(nrcpus - q) * f -
27670Sstevel@tonic-gate 				    ((r * f) >> 16)) >> 7;
27680Sstevel@tonic-gate 
27690Sstevel@tonic-gate 				/*
27700Sstevel@tonic-gate 				 * Check for overflow
27710Sstevel@tonic-gate 				 */
27720Sstevel@tonic-gate 				if (new > LGRP_LOADAVG_MAX)
27730Sstevel@tonic-gate 					new = LGRP_LOADAVG_MAX;
27740Sstevel@tonic-gate 				else if (new < 0)
27750Sstevel@tonic-gate 					new = 0;
27760Sstevel@tonic-gate 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
27770Sstevel@tonic-gate 			    new) != old);
27780Sstevel@tonic-gate 		} else {
27790Sstevel@tonic-gate 			/*
27800Sstevel@tonic-gate 			 * We're supposed to update the load, but not age it.
27810Sstevel@tonic-gate 			 * This option is used to update the load (which either
27820Sstevel@tonic-gate 			 * has already been aged in this 1/10 sec. interval or
27830Sstevel@tonic-gate 			 * soon will be) to account for a remotely executing
27840Sstevel@tonic-gate 			 * thread.
27850Sstevel@tonic-gate 			 */
27860Sstevel@tonic-gate 			do {
27870Sstevel@tonic-gate 				old = new = lpl->lpl_loadavg;
27880Sstevel@tonic-gate 				new += f;
27890Sstevel@tonic-gate 				/*
27900Sstevel@tonic-gate 				 * Check for overflow
27910Sstevel@tonic-gate 				 * Underflow not possible here
27920Sstevel@tonic-gate 				 */
27930Sstevel@tonic-gate 				if (new < old)
27940Sstevel@tonic-gate 					new = LGRP_LOADAVG_MAX;
27950Sstevel@tonic-gate 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
27960Sstevel@tonic-gate 			    new) != old);
27970Sstevel@tonic-gate 		}
27980Sstevel@tonic-gate 
27990Sstevel@tonic-gate 		/*
28000Sstevel@tonic-gate 		 * Do the same for this lpl's parent
28010Sstevel@tonic-gate 		 */
28020Sstevel@tonic-gate 		if ((lpl = lpl->lpl_parent) == NULL)
28030Sstevel@tonic-gate 			break;
28040Sstevel@tonic-gate 		ncpu = lpl->lpl_ncpu;
28050Sstevel@tonic-gate 	}
28060Sstevel@tonic-gate }
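
/*
 * Worked example (illustrative, not original source) of one aging step of
 * the fixed-point update above, for a 1-cpu lpl (f = expval[1] = 3196,
 * i.e. f / 2^16 ~= 1 - exp(-1/20)) with no new load to add (nrcpus = 0):
 *
 *	old = 0x10000			load of 1.0 in 16.16 fixed point
 *	q   = (old >> 16) << 7	      = 128
 *	r   = (old & 0xffff) << 7     = 0
 *	new = old + (((0 - q) * f - ((r * f) >> 16)) >> 7)
 *	    = 0x10000 - (128 * 3196) / 128
 *	    = 0x10000 - 3196	      = 62340
 *
 * Each aging step thus decays the load by old * f / 2^16, giving an
 * exponential decay with a time constant of about 20 aging intervals.
 */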
28070Sstevel@tonic-gate 
28080Sstevel@tonic-gate /*
28090Sstevel@tonic-gate  * Initialize lpl topology in the target based on topology currently present in
28100Sstevel@tonic-gate  * lpl_bootstrap.
28110Sstevel@tonic-gate  *
28120Sstevel@tonic-gate  * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
28130Sstevel@tonic-gate  * initialize cp_default list of lpls. Up to this point all topology operations
28140Sstevel@tonic-gate  * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
28150Sstevel@tonic-gate  * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
28160Sstevel@tonic-gate  * `target' points to the list of lpls in cp_default and `size' is the size of
28170Sstevel@tonic-gate  * this list.
28180Sstevel@tonic-gate  *
28190Sstevel@tonic-gate  * This function walks the lpl topology in lpl_bootstrap and does four things:
28200Sstevel@tonic-gate  *
28210Sstevel@tonic-gate  * 1) Copies all fields from lpl_bootstrap to the target.
28220Sstevel@tonic-gate  *
28230Sstevel@tonic-gate  * 2) Sets CPU0 lpl pointer to the correct element of the target list.
28240Sstevel@tonic-gate  *
28250Sstevel@tonic-gate  * 3) Updates lpl_parent pointers to point to the lpls in the target list
28260Sstevel@tonic-gate  *    instead of lpl_bootstrap.
28270Sstevel@tonic-gate  *
28280Sstevel@tonic-gate  * 4) Updates pointers in the resource list of the target to point to the lpls
28290Sstevel@tonic-gate  *    in the target list instead of lpl_bootstrap.
28300Sstevel@tonic-gate  *
28310Sstevel@tonic-gate  * After lpl_topo_bootstrap() completes, target contains the same information
28320Sstevel@tonic-gate  * that would be present there if it were used during boot instead of
28330Sstevel@tonic-gate  * lpl_bootstrap. The information in lpl_bootstrap is not needed after this,
28340Sstevel@tonic-gate  * so it is bzeroed.
28350Sstevel@tonic-gate  */
28360Sstevel@tonic-gate void
28370Sstevel@tonic-gate lpl_topo_bootstrap(lpl_t *target, int size)
28380Sstevel@tonic-gate {
28390Sstevel@tonic-gate 	lpl_t	*lpl = lpl_bootstrap;
28400Sstevel@tonic-gate 	lpl_t	*target_lpl = target;
28418408SEric.Saxe@Sun.COM 	lpl_t	**rset;
28428408SEric.Saxe@Sun.COM 	int	*id2rset;
28438408SEric.Saxe@Sun.COM 	int	sz;
28440Sstevel@tonic-gate 	int	howmany;
28450Sstevel@tonic-gate 	int	id;
28460Sstevel@tonic-gate 	int	i;
28470Sstevel@tonic-gate 
28480Sstevel@tonic-gate 	/*
28490Sstevel@tonic-gate 	 * The only target that should be passed here is cp_default lpl list.
28500Sstevel@tonic-gate 	 */
28510Sstevel@tonic-gate 	ASSERT(target == cp_default.cp_lgrploads);
28520Sstevel@tonic-gate 	ASSERT(size == cp_default.cp_nlgrploads);
28530Sstevel@tonic-gate 	ASSERT(!lgrp_topo_initialized);
28540Sstevel@tonic-gate 	ASSERT(ncpus == 1);
28550Sstevel@tonic-gate 
28560Sstevel@tonic-gate 	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
28570Sstevel@tonic-gate 	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
28580Sstevel@tonic-gate 		/*
28598408SEric.Saxe@Sun.COM 		 * Copy all fields from lpl, except for the rset,
28608408SEric.Saxe@Sun.COM 		 * lgrp id <=> rset mapping storage,
28618408SEric.Saxe@Sun.COM 		 * and amount of storage
28620Sstevel@tonic-gate 		 */
28638408SEric.Saxe@Sun.COM 		rset = target_lpl->lpl_rset;
28648408SEric.Saxe@Sun.COM 		id2rset = target_lpl->lpl_id2rset;
28658408SEric.Saxe@Sun.COM 		sz = target_lpl->lpl_rset_sz;
28660Sstevel@tonic-gate 
28670Sstevel@tonic-gate 		*target_lpl = *lpl;
28680Sstevel@tonic-gate 
28698408SEric.Saxe@Sun.COM 		target_lpl->lpl_rset_sz = sz;
28708408SEric.Saxe@Sun.COM 		target_lpl->lpl_rset = rset;
28718408SEric.Saxe@Sun.COM 		target_lpl->lpl_id2rset = id2rset;
28728408SEric.Saxe@Sun.COM 
28730Sstevel@tonic-gate 		/*
28740Sstevel@tonic-gate 		 * Substitute CPU0 lpl pointer with one relative to target.
28750Sstevel@tonic-gate 		 */
28760Sstevel@tonic-gate 		if (lpl->lpl_cpus == CPU) {
28770Sstevel@tonic-gate 			ASSERT(CPU->cpu_lpl == lpl);
28780Sstevel@tonic-gate 			CPU->cpu_lpl = target_lpl;
28790Sstevel@tonic-gate 		}
28800Sstevel@tonic-gate 
28810Sstevel@tonic-gate 		/*
28820Sstevel@tonic-gate 		 * Substitute parent information with parent relative to target.
28830Sstevel@tonic-gate 		 */
28840Sstevel@tonic-gate 		if (lpl->lpl_parent != NULL)
28850Sstevel@tonic-gate 			target_lpl->lpl_parent = (lpl_t *)
28860Sstevel@tonic-gate 			    (((uintptr_t)lpl->lpl_parent -
28878408SEric.Saxe@Sun.COM 			    (uintptr_t)lpl_bootstrap) +
28888408SEric.Saxe@Sun.COM 			    (uintptr_t)target);
28890Sstevel@tonic-gate 
28900Sstevel@tonic-gate 		/*
28910Sstevel@tonic-gate 		 * Walk over the resource set, substituting pointers relative
28920Sstevel@tonic-gate 		 * to lpl_bootstrap's rset with pointers relative to target's
28930Sstevel@tonic-gate 		 */
28940Sstevel@tonic-gate 		ASSERT(lpl->lpl_nrset <= 1);
28950Sstevel@tonic-gate 
28960Sstevel@tonic-gate 		for (id = 0; id < lpl->lpl_nrset; id++) {
28970Sstevel@tonic-gate 			if (lpl->lpl_rset[id] != NULL) {
28988408SEric.Saxe@Sun.COM 				target_lpl->lpl_rset[id] = (lpl_t *)
28990Sstevel@tonic-gate 				    (((uintptr_t)lpl->lpl_rset[id] -
29008408SEric.Saxe@Sun.COM 				    (uintptr_t)lpl_bootstrap) +
29018408SEric.Saxe@Sun.COM 				    (uintptr_t)target);
29020Sstevel@tonic-gate 			}
29038408SEric.Saxe@Sun.COM 			target_lpl->lpl_id2rset[id] =
29048408SEric.Saxe@Sun.COM 			    lpl->lpl_id2rset[id];
29050Sstevel@tonic-gate 		}
29060Sstevel@tonic-gate 	}
29070Sstevel@tonic-gate 
29080Sstevel@tonic-gate 	/*
29098408SEric.Saxe@Sun.COM 	 * Clean up the bootstrap lpls since we have switched over to the
29108408SEric.Saxe@Sun.COM 	 * actual lpl array in the default cpu partition.
29118408SEric.Saxe@Sun.COM 	 *
29128408SEric.Saxe@Sun.COM 	 * We still need to keep one empty lpl around for newly starting
29138408SEric.Saxe@Sun.COM 	 * slave CPUs to reference should they need to make it through the
29148408SEric.Saxe@Sun.COM 	 * dispatcher prior to their lgrp/lpl initialization.
29158408SEric.Saxe@Sun.COM 	 *
29168408SEric.Saxe@Sun.COM 	 * The lpl related dispatcher code has been designed to work properly
29178408SEric.Saxe@Sun.COM 	 * (and without extra checks) for this special case of a zero'ed
29188408SEric.Saxe@Sun.COM 	 * bootstrap lpl. Such an lpl appears to the dispatcher as an lpl
29198408SEric.Saxe@Sun.COM 	 * with lgrpid 0 and an empty resource set. Iteration over the rset
29208408SEric.Saxe@Sun.COM 	 * array by the dispatcher is also NULL terminated for this reason.
29218408SEric.Saxe@Sun.COM 	 *
29228408SEric.Saxe@Sun.COM 	 * This provides the desired behaviour for an uninitialized CPU.
29238408SEric.Saxe@Sun.COM 	 * It shouldn't see any other CPU to either dispatch to or steal
29248408SEric.Saxe@Sun.COM 	 * from until it is properly initialized.
29250Sstevel@tonic-gate 	 */
29260Sstevel@tonic-gate 	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
29278408SEric.Saxe@Sun.COM 	bzero(lpl_bootstrap_id2rset, sizeof (lpl_bootstrap_id2rset));
29288408SEric.Saxe@Sun.COM 	bzero(lpl_bootstrap_rset, sizeof (lpl_bootstrap_rset));
29298408SEric.Saxe@Sun.COM 
29308408SEric.Saxe@Sun.COM 	lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
29318408SEric.Saxe@Sun.COM 	lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
29320Sstevel@tonic-gate }
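
/*
 * Illustrative sketch (not original source) of the pointer relocation idiom
 * used above: a pointer into the lpl_bootstrap array is rebased into the
 * target array by preserving its byte offset.  The helper name is
 * hypothetical.
 */
#if 0
static lpl_t *
lpl_rebase_example(lpl_t *old_ptr, lpl_t *old_base, lpl_t *new_base)
{
	return ((lpl_t *)(((uintptr_t)old_ptr - (uintptr_t)old_base) +
	    (uintptr_t)new_base));
}
#endif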
29330Sstevel@tonic-gate 
29340Sstevel@tonic-gate /*
29350Sstevel@tonic-gate  * If the lowest load among the lgroups a process' threads are currently
29360Sstevel@tonic-gate  * spread across is greater than lgrp_expand_proc_thresh, we'll consider
29370Sstevel@tonic-gate  * expanding the process to a new lgroup.
29380Sstevel@tonic-gate  */
29390Sstevel@tonic-gate #define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
29400Sstevel@tonic-gate lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
29410Sstevel@tonic-gate 
29420Sstevel@tonic-gate #define	LGRP_EXPAND_PROC_THRESH(ncpu) \
29430Sstevel@tonic-gate 	((lgrp_expand_proc_thresh) / (ncpu))
29440Sstevel@tonic-gate 
29450Sstevel@tonic-gate /*
29460Sstevel@tonic-gate  * A process will be expanded to a new lgroup only if the difference between
29470Sstevel@tonic-gate  * the lowest load on the lgroups the process' thread's are currently spread
29480Sstevel@tonic-gate  * across and the lowest load on the other lgroups in the process' partition
29490Sstevel@tonic-gate  * is greater than lgrp_expand_proc_diff.
29500Sstevel@tonic-gate  */
29510Sstevel@tonic-gate #define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
29520Sstevel@tonic-gate lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
29530Sstevel@tonic-gate 
29540Sstevel@tonic-gate #define	LGRP_EXPAND_PROC_DIFF(ncpu) \
29550Sstevel@tonic-gate 	((lgrp_expand_proc_diff) / (ncpu))
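
/*
 * Worked example (illustrative, not original source): with the defaults
 * above and 4-cpu lgroups, lgrp_choose() below expands a process to a new
 * lgroup only when the least loaded lgroup it already runs in has a load
 * above 62250 / 4 = 15562 and the best other lgroup is loaded at least
 * 60000 / 4 = 15000 below that.
 */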
29560Sstevel@tonic-gate 
29570Sstevel@tonic-gate /*
29580Sstevel@tonic-gate  * The loadavg tolerance accounts for "noise" inherent in the load, which may
29590Sstevel@tonic-gate  * be present due to impreciseness of the load average decay algorithm.
29600Sstevel@tonic-gate  *
29610Sstevel@tonic-gate  * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
29620Sstevel@tonic-gate  * tolerance is scaled by the number of cpus in the lgroup just like
29630Sstevel@tonic-gate  * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
29640Sstevel@tonic-gate  * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
29650Sstevel@tonic-gate  * of: 0x10000 / 4 => 0x4000 or greater to be significant.
29660Sstevel@tonic-gate  */
29670Sstevel@tonic-gate uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
29680Sstevel@tonic-gate #define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
29690Sstevel@tonic-gate 	((lgrp_loadavg_tolerance) / ncpu)
29700Sstevel@tonic-gate 
29710Sstevel@tonic-gate /*
29720Sstevel@tonic-gate  * lgrp_choose() will choose root lgroup as home when lowest lgroup load
29730Sstevel@tonic-gate  * average is above this threshold
29740Sstevel@tonic-gate  */
29750Sstevel@tonic-gate uint32_t	lgrp_load_thresh = UINT32_MAX;
29760Sstevel@tonic-gate 
29770Sstevel@tonic-gate /*
29780Sstevel@tonic-gate  * lgrp_choose() will try to skip any lgroups with less memory
29790Sstevel@tonic-gate  * than this free when choosing a home lgroup
29800Sstevel@tonic-gate  */
29810Sstevel@tonic-gate pgcnt_t	lgrp_mem_free_thresh = 0;
29820Sstevel@tonic-gate 
29830Sstevel@tonic-gate /*
29840Sstevel@tonic-gate  * When choosing between similarly loaded lgroups, lgrp_choose() will pick
29850Sstevel@tonic-gate  * one based on one of the following policies:
29860Sstevel@tonic-gate  * - Random selection
29870Sstevel@tonic-gate  * - Pseudo round robin placement
29880Sstevel@tonic-gate  * - Longest time since a thread was last placed
29890Sstevel@tonic-gate  */
29900Sstevel@tonic-gate #define	LGRP_CHOOSE_RANDOM	1
29910Sstevel@tonic-gate #define	LGRP_CHOOSE_RR		2
29920Sstevel@tonic-gate #define	LGRP_CHOOSE_TIME	3
29930Sstevel@tonic-gate 
29940Sstevel@tonic-gate int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
29950Sstevel@tonic-gate 
29960Sstevel@tonic-gate /*
29970Sstevel@tonic-gate  * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
29980Sstevel@tonic-gate  * be bound to a CPU or processor set.
29990Sstevel@tonic-gate  *
30000Sstevel@tonic-gate  * Arguments:
30010Sstevel@tonic-gate  *	t		The thread
30020Sstevel@tonic-gate  *	cpupart		The partition the thread belongs to.
30030Sstevel@tonic-gate  *
30040Sstevel@tonic-gate  * NOTE: Should at least be called with the cpu_lock held, kernel preemption
30050Sstevel@tonic-gate  *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
30060Sstevel@tonic-gate  *	 partitions changing out from under us and assumes that given thread is
30070Sstevel@tonic-gate  *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
30080Sstevel@tonic-gate  *	 disabled, so don't grab any locks because we should never block under
30090Sstevel@tonic-gate  *	 those conditions.
30100Sstevel@tonic-gate  */
30110Sstevel@tonic-gate lpl_t *
30120Sstevel@tonic-gate lgrp_choose(kthread_t *t, cpupart_t *cpupart)
30130Sstevel@tonic-gate {
30140Sstevel@tonic-gate 	lgrp_load_t	bestload, bestrload;
30150Sstevel@tonic-gate 	int		lgrpid_offset, lgrp_count;
30160Sstevel@tonic-gate 	lgrp_id_t	lgrpid, lgrpid_start;
30170Sstevel@tonic-gate 	lpl_t		*lpl, *bestlpl, *bestrlpl;
30180Sstevel@tonic-gate 	klgrpset_t	lgrpset;
30190Sstevel@tonic-gate 	proc_t		*p;
30200Sstevel@tonic-gate 
30210Sstevel@tonic-gate 	ASSERT(t != NULL);
30220Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
30230Sstevel@tonic-gate 	    THREAD_LOCK_HELD(t));
30240Sstevel@tonic-gate 	ASSERT(cpupart != NULL);
30250Sstevel@tonic-gate 
30260Sstevel@tonic-gate 	p = t->t_procp;
30270Sstevel@tonic-gate 
30280Sstevel@tonic-gate 	/* A process should always be in an active partition */
30290Sstevel@tonic-gate 	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
30300Sstevel@tonic-gate 
30310Sstevel@tonic-gate 	bestlpl = bestrlpl = NULL;
30320Sstevel@tonic-gate 	bestload = bestrload = LGRP_LOADAVG_MAX;
30330Sstevel@tonic-gate 	lgrpset = cpupart->cp_lgrpset;
30340Sstevel@tonic-gate 
30350Sstevel@tonic-gate 	switch (lgrp_choose_policy) {
30360Sstevel@tonic-gate 	case LGRP_CHOOSE_RR:
30370Sstevel@tonic-gate 		lgrpid = cpupart->cp_lgrp_hint;
30380Sstevel@tonic-gate 		do {
30390Sstevel@tonic-gate 			if (++lgrpid > lgrp_alloc_max)
30400Sstevel@tonic-gate 				lgrpid = 0;
30410Sstevel@tonic-gate 		} while (!klgrpset_ismember(lgrpset, lgrpid));
30420Sstevel@tonic-gate 
30430Sstevel@tonic-gate 		break;
30440Sstevel@tonic-gate 	default:
30450Sstevel@tonic-gate 	case LGRP_CHOOSE_TIME:
30460Sstevel@tonic-gate 	case LGRP_CHOOSE_RANDOM:
30470Sstevel@tonic-gate 		klgrpset_nlgrps(lgrpset, lgrp_count);
30480Sstevel@tonic-gate 		lgrpid_offset =
30490Sstevel@tonic-gate 		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
30500Sstevel@tonic-gate 		for (lgrpid = 0; ; lgrpid++) {
30510Sstevel@tonic-gate 			if (klgrpset_ismember(lgrpset, lgrpid)) {
30520Sstevel@tonic-gate 				if (--lgrpid_offset == 0)
30530Sstevel@tonic-gate 					break;
30540Sstevel@tonic-gate 			}
30550Sstevel@tonic-gate 		}
30560Sstevel@tonic-gate 		break;
30570Sstevel@tonic-gate 	}
30580Sstevel@tonic-gate 
30590Sstevel@tonic-gate 	lgrpid_start = lgrpid;
30600Sstevel@tonic-gate 
30610Sstevel@tonic-gate 	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
30620Sstevel@tonic-gate 	    lgrp_id_t, cpupart->cp_lgrp_hint);
30630Sstevel@tonic-gate 
30640Sstevel@tonic-gate 	/*
30650Sstevel@tonic-gate 	 * Use lgroup affinities (if any) to choose best lgroup
30660Sstevel@tonic-gate 	 *
30670Sstevel@tonic-gate 	 * NOTE: Assumes that thread is protected from going away and its
30680Sstevel@tonic-gate 	 *	 lgroup affinities won't change (ie. p_lock, or
30690Sstevel@tonic-gate 	 *	 thread_lock() being held and/or CPUs paused)
30700Sstevel@tonic-gate 	 */
30710Sstevel@tonic-gate 	if (t->t_lgrp_affinity) {
30722988Sjjc 		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE);
30730Sstevel@tonic-gate 		if (lpl != NULL)
30740Sstevel@tonic-gate 			return (lpl);
30750Sstevel@tonic-gate 	}
30760Sstevel@tonic-gate 
30770Sstevel@tonic-gate 	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
30780Sstevel@tonic-gate 
30790Sstevel@tonic-gate 	do {
30800Sstevel@tonic-gate 		pgcnt_t	npgs;
30810Sstevel@tonic-gate 
30820Sstevel@tonic-gate 		/*
30830Sstevel@tonic-gate 		 * Skip any lgroups outside of thread's pset
30840Sstevel@tonic-gate 		 */
30850Sstevel@tonic-gate 		if (!klgrpset_ismember(lgrpset, lgrpid)) {
30860Sstevel@tonic-gate 			if (++lgrpid > lgrp_alloc_max)
30870Sstevel@tonic-gate 				lgrpid = 0;	/* wrap the search */
30880Sstevel@tonic-gate 			continue;
30890Sstevel@tonic-gate 		}
30900Sstevel@tonic-gate 
30910Sstevel@tonic-gate 		/*
30920Sstevel@tonic-gate 		 * Skip any non-leaf lgroups
30930Sstevel@tonic-gate 		 */
30940Sstevel@tonic-gate 		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
30950Sstevel@tonic-gate 			continue;
30960Sstevel@tonic-gate 
30970Sstevel@tonic-gate 		/*
30980Sstevel@tonic-gate 		 * Skip any lgroups without enough free memory
30990Sstevel@tonic-gate 		 * (when threshold set to nonzero positive value)
31000Sstevel@tonic-gate 		 */
31010Sstevel@tonic-gate 		if (lgrp_mem_free_thresh > 0) {
31020Sstevel@tonic-gate 			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
31030Sstevel@tonic-gate 			if (npgs < lgrp_mem_free_thresh) {
31040Sstevel@tonic-gate 				if (++lgrpid > lgrp_alloc_max)
31050Sstevel@tonic-gate 					lgrpid = 0;	/* wrap the search */
31060Sstevel@tonic-gate 				continue;
31070Sstevel@tonic-gate 			}
31080Sstevel@tonic-gate 		}
31090Sstevel@tonic-gate 
31100Sstevel@tonic-gate 		lpl = &cpupart->cp_lgrploads[lgrpid];
31110Sstevel@tonic-gate 		if (klgrpset_isempty(p->p_lgrpset) ||
31120Sstevel@tonic-gate 		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
31130Sstevel@tonic-gate 			/*
31140Sstevel@tonic-gate 			 * Either this is a new process or the process already
31150Sstevel@tonic-gate 			 * has threads on this lgrp, so this is a preferred
31160Sstevel@tonic-gate 			 * lgroup for the thread.
31170Sstevel@tonic-gate 			 */
31181892Sesaxe 			if (bestlpl == NULL ||
31191892Sesaxe 			    lpl_pick(lpl, bestlpl)) {
31200Sstevel@tonic-gate 				bestload = lpl->lpl_loadavg;
31210Sstevel@tonic-gate 				bestlpl = lpl;
31220Sstevel@tonic-gate 			}
31230Sstevel@tonic-gate 		} else {
31240Sstevel@tonic-gate 			/*
31250Sstevel@tonic-gate 			 * The process doesn't have any threads on this lgrp,
31260Sstevel@tonic-gate 			 * but we're willing to consider this lgrp if the load
31270Sstevel@tonic-gate 			 * difference is big enough to justify splitting up
31280Sstevel@tonic-gate 			 * the process' threads.
31290Sstevel@tonic-gate 			 */
31301892Sesaxe 			if (bestrlpl == NULL ||
31311892Sesaxe 			    lpl_pick(lpl, bestrlpl)) {
31320Sstevel@tonic-gate 				bestrload = lpl->lpl_loadavg;
31330Sstevel@tonic-gate 				bestrlpl = lpl;
31340Sstevel@tonic-gate 			}
31350Sstevel@tonic-gate 		}
31360Sstevel@tonic-gate 		if (++lgrpid > lgrp_alloc_max)
31370Sstevel@tonic-gate 			lgrpid = 0;	/* wrap the search */
31380Sstevel@tonic-gate 	} while (lgrpid != lgrpid_start);
31390Sstevel@tonic-gate 
31400Sstevel@tonic-gate 	/*
31410Sstevel@tonic-gate 	 * Return root lgroup if threshold isn't set to maximum value and
31420Sstevel@tonic-gate 	 * the lowest lgroup load average is more than a certain threshold
31430Sstevel@tonic-gate 	 */
31440Sstevel@tonic-gate 	if (lgrp_load_thresh != UINT32_MAX &&
31450Sstevel@tonic-gate 	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
31460Sstevel@tonic-gate 		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
31470Sstevel@tonic-gate 
31480Sstevel@tonic-gate 	/*
31490Sstevel@tonic-gate 	 * If all the lgroups over which the thread's process is spread are
31501892Sesaxe 	 * heavily loaded, or otherwise undesirable, we'll consider placing
31511892Sesaxe 	 * the thread on one of the other leaf lgroups in the thread's
31521892Sesaxe 	 * partition.
31530Sstevel@tonic-gate 	 */
31541892Sesaxe 	if ((bestlpl == NULL) ||
31551892Sesaxe 	    ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
31560Sstevel@tonic-gate 	    (bestrload < bestload) &&	/* paranoid about wraparound */
31570Sstevel@tonic-gate 	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
31581892Sesaxe 	    bestload))) {
31590Sstevel@tonic-gate 		bestlpl = bestrlpl;
31600Sstevel@tonic-gate 	}
31610Sstevel@tonic-gate 
31621892Sesaxe 	if (bestlpl == NULL) {
31631892Sesaxe 		/*
31641892Sesaxe 		 * No lgroup looked particularly good, but we still
31651892Sesaxe 		 * have to pick something. Go with the randomly selected
31661892Sesaxe 		 * legal lgroup we started with above.
31671892Sesaxe 		 */
31681892Sesaxe 		bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
31691892Sesaxe 	}
31701892Sesaxe 
31710Sstevel@tonic-gate 	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
31720Sstevel@tonic-gate 	bestlpl->lpl_homed_time = gethrtime_unscaled();
31730Sstevel@tonic-gate 
31740Sstevel@tonic-gate 	ASSERT(bestlpl->lpl_ncpu > 0);
31750Sstevel@tonic-gate 	return (bestlpl);
31760Sstevel@tonic-gate }
31770Sstevel@tonic-gate 
31780Sstevel@tonic-gate /*
31791892Sesaxe  * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
31801892Sesaxe  * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
31810Sstevel@tonic-gate  */
31820Sstevel@tonic-gate static int
31830Sstevel@tonic-gate lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
31840Sstevel@tonic-gate {
31850Sstevel@tonic-gate 	lgrp_load_t	l1, l2;
31860Sstevel@tonic-gate 	lgrp_load_t	tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
31870Sstevel@tonic-gate 
31880Sstevel@tonic-gate 	l1 = lpl1->lpl_loadavg;
31890Sstevel@tonic-gate 	l2 = lpl2->lpl_loadavg;
31900Sstevel@tonic-gate 
31910Sstevel@tonic-gate 	if ((l1 + tolerance < l2) && (l1 < l2)) {
31920Sstevel@tonic-gate 		/* lpl1 is significantly less loaded than lpl2 */
31930Sstevel@tonic-gate 		return (1);
31940Sstevel@tonic-gate 	}
31950Sstevel@tonic-gate 
31960Sstevel@tonic-gate 	if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
31970Sstevel@tonic-gate 	    l1 + tolerance >= l2 && l1 < l2 &&
31980Sstevel@tonic-gate 	    lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
31990Sstevel@tonic-gate 		/*
32000Sstevel@tonic-gate 		 * lpl1's load is within the tolerance of lpl2. We're
32010Sstevel@tonic-gate 		 * willing to consider it to be better, however, if
32020Sstevel@tonic-gate 		 * it has been longer since we last homed a thread there.
32030Sstevel@tonic-gate 		 */
32040Sstevel@tonic-gate 		return (1);
32050Sstevel@tonic-gate 	}
32060Sstevel@tonic-gate 
32070Sstevel@tonic-gate 	return (0);
32080Sstevel@tonic-gate }
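
/*
 * Worked example (illustrative, not original source): using the value from
 * the lgrp_loadavg_tolerance comment above (0x10000) with lpl1->lpl_ncpu = 4
 * gives a tolerance of 0x4000.  Then l1 = 0x10000 beats l2 = 0x15000
 * outright (0x14000 < 0x15000), while l1 = 0x13000 is within tolerance of
 * l2, so under LGRP_CHOOSE_TIME lpl1 wins only if a thread was homed to it
 * less recently than to lpl2.
 */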
32090Sstevel@tonic-gate 
32100Sstevel@tonic-gate /*
32114426Saguzovsk  * lgrp_trthr_moves counts the number of times the main thread (t_tid = 1) of
32124426Saguzovsk  * a process that uses text replication has changed its home lgrp. This info
32134426Saguzovsk  * is used by the segvn asynchronous thread to detect whether it needs to
32144426Saguzovsk  * recheck which lgrps should be used for text replication.
32154426Saguzovsk  */
32164426Saguzovsk static uint64_t lgrp_trthr_moves = 0;
32174426Saguzovsk 
32184426Saguzovsk uint64_t
32194426Saguzovsk lgrp_get_trthr_migrations(void)
32204426Saguzovsk {
32214426Saguzovsk 	return (lgrp_trthr_moves);
32224426Saguzovsk }
32234426Saguzovsk 
32244426Saguzovsk void
32254426Saguzovsk lgrp_update_trthr_migrations(uint64_t incr)
32264426Saguzovsk {
32274426Saguzovsk 	atomic_add_64(&lgrp_trthr_moves, incr);
32284426Saguzovsk }
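
/*
 * Illustrative sketch (not original source): a consumer such as the segvn
 * asynchronous thread can cache the counter and recheck its text
 * replication decisions only when the value has changed.  The function and
 * variable names here are hypothetical.
 */
#if 0
static void
trthr_recheck_example(void)
{
	static uint64_t	last_trthr_moves;
	uint64_t	now = lgrp_get_trthr_migrations();

	if (now != last_trthr_moves) {
		last_trthr_moves = now;
		/* recheck which lgrps should be used for text replication */
	}
}
#endif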
32294426Saguzovsk 
32304426Saguzovsk /*
32310Sstevel@tonic-gate  * An LWP is expected to be assigned to an lgroup for at least this long
32320Sstevel@tonic-gate  * for its anticipatory load to be justified.  NOTE that this value should
32330Sstevel@tonic-gate  * not be set extremely huge (say, larger than 100 years), to avoid problems
32340Sstevel@tonic-gate  * with overflow in the calculation that uses it.
32350Sstevel@tonic-gate  */
32360Sstevel@tonic-gate #define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
32370Sstevel@tonic-gate hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
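
/*
 * Worked example (illustrative, not original source): with the default of
 * NANOSEC / 10, a thread that is moved away less than 100ms after being
 * homed to an lgroup has the anticipatory load that was added on its
 * behalf backed out again (see the t_anttime check in lgrp_move_thread()).
 */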
32380Sstevel@tonic-gate 
32390Sstevel@tonic-gate /*
32400Sstevel@tonic-gate  * Routine to change a thread's lgroup affiliation.  This routine updates
32410Sstevel@tonic-gate  * the thread's kthread_t struct and its process' proc_t struct to note the
32420Sstevel@tonic-gate  * thread's new lgroup affiliation, and its lgroup affinities.
32430Sstevel@tonic-gate  *
32440Sstevel@tonic-gate  * Note that this is the only routine that modifies a thread's t_lpl field,
32450Sstevel@tonic-gate  * and that adds in or removes anticipatory load.
32460Sstevel@tonic-gate  *
32470Sstevel@tonic-gate  * If the thread is exiting, newlpl is NULL.
32480Sstevel@tonic-gate  *
32490Sstevel@tonic-gate  * Locking:
32500Sstevel@tonic-gate  * The following lock must be held on entry:
32510Sstevel@tonic-gate  *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
32520Sstevel@tonic-gate  *		doesn't get removed from t's partition
32530Sstevel@tonic-gate  *
32540Sstevel@tonic-gate  * This routine is not allowed to grab any locks, since it may be called
32550Sstevel@tonic-gate  * with cpus paused (such as from cpu_offline).
32560Sstevel@tonic-gate  */
32570Sstevel@tonic-gate void
32580Sstevel@tonic-gate lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
32590Sstevel@tonic-gate {
32600Sstevel@tonic-gate 	proc_t		*p;
32610Sstevel@tonic-gate 	lpl_t		*lpl, *oldlpl;
32620Sstevel@tonic-gate 	lgrp_id_t	oldid;
32630Sstevel@tonic-gate 	kthread_t	*tp;
32640Sstevel@tonic-gate 	uint_t		ncpu;
32650Sstevel@tonic-gate 	lgrp_load_t	old, new;
32660Sstevel@tonic-gate 
32670Sstevel@tonic-gate 	ASSERT(t);
32680Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
32690Sstevel@tonic-gate 	    THREAD_LOCK_HELD(t));
32700Sstevel@tonic-gate 
32710Sstevel@tonic-gate 	/*
32720Sstevel@tonic-gate 	 * If not changing lpls, just return
32730Sstevel@tonic-gate 	 */
32740Sstevel@tonic-gate 	if ((oldlpl = t->t_lpl) == newlpl)
32750Sstevel@tonic-gate 		return;
32760Sstevel@tonic-gate 
32770Sstevel@tonic-gate 	/*
32780Sstevel@tonic-gate 	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
32790Sstevel@tonic-gate 	 * associated with process 0 rather than with its original process).
32800Sstevel@tonic-gate 	 */
32810Sstevel@tonic-gate 	if (t->t_proc_flag & TP_LWPEXIT) {
32820Sstevel@tonic-gate 		if (newlpl != NULL) {
32830Sstevel@tonic-gate 			t->t_lpl = newlpl;
32840Sstevel@tonic-gate 		}
32850Sstevel@tonic-gate 		return;
32860Sstevel@tonic-gate 	}
32870Sstevel@tonic-gate 
32880Sstevel@tonic-gate 	p = ttoproc(t);
32890Sstevel@tonic-gate 
32900Sstevel@tonic-gate 	/*
32910Sstevel@tonic-gate 	 * If the thread had a previous lgroup, update its process' p_lgrpset
32920Sstevel@tonic-gate 	 * to account for it being moved from its old lgroup.
32930Sstevel@tonic-gate 	 */
32940Sstevel@tonic-gate 	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
32950Sstevel@tonic-gate 	    (p->p_tlist != NULL)) {
32960Sstevel@tonic-gate 		oldid = oldlpl->lpl_lgrpid;
32970Sstevel@tonic-gate 
32980Sstevel@tonic-gate 		if (newlpl != NULL)
32990Sstevel@tonic-gate 			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);
33000Sstevel@tonic-gate 
33010Sstevel@tonic-gate 		if ((do_lgrpset_delete) &&
33020Sstevel@tonic-gate 		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
33030Sstevel@tonic-gate 			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
33040Sstevel@tonic-gate 				/*
33050Sstevel@tonic-gate 				 * Check if a thread other than the thread
33060Sstevel@tonic-gate 				 * that's moving is assigned to the same
33070Sstevel@tonic-gate 				 * lgroup as the thread that's moving.  Note
33080Sstevel@tonic-gate 				 * that we have to compare lgroup IDs, rather
33090Sstevel@tonic-gate 				 * than simply comparing t_lpl's, since the
33100Sstevel@tonic-gate 				 * threads may belong to different partitions
33110Sstevel@tonic-gate 				 * but be assigned to the same lgroup.
33120Sstevel@tonic-gate 				 */
33130Sstevel@tonic-gate 				ASSERT(tp->t_lpl != NULL);
33140Sstevel@tonic-gate 
33150Sstevel@tonic-gate 				if ((tp != t) &&
33160Sstevel@tonic-gate 				    (tp->t_lpl->lpl_lgrpid == oldid)) {
33170Sstevel@tonic-gate 					/*
33180Sstevel@tonic-gate 					 * Another thread is assigned to the
33190Sstevel@tonic-gate 					 * same lgroup as the thread that's
33200Sstevel@tonic-gate 					 * moving, p_lgrpset doesn't change.
33210Sstevel@tonic-gate 					 */
33220Sstevel@tonic-gate 					break;
33230Sstevel@tonic-gate 				} else if (tp == p->p_tlist) {
33240Sstevel@tonic-gate 					/*
33250Sstevel@tonic-gate 					 * No other thread is assigned to the
33260Sstevel@tonic-gate 					 * same lgroup as the exiting thread,
33270Sstevel@tonic-gate 					 * clear the lgroup's bit in p_lgrpset.
33280Sstevel@tonic-gate 					 */
33290Sstevel@tonic-gate 					klgrpset_del(p->p_lgrpset, oldid);
33300Sstevel@tonic-gate 					break;
33310Sstevel@tonic-gate 				}
33320Sstevel@tonic-gate 			}
33330Sstevel@tonic-gate 		}
33340Sstevel@tonic-gate 
33350Sstevel@tonic-gate 		/*
33360Sstevel@tonic-gate 		 * If this thread was assigned to its old lgroup for such a
33370Sstevel@tonic-gate 		 * short amount of time that the anticipatory load that was
33380Sstevel@tonic-gate 		 * added on its behalf has aged very little, remove that
33390Sstevel@tonic-gate 		 * anticipatory load.
33400Sstevel@tonic-gate 		 */
33410Sstevel@tonic-gate 		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
33420Sstevel@tonic-gate 		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
33430Sstevel@tonic-gate 			lpl = oldlpl;
33440Sstevel@tonic-gate 			for (;;) {
33450Sstevel@tonic-gate 				do {
33460Sstevel@tonic-gate 					old = new = lpl->lpl_loadavg;
33470Sstevel@tonic-gate 					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
33480Sstevel@tonic-gate 					if (new > old) {
33490Sstevel@tonic-gate 						/*
33500Sstevel@tonic-gate 						 * this can happen if the load
33510Sstevel@tonic-gate 						 * average was aged since we
33520Sstevel@tonic-gate 						 * added in the anticipatory
33530Sstevel@tonic-gate 						 * load
33540Sstevel@tonic-gate 						 */
33550Sstevel@tonic-gate 						new = 0;
33560Sstevel@tonic-gate 					}
33570Sstevel@tonic-gate 				} while (cas32(
33588408SEric.Saxe@Sun.COM 				    (lgrp_load_t *)&lpl->lpl_loadavg, old,
33598408SEric.Saxe@Sun.COM 				    new) != old);
33600Sstevel@tonic-gate 
33610Sstevel@tonic-gate 				lpl = lpl->lpl_parent;
33620Sstevel@tonic-gate 				if (lpl == NULL)
33630Sstevel@tonic-gate 					break;
33640Sstevel@tonic-gate 
33650Sstevel@tonic-gate 				ncpu = lpl->lpl_ncpu;
33660Sstevel@tonic-gate 				ASSERT(ncpu > 0);
33670Sstevel@tonic-gate 			}
33680Sstevel@tonic-gate 		}
33690Sstevel@tonic-gate 	}
33700Sstevel@tonic-gate 	/*
33710Sstevel@tonic-gate 	 * If the thread has a new lgroup (i.e. it's not exiting), update its
33720Sstevel@tonic-gate 	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
33730Sstevel@tonic-gate 	 * to its new lgroup to account for its move to its new lgroup.
33740Sstevel@tonic-gate 	 */
33750Sstevel@tonic-gate 	if (newlpl != NULL) {
33760Sstevel@tonic-gate 		/*
33770Sstevel@tonic-gate 		 * This thread is moving to a new lgroup
33780Sstevel@tonic-gate 		 */
33790Sstevel@tonic-gate 		t->t_lpl = newlpl;
33804426Saguzovsk 		if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) {
33814426Saguzovsk 			p->p_t1_lgrpid = newlpl->lpl_lgrpid;
33824426Saguzovsk 			membar_producer();
33834426Saguzovsk 			if (p->p_tr_lgrpid != LGRP_NONE &&
33844426Saguzovsk 			    p->p_tr_lgrpid != p->p_t1_lgrpid) {
33854426Saguzovsk 				lgrp_update_trthr_migrations(1);
33864426Saguzovsk 			}
33874426Saguzovsk 		}
33880Sstevel@tonic-gate 
33890Sstevel@tonic-gate 		/*
33900Sstevel@tonic-gate 		 * Reflect move in load average of new lgroup
33910Sstevel@tonic-gate 		 * unless it is root lgroup
33920Sstevel@tonic-gate 		 */
33930Sstevel@tonic-gate 		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
33940Sstevel@tonic-gate 			return;
33950Sstevel@tonic-gate 
33960Sstevel@tonic-gate 		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
33970Sstevel@tonic-gate 			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
33980Sstevel@tonic-gate 		}
33990Sstevel@tonic-gate 
34000Sstevel@tonic-gate 		/*
34010Sstevel@tonic-gate 		 * It'll take some time for the load on the new lgroup
34020Sstevel@tonic-gate 		 * to reflect this thread's placement on it.  We don't,
34030Sstevel@tonic-gate 		 * however, want all threads between now and then
34040Sstevel@tonic-gate 		 * to pile on to this lgroup as well.  To avoid
34050Sstevel@tonic-gate 		 * this pileup, we anticipate the load this thread
34060Sstevel@tonic-gate 		 * will generate on its new lgroup.  The goal is to
34070Sstevel@tonic-gate 		 * make the lgroup's load appear as though the thread
34080Sstevel@tonic-gate 		 * had been there all along.  We're very conservative
34090Sstevel@tonic-gate 		 * in calculating this anticipatory load: we assume
34100Sstevel@tonic-gate 		 * the worst case (a 100% CPU-bound thread).  This
34110Sstevel@tonic-gate 		 * may be modified in the future to be more accurate.
34120Sstevel@tonic-gate 		 */
34130Sstevel@tonic-gate 		lpl = newlpl;
34140Sstevel@tonic-gate 		for (;;) {
34150Sstevel@tonic-gate 			ncpu = lpl->lpl_ncpu;
34160Sstevel@tonic-gate 			ASSERT(ncpu > 0);
34170Sstevel@tonic-gate 			do {
34180Sstevel@tonic-gate 				old = new = lpl->lpl_loadavg;
34190Sstevel@tonic-gate 				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
34200Sstevel@tonic-gate 				/*
34210Sstevel@tonic-gate 				 * Check for overflow
34220Sstevel@tonic-gate 				 * Underflow not possible here
34230Sstevel@tonic-gate 				 */
34240Sstevel@tonic-gate 				if (new < old)
34250Sstevel@tonic-gate 					new = UINT32_MAX;
34260Sstevel@tonic-gate 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
34270Sstevel@tonic-gate 			    new) != old);
34280Sstevel@tonic-gate 
34290Sstevel@tonic-gate 			lpl = lpl->lpl_parent;
34300Sstevel@tonic-gate 			if (lpl == NULL)
34310Sstevel@tonic-gate 				break;
34320Sstevel@tonic-gate 		}
34330Sstevel@tonic-gate 		t->t_anttime = gethrtime();
34340Sstevel@tonic-gate 	}
34350Sstevel@tonic-gate }
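
/*
 * Illustrative note (a sketch, not from the original source): how the
 * anticipatory load adjustments above propagate up the lpl hierarchy.
 * This assumes LGRP_LOADAVG_MAX_EFFECT(ncpu) scales a fixed worst-case
 * per-thread load contribution, call it C, down by the number of CPUs.
 *
 * Suppose a thread leaves a leaf lpl with 2 CPUs whose parent spans
 * 8 CPUs.  The removal loop then does, atomically via cas32():
 *
 *	leaf:	lpl_loadavg -= C / 2
 *	parent:	lpl_loadavg -= C / 8
 *
 * If the load average was aged down since the anticipatory load was
 * added, the unsigned subtraction would wrap (new > old), so the code
 * clamps to 0.  The matching addition on the new lgroup's lpls clamps
 * to UINT32_MAX on overflow instead.
 */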
34360Sstevel@tonic-gate 
34370Sstevel@tonic-gate /*
34380Sstevel@tonic-gate  * Return lgroup memory allocation policy given advice from madvise(3C)
34390Sstevel@tonic-gate  */
34400Sstevel@tonic-gate lgrp_mem_policy_t
34410Sstevel@tonic-gate lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
34420Sstevel@tonic-gate {
34430Sstevel@tonic-gate 	switch (advice) {
34440Sstevel@tonic-gate 	case MADV_ACCESS_LWP:
34450Sstevel@tonic-gate 		return (LGRP_MEM_POLICY_NEXT);
34460Sstevel@tonic-gate 	case MADV_ACCESS_MANY:
34470Sstevel@tonic-gate 		return (LGRP_MEM_POLICY_RANDOM);
34480Sstevel@tonic-gate 	default:
34490Sstevel@tonic-gate 		return (lgrp_mem_policy_default(size, type));
34500Sstevel@tonic-gate 	}
34510Sstevel@tonic-gate }
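
/*
 * Example (illustrative): how madvise(3C) advice maps to policies here.
 *
 *	MADV_ACCESS_LWP		next LWP to touch will use it heavily
 *				-> LGRP_MEM_POLICY_NEXT (next touch)
 *	MADV_ACCESS_MANY	many LWPs/processes will use it
 *				-> LGRP_MEM_POLICY_RANDOM
 *	anything else		size and type based default policy
 */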
34520Sstevel@tonic-gate 
34530Sstevel@tonic-gate /*
34540Sstevel@tonic-gate  * Figure out default policy
34550Sstevel@tonic-gate  */
34560Sstevel@tonic-gate lgrp_mem_policy_t
34570Sstevel@tonic-gate lgrp_mem_policy_default(size_t size, int type)
34580Sstevel@tonic-gate {
34590Sstevel@tonic-gate 	cpupart_t		*cp;
34600Sstevel@tonic-gate 	lgrp_mem_policy_t	policy;
34610Sstevel@tonic-gate 	size_t			pset_mem_size;
34620Sstevel@tonic-gate 
34630Sstevel@tonic-gate 	/*
34640Sstevel@tonic-gate 	 * Randomly allocate memory across lgroups for private or shared
34650Sstevel@tonic-gate 	 * memory beyond the respective random allocation threshold
34660Sstevel@tonic-gate 	 */
34670Sstevel@tonic-gate 	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
34680Sstevel@tonic-gate 	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
34690Sstevel@tonic-gate 		/*
34700Sstevel@tonic-gate 		 * Get total memory size of current thread's pset
34710Sstevel@tonic-gate 		 */
34720Sstevel@tonic-gate 		kpreempt_disable();
34730Sstevel@tonic-gate 		cp = curthread->t_cpupart;
34740Sstevel@tonic-gate 		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
34750Sstevel@tonic-gate 		kpreempt_enable();
34760Sstevel@tonic-gate 
34770Sstevel@tonic-gate 		/*
34780Sstevel@tonic-gate 		 * Choose policy to randomly allocate memory across
34790Sstevel@tonic-gate 		 * lgroups in pset if it will fit and is not default
34800Sstevel@tonic-gate 		 * partition.  Otherwise, allocate memory randomly
34810Sstevel@tonic-gate 		 * across machine.
34820Sstevel@tonic-gate 		 */
34830Sstevel@tonic-gate 		if (lgrp_mem_pset_aware && size < pset_mem_size)
34840Sstevel@tonic-gate 			policy = LGRP_MEM_POLICY_RANDOM_PSET;
34850Sstevel@tonic-gate 		else
34860Sstevel@tonic-gate 			policy = LGRP_MEM_POLICY_RANDOM;
34870Sstevel@tonic-gate 	} else
34880Sstevel@tonic-gate 		/*
34890Sstevel@tonic-gate 		 * Apply default policy for private memory and
34900Sstevel@tonic-gate 		 * shared memory under the respective random
34910Sstevel@tonic-gate 		 * thresholds.
34920Sstevel@tonic-gate 		 */
34930Sstevel@tonic-gate 		policy = lgrp_mem_default_policy;
34940Sstevel@tonic-gate 
34950Sstevel@tonic-gate 	return (policy);
34960Sstevel@tonic-gate }
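
/*
 * Worked example with illustrative (not actual) threshold values:
 * suppose lgrp_shm_random_thresh is 8M, lgrp_mem_pset_aware is set,
 * and the current thread's pset spans lgroups holding 16G in total.
 *
 *	1G MAP_SHARED mapping:	over threshold, fits in pset
 *				-> LGRP_MEM_POLICY_RANDOM_PSET
 *	32G MAP_SHARED mapping:	over threshold, doesn't fit in pset
 *				-> LGRP_MEM_POLICY_RANDOM
 *	4K MAP_SHARED mapping:	under threshold
 *				-> lgrp_mem_default_policy
 */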
34970Sstevel@tonic-gate 
34980Sstevel@tonic-gate /*
34990Sstevel@tonic-gate  * Get memory allocation policy for this segment
35000Sstevel@tonic-gate  */
35010Sstevel@tonic-gate lgrp_mem_policy_info_t *
35020Sstevel@tonic-gate lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
35030Sstevel@tonic-gate {
35040Sstevel@tonic-gate 	lgrp_mem_policy_info_t	*policy_info;
35050Sstevel@tonic-gate 	extern struct seg_ops	segspt_ops;
35060Sstevel@tonic-gate 	extern struct seg_ops	segspt_shmops;
35070Sstevel@tonic-gate 
35080Sstevel@tonic-gate 	/*
35090Sstevel@tonic-gate 	 * This is for binary compatibility to protect against third party
35100Sstevel@tonic-gate 	 * segment drivers which haven't been recompiled to allow for
35110Sstevel@tonic-gate 	 * SEGOP_GETPOLICY()
35120Sstevel@tonic-gate 	 */
35130Sstevel@tonic-gate 	if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
35140Sstevel@tonic-gate 	    seg->s_ops != &segspt_shmops)
35150Sstevel@tonic-gate 		return (NULL);
35160Sstevel@tonic-gate 
35170Sstevel@tonic-gate 	policy_info = NULL;
35180Sstevel@tonic-gate 	if (seg->s_ops->getpolicy != NULL)
35190Sstevel@tonic-gate 		policy_info = SEGOP_GETPOLICY(seg, vaddr);
35200Sstevel@tonic-gate 
35210Sstevel@tonic-gate 	return (policy_info);
35220Sstevel@tonic-gate }
35230Sstevel@tonic-gate 
35240Sstevel@tonic-gate /*
35250Sstevel@tonic-gate  * Set policy for allocating private memory given the desired policy, the
35260Sstevel@tonic-gate  * policy info, and the size in bytes of memory to which the policy applies.
35270Sstevel@tonic-gate  * Return 0 if policy wasn't set already and 1 if policy was set already
35280Sstevel@tonic-gate  */
35290Sstevel@tonic-gate int
35300Sstevel@tonic-gate lgrp_privm_policy_set(lgrp_mem_policy_t policy,
35310Sstevel@tonic-gate     lgrp_mem_policy_info_t *policy_info, size_t size)
35320Sstevel@tonic-gate {
35330Sstevel@tonic-gate 
35340Sstevel@tonic-gate 	ASSERT(policy_info != NULL);
35350Sstevel@tonic-gate 
35360Sstevel@tonic-gate 	if (policy == LGRP_MEM_POLICY_DEFAULT)
35370Sstevel@tonic-gate 		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
35380Sstevel@tonic-gate 
35390Sstevel@tonic-gate 	/*
35400Sstevel@tonic-gate 	 * Policy set already?
35410Sstevel@tonic-gate 	 */
35420Sstevel@tonic-gate 	if (policy == policy_info->mem_policy)
35430Sstevel@tonic-gate 		return (1);
35440Sstevel@tonic-gate 
35450Sstevel@tonic-gate 	/*
35460Sstevel@tonic-gate 	 * Set policy
35470Sstevel@tonic-gate 	 */
35480Sstevel@tonic-gate 	policy_info->mem_policy = policy;
35494426Saguzovsk 	policy_info->mem_lgrpid = LGRP_NONE;
35500Sstevel@tonic-gate 
35510Sstevel@tonic-gate 	return (0);
35520Sstevel@tonic-gate }
35530Sstevel@tonic-gate 
35540Sstevel@tonic-gate 
35550Sstevel@tonic-gate /*
35560Sstevel@tonic-gate  * Get shared memory allocation policy for the given shared object and offset
35570Sstevel@tonic-gate  */
35580Sstevel@tonic-gate lgrp_mem_policy_info_t *
35590Sstevel@tonic-gate lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
35600Sstevel@tonic-gate     u_offset_t vn_off)
35610Sstevel@tonic-gate {
35620Sstevel@tonic-gate 	u_offset_t		off;
35630Sstevel@tonic-gate 	lgrp_mem_policy_info_t	*policy_info;
35640Sstevel@tonic-gate 	lgrp_shm_policy_seg_t	*policy_seg;
35650Sstevel@tonic-gate 	lgrp_shm_locality_t	*shm_locality;
35660Sstevel@tonic-gate 	avl_tree_t		*tree = NULL;	/* stays NULL if no amp or vp */
35670Sstevel@tonic-gate 	avl_index_t		where;
35680Sstevel@tonic-gate 
35690Sstevel@tonic-gate 	/*
35700Sstevel@tonic-gate 	 * Get policy segment tree from anon_map or vnode and use specified
35710Sstevel@tonic-gate 	 * anon index or vnode offset as offset
35720Sstevel@tonic-gate 	 *
35730Sstevel@tonic-gate 	 * Assume that no lock needs to be held on anon_map or vnode, since
35740Sstevel@tonic-gate 	 * they should be protected by their reference count which must be
35750Sstevel@tonic-gate 	 * nonzero for an existing segment
35760Sstevel@tonic-gate 	 */
35770Sstevel@tonic-gate 	if (amp) {
35780Sstevel@tonic-gate 		ASSERT(amp->refcnt != 0);
35790Sstevel@tonic-gate 		shm_locality = amp->locality;
35800Sstevel@tonic-gate 		if (shm_locality == NULL)
35810Sstevel@tonic-gate 			return (NULL);
35820Sstevel@tonic-gate 		tree = shm_locality->loc_tree;
35830Sstevel@tonic-gate 		off = ptob(anon_index);
35840Sstevel@tonic-gate 	} else if (vp) {
35850Sstevel@tonic-gate 		shm_locality = vp->v_locality;
35860Sstevel@tonic-gate 		if (shm_locality == NULL)
35870Sstevel@tonic-gate 			return (NULL);
35880Sstevel@tonic-gate 		ASSERT(shm_locality->loc_count != 0);
35890Sstevel@tonic-gate 		tree = shm_locality->loc_tree;
35900Sstevel@tonic-gate 		off = vn_off;
35910Sstevel@tonic-gate 	}
35920Sstevel@tonic-gate 
35930Sstevel@tonic-gate 	if (tree == NULL)
35940Sstevel@tonic-gate 		return (NULL);
35950Sstevel@tonic-gate 
35960Sstevel@tonic-gate 	/*
35970Sstevel@tonic-gate 	 * Lookup policy segment for offset into shared object and return
35980Sstevel@tonic-gate 	 * policy info
35990Sstevel@tonic-gate 	 */
36000Sstevel@tonic-gate 	rw_enter(&shm_locality->loc_lock, RW_READER);
36010Sstevel@tonic-gate 	policy_info = NULL;
36020Sstevel@tonic-gate 	policy_seg = avl_find(tree, &off, &where);
36030Sstevel@tonic-gate 	if (policy_seg)
36040Sstevel@tonic-gate 		policy_info = &policy_seg->shm_policy;
36050Sstevel@tonic-gate 	rw_exit(&shm_locality->loc_lock);
36060Sstevel@tonic-gate 
36070Sstevel@tonic-gate 	return (policy_info);
36080Sstevel@tonic-gate }
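
/*
 * Example lookup (illustrative): for an anon_map with anon_index 5 on
 * a machine with 4K pages, off = ptob(5) = 0x5000.  avl_find() returns
 * the policy segment whose [shm_off, shm_off + shm_size) range contains
 * 0x5000 (see lgrp_shm_policy_compar() below), or NULL if no policy was
 * ever set over that part of the object.
 */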
36090Sstevel@tonic-gate 
36100Sstevel@tonic-gate /*
36112480Sesaxe  * Default memory allocation policy for kernel segmap pages
36122480Sesaxe  */
36132480Sesaxe lgrp_mem_policy_t	lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;
36142480Sesaxe 
36152480Sesaxe /*
36160Sstevel@tonic-gate  * Return lgroup to use for allocating memory
36170Sstevel@tonic-gate  * given the segment and address
36180Sstevel@tonic-gate  *
36190Sstevel@tonic-gate  * There is no mutual exclusion between calls
36200Sstevel@tonic-gate  * to this routine and DR, so this routine and whoever calls it
36210Sstevel@tonic-gate  * should be mindful of the possibility that the lgrp returned
36220Sstevel@tonic-gate  * may be deleted. If this happens, dereferences of the lgrp
36230Sstevel@tonic-gate  * pointer will still be safe, but the resources in the lgrp will
36240Sstevel@tonic-gate  * be gone, and LGRP_EXISTS() will no longer be true.
36250Sstevel@tonic-gate  */
36260Sstevel@tonic-gate lgrp_t *
36270Sstevel@tonic-gate lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
36280Sstevel@tonic-gate {
36290Sstevel@tonic-gate 	int			i;
36300Sstevel@tonic-gate 	lgrp_t			*lgrp;
36310Sstevel@tonic-gate 	klgrpset_t		lgrpset;
36320Sstevel@tonic-gate 	int			lgrps_spanned;
36330Sstevel@tonic-gate 	unsigned long		off;
36340Sstevel@tonic-gate 	lgrp_mem_policy_t	policy;
36350Sstevel@tonic-gate 	lgrp_mem_policy_info_t	*policy_info;
36360Sstevel@tonic-gate 	ushort_t		random;
36370Sstevel@tonic-gate 	int			stat = 0;
36382480Sesaxe 	extern struct seg	*segkmap;
36390Sstevel@tonic-gate 
36400Sstevel@tonic-gate 	/*
36410Sstevel@tonic-gate 	 * Just return the root lgroup if the lgrp framework hasn't finished
36420Sstevel@tonic-gate 	 * initializing or if this is a UMA machine.
36430Sstevel@tonic-gate 	 */
36440Sstevel@tonic-gate 	if (nlgrps == 1 || !lgrp_initialized)
36450Sstevel@tonic-gate 		return (lgrp_root);
36460Sstevel@tonic-gate 
36470Sstevel@tonic-gate 	/*
36480Sstevel@tonic-gate 	 * Get memory allocation policy for this segment
36490Sstevel@tonic-gate 	 */
36500Sstevel@tonic-gate 	policy = lgrp_mem_default_policy;
36510Sstevel@tonic-gate 	if (seg != NULL) {
36520Sstevel@tonic-gate 		if (seg->s_as == &kas) {
36532480Sesaxe 			if (seg == segkmap)
36542480Sesaxe 				policy = lgrp_segmap_default_policy;
36550Sstevel@tonic-gate 			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
36560Sstevel@tonic-gate 			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
36570Sstevel@tonic-gate 				policy = LGRP_MEM_POLICY_RANDOM;
36580Sstevel@tonic-gate 		} else {
36590Sstevel@tonic-gate 			policy_info = lgrp_mem_policy_get(seg, vaddr);
36604426Saguzovsk 			if (policy_info != NULL) {
36610Sstevel@tonic-gate 				policy = policy_info->mem_policy;
36624426Saguzovsk 				if (policy == LGRP_MEM_POLICY_NEXT_SEG) {
36634426Saguzovsk 					lgrp_id_t id = policy_info->mem_lgrpid;
36644426Saguzovsk 					ASSERT(id != LGRP_NONE);
36654426Saguzovsk 					ASSERT(id < NLGRPS_MAX);
36664426Saguzovsk 					lgrp = lgrp_table[id];
36674426Saguzovsk 					if (!LGRP_EXISTS(lgrp)) {
36684426Saguzovsk 						policy = LGRP_MEM_POLICY_NEXT;
36694426Saguzovsk 					} else {
36704426Saguzovsk 						lgrp_stat_add(id,
36714426Saguzovsk 						    LGRP_NUM_NEXT_SEG, 1);
36724426Saguzovsk 						return (lgrp);
36734426Saguzovsk 					}
36744426Saguzovsk 				}
36754426Saguzovsk 			}
36760Sstevel@tonic-gate 		}
36770Sstevel@tonic-gate 	}
36780Sstevel@tonic-gate 	lgrpset = 0;
36790Sstevel@tonic-gate 
36800Sstevel@tonic-gate 	/*
36810Sstevel@tonic-gate 	 * Initialize lgroup to home by default
36820Sstevel@tonic-gate 	 */
36830Sstevel@tonic-gate 	lgrp = lgrp_home_lgrp();
36840Sstevel@tonic-gate 
36850Sstevel@tonic-gate 	/*
36860Sstevel@tonic-gate 	 * When homing threads on root lgrp, override default memory
36870Sstevel@tonic-gate 	 * allocation policies with root lgroup memory allocation policy
36880Sstevel@tonic-gate 	 */
36890Sstevel@tonic-gate 	if (lgrp == lgrp_root)
36900Sstevel@tonic-gate 		policy = lgrp_mem_policy_root;
36910Sstevel@tonic-gate 
36920Sstevel@tonic-gate 	/*
36930Sstevel@tonic-gate 	 * Implement policy
36940Sstevel@tonic-gate 	 */
36950Sstevel@tonic-gate 	switch (policy) {
36960Sstevel@tonic-gate 	case LGRP_MEM_POLICY_NEXT_CPU:
36970Sstevel@tonic-gate 
36980Sstevel@tonic-gate 		/*
36990Sstevel@tonic-gate 		 * Return lgroup of current CPU which faulted on memory
370060Sesaxe 		 * If the CPU isn't currently in an lgrp, then opt to
370160Sesaxe 		 * allocate from the root.
370260Sesaxe 		 *
370360Sesaxe 		 * Kernel preemption needs to be disabled here to prevent
370460Sesaxe 		 * the current CPU from going away before lgrp is found.
37050Sstevel@tonic-gate 		 */
370660Sesaxe 		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
370760Sesaxe 			lgrp = lgrp_root;
370860Sesaxe 		} else {
370960Sesaxe 			kpreempt_disable();
371060Sesaxe 			lgrp = lgrp_cpu_to_lgrp(CPU);
371160Sesaxe 			kpreempt_enable();
371260Sesaxe 		}
37130Sstevel@tonic-gate 		break;
37140Sstevel@tonic-gate 
37150Sstevel@tonic-gate 	case LGRP_MEM_POLICY_NEXT:
37160Sstevel@tonic-gate 	case LGRP_MEM_POLICY_DEFAULT:
37170Sstevel@tonic-gate 	default:
37180Sstevel@tonic-gate 
37190Sstevel@tonic-gate 		/*
37200Sstevel@tonic-gate 		 * Just return current thread's home lgroup
37210Sstevel@tonic-gate 		 * for default policy (next touch)
37220Sstevel@tonic-gate 		 * If the thread is homed to the root,
37230Sstevel@tonic-gate 		 * then the default policy is random across lgroups.
37240Sstevel@tonic-gate 		 * Fallthrough to the random case.
37250Sstevel@tonic-gate 		 */
37260Sstevel@tonic-gate 		if (lgrp != lgrp_root) {
37270Sstevel@tonic-gate 			if (policy == LGRP_MEM_POLICY_NEXT)
37280Sstevel@tonic-gate 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
37290Sstevel@tonic-gate 			else
37300Sstevel@tonic-gate 				lgrp_stat_add(lgrp->lgrp_id,
37310Sstevel@tonic-gate 				    LGRP_NUM_DEFAULT, 1);
37320Sstevel@tonic-gate 			break;
37330Sstevel@tonic-gate 		}
37340Sstevel@tonic-gate 		/* LINTED fallthrough on case statement */
37350Sstevel@tonic-gate 	case LGRP_MEM_POLICY_RANDOM:
37360Sstevel@tonic-gate 
37370Sstevel@tonic-gate 		/*
37380Sstevel@tonic-gate 		 * Return a random leaf lgroup with memory
37390Sstevel@tonic-gate 		 */
37400Sstevel@tonic-gate 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
37410Sstevel@tonic-gate 		/*
37420Sstevel@tonic-gate 		 * Count how many lgroups are spanned
37430Sstevel@tonic-gate 		 */
37440Sstevel@tonic-gate 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
37450Sstevel@tonic-gate 
37460Sstevel@tonic-gate 		/*
37470Sstevel@tonic-gate 		 * There may be no memnodes in the root lgroup during DR copy
37480Sstevel@tonic-gate 		 * rename on a system with only two boards (memnodes)
37490Sstevel@tonic-gate 		 * configured. In this case just return the root lgrp.
37500Sstevel@tonic-gate 		 */
37510Sstevel@tonic-gate 		if (lgrps_spanned == 0) {
37520Sstevel@tonic-gate 			lgrp = lgrp_root;
37530Sstevel@tonic-gate 			break;
37540Sstevel@tonic-gate 		}
37550Sstevel@tonic-gate 
37560Sstevel@tonic-gate 		/*
37570Sstevel@tonic-gate 		 * Pick a random offset within lgroups spanned
37580Sstevel@tonic-gate 		 * and return lgroup at that offset
37590Sstevel@tonic-gate 		 */
37600Sstevel@tonic-gate 		random = (ushort_t)gethrtime() >> 4;
37610Sstevel@tonic-gate 		off = random % lgrps_spanned;
37620Sstevel@tonic-gate 		ASSERT(off <= lgrp_alloc_max);
37630Sstevel@tonic-gate 
37640Sstevel@tonic-gate 		for (i = 0; i <= lgrp_alloc_max; i++) {
37650Sstevel@tonic-gate 			if (!klgrpset_ismember(lgrpset, i))
37660Sstevel@tonic-gate 				continue;
37670Sstevel@tonic-gate 			if (off)
37680Sstevel@tonic-gate 				off--;
37690Sstevel@tonic-gate 			else {
37700Sstevel@tonic-gate 				lgrp = lgrp_table[i];
37710Sstevel@tonic-gate 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
37720Sstevel@tonic-gate 				    1);
37730Sstevel@tonic-gate 				break;
37740Sstevel@tonic-gate 			}
37750Sstevel@tonic-gate 		}
37760Sstevel@tonic-gate 		break;
37770Sstevel@tonic-gate 
37780Sstevel@tonic-gate 	case LGRP_MEM_POLICY_RANDOM_PROC:
37790Sstevel@tonic-gate 
37800Sstevel@tonic-gate 		/*
37810Sstevel@tonic-gate 		 * Grab copy of bitmask of lgroups spanned by
37820Sstevel@tonic-gate 		 * this process
37830Sstevel@tonic-gate 		 */
37840Sstevel@tonic-gate 		klgrpset_copy(lgrpset, curproc->p_lgrpset);
37850Sstevel@tonic-gate 		stat = LGRP_NUM_RANDOM_PROC;
37860Sstevel@tonic-gate 
37870Sstevel@tonic-gate 		/* LINTED fallthrough on case statement */
37880Sstevel@tonic-gate 	case LGRP_MEM_POLICY_RANDOM_PSET:
37890Sstevel@tonic-gate 
37900Sstevel@tonic-gate 		if (!stat)
37910Sstevel@tonic-gate 			stat = LGRP_NUM_RANDOM_PSET;
37920Sstevel@tonic-gate 
37930Sstevel@tonic-gate 		if (klgrpset_isempty(lgrpset)) {
37940Sstevel@tonic-gate 			/*
37950Sstevel@tonic-gate 			 * Grab copy of bitmask of lgroups spanned by
37960Sstevel@tonic-gate 			 * this processor set
37970Sstevel@tonic-gate 			 */
37980Sstevel@tonic-gate 			kpreempt_disable();
37990Sstevel@tonic-gate 			klgrpset_copy(lgrpset,
38000Sstevel@tonic-gate 			    curthread->t_cpupart->cp_lgrpset);
38010Sstevel@tonic-gate 			kpreempt_enable();
38020Sstevel@tonic-gate 		}
38030Sstevel@tonic-gate 
38040Sstevel@tonic-gate 		/*
38050Sstevel@tonic-gate 		 * Count how many lgroups are spanned
38060Sstevel@tonic-gate 		 */
38070Sstevel@tonic-gate 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
38080Sstevel@tonic-gate 		ASSERT(lgrps_spanned <= nlgrps);
38090Sstevel@tonic-gate 
38100Sstevel@tonic-gate 		/*
38110Sstevel@tonic-gate 		 * lgrps_spanned should probably always be non-zero, but to be
38120Sstevel@tonic-gate 		 * on the safe side we return lgrp_root if it is zero.
38130Sstevel@tonic-gate 		 */
38140Sstevel@tonic-gate 		if (lgrps_spanned == 0) {
38150Sstevel@tonic-gate 			lgrp = lgrp_root;
38160Sstevel@tonic-gate 			break;
38170Sstevel@tonic-gate 		}
38180Sstevel@tonic-gate 
38190Sstevel@tonic-gate 		/*
38200Sstevel@tonic-gate 		 * Pick a random offset within lgroups spanned
38210Sstevel@tonic-gate 		 * and return lgroup at that offset
38220Sstevel@tonic-gate 		 */
38230Sstevel@tonic-gate 		random = (ushort_t)gethrtime() >> 4;
38240Sstevel@tonic-gate 		off = random % lgrps_spanned;
38250Sstevel@tonic-gate 		ASSERT(off <= lgrp_alloc_max);
38260Sstevel@tonic-gate 
38270Sstevel@tonic-gate 		for (i = 0; i <= lgrp_alloc_max; i++) {
38280Sstevel@tonic-gate 			if (!klgrpset_ismember(lgrpset, i))
38290Sstevel@tonic-gate 				continue;
38300Sstevel@tonic-gate 			if (off)
38310Sstevel@tonic-gate 				off--;
38320Sstevel@tonic-gate 			else {
38330Sstevel@tonic-gate 				lgrp = lgrp_table[i];
38340Sstevel@tonic-gate 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
38350Sstevel@tonic-gate 				    1);
38360Sstevel@tonic-gate 				break;
38370Sstevel@tonic-gate 			}
38380Sstevel@tonic-gate 		}
38390Sstevel@tonic-gate 		break;
38400Sstevel@tonic-gate 
38410Sstevel@tonic-gate 	case LGRP_MEM_POLICY_ROUNDROBIN:
38420Sstevel@tonic-gate 
38430Sstevel@tonic-gate 		/*
38440Sstevel@tonic-gate 		 * Use the offset within the segment to determine
38450Sstevel@tonic-gate 		 * how far past the home lgroup to walk when choosing
38460Sstevel@tonic-gate 		 * the next lgroup to allocate memory from
38470Sstevel@tonic-gate 		 */
38480Sstevel@tonic-gate 		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
38490Sstevel@tonic-gate 		    (lgrp_alloc_max + 1);
38500Sstevel@tonic-gate 
38510Sstevel@tonic-gate 		kpreempt_disable();
38520Sstevel@tonic-gate 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
38530Sstevel@tonic-gate 		i = lgrp->lgrp_id;
38540Sstevel@tonic-gate 		kpreempt_enable();
38550Sstevel@tonic-gate 
38560Sstevel@tonic-gate 		while (off > 0) {
38570Sstevel@tonic-gate 			i = (i + 1) % (lgrp_alloc_max + 1);
38580Sstevel@tonic-gate 			lgrp = lgrp_table[i];
38590Sstevel@tonic-gate 			if (klgrpset_ismember(lgrpset, i))
38600Sstevel@tonic-gate 				off--;
38610Sstevel@tonic-gate 		}
38620Sstevel@tonic-gate 		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
38630Sstevel@tonic-gate 
38640Sstevel@tonic-gate 		break;
38650Sstevel@tonic-gate 	}
38660Sstevel@tonic-gate 
38670Sstevel@tonic-gate 	ASSERT(lgrp != NULL);
38680Sstevel@tonic-gate 	return (lgrp);
38690Sstevel@tonic-gate }
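
/*
 * Worked examples (illustrative) for the selection loops above:
 *
 * Random: if the spanned set holds lgroups {1, 3, 4}, lgrps_spanned is
 * 3.  With random % 3 == 1, the scan skips one member (lgroup 1) and
 * returns lgroup 3, the second member in lgroup ID order.
 *
 * Round-robin: with pgsz = 0x2000 (8K), vaddr - s_base = 0x28000, and
 * lgrp_alloc_max + 1 = 4, off = (0x28000 / 0x2000) % 4 = 20 % 4 = 0,
 * so the home lgroup itself is used; off = 1 would advance to the next
 * lgroup ID (mod 4) that is in the root's memory resource set.
 */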
38700Sstevel@tonic-gate 
38710Sstevel@tonic-gate /*
38720Sstevel@tonic-gate  * Return the number of pages in an lgroup
38730Sstevel@tonic-gate  *
38740Sstevel@tonic-gate  * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
38750Sstevel@tonic-gate  *	 could cause tests that rely on the numat driver to fail....
38760Sstevel@tonic-gate  */
38770Sstevel@tonic-gate pgcnt_t
38780Sstevel@tonic-gate lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
38790Sstevel@tonic-gate {
38800Sstevel@tonic-gate 	lgrp_t *lgrp;
38810Sstevel@tonic-gate 
38820Sstevel@tonic-gate 	lgrp = lgrp_table[lgrpid];
38830Sstevel@tonic-gate 	if (!LGRP_EXISTS(lgrp) ||
38840Sstevel@tonic-gate 	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
38850Sstevel@tonic-gate 	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
38860Sstevel@tonic-gate 		return (0);
38870Sstevel@tonic-gate 
38880Sstevel@tonic-gate 	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
38890Sstevel@tonic-gate }
38900Sstevel@tonic-gate 
38910Sstevel@tonic-gate /*
38920Sstevel@tonic-gate  * Initialize lgroup shared memory allocation policy support
38930Sstevel@tonic-gate  */
38940Sstevel@tonic-gate void
38950Sstevel@tonic-gate lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
38960Sstevel@tonic-gate {
38970Sstevel@tonic-gate 	lgrp_shm_locality_t	*shm_locality;
38980Sstevel@tonic-gate 
38990Sstevel@tonic-gate 	/*
39000Sstevel@tonic-gate 	 * Initialize locality field in anon_map
39010Sstevel@tonic-gate 	 * Don't need any locks because this is called when anon_map is
39020Sstevel@tonic-gate 	 * allocated, but not used anywhere yet.
39030Sstevel@tonic-gate 	 */
39040Sstevel@tonic-gate 	if (amp) {
39050Sstevel@tonic-gate 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
39060Sstevel@tonic-gate 		if (amp->locality == NULL) {
39070Sstevel@tonic-gate 			/*
39080Sstevel@tonic-gate 			 * Allocate and initialize shared memory locality info
39090Sstevel@tonic-gate 			 * and set anon_map locality pointer to it
39100Sstevel@tonic-gate 			 * Drop lock across kmem_alloc(KM_SLEEP)
39110Sstevel@tonic-gate 			 */
39120Sstevel@tonic-gate 			ANON_LOCK_EXIT(&amp->a_rwlock);
39130Sstevel@tonic-gate 			shm_locality = kmem_alloc(sizeof (*shm_locality),
39140Sstevel@tonic-gate 			    KM_SLEEP);
39150Sstevel@tonic-gate 			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
39160Sstevel@tonic-gate 			    NULL);
39170Sstevel@tonic-gate 			shm_locality->loc_count = 1;	/* not used for amp */
39180Sstevel@tonic-gate 			shm_locality->loc_tree = NULL;
39190Sstevel@tonic-gate 
39200Sstevel@tonic-gate 			/*
39210Sstevel@tonic-gate 			 * Reacquire lock and check to see whether anyone beat
39220Sstevel@tonic-gate 			 * us to initializing the locality info
39230Sstevel@tonic-gate 			 */
39240Sstevel@tonic-gate 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
39250Sstevel@tonic-gate 			if (amp->locality != NULL) {
39260Sstevel@tonic-gate 				rw_destroy(&shm_locality->loc_lock);
39270Sstevel@tonic-gate 				kmem_free(shm_locality,
39280Sstevel@tonic-gate 				    sizeof (*shm_locality));
39290Sstevel@tonic-gate 			} else
39300Sstevel@tonic-gate 				amp->locality = shm_locality;
39310Sstevel@tonic-gate 		}
39320Sstevel@tonic-gate 		ANON_LOCK_EXIT(&amp->a_rwlock);
39330Sstevel@tonic-gate 		return;
39340Sstevel@tonic-gate 	}
39350Sstevel@tonic-gate 
39360Sstevel@tonic-gate 	/*
39370Sstevel@tonic-gate 	 * Allocate shared vnode policy info if vnode is not locality aware yet
39380Sstevel@tonic-gate 	 */
39390Sstevel@tonic-gate 	mutex_enter(&vp->v_lock);
39400Sstevel@tonic-gate 	if ((vp->v_flag & V_LOCALITY) == 0) {
39410Sstevel@tonic-gate 		/*
39420Sstevel@tonic-gate 		 * Allocate and initialize shared memory locality info
39430Sstevel@tonic-gate 		 */
39440Sstevel@tonic-gate 		mutex_exit(&vp->v_lock);
39450Sstevel@tonic-gate 		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
39460Sstevel@tonic-gate 		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
39470Sstevel@tonic-gate 		shm_locality->loc_count = 1;
39480Sstevel@tonic-gate 		shm_locality->loc_tree = NULL;
39490Sstevel@tonic-gate 
39500Sstevel@tonic-gate 		/*
39510Sstevel@tonic-gate 		 * Point vnode locality field at shared vnode policy info
39520Sstevel@tonic-gate 		 * and set locality aware flag in vnode
39530Sstevel@tonic-gate 		 */
39540Sstevel@tonic-gate 		mutex_enter(&vp->v_lock);
39550Sstevel@tonic-gate 		if ((vp->v_flag & V_LOCALITY) == 0) {
39560Sstevel@tonic-gate 			vp->v_locality = shm_locality;
39570Sstevel@tonic-gate 			vp->v_flag |= V_LOCALITY;
39580Sstevel@tonic-gate 		} else {
39590Sstevel@tonic-gate 			/*
39600Sstevel@tonic-gate 			 * Lost race so free locality info and increment count.
39610Sstevel@tonic-gate 			 */
39620Sstevel@tonic-gate 			rw_destroy(&shm_locality->loc_lock);
39630Sstevel@tonic-gate 			kmem_free(shm_locality, sizeof (*shm_locality));
39640Sstevel@tonic-gate 			shm_locality = vp->v_locality;
39650Sstevel@tonic-gate 			shm_locality->loc_count++;
39660Sstevel@tonic-gate 		}
39670Sstevel@tonic-gate 		mutex_exit(&vp->v_lock);
39680Sstevel@tonic-gate 
39690Sstevel@tonic-gate 		return;
39700Sstevel@tonic-gate 	}
39710Sstevel@tonic-gate 
39720Sstevel@tonic-gate 	/*
39730Sstevel@tonic-gate 	 * Increment reference count of number of segments mapping this vnode
39740Sstevel@tonic-gate 	 * shared
39750Sstevel@tonic-gate 	 */
39760Sstevel@tonic-gate 	shm_locality = vp->v_locality;
39770Sstevel@tonic-gate 	shm_locality->loc_count++;
39780Sstevel@tonic-gate 	mutex_exit(&vp->v_lock);
39790Sstevel@tonic-gate }
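
/*
 * Note on the pattern used in both paths above: the lock is dropped
 * across kmem_alloc(KM_SLEEP) so a potentially long sleeping allocation
 * isn't done while holding it, then retaken to re-check whether another
 * thread installed locality info in the meantime.  The loser of that
 * race frees its copy, so at most one locality struct is ever published.
 */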
39800Sstevel@tonic-gate 
39810Sstevel@tonic-gate /*
39820Sstevel@tonic-gate  * Destroy the given shared memory policy segment tree
39830Sstevel@tonic-gate  */
39840Sstevel@tonic-gate void
39850Sstevel@tonic-gate lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
39860Sstevel@tonic-gate {
39870Sstevel@tonic-gate 	lgrp_shm_policy_seg_t	*cur;
39880Sstevel@tonic-gate 	lgrp_shm_policy_seg_t	*next;
39890Sstevel@tonic-gate 
39900Sstevel@tonic-gate 	if (tree == NULL)
39910Sstevel@tonic-gate 		return;
39920Sstevel@tonic-gate 
39930Sstevel@tonic-gate 	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
39940Sstevel@tonic-gate 	while (cur != NULL) {
39950Sstevel@tonic-gate 		next = AVL_NEXT(tree, cur);
39960Sstevel@tonic-gate 		avl_remove(tree, cur);
39970Sstevel@tonic-gate 		kmem_free(cur, sizeof (*cur));
39980Sstevel@tonic-gate 		cur = next;
39990Sstevel@tonic-gate 	}
40000Sstevel@tonic-gate 	kmem_free(tree, sizeof (avl_tree_t));
40010Sstevel@tonic-gate }
40020Sstevel@tonic-gate 
40030Sstevel@tonic-gate /*
40040Sstevel@tonic-gate  * Uninitialize lgroup shared memory allocation policy support
40050Sstevel@tonic-gate  */
40060Sstevel@tonic-gate void
40070Sstevel@tonic-gate lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
40080Sstevel@tonic-gate {
40090Sstevel@tonic-gate 	lgrp_shm_locality_t	*shm_locality;
40100Sstevel@tonic-gate 
40110Sstevel@tonic-gate 	/*
40120Sstevel@tonic-gate 	 * For anon_map, deallocate shared memory policy tree and
40130Sstevel@tonic-gate 	 * zero locality field
40140Sstevel@tonic-gate 	 * Don't need any locks because anon_map is being freed
40150Sstevel@tonic-gate 	 */
40160Sstevel@tonic-gate 	if (amp) {
40170Sstevel@tonic-gate 		if (amp->locality == NULL)
40180Sstevel@tonic-gate 			return;
40190Sstevel@tonic-gate 		shm_locality = amp->locality;
40200Sstevel@tonic-gate 		shm_locality->loc_count = 0;	/* not really used for amp */
40210Sstevel@tonic-gate 		rw_destroy(&shm_locality->loc_lock);
40220Sstevel@tonic-gate 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
40230Sstevel@tonic-gate 		kmem_free(shm_locality, sizeof (*shm_locality));
40240Sstevel@tonic-gate 		amp->locality = 0;
40250Sstevel@tonic-gate 		return;
40260Sstevel@tonic-gate 	}
40270Sstevel@tonic-gate 
40280Sstevel@tonic-gate 	/*
40290Sstevel@tonic-gate 	 * For vnode, decrement reference count of segments mapping this vnode
40300Sstevel@tonic-gate 	 * shared and delete locality info if reference count drops to 0
40310Sstevel@tonic-gate 	 */
40320Sstevel@tonic-gate 	mutex_enter(&vp->v_lock);
40330Sstevel@tonic-gate 	shm_locality = vp->v_locality;
40340Sstevel@tonic-gate 	shm_locality->loc_count--;
40350Sstevel@tonic-gate 
40360Sstevel@tonic-gate 	if (shm_locality->loc_count == 0) {
40370Sstevel@tonic-gate 		rw_destroy(&shm_locality->loc_lock);
40380Sstevel@tonic-gate 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
40390Sstevel@tonic-gate 		kmem_free(shm_locality, sizeof (*shm_locality));
40400Sstevel@tonic-gate 		vp->v_locality = 0;
40410Sstevel@tonic-gate 		vp->v_flag &= ~V_LOCALITY;
40420Sstevel@tonic-gate 	}
40430Sstevel@tonic-gate 	mutex_exit(&vp->v_lock);
40440Sstevel@tonic-gate }
40450Sstevel@tonic-gate 
40460Sstevel@tonic-gate /*
40470Sstevel@tonic-gate  * Compare two shared memory policy segments
40480Sstevel@tonic-gate  * Used by AVL tree code for searching
40490Sstevel@tonic-gate  */
40500Sstevel@tonic-gate int
40510Sstevel@tonic-gate lgrp_shm_policy_compar(const void *x, const void *y)
40520Sstevel@tonic-gate {
40530Sstevel@tonic-gate 	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
40540Sstevel@tonic-gate 	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
40550Sstevel@tonic-gate 
40560Sstevel@tonic-gate 	if (a->shm_off < b->shm_off)
40570Sstevel@tonic-gate 		return (-1);
40580Sstevel@tonic-gate 	if (a->shm_off >= b->shm_off + b->shm_size)
40590Sstevel@tonic-gate 		return (1);
40600Sstevel@tonic-gate 	return (0);
40610Sstevel@tonic-gate }
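
/*
 * Two properties of this comparator are worth spelling out (a sketch,
 * assuming shm_off is the first member of lgrp_shm_policy_seg_t):
 *
 * 1. Range matching: a key compares equal to a segment when it falls
 *    anywhere in [shm_off, shm_off + shm_size), not just at the start.
 *    For a segment at off 0x2000 with size 0x3000:
 *
 *	key 0x1000 -> -1, key 0x3000 -> 0 (match), key 0x5000 -> 1
 *
 * 2. Bare-offset keys: callers hand avl_find() a pointer to a plain
 *    u_offset_t rather than a full segment.  That works because only
 *    a->shm_off is ever read from the search key, never a->shm_size.
 */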
40620Sstevel@tonic-gate 
40630Sstevel@tonic-gate /*
40640Sstevel@tonic-gate  * Concatenate seg1 with seg2 and remove seg2
40650Sstevel@tonic-gate  */
40660Sstevel@tonic-gate static int
40670Sstevel@tonic-gate lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
40680Sstevel@tonic-gate     lgrp_shm_policy_seg_t *seg2)
40690Sstevel@tonic-gate {
40700Sstevel@tonic-gate 	if (!seg1 || !seg2 ||
40710Sstevel@tonic-gate 	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
40720Sstevel@tonic-gate 	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
40730Sstevel@tonic-gate 		return (-1);
40740Sstevel@tonic-gate 
40750Sstevel@tonic-gate 	seg1->shm_size += seg2->shm_size;
40760Sstevel@tonic-gate 	avl_remove(tree, seg2);
40770Sstevel@tonic-gate 	kmem_free(seg2, sizeof (*seg2));
40780Sstevel@tonic-gate 	return (0);
40790Sstevel@tonic-gate }
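
/*
 * Example (illustrative): given adjacent segments with the same policy,
 *
 *	seg1: off 0x0000, size 0x2000, policy NEXT
 *	seg2: off 0x2000, size 0x1000, policy NEXT
 *
 * concatenation grows seg1 to size 0x3000 and frees seg2.  A gap
 * between the segments or a policy mismatch returns -1 and leaves
 * both segments untouched.
 */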
40800Sstevel@tonic-gate 
40810Sstevel@tonic-gate /*
40820Sstevel@tonic-gate  * Split segment at given offset and return rightmost (uppermost) segment
40830Sstevel@tonic-gate  * Assumes that there are no overlapping segments
40840Sstevel@tonic-gate  */
40850Sstevel@tonic-gate static lgrp_shm_policy_seg_t *
40860Sstevel@tonic-gate lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
40870Sstevel@tonic-gate     u_offset_t off)
40880Sstevel@tonic-gate {
40890Sstevel@tonic-gate 	lgrp_shm_policy_seg_t	*newseg;
40900Sstevel@tonic-gate 	avl_index_t		where;
40910Sstevel@tonic-gate 
40920Sstevel@tonic-gate 	ASSERT(seg != NULL);
40930Sstevel@tonic-gate 	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
40940Sstevel@tonic-gate 
40950Sstevel@tonic-gate 	if (!seg || off < seg->shm_off || off > seg->shm_off +
40960Sstevel@tonic-gate 	    seg->shm_size)
40970Sstevel@tonic-gate 		return (NULL);
40980Sstevel@tonic-gate 
40990Sstevel@tonic-gate 	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
41000Sstevel@tonic-gate 		return (seg);
41010Sstevel@tonic-gate 
41020Sstevel@tonic-gate 	/*
41030Sstevel@tonic-gate 	 * Adjust size of left segment and allocate new (right) segment
41040Sstevel@tonic-gate 	 */
41050Sstevel@tonic-gate 	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
41060Sstevel@tonic-gate 	newseg->shm_policy = seg->shm_policy;
41070Sstevel@tonic-gate 	newseg->shm_off = off;
41080Sstevel@tonic-gate 	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
41090Sstevel@tonic-gate 	seg->shm_size = off - seg->shm_off;
41100Sstevel@tonic-gate 
41110Sstevel@tonic-gate 	/*
41120Sstevel@tonic-gate 	 * Find where to insert new segment in AVL tree and insert it
41130Sstevel@tonic-gate 	 */
41140Sstevel@tonic-gate 	(void) avl_find(tree, &off, &where);
41150Sstevel@tonic-gate 	avl_insert(tree, newseg, where);
41160Sstevel@tonic-gate 
41170Sstevel@tonic-gate 	return (newseg);
41180Sstevel@tonic-gate }
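
/*
 * Example (illustrative): splitting a segment covering [0x1000, 0x5000)
 * at offset 0x2000 shrinks the original to [0x1000, 0x2000) and inserts
 * and returns a new segment covering [0x2000, 0x5000) that inherits the
 * original policy.  Splitting exactly at either boundary is a no-op
 * that just returns the segment passed in.
 */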
41190Sstevel@tonic-gate 
41200Sstevel@tonic-gate /*
41210Sstevel@tonic-gate  * Set shared memory allocation policy on specified shared object at given
41220Sstevel@tonic-gate  * offset and length
41230Sstevel@tonic-gate  *
41240Sstevel@tonic-gate  * Return 0 if policy wasn't set already, 1 if policy was set already, and
41250Sstevel@tonic-gate  * -1 if can't set policy.
41260Sstevel@tonic-gate  */
41270Sstevel@tonic-gate int
41280Sstevel@tonic-gate lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
41290Sstevel@tonic-gate     ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
41300Sstevel@tonic-gate {
41310Sstevel@tonic-gate 	u_offset_t		eoff;
41320Sstevel@tonic-gate 	lgrp_shm_policy_seg_t	*next;
41330Sstevel@tonic-gate 	lgrp_shm_policy_seg_t	*newseg;
41340Sstevel@tonic-gate 	u_offset_t		off;
41350Sstevel@tonic-gate 	u_offset_t		oldeoff;
41360Sstevel@tonic-gate 	lgrp_shm_policy_seg_t	*prev;
41370Sstevel@tonic-gate 	int			retval;
41380Sstevel@tonic-gate 	lgrp_shm_policy_seg_t	*seg;
41390Sstevel@tonic-gate 	lgrp_shm_locality_t	*shm_locality;
41400Sstevel@tonic-gate 	avl_tree_t		*tree;
41410Sstevel@tonic-gate 	avl_index_t		where;
41420Sstevel@tonic-gate 
41430Sstevel@tonic-gate 	ASSERT(amp || vp);
41440Sstevel@tonic-gate 	ASSERT((len & PAGEOFFSET) == 0);
41450Sstevel@tonic-gate 
41460Sstevel@tonic-gate 	if (len == 0)
41470Sstevel@tonic-gate 		return (-1);
41480Sstevel@tonic-gate 
41490Sstevel@tonic-gate 	retval = 0;
41500Sstevel@tonic-gate 
41510Sstevel@tonic-gate 	/*
41520Sstevel@tonic-gate 	 * Get locality info and starting offset into shared object
41530Sstevel@tonic-gate 	 * Try anon map first and then vnode
41540Sstevel@tonic-gate 	 * Assume that no locks need to be held on anon_map or vnode, since
41550Sstevel@tonic-gate 	 * it should be protected by its reference count which must be nonzero
41560Sstevel@tonic-gate 	 * for an existing segment.
41570Sstevel@tonic-gate 	 */
41580Sstevel@tonic-gate 	if (amp) {
41590Sstevel@tonic-gate 		/*
41600Sstevel@tonic-gate 		 * Get policy info from anon_map
41610Sstevel@tonic-gate 		 *
41620Sstevel@tonic-gate 		 */
41630Sstevel@tonic-gate 		ASSERT(amp->refcnt != 0);
41640Sstevel@tonic-gate 		if (amp->locality == NULL)
41650Sstevel@tonic-gate 			lgrp_shm_policy_init(amp, NULL);
41660Sstevel@tonic-gate 		shm_locality = amp->locality;
41670Sstevel@tonic-gate 		off = ptob(anon_index);
41680Sstevel@tonic-gate 	} else if (vp) {
41690Sstevel@tonic-gate 		/*
41700Sstevel@tonic-gate 		 * Get policy info from vnode
41710Sstevel@tonic-gate 		 */
41720Sstevel@tonic-gate 		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
41730Sstevel@tonic-gate 			lgrp_shm_policy_init(NULL, vp);
41740Sstevel@tonic-gate 		shm_locality = vp->v_locality;
41750Sstevel@tonic-gate 		ASSERT(shm_locality->loc_count != 0);
41760Sstevel@tonic-gate 		off = vn_off;
41770Sstevel@tonic-gate 	} else
41780Sstevel@tonic-gate 		return (-1);
41790Sstevel@tonic-gate 
41800Sstevel@tonic-gate 	ASSERT((off & PAGEOFFSET) == 0);
41810Sstevel@tonic-gate 
41820Sstevel@tonic-gate 	/*
41830Sstevel@tonic-gate 	 * Figure out default policy
41840Sstevel@tonic-gate 	 */
41850Sstevel@tonic-gate 	if (policy == LGRP_MEM_POLICY_DEFAULT)
41860Sstevel@tonic-gate 		policy = lgrp_mem_policy_default(len, MAP_SHARED);
41870Sstevel@tonic-gate 
41880Sstevel@tonic-gate 	/*
41890Sstevel@tonic-gate 	 * Create AVL tree if there isn't one yet
41900Sstevel@tonic-gate 	 * and set locality field to point at it
41910Sstevel@tonic-gate 	 */
41920Sstevel@tonic-gate 	rw_enter(&shm_locality->loc_lock, RW_WRITER);
41930Sstevel@tonic-gate 	tree = shm_locality->loc_tree;
41940Sstevel@tonic-gate 	if (!tree) {
41950Sstevel@tonic-gate 		rw_exit(&shm_locality->loc_lock);
41960Sstevel@tonic-gate 
41970Sstevel@tonic-gate 		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
41980Sstevel@tonic-gate 
41990Sstevel@tonic-gate 		rw_enter(&shm_locality->loc_lock, RW_WRITER);
42000Sstevel@tonic-gate 		if (shm_locality->loc_tree == NULL) {
42010Sstevel@tonic-gate 			avl_create(tree, lgrp_shm_policy_compar,
42020Sstevel@tonic-gate 			    sizeof (lgrp_shm_policy_seg_t),
42030Sstevel@tonic-gate 			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
42040Sstevel@tonic-gate 			shm_locality->loc_tree = tree;
42050Sstevel@tonic-gate 		} else {
42060Sstevel@tonic-gate 			/*
42070Sstevel@tonic-gate 			 * Another thread managed to set up the tree
42080Sstevel@tonic-gate 			 * before we could. Free the tree we allocated
42090Sstevel@tonic-gate 			 * and use the one that's already there.
42100Sstevel@tonic-gate 			 */
42110Sstevel@tonic-gate 			kmem_free(tree, sizeof (*tree));
42120Sstevel@tonic-gate 			tree = shm_locality->loc_tree;
42130Sstevel@tonic-gate 		}
42140Sstevel@tonic-gate 	}
42150Sstevel@tonic-gate 
42160Sstevel@tonic-gate 	/*
42170Sstevel@tonic-gate 	 * Set policy
42180Sstevel@tonic-gate 	 *
42190Sstevel@tonic-gate 	 * Need to maintain hold on writer's lock to keep tree from
42200Sstevel@tonic-gate 	 * changing out from under us
42210Sstevel@tonic-gate 	 */
42220Sstevel@tonic-gate 	while (len != 0) {
42230Sstevel@tonic-gate 		/*
42240Sstevel@tonic-gate 		 * Find policy segment for specified offset into shared object
42250Sstevel@tonic-gate 		 */
42260Sstevel@tonic-gate 		seg = avl_find(tree, &off, &where);
42270Sstevel@tonic-gate 
42280Sstevel@tonic-gate 		/*
42290Sstevel@tonic-gate 		 * Didn't find any existing segment that contains specified
42300Sstevel@tonic-gate 		 * offset, so allocate new segment, insert it, and concatenate
42310Sstevel@tonic-gate 		 * with adjacent segments if possible
42320Sstevel@tonic-gate 		 */
42330Sstevel@tonic-gate 		if (seg == NULL) {
42340Sstevel@tonic-gate 			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
42350Sstevel@tonic-gate 			    KM_SLEEP);
42360Sstevel@tonic-gate 			newseg->shm_policy.mem_policy = policy;
42374426Saguzovsk 			newseg->shm_policy.mem_lgrpid = LGRP_NONE;
42380Sstevel@tonic-gate 			newseg->shm_off = off;
42390Sstevel@tonic-gate 			avl_insert(tree, newseg, where);
42400Sstevel@tonic-gate 
42410Sstevel@tonic-gate 			/*
42420Sstevel@tonic-gate 			 * Check to see whether new segment overlaps with next
42430Sstevel@tonic-gate 			 * one, set length of new segment accordingly, and
42440Sstevel@tonic-gate 			 * calculate remaining length and next offset
42450Sstevel@tonic-gate 			 */
42460Sstevel@tonic-gate 			seg = AVL_NEXT(tree, newseg);
42470Sstevel@tonic-gate 			if (seg == NULL || off + len <= seg->shm_off) {
42480Sstevel@tonic-gate 				newseg->shm_size = len;
42490Sstevel@tonic-gate 				len = 0;
42500Sstevel@tonic-gate 			} else {
42510Sstevel@tonic-gate 				newseg->shm_size = seg->shm_off - off;
42520Sstevel@tonic-gate 				off = seg->shm_off;
42530Sstevel@tonic-gate 				len -= newseg->shm_size;
42540Sstevel@tonic-gate 			}
42550Sstevel@tonic-gate 
42560Sstevel@tonic-gate 			/*
42570Sstevel@tonic-gate 			 * Try to concatenate new segment with next and
42580Sstevel@tonic-gate 			 * previous ones, since they might have the same policy
42590Sstevel@tonic-gate 			 * now.  Grab previous and next segments first because
42600Sstevel@tonic-gate 			 * they will change on concatenation.
42610Sstevel@tonic-gate 			 */
42620Sstevel@tonic-gate 			prev =  AVL_PREV(tree, newseg);
42630Sstevel@tonic-gate 			next = AVL_NEXT(tree, newseg);
42640Sstevel@tonic-gate 			(void) lgrp_shm_policy_concat(tree, newseg, next);
42650Sstevel@tonic-gate 			(void) lgrp_shm_policy_concat(tree, prev, newseg);
42660Sstevel@tonic-gate 
42670Sstevel@tonic-gate 			continue;
42680Sstevel@tonic-gate 		}
42690Sstevel@tonic-gate 
42700Sstevel@tonic-gate 		eoff = off + len;
42710Sstevel@tonic-gate 		oldeoff = seg->shm_off + seg->shm_size;
42720Sstevel@tonic-gate 
42730Sstevel@tonic-gate 		/*
42740Sstevel@tonic-gate 		 * Policy set already?
42750Sstevel@tonic-gate 		 */
42760Sstevel@tonic-gate 		if (policy == seg->shm_policy.mem_policy) {
42770Sstevel@tonic-gate 			/*
42780Sstevel@tonic-gate 			 * Nothing left to do if offset and length
42790Sstevel@tonic-gate 			 * fall within this segment
42800Sstevel@tonic-gate 			 */
42810Sstevel@tonic-gate 			if (eoff <= oldeoff) {
42820Sstevel@tonic-gate 				retval = 1;
42830Sstevel@tonic-gate 				break;
42840Sstevel@tonic-gate 			} else {
42850Sstevel@tonic-gate 				len = eoff - oldeoff;
42860Sstevel@tonic-gate 				off = oldeoff;
42870Sstevel@tonic-gate 				continue;
42880Sstevel@tonic-gate 			}
42890Sstevel@tonic-gate 		}
42900Sstevel@tonic-gate 
42910Sstevel@tonic-gate 		/*
42920Sstevel@tonic-gate 		 * Specified offset and length match existing segment exactly
42930Sstevel@tonic-gate 		 */
42940Sstevel@tonic-gate 		if (off == seg->shm_off && len == seg->shm_size) {
42950Sstevel@tonic-gate 			/*
42960Sstevel@tonic-gate 			 * Set policy and update current length
42970Sstevel@tonic-gate 			 */
42980Sstevel@tonic-gate 			seg->shm_policy.mem_policy = policy;
42994426Saguzovsk 			seg->shm_policy.mem_lgrpid = LGRP_NONE;
43000Sstevel@tonic-gate 			len = 0;
43010Sstevel@tonic-gate 
43020Sstevel@tonic-gate 			/*
43030Sstevel@tonic-gate 			 * Try concatenating new segment with previous and next
43040Sstevel@tonic-gate 			 * segments, since they might have the same policy now.
43050Sstevel@tonic-gate 			 * Grab previous and next segments first because they
43060Sstevel@tonic-gate 			 * will change on concatenation.
43070Sstevel@tonic-gate 			 */
43080Sstevel@tonic-gate 			prev =  AVL_PREV(tree, seg);
43090Sstevel@tonic-gate 			next = AVL_NEXT(tree, seg);
43100Sstevel@tonic-gate 			(void) lgrp_shm_policy_concat(tree, seg, next);
43110Sstevel@tonic-gate 			(void) lgrp_shm_policy_concat(tree, prev, seg);
43120Sstevel@tonic-gate 		} else {
43130Sstevel@tonic-gate 			/*
43140Sstevel@tonic-gate 			 * Specified offset and length only apply to part of
43150Sstevel@tonic-gate 			 * existing segment
43160Sstevel@tonic-gate 			 */
43170Sstevel@tonic-gate 
43180Sstevel@tonic-gate 			/*
43190Sstevel@tonic-gate 			 * New segment starts in middle of old one, so split
43200Sstevel@tonic-gate 			 * new one off near beginning of old one
43210Sstevel@tonic-gate 			 */
43220Sstevel@tonic-gate 			newseg = NULL;
43230Sstevel@tonic-gate 			if (off > seg->shm_off) {
43240Sstevel@tonic-gate 				newseg = lgrp_shm_policy_split(tree, seg, off);
43250Sstevel@tonic-gate 
43260Sstevel@tonic-gate 				/*
43270Sstevel@tonic-gate 				 * New segment ends where old one did, so try
43280Sstevel@tonic-gate 				 * to concatenate with next segment
43290Sstevel@tonic-gate 				 */
43300Sstevel@tonic-gate 				if (eoff == oldeoff) {
43310Sstevel@tonic-gate 					newseg->shm_policy.mem_policy = policy;
43324426Saguzovsk 					newseg->shm_policy.mem_lgrpid =
43334426Saguzovsk 					    LGRP_NONE;
43340Sstevel@tonic-gate 					(void) lgrp_shm_policy_concat(tree,
43350Sstevel@tonic-gate 					    newseg, AVL_NEXT(tree, newseg));
43360Sstevel@tonic-gate 					break;
43370Sstevel@tonic-gate 				}
43380Sstevel@tonic-gate 			}
43390Sstevel@tonic-gate 
43400Sstevel@tonic-gate 			/*
43410Sstevel@tonic-gate 			 * New segment ends before old one, so split off end of
43420Sstevel@tonic-gate 			 * old one
43430Sstevel@tonic-gate 			 */
43440Sstevel@tonic-gate 			if (eoff < oldeoff) {
43450Sstevel@tonic-gate 				if (newseg) {
43460Sstevel@tonic-gate 					(void) lgrp_shm_policy_split(tree,
43470Sstevel@tonic-gate 					    newseg, eoff);
43480Sstevel@tonic-gate 					newseg->shm_policy.mem_policy = policy;
43494426Saguzovsk 					newseg->shm_policy.mem_lgrpid =
43504426Saguzovsk 					    LGRP_NONE;
43510Sstevel@tonic-gate 				} else {
43520Sstevel@tonic-gate 					(void) lgrp_shm_policy_split(tree, seg,
43530Sstevel@tonic-gate 					    eoff);
43540Sstevel@tonic-gate 					seg->shm_policy.mem_policy = policy;
43554426Saguzovsk 					seg->shm_policy.mem_lgrpid = LGRP_NONE;
43560Sstevel@tonic-gate 				}
43570Sstevel@tonic-gate 
43580Sstevel@tonic-gate 				if (off == seg->shm_off)
43590Sstevel@tonic-gate 					(void) lgrp_shm_policy_concat(tree,
43600Sstevel@tonic-gate 					    AVL_PREV(tree, seg), seg);
43610Sstevel@tonic-gate 				break;
43620Sstevel@tonic-gate 			}
43630Sstevel@tonic-gate 
43640Sstevel@tonic-gate 			/*
43650Sstevel@tonic-gate 			 * Calculate remaining length and next offset
43660Sstevel@tonic-gate 			 */
43670Sstevel@tonic-gate 			len = eoff - oldeoff;
43680Sstevel@tonic-gate 			off = oldeoff;
43690Sstevel@tonic-gate 		}
43700Sstevel@tonic-gate 	}
43710Sstevel@tonic-gate 
43720Sstevel@tonic-gate 	rw_exit(&shm_locality->loc_lock);
43730Sstevel@tonic-gate 	return (retval);
43740Sstevel@tonic-gate }
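
/*
 * Worked example (illustrative): setting LGRP_MEM_POLICY_RANDOM over
 * [0x1000, 0x3000) when a single segment [0x0000, 0x4000) with policy
 * NEXT already covers that range:
 *
 *	1. avl_find(0x1000) hits the existing segment; the policies
 *	   differ and the request covers only part of it.
 *	2. off > shm_off, so split at 0x1000: newseg = [0x1000, 0x4000).
 *	3. eoff (0x3000) < oldeoff (0x4000), so split newseg at 0x3000
 *	   and mark [0x1000, 0x3000) with the RANDOM policy.
 *
 * The tree is left with [0x0000, 0x1000) NEXT, [0x1000, 0x3000) RANDOM,
 * and [0x3000, 0x4000) NEXT, and 0 is returned since the policy wasn't
 * already set.
 */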
43750Sstevel@tonic-gate 
43760Sstevel@tonic-gate /*
43770Sstevel@tonic-gate  * Return the best memnode from which to allocate memory given
43780Sstevel@tonic-gate  * an lgroup.
43790Sstevel@tonic-gate  *
43800Sstevel@tonic-gate  * "c" is for cookie, which is good enough for me.
43810Sstevel@tonic-gate  * It references a cookie struct that should be zero'ed to initialize.
43820Sstevel@tonic-gate  * The cookie should live on the caller's stack.
43830Sstevel@tonic-gate  *
43840Sstevel@tonic-gate  * The routine returns -1 when:
43850Sstevel@tonic-gate  *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
43860Sstevel@tonic-gate  *	- traverse is 1, and all the memnodes in the system have been
43870Sstevel@tonic-gate  *	  returned.
43880Sstevel@tonic-gate  */
43890Sstevel@tonic-gate int
43900Sstevel@tonic-gate lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
43910Sstevel@tonic-gate {
43920Sstevel@tonic-gate 	lgrp_t		*lp = c->lmc_lgrp;
43930Sstevel@tonic-gate 	mnodeset_t	nodes = c->lmc_nodes;
43940Sstevel@tonic-gate 	int		cnt = c->lmc_cnt;
43950Sstevel@tonic-gate 	int		offset, mnode;
43960Sstevel@tonic-gate 
43970Sstevel@tonic-gate 	extern int	max_mem_nodes;
43980Sstevel@tonic-gate 
43990Sstevel@tonic-gate 	/*
44000Sstevel@tonic-gate 	 * If the set is empty, and the caller is willing, traverse
44010Sstevel@tonic-gate 	 * up the hierarchy until we find a non-empty set.
44020Sstevel@tonic-gate 	 */
44030Sstevel@tonic-gate 	while (nodes == (mnodeset_t)0 || cnt <= 0) {
44040Sstevel@tonic-gate 		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
44050Sstevel@tonic-gate 		    ((lp = lp->lgrp_parent) == NULL))
44060Sstevel@tonic-gate 			return (-1);
44070Sstevel@tonic-gate 
44080Sstevel@tonic-gate 		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
44090Sstevel@tonic-gate 		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
44100Sstevel@tonic-gate 	}
44110Sstevel@tonic-gate 
44120Sstevel@tonic-gate 	/*
44130Sstevel@tonic-gate 	 * Select a memnode by picking one at a "random" offset.
44140Sstevel@tonic-gate 	 * Because of DR, memnodes can come and go at any time.
44150Sstevel@tonic-gate 	 * This code must be able to cope with the possibility
44160Sstevel@tonic-gate 	 * that the nodes count "cnt" is inconsistent with respect
44170Sstevel@tonic-gate 	 * to the number of elements actually in "nodes", and
44180Sstevel@tonic-gate 	 * therefore that the offset chosen could be greater than
44190Sstevel@tonic-gate 	 * the number of elements in the set (some memnodes may
44200Sstevel@tonic-gate 	 * have disappeared just before cnt was read).
44210Sstevel@tonic-gate 	 * If this happens, the search simply wraps back to the
44220Sstevel@tonic-gate 	 * beginning of the set.
44230Sstevel@tonic-gate 	 */
44240Sstevel@tonic-gate 	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
44250Sstevel@tonic-gate 	offset = c->lmc_rand % cnt;
44260Sstevel@tonic-gate 	do {
44270Sstevel@tonic-gate 		for (mnode = 0; mnode < max_mem_nodes; mnode++)
44280Sstevel@tonic-gate 			if (nodes & ((mnodeset_t)1 << mnode))
44290Sstevel@tonic-gate 				if (!offset--)
44300Sstevel@tonic-gate 					break;
44310Sstevel@tonic-gate 	} while (mnode >= max_mem_nodes);
44320Sstevel@tonic-gate 
44330Sstevel@tonic-gate 	/* Found a node. Store state before returning. */
44340Sstevel@tonic-gate 	c->lmc_lgrp = lp;
44350Sstevel@tonic-gate 	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
44360Sstevel@tonic-gate 	c->lmc_cnt = cnt - 1;
44370Sstevel@tonic-gate 	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
44380Sstevel@tonic-gate 	c->lmc_ntried++;
44390Sstevel@tonic-gate 
44400Sstevel@tonic-gate 	return (mnode);
44410Sstevel@tonic-gate }
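
/*
 * Usage sketch (illustrative, not from the original source): a caller
 * zeroes a cookie on its stack, fills in the fields this routine
 * consumes, and iterates until -1.  Real callers typically use an
 * initialization helper; the fields are filled in by hand here.
 *
 *	lgrp_mnode_cookie_t	c;
 *	int			mnode;
 *
 *	bzero(&c, sizeof (c));
 *	c.lmc_lgrp = lgrp;			(start at this lgroup)
 *	c.lmc_nodes = lgrp->lgrp_mnodes;	(its memnode set)
 *	c.lmc_cnt = lgrp->lgrp_nmnodes;		(...and their count)
 *	c.lmc_scope = LGRP_SRCH_LOCAL;		(don't walk up the tree)
 *	c.lmc_rand = (ushort_t)gethrtime();	(randomize first pick)
 *
 *	while ((mnode = lgrp_memnode_choose(&c)) != -1) {
 *		(try to allocate a page from mnode; each call marks
 *		the returned memnode tried and never repeats it)
 *	}
 */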
4442