1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28/*
29 * Basic NUMA support in terms of locality groups
30 *
31 * Solaris needs to know which CPUs, memory, etc. are near each other to
32 * provide good performance on NUMA machines by optimizing for locality.
33 * In order to do this, a new abstraction called a "locality group (lgroup)"
34 * has been introduced to keep track of which CPU-like and memory-like hardware
35 * resources are close to each other.  Currently, latency is the only measure
36 * used to determine how to group hardware resources into lgroups, but this
37 * does not limit the groupings to be based solely on latency.  Other factors
38 * may be used to determine the groupings in the future.
39 *
40 * Lgroups are organized into a hierarchy or topology that represents the
41 * latency topology of the machine.  There is always at least a root lgroup in
42 * the system.  It represents all the hardware resources in the machine at a
43 * latency big enough that any hardware resource can at least access any other
44 * hardware resource within that latency.  A Uniform Memory Access (UMA)
45 * machine is represented with one lgroup (the root).  In contrast, a NUMA
46 * machine is represented at least by the root lgroup and some number of leaf
47 * lgroups where the leaf lgroups contain the hardware resources within the
48 * least latency of each other and the root lgroup still contains all the
49 * resources in the machine.  Some number of intermediate lgroups may exist
50 * which represent more levels of locality than just the local latency of the
51 * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
52 * (eg. root and intermediate lgroups) contain the next nearest resources to
53 * their child lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
54 * to the root lgroup shows the hardware resources from closest to farthest
55 * from the leaf lgroup such that each successive ancestor lgroup contains
56 * the next nearest resources at the next level of locality from the previous.
57 *
58 * The kernel uses the lgroup abstraction to know how to allocate resources
59 * near a given process/thread.  At fork() and lwp/thread_create() time, a
60 * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
61 * with the lowest load average.  Binding to a processor or processor set will
62 * change the home lgroup for a thread.  The scheduler has been modified to try
63 * to dispatch a thread on a CPU in its home lgroup.  Physical memory
64 * allocation is lgroup aware too, so memory will be allocated from the current
65 * thread's home lgroup if possible.  If the desired resources are not
66 * available, the kernel traverses the lgroup hierarchy going to the parent
67 * lgroup to find resources at the next level of locality until it reaches the
68 * root lgroup.
69 */
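
/*
 * Illustrative sketch (not part of the build): walking the lgroup hierarchy
 * from a thread's home lgroup toward the root, the way the allocation
 * fallback described above proceeds to the next level of locality.  Only
 * fields and functions that appear in this file (lgrp_parent,
 * lgrp_home_lgrp()) are assumed.
 *
 *	lgrp_t *lgrp = lgrp_home_lgrp();	// start at the home lgroup
 *	while (lgrp != NULL) {
 *		// try to satisfy the request at this level of locality ...
 *		lgrp = lgrp->lgrp_parent;	// ... else widen to the parent
 *	}
 */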
70
71#include <sys/lgrp.h>
72#include <sys/lgrp_user.h>
73#include <sys/types.h>
74#include <sys/mman.h>
75#include <sys/param.h>
76#include <sys/var.h>
77#include <sys/thread.h>
78#include <sys/cpuvar.h>
79#include <sys/cpupart.h>
80#include <sys/kmem.h>
81#include <vm/seg.h>
82#include <vm/seg_kmem.h>
83#include <vm/seg_spt.h>
84#include <vm/seg_vn.h>
85#include <vm/as.h>
86#include <sys/atomic.h>
87#include <sys/systm.h>
88#include <sys/errno.h>
89#include <sys/cmn_err.h>
90#include <sys/kstat.h>
91#include <sys/sysmacros.h>
92#include <sys/pg.h>
93#include <sys/promif.h>
94#include <sys/sdt.h>
95
96lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
97lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
98				/* indexed by lgrp_id */
99int	nlgrps;			/* number of lgroups in machine */
100int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
101int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */
102
103/*
104 * Kstat data for lgroups.
105 *
106 * Actual kstat data is collected in lgrp_stats array.
107 * The lgrp_kstat_data array of named kstats is used to extract data from
108 * lgrp_stats and present it to the kstat framework. It is protected from parallel
109 * modifications by lgrp_kstat_mutex. This may cause some contention when
110 * several kstat commands run in parallel but this is not the
111 * performance-critical path.
112 */
113extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */
114
115/*
116 * Declare kstat names statically for enums as defined in the header file.
117 */
118LGRP_KSTAT_NAMES;
119
120static void	lgrp_kstat_init(void);
121static int	lgrp_kstat_extract(kstat_t *, int);
122static void	lgrp_kstat_reset(lgrp_id_t);
123
124static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
125static kmutex_t lgrp_kstat_mutex;
126
127
128/*
129 * max number of lgroups supported by the platform
130 */
131int	nlgrpsmax = 0;
132
133/*
134 * The root lgroup. Represents the set of resources at the system wide
135 * level of locality.
136 */
137lgrp_t		*lgrp_root = NULL;
138
139/*
140 * During system bootstrap cp_default does not contain the list of lgrp load
141 * averages (cp_lgrploads). The list is allocated after the first CPU is brought
142 * on-line when cp_default is initialized by cpupart_initialize_default().
143 * Configuring CPU0 may create a two-level topology with root and one leaf node
144 * containing CPU0. This topology is initially constructed in a special
145 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
146 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
147 * for all lpl operations until cp_default is fully constructed.
148 *
149 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
150 * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
151 * the first element of lpl_bootstrap_list.
152 *
153 * CPUs that are added to the system, but have not yet been assigned to an
154 * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
155 * on some architectures (x86) it's possible for the slave CPU startup thread
156 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
157 */
158#define	LPL_BOOTSTRAP_SIZE 2
159static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
160lpl_t		*lpl_bootstrap;
161
162/*
163 * If cp still references the bootstrap lpl, it has not yet been added to
164 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
165 * a thread is trying to allocate memory close to a CPU that has no lgrp.
166 */
167#define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)
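
/*
 * Illustrative sketch (not part of the build) of how the check above might be
 * used: while a CPU is still on the bootstrap lpl, a "close to this CPU"
 * request cannot be honored and a caller would fall back to the root lgroup.
 * The fallback shown here is an assumption for illustration only.
 *
 *	lgrp_t	*target;
 *
 *	if (LGRP_CPU_HAS_NO_LGRP(CPU))
 *		target = lgrp_root;		// no leaf lgroup yet
 *	else
 *		target = lgrp_table[CPU->cpu_lpl->lpl_lgrpid];
 */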
168
169static lgrp_t	lroot;
170
171/*
172 * Size, in bytes, beyond which random memory allocation policy is applied
173 * to non-shared memory.  Default is the maximum size, so random memory
174 * allocation won't be used for non-shared memory by default.
175 */
176size_t	lgrp_privm_random_thresh = (size_t)(-1);
177
178/* the maximum effect that a single thread can have on its lgroup's load */
179#define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
180	((lgrp_loadavg_max_effect) / (ncpu))
181uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
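
/*
 * Worked example (illustrative): with lgrp_loadavg_max_effect left at its
 * default of LGRP_LOADAVG_THREAD_MAX, a single thread in an lgroup with 4
 * CPUs contributes at most LGRP_LOADAVG_MAX_EFFECT(4), i.e.
 * LGRP_LOADAVG_THREAD_MAX / 4, to that lgroup's load.
 */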
182
183
184/*
185 * Size, in bytes, beyond which random memory allocation policy is applied to
186 * shared memory.  Default is 8MB (2 ISM pages).
187 */
188size_t	lgrp_shm_random_thresh = 8*1024*1024;
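
/*
 * Illustrative sketch (not part of the build): how the two thresholds above
 * might be consulted when picking a policy.  The helper and the exact
 * comparison are hypothetical; the real selection is done by the memory
 * policy code that consumes these variables (see lgrp_mem_choose() and
 * related code, not shown in this excerpt).
 *
 *	lgrp_mem_policy_t
 *	example_policy(size_t size, int is_shared)
 *	{
 *		size_t	thresh = is_shared ? lgrp_shm_random_thresh :
 *		    lgrp_privm_random_thresh;
 *
 *		if (size > thresh)
 *			return (LGRP_MEM_POLICY_RANDOM);
 *		return (lgrp_mem_default_policy);
 *	}
 */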
189
190/*
191 * Whether to do processor set aware memory allocation by default
192 */
193int	lgrp_mem_pset_aware = 0;
194
195/*
196 * Set the default memory allocation policy for root lgroup
197 */
198lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;
199
200/*
201 * Set the default memory allocation policy.  For most platforms,
202 * next touch is sufficient, but some platforms may wish to override
203 * this.
204 */
205lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
206
207
208/*
209 * lgroup CPU event handlers
210 */
211static void	lgrp_cpu_init(struct cpu *);
212static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
213static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);
214
215/*
216 * lgroup memory event handlers
217 */
218static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
219static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
220static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);
221
222/*
223 * lgroup CPU partition event handlers
224 */
225static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
226static void	lgrp_part_del_cpu(struct cpu *);
227
228static void	lgrp_root_init(void);
229
230/*
231 * lpl topology
232 */
233static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
234static void	lpl_clear(lpl_t *);
235static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
236static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
237static void	lpl_rset_add(lpl_t *, lpl_t *);
238static void	lpl_rset_del(lpl_t *, lpl_t *);
239static int	lpl_rset_contains(lpl_t *, lpl_t *);
240static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
241static void	lpl_child_update(lpl_t *, struct cpupart *);
242static int	lpl_pick(lpl_t *, lpl_t *);
243static void	lpl_verify_wrapper(struct cpupart *);
244
245/*
246 * defines for lpl topology verifier return codes
247 */
248
249#define	LPL_TOPO_CORRECT			0
250#define	LPL_TOPO_PART_HAS_NO_LPL		-1
251#define	LPL_TOPO_CPUS_NOT_EMPTY			-2
252#define	LPL_TOPO_LGRP_MISMATCH			-3
253#define	LPL_TOPO_MISSING_PARENT			-4
254#define	LPL_TOPO_PARENT_MISMATCH		-5
255#define	LPL_TOPO_BAD_CPUCNT			-6
256#define	LPL_TOPO_RSET_MISMATCH			-7
257#define	LPL_TOPO_LPL_ORPHANED			-8
258#define	LPL_TOPO_LPL_BAD_NCPU			-9
259#define	LPL_TOPO_RSET_MSSNG_LF			-10
260#define	LPL_TOPO_CPU_HAS_BAD_LPL		-11
261#define	LPL_TOPO_BOGUS_HINT			-12
262#define	LPL_TOPO_NONLEAF_HAS_CPUS		-13
263#define	LPL_TOPO_LGRP_NOT_LEAF			-14
264#define	LPL_TOPO_BAD_RSETCNT			-15
265
266/*
267 * Return whether lgroup optimizations should be enabled on this system
268 */
269int
270lgrp_optimizations(void)
271{
272	/*
273	 * System must have more than 2 lgroups to enable lgroup optimizations
274	 *
275	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
276	 * with one child lgroup containing all the resources. A 2 lgroup
277	 * system with a root lgroup directly containing CPUs or memory might
278	 * need lgroup optimizations with its child lgroup, but there
279	 * isn't such a machine for now....
280	 */
281	if (nlgrps > 2)
282		return (1);
283
284	return (0);
285}
286
287/*
288 * Build full lgroup topology
289 */
290static void
291lgrp_root_init(void)
292{
293	lgrp_handle_t	hand;
294	int		i;
295	lgrp_id_t	id;
296
297	/*
298	 * Create the "root" lgroup
299	 */
300	ASSERT(nlgrps == 0);
301	id = nlgrps++;
302
303	lgrp_root = &lroot;
304
305	lgrp_root->lgrp_cpu = NULL;
306	lgrp_root->lgrp_mnodes = 0;
307	lgrp_root->lgrp_nmnodes = 0;
308	hand = lgrp_plat_root_hand();
309	lgrp_root->lgrp_plathand = hand;
310
311	lgrp_root->lgrp_id = id;
312	lgrp_root->lgrp_cpucnt = 0;
313	lgrp_root->lgrp_childcnt = 0;
314	klgrpset_clear(lgrp_root->lgrp_children);
315	klgrpset_clear(lgrp_root->lgrp_leaves);
316	lgrp_root->lgrp_parent = NULL;
317	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
318
319	for (i = 0; i < LGRP_RSRC_COUNT; i++)
320		klgrpset_clear(lgrp_root->lgrp_set[i]);
321
322	lgrp_root->lgrp_kstat = NULL;
323
324	lgrp_table[id] = lgrp_root;
325
326	/*
327	 * Setup initial lpl list for CPU0 and initial t0 home.
328	 * The only lpl space we have so far is lpl_bootstrap. It is used for
329	 * all topology operations until cp_default is initialized at which
330	 * point t0.t_lpl will be updated.
331	 */
332	lpl_bootstrap = lpl_bootstrap_list;
333	t0.t_lpl = lpl_bootstrap;
334	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
335	lpl_bootstrap_list[1].lpl_lgrpid = 1;
336	cp_default.cp_lgrploads = lpl_bootstrap;
337}
338
339/*
340 * Initialize the lgroup framework and allow the platform to do the same
341 */
342void
343lgrp_init(void)
344{
345	/*
346	 * Initialize the platform
347	 */
348	lgrp_plat_init();
349
350	/*
351	 * Set max number of lgroups supported on this platform which must be
352	 * no more than the max number of lgroups supported by the common lgroup
353	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.)
354	 */
355	nlgrpsmax = lgrp_plat_max_lgrps();
356	ASSERT(nlgrpsmax <= NLGRPS_MAX);
357}
358
359/*
360 * Create the root and cpu0's lgroup, and set t0's home.
361 */
362void
363lgrp_setup(void)
364{
365	/*
366	 * Setup the root lgroup
367	 */
368	lgrp_root_init();
369
370	/*
371	 * Add cpu0 to an lgroup
372	 */
373	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
374	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
375}
376
377/*
378 * Lgroup initialization is split into two parts. The first part
379 * (lgrp_main_init()) is called right before start_other_cpus() in main. The
380 * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
381 * when all CPUs are brought online and all distance information is available.
382 *
383 * When lgrp_main_init() is complete, it sets lgrp_initialized.  When
384 * lgrp_main_mp_init() is complete, it sets lgrp_topo_initialized.
385 */
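
/*
 * Illustrative sketch (not part of the build) of the ordering described
 * above, as performed by the boot code in main() (schematic; argument lists
 * omitted):
 *
 *	lgrp_main_init();	// sets lgrp_initialized
 *	start_other_cpus(...);	// remaining CPUs and distances become known
 *	lgrp_main_mp_init();	// sets lgrp_topo_initialized
 */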
386
387/*
388 * true when lgrp initialization has been completed.
389 */
390int	lgrp_initialized = 0;
391
392/*
393 * True when lgrp topology is constructed.
394 */
395int	lgrp_topo_initialized = 0;
396
397/*
398 * Init routine called after startup(), /etc/system has been processed,
399 * and cpu0 has been added to an lgroup.
400 */
401void
402lgrp_main_init(void)
403{
404	cpu_t		*cp = CPU;
405	lgrp_id_t	lgrpid;
406	int		i;
407	/*
408	 * Enforce a valid lgrp_mem_default_policy
409	 */
410	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
411	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES))
412		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
413
414	/*
415	 * See if mpo should be disabled.
416	 * This may happen in the case of null proc LPA on Starcat.
417	 * The platform won't be able to detect null proc LPA until after
418	 * cpu0 and memory have already been added to lgroups.
419	 * When and if it is detected, the Starcat platform will return
420	 * a different platform handle for cpu0 which is what we check for
421	 * here. If mpo should be disabled, move cpu0 to its rightful place
422	 * (the root), and destroy the remaining lgroups. This effectively
423	 * provides a UMA lgroup topology.
424	 */
425	lgrpid = cp->cpu_lpl->lpl_lgrpid;
426	if (lgrp_table[lgrpid]->lgrp_plathand !=
427	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
428		lgrp_part_del_cpu(cp);
429		lgrp_cpu_fini(cp, lgrpid);
430
431		lgrp_cpu_init(cp);
432		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
433
434		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
435
436		/*
437		 * Destroy all lgroups except for root
438		 */
439		for (i = 0; i <= lgrp_alloc_max; i++) {
440			if (LGRP_EXISTS(lgrp_table[i]) &&
441			    lgrp_table[i] != lgrp_root)
442				lgrp_destroy(lgrp_table[i]);
443		}
444
445		/*
446		 * Fix up root to point at itself for leaves and resources
447		 * and not have any children
448		 */
449		lgrp_root->lgrp_childcnt = 0;
450		klgrpset_clear(lgrp_root->lgrp_children);
451		klgrpset_clear(lgrp_root->lgrp_leaves);
452		klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
453		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
454		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
455	}
456
457	/*
458	 * Initialize kstats framework.
459	 */
460	lgrp_kstat_init();
461	/*
462	 * cpu0 is finally where it should be, so create its lgroup's kstats
463	 */
464	mutex_enter(&cpu_lock);
465	lgrp_kstat_create(cp);
466	mutex_exit(&cpu_lock);
467
468	lgrp_plat_main_init();
469	lgrp_initialized = 1;
470}
471
472/*
473 * Finish lgrp initialization after all CPUS are brought on-line.
474 * This routine is called after start_other_cpus().
475 */
476void
477lgrp_main_mp_init(void)
478{
479	klgrpset_t changed;
480
481	/*
482	 * Update lgroup topology (if necessary)
483	 */
484	klgrpset_clear(changed);
485	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
486	lgrp_topo_initialized = 1;
487}
488
489/*
490 * Change latency of lgroup with specified lgroup platform handle (if one is
491 * given) or change all lgroups with old latency to new latency
492 */
493void
494lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime,
495    u_longlong_t newtime)
496{
497	lgrp_t		*lgrp;
498	int		i;
499
500	for (i = 0; i <= lgrp_alloc_max; i++) {
501		lgrp = lgrp_table[i];
502
503		if (!LGRP_EXISTS(lgrp))
504			continue;
505
506		if ((hand == LGRP_NULL_HANDLE &&
507		    lgrp->lgrp_latency == oldtime) ||
508		    (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand))
509			lgrp->lgrp_latency = (int)newtime;
510	}
511}
512
513/*
514 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
515 */
516void
517lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
518{
519	klgrpset_t	changed;
520	cpu_t		*cp;
521	lgrp_id_t	id;
522	int		rc;
523
524	switch (event) {
525	/*
526	 * The following (re)configuration events are common code
527	 * initiated. lgrp_plat_config() is called here to inform the
528	 * platform of the reconfiguration event.
529	 */
530	case LGRP_CONFIG_CPU_ADD:
531		cp = (cpu_t *)resource;
532
533		/*
534		 * Initialize the new CPU's lgrp related next/prev
535		 * links, and give it a bootstrap lpl so that it can
536		 * survive should it need to enter the dispatcher.
537		 */
538		cp->cpu_next_lpl = cp;
539		cp->cpu_prev_lpl = cp;
540		cp->cpu_next_lgrp = cp;
541		cp->cpu_prev_lgrp = cp;
542		cp->cpu_lpl = lpl_bootstrap;
543
544		lgrp_plat_config(event, resource);
545		atomic_add_32(&lgrp_gen, 1);
546
547		break;
548	case LGRP_CONFIG_CPU_DEL:
549		lgrp_plat_config(event, resource);
550		atomic_add_32(&lgrp_gen, 1);
551
552		break;
553	case LGRP_CONFIG_CPU_ONLINE:
554		cp = (cpu_t *)resource;
555		lgrp_cpu_init(cp);
556		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
557		rc = lpl_topo_verify(cp->cpu_part);
558		if (rc != LPL_TOPO_CORRECT) {
559			panic("lpl_topo_verify failed: %d", rc);
560		}
561		lgrp_plat_config(event, resource);
562		atomic_add_32(&lgrp_gen, 1);
563
564		break;
565	case LGRP_CONFIG_CPU_OFFLINE:
566		cp = (cpu_t *)resource;
567		id = cp->cpu_lpl->lpl_lgrpid;
568		lgrp_part_del_cpu(cp);
569		lgrp_cpu_fini(cp, id);
570		rc = lpl_topo_verify(cp->cpu_part);
571		if (rc != LPL_TOPO_CORRECT) {
572			panic("lpl_topo_verify failed: %d", rc);
573		}
574		lgrp_plat_config(event, resource);
575		atomic_add_32(&lgrp_gen, 1);
576
577		break;
578	case LGRP_CONFIG_CPUPART_ADD:
579		cp = (cpu_t *)resource;
580		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
581		rc = lpl_topo_verify(cp->cpu_part);
582		if (rc != LPL_TOPO_CORRECT) {
583			panic("lpl_topo_verify failed: %d", rc);
584		}
585		lgrp_plat_config(event, resource);
586
587		break;
588	case LGRP_CONFIG_CPUPART_DEL:
589		cp = (cpu_t *)resource;
590		lgrp_part_del_cpu((cpu_t *)resource);
591		rc = lpl_topo_verify(cp->cpu_part);
592		if (rc != LPL_TOPO_CORRECT) {
593			panic("lpl_topo_verify failed: %d", rc);
594		}
595		lgrp_plat_config(event, resource);
596
597		break;
598	/*
599	 * The following events are initiated by the memnode
600	 * subsystem.
601	 */
602	case LGRP_CONFIG_MEM_ADD:
603		lgrp_mem_init((int)resource, where, B_FALSE);
604		atomic_add_32(&lgrp_gen, 1);
605
606		break;
607	case LGRP_CONFIG_MEM_DEL:
608		lgrp_mem_fini((int)resource, where, B_FALSE);
609		atomic_add_32(&lgrp_gen, 1);
610
611		break;
612	case LGRP_CONFIG_MEM_RENAME: {
613		lgrp_config_mem_rename_t *ren_arg =
614		    (lgrp_config_mem_rename_t *)where;
615
616		lgrp_mem_rename((int)resource,
617		    ren_arg->lmem_rename_from,
618		    ren_arg->lmem_rename_to);
619		atomic_add_32(&lgrp_gen, 1);
620
621		break;
622	}
623	case LGRP_CONFIG_GEN_UPDATE:
624		atomic_add_32(&lgrp_gen, 1);
625
626		break;
627	case LGRP_CONFIG_FLATTEN:
628		if (where == 0)
629			lgrp_topo_levels = (int)resource;
630		else
631			(void) lgrp_topo_flatten(resource,
632			    lgrp_table, lgrp_alloc_max, &changed);
633
634		break;
635	/*
636	 * Update any lgroups with old latency to new latency
637	 */
638	case LGRP_CONFIG_LAT_CHANGE_ALL:
639		lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource,
640		    (u_longlong_t)where);
641
642		break;
643	/*
644	 * Update lgroup with specified lgroup platform handle to have
645	 * new latency
646	 */
647	case LGRP_CONFIG_LAT_CHANGE:
648		lgrp_latency_change((lgrp_handle_t)resource, 0,
649		    (u_longlong_t)where);
650
651		break;
652	case LGRP_CONFIG_NOP:
653
654		break;
655	default:
656		break;
657	}
658
659}
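
/*
 * Illustrative sketch (not part of the build): a typical notification of a
 * reconfiguration event, mirroring the calls made from lgrp_setup() above
 * when cpu0 is added and brought online.
 *
 *	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)cp, 0);
 *	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)cp, 0);
 */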
660
661/*
662 * Called to add lgrp info into cpu structure from cpu_add_unit;
663 * do not assume cpu is in cpu[] yet!
664 *
665 * CPUs are brought online with all other CPUs paused so we can't
666 * allocate memory or we could deadlock the system, so we rely on
667 * the platform to statically allocate as much space as we need
668 * for the lgrp structs and stats.
669 */
670static void
671lgrp_cpu_init(struct cpu *cp)
672{
673	klgrpset_t	changed;
674	int		count;
675	lgrp_handle_t	hand;
676	int		first_cpu;
677	lgrp_t		*my_lgrp;
678	lgrp_id_t	lgrpid;
679	struct cpu	*cptr;
680
681	/*
682	 * This is the first time through if the resource set
683	 * for the root lgroup is empty. After cpu0 has been
684	 * initially added to an lgroup, the root's CPU resource
685	 * set can never be empty, since the system's last CPU
686	 * cannot be offlined.
687	 */
688	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
689		/*
690		 * First time through.
691		 */
692		first_cpu = 1;
693	} else {
694		/*
695		 * If cpu0 needs to move lgroups, we may come
696		 * through here again, at which time cpu_lock won't
697		 * be held, and lgrp_initialized will be false.
698		 */
699		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
700		ASSERT(cp->cpu_part != NULL);
701		first_cpu = 0;
702	}
703
704	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
705	my_lgrp = lgrp_hand_to_lgrp(hand);
706
707	if (my_lgrp == NULL) {
708		/*
709		 * Create new lgrp and add it to lgroup topology
710		 */
711		my_lgrp = lgrp_create();
712		my_lgrp->lgrp_plathand = hand;
713		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
714		lgrpid = my_lgrp->lgrp_id;
715		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
716		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
717
718		count = 0;
719		klgrpset_clear(changed);
720		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
721		    &changed);
722		/*
723		 * May have added new intermediate lgroups, so need to add
724		 * resources other than CPUs which are added below
725		 */
726		(void) lgrp_mnode_update(changed, NULL);
727	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
728	    > 0) {
729		/*
730		 * Leaf lgroup was created, but latency wasn't available
731		 * then.  So, set latency for it and fill in rest of lgroup
732		 * topology  now that we know how far it is from other leaf
733		 * lgroups.
734		 */
735		lgrpid = my_lgrp->lgrp_id;
736		klgrpset_clear(changed);
737		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
738		    lgrpid))
739			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
740		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
741		    &changed);
742
743		/*
744		 * May have added new intermediate lgroups, so need to add
745		 * resources other than CPUs which are added below
746		 */
747		(void) lgrp_mnode_update(changed, NULL);
748	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
749	    my_lgrp->lgrp_id)) {
750		int	i;
751
752		/*
753		 * Update existing lgroup and lgroups containing it with CPU
754		 * resource
755		 */
756		lgrpid = my_lgrp->lgrp_id;
757		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
758		for (i = 0; i <= lgrp_alloc_max; i++) {
759			lgrp_t		*lgrp;
760
761			lgrp = lgrp_table[i];
762			if (!LGRP_EXISTS(lgrp) ||
763			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
764				continue;
765
766			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
767		}
768	}
769
770	lgrpid = my_lgrp->lgrp_id;
771	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
772
773	/*
774	 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
775	 * end up in lpl for lgroup 0 whether it is supposed to be in there or
776	 * not since none of the lgroup IDs in the lpls have been set yet.
777	 */
778	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
779		cp->cpu_lpl->lpl_lgrpid = lgrpid;
780
781	/*
782	 * link the CPU into the lgrp's CPU list
783	 */
784	if (my_lgrp->lgrp_cpucnt == 0) {
785		my_lgrp->lgrp_cpu = cp;
786		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
787	} else {
788		cptr = my_lgrp->lgrp_cpu;
789		cp->cpu_next_lgrp = cptr;
790		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
791		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
792		cptr->cpu_prev_lgrp = cp;
793	}
794	my_lgrp->lgrp_cpucnt++;
795}
796
797lgrp_t *
798lgrp_create(void)
799{
800	lgrp_t		*my_lgrp;
801	lgrp_id_t	lgrpid;
802	int		i;
803
804	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
805
806	/*
807	 * Find an open slot in the lgroup table and recycle unused lgroup
808	 * left there if any
809	 */
810	my_lgrp = NULL;
811	if (lgrp_alloc_hint == -1)
812		/*
813		 * Allocate from end when hint not set yet because no lgroups
814		 * have been deleted yet
815		 */
816		lgrpid = nlgrps++;
817	else {
818		/*
819		 * Start looking for next open slot from hint and leave hint
820		 * at slot allocated
821		 */
822		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
823			my_lgrp = lgrp_table[i];
824			if (!LGRP_EXISTS(my_lgrp)) {
825				lgrpid = i;
826				nlgrps++;
827				break;
828			}
829		}
830		lgrp_alloc_hint = lgrpid;
831	}
832
833	/*
834	 * Keep track of max lgroup ID allocated so far to cut down on searches
835	 */
836	if (lgrpid > lgrp_alloc_max)
837		lgrp_alloc_max = lgrpid;
838
839	/*
840	 * Need to allocate new lgroup if next open slot didn't have one
841	 * for recycling
842	 */
843	if (my_lgrp == NULL)
844		my_lgrp = lgrp_plat_alloc(lgrpid);
845
846	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
847		panic("Too many lgrps for platform (%d)", nlgrps);
848
849	my_lgrp->lgrp_id = lgrpid;
850	my_lgrp->lgrp_latency = 0;
851	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
852	my_lgrp->lgrp_parent = NULL;
853	my_lgrp->lgrp_childcnt = 0;
854	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
855	my_lgrp->lgrp_nmnodes = 0;
856	klgrpset_clear(my_lgrp->lgrp_children);
857	klgrpset_clear(my_lgrp->lgrp_leaves);
858	for (i = 0; i < LGRP_RSRC_COUNT; i++)
859		klgrpset_clear(my_lgrp->lgrp_set[i]);
860
861	my_lgrp->lgrp_cpu = NULL;
862	my_lgrp->lgrp_cpucnt = 0;
863
864	if (my_lgrp->lgrp_kstat != NULL)
865		lgrp_kstat_reset(lgrpid);
866
867	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
868
869	return (my_lgrp);
870}
871
872void
873lgrp_destroy(lgrp_t *lgrp)
874{
875	int		i;
876
877	/*
878	 * Unless this lgroup is being destroyed on behalf of
879	 * the boot CPU, cpu_lock must be held
880	 */
881	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
882
883	if (nlgrps == 1)
884		cmn_err(CE_PANIC, "Can't destroy only lgroup!");
885
886	if (!LGRP_EXISTS(lgrp))
887		return;
888
889	/*
890	 * Set hint to lgroup being deleted and try to keep lower numbered
891	 * hints to facilitate finding empty slots
892	 */
893	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
894		lgrp_alloc_hint = lgrp->lgrp_id;
895
896	/*
897	 * Mark this lgroup to be recycled by setting its lgroup ID to
898	 * LGRP_NONE and clear relevant fields
899	 */
900	lgrp->lgrp_id = LGRP_NONE;
901	lgrp->lgrp_latency = 0;
902	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
903	lgrp->lgrp_parent = NULL;
904	lgrp->lgrp_childcnt = 0;
905
906	klgrpset_clear(lgrp->lgrp_children);
907	klgrpset_clear(lgrp->lgrp_leaves);
908	for (i = 0; i < LGRP_RSRC_COUNT; i++)
909		klgrpset_clear(lgrp->lgrp_set[i]);
910
911	lgrp->lgrp_mnodes = (mnodeset_t)0;
912	lgrp->lgrp_nmnodes = 0;
913
914	lgrp->lgrp_cpu = NULL;
915	lgrp->lgrp_cpucnt = 0;
916
917	nlgrps--;
918}
919
920/*
921 * Initialize kstat data. Called from lgrp initialization code.
922 */
923static void
924lgrp_kstat_init(void)
925{
926	lgrp_stat_t	stat;
927
928	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
929
930	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
931		kstat_named_init(&lgrp_kstat_data[stat],
932		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
933}
934
935/*
936 * initialize an lgrp's kstats if needed
937 * called with cpu_lock held but not with cpus paused.
938 * we don't tear these down now because we don't know about
939 * memory leaving the lgrp yet...
940 */
941
942void
943lgrp_kstat_create(cpu_t *cp)
944{
945	kstat_t		*lgrp_kstat;
946	lgrp_id_t	lgrpid;
947	lgrp_t		*my_lgrp;
948
949	ASSERT(MUTEX_HELD(&cpu_lock));
950
951	lgrpid = cp->cpu_lpl->lpl_lgrpid;
952	my_lgrp = lgrp_table[lgrpid];
953
954	if (my_lgrp->lgrp_kstat != NULL)
955		return; /* already initialized */
956
957	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
958	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
959	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
960
961	if (lgrp_kstat != NULL) {
962		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
963		lgrp_kstat->ks_private = my_lgrp;
964		lgrp_kstat->ks_data = &lgrp_kstat_data;
965		lgrp_kstat->ks_update = lgrp_kstat_extract;
966		my_lgrp->lgrp_kstat = lgrp_kstat;
967		kstat_install(lgrp_kstat);
968	}
969}
970
971/*
972 * this will do something when we manage to remove now unused lgrps
973 */
974
975/* ARGSUSED */
976void
977lgrp_kstat_destroy(cpu_t *cp)
978{
979	ASSERT(MUTEX_HELD(&cpu_lock));
980}
981
982/*
983 * Called when a CPU is off-lined.
984 */
985static void
986lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
987{
988	lgrp_t *my_lgrp;
989	struct cpu *prev;
990	struct cpu *next;
991
992	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
993
994	prev = cp->cpu_prev_lgrp;
995	next = cp->cpu_next_lgrp;
996
997	prev->cpu_next_lgrp = next;
998	next->cpu_prev_lgrp = prev;
999
1000	/*
1001	 * just because I'm paranoid doesn't mean...
1002	 */
1003
1004	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
1005
1006	my_lgrp = lgrp_table[lgrpid];
1007	my_lgrp->lgrp_cpucnt--;
1008
1009	/*
1010	 * Removing last CPU in lgroup, so update lgroup topology
1011	 */
1012	if (my_lgrp->lgrp_cpucnt == 0) {
1013		klgrpset_t	changed;
1014		int		count;
1015		int		i;
1016
1017		my_lgrp->lgrp_cpu = NULL;
1018
1019		/*
1020		 * Remove this lgroup from its lgroup CPU resources and remove
1021		 * lgroup from lgroup topology if it doesn't have any more
1022		 * resources in it now
1023		 */
1024		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1025		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1026			count = 0;
1027			klgrpset_clear(changed);
1028			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
1029			    lgrp_alloc_max + 1, &changed);
1030			return;
1031		}
1032
1033		/*
1034		 * This lgroup isn't empty, so just remove it from CPU
1035		 * resources of any lgroups that contain it as such
1036		 */
1037		for (i = 0; i <= lgrp_alloc_max; i++) {
1038			lgrp_t		*lgrp;
1039
1040			lgrp = lgrp_table[i];
1041			if (!LGRP_EXISTS(lgrp) ||
1042			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
1043			    lgrpid))
1044				continue;
1045
1046			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1047		}
1048		return;
1049	}
1050
1051	if (my_lgrp->lgrp_cpu == cp)
1052		my_lgrp->lgrp_cpu = next;
1053
1054}
1055
1056/*
1057 * Update memory nodes in target lgroups and return ones that get changed
1058 */
1059int
1060lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
1061{
1062	int	count;
1063	int	i;
1064	int	j;
1065	lgrp_t	*lgrp;
1066	lgrp_t	*lgrp_rsrc;
1067
1068	count = 0;
1069	if (changed)
1070		klgrpset_clear(*changed);
1071
1072	if (klgrpset_isempty(target))
1073		return (0);
1074
1075	/*
1076	 * Find each lgroup in target lgroups
1077	 */
1078	for (i = 0; i <= lgrp_alloc_max; i++) {
1079		/*
1080		 * Skip any lgroups that don't exist or aren't in target group
1081		 */
1082		lgrp = lgrp_table[i];
1083		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
1084			continue;
1085		}
1086
1087		/*
1088		 * Initialize memnodes for intermediate lgroups to 0
1089		 * and update them from scratch since they may have completely
1090		 * changed
1091		 */
1092		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
1093			lgrp->lgrp_mnodes = (mnodeset_t)0;
1094			lgrp->lgrp_nmnodes = 0;
1095		}
1096
1097		/*
1098		 * Update memory nodes of target lgroup with memory nodes
1099		 * from each lgroup in its lgroup memory resource set
1100		 */
1101		for (j = 0; j <= lgrp_alloc_max; j++) {
1102			int	k;
1103
1104			/*
1105			 * Skip any lgroups that don't exist or aren't in
1106			 * memory resources of target lgroup
1107			 */
1108			lgrp_rsrc = lgrp_table[j];
1109			if (!LGRP_EXISTS(lgrp_rsrc) ||
1110			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1111			    j))
1112				continue;
1113
1114			/*
1115			 * Update target lgroup's memnodes to include memnodes
1116			 * of this lgroup
1117			 */
1118			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
1119				mnodeset_t	mnode_mask;
1120
1121				mnode_mask = (mnodeset_t)1 << k;
1122				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
1123				    !(lgrp->lgrp_mnodes & mnode_mask)) {
1124					lgrp->lgrp_mnodes |= mnode_mask;
1125					lgrp->lgrp_nmnodes++;
1126				}
1127			}
1128			count++;
1129			if (changed)
1130				klgrpset_add(*changed, lgrp->lgrp_id);
1131		}
1132	}
1133
1134	return (count);
1135}
1136
1137/*
1138 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
1139 * is moved from one board to another. The "from" and "to" arguments specify the
1140 * source and the destination of the move.
1141 *
1142 * See plat_lgrp_config() for a detailed description of the copy-rename
1143 * semantics.
1144 *
1145 * The lgrp_mem_rename() is called by the platform copy-rename code to update
1146 * the lgroup topology which is changing as memory moves from one lgroup to
1147 * another. It removes the mnode from the source lgroup and re-inserts it in the
1148 * target lgroup.
1149 *
1150 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
1151 * lgrp_mem_fini() indicating that the insertion and deletion are part of a DR
1152 * copy-rename operation.
1153 *
1154 * There is one case which requires special handling. If the system contains
1155 * only two boards (mnodes), lgrp_mem_fini() removes the only mnode from the
1156 * lgroup hierarchy. This mnode is soon re-inserted in the hierarchy by
1157 * lgrp_mem_init(), but there is a window when the system has no memory in the
1158 * lgroup hierarchy. If another thread tries to allocate memory during this
1159 * window, the allocation will fail, although the system has physical memory.
1160 * This may cause a system panic or a deadlock (some sleeping memory allocations
1161 * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
1162 * the mnode back).
1163 *
1164 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
1165 * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
1166 * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
1167 * but it updates the rest of the lgroup topology as if the mnode was actually
1168 * removed. The lgrp_mem_init() function recognizes that the mnode being
1169 * inserted represents such a special case and updates the topology
1170 * appropriately.
1171 */
1172void
1173lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
1174{
1175	/*
1176	 * Remove the memory from the source node and add it to the destination
1177	 * node.
1178	 */
1179	lgrp_mem_fini(mnode, from, B_TRUE);
1180	lgrp_mem_init(mnode, to, B_TRUE);
1181}
1182
1183/*
1184 * Called to indicate that the lgrp with platform handle "hand" now
1185 * contains the memory identified by "mnode".
1186 *
1187 * LOCKING for this routine is a bit tricky. Usually it is called without
1188 * cpu_lock and it must grab cpu_lock here to prevent racing with other
1189 * callers. During DR of the board containing the caged memory it may be called
1190 * with cpu_lock already held and CPUs paused.
1191 *
1192 * If the insertion is part of the DR copy-rename and the inserted mnode (and
1193 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
1194 * dealing with the special case of DR copy-rename described in
1195 * lgrp_mem_rename().
1196 */
1197void
1198lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1199{
1200	klgrpset_t	changed;
1201	int		count;
1202	int		i;
1203	lgrp_t		*my_lgrp;
1204	lgrp_id_t	lgrpid;
1205	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
1206	boolean_t	drop_lock = B_FALSE;
1207	boolean_t	need_synch = B_FALSE;
1208
1209	/*
1210	 * Grab CPU lock (if we haven't already)
1211	 */
1212	if (!MUTEX_HELD(&cpu_lock)) {
1213		mutex_enter(&cpu_lock);
1214		drop_lock = B_TRUE;
1215	}
1216
1217	/*
1218	 * This routine may be called from a context where we already
1219	 * hold cpu_lock, and have already paused cpus.
1220	 */
1221	if (!cpus_paused())
1222		need_synch = B_TRUE;
1223
1224	/*
1225	 * Check if this mnode is already configured and return immediately if
1226	 * it is.
1227	 *
1228	 * NOTE: in the special case of copy-rename of the only remaining mnode,
1229	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
1230	 * recognize this case and continue as usual, but skip the update to
1231	 * the lgrp_mnodes and the lgrp_nmnodes. This resolves the inconsistency
1232	 * in topology, temporarily introduced by lgrp_mem_fini().
1233	 */
1234	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
1235	    lgrp_root->lgrp_mnodes & mnodes_mask) {
1236		if (drop_lock)
1237			mutex_exit(&cpu_lock);
1238		return;
1239	}
1240
1241	/*
1242	 * Update lgroup topology with new memory resources, keeping track of
1243	 * which lgroups change
1244	 */
1245	count = 0;
1246	klgrpset_clear(changed);
1247	my_lgrp = lgrp_hand_to_lgrp(hand);
1248	if (my_lgrp == NULL) {
1249		/* new lgrp */
1250		my_lgrp = lgrp_create();
1251		lgrpid = my_lgrp->lgrp_id;
1252		my_lgrp->lgrp_plathand = hand;
1253		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
1254		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
1255		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1256
1257		if (need_synch)
1258			pause_cpus(NULL);
1259		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1260		    &changed);
1261		if (need_synch)
1262			start_cpus();
1263	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
1264	    > 0) {
1265		/*
1266		 * Leaf lgroup was created, but latency wasn't available
1267		 * then.  So, set latency for it and fill in rest of lgroup
1268		 * topology  now that we know how far it is from other leaf
1269		 * lgroups.
1270		 */
1271		klgrpset_clear(changed);
1272		lgrpid = my_lgrp->lgrp_id;
1273		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1274		    lgrpid))
1275			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1276		if (need_synch)
1277			pause_cpus(NULL);
1278		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1279		    &changed);
1280		if (need_synch)
1281			start_cpus();
1282	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1283	    my_lgrp->lgrp_id)) {
1284		/*
1285		 * Add new lgroup memory resource to existing lgroup
1286		 */
1287		lgrpid = my_lgrp->lgrp_id;
1288		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1289		klgrpset_add(changed, lgrpid);
1290		count++;
1291		for (i = 0; i <= lgrp_alloc_max; i++) {
1292			lgrp_t		*lgrp;
1293
1294			lgrp = lgrp_table[i];
1295			if (!LGRP_EXISTS(lgrp) ||
1296			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
1297				continue;
1298
1299			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1300			klgrpset_add(changed, lgrp->lgrp_id);
1301			count++;
1302		}
1303	}
1304
1305	/*
1306	 * Add memory node to lgroup and remove lgroup from ones that need
1307	 * to be updated
1308	 */
1309	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
1310		my_lgrp->lgrp_mnodes |= mnodes_mask;
1311		my_lgrp->lgrp_nmnodes++;
1312	}
1313	klgrpset_del(changed, lgrpid);
1314
1315	/*
1316	 * Update memory node information for all lgroups that changed and
1317	 * contain new memory node as a resource
1318	 */
1319	if (count)
1320		(void) lgrp_mnode_update(changed, NULL);
1321
1322	if (drop_lock)
1323		mutex_exit(&cpu_lock);
1324}
1325
1326/*
1327 * Called to indicate that the lgroup associated with the platform
1328 * handle "hand" no longer contains given memory node
1329 *
1330 * LOCKING for this routine is a bit tricky. Usually it is called without
1331 * cpu_lock and it must grab cpu_lock here to prevent racing with other
1332 * callers. During DR of the board containing the caged memory it may be called
1333 * with cpu_lock already held and CPUs paused.
1334 *
1335 * If the deletion is part of the DR copy-rename and the deleted mnode is the
1336 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
1337 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
1338 * the same mnode back into the topology. See lgrp_mem_rename() and
1339 * lgrp_mem_init() for additional details.
1340 */
1341void
1342lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1343{
1344	klgrpset_t	changed;
1345	int		count;
1346	int		i;
1347	lgrp_t		*my_lgrp;
1348	lgrp_id_t	lgrpid;
1349	mnodeset_t	mnodes_mask;
1350	boolean_t	drop_lock = B_FALSE;
1351	boolean_t	need_synch = B_FALSE;
1352
1353	/*
1354	 * Grab CPU lock (if we haven't already)
1355	 */
1356	if (!MUTEX_HELD(&cpu_lock)) {
1357		mutex_enter(&cpu_lock);
1358		drop_lock = B_TRUE;
1359	}
1360
1361	/*
1362	 * This routine may be called from a context where we already
1363	 * hold cpu_lock and have already paused cpus.
1364	 */
1365	if (!cpus_paused())
1366		need_synch = B_TRUE;
1367
1368	my_lgrp = lgrp_hand_to_lgrp(hand);
1369
1370	/*
1371	 * The lgrp *must* be pre-existing
1372	 */
1373	ASSERT(my_lgrp != NULL);
1374
1375	/*
1376	 * Delete memory node from lgroups which contain it
1377	 */
1378	mnodes_mask = ((mnodeset_t)1 << mnode);
1379	for (i = 0; i <= lgrp_alloc_max; i++) {
1380		lgrp_t *lgrp = lgrp_table[i];
1381		/*
1382		 * Skip any non-existent lgroups and any lgroups that don't
1383		 * contain this memory node as a resource
1384		 */
1385		if (!LGRP_EXISTS(lgrp) ||
1386		    !(lgrp->lgrp_mnodes & mnodes_mask))
1387			continue;
1388
1389		/*
1390		 * Avoid removing the last mnode from the root in the DR
1391		 * copy-rename case. See lgrp_mem_rename() for details.
1392		 */
1393		if (is_copy_rename &&
1394		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
1395			continue;
1396
1397		/*
1398		 * Remove memory node from lgroup.
1399		 */
1400		lgrp->lgrp_mnodes &= ~mnodes_mask;
1401		lgrp->lgrp_nmnodes--;
1402		ASSERT(lgrp->lgrp_nmnodes >= 0);
1403	}
1404	ASSERT(lgrp_root->lgrp_nmnodes > 0);
1405
1406	/*
1407	 * Don't need to update lgroup topology if this lgroup still has memory.
1408	 *
1409	 * In the special case of DR copy-rename with the only mnode being
1410	 * removed, the lgrp_mnodes for the root is always non-zero, but we
1411	 * still need to update the lgroup topology.
1412	 */
1413	if ((my_lgrp->lgrp_nmnodes > 0) &&
1414	    !(is_copy_rename &&
1415		(my_lgrp == lgrp_root) &&
1416		(my_lgrp->lgrp_mnodes == mnodes_mask))) {
1417		if (drop_lock)
1418			mutex_exit(&cpu_lock);
1419		return;
1420	}
1421
1422	/*
1423	 * This lgroup does not contain any memory now
1424	 */
1425	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);
1426
1427	/*
1428	 * Remove this lgroup from lgroup topology if it does not contain any
1429	 * resources now
1430	 */
1431	lgrpid = my_lgrp->lgrp_id;
1432	count = 0;
1433	klgrpset_clear(changed);
1434	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1435		/*
1436		 * Delete lgroup when no more resources
1437		 */
1438		if (need_synch)
1439			pause_cpus(NULL);
1440		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
1441		    lgrp_alloc_max + 1, &changed);
1442		ASSERT(count > 0);
1443		if (need_synch)
1444			start_cpus();
1445	} else {
1446		/*
1447		 * Remove lgroup from memory resources of any lgroups that
1448		 * contain it as such
1449		 */
1450		for (i = 0; i <= lgrp_alloc_max; i++) {
1451			lgrp_t		*lgrp;
1452
1453			lgrp = lgrp_table[i];
1454			if (!LGRP_EXISTS(lgrp) ||
1455			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1456			    lgrpid))
1457				continue;
1458
1459			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1460		}
1461	}
1462	if (drop_lock)
1463		mutex_exit(&cpu_lock);
1464}
1465
1466/*
1467 * Return lgroup with given platform handle
1468 */
1469lgrp_t *
1470lgrp_hand_to_lgrp(lgrp_handle_t hand)
1471{
1472	int	i;
1473	lgrp_t	*lgrp;
1474
1475	if (hand == LGRP_NULL_HANDLE)
1476		return (NULL);
1477
1478	for (i = 0; i <= lgrp_alloc_max; i++) {
1479		lgrp = lgrp_table[i];
1480		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1481			return (lgrp);
1482	}
1483	return (NULL);
1484}
1485
1486/*
1487 * Return the home lgroup of the current thread.
1488 * We must do this with kernel preemption disabled, since we don't want our
1489 * thread to be re-homed while we're poking around with its lpl, and the lpl
1490 * should never be NULL.
1491 *
1492 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1493 * is enabled because of DR.  Callers can disable kernel preemption
1494 * around this call to guarantee that the lgroup will be valid beyond this
1495 * routine, since kernel preemption can be recursive.
1496 */
1497lgrp_t *
1498lgrp_home_lgrp(void)
1499{
1500	lgrp_t	*lgrp;
1501	lpl_t	*lpl;
1502
1503	kpreempt_disable();
1504
1505	lpl = curthread->t_lpl;
1506	ASSERT(lpl != NULL);
1507	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1508	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1509	lgrp = lgrp_table[lpl->lpl_lgrpid];
1510
1511	kpreempt_enable();
1512
1513	return (lgrp);
1514}
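
/*
 * Illustrative sketch (not part of the build): a caller that must keep the
 * returned lgroup valid across DR follows the advice in the block comment
 * above and brackets its use with kpreempt_disable()/kpreempt_enable().
 *
 *	lgrp_t	*lgrp;
 *
 *	kpreempt_disable();
 *	lgrp = lgrp_home_lgrp();
 *	// ... use lgrp while preemption is disabled ...
 *	kpreempt_enable();
 */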
1515
1516/*
1517 * Return ID of home lgroup for given thread
1518 * (See comments for lgrp_home_lgrp() for special care and handling
1519 * instructions)
1520 */
1521lgrp_id_t
1522lgrp_home_id(kthread_t *t)
1523{
1524	lgrp_id_t	lgrp;
1525	lpl_t		*lpl;
1526
1527	ASSERT(t != NULL);
1528	/*
1529	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1530	 * cannot since the HAT layer can call into this routine to
1531	 * determine the locality for its data structures in the context
1532	 * of a page fault.
1533	 */
1534
1535	kpreempt_disable();
1536
1537	lpl = t->t_lpl;
1538	ASSERT(lpl != NULL);
1539	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1540	lgrp = lpl->lpl_lgrpid;
1541
1542	kpreempt_enable();
1543
1544	return (lgrp);
1545}
1546
1547/*
1548 * Return lgroup containing the physical memory for the given page frame number
1549 */
1550lgrp_t *
1551lgrp_pfn_to_lgrp(pfn_t pfn)
1552{
1553	lgrp_handle_t	hand;
1554	int		i;
1555	lgrp_t		*lgrp;
1556
1557	hand = lgrp_plat_pfn_to_hand(pfn);
1558	if (hand != LGRP_NULL_HANDLE)
1559		for (i = 0; i <= lgrp_alloc_max; i++) {
1560			lgrp = lgrp_table[i];
1561			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1562				return (lgrp);
1563		}
1564	return (NULL);
1565}
1566
1567/*
1568 * Return lgroup containing the physical memory for the given page frame number
1569 */
1570lgrp_t *
1571lgrp_phys_to_lgrp(u_longlong_t physaddr)
1572{
1573	lgrp_handle_t	hand;
1574	int		i;
1575	lgrp_t		*lgrp;
1576	pfn_t		pfn;
1577
1578	pfn = btop(physaddr);
1579	hand = lgrp_plat_pfn_to_hand(pfn);
1580	if (hand != LGRP_NULL_HANDLE)
1581		for (i = 0; i <= lgrp_alloc_max; i++) {
1582			lgrp = lgrp_table[i];
1583			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1584				return (lgrp);
1585		}
1586	return (NULL);
1587}
1588
1589/*
1590 * Return the leaf lgroup containing the given CPU
1591 *
1592 * The caller needs to take precautions necessary to prevent
1593 * "cpu" and its lpl from going away across a call to this function.
1594 * hint: kpreempt_disable()/kpreempt_enable()
1595 */
1596static lgrp_t *
1597lgrp_cpu_to_lgrp(cpu_t *cpu)
1598{
1599	return (cpu->cpu_lpl->lpl_lgrp);
1600}
1601
1602/*
1603 * Return the sum of the partition loads in an lgrp divided by
1604 * the number of CPUs in the lgrp.  This is our best approximation
1605 * of an 'lgroup load average' for a useful per-lgroup kstat.
1606 */
1607static uint64_t
1608lgrp_sum_loadavgs(lgrp_t *lgrp)
1609{
1610	cpu_t *cpu;
1611	int ncpu;
1612	uint64_t loads = 0;
1613
1614	mutex_enter(&cpu_lock);
1615
1616	cpu = lgrp->lgrp_cpu;
1617	ncpu = lgrp->lgrp_cpucnt;
1618
1619	if (cpu == NULL || ncpu == 0) {
1620		mutex_exit(&cpu_lock);
1621		return (0ull);
1622	}
1623
1624	do {
1625		loads += cpu->cpu_lpl->lpl_loadavg;
1626		cpu = cpu->cpu_next_lgrp;
1627	} while (cpu != lgrp->lgrp_cpu);
1628
1629	mutex_exit(&cpu_lock);
1630
1631	return (loads / ncpu);
1632}
1633
1634void
1635lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1636{
1637	struct lgrp_stats *pstats;
1638
1639	/*
1640	 * Verify that the caller isn't trying to add to
1641	 * a statistic for an lgroup that has gone away
1642	 */
1643	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1644		return;
1645
1646	pstats = &lgrp_stats[lgrpid];
1647	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1648}
1649
1650int64_t
1651lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1652{
1653	uint64_t val;
1654	struct lgrp_stats *pstats;
1655
1656	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1657		return ((int64_t)0);
1658
1659	pstats = &lgrp_stats[lgrpid];
1660	LGRP_STAT_READ(pstats, stat, val);
1661	return (val);
1662}
1663
1664/*
1665 * Reset all kstats for lgrp specified by its lgrpid.
1666 */
1667static void
1668lgrp_kstat_reset(lgrp_id_t lgrpid)
1669{
1670	lgrp_stat_t stat;
1671
1672	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1673		return;
1674
1675	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1676		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1677	}
1678}
1679
1680/*
1681 * Collect all per-lgrp statistics for the lgrp associated with this
1682 * kstat, and store them in the ks_data array.
1683 *
1684 * The superuser can reset all the running counter statistics for an
1685 * lgrp by writing to any of the lgrp's stats.
1686 */
1687static int
1688lgrp_kstat_extract(kstat_t *ksp, int rw)
1689{
1690	lgrp_stat_t		stat;
1691	struct kstat_named	*ksd;
1692	lgrp_t			*lgrp;
1693	lgrp_id_t		lgrpid;
1694
1695	lgrp = (lgrp_t *)ksp->ks_private;
1696
1697	ksd = (struct kstat_named *)ksp->ks_data;
1698	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
1699
1700	lgrpid = lgrp->lgrp_id;
1701
1702	if (lgrpid == LGRP_NONE) {
1703		/*
1704		 * Return all zeroes as stats for freed lgrp.
1705		 */
1706		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1707			ksd[stat].value.i64 = 0;
1708		}
1709		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
1710		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
1711		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
1712		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
1713		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
1714	} else if (rw != KSTAT_WRITE) {
1715		/*
1716		 * Handle counter stats
1717		 */
1718		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1719			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
1720		}
1721
1722		/*
1723		 * Handle kernel data snapshot stats
1724		 */
1725		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
1726		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
1727		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
1728		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
1729		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
1730		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
1731		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
1732		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
1733		ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
1734		    lgrp_loadavg_max_effect;
1735	} else {
1736		lgrp_kstat_reset(lgrpid);
1737	}
1738
1739	return (0);
1740}
1741
1742int
1743lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1744{
1745	cpu_t	*cp;
1746
1747	mutex_enter(&cpu_lock);
1748
1749	if ((cp = cpu_get(id)) == NULL) {
1750		mutex_exit(&cpu_lock);
1751		return (EINVAL);
1752	}
1753
1754	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1755		mutex_exit(&cpu_lock);
1756		return (EINVAL);
1757	}
1758
1759	ASSERT(cp->cpu_lpl != NULL);
1760
1761	*lp = cp->cpu_lpl->lpl_lgrpid;
1762
1763	mutex_exit(&cpu_lock);
1764
1765	return (0);
1766}
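
/*
 * Illustrative sketch (not part of the build): querying the home lgroup of a
 * CPU by processor ID and handling the missing/offline/powered-off case,
 * using only the routine defined above.
 *
 *	lgrp_id_t	home;
 *
 *	if (lgrp_query_cpu(id, &home) != 0)
 *		return (EINVAL);	// no such CPU, or it is offline
 */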
1767
1768int
1769lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1770{
1771	cpu_t *cp;
1772
1773	mutex_enter(&cpu_lock);
1774
1775	if ((cp = cpu_get(id)) == NULL) {
1776		mutex_exit(&cpu_lock);
1777		return (EINVAL);
1778	}
1779
1780	ASSERT(cp->cpu_lpl != NULL);
1781
1782	*lp = cp->cpu_lpl->lpl_loadavg;
1783
1784	mutex_exit(&cpu_lock);
1785
1786	return (0);
1787}
1788
1789/*
1790 * Add a resource named by lpl_leaf to rset of lpl_target
1791 *
1792 * This routine also adjusts ncpu and nrset if the call succeeds in adding a
1793 * resource. It is adjusted here, as this is presently the only place that we
1794 * can be certain a resource addition has succeeded.
1795 *
1796 * We keep the list of rsets sorted so that the dispatcher can quickly walk the
1797 * list in order until it reaches a NULL.  (This list is required to be NULL
1798 * terminated, too).  This is done so that we can mark start pos + 1, so that
1799 * each lpl is traversed sequentially, but in a different order.  We hope this
1800 * will improve performance a bit.  (Hopefully, less read-to-own traffic...)
1801 */
1802
1803void
1804lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
1805{
1806	int		i;
1807	int		entry_slot = 0;
1808
1809	/* return if leaf is already present */
1810	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1811		if (lpl_target->lpl_rset[i] == lpl_leaf) {
1812			return;
1813		}
1814
1815		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
1816		    lpl_leaf->lpl_lgrpid) {
1817			break;
1818		}
1819	}
1820
1821	/* insert leaf, update counts */
1822	entry_slot = i;
1823	i = lpl_target->lpl_nrset++;
1824	if (lpl_target->lpl_nrset >= LPL_RSET_MAX) {
1825		panic("More leaf lgrps in system than are supported!\n");
1826	}
1827
1828	/*
1829	 * Start at the end of the rset array and work backwards towards the
1830	 * slot into which the new lpl will be inserted. This effectively
1831	 * preserves the current ordering by scooting everybody over one entry,
1832	 * and placing the new entry into the space created.
1833	 */
1834
1835	while (i-- > entry_slot) {
1836		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
1837	}
1838
1839	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
1840	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
1841}
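
/*
 * Worked example (illustrative): if lpl_target->lpl_rset currently holds the
 * lpls for lgroups 1 and 5, inserting the lpl for lgroup 3 shifts the lgroup 5
 * entry right and yields the sorted, NULL-terminated list
 * { lpl(1), lpl(3), lpl(5), NULL }, with lpl_nrset now 3 and lpl_ncpu grown
 * by the leaf's lpl_ncpu.
 */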
1842
1843/*
1844 * Update each of lpl_parent's children with a proper hint and
1845 * a reference to their parent.
1846 * The lgrp topology is used as the reference since it is fully
1847 * consistent and correct at this point.
1848 *
1849 * Each child's hint will reference an element in lpl_parent's
1850 * rset that designates where the child should start searching
1851 * for CPU resources. The hint selected is the highest order leaf present
1852 * in the child's lineage.
1853 *
1854 * This should be called after any potential change in lpl_parent's
1855 * rset.
1856 */
1857static void
1858lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
1859{
1860	klgrpset_t	children, leaves;
1861	lpl_t		*lpl;
1862	int		hint;
1863	int		i, j;
1864
1865	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
1866	if (klgrpset_isempty(children))
1867		return; /* nothing to do */
1868
1869	for (i = 0; i <= lgrp_alloc_max; i++) {
1870		if (klgrpset_ismember(children, i)) {
1871
1872			/*
1873			 * Given the set of leaves in this child's lineage,
1874			 * find the highest order leaf present in the parent's
1875			 * rset. Select this as the hint for the child.
1876			 */
1877			leaves = lgrp_table[i]->lgrp_leaves;
1878			hint = 0;
1879			for (j = 0; j < lpl_parent->lpl_nrset; j++) {
1880				lpl = lpl_parent->lpl_rset[j];
1881				if (klgrpset_ismember(leaves, lpl->lpl_lgrpid))
1882					hint = j;
1883			}
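			/*
			 * hint is now the highest rset index whose leaf
			 * appears in this child's lineage (e.g., with a
			 * parent rset of leaves {2, 5, 9} and a child whose
			 * only leaf is 9, hint ends up as index 2).
			 */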
1884			cp->cp_lgrploads[i].lpl_hint = hint;
1885
1886			/*
1887			 * (Re)set the parent. It may be incorrect if
1888			 * lpl_parent is new in the topology.
1889			 */
1890			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
1891		}
1892	}
1893}
1894
1895/*
1896 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
1897 *
1898 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
1899 * resource. The values are adjusted here, as this is the only place that we can
1900 * be certain a resource was successfully deleted.
1901 */
1902void
1903lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
1904{
1905	int i;
1906
1907	/* find leaf in intermediate node */
1908	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1909		if (lpl_target->lpl_rset[i] == lpl_leaf)
1910			break;
1911	}
1912
1913	/* return if leaf not found */
1914	if (lpl_target->lpl_rset[i] != lpl_leaf)
1915		return;
1916
1917	/* prune leaf, compress array */
1918	ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX);
1919	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
1920	lpl_target->lpl_ncpu--;
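	/* slide the remaining entries left over the deleted slot */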
1921	do {
1922		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
1923	} while (i++ < lpl_target->lpl_nrset);
1924}
1925
1926/*
1927 * Check to see if the resource set of the target lpl contains the
1928 * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
1929 */
1930
1931int
1932lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
1933{
1934	int i;
1935
1936	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1937		if (lpl_target->lpl_rset[i] == lpl_leaf)
1938			return (1);
1939	}
1940
1941	return (0);
1942}
1943
1944/*
1945 * Called when we change cpu lpl membership.  This increments or decrements the
1946 * per-cpu counter in every lpl in which our leaf appears.
1947 */
1948void
1949lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
1950{
1951	cpupart_t	*cpupart;
1952	lgrp_t		*lgrp_leaf;
1953	lgrp_t		*lgrp_cur;
1954	lpl_t		*lpl_leaf;
1955	lpl_t		*lpl_cur;
1956	int		i;
1957
1958	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
1959
1960	cpupart = cp->cpu_part;
1961	lpl_leaf = cp->cpu_lpl;
1962	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
1963
1964	for (i = 0; i <= lgrp_alloc_max; i++) {
1965		lgrp_cur = lgrp_table[i];
1966
1967		/*
1968		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
1969		 * for the cpu in question, or if the current lgrp and leaf
1970		 * don't share the same resources.
1971		 */
1972
1973		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
1974		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
1975		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
1976			continue;
1977
1978
1979		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
1980
1981		if (lpl_cur->lpl_nrset > 0) {
1982			if (act == LPL_INCREMENT) {
1983				lpl_cur->lpl_ncpu++;
1984			} else if (act == LPL_DECREMENT) {
1985				lpl_cur->lpl_ncpu--;
1986			}
1987		}
1988	}
1989}
1990
1991/*
1992 * Initialize lpl with given resources and specified lgrp
1993 */
1994
1995void
1996lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
1997{
1998	lpl->lpl_lgrpid = lgrp->lgrp_id;
1999	lpl->lpl_loadavg = 0;
2000	if (lpl == lpl_leaf)
2001		lpl->lpl_ncpu = 1;
2002	else
2003		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2004	lpl->lpl_nrset = 1;
2005	lpl->lpl_rset[0] = lpl_leaf;
2006	lpl->lpl_lgrp = lgrp;
2007	lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2008	lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2009}
2010
2011/*
2012 * Clear an unused lpl
2013 */
2014
2015void
2016lpl_clear(lpl_t *lpl)
2017{
2018	lgrp_id_t	lid;
2019
2020	/* save lid for debugging purposes */
2021	lid = lpl->lpl_lgrpid;
2022	bzero(lpl, sizeof (lpl_t));
2023	lpl->lpl_lgrpid = lid;
2024}
2025
2026/*
2027 * Given a CPU-partition, verify that the lpl topology in the CPU-partition
2028 * is in sync with the lgroup topology in the system.  The lpl topology may not
2029 * make full use of all of the lgroup topology, but this checks to make sure
2030 * that for the parts that it does use, it has correctly understood the
2031 * relationships that exist.  This function returns 0 if the topology is
2032 * correct, and a non-zero error code (on non-DEBUG kernels) if it is not.
2033 * Asserts are spread throughout the code to aid in
2034 * debugging on a DEBUG kernel.
2035 */
2036int
2037lpl_topo_verify(cpupart_t *cpupart)
2038{
2039	lgrp_t		*lgrp;
2040	lpl_t		*lpl;
2041	klgrpset_t	rset;
2042	klgrpset_t	cset;
2043	cpu_t		*cpu;
2044	cpu_t		*cp_start;
2045	int		i;
2046	int		j;
2047	int		sum;
2048
2049	/* topology can't be incorrect if it doesn't exist */
2050	if (!lgrp_topo_initialized || !lgrp_initialized)
2051		return (LPL_TOPO_CORRECT);
2052
2053	ASSERT(cpupart != NULL);
2054
2055	for (i = 0; i <= lgrp_alloc_max; i++) {
2056		lgrp = lgrp_table[i];
2057		lpl = NULL;
2058		/* make sure lpls are allocated */
2059		ASSERT(cpupart->cp_lgrploads);
2060		if (!cpupart->cp_lgrploads)
2061			return (LPL_TOPO_PART_HAS_NO_LPL);
2062
2063		lpl = &cpupart->cp_lgrploads[i];
2064		/* make sure our index is good */
2065		ASSERT(i < cpupart->cp_nlgrploads);
2066
2067		/* if lgroup doesn't exist, make sure lpl is empty */
2068		if (!LGRP_EXISTS(lgrp)) {
2069			ASSERT(lpl->lpl_ncpu == 0);
2070			if (lpl->lpl_ncpu > 0) {
2071				return (LPL_TOPO_CPUS_NOT_EMPTY);
2072			} else {
2073				continue;
2074			}
2075		}
2076
2077		/* verify that lgroup and lpl are identically numbered */
2078		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
2079
2080		/* if lgroup isn't in our partition, make sure lpl is empty */
2081		if (!klgrpset_intersects(lgrp->lgrp_leaves,
2082		    cpupart->cp_lgrpset)) {
2083			ASSERT(lpl->lpl_ncpu == 0);
2084			if (lpl->lpl_ncpu > 0) {
2085				return (LPL_TOPO_CPUS_NOT_EMPTY);
2086			}
2087			/*
2088			 * lpl is empty, and lgroup isn't in partition.  verify
2089			 * that lpl doesn't show up in anyone else's rsets (in
2090			 * this partition, anyway)
2091			 */
2092
2093			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
2094				lpl_t *i_lpl; /* lpl we're iterating over */
2095
2096				i_lpl = &cpupart->cp_lgrploads[j];
2097
2098				ASSERT(!lpl_rset_contains(i_lpl, lpl));
2099				if (lpl_rset_contains(i_lpl, lpl)) {
2100					return (LPL_TOPO_LPL_ORPHANED);
2101				}
2102			}
2103			/* lgroup is empty, and everything is ok. continue */
2104			continue;
2105		}
2106
2107
2108		/* lgroup is in this partition, now check it against lpl */
2109
2110		/* do both have matching lgrps? */
2111		ASSERT(lgrp == lpl->lpl_lgrp);
2112		if (lgrp != lpl->lpl_lgrp) {
2113			return (LPL_TOPO_LGRP_MISMATCH);
2114		}
2115
2116		/* do the parent lgroups exist and do they match? */
2117		if (lgrp->lgrp_parent) {
2118			ASSERT(lpl->lpl_parent);
2119			ASSERT(lgrp->lgrp_parent->lgrp_id ==
2120				    lpl->lpl_parent->lpl_lgrpid);
2121
2122			if (!lpl->lpl_parent) {
2123				return (LPL_TOPO_MISSING_PARENT);
2124			} else if (lgrp->lgrp_parent->lgrp_id !=
2125			    lpl->lpl_parent->lpl_lgrpid) {
2126				return (LPL_TOPO_PARENT_MISMATCH);
2127			}
2128		}
2129
2130		/* only leaf lgroups keep a cpucnt, only check leaves */
2131		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
2132
2133			/* verify that lgrp is also a leaf */
2134			ASSERT((lgrp->lgrp_childcnt == 0) &&
2135			    (klgrpset_ismember(lgrp->lgrp_leaves,
2136			    lpl->lpl_lgrpid)));
2137
2138			if ((lgrp->lgrp_childcnt > 0) ||
2139			    (!klgrpset_ismember(lgrp->lgrp_leaves,
2140			    lpl->lpl_lgrpid))) {
2141				return (LPL_TOPO_LGRP_NOT_LEAF);
2142			}
2143
2144			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
2145			    (lpl->lpl_ncpu > 0));
2146			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
2147				(lpl->lpl_ncpu <= 0)) {
2148				return (LPL_TOPO_BAD_CPUCNT);
2149			}
2150
2151			/*
2152			 * Check that lpl_ncpu also matches the number of
2153			 * cpus in the lpl's linked list.  This only exists in
2154			 * leaves, but they should always match.
2155			 */
2156			j = 0;
2157			cpu = cp_start = lpl->lpl_cpus;
2158			while (cpu != NULL) {
2159				j++;
2160
2161				/* check to make sure cpu's lpl is leaf lpl */
2162				ASSERT(cpu->cpu_lpl == lpl);
2163				if (cpu->cpu_lpl != lpl) {
2164					return (LPL_TOPO_CPU_HAS_BAD_LPL);
2165				}
2166
2167				/* check next cpu */
2168				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
2169					continue;
2170				} else {
2171					cpu = NULL;
2172				}
2173			}
2174
2175			ASSERT(j == lpl->lpl_ncpu);
2176			if (j != lpl->lpl_ncpu) {
2177				return (LPL_TOPO_LPL_BAD_NCPU);
2178			}
2179
2180			/*
2181			 * Also, check that leaf lpl is contained in all
2182			 * intermediate lpls that name the leaf as a descendant
2183			 */
2184
2185			for (j = 0; j <= lgrp_alloc_max; j++) {
2186				klgrpset_t intersect;
2187				lgrp_t *lgrp_cand;
2188				lpl_t *lpl_cand;
2189
2190				lgrp_cand = lgrp_table[j];
2191				intersect = klgrpset_intersects(
2192				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
2193				    cpupart->cp_lgrpset);
2194
2195				if (!LGRP_EXISTS(lgrp_cand) ||
2196				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
2197				    cpupart->cp_lgrpset) ||
2198				    (intersect == 0))
2199					continue;
2200
2201				lpl_cand =
2202				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2203
2204				if (klgrpset_ismember(intersect,
2205				    lgrp->lgrp_id)) {
2206					ASSERT(lpl_rset_contains(lpl_cand,
2207					    lpl));
2208
2209					if (!lpl_rset_contains(lpl_cand, lpl)) {
2210						return (LPL_TOPO_RSET_MSSNG_LF);
2211					}
2212				}
2213			}
2214
2215		} else { /* non-leaf specific checks */
2216
2217			/*
2218			 * Non-leaf lpls should have lpl_cpus == NULL;
2219			 * verify that this is so.
2220			 */
2221			ASSERT(lpl->lpl_cpus == NULL);
2222			if (lpl->lpl_cpus != NULL) {
2223				return (LPL_TOPO_NONLEAF_HAS_CPUS);
2224			}
2225
2226			/*
2227			 * verify that the sum of the cpus in the leaf resources
2228			 * is equal to the total ncpu in the intermediate
2229			 */
2230			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
2231				sum += lpl->lpl_rset[j]->lpl_ncpu;
2232			}
2233
2234			ASSERT(sum == lpl->lpl_ncpu);
2235			if (sum != lpl->lpl_ncpu) {
2236				return (LPL_TOPO_LPL_BAD_NCPU);
2237			}
2238		}
2239
2240		/*
2241		 * check on lpl_hint. Don't check root, since it has no parent.
2242		 */
2243		if (lpl->lpl_parent != NULL) {
2244			int hint;
2245			lpl_t *hint_lpl;
2246
2247			/* make sure hint is within limits of nrset */
2248			hint = lpl->lpl_hint;
2249			ASSERT(lpl->lpl_parent->lpl_nrset >= hint);
2250			if (lpl->lpl_parent->lpl_nrset < hint) {
2251				return (LPL_TOPO_BOGUS_HINT);
2252			}
2253
2254			/* make sure hint points to valid lpl */
2255			hint_lpl = lpl->lpl_parent->lpl_rset[hint];
2256			ASSERT(hint_lpl->lpl_ncpu > 0);
2257			if (hint_lpl->lpl_ncpu <= 0) {
2258				return (LPL_TOPO_BOGUS_HINT);
2259			}
2260		}
2261
2262		/*
2263		 * Check the rset of the lpl in question.  Make sure that each
2264		 * rset contains a subset of the resources in
2265		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
2266		 * sure that each rset doesn't include resources that are
2267		 * outside of that set.  (Which would be resources somehow not
2268		 * accounted for).
2269		 */
2270
2271		klgrpset_clear(rset);
2272		for (j = 0; j < lpl->lpl_nrset; j++) {
2273			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
2274		}
2275		klgrpset_copy(cset, rset);
2276		/* make sure lpl rset matches lgrp rset */
2277		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
2278		/* make sure rset is contained within the partition, too */
2279		klgrpset_diff(cset, cpupart->cp_lgrpset);
2280
2281		ASSERT(klgrpset_isempty(rset) &&
2282			    klgrpset_isempty(cset));
2283		if (!klgrpset_isempty(rset) ||
2284		    !klgrpset_isempty(cset)) {
2285			return (LPL_TOPO_RSET_MISMATCH);
2286		}
2287
2288		/*
2289		 * check to make sure lpl_nrset matches the number of rsets
2290		 * contained in the lpl
2291		 */
2292
2293		for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX);
2294		    j++);
2295
2296		ASSERT(j == lpl->lpl_nrset);
2297		if (j != lpl->lpl_nrset) {
2298			return (LPL_TOPO_BAD_RSETCNT);
2299		}
2300
2301	}
2302	return (LPL_TOPO_CORRECT);
2303}
2304
2305/*
2306 * Flatten lpl topology to given number of levels.  This is presently only
2307 * implemented for flattening to 2 levels, which prunes out the intermediates
2308 * and home the leaf lpls to the root lpl.
2309 */
2310int
2311lpl_topo_flatten(int levels)
2312{
2313	int		i;
2314	uint_t		sum;
2315	lgrp_t		*lgrp_cur;
2316	lpl_t		*lpl_cur;
2317	lpl_t		*lpl_root;
2318	cpupart_t	*cp;
2319
2320	if (levels != 2)
2321		return (0);
2322
2323	/* called w/ cpus paused - grab no locks! */
2324	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2325	    !lgrp_initialized);
2326
2327	cp = cp_list_head;
2328	do {
2329		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
2330		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
2331
2332		for (i = 0; i <= lgrp_alloc_max; i++) {
2333			lgrp_cur = lgrp_table[i];
2334			lpl_cur = &cp->cp_lgrploads[i];
2335
2336			if ((lgrp_cur == lgrp_root) ||
2337			    (!LGRP_EXISTS(lgrp_cur) &&
2338			    (lpl_cur->lpl_ncpu == 0)))
2339				continue;
2340
2341			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
2342				/*
2343				 * this should be a deleted intermediate, so
2344				 * clear it
2345				 */
2346				lpl_clear(lpl_cur);
2347			} else if ((lpl_cur->lpl_nrset == 1) &&
2348			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
2349			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
2350			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
2351				/*
2352				 * this is a leaf whose parent was deleted, or
2353				 * whose parent had its lgrp deleted.  (And
2354				 * whose parent will soon be deleted).  Point
2355				 * this guy back to the root lpl.
2356				 */
2357				lpl_cur->lpl_parent = lpl_root;
2358				lpl_rset_add(lpl_root, lpl_cur);
2359			}
2360
2361		}
2362
2363		/*
2364		 * Now that we're done, make sure the count on the root lpl is
2365		 * correct, and update the hints of the children for the sake of
2366		 * thoroughness
2367		 */
2368		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
2369			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
2370		}
2371		lpl_root->lpl_ncpu = sum;
2372		lpl_child_update(lpl_root, cp);
2373
2374		cp = cp->cp_next;
2375	} while (cp != cp_list_head);
2376
2377	return (levels);
2378}
2379
2380/*
2381 * Insert a lpl into the resource hierarchy and create any additional lpls that
2382 * are necessary to represent the varying states of locality for the cpu
2383 * resources newly added to the partition.
2384 *
2385 * This routine is clever enough that it can correctly add resources from the
2386 * new leaf into both direct and indirect resource sets in the hierarchy.  (Ie,
2387 * those for which the lpl is a leaf as opposed to simply a named equally local
2388 * resource).  The one special case that needs additional processing is when a
2389 * new intermediate lpl is introduced.  Since the main loop only traverses
2390 * looking to add the leaf resource where it does not yet exist, additional work
2391 * is necessary to add other leaf resources that may need to exist in the newly
2392 * created intermediate.  This is performed by the second inner loop, and is
2393 * only done when the check for more than one overlapping resource succeeds.
2394 */
2395
2396void
2397lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
2398{
2399	int		i;
2400	int		j;
2401	int		hint;
2402	int		rset_num_intersect;
2403	lgrp_t		*lgrp_cur;
2404	lpl_t		*lpl_cur;
2405	lpl_t		*lpl_parent;
2406	lgrp_id_t	parent_id;
2407	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */
2408
2409	for (i = 0; i <= lgrp_alloc_max; i++) {
2410		lgrp_cur = lgrp_table[i];
2411
2412		/*
2413		 * Don't insert if the lgrp isn't there, if the leaf isn't
2414		 * contained within the current lgrp, or if the current lgrp has
2415		 * no leaves in this partition
2416		 */
2417
2418		if (!LGRP_EXISTS(lgrp_cur) ||
2419		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2420		    lpl_leaf->lpl_lgrpid) ||
2421		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
2422		    cpupart->cp_lgrpset))
2423			continue;
2424
2425		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2426		if (lgrp_cur->lgrp_parent != NULL) {
2427			/* if lgrp has a parent, assign it properly */
2428			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
2429			lpl_parent = &cpupart->cp_lgrploads[parent_id];
2430		} else {
2431			/* if not, make sure parent ptr gets set to null */
2432			lpl_parent = NULL;
2433		}
2434
2435		if (lpl_cur == lpl_leaf) {
2436			/*
2437			 * Almost all leaf state was initialized elsewhere.  The
2438			 * only thing left to do is to set the parent.
2439			 */
2440			lpl_cur->lpl_parent = lpl_parent;
2441			continue;
2442		}
2443
2444		/*
2445		 * Initialize intermediate lpl
2446		 * Save this lpl's hint though. Since we're changing this
2447		 * lpl's resources, we need to update the hint in this lpl's
2448		 * children, but the hint in this lpl is unaffected and
2449		 * should be preserved.
2450		 */
2451		hint = lpl_cur->lpl_hint;
2452
2453		lpl_clear(lpl_cur);
2454		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
2455
2456		lpl_cur->lpl_hint = hint;
2457		lpl_cur->lpl_parent = lpl_parent;
2458
2459		/* does new lpl need to be populated with other resources? */
2460		rset_intersect =
2461		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2462			cpupart->cp_lgrpset);
2463		klgrpset_nlgrps(rset_intersect, rset_num_intersect);
2464
2465		if (rset_num_intersect > 1) {
2466			/*
2467			 * If so, figure out what lpls have resources that
2468			 * intersect this one, and add them.
2469			 */
2470			for (j = 0; j <= lgrp_alloc_max; j++) {
2471				lgrp_t	*lgrp_cand;	/* candidate lgrp */
2472				lpl_t	*lpl_cand;	/* candidate lpl */
2473
2474				lgrp_cand = lgrp_table[j];
2475				if (!LGRP_EXISTS(lgrp_cand) ||
2476				    !klgrpset_ismember(rset_intersect,
2477					lgrp_cand->lgrp_id))
2478					continue;
2479				lpl_cand =
2480				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2481				lpl_rset_add(lpl_cur, lpl_cand);
2482			}
2483		}
2484		/*
2485		 * This lpl's rset has changed.  Update the hint in its
2486		 * children.
2487		 */
2488		lpl_child_update(lpl_cur, cpupart);
2489	}
2490}
2491
2492/*
2493 * remove a lpl from the hierarchy of resources, clearing its state when
2494 * finished.  If the lpls at the intermediate levels of the hierarchy have no
2495 * remaining resources, or no longer name a leaf resource in the cpu-partition,
2496 * delete them as well.
2497 */
2498
2499void
2500lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2501{
2502	int		i;
2503	lgrp_t		*lgrp_cur;
2504	lpl_t		*lpl_cur;
2505	klgrpset_t	leaf_intersect;	/* intersection of leaves */
2506
2507	for (i = 0; i <= lgrp_alloc_max; i++) {
2508		lgrp_cur = lgrp_table[i];
2509
2510		/*
2511		 * Don't attempt to remove from lgrps that aren't there, that
2512		 * don't contain our leaf, or from the leaf itself. (We do that
2513		 * later)
2514		 */
2515
2516		if (!LGRP_EXISTS(lgrp_cur))
2517			continue;
2518
2519		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2520
2521		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2522		    lpl_leaf->lpl_lgrpid) ||
2523		    (lpl_cur == lpl_leaf)) {
2524			continue;
2525		}
2526
2527		/*
2528		 * This is a slightly sleazy simplification in that we have
2529		 * already marked the cp_lgrpset as no longer containing the
2530		 * leaf we've deleted.  Any lpls that pass the above checks
2531		 * based upon lgrp membership but not necessarily cpu-part
2532		 * membership also get cleared by the checks below.  Currently
2533		 * this is harmless, as the lpls should be empty anyway.
2534		 *
2535		 * In particular, we want to preserve lpls that have additional
2536		 * leaf resources, even though we don't yet have a processor
2537		 * architecture that represents resources this way.
2538		 */
2539
2540		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2541		    cpupart->cp_lgrpset);
2542
2543		lpl_rset_del(lpl_cur, lpl_leaf);
2544		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2545			lpl_clear(lpl_cur);
2546		} else {
2547			/*
2548			 * Update this lpl's children
2549			 */
2550			lpl_child_update(lpl_cur, cpupart);
2551		}
2552	}
2553	lpl_clear(lpl_leaf);
2554}
2555
2556/*
2557 * add a cpu to a partition in terms of lgrp load avg bookkeeping
2558 *
2559 * The lpl (cpu partition load average information) is now arranged in a
2560 * hierarchical fashion whereby resources that are closest, ie. most local, to
2561 * the cpu in question are considered to be leaves in a tree of resources.
2562 * There are two general cases for cpu addition:
2563 *
2564 * 1. A lpl structure that contains resources already in the hierarchy tree.
2565 * In this case, all of the associated lpl relationships have been defined, and
2566 * all that is necessary is that we link the new cpu into the per-lpl list of
2567 * cpus, and increment the ncpu count of all places where this cpu resource will
2568 * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2569 * pushing is accomplished by this routine.
2570 *
2571 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2572 * not exist yet.  In this case, it is necessary to build the leaf lpl, and
2573 * construct the hierarchy of state necessary to name its more distant
2574 * resources, if they should exist.  The leaf structure is initialized by this
2575 * routine, as is the cpu-partition state for the lgrp membership.  This routine
2576 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
2577 * and builds all of the "ancestral" state necessary to identify resources at
2578 * differing levels of locality.
2579 */
2580void
2581lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
2582{
2583	cpupart_t	*cpupart;
2584	lgrp_t		*lgrp_leaf;
2585	lpl_t		*lpl_leaf;
2586
2587	/* called sometimes w/ cpus paused - grab no locks */
2588	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2589
2590	cpupart = cp->cpu_part;
2591	lgrp_leaf = lgrp_table[lgrpid];
2592
2593	/* don't add non-existent lgrp */
2594	ASSERT(LGRP_EXISTS(lgrp_leaf));
2595	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
2596	cp->cpu_lpl = lpl_leaf;
2597
2598	/* only leaf lpls contain cpus */
2599
2600	if (lpl_leaf->lpl_ncpu++ == 0) {
2601		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
2602		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
2603		lpl_leaf_insert(lpl_leaf, cpupart);
2604	} else {
2605		/*
2606		 * the lpl should already exist in the parent, so just update
2607		 * the count of available CPUs
2608		 */
2609		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
2610	}
2611
2612	/* link cpu into list of cpus in lpl */
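	/*
	 * lpl_cpus is a circular, doubly-linked list threaded through
	 * cpu_next_lpl/cpu_prev_lpl; the new cpu is spliced in just before
	 * the current head (i.e. at the tail of the circle).
	 */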
2613
2614	if (lpl_leaf->lpl_cpus) {
2615		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
2616		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
2617		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
2618		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
2619	} else {
2620		/*
2621		 * We increment ncpu immediately after we create a new leaf
2622		 * lpl, so assert that ncpu == 1 for the case where we don't
2623		 * have any cpu pointers yet.
2624		 */
2625		ASSERT(lpl_leaf->lpl_ncpu == 1);
2626		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
2627	}
2628
2629}
2630
2631
2632/*
2633 * remove a cpu from a partition in terms of lgrp load avg bookkeeping
2634 *
2635 * The lpl (cpu partition load average information) is now arranged in a
2636 * hierarchical fashion whereby resources that are closest, ie. most local, to
2637 * the cpu in question are considered to be leaves in a tree of resources.
2638 * There are two removal cases in question:
2639 *
2640 * 1. Removal of the resource in the leaf leaves other resources remaining in
2641 * that leaf.  (Another cpu still exists at this level of locality).  In this
2642 * case, the count of available cpus is decremented in all associated lpls by
2643 * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned
2644 * from the per-cpu lpl list.
2645 *
2646 * 2. Removal of the resource results in the lpl containing no resources.  (It's
2647 * empty)  In this case, all of what has occurred for the first step must take
2648 * place; however, additionally we must remove the lpl structure itself, prune
2649 * out any stranded lpls that do not directly name a leaf resource, and mark the
2650 * cpu partition in question as no longer containing resources from the lgrp of
2651 * the lpl that has been deleted.  Cpu-partition changes are handled by this
2652 * method, but the lpl_leaf_remove function deals with the details of pruning
2653 * out the empty lpl and any of its orphaned direct ancestors.
2654 */
2655void
2656lgrp_part_del_cpu(cpu_t *cp)
2657{
2658	lpl_t		*lpl;
2659	lpl_t		*leaf_lpl;
2660	lgrp_t		*lgrp_leaf;
2661
2662	/* called sometimes w/ cpus paused - grab no locks */
2663
2664	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2665
2666	lpl = leaf_lpl = cp->cpu_lpl;
2667	lgrp_leaf = leaf_lpl->lpl_lgrp;
2668
2669	/* don't delete a leaf that isn't there */
2670	ASSERT(LGRP_EXISTS(lgrp_leaf));
2671
2672	/* no double-deletes */
2673	ASSERT(lpl->lpl_ncpu);
2674	if (--lpl->lpl_ncpu == 0) {
2675		/*
2676		 * This was the last cpu in this lgroup for this partition,
2677		 * clear its bit in the partition's lgroup bitmask
2678		 */
2679		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
2680
2681		/* eliminate remaining lpl link pointers in cpu, lpl */
2682		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
2683
2684		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
2685	} else {
2686
2687		/* unlink cpu from lists of cpus in lpl */
2688		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
2689		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
2690		if (lpl->lpl_cpus == cp) {
2691			lpl->lpl_cpus = cp->cpu_next_lpl;
2692		}
2693
2694		/*
2695		 * Update the cpu count in the lpls associated with parent
2696		 * lgroups.
2697		 */
2698		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
2699
2700	}
2701	/* clear cpu's lpl ptr when we're all done */
2702	cp->cpu_lpl = NULL;
2703}
2704
2705/*
2706 * Recompute load average for the specified partition/lgrp fragment.
2707 *
2708 * We rely on the fact that this routine is called from the clock thread
2709 * at a point before the clock thread can block (i.e. before its first
2710 * lock request).  Since the clock thread can not be preempted (since it
2711 * runs at highest priority), we know that cpu partitions can not change
2712 * (since doing so would require either the repartition requester or the
2713 * cpu_pause thread to run on this cpu), so we can update the cpu's load
2714 * without grabbing cpu_lock.
2715 */
2716void
2717lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
2718{
2719	uint_t		ncpu;
2720	int64_t		old, new, f;
2721
2722	/*
2723	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
2724	 */
2725	static short expval[] = {
2726	    0, 3196, 1618, 1083,
2727	    814, 652, 543, 466,
2728	    408, 363, 326, 297,
2729	    272, 251, 233, 218,
2730	    204, 192, 181, 172,
2731	    163, 155, 148, 142,
2732	    136, 130, 125, 121,
2733	    116, 112, 109, 105
2734	};
2735
2736	/* ASSERT (called from clock level) */
2737
2738	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
2739	    ((ncpu = lpl->lpl_ncpu) == 0)) {
2740		return;
2741	}
2742
2743	for (;;) {
2744
2745		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
2746			f = expval[1]/ncpu; /* good approx. for large ncpu */
2747		else
2748			f = expval[ncpu];
2749
2750		/*
2751		 * Modify the load average atomically to avoid losing
2752		 * anticipatory load updates (see lgrp_move_thread()).
2753		 */
2754		if (ageflag) {
2755			/*
2756			 * We're supposed to both update and age the load.
2757			 * This happens 10 times/sec. per cpu.  We do a
2758			 * little hoop-jumping to avoid integer overflow.
2759			 */
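			/*
			 * Sketch of the arithmetic (ignoring rounding): the
			 * update below amounts to
			 *	new = old * (1 - f/2^16) + nrcpus * (f/2^7),
			 * i.e. the old load decays exponentially while the
			 * current sample is mixed in, settling (for constant
			 * nrcpus) at roughly nrcpus * 2^9.
			 */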
2760			int64_t		q, r;
2761
2762			do {
2763				old = new = lpl->lpl_loadavg;
2764				q = (old  >> 16) << 7;
2765				r = (old  & 0xffff) << 7;
2766				new += ((long long)(nrcpus - q) * f -
2767				    ((r * f) >> 16)) >> 7;
2768
2769				/*
2770				 * Check for overflow
2771				 */
2772				if (new > LGRP_LOADAVG_MAX)
2773					new = LGRP_LOADAVG_MAX;
2774				else if (new < 0)
2775					new = 0;
2776			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2777			    new) != old);
2778		} else {
2779			/*
2780			 * We're supposed to update the load, but not age it.
2781			 * This option is used to update the load (which either
2782			 * has already been aged in this 1/10 sec. interval or
2783			 * soon will be) to account for a remotely executing
2784			 * thread.
2785			 */
2786			do {
2787				old = new = lpl->lpl_loadavg;
2788				new += f;
2789				/*
2790				 * Check for overflow
2791				 * Underflow not possible here
2792				 */
2793				if (new < old)
2794					new = LGRP_LOADAVG_MAX;
2795			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2796			    new) != old);
2797		}
2798
2799		/*
2800		 * Do the same for this lpl's parent
2801		 */
2802		if ((lpl = lpl->lpl_parent) == NULL)
2803			break;
2804		ncpu = lpl->lpl_ncpu;
2805	}
2806}
2807
2808/*
2809 * Initialize lpl topology in the target based on topology currently present in
2810 * lpl_bootstrap.
2811 *
2812 * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2813 * initialize cp_default list of lpls. Up to this point all topology operations
2814 * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
2815 * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
2816 * `target' points to the list of lpls in cp_default and `size' is the size of
2817 * this list.
2818 *
2819 * This function walks the lpl topology in lpl_bootstrap and does four things:
2820 *
2821 * 1) Copies all fields from lpl_bootstrap to the target.
2822 *
2823 * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2824 *
2825 * 3) Updates lpl_parent pointers to point to the lpls in the target list
2826 *    instead of lpl_bootstrap.
2827 *
2828 * 4) Updates pointers in the resource list of the target to point to the lpls
2829 *    in the target list instead of lpl_bootstrap.
2830 *
2831 * After lpl_topo_bootstrap() completes, target contains the same information
2832 * that would be present there if it were used during boot instead of
2833 * lpl_bootstrap.  The information in lpl_bootstrap is not needed after this,
2834 * so it is bzeroed.
2835 */
2836void
2837lpl_topo_bootstrap(lpl_t *target, int size)
2838{
2839	lpl_t	*lpl = lpl_bootstrap;
2840	lpl_t	*target_lpl = target;
2841	int	howmany;
2842	int	id;
2843	int	i;
2844
2845	/*
2846	 * The only target that should be passed here is cp_default lpl list.
2847	 */
2848	ASSERT(target == cp_default.cp_lgrploads);
2849	ASSERT(size == cp_default.cp_nlgrploads);
2850	ASSERT(!lgrp_topo_initialized);
2851	ASSERT(ncpus == 1);
2852
2853	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
2854	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
2855		/*
2856		 * Copy all fields from lpl.
2857		 */
2858
2859		*target_lpl = *lpl;
2860
2861		/*
2862		 * Substitute CPU0 lpl pointer with one relative to target.
2863		 */
2864		if (lpl->lpl_cpus == CPU) {
2865			ASSERT(CPU->cpu_lpl == lpl);
2866			CPU->cpu_lpl = target_lpl;
2867		}
2868
2869		/*
2870		 * Substitute parent information with parent relative to target.
2871		 */
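		/*
		 * The byte-offset arithmetic below preserves the array index:
		 * a parent that was lpl_bootstrap[k] becomes target[k].
		 */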
2872		if (lpl->lpl_parent != NULL)
2873			target_lpl->lpl_parent = (lpl_t *)
2874			    (((uintptr_t)lpl->lpl_parent -
2875				(uintptr_t)lpl_bootstrap) +
2876				(uintptr_t)target);
2877
2878		/*
2879		 * Walk over resource set substituting pointers relative to
2880		 * lpl_bootstrap to pointers relative to target.
2881		 */
2882		ASSERT(lpl->lpl_nrset <= 1);
2883
2884		for (id = 0; id < lpl->lpl_nrset; id++) {
2885			if (lpl->lpl_rset[id] != NULL) {
2886				target_lpl->lpl_rset[id] =
2887				    (lpl_t *)
2888				    (((uintptr_t)lpl->lpl_rset[id] -
2889					(uintptr_t)lpl_bootstrap) +
2890					(uintptr_t)target);
2891			}
2892		}
2893	}
2894
2895	/*
2896	 * Topology information in lpl_bootstrap is no longer needed.
2897	 */
2898	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
2899}
2900
2901/*
2902 * If the lowest load among the lgroups a process' threads are currently
2903 * spread across is greater than lgrp_expand_proc_thresh, we'll consider
2904 * expanding the process to a new lgroup.
2905 */
2906#define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
2907lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
2908
2909#define	LGRP_EXPAND_PROC_THRESH(ncpu) \
2910	((lgrp_expand_proc_thresh) / (ncpu))
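
/*
 * For example, with the default threshold of 62250, a 4-CPU lgroup would use
 * an expansion threshold of 62250 / 4 = 15562 (integer division).
 */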
2911
2912/*
2913 * A process will be expanded to a new lgroup only if the difference between
2914 * the lowest load on the lgroups the process' threads are currently spread
2915 * across and the lowest load on the other lgroups in the process' partition
2916 * is greater than lgrp_expand_proc_diff.
2917 */
2918#define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
2919lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
2920
2921#define	LGRP_EXPAND_PROC_DIFF(ncpu) \
2922	((lgrp_expand_proc_diff) / (ncpu))
2923
2924/*
2925 * The loadavg tolerance accounts for "noise" inherent in the load, which may
2926 * be present due to impreciseness of the load average decay algorithm.
2927 *
2928 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
2929 * tolerance is scaled by the number of cpus in the lgroup just like
2930 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
2931 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
2932 * of: 0x10000 / 4 => 0x4000 or greater to be significant.
2933 */
2934uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
2935#define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
2936	((lgrp_loadavg_tolerance) / ncpu)
2937
2938/*
2939 * lgrp_choose() will choose root lgroup as home when lowest lgroup load
2940 * average is above this threshold
2941 */
2942uint32_t	lgrp_load_thresh = UINT32_MAX;
2943
2944/*
2945 * lgrp_choose() will try to skip any lgroups with less memory
2946 * than this free when choosing a home lgroup
2947 */
2948pgcnt_t	lgrp_mem_free_thresh = 0;
2949
2950/*
2951 * When choosing between similarly loaded lgroups, lgrp_choose() will pick
2952 * one based on one of the following policies:
2953 * - Random selection
2954 * - Pseudo round robin placement
2955 * - Longest time since a thread was last placed
2956 */
2957#define	LGRP_CHOOSE_RANDOM	1
2958#define	LGRP_CHOOSE_RR		2
2959#define	LGRP_CHOOSE_TIME	3
2960
2961int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
2962
2963/*
2964 * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
2965 * be bound to a CPU or processor set.
2966 *
2967 * Arguments:
2968 *	t		The thread
2969 *	cpupart		The partition the thread belongs to.
2970 *
2971 * NOTE: Should at least be called with the cpu_lock held, kernel preemption
2972 *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
2973 *	 partitions changing out from under us and assumes that given thread is
2974 *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
2975 *	 disabled, so don't grab any locks because we should never block under
2976 *	 those conditions.
2977 */
2978lpl_t *
2979lgrp_choose(kthread_t *t, cpupart_t *cpupart)
2980{
2981	lgrp_load_t	bestload, bestrload;
2982	int		lgrpid_offset, lgrp_count;
2983	lgrp_id_t	lgrpid, lgrpid_start;
2984	lpl_t		*lpl, *bestlpl, *bestrlpl;
2985	klgrpset_t	lgrpset;
2986	proc_t		*p;
2987
2988	ASSERT(t != NULL);
2989	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2990	    THREAD_LOCK_HELD(t));
2991	ASSERT(cpupart != NULL);
2992
2993	p = t->t_procp;
2994
2995	/* A process should always be in an active partition */
2996	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
2997
2998	bestlpl = bestrlpl = NULL;
2999	bestload = bestrload = LGRP_LOADAVG_MAX;
3000	lgrpset = cpupart->cp_lgrpset;
3001
3002	switch (lgrp_choose_policy) {
3003	case LGRP_CHOOSE_RR:
3004		lgrpid = cpupart->cp_lgrp_hint;
3005		do {
3006			if (++lgrpid > lgrp_alloc_max)
3007				lgrpid = 0;
3008		} while (!klgrpset_ismember(lgrpset, lgrpid));
3009
3010		break;
3011	default:
3012	case LGRP_CHOOSE_TIME:
3013	case LGRP_CHOOSE_RANDOM:
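		/*
		 * Pick a pseudo-random member of the partition's lgroup set
		 * as the starting point for the search below: count the
		 * lgroups in the set, derive an offset from gethrtime(), and
		 * walk the set until that many members have been passed.
		 */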
3014		klgrpset_nlgrps(lgrpset, lgrp_count);
3015		lgrpid_offset =
3016		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
3017		for (lgrpid = 0; ; lgrpid++) {
3018			if (klgrpset_ismember(lgrpset, lgrpid)) {
3019				if (--lgrpid_offset == 0)
3020					break;
3021			}
3022		}
3023		break;
3024	}
3025
3026	lgrpid_start = lgrpid;
3027
3028	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
3029	    lgrp_id_t, cpupart->cp_lgrp_hint);
3030
3031	/*
3032	 * Use lgroup affinities (if any) to choose best lgroup
3033	 *
3034	 * NOTE: Assumes that thread is protected from going away and its
3035	 *	 lgroup affinities won't change (ie. p_lock, or
3036	 *	 thread_lock() being held and/or CPUs paused)
3037	 */
3038	if (t->t_lgrp_affinity) {
3039		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE);
3040		if (lpl != NULL)
3041			return (lpl);
3042	}
3043
3044	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
3045
3046	do {
3047		pgcnt_t	npgs;
3048
3049		/*
3050		 * Skip any lgroups outside of thread's pset
3051		 */
3052		if (!klgrpset_ismember(lgrpset, lgrpid)) {
3053			if (++lgrpid > lgrp_alloc_max)
3054				lgrpid = 0;	/* wrap the search */
3055			continue;
3056		}
3057
3058		/*
3059		 * Skip any non-leaf lgroups
3060		 */
3061		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
3062			continue;
3063
3064		/*
3065		 * Skip any lgroups without enough free memory
3066		 * (when threshold set to nonzero positive value)
3067		 */
3068		if (lgrp_mem_free_thresh > 0) {
3069			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
3070			if (npgs < lgrp_mem_free_thresh) {
3071				if (++lgrpid > lgrp_alloc_max)
3072					lgrpid = 0;	/* wrap the search */
3073				continue;
3074			}
3075		}
3076
3077		lpl = &cpupart->cp_lgrploads[lgrpid];
3078		if (klgrpset_isempty(p->p_lgrpset) ||
3079		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
3080			/*
3081			 * Either this is a new process or the process already
3082			 * has threads on this lgrp, so this is a preferred
3083			 * lgroup for the thread.
3084			 */
3085			if (bestlpl == NULL ||
3086			    lpl_pick(lpl, bestlpl)) {
3087				bestload = lpl->lpl_loadavg;
3088				bestlpl = lpl;
3089			}
3090		} else {
3091			/*
3092			 * The process doesn't have any threads on this lgrp,
3093			 * but we're willing to consider this lgrp if the load
3094			 * difference is big enough to justify splitting up
3095			 * the process' threads.
3096			 */
3097			if (bestrlpl == NULL ||
3098			    lpl_pick(lpl, bestrlpl)) {
3099				bestrload = lpl->lpl_loadavg;
3100				bestrlpl = lpl;
3101			}
3102		}
3103		if (++lgrpid > lgrp_alloc_max)
3104			lgrpid = 0;	/* wrap the search */
3105	} while (lgrpid != lgrpid_start);
3106
3107	/*
3108	 * Return root lgroup if threshold isn't set to maximum value and
3109	 * the lowest lgroup load average is more than that threshold
3110	 */
3111	if (lgrp_load_thresh != UINT32_MAX &&
3112	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
3113		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
3114
3115	/*
3116	 * If all the lgroups over which the thread's process is spread are
3117	 * heavily loaded, or otherwise undesirable, we'll consider placing
3118	 * the thread on one of the other leaf lgroups in the thread's
3119	 * partition.
3120	 */
3121	if ((bestlpl == NULL) ||
3122	    ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
3123	    (bestrload < bestload) &&	/* paranoid about wraparound */
3124	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
3125	    bestload))) {
3126		bestlpl = bestrlpl;
3127	}
3128
3129	if (bestlpl == NULL) {
3130		/*
3131		 * No lgroup looked particularly good, but we still
3132		 * have to pick something. Go with the randomly selected
3133		 * legal lgroup we started with above.
3134		 */
3135		bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
3136	}
3137
3138	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
3139	bestlpl->lpl_homed_time = gethrtime_unscaled();
3140
3141	ASSERT(bestlpl->lpl_ncpu > 0);
3142	return (bestlpl);
3143}
3144
3145/*
3146 * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
3147 * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
3148 */
3149static int
3150lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3151{
3152	lgrp_load_t	l1, l2;
3153	lgrp_load_t	tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3154
3155	l1 = lpl1->lpl_loadavg;
3156	l2 = lpl2->lpl_loadavg;
3157
3158	if ((l1 + tolerance < l2) && (l1 < l2)) {
3159		/* lpl1 is significantly less loaded than lpl2 */
3160		return (1);
3161	}
3162
3163	if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3164	    l1 + tolerance >= l2 && l1 < l2 &&
3165	    lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3166		/*
3167		 * lpl1's load is within the tolerance of lpl2. We're
3168		 * willing to consider it to be better, however, if
3169		 * it has been longer since we last homed a thread there
3170		 */
3171		return (1);
3172	}
3173
3174	return (0);
3175}
3176
3177/*
3178 * An LWP is expected to be assigned to an lgroup for at least this long
3179 * for its anticipatory load to be justified.  NOTE that this value should
3180 * not be set extremely huge (say, larger than 100 years), to avoid problems
3181 * with overflow in the calculation that uses it.
3182 */
3183#define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
3184hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
3185
3186/*
3187 * Routine to change a thread's lgroup affiliation.  This routine updates
3188 * the thread's kthread_t struct and its process' proc_t struct to note the
3189 * thread's new lgroup affiliation, and its lgroup affinities.
3190 *
3191 * Note that this is the only routine that modifies a thread's t_lpl field,
3192 * and that adds in or removes anticipatory load.
3193 *
3194 * If the thread is exiting, newlpl is NULL.
3195 *
3196 * Locking:
3197 * The following lock must be held on entry:
3198 *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
3199 *		doesn't get removed from t's partition
3200 *
3201 * This routine is not allowed to grab any locks, since it may be called
3202 * with cpus paused (such as from cpu_offline).
3203 */
3204void
3205lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
3206{
3207	proc_t		*p;
3208	lpl_t		*lpl, *oldlpl;
3209	lgrp_id_t	oldid;
3210	kthread_t	*tp;
3211	uint_t		ncpu;
3212	lgrp_load_t	old, new;
3213
3214	ASSERT(t);
3215	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3216	    THREAD_LOCK_HELD(t));
3217
3218	/*
3219	 * If not changing lpls, just return
3220	 */
3221	if ((oldlpl = t->t_lpl) == newlpl)
3222		return;
3223
3224	/*
3225	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
3226	 * associated with process 0 rather than with its original process).
3227	 */
3228	if (t->t_proc_flag & TP_LWPEXIT) {
3229		if (newlpl != NULL) {
3230			t->t_lpl = newlpl;
3231		}
3232		return;
3233	}
3234
3235	p = ttoproc(t);
3236
3237	/*
3238	 * If the thread had a previous lgroup, update its process' p_lgrpset
3239	 * to account for it being moved from its old lgroup.
3240	 */
3241	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
3242	    (p->p_tlist != NULL)) {
3243		oldid = oldlpl->lpl_lgrpid;
3244
3245		if (newlpl != NULL)
3246			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);
3247
3248		if ((do_lgrpset_delete) &&
3249		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
3250			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
3251				/*
3252				 * Check if a thread other than the thread
3253				 * that's moving is assigned to the same
3254				 * lgroup as the thread that's moving.  Note
3255				 * that we have to compare lgroup IDs, rather
3256				 * than simply comparing t_lpl's, since the
3257				 * threads may belong to different partitions
3258				 * but be assigned to the same lgroup.
3259				 */
3260				ASSERT(tp->t_lpl != NULL);
3261
3262				if ((tp != t) &&
3263				    (tp->t_lpl->lpl_lgrpid == oldid)) {
3264					/*
3265					 * Another thread is assigned to the
3266					 * same lgroup as the thread that's
3267					 * moving, p_lgrpset doesn't change.
3268					 */
3269					break;
3270				} else if (tp == p->p_tlist) {
3271					/*
3272					 * No other thread is assigned to the
3273					 * same lgroup as the exiting thread,
3274					 * clear the lgroup's bit in p_lgrpset.
3275					 */
3276					klgrpset_del(p->p_lgrpset, oldid);
3277					break;
3278				}
3279			}
3280		}
3281
3282		/*
3283		 * If this thread was assigned to its old lgroup for such a
3284		 * short amount of time that the anticipatory load that was
3285		 * added on its behalf has aged very little, remove that
3286		 * anticipatory load.
3287		 */
3288		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
3289		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
3290			lpl = oldlpl;
3291			for (;;) {
3292				do {
3293					old = new = lpl->lpl_loadavg;
3294					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
3295					if (new > old) {
3296						/*
3297						 * this can happen if the load
3298						 * average was aged since we
3299						 * added in the anticipatory
3300						 * load
3301						 */
3302						new = 0;
3303					}
3304				} while (cas32(
3305					(lgrp_load_t *)&lpl->lpl_loadavg, old,
3306					    new) != old);
3307
3308				lpl = lpl->lpl_parent;
3309				if (lpl == NULL)
3310					break;
3311
3312				ncpu = lpl->lpl_ncpu;
3313				ASSERT(ncpu > 0);
3314			}
3315		}
3316	}
3317	/*
3318	 * If the thread has a new lgroup (i.e. it's not exiting), update its
3319	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
3320	 * to its new lgroup to account for its move to its new lgroup.
3321	 */
3322	if (newlpl != NULL) {
3323		/*
3324		 * This thread is moving to a new lgroup
3325		 */
3326		t->t_lpl = newlpl;
3327
3328		/*
3329		 * Reflect move in load average of new lgroup
3330		 * unless it is root lgroup
3331		 */
3332		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
3333			return;
3334
3335		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
3336			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
3337		}
3338
3339		/*
3340		 * It'll take some time for the load on the new lgroup
3341		 * to reflect this thread's placement on it.  We'd
3342		 * like not, however, to have all threads between now
3343		 * and then also piling on to this lgroup.  To avoid
3344		 * this pileup, we anticipate the load this thread
3345		 * will generate on its new lgroup.  The goal is to
3346		 * make the lgroup's load appear as though the thread
3347		 * had been there all along.  We're very conservative
3348		 * in calculating this anticipatory load, we assume
3349		 * the worst case (100% CPU-bound thread).  This
3350		 * may be modified in the future to be more accurate.
3351		 */
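		/*
		 * Apply the anticipatory load to the new lgroup and each of
		 * its ancestors, scaled at each level by that lpl's cpu count.
		 */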
3352		lpl = newlpl;
3353		for (;;) {
3354			ncpu = lpl->lpl_ncpu;
3355			ASSERT(ncpu > 0);
3356			do {
3357				old = new = lpl->lpl_loadavg;
3358				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
3359				/*
3360				 * Check for overflow
3361				 * Underflow not possible here
3362				 */
3363				if (new < old)
3364					new = UINT32_MAX;
3365			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
3366			    new) != old);
3367
3368			lpl = lpl->lpl_parent;
3369			if (lpl == NULL)
3370				break;
3371		}
3372		t->t_anttime = gethrtime();
3373	}
3374}
3375
3376/*
3377 * Return lgroup memory allocation policy given advice from madvise(3C)
3378 */
3379lgrp_mem_policy_t
3380lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3381{
3382	switch (advice) {
3383	case MADV_ACCESS_LWP:
3384		return (LGRP_MEM_POLICY_NEXT);
3385	case MADV_ACCESS_MANY:
3386		return (LGRP_MEM_POLICY_RANDOM);
3387	default:
3388		return (lgrp_mem_policy_default(size, type));
3389	}
3390}
3391
3392/*
3393 * Figure out default policy
3394 */
3395lgrp_mem_policy_t
3396lgrp_mem_policy_default(size_t size, int type)
3397{
3398	cpupart_t		*cp;
3399	lgrp_mem_policy_t	policy;
3400	size_t			pset_mem_size;
3401
3402	/*
3403	 * Randomly allocate memory across lgroups for shared memory
3404	 * beyond a certain threshold
3405	 */
3406	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3407	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3408		/*
3409		 * Get total memory size of current thread's pset
3410		 */
3411		kpreempt_disable();
3412		cp = curthread->t_cpupart;
3413		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3414		kpreempt_enable();
3415
3416		/*
3417		 * Choose policy to randomly allocate memory across
3418		 * lgroups in pset if it will fit and is not default
3419		 * partition.  Otherwise, allocate memory randomly
3420		 * across machine.
3421		 */
3422		if (lgrp_mem_pset_aware && size < pset_mem_size)
3423			policy = LGRP_MEM_POLICY_RANDOM_PSET;
3424		else
3425			policy = LGRP_MEM_POLICY_RANDOM;
3426	} else
3427		/*
3428		 * Apply default policy for private memory and
3429		 * shared memory under the respective random
3430		 * threshold.
3431		 */
3432		policy = lgrp_mem_default_policy;
3433
3434	return (policy);
3435}
3436
3437/*
3438 * Get memory allocation policy for this segment
3439 */
3440lgrp_mem_policy_info_t *
3441lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3442{
3443	lgrp_mem_policy_info_t	*policy_info;
3444	extern struct seg_ops	segspt_ops;
3445	extern struct seg_ops	segspt_shmops;
3446
3447	/*
3448	 * This is for binary compatibility to protect against third party
3449	 * segment drivers which haven't recompiled to allow for
3450	 * SEGOP_GETPOLICY()
3451	 */
3452	if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3453	    seg->s_ops != &segspt_shmops)
3454		return (NULL);
3455
3456	policy_info = NULL;
3457	if (seg->s_ops->getpolicy != NULL)
3458		policy_info = SEGOP_GETPOLICY(seg, vaddr);
3459
3460	return (policy_info);
3461}
3462
3463/*
3464 * Set policy for allocating private memory given desired policy, policy info,
3465 * and size in bytes of memory that the policy is being applied to.
3466 * Return 0 if policy wasn't set already and 1 if policy was set already
3467 */
3468int
3469lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3470    lgrp_mem_policy_info_t *policy_info, size_t size)
3471{
3472
3473	ASSERT(policy_info != NULL);
3474
3475	if (policy == LGRP_MEM_POLICY_DEFAULT)
3476		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3477
3478	/*
3479	 * Policy set already?
3480	 */
3481	if (policy == policy_info->mem_policy)
3482		return (1);
3483
3484	/*
3485	 * Set policy
3486	 */
3487	policy_info->mem_policy = policy;
3488	policy_info->mem_reserved = 0;
3489
3490	return (0);
3491}
3492
3493
3494/*
3495 * Get shared memory allocation policy with given tree and offset
3496 */
3497lgrp_mem_policy_info_t *
3498lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3499    u_offset_t vn_off)
3500{
3501	u_offset_t		off;
3502	lgrp_mem_policy_info_t	*policy_info;
3503	lgrp_shm_policy_seg_t	*policy_seg;
3504	lgrp_shm_locality_t	*shm_locality;
3505	avl_tree_t		*tree;
3506	avl_index_t		where;
3507
3508	/*
3509	 * Get policy segment tree from anon_map or vnode and use specified
3510	 * anon index or vnode offset as offset
3511	 *
3512	 * Assume that no lock needs to be held on anon_map or vnode, since
3513	 * they should be protected by their reference count which must be
3514	 * nonzero for an existing segment
3515	 */
3516	if (amp) {
3517		ASSERT(amp->refcnt != 0);
3518		shm_locality = amp->locality;
3519		if (shm_locality == NULL)
3520			return (NULL);
3521		tree = shm_locality->loc_tree;
3522		off = ptob(anon_index);
3523	} else if (vp) {
3524		shm_locality = vp->v_locality;
3525		if (shm_locality == NULL)
3526			return (NULL);
3527		ASSERT(shm_locality->loc_count != 0);
3528		tree = shm_locality->loc_tree;
3529		off = vn_off;
3530	}
3531
3532	if (tree == NULL)
3533		return (NULL);
3534
3535	/*
3536	 * Lookup policy segment for offset into shared object and return
3537	 * policy info
3538	 */
3539	rw_enter(&shm_locality->loc_lock, RW_READER);
3540	policy_info = NULL;
3541	policy_seg = avl_find(tree, &off, &where);
3542	if (policy_seg)
3543		policy_info = &policy_seg->shm_policy;
3544	rw_exit(&shm_locality->loc_lock);
3545
3546	return (policy_info);
3547}
3548
3549/*
3550 * Default memory allocation policy for kernel segmap pages
3551 */
3552lgrp_mem_policy_t	lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;
3553
3554/*
3555 * Return lgroup to use for allocating memory
3556 * given the segment and address
3557 *
3558 * There is no mutual exclusion between calls
3559 * to this routine and DR, so this routine and whoever calls it
3560 * should be mindful of the possibility that the lgrp returned
3561 * may be deleted. If this happens, dereferences of the lgrp
3562 * pointer will still be safe, but the resources in the lgrp will
3563 * be gone, and LGRP_EXISTS() will no longer be true.
3564 */
3565lgrp_t *
3566lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
3567{
3568	int			i;
3569	lgrp_t			*lgrp;
3570	klgrpset_t		lgrpset;
3571	int			lgrps_spanned;
3572	unsigned long		off;
3573	lgrp_mem_policy_t	policy;
3574	lgrp_mem_policy_info_t	*policy_info;
3575	ushort_t		random;
3576	int			stat = 0;
3577	extern struct seg	*segkmap;
3578
3579	/*
3580	 * Just return null if the lgrp framework hasn't finished
3581	 * initializing or if this is a UMA machine.
3582	 */
3583	if (nlgrps == 1 || !lgrp_initialized)
3584		return (lgrp_root);
3585
3586	/*
3587	 * Get memory allocation policy for this segment
3588	 */
3589	policy = lgrp_mem_default_policy;
3590	if (seg != NULL) {
3591		if (seg->s_as == &kas) {
3592			if (seg == segkmap)
3593				policy = lgrp_segmap_default_policy;
3594			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
3595			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
3596				policy = LGRP_MEM_POLICY_RANDOM;
3597		} else {
3598			policy_info = lgrp_mem_policy_get(seg, vaddr);
3599			if (policy_info != NULL)
3600				policy = policy_info->mem_policy;
3601		}
3602	}
3603	lgrpset = 0;
3604
3605	/*
3606	 * Initialize lgroup to home by default
3607	 */
3608	lgrp = lgrp_home_lgrp();
3609
3610	/*
3611	 * When homing threads on root lgrp, override default memory
3612	 * allocation policies with root lgroup memory allocation policy
3613	 */
3614	if (lgrp == lgrp_root)
3615		policy = lgrp_mem_policy_root;
3616
3617	/*
3618	 * Implement policy
3619	 */
3620	switch (policy) {
3621	case LGRP_MEM_POLICY_NEXT_CPU:
3622
3623		/*
3624		 * Return lgroup of current CPU which faulted on memory
3625		 * If the CPU isn't currently in an lgrp, then opt to
3626		 * allocate from the root.
3627		 *
3628		 * Kernel preemption needs to be disabled here to prevent
3629		 * the current CPU from going away before lgrp is found.
3630		 */
3631		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
3632			lgrp = lgrp_root;
3633		} else {
3634			kpreempt_disable();
3635			lgrp = lgrp_cpu_to_lgrp(CPU);
3636			kpreempt_enable();
3637		}
3638		break;
3639
3640	case LGRP_MEM_POLICY_NEXT:
3641	case LGRP_MEM_POLICY_DEFAULT:
3642	default:
3643
3644		/*
3645		 * Just return current thread's home lgroup
3646		 * for default policy (next touch)
3647		 * If the thread is homed to the root,
3648		 * then the default policy is random across lgroups.
3649		 * Fallthrough to the random case.
3650		 */
3651		if (lgrp != lgrp_root) {
3652			if (policy == LGRP_MEM_POLICY_NEXT)
3653				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
3654			else
3655				lgrp_stat_add(lgrp->lgrp_id,
3656				    LGRP_NUM_DEFAULT, 1);
3657			break;
3658		}
3659		/* LINTED fallthrough on case statement */
3660	case LGRP_MEM_POLICY_RANDOM:
3661
3662		/*
3663		 * Return a random leaf lgroup with memory
3664		 */
3665		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3666		/*
3667		 * Count how many lgroups are spanned
3668		 */
3669		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3670
3671		/*
3672		 * There may be no memnodes in the root lgroup during DR copy
3673		 * rename on a system with only two boards (memnodes)
3674		 * configured. In this case just return the root lgrp.
3675		 */
3676		if (lgrps_spanned == 0) {
3677			lgrp = lgrp_root;
3678			break;
3679		}
3680
3681		/*
3682		 * Pick a random offset within lgroups spanned
3683		 * and return lgroup at that offset
3684		 */
3685		random = (ushort_t)gethrtime() >> 4;
3686		off = random % lgrps_spanned;
3687		ASSERT(off <= lgrp_alloc_max);
3688
3689		for (i = 0; i <= lgrp_alloc_max; i++) {
3690			if (!klgrpset_ismember(lgrpset, i))
3691				continue;
3692			if (off)
3693				off--;
3694			else {
3695				lgrp = lgrp_table[i];
3696				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3697				    1);
3698				break;
3699			}
3700		}
3701		break;
3702
3703	case LGRP_MEM_POLICY_RANDOM_PROC:
3704
3705		/*
3706		 * Grab copy of bitmask of lgroups spanned by
3707		 * this process
3708		 */
3709		klgrpset_copy(lgrpset, curproc->p_lgrpset);
3710		stat = LGRP_NUM_RANDOM_PROC;
3711
3712		/* LINTED fallthrough on case statement */
3713	case LGRP_MEM_POLICY_RANDOM_PSET:
3714
3715		if (!stat)
3716			stat = LGRP_NUM_RANDOM_PSET;
3717
3718		if (klgrpset_isempty(lgrpset)) {
3719			/*
3720			 * Grab copy of bitmask of lgroups spanned by
3721			 * this processor set
3722			 */
3723			kpreempt_disable();
3724			klgrpset_copy(lgrpset,
3725			    curthread->t_cpupart->cp_lgrpset);
3726			kpreempt_enable();
3727		}
3728
3729		/*
3730		 * Count how many lgroups are spanned
3731		 */
3732		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3733		ASSERT(lgrps_spanned <= nlgrps);
3734
3735		/*
3736		 * lgrps_spanned should always be non-zero, but to be on the
3737		 * safe side, return lgrp_root if the set turns out to be empty.
3738		 */
3739		if (lgrps_spanned == 0) {
3740			lgrp = lgrp_root;
3741			break;
3742		}
3743
3744		/*
3745		 * Pick a random offset within lgroups spanned
3746		 * and return lgroup at that offset
3747		 */
3748		random = (ushort_t)gethrtime() >> 4;
3749		off = random % lgrps_spanned;
3750		ASSERT(off <= lgrp_alloc_max);
3751
3752		for (i = 0; i <= lgrp_alloc_max; i++) {
3753			if (!klgrpset_ismember(lgrpset, i))
3754				continue;
3755			if (off)
3756				off--;
3757			else {
3758				lgrp = lgrp_table[i];
3759				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3760				    1);
3761				break;
3762			}
3763		}
3764		break;
3765
3766	case LGRP_MEM_POLICY_ROUNDROBIN:
3767
3768		/*
3769		 * Use the offset within the segment to determine how far
3770		 * past the home lgroup to advance when choosing the next
3771		 * lgroup to allocate memory from
3772		 */
3773		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
3774		    (lgrp_alloc_max + 1);
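		/*
		 * Worked example (illustration only): with an 8K page size
		 * and lgrp_alloc_max == 3, a fault at segment offset 24K
		 * gives off = (24K / 8K) % 4 = 3, so the loop below advances
		 * three lgroups with memory past the home lgroup, wrapping
		 * around the lgroup table as needed.
		 */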
3775
3776		kpreempt_disable();
3777		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3778		i = lgrp->lgrp_id;
3779		kpreempt_enable();
3780
3781		while (off > 0) {
3782			i = (i + 1) % (lgrp_alloc_max + 1);
3783			lgrp = lgrp_table[i];
3784			if (klgrpset_ismember(lgrpset, i))
3785				off--;
3786		}
3787		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
3788
3789		break;
3790	}
3791
3792	ASSERT(lgrp != NULL);
3793	return (lgrp);
3794}
3795
3796/*
3797 * Return the number of pages in an lgroup
3798 *
3799 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3800 *	 could cause tests that rely on the numat driver to fail....
3801 */
3802pgcnt_t
3803lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3804{
3805	lgrp_t *lgrp;
3806
3807	lgrp = lgrp_table[lgrpid];
3808	if (!LGRP_EXISTS(lgrp) ||
3809	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3810	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3811		return (0);
3812
3813	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3814}
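
/*
 * Minimal usage sketch for lgrp_mem_size() (illustration only; assumes the
 * LGRP_MEM_SIZE_FREE query value defined in <sys/lgrp.h>):
 *
 *	pgcnt_t	npgs;
 *
 *	npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
 *	if (npgs == 0) {
 *		(lgroup doesn't exist or currently owns no memory)
 *	}
 */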
3815
3816/*
3817 * Initialize lgroup shared memory allocation policy support
3818 */
3819void
3820lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3821{
3822	lgrp_shm_locality_t	*shm_locality;
3823
3824	/*
3825	 * Initialize the locality field in the anon_map.
3826	 * No locks are needed because this is called when the anon_map is
3827	 * allocated but not yet used anywhere.
3828	 */
3829	if (amp) {
3830		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3831		if (amp->locality == NULL) {
3832			/*
3833			 * Allocate and initialize shared memory locality info
3834			 * and set anon_map locality pointer to it
3835			 * Drop lock across kmem_alloc(KM_SLEEP)
3836			 */
3837			ANON_LOCK_EXIT(&amp->a_rwlock);
3838			shm_locality = kmem_alloc(sizeof (*shm_locality),
3839			    KM_SLEEP);
3840			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3841			    NULL);
3842			shm_locality->loc_count = 1;	/* not used for amp */
3843			shm_locality->loc_tree = NULL;
3844
3845			/*
3846			 * Reacquire lock and check to see whether anyone beat
3847			 * us to initializing the locality info
3848			 */
3849			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3850			if (amp->locality != NULL) {
3851				rw_destroy(&shm_locality->loc_lock);
3852				kmem_free(shm_locality,
3853				    sizeof (*shm_locality));
3854			} else
3855				amp->locality = shm_locality;
3856		}
3857		ANON_LOCK_EXIT(&amp->a_rwlock);
3858		return;
3859	}
3860
3861	/*
3862	 * Allocate shared vnode policy info if vnode is not locality aware yet
3863	 */
3864	mutex_enter(&vp->v_lock);
3865	if ((vp->v_flag & V_LOCALITY) == 0) {
3866		/*
3867		 * Allocate and initialize shared memory locality info
3868		 */
3869		mutex_exit(&vp->v_lock);
3870		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3871		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3872		shm_locality->loc_count = 1;
3873		shm_locality->loc_tree = NULL;
3874
3875		/*
3876		 * Point vnode locality field at shared vnode policy info
3877		 * and set locality aware flag in vnode
3878		 */
3879		mutex_enter(&vp->v_lock);
3880		if ((vp->v_flag & V_LOCALITY) == 0) {
3881			vp->v_locality = shm_locality;
3882			vp->v_flag |= V_LOCALITY;
3883		} else {
3884			/*
3885			 * Lost race so free locality info and increment count.
3886			 */
3887			rw_destroy(&shm_locality->loc_lock);
3888			kmem_free(shm_locality, sizeof (*shm_locality));
3889			shm_locality = vp->v_locality;
3890			shm_locality->loc_count++;
3891		}
3892		mutex_exit(&vp->v_lock);
3893
3894		return;
3895	}
3896
3897	/*
3898	 * Increment the reference count of segments mapping this vnode
3899	 * shared
3900	 */
3901	shm_locality = vp->v_locality;
3902	shm_locality->loc_count++;
3903	mutex_exit(&vp->v_lock);
3904}
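
/*
 * Both the anon_map and vnode paths above follow the same pattern to avoid
 * sleeping in kmem_alloc() while holding a lock.  A sketch of the pattern
 * (illustration only, not literal code from this file):
 *
 *	take lock
 *	if (already initialized)
 *		drop lock and return
 *	drop lock
 *	new = kmem_alloc(..., KM_SLEEP)		(may sleep)
 *	take lock
 *	if (still uninitialized)
 *		install new
 *	else
 *		free new and use the copy that won the race
 *	drop lock
 */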
3905
3906/*
3907 * Destroy the given shared memory policy segment tree
3908 */
3909void
3910lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3911{
3912	lgrp_shm_policy_seg_t	*cur;
3913	lgrp_shm_policy_seg_t	*next;
3914
3915	if (tree == NULL)
3916		return;
3917
3918	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
3919	while (cur != NULL) {
3920		next = AVL_NEXT(tree, cur);
3921		avl_remove(tree, cur);
3922		kmem_free(cur, sizeof (*cur));
3923		cur = next;
3924	}
3925	kmem_free(tree, sizeof (avl_tree_t));
3926}
3927
3928/*
3929 * Uninitialize lgroup shared memory allocation policy support
3930 */
3931void
3932lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
3933{
3934	lgrp_shm_locality_t	*shm_locality;
3935
3936	/*
3937	 * For an anon_map, deallocate the shared memory policy tree and
3938	 * zero the locality field.
3939	 * No locks are needed because the anon_map is being freed.
3940	 */
3941	if (amp) {
3942		if (amp->locality == NULL)
3943			return;
3944		shm_locality = amp->locality;
3945		shm_locality->loc_count = 0;	/* not really used for amp */
3946		rw_destroy(&shm_locality->loc_lock);
3947		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
3948		kmem_free(shm_locality, sizeof (*shm_locality));
3949		amp->locality = 0;
3950		return;
3951	}
3952
3953	/*
3954	 * For vnode, decrement reference count of segments mapping this vnode
3955	 * shared and delete locality info if reference count drops to 0
3956	 */
3957	mutex_enter(&vp->v_lock);
3958	shm_locality = vp->v_locality;
3959	shm_locality->loc_count--;
3960
3961	if (shm_locality->loc_count == 0) {
3962		rw_destroy(&shm_locality->loc_lock);
3963		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
3964		kmem_free(shm_locality, sizeof (*shm_locality));
3965		vp->v_locality = 0;
3966		vp->v_flag &= ~V_LOCALITY;
3967	}
3968	mutex_exit(&vp->v_lock);
3969}
3970
3971/*
3972 * Compare two shared memory policy segments
3973 * Used by AVL tree code for searching
3974 */
3975int
3976lgrp_shm_policy_compar(const void *x, const void *y)
3977{
3978	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
3979	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
3980
3981	if (a->shm_off < b->shm_off)
3982		return (-1);
3983	if (a->shm_off >= b->shm_off + b->shm_size)
3984		return (1);
3985	return (0);
3986}
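
/*
 * Note that the comparator treats any offset falling within
 * [shm_off, shm_off + shm_size) as "equal" to that segment, so an AVL
 * lookup with a bare offset finds the segment containing that offset
 * (see the avl_find(tree, &off, &where) calls below, which rely on
 * shm_off being the first member of lgrp_shm_policy_seg_t so that a
 * pointer to an offset can stand in for a segment key).
 */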
3987
3988/*
3989 * Concatenate seg1 with seg2 and remove seg2
3990 */
3991static int
3992lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
3993    lgrp_shm_policy_seg_t *seg2)
3994{
3995	if (!seg1 || !seg2 ||
3996	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
3997	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
3998		return (-1);
3999
4000	seg1->shm_size += seg2->shm_size;
4001	avl_remove(tree, seg2);
4002	kmem_free(seg2, sizeof (*seg2));
4003	return (0);
4004}
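
/*
 * For example (illustration only): adjacent segments covering [0, 8K) and
 * [8K, 12K) with the same policy are merged into one segment covering
 * [0, 12K), and the right-hand segment is freed.  Segments that aren't
 * adjacent or that differ in policy are left alone and -1 is returned.
 */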
4005
4006/*
4007 * Split the segment at the given offset and return the rightmost
4008 * (uppermost) segment.  Assumes that there are no overlapping segments.
4009 */
4010static lgrp_shm_policy_seg_t *
4011lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
4012    u_offset_t off)
4013{
4014	lgrp_shm_policy_seg_t	*newseg;
4015	avl_index_t		where;
4016
4017	ASSERT(seg != NULL);
4018	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
4019
4020	if (!seg || off < seg->shm_off || off > seg->shm_off +
4021	    seg->shm_size)
4022		return (NULL);
4023
4024	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4025		return (seg);
4026
4027	/*
4028	 * Adjust size of left segment and allocate new (right) segment
4029	 */
4030	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4031	newseg->shm_policy = seg->shm_policy;
4032	newseg->shm_off = off;
4033	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4034	seg->shm_size = off - seg->shm_off;
4035
4036	/*
4037	 * Find where to insert new segment in AVL tree and insert it
4038	 */
4039	(void) avl_find(tree, &off, &where);
4040	avl_insert(tree, newseg, where);
4041
4042	return (newseg);
4043}
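
/*
 * For example (illustration only): splitting a segment covering [0, 16K)
 * at offset 4K shrinks it to [0, 4K) and inserts a new segment covering
 * [4K, 16K), which is returned.  Splitting exactly at either end of a
 * segment is a no-op that simply returns the original segment.
 */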
4044
4045/*
4046 * Set shared memory allocation policy on specified shared object at given
4047 * offset and length
4048 *
4049 * Return 0 if the policy wasn't already set, 1 if it was already set, and
4050 * -1 if the policy can't be set.
4051 */
4052int
4053lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
4054    ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
4055{
4056	u_offset_t		eoff;
4057	lgrp_shm_policy_seg_t	*next;
4058	lgrp_shm_policy_seg_t	*newseg;
4059	u_offset_t		off;
4060	u_offset_t		oldeoff;
4061	lgrp_shm_policy_seg_t	*prev;
4062	int			retval;
4063	lgrp_shm_policy_seg_t	*seg;
4064	lgrp_shm_locality_t	*shm_locality;
4065	avl_tree_t		*tree;
4066	avl_index_t		where;
4067
4068	ASSERT(amp || vp);
4069	ASSERT((len & PAGEOFFSET) == 0);
4070
4071	if (len == 0)
4072		return (-1);
4073
4074	retval = 0;
4075
4076	/*
4077	 * Get the locality info and starting offset into the shared object.
4078	 * Try the anon map first and then the vnode.
4079	 * Assume that no locks need to be held on the anon_map or vnode, since
4080	 * each should be protected by its reference count, which must be nonzero
4081	 * for an existing segment.
4082	 */
4083	if (amp) {
4084		/*
4085		 * Get policy info from the anon_map
4087		 */
4088		ASSERT(amp->refcnt != 0);
4089		if (amp->locality == NULL)
4090			lgrp_shm_policy_init(amp, NULL);
4091		shm_locality = amp->locality;
4092		off = ptob(anon_index);
4093	} else if (vp) {
4094		/*
4095		 * Get policy info from vnode
4096		 */
4097		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
4098			lgrp_shm_policy_init(NULL, vp);
4099		shm_locality = vp->v_locality;
4100		ASSERT(shm_locality->loc_count != 0);
4101		off = vn_off;
4102	} else
4103		return (-1);
4104
4105	ASSERT((off & PAGEOFFSET) == 0);
4106
4107	/*
4108	 * Figure out default policy
4109	 */
4110	if (policy == LGRP_MEM_POLICY_DEFAULT)
4111		policy = lgrp_mem_policy_default(len, MAP_SHARED);
4112
4113	/*
4114	 * Create AVL tree if there isn't one yet
4115	 * and set locality field to point at it
4116	 */
4117	rw_enter(&shm_locality->loc_lock, RW_WRITER);
4118	tree = shm_locality->loc_tree;
4119	if (!tree) {
4120		rw_exit(&shm_locality->loc_lock);
4121
4122		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
4123
4124		rw_enter(&shm_locality->loc_lock, RW_WRITER);
4125		if (shm_locality->loc_tree == NULL) {
4126			avl_create(tree, lgrp_shm_policy_compar,
4127			    sizeof (lgrp_shm_policy_seg_t),
4128			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
4129			shm_locality->loc_tree = tree;
4130		} else {
4131			/*
4132			 * Another thread managed to set up the tree
4133			 * before we could. Free the tree we allocated
4134			 * and use the one that's already there.
4135			 */
4136			kmem_free(tree, sizeof (*tree));
4137			tree = shm_locality->loc_tree;
4138		}
4139	}
4140
4141	/*
4142	 * Set policy
4143	 *
4144	 * Need to maintain hold on writer's lock to keep tree from
4145	 * changing out from under us
4146	 */
4147	while (len != 0) {
4148		/*
4149		 * Find policy segment for specified offset into shared object
4150		 */
4151		seg = avl_find(tree, &off, &where);
4152
4153		/*
4154		 * Didn't find any existing segment that contains specified
4155		 * offset, so allocate new segment, insert it, and concatenate
4156		 * with adjacent segments if possible
4157		 */
4158		if (seg == NULL) {
4159			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
4160			    KM_SLEEP);
4161			newseg->shm_policy.mem_policy = policy;
4162			newseg->shm_policy.mem_reserved = 0;
4163			newseg->shm_off = off;
4164			avl_insert(tree, newseg, where);
4165
4166			/*
4167			 * Check to see whether new segment overlaps with next
4168			 * one, set length of new segment accordingly, and
4169			 * calculate remaining length and next offset
4170			 */
4171			seg = AVL_NEXT(tree, newseg);
4172			if (seg == NULL || off + len <= seg->shm_off) {
4173				newseg->shm_size = len;
4174				len = 0;
4175			} else {
4176				newseg->shm_size = seg->shm_off - off;
4177				off = seg->shm_off;
4178				len -= newseg->shm_size;
4179			}
4180
4181			/*
4182			 * Try to concatenate new segment with next and
4183			 * previous ones, since they might have the same policy
4184			 * now.  Grab previous and next segments first because
4185			 * they will change on concatenation.
4186			 */
4187			prev = AVL_PREV(tree, newseg);
4188			next = AVL_NEXT(tree, newseg);
4189			(void) lgrp_shm_policy_concat(tree, newseg, next);
4190			(void) lgrp_shm_policy_concat(tree, prev, newseg);
4191
4192			continue;
4193		}
4194
4195		eoff = off + len;
4196		oldeoff = seg->shm_off + seg->shm_size;
4197
4198		/*
4199		 * Policy set already?
4200		 */
4201		if (policy == seg->shm_policy.mem_policy) {
4202			/*
4203			 * Nothing left to do if offset and length
4204			 * fall within this segment
4205			 */
4206			if (eoff <= oldeoff) {
4207				retval = 1;
4208				break;
4209			} else {
4210				len = eoff - oldeoff;
4211				off = oldeoff;
4212				continue;
4213			}
4214		}
4215
4216		/*
4217		 * Specified offset and length match existing segment exactly
4218		 */
4219		if (off == seg->shm_off && len == seg->shm_size) {
4220			/*
4221			 * Set policy and update current length
4222			 */
4223			seg->shm_policy.mem_policy = policy;
4224			seg->shm_policy.mem_reserved = 0;
4225			len = 0;
4226
4227			/*
4228			 * Try concatenating new segment with previous and next
4229			 * segments, since they might have the same policy now.
4230			 * Grab previous and next segments first because they
4231			 * will change on concatenation.
4232			 */
4233			prev = AVL_PREV(tree, seg);
4234			next = AVL_NEXT(tree, seg);
4235			(void) lgrp_shm_policy_concat(tree, seg, next);
4236			(void) lgrp_shm_policy_concat(tree, prev, seg);
4237		} else {
4238			/*
4239			 * Specified offset and length only apply to part of
4240			 * existing segment
4241			 */
4242
4243			/*
4244			 * New segment starts in the middle of the old one, so split
4245			 * the old one at the new offset and apply the policy to the tail
4246			 */
4247			newseg = NULL;
4248			if (off > seg->shm_off) {
4249				newseg = lgrp_shm_policy_split(tree, seg, off);
4250
4251				/*
4252				 * New segment ends where old one did, so try
4253				 * to concatenate with next segment
4254				 */
4255				if (eoff == oldeoff) {
4256					newseg->shm_policy.mem_policy = policy;
4257					newseg->shm_policy.mem_reserved = 0;
4258					(void) lgrp_shm_policy_concat(tree,
4259					    newseg, AVL_NEXT(tree, newseg));
4260					break;
4261				}
4262			}
4263
4264			/*
4265			 * New segment ends before old one, so split off end of
4266			 * old one
4267			 */
4268			if (eoff < oldeoff) {
4269				if (newseg) {
4270					(void) lgrp_shm_policy_split(tree,
4271					    newseg, eoff);
4272					newseg->shm_policy.mem_policy = policy;
4273					newseg->shm_policy.mem_reserved = 0;
4274				} else {
4275					(void) lgrp_shm_policy_split(tree, seg,
4276					    eoff);
4277					seg->shm_policy.mem_policy = policy;
4278					seg->shm_policy.mem_reserved = 0;
4279				}
4280
4281				if (off == seg->shm_off)
4282					(void) lgrp_shm_policy_concat(tree,
4283					    AVL_PREV(tree, seg), seg);
4284				break;
4285			}
4286
4287			/*
4288			 * Calculate remaining length and next offset
4289			 */
4290			len = eoff - oldeoff;
4291			off = oldeoff;
4292		}
4293	}
4294
4295	rw_exit(&shm_locality->loc_lock);
4296	return (retval);
4297}
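
/*
 * Minimal usage sketch for lgrp_shm_policy_set() (illustration only):
 * setting a random allocation policy across the first "len" bytes of a
 * shared anon_map, where len must be a multiple of the page size:
 *
 *	int	already;
 *
 *	already = lgrp_shm_policy_set(LGRP_MEM_POLICY_RANDOM, amp, 0,
 *	    NULL, 0, len);
 *	if (already == -1)
 *		(the policy could not be set)
 *
 * A vnode-backed shared object would instead pass amp == NULL along with
 * the vnode and its starting offset.
 */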
4298
4299/*
4300 * Return the best memnode from which to allocate memory given
4301 * an lgroup.
4302 *
4303 * "c" is for cookie, which is good enough for me.
4304 * It references a cookie struct that should be zeroed to initialize it.
4305 * The cookie should live on the caller's stack.
4306 *
4307 * The routine returns -1 when:
4308 *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
4309 *	- traverse is 1, and all the memnodes in the system have been
4310 *	  returned.
4311 */
4312int
4313lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
4314{
4315	lgrp_t		*lp = c->lmc_lgrp;
4316	mnodeset_t	nodes = c->lmc_nodes;
4317	int		cnt = c->lmc_cnt;
4318	int		offset, mnode;
4319
4320	extern int	max_mem_nodes;
4321
4322	/*
4323	 * If the set is empty, and the caller is willing, traverse
4324	 * up the hierarchy until we find a non-empty set.
4325	 */
4326	while (nodes == (mnodeset_t)0 || cnt <= 0) {
4327		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
4328		    ((lp = lp->lgrp_parent) == NULL))
4329			return (-1);
4330
4331		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
4332		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
4333	}
4334
4335	/*
4336	 * Select a memnode by picking one at a "random" offset.
4337	 * Because of DR, memnodes can come and go at any time.
4338	 * This code must be able to cope with the possibility
4339	 * that the nodes count "cnt" is inconsistent with respect
4340	 * to the number of elements actually in "nodes", and
4341	 * therefore that the offset chosen could be greater than
4342	 * the number of elements in the set (some memnodes may
4343 * have disappeared just before cnt was read).
4344	 * If this happens, the search simply wraps back to the
4345	 * beginning of the set.
4346	 */
4347	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
4348	offset = c->lmc_rand % cnt;
4349	do {
4350		for (mnode = 0; mnode < max_mem_nodes; mnode++)
4351			if (nodes & ((mnodeset_t)1 << mnode))
4352				if (!offset--)
4353					break;
4354	} while (mnode >= max_mem_nodes);
4355
4356	/* Found a node. Store state before returning. */
4357	c->lmc_lgrp = lp;
4358	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
4359	c->lmc_cnt = cnt - 1;
4360	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
4361	c->lmc_ntried++;
4362
4363	return (mnode);
4364}
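
/*
 * Minimal usage sketch for lgrp_memnode_choose() (illustration only;
 * assumes the LGRP_MNODE_COOKIE_INIT() helper from <sys/lgrp.h> is used
 * to zero the cookie and seed it with the lgroup and search scope):
 *
 *	lgrp_mnode_cookie_t	c;
 *	int			mnode;
 *
 *	LGRP_MNODE_COOKIE_INIT(c, lgrp, LGRP_SRCH_LOCAL);
 *	while ((mnode = lgrp_memnode_choose(&c)) != -1) {
 *		(try to allocate from mnode; stop when satisfied)
 *	}
 *
 * With LGRP_SRCH_LOCAL only the given lgroup's memnodes are offered;
 * a hierarchical search scope lets the loop above fall back to ancestor
 * lgroups once the local memnodes are exhausted.
 */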
4365