cpupart.c revision 11173:87f3734e64df
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/types.h>
27#include <sys/systm.h>
28#include <sys/cmn_err.h>
29#include <sys/cpuvar.h>
30#include <sys/thread.h>
31#include <sys/disp.h>
32#include <sys/kmem.h>
33#include <sys/debug.h>
34#include <sys/cpupart.h>
35#include <sys/pset.h>
36#include <sys/var.h>
37#include <sys/cyclic.h>
38#include <sys/lgrp.h>
39#include <sys/pghw.h>
40#include <sys/loadavg.h>
41#include <sys/class.h>
42#include <sys/fss.h>
43#include <sys/pool.h>
44#include <sys/pool_pset.h>
45#include <sys/policy.h>
46
47/*
48 * Calling pool_lock() protects the pools configuration, which includes
49 * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
50 * partitions from being created or destroyed while the lock is held.
51 * The lock ordering with respect to related locks is:
52 *
53 *    pool_lock() ---> cpu_lock  --->  pidlock  -->  p_lock
54 *
55 * Blocking memory allocations may be made while holding "pool_lock"
56 * or cpu_lock.
57 */
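/*
 * A minimal usage sketch of the ordering above (illustrative, assuming a
 * caller that needs both the pools configuration and the partition list):
 *
 *	pool_lock();
 *	mutex_enter(&cpu_lock);
 *	...walk cp_list_head or reconfigure partitions...
 *	mutex_exit(&cpu_lock);
 *	pool_unlock();
 *
 * Blocking (KM_SLEEP) allocations remain legal inside that region.
 */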
58
59/*
60 * The cp_default partition is allocated statically, but its lgroup load average
61 * (lpl) list is allocated dynamically after the kmem subsystem is initialized. This
62 * saves some memory since the space allocated reflects the actual number of
63 * lgroups supported by the platform. The lgrp facility provides a temporary
64 * space to hold lpl information during system bootstrap.
65 */
66
67cpupart_t		*cp_list_head;
68cpupart_t		cp_default;
69static cpupartid_t	cp_id_next;
70uint_t			cp_numparts;
71uint_t			cp_numparts_nonempty;
72
73/*
74 * Need to limit total number of partitions to avoid slowing down the
75 * clock code too much.  The clock code traverses the list of
76 * partitions and needs to be able to execute in a reasonable amount
77 * of time (less than 1/hz seconds).  The maximum is sized based on
78 * max_ncpus so it shouldn't be a problem unless there are large
79 * numbers of empty partitions.
80 */
81static uint_t		cp_max_numparts;
82
83/*
84 * Processor sets and CPU partitions are different but related concepts.
85 * A processor set is a user-level abstraction allowing users to create
86 * sets of CPUs and bind threads exclusively to those sets.  A CPU
87 * partition is a kernel dispatcher object consisting of a set of CPUs
88 * and a global dispatch queue.  The processor set abstraction is
89 * implemented via a CPU partition, and currently there is a 1-1
90 * mapping between processor sets and partitions (excluding the default
91 * partition, which is not visible as a processor set).  Hence, the
92 * numbering for processor sets and CPU partitions is identical.  This
93 * may not always be true in the future, and these macros could become
94 * less trivial if we support e.g. a processor set containing multiple
95 * CPU partitions.
96 */
97#define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
98#define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))
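/*
 * For illustration: with the 1-1 mapping described above, the macros only
 * translate the two special IDs, e.g. PSTOCP(PS_NONE) yields CP_DEFAULT and
 * CPTOPS(CP_DEFAULT) yields PS_NONE; any other ID passes through unchanged.
 */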
99
100
101static int cpupart_unbind_threads(cpupart_t *, boolean_t);
102
103/*
104 * Find a CPU partition given a processor set ID.
105 */
106static cpupart_t *
107cpupart_find_all(psetid_t psid)
108{
109	cpupart_t *cp;
110	cpupartid_t cpid = PSTOCP(psid);
111
112	ASSERT(MUTEX_HELD(&cpu_lock));
113
114	/* default partition not visible as a processor set */
115	if (psid == CP_DEFAULT)
116		return (NULL);
117
118	if (psid == PS_MYID)
119		return (curthread->t_cpupart);
120
121	cp = cp_list_head;
122	do {
123		if (cp->cp_id == cpid)
124			return (cp);
125		cp = cp->cp_next;
126	} while (cp != cp_list_head);
127	return (NULL);
128}
129
130/*
131 * Find a CPU partition given a processor set ID if the processor set
132 * should be visible from the calling zone.
133 */
134cpupart_t *
135cpupart_find(psetid_t psid)
136{
137	cpupart_t *cp;
138
139	ASSERT(MUTEX_HELD(&cpu_lock));
140	cp = cpupart_find_all(psid);
141	if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
142	    zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
143			return (NULL);
144	return (cp);
145}
146
147static int
148cpupart_kstat_update(kstat_t *ksp, int rw)
149{
150	cpupart_t *cp = (cpupart_t *)ksp->ks_private;
151	cpupart_kstat_t *cpksp = ksp->ks_data;
152
153	if (rw == KSTAT_WRITE)
154		return (EACCES);
155
156	cpksp->cpk_updates.value.ui64 = cp->cp_updates;
157	cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
158	cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
159	cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
160	cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
161	    (16 - FSHIFT);
162	cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
163	    (16 - FSHIFT);
164	cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
165	    (16 - FSHIFT);
166	return (0);
167}
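/*
 * Worked example (assuming FSHIFT is 8, its usual value): cp_hp_avenrun[]
 * carries 16 fractional bits while the kstat exports the conventional
 * FSHIFT-bit fixed point, hence the ">> (16 - FSHIFT)" above.  A 1-minute
 * load of 0.75 is stored as 0.75 * 2^16 = 49152 and exported as
 * 49152 >> 8 = 192, i.e. 0.75 * 2^8.
 */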
168
169static void
170cpupart_kstat_create(cpupart_t *cp)
171{
172	kstat_t *ksp;
173	zoneid_t zoneid;
174
175	ASSERT(MUTEX_HELD(&cpu_lock));
176
177	/*
178	 * We have a bit of a chicken-egg problem since this code will
179	 * get called to create the kstats for CP_DEFAULT before the
180	 * pools framework gets initialized.  We circumvent the problem
181	 * by special-casing cp_default.
182	 */
183	if (cp != &cp_default && pool_pset_enabled())
184		zoneid = GLOBAL_ZONEID;
185	else
186		zoneid = ALL_ZONES;
187	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
188	    KSTAT_TYPE_NAMED,
189	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
190	if (ksp != NULL) {
191		cpupart_kstat_t *cpksp = ksp->ks_data;
192
193		kstat_named_init(&cpksp->cpk_updates, "updates",
194		    KSTAT_DATA_UINT64);
195		kstat_named_init(&cpksp->cpk_runnable, "runnable",
196		    KSTAT_DATA_UINT64);
197		kstat_named_init(&cpksp->cpk_waiting, "waiting",
198		    KSTAT_DATA_UINT64);
199		kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
200		    KSTAT_DATA_UINT32);
201		kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
202		    KSTAT_DATA_UINT32);
203		kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
204		    KSTAT_DATA_UINT32);
205		kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
206		    KSTAT_DATA_UINT32);
207
208		ksp->ks_update = cpupart_kstat_update;
209		ksp->ks_private = cp;
210
211		kstat_install(ksp);
212	}
213	cp->cp_kstat = ksp;
214}
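/*
 * For reference: the kstats installed above appear under module "unix",
 * name "pset", one instance per partition id, so from userland they can be
 * listed with an invocation along the lines of "kstat -m unix -n pset"
 * (illustrative, not taken from this file).
 */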
215
216/*
217 * Initialize the cpupart's lgrp partitions (lpls)
218 */
219static void
220cpupart_lpl_initialize(cpupart_t *cp)
221{
222	int i, sz;
223
224	sz = cp->cp_nlgrploads = lgrp_plat_max_lgrps();
225	cp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * sz, KM_SLEEP);
226
227	for (i = 0; i < sz; i++) {
228		/*
229		 * The last entry of the lpl's resource set is always NULL
230		 * by design (to facilitate iteration)...hence the "oversizing"
231		 * by 1.
232		 */
233		cp->cp_lgrploads[i].lpl_rset_sz = sz + 1;
234		cp->cp_lgrploads[i].lpl_rset =
235		    kmem_zalloc(sizeof (struct lgrp_ld *) * (sz + 1), KM_SLEEP);
236		cp->cp_lgrploads[i].lpl_id2rset =
237		    kmem_zalloc(sizeof (int) * (sz + 1), KM_SLEEP);
238		cp->cp_lgrploads[i].lpl_lgrpid = i;
239	}
240}
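/*
 * A minimal sketch of how the NULL-terminated rset can be walked: since each
 * rset is oversized by one and zero-filled, the trailing NULL entry ends the
 * iteration without consulting lpl_rset_sz:
 *
 *	for (j = 0; lpl->lpl_rset[j] != NULL; j++)
 *		...use lpl->lpl_rset[j]...
 */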
241
242/*
243 * Tear down the cpupart's lgrp partitions
244 */
245static void
246cpupart_lpl_teardown(cpupart_t *cp)
247{
248	int i, sz;
249	lpl_t *lpl;
250
251	for (i = 0; i < cp->cp_nlgrploads; i++) {
252		lpl = &cp->cp_lgrploads[i];
253
254		sz = lpl->lpl_rset_sz;
255		kmem_free(lpl->lpl_rset, sizeof (struct lgrp_ld *) * sz);
256		kmem_free(lpl->lpl_id2rset, sizeof (int) * sz);
257		lpl->lpl_rset = NULL;
258		lpl->lpl_id2rset = NULL;
259	}
260	kmem_free(cp->cp_lgrploads, sizeof (lpl_t) * cp->cp_nlgrploads);
261	cp->cp_lgrploads = NULL;
262}
263
264/*
265 * Initialize the default partition and kpreempt disp queue.
266 */
267void
268cpupart_initialize_default(void)
269{
270	lgrp_id_t i;
271
272	cp_list_head = &cp_default;
273	cp_default.cp_next = &cp_default;
274	cp_default.cp_prev = &cp_default;
275	cp_default.cp_id = CP_DEFAULT;
276	cp_default.cp_kp_queue.disp_maxrunpri = -1;
277	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
278	cp_default.cp_kp_queue.disp_cpu = NULL;
279	cp_default.cp_gen = 0;
280	cp_default.cp_loadavg.lg_cur = 0;
281	cp_default.cp_loadavg.lg_len = 0;
282	cp_default.cp_loadavg.lg_total = 0;
283	for (i = 0; i < S_LOADAVG_SZ; i++) {
284		cp_default.cp_loadavg.lg_loads[i] = 0;
285	}
286	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
287	cp_id_next = CP_DEFAULT + 1;
288	cpupart_kstat_create(&cp_default);
289	cp_numparts = 1;
290	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
291		cp_max_numparts = max_ncpus * 2 + 1;
292	/*
293	 * Allocate space for cp_default list of lgrploads
294	 */
295	cpupart_lpl_initialize(&cp_default);
296
297	/*
298	 * The initial lpl topology is created in a special lpl list
299	 * lpl_bootstrap.  It is copied here to cp_default.
300	 * NOTE: lpl_topo_bootstrap() also updates CPU0's cpu_lpl pointer to point
301	 *	 to the correct lpl in the cp_default.cp_lgrploads list.
302	 */
303	lpl_topo_bootstrap(cp_default.cp_lgrploads,
304	    cp_default.cp_nlgrploads);
305
306
307	cp_default.cp_attr = PSET_NOESCAPE;
308	cp_numparts_nonempty = 1;
309	/*
310	 * Set t0's home
311	 */
312	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];
313
314	bitset_init(&cp_default.cp_cmt_pgs);
315	bitset_init(&cp_default.cp_haltset);
316	bitset_resize(&cp_default.cp_haltset, max_ncpus);
317}
318
319
320static int
321cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
322{
323	cpupart_t *oldpp;
324	cpu_t	*ncp, *newlist;
325	kthread_t *t;
326	int	move_threads = 1;
327	lgrp_id_t lgrpid;
328	proc_t 	*p;
329	int lgrp_diff_lpl;
330	lpl_t	*cpu_lpl;
331	int	ret;
332	boolean_t unbind_all_threads = (forced != 0);
333
334	ASSERT(MUTEX_HELD(&cpu_lock));
335	ASSERT(newpp != NULL);
336
337	oldpp = cp->cpu_part;
338	ASSERT(oldpp != NULL);
339	ASSERT(oldpp->cp_ncpus > 0);
340
341	if (newpp == oldpp) {
342		/*
343		 * Don't need to do anything.
344		 */
345		return (0);
346	}
347
348	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);
349
350	if (!disp_bound_partition(cp, 0)) {
351		/*
352		 * Don't need to move threads if there are no threads in
353		 * the partition.  Note that threads can't enter the
354		 * partition while we're holding cpu_lock.
355		 */
356		move_threads = 0;
357	} else if (oldpp->cp_ncpus == 1) {
358		/*
359		 * The last CPU is removed from a partition which has threads
360		 * running in it. Some of these threads may be bound to this
361		 * CPU.
362		 *
363		 * Attempt to unbind threads from the CPU and from the processor
364		 * set. Note that no threads should be bound to this CPU since
365		 * cpupart_unbind_threads() will refuse to move CPU-bound threads to
366		 * other CPUs.
367		 */
368		(void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
369		(void) cpupart_unbind_threads(oldpp, B_FALSE);
370
371		if (!disp_bound_partition(cp, 0)) {
372			/*
373			 * No bound threads in this partition any more
374			 */
375			move_threads = 0;
376		} else {
377			/*
378			 * There are still threads bound to the partition
379			 */
380			cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
381			return (EBUSY);
382		}
383	}
384
385	/*
386	 * If the forced flag is set, unbind any threads from this CPU.
387	 * Otherwise unbind soft-bound threads only.
388	 */
389	if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
390		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
391		return (ret);
392	}
393
394	/*
395	 * Stop further threads from weak binding to this cpu.
396	 */
397	cpu_inmotion = cp;
398	membar_enter();
399
400	/*
401	 * Notify the Processor Groups subsystem that the CPU
402	 * will be moving between cpu partitions. This is done before
403	 * CPUs are paused to provide an opportunity for any
404	 * needed memory allocations.
405	 */
406	pg_cpupart_out(cp, oldpp);
407	pg_cpupart_in(cp, newpp);
408
409again:
410	if (move_threads) {
411		int loop_count;
412		/*
413		 * Check for threads strong or weak bound to this CPU.
414		 */
415		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
416			if (loop_count >= 5) {
417				cpu_state_change_notify(cp->cpu_id,
418				    CPU_CPUPART_IN);
419				pg_cpupart_out(cp, newpp);
420				pg_cpupart_in(cp, oldpp);
421				cpu_inmotion = NULL;
422				return (EBUSY);	/* some threads still bound */
423			}
424			delay(1);
425		}
426	}
427
428	/*
429	 * Before we actually start changing data structures, notify
430	 * the cyclic subsystem that we want to move this CPU out of its
431	 * partition.
432	 */
433	if (!cyclic_move_out(cp)) {
434		/*
435		 * This CPU must be the last CPU in a processor set with
436		 * a bound cyclic.
437		 */
438		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
439		pg_cpupart_out(cp, newpp);
440		pg_cpupart_in(cp, oldpp);
441		cpu_inmotion = NULL;
442		return (EBUSY);
443	}
444
445	pause_cpus(cp);
446
447	if (move_threads) {
448		/*
449		 * The thread that was on this cpu before the pause thread may
450		 * have read cpu_inmotion before we raised the barrier above.
451		 * Check again.
452		 */
453		if (disp_bound_threads(cp, 1)) {
454			start_cpus();
455			goto again;
456		}
457
458	}
459
460	/*
461	 * Now that CPUs are paused, let the PG subsystem perform
462	 * any necessary data structure updates.
463	 */
464	pg_cpupart_move(cp, oldpp, newpp);
465
466	/* save this cpu's lgroup -- it'll be the same in the new partition */
467	lgrpid = cp->cpu_lpl->lpl_lgrpid;
468
469	cpu_lpl = cp->cpu_lpl;
470	/*
471	 * let the lgroup framework know cp has left the partition
472	 */
473	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);
474
475	/* move out of old partition */
476	oldpp->cp_ncpus--;
477	if (oldpp->cp_ncpus > 0) {
478
479		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
480		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
481		if (oldpp->cp_cpulist == cp) {
482			oldpp->cp_cpulist = ncp;
483		}
484	} else {
485		ncp = oldpp->cp_cpulist = NULL;
486		cp_numparts_nonempty--;
487		ASSERT(cp_numparts_nonempty != 0);
488	}
489	oldpp->cp_gen++;
490
491	/* move into new partition */
492	newlist = newpp->cp_cpulist;
493	if (newlist == NULL) {
494		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
495		cp_numparts_nonempty++;
496		ASSERT(cp_numparts_nonempty != 0);
497	} else {
498		cp->cpu_next_part = newlist;
499		cp->cpu_prev_part = newlist->cpu_prev_part;
500		newlist->cpu_prev_part->cpu_next_part = cp;
501		newlist->cpu_prev_part = cp;
502	}
503	cp->cpu_part = newpp;
504	newpp->cp_ncpus++;
505	newpp->cp_gen++;
506
507	ASSERT(bitset_is_null(&newpp->cp_haltset));
508	ASSERT(bitset_is_null(&oldpp->cp_haltset));
509
510	/*
511	 * let the lgroup framework know cp has entered the partition
512	 */
513	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);
514
515	/*
516	 * If necessary, move threads off processor.
517	 */
518	if (move_threads) {
519		ASSERT(ncp != NULL);
520
521		/*
522		 * Walk through the active process list to look for
523		 * threads that need to have a new home lgroup, or
524		 * that last ran on the CPU being moved out of the
525		 * partition.
526		 */
527
528		for (p = practive; p != NULL; p = p->p_next) {
529
530			t = p->p_tlist;
531
532			if (t == NULL)
533				continue;
534
535			lgrp_diff_lpl = 0;
536
537			do {
538
539				ASSERT(t->t_lpl != NULL);
540
541				/*
542				 * Update the count of how many threads are
543				 * in this CPU's lgroup but have a different lpl
544				 */
545
546				if (t->t_lpl != cpu_lpl &&
547				    t->t_lpl->lpl_lgrpid == lgrpid)
548					lgrp_diff_lpl++;
549				/*
550				 * If the lgroup that t is assigned to no
551				 * longer has any CPUs in t's partition,
552				 * we'll have to choose a new lgroup for t.
553				 */
554
555				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
556				    t->t_cpupart)) {
557					lgrp_move_thread(t,
558					    lgrp_choose(t, t->t_cpupart), 0);
559				}
560
561				/*
562				 * make sure lpl points to our own partition
563				 */
564				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
565				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
566				    t->t_cpupart->cp_nlgrploads));
567
568				ASSERT(t->t_lpl->lpl_ncpu > 0);
569
570				/* Update CPU last ran on if it was this CPU */
571				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
572				    t->t_bound_cpu != cp) {
573					t->t_cpu = disp_lowpri_cpu(ncp,
574					    t->t_lpl, t->t_pri, NULL);
575				}
576				t = t->t_forw;
577			} while (t != p->p_tlist);
578
579			/*
580			 * Didn't find any threads in the same lgroup as this
581			 * CPU with a different lpl, so remove the lgroup from
582			 * the process lgroup bitmask.
583			 */
584
585			if (lgrp_diff_lpl == 0)
586				klgrpset_del(p->p_lgrpset, lgrpid);
587		}
588
589		/*
590		 * Walk thread list looking for threads that need to be
591		 * rehomed, since there are some threads that are not in
592		 * their process's p_tlist.
593		 */
594
595		t = curthread;
596
597		do {
598			ASSERT(t != NULL && t->t_lpl != NULL);
599
600			/*
601			 * If the lgroup that t is assigned to no
602			 * longer has any CPUs in t's partition,
603			 * we'll have to choose a new lgroup for t.
604			 * Also, choose best lgroup for home when
605			 * thread has specified lgroup affinities,
606			 * since there may be an lgroup with more
607			 * affinity available after moving CPUs
608			 * around.
609			 */
610			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
611			    t->t_cpupart) || t->t_lgrp_affinity) {
612				lgrp_move_thread(t,
613				    lgrp_choose(t, t->t_cpupart), 1);
614			}
615
616			/* make sure lpl points to our own partition */
617			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
618			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
619			    t->t_cpupart->cp_nlgrploads));
620
621			ASSERT(t->t_lpl->lpl_ncpu > 0);
622
623			/* Update CPU last ran on if it was this CPU */
624			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
625			    t->t_bound_cpu != cp) {
626				t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
627				    t->t_pri, NULL);
628			}
629
630			t = t->t_next;
631		} while (t != curthread);
632
633		/*
634		 * Clear off the CPU's run queue, and the kp queue if the
635		 * partition is now empty.
636		 */
637		disp_cpu_inactive(cp);
638
639		/*
640		 * Make cp switch to a thread from the new partition.
641		 */
642		cp->cpu_runrun = 1;
643		cp->cpu_kprunrun = 1;
644	}
645
646	cpu_inmotion = NULL;
647	start_cpus();
648
649	/*
650	 * Let anyone interested know that cpu has been added to the set.
651	 */
652	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
653
654	/*
655	 * Now let the cyclic subsystem know that it can reshuffle cyclics
656	 * bound to the new processor set.
657	 */
658	cyclic_move_in(cp);
659
660	return (0);
661}
662
663/*
664 * Check if thread can be moved to a new cpu partition.  Called by
665 * cpupart_move_thread() and pset_bind_start().
666 */
667int
668cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
669{
670	ASSERT(MUTEX_HELD(&cpu_lock));
671	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
672	ASSERT(cp != NULL);
673	ASSERT(THREAD_LOCK_HELD(tp));
674
675	/*
676	 * CPU-bound threads can't be moved.
677	 */
678	if (!ignore) {
679		cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
680		    tp->t_weakbound_cpu;
681		if (boundcpu != NULL && boundcpu->cpu_part != cp)
682			return (EBUSY);
683	}
684
685	if (tp->t_cid == sysdccid) {
686		return (EINVAL);	/* For now, sysdc threads can't move */
687	}
688
689	return (0);
690}
691
692/*
693 * Move thread to new partition.  If ignore is non-zero, then CPU
694 * bindings should be ignored (this is used when destroying a
695 * partition).
696 */
697static int
698cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
699    void *projbuf, void *zonebuf)
700{
701	cpupart_t *oldpp = tp->t_cpupart;
702	int ret;
703
704	ASSERT(MUTEX_HELD(&cpu_lock));
705	ASSERT(MUTEX_HELD(&pidlock));
706	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
707	ASSERT(newpp != NULL);
708
709	if (newpp->cp_cpulist == NULL)
710		return (EINVAL);
711
712	/*
713	 * Check for errors first.
714	 */
715	thread_lock(tp);
716	if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
717		thread_unlock(tp);
718		return (ret);
719	}
720
721	/* move the thread */
722	if (oldpp != newpp) {
723		/*
724		 * Make the thread switch to the new partition.
725		 */
726		tp->t_cpupart = newpp;
727		ASSERT(tp->t_lpl != NULL);
728		/*
729		 * Leave the thread on the same lgroup if possible; otherwise
730		 * choose a new lgroup for it.  In either case, update its
731		 * t_lpl.
732		 */
733		if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
734		    tp->t_lgrp_affinity == NULL) {
735			/*
736			 * The thread's lgroup has CPUs in the thread's new
737			 * partition, so the thread can stay assigned to the
738			 * same lgroup.  Update its t_lpl to point to the
739			 * lpl_t for its lgroup in its new partition.
740			 */
741			lgrp_move_thread(tp, &tp->t_cpupart->\
742			    cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
743		} else {
744			/*
745			 * The thread's lgroup has no cpus in its new
746			 * partition or it has specified lgroup affinities,
747			 * so choose the best lgroup for the thread and
748			 * assign it to that lgroup.
749			 */
750			lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
751			    1);
752		}
753		/*
754		 * make sure lpl points to our own partition
755		 */
756		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
757		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
758		    tp->t_cpupart->cp_nlgrploads));
759
760		ASSERT(tp->t_lpl->lpl_ncpu > 0);
761
762		if (tp->t_state == TS_ONPROC) {
763			cpu_surrender(tp);
764		} else if (tp->t_state == TS_RUN) {
765			(void) dispdeq(tp);
766			setbackdq(tp);
767		}
768	}
769
770	/*
771	 * Our binding has changed; set TP_CHANGEBIND.
772	 */
773	tp->t_proc_flag |= TP_CHANGEBIND;
774	aston(tp);
775
776	thread_unlock(tp);
777	fss_changepset(tp, newpp, projbuf, zonebuf);
778
779	return (0);		/* success */
780}
781
782
783/*
784 * This function binds a thread to a partition.  Must be called with the
785 * p_lock of the containing process held (to keep the thread from going
786 * away), and thus also with cpu_lock held (since cpu_lock must be
787 * acquired before p_lock).  If ignore is non-zero, then CPU bindings
788 * should be ignored (this is used when destroying a partition).
789 */
790int
791cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
792    void *zonebuf)
793{
794	cpupart_t	*newpp;
795
796	ASSERT(pool_lock_held());
797	ASSERT(MUTEX_HELD(&cpu_lock));
798	ASSERT(MUTEX_HELD(&pidlock));
799	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
800
801	if (psid == PS_NONE)
802		newpp = &cp_default;
803	else {
804		newpp = cpupart_find(psid);
805		if (newpp == NULL) {
806			return (EINVAL);
807		}
808	}
809	return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
810}
811
812
813/*
814 * Create a new partition.  On MP systems, this also allocates a
815 * kpreempt disp queue for that partition.
816 */
817int
818cpupart_create(psetid_t *psid)
819{
820	cpupart_t	*pp;
821
822	ASSERT(pool_lock_held());
823
824	pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
825	pp->cp_nlgrploads = lgrp_plat_max_lgrps();
826	pp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * pp->cp_nlgrploads,
827	    KM_SLEEP);
828
829	mutex_enter(&cpu_lock);
830	if (cp_numparts == cp_max_numparts) {
831		mutex_exit(&cpu_lock);
832		kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
833		pp->cp_lgrploads = NULL;
834		kmem_free(pp, sizeof (cpupart_t));
835		return (ENOMEM);
836	}
837	cp_numparts++;
838	/* find the next free partition ID */
839	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
840		cp_id_next++;
841	pp->cp_id = cp_id_next++;
842	pp->cp_ncpus = 0;
843	pp->cp_cpulist = NULL;
844	pp->cp_attr = 0;
845	klgrpset_clear(pp->cp_lgrpset);
846	pp->cp_kp_queue.disp_maxrunpri = -1;
847	pp->cp_kp_queue.disp_max_unbound_pri = -1;
848	pp->cp_kp_queue.disp_cpu = NULL;
849	pp->cp_gen = 0;
850	DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
851	*psid = CPTOPS(pp->cp_id);
852	disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
853	cpupart_kstat_create(pp);
854	cpupart_lpl_initialize(pp);
855
856	bitset_init(&pp->cp_cmt_pgs);
857
858	/*
859	 * Initialize and size the partition's bitset of halted CPUs
860	 */
861	bitset_init(&pp->cp_haltset);
862	bitset_resize(&pp->cp_haltset, max_ncpus);
863
864	/*
865	 * Pause all CPUs while changing the partition list, to make sure
866	 * the clock thread (which traverses the list without holding
867	 * cpu_lock) isn't running.
868	 */
869	pause_cpus(NULL);
870	pp->cp_next = cp_list_head;
871	pp->cp_prev = cp_list_head->cp_prev;
872	cp_list_head->cp_prev->cp_next = pp;
873	cp_list_head->cp_prev = pp;
874	start_cpus();
875	mutex_exit(&cpu_lock);
876
877	return (0);
878}
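/*
 * A minimal sketch, assuming a pset-style caller that combines
 * cpupart_create() with cpupart_attach_cpu() below for some online
 * cpu_t *cp; error handling omitted:
 *
 *	pool_lock();
 *	error = cpupart_create(&psid);
 *	mutex_enter(&cpu_lock);
 *	error = cpupart_attach_cpu(psid, cp, 0);
 *	mutex_exit(&cpu_lock);
 *	pool_unlock();
 */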
879
880/*
881 * Move threads from the specified partition to cp_default.  If `unbind_all' is
882 * set, move all threads; otherwise move only soft-bound threads.
883 */
884static int
885cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
886{
887	void 	*projbuf, *zonebuf;
888	kthread_t *t;
889	proc_t	*p;
890	int	err = 0;
891	psetid_t psid = pp->cp_id;
892
893	ASSERT(pool_lock_held());
894	ASSERT(MUTEX_HELD(&cpu_lock));
895
896	if (pp == NULL || pp == &cp_default) {
897		return (EINVAL);
898	}
899
900	/*
901	 * Pre-allocate enough buffers for FSS for all active projects and
902	 * for all active zones on the system.  Unused buffers will be
903	 * freed later by fss_freebuf().
904	 */
905	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
906	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);
907
908	mutex_enter(&pidlock);
909	t = curthread;
910	do {
911		if (t->t_bind_pset == psid) {
912again:			p = ttoproc(t);
913			mutex_enter(&p->p_lock);
914			if (ttoproc(t) != p) {
915				/*
916				 * lwp_exit has changed this thread's process
917				 * pointer before we grabbed its p_lock.
918				 */
919				mutex_exit(&p->p_lock);
920				goto again;
921			}
922
923			/*
924			 * Only threads with a revocable (soft) binding can be
925			 * unbound, unless a forced unbind was requested.
926			 */
927			if (unbind_all || TB_PSET_IS_SOFT(t)) {
928				err = cpupart_bind_thread(t, PS_NONE, 1,
929				    projbuf, zonebuf);
930				if (err) {
931					mutex_exit(&p->p_lock);
932					mutex_exit(&pidlock);
933					fss_freebuf(projbuf, FSS_ALLOC_PROJ);
934					fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
935					return (err);
936				}
937				t->t_bind_pset = PS_NONE;
938			}
939			mutex_exit(&p->p_lock);
940		}
941		t = t->t_next;
942	} while (t != curthread);
943
944	mutex_exit(&pidlock);
945	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
946	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
947	return (err);
948}
949
950/*
951 * Destroy a partition.
952 */
953int
954cpupart_destroy(psetid_t psid)
955{
956	cpu_t	*cp, *first_cp;
957	cpupart_t *pp, *newpp;
958	int	err = 0;
959
960	ASSERT(pool_lock_held());
961	mutex_enter(&cpu_lock);
962
963	pp = cpupart_find(psid);
964	if (pp == NULL || pp == &cp_default) {
965		mutex_exit(&cpu_lock);
966		return (EINVAL);
967	}
968
969	/*
970	 * Unbind all the threads currently bound to the partition.
971	 */
972	err = cpupart_unbind_threads(pp, B_TRUE);
973	if (err) {
974		mutex_exit(&cpu_lock);
975		return (err);
976	}
977
978	newpp = &cp_default;
979	while ((cp = pp->cp_cpulist) != NULL) {
980		if (err = cpupart_move_cpu(cp, newpp, 0)) {
981			mutex_exit(&cpu_lock);
982			return (err);
983		}
984	}
985
986	ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
987	ASSERT(bitset_is_null(&pp->cp_haltset));
988
989	/*
990	 * Tear down the partition's group of active CMT PGs and halted
991	 * CPUs now that they have all left.
992	 */
993	bitset_fini(&pp->cp_cmt_pgs);
994	bitset_fini(&pp->cp_haltset);
995
996	/*
997	 * Reset the pointers in any offline processors so they won't
998	 * try to rejoin the destroyed partition when they're turned
999	 * online.
1000	 */
1001	first_cp = cp = CPU;
1002	do {
1003		if (cp->cpu_part == pp) {
1004			ASSERT(cp->cpu_flags & CPU_OFFLINE);
1005			cp->cpu_part = newpp;
1006		}
1007		cp = cp->cpu_next;
1008	} while (cp != first_cp);
1009
1010	/*
1011	 * Pause all CPUs while changing the partition list, to make sure
1012	 * the clock thread (which traverses the list without holding
1013	 * cpu_lock) isn't running.
1014	 */
1015	pause_cpus(NULL);
1016	pp->cp_prev->cp_next = pp->cp_next;
1017	pp->cp_next->cp_prev = pp->cp_prev;
1018	if (cp_list_head == pp)
1019		cp_list_head = pp->cp_next;
1020	start_cpus();
1021
1022	if (cp_id_next > pp->cp_id)
1023		cp_id_next = pp->cp_id;
1024
1025	if (pp->cp_kstat)
1026		kstat_delete(pp->cp_kstat);
1027
1028	cp_numparts--;
1029
1030	disp_kp_free(&pp->cp_kp_queue);
1031
1032	cpupart_lpl_teardown(pp);
1033
1034	kmem_free(pp, sizeof (cpupart_t));
1035	mutex_exit(&cpu_lock);
1036
1037	return (err);
1038}
1039
1040
1041/*
1042 * Return the ID of the partition to which the specified processor belongs.
1043 */
1044psetid_t
1045cpupart_query_cpu(cpu_t *cp)
1046{
1047	ASSERT(MUTEX_HELD(&cpu_lock));
1048
1049	return (CPTOPS(cp->cpu_part->cp_id));
1050}
1051
1052
1053/*
1054 * Attach a processor to an existing partition.
1055 */
1056int
1057cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
1058{
1059	cpupart_t	*pp;
1060	int		err;
1061
1062	ASSERT(pool_lock_held());
1063	ASSERT(MUTEX_HELD(&cpu_lock));
1064
1065	pp = cpupart_find(psid);
1066	if (pp == NULL)
1067		return (EINVAL);
1068	if (cp->cpu_flags & CPU_OFFLINE)
1069		return (EINVAL);
1070
1071	err = cpupart_move_cpu(cp, pp, forced);
1072	return (err);
1073}
1074
1075/*
1076 * Get a list of cpus belonging to the partition.  If numcpus is NULL,
1077 * this just checks for a valid partition.  If numcpus is non-NULL but
1078 * cpulist is NULL, the current number of cpus is stored in *numcpus.
1079 * If both are non-NULL, the current number of cpus is stored in *numcpus,
1080 * and a list of those cpus up to the size originally in *numcpus is
1081 * stored in cpulist[].  Also, store the processor set id in *psid.
1082 * This is useful in case the processor set id passed in was PS_MYID.
1083 */
1084int
1085cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
1086{
1087	cpupart_t	*pp;
1088	uint_t		ncpus;
1089	cpu_t		*c;
1090	int		i;
1091
1092	mutex_enter(&cpu_lock);
1093	pp = cpupart_find(*psid);
1094	if (pp == NULL) {
1095		mutex_exit(&cpu_lock);
1096		return (EINVAL);
1097	}
1098	*psid = CPTOPS(pp->cp_id);
1099	ncpus = pp->cp_ncpus;
1100	if (numcpus) {
1101		if (ncpus > *numcpus) {
1102			/*
1103			 * Only copy as many cpus as were passed in, but
1104			 * pass back the real number.
1105			 */
1106			uint_t t = ncpus;
1107			ncpus = *numcpus;
1108			*numcpus = t;
1109		} else
1110			*numcpus = ncpus;
1111
1112		if (cpulist) {
1113			c = pp->cp_cpulist;
1114			for (i = 0; i < ncpus; i++) {
1115				ASSERT(c != NULL);
1116				cpulist[i] = c->cpu_id;
1117				c = c->cpu_next_part;
1118			}
1119		}
1120	}
1121	mutex_exit(&cpu_lock);
1122	return (0);
1123}
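/*
 * A minimal sketch of the two-step pattern implied by the comment above:
 * query the count first, then fetch the list (the second call re-reports
 * the count, since it may have changed in between):
 *
 *	uint_t n = 0;
 *	psetid_t psid = PS_MYID;
 *	processorid_t *cpulist;
 *
 *	(void) cpupart_get_cpus(&psid, NULL, &n);
 *	cpulist = kmem_alloc(n * sizeof (processorid_t), KM_SLEEP);
 *	(void) cpupart_get_cpus(&psid, cpulist, &n);
 */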
1124
1125/*
1126 * Reallocate kpreempt queues for each CPU partition.  Called from
1127 * disp_setup when a new scheduling class is loaded that increases the
1128 * number of priorities in the system.
1129 */
1130void
1131cpupart_kpqalloc(pri_t npri)
1132{
1133	cpupart_t *cpp;
1134
1135	ASSERT(MUTEX_HELD(&cpu_lock));
1136	cpp = cp_list_head;
1137	do {
1138		disp_kp_alloc(&cpp->cp_kp_queue, npri);
1139		cpp = cpp->cp_next;
1140	} while (cpp != cp_list_head);
1141}
1142
1143int
1144cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
1145{
1146	cpupart_t *cp;
1147	int i;
1148
1149	ASSERT(nelem >= 0);
1150	ASSERT(nelem <= LOADAVG_NSTATS);
1151	ASSERT(MUTEX_HELD(&cpu_lock));
1152
1153	cp = cpupart_find(psid);
1154	if (cp == NULL)
1155		return (EINVAL);
1156	for (i = 0; i < nelem; i++)
1157		buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);
1158
1159	return (0);
1160}
1161
1162
1163uint_t
1164cpupart_list(psetid_t *list, uint_t nelem, int flag)
1165{
1166	uint_t numpart = 0;
1167	cpupart_t *cp;
1168
1169	ASSERT(MUTEX_HELD(&cpu_lock));
1170	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);
1171
1172	if (list != NULL) {
1173		cp = cp_list_head;
1174		do {
1175			if (((flag == CP_ALL) && (cp != &cp_default)) ||
1176			    ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
1177				if (numpart == nelem)
1178					break;
1179				list[numpart++] = CPTOPS(cp->cp_id);
1180			}
1181			cp = cp->cp_next;
1182		} while (cp != cp_list_head);
1183	}
1184
1185	ASSERT(numpart < cp_numparts);
1186
1187	if (flag == CP_ALL)
1188		numpart = cp_numparts - 1; /* leave out default partition */
1189	else if (flag == CP_NONEMPTY)
1190		numpart = cp_numparts_nonempty;
1191
1192	return (numpart);
1193}
1194
1195int
1196cpupart_setattr(psetid_t psid, uint_t attr)
1197{
1198	cpupart_t *cp;
1199
1200	ASSERT(pool_lock_held());
1201
1202	mutex_enter(&cpu_lock);
1203	if ((cp = cpupart_find(psid)) == NULL) {
1204		mutex_exit(&cpu_lock);
1205		return (EINVAL);
1206	}
1207	/*
1208	 * The PSET_NOESCAPE attribute for the default cpu partition is always set
1209	 */
1210	if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
1211		mutex_exit(&cpu_lock);
1212		return (EINVAL);
1213	}
1214	cp->cp_attr = attr;
1215	mutex_exit(&cpu_lock);
1216	return (0);
1217}
1218
1219int
1220cpupart_getattr(psetid_t psid, uint_t *attrp)
1221{
1222	cpupart_t *cp;
1223
1224	mutex_enter(&cpu_lock);
1225	if ((cp = cpupart_find(psid)) == NULL) {
1226		mutex_exit(&cpu_lock);
1227		return (EINVAL);
1228	}
1229	*attrp = cp->cp_attr;
1230	mutex_exit(&cpu_lock);
1231	return (0);
1232}
1233