/*-
 * Copyright (c) 2008,  Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Copyright (c) 2008 Nokia Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_cpuset.c 219399 2011-03-08 14:18:21Z jhb $");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/cpuset.h>
#include <sys/sx.h>
#include <sys/queue.h>
#include <sys/limits.h>
#include <sys/bus.h>
#include <sys/interrupt.h>

#include <vm/uma.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif /* DDB */

/*
 * cpusets provide a mechanism for creating and manipulating sets of
 * processors for the purpose of constraining the scheduling of threads to
 * specific processors.
 *
 * Each process belongs to an identified set; by default this is set 1.  Each
 * thread may further restrict the cpus it may run on to a subset of this
 * named set.  This creates an anonymous set which other threads and processes
 * may not join by number.
 *
 * The named set is referred to herein as the 'base' set to avoid ambiguity.
 * This set is usually a child of a 'root' set while the anonymous set may
 * simply be referred to as a mask.  In the syscall api these are referred to
 * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
 *
 * Threads inherit their set from their creator whether it be anonymous or
 * not.  This means that anonymous sets are immutable because they may be
 * shared.  To modify an anonymous set a new set is created with the desired
 * mask and the same parent as the existing anonymous set.  This gives the
 * illusion of each thread having a private mask.
 *
 * Via the syscall apis a user may ask to retrieve or modify the root, base,
 * or mask that is discovered via a pid, tid, or setid.  Modifying a set
 * modifies all numbered and anonymous child sets to comply with the new mask.
 * Modifying a pid or tid's mask applies only to that tid but must still
 * exist within the assigned parent set.
 *
 * A thread may not be assigned to a group separate from other threads in
 * the process.  This is to remove ambiguity when the setid is queried with
 * a pid argument.  There is no other technical limitation.
 *
 * This somewhat complex arrangement is intended to make it easy for
 * applications to query available processors and bind their threads to
 * specific processors while also allowing administrators to dynamically
 * reprovision by changing sets which apply to groups of processes.
 *
 * A simple application should not concern itself with sets at all and
 * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
 * meaning 'curthread'.  It may query available cpus for that tid with a
 * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
 */
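/*
 * Illustrative userland sketch of the simple usage described above (a
 * hedged example, not part of this file): query the cpus available to
 * the current thread's base set, then bind the thread via the MASK
 * level.  Assumes <sys/cpuset.h> and that cpu 0 is present in the set.
 *
 *	cpuset_t mask;
 *
 *	cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
 *	    sizeof(mask), &mask);
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);
 *	cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *	    sizeof(mask), &mask);
 */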
static uma_zone_t cpuset_zone;
static struct mtx cpuset_lock;
static struct setlist cpuset_ids;
static struct unrhdr *cpuset_unr;
static struct cpuset *cpuset_zero;

/* Return the size of cpuset_t at the kernel level */
SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD,
	0, sizeof(cpuset_t), "sizeof(cpuset_t)");

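/*
 * A userland consumer might read this sysctl as follows (hedged,
 * illustrative sketch; assumes <sys/types.h> and <sys/sysctl.h>):
 *
 *	int setsize;
 *	size_t len = sizeof(setsize);
 *
 *	sysctlbyname("kern.sched.cpusetsize", &setsize, &len, NULL, 0);
 */
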
cpuset_t *cpuset_root;

/*
 * Acquire a reference to a cpuset; all pointers must be tracked with refs.
 */
struct cpuset *
cpuset_ref(struct cpuset *set)
{

	refcount_acquire(&set->cs_ref);
	return (set);
}

/*
 * Walks up the tree from 'set' to find the root.  Returns the root
 * referenced.
 */
static struct cpuset *
cpuset_refroot(struct cpuset *set)
{

	for (; set->cs_parent != NULL; set = set->cs_parent)
		if (set->cs_flags & CPU_SET_ROOT)
			break;
	cpuset_ref(set);

	return (set);
}

/*
 * Find the first non-anonymous set starting from 'set'.  Returns this set
 * referenced.  May return the passed in set with an extra ref if it is
 * not anonymous.
 */
static struct cpuset *
cpuset_refbase(struct cpuset *set)
{

	if (set->cs_id == CPUSET_INVALID)
		set = set->cs_parent;
	cpuset_ref(set);

	return (set);
}

/*
 * Release a reference in a context where it is safe to allocate.
 */
void
cpuset_rel(struct cpuset *set)
{
	cpusetid_t id;

	if (refcount_release(&set->cs_ref) == 0)
		return;
	mtx_lock_spin(&cpuset_lock);
	LIST_REMOVE(set, cs_siblings);
	id = set->cs_id;
	if (id != CPUSET_INVALID)
		LIST_REMOVE(set, cs_link);
	mtx_unlock_spin(&cpuset_lock);
	cpuset_rel(set->cs_parent);
	uma_zfree(cpuset_zone, set);
	if (id != CPUSET_INVALID)
		free_unr(cpuset_unr, id);
}

/*
 * Deferred release must be used when in a context that is not safe to
 * allocate/free.  This places any unreferenced sets on the list 'head'.
 */
static void
cpuset_rel_defer(struct setlist *head, struct cpuset *set)
{

	if (refcount_release(&set->cs_ref) == 0)
		return;
	mtx_lock_spin(&cpuset_lock);
	LIST_REMOVE(set, cs_siblings);
	if (set->cs_id != CPUSET_INVALID)
		LIST_REMOVE(set, cs_link);
	LIST_INSERT_HEAD(head, set, cs_link);
	mtx_unlock_spin(&cpuset_lock);
}

/*
 * Complete a deferred release.  Removes the set from the list provided to
 * cpuset_rel_defer.
 */
static void
cpuset_rel_complete(struct cpuset *set)
{
	LIST_REMOVE(set, cs_link);
	cpuset_rel(set->cs_parent);
	uma_zfree(cpuset_zone, set);
}
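
/*
 * Sketch of the deferred release pattern as used by cpuset_setproc()
 * below ('td' and a preallocated 'nset' are assumed):
 *
 *	struct setlist droplist;
 *	struct cpuset *old;
 *
 *	LIST_INIT(&droplist);
 *	thread_lock(td);
 *	cpuset_rel_defer(&droplist, td->td_cpuset);
 *	td->td_cpuset = nset;
 *	sched_affinity(td);
 *	thread_unlock(td);
 *	while ((old = LIST_FIRST(&droplist)) != NULL)
 *		cpuset_rel_complete(old);
 */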

/*
 * Find a set based on an id.  Returns it with a ref.
 */
static struct cpuset *
cpuset_lookup(cpusetid_t setid, struct thread *td)
{
	struct cpuset *set;

	if (setid == CPUSET_INVALID)
		return (NULL);
	mtx_lock_spin(&cpuset_lock);
	LIST_FOREACH(set, &cpuset_ids, cs_link)
		if (set->cs_id == setid)
			break;
	if (set)
		cpuset_ref(set);
	mtx_unlock_spin(&cpuset_lock);

	KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
	if (set != NULL && jailed(td->td_ucred)) {
		struct cpuset *jset, *tset;

		jset = td->td_ucred->cr_prison->pr_cpuset;
		for (tset = set; tset != NULL; tset = tset->cs_parent)
			if (tset == jset)
				break;
		if (tset == NULL) {
			cpuset_rel(set);
			set = NULL;
		}
	}

	return (set);
}

/*
 * Create a set in the space provided in 'set' with the provided parameters.
 * The set is returned with a single ref.  May return EDEADLK if the set
 * will have no valid cpu based on restrictions from the parent.
 */
static int
_cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask,
    cpusetid_t id)
{

	if (!CPU_OVERLAP(&parent->cs_mask, mask))
		return (EDEADLK);
	CPU_COPY(mask, &set->cs_mask);
	LIST_INIT(&set->cs_children);
	refcount_init(&set->cs_ref, 1);
	set->cs_flags = 0;
	mtx_lock_spin(&cpuset_lock);
	CPU_AND(&set->cs_mask, &parent->cs_mask);
	set->cs_id = id;
	set->cs_parent = cpuset_ref(parent);
	LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
	if (set->cs_id != CPUSET_INVALID)
		LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
	mtx_unlock_spin(&cpuset_lock);

	return (0);
}

/*
 * Create a new non-anonymous set with the requested parent and mask.  May
 * return failures if the mask is invalid or a new number can not be
 * allocated.
 */
static int
cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask)
{
	struct cpuset *set;
	cpusetid_t id;
	int error;

	id = alloc_unr(cpuset_unr);
	if (id == -1)
		return (ENFILE);
	*setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
	error = _cpuset_create(set, parent, mask, id);
	if (error == 0)
		return (0);
	free_unr(cpuset_unr, id);
	uma_zfree(cpuset_zone, set);

	return (error);
}

/*
 * Recursively check for errors that would occur from applying mask to
 * the tree of sets starting at 'set'.  Checks for sets that would become
 * empty as well as RDONLY flags.
 */
static int
cpuset_testupdate(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *nset;
	cpuset_t newmask;
	int error;

	mtx_assert(&cpuset_lock, MA_OWNED);
	if (set->cs_flags & CPU_SET_RDONLY)
		return (EPERM);
	if (!CPU_OVERLAP(&set->cs_mask, mask))
		return (EDEADLK);
	CPU_COPY(&set->cs_mask, &newmask);
	CPU_AND(&newmask, mask);
	error = 0;
	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
		if ((error = cpuset_testupdate(nset, &newmask)) != 0)
			break;
	return (error);
}

/*
 * Applies the mask 'mask' without checking for empty sets or permissions.
 */
static void
cpuset_update(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *nset;

	mtx_assert(&cpuset_lock, MA_OWNED);
	CPU_AND(&set->cs_mask, mask);
	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
		cpuset_update(nset, &set->cs_mask);

	return;
}

/*
 * Modify the set 'set' to use a copy of the mask provided.  Apply this new
 * mask to restrict all children in the tree.  Checks for validity before
 * applying the changes.
 */
static int
cpuset_modify(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *root;
	int error;

	error = priv_check(curthread, PRIV_SCHED_CPUSET);
	if (error)
		return (error);
	/*
	 * In case we are called from within the jail
	 * we do not allow modifying the dedicated root
	 * cpuset of the jail but may still allow to
	 * change child sets.
	 */
	if (jailed(curthread->td_ucred) &&
	    set->cs_flags & CPU_SET_ROOT)
		return (EPERM);
	/*
	 * Verify that we have access to this set of
	 * cpus.
	 */
	root = set->cs_parent;
	if (root && !CPU_SUBSET(&root->cs_mask, mask))
		return (EINVAL);
	mtx_lock_spin(&cpuset_lock);
	error = cpuset_testupdate(set, mask);
	if (error)
		goto out;
	cpuset_update(set, mask);
	CPU_COPY(mask, &set->cs_mask);
out:
	mtx_unlock_spin(&cpuset_lock);

	return (error);
}

/*
 * Resolve the 'which' parameter of several cpuset apis.
 *
 * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
 * checks for permission via p_cansched().
 *
 * For WHICH_SET returns a valid set with a new reference.
 *
 * -1 may be supplied for any argument to mean the current proc/thread or
 * the base set of the current thread.  May fail with ESRCH/EPERM.
 */
static int
cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
    struct cpuset **setp)
{
	struct cpuset *set;
	struct thread *td;
	struct proc *p;
	int error;

	*pp = p = NULL;
	*tdp = td = NULL;
	*setp = set = NULL;
	switch (which) {
	case CPU_WHICH_PID:
		if (id == -1) {
			PROC_LOCK(curproc);
			p = curproc;
			break;
		}
		if ((p = pfind(id)) == NULL)
			return (ESRCH);
		break;
	case CPU_WHICH_TID:
		if (id == -1) {
			PROC_LOCK(curproc);
			p = curproc;
			td = curthread;
			break;
		}
		td = tdfind(id, -1);
		if (td == NULL)
			return (ESRCH);
		p = td->td_proc;
		break;
	case CPU_WHICH_CPUSET:
		if (id == -1) {
			thread_lock(curthread);
			set = cpuset_refbase(curthread->td_cpuset);
			thread_unlock(curthread);
		} else
			set = cpuset_lookup(id, curthread);
		if (set) {
			*setp = set;
			return (0);
		}
		return (ESRCH);
	case CPU_WHICH_JAIL:
	{
		/* Find `set' for prison with given id. */
		struct prison *pr;

		sx_slock(&allprison_lock);
		pr = prison_find_child(curthread->td_ucred->cr_prison, id);
		sx_sunlock(&allprison_lock);
		if (pr == NULL)
			return (ESRCH);
		cpuset_ref(pr->pr_cpuset);
		*setp = pr->pr_cpuset;
		mtx_unlock(&pr->pr_mtx);
		return (0);
	}
	case CPU_WHICH_IRQ:
		return (0);
	default:
		return (EINVAL);
	}
	error = p_cansched(curthread, p);
	if (error) {
		PROC_UNLOCK(p);
		return (error);
	}
	if (td == NULL)
		td = FIRST_THREAD_IN_PROC(p);
	*pp = p;
	*tdp = td;
	return (0);
}
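
/*
 * Hedged usage sketch, mirroring the syscalls below: resolve the
 * current process and drop the proc lock that is returned held.
 *
 *	error = cpuset_which(CPU_WHICH_PID, -1, &p, &td, &set);
 *	if (error)
 *		return (error);
 *	...
 *	PROC_UNLOCK(p);
 */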

/*
 * Create an anonymous set with the provided mask in the space provided by
 * 'fset'.  If the passed in set is anonymous we use its parent, otherwise
 * the new set is a child of 'set'.
 */
static int
cpuset_shadow(struct cpuset *set, struct cpuset *fset, const cpuset_t *mask)
{
	struct cpuset *parent;

	if (set->cs_id == CPUSET_INVALID)
		parent = set->cs_parent;
	else
		parent = set;
	if (!CPU_SUBSET(&parent->cs_mask, mask))
		return (EDEADLK);
	return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
}

/*
 * Handle two cases for replacing the base set or mask of an entire process.
 *
 * 1) Set is non-null and mask is null.  This reparents all anonymous sets
 *    to the provided set and replaces all non-anonymous td_cpusets with the
 *    provided set.
 * 2) Mask is non-null and set is null.  This replaces or creates anonymous
 *    sets for every thread with the existing base as a parent.
 *
 * This is overly complicated because we can't allocate while holding a
 * spinlock and spinlocks must be held while changing and examining thread
 * state.
 */
static int
cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
{
	struct setlist freelist;
	struct setlist droplist;
	struct cpuset *tdset;
	struct cpuset *nset;
	struct thread *td;
	struct proc *p;
	int threads;
	int nfree;
	int error;
	/*
	 * The algorithm requires two passes due to locking considerations.
	 *
	 * 1) Lookup the process and acquire the locks in the required order.
	 * 2) If enough cpusets have not been allocated release the locks and
	 *    allocate them.  Loop.
	 */
	LIST_INIT(&freelist);
	LIST_INIT(&droplist);
	nfree = 0;
	for (;;) {
		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
		if (error)
			goto out;
		if (nfree >= p->p_numthreads)
			break;
		threads = p->p_numthreads;
		PROC_UNLOCK(p);
		for (; nfree < threads; nfree++) {
			nset = uma_zalloc(cpuset_zone, M_WAITOK);
			LIST_INSERT_HEAD(&freelist, nset, cs_link);
		}
	}
	PROC_LOCK_ASSERT(p, MA_OWNED);
	/*
	 * Now that the appropriate locks are held and we have enough cpusets,
	 * make sure the operation will succeed before applying changes.  The
	 * proc lock prevents td_cpuset from changing between calls.
	 */
	error = 0;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		tdset = td->td_cpuset;
		/*
		 * Verify that a new mask doesn't specify cpus outside of
		 * the set the thread is a member of.
		 */
		if (mask) {
			if (tdset->cs_id == CPUSET_INVALID)
				tdset = tdset->cs_parent;
			if (!CPU_SUBSET(&tdset->cs_mask, mask))
				error = EDEADLK;
		/*
		 * Verify that a new set won't leave an existing thread
		 * mask without a cpu to run on.  It can, however, restrict
		 * the set.
		 */
		} else if (tdset->cs_id == CPUSET_INVALID) {
			if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
				error = EDEADLK;
		}
		thread_unlock(td);
		if (error)
			goto unlock_out;
	}
	/*
	 * Replace each thread's cpuset while using deferred release.  We
	 * must do this because the thread lock must be held while operating
	 * on the thread and this limits the type of operations allowed.
	 */
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		/*
		 * If we presently have an anonymous set or are applying a
		 * mask we must create an anonymous shadow set.  That is
		 * either parented to our existing base or the supplied set.
		 *
		 * If we have a base set with no anonymous shadow we simply
		 * replace it outright.
		 */
		tdset = td->td_cpuset;
		if (tdset->cs_id == CPUSET_INVALID || mask) {
			nset = LIST_FIRST(&freelist);
			LIST_REMOVE(nset, cs_link);
			if (mask)
				error = cpuset_shadow(tdset, nset, mask);
			else
				error = _cpuset_create(nset, set,
				    &tdset->cs_mask, CPUSET_INVALID);
			if (error) {
				LIST_INSERT_HEAD(&freelist, nset, cs_link);
				thread_unlock(td);
				break;
			}
		} else
			nset = cpuset_ref(set);
		cpuset_rel_defer(&droplist, tdset);
		td->td_cpuset = nset;
		sched_affinity(td);
		thread_unlock(td);
	}
unlock_out:
	PROC_UNLOCK(p);
out:
	while ((nset = LIST_FIRST(&droplist)) != NULL)
		cpuset_rel_complete(nset);
	while ((nset = LIST_FIRST(&freelist)) != NULL) {
		LIST_REMOVE(nset, cs_link);
		uma_zfree(cpuset_zone, nset);
	}
	return (error);
}

/*
 * Apply an anonymous mask to a single thread.
 */
int
cpuset_setthread(lwpid_t id, cpuset_t *mask)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *td;
	struct proc *p;
	int error;

	nset = uma_zalloc(cpuset_zone, M_WAITOK);
	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
	if (error)
		goto out;
	set = NULL;
	thread_lock(td);
	error = cpuset_shadow(td->td_cpuset, nset, mask);
	if (error == 0) {
		set = td->td_cpuset;
		td->td_cpuset = nset;
		sched_affinity(td);
		nset = NULL;
	}
	thread_unlock(td);
	PROC_UNLOCK(p);
	if (set)
		cpuset_rel(set);
out:
	if (nset)
		uma_zfree(cpuset_zone, nset);
	return (error);
}
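
/*
 * Hedged example of how other kernel code might pin a single thread
 * with the helper above (the interrupt ithread code does something
 * similar; 'cpu' and 'td' are assumed):
 *
 *	cpuset_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(cpu, &mask);
 *	error = cpuset_setthread(td->td_tid, &mask);
 */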

/*
 * Creates the cpuset for thread0.  We make two sets:
 *
 * 0 - The root set which should represent all valid processors in the
 *     system.  It is initially created with a mask of all processors
 *     because we don't know what processors are valid until cpuset_init()
 *     runs.  This set is immutable.
 * 1 - The default set which all processes are a member of until changed.
 *     This allows an administrator to move all threads off of given cpus to
 *     dedicate them to high priority tasks or save power etc.
 */
struct cpuset *
cpuset_thread0(void)
{
	struct cpuset *set;
	int error;

	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
	/*
	 * Create the root system set for the whole machine.  Doesn't use
	 * cpuset_create() due to NULL parent.
	 */
	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
	set->cs_mask.__bits[0] = -1;
	LIST_INIT(&set->cs_children);
	LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
	set->cs_ref = 1;
	set->cs_flags = CPU_SET_ROOT;
	cpuset_zero = set;
	cpuset_root = &set->cs_mask;
	/*
	 * Now derive a default, modifiable set from that to give out.
	 */
	set = uma_zalloc(cpuset_zone, M_WAITOK);
	error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
	KASSERT(error == 0, ("Error creating default set: %d\n", error));
	/*
	 * Initialize the unit allocator. 0 and 1 are allocated above.
	 */
	cpuset_unr = new_unrhdr(2, INT_MAX, NULL);

	return (set);
}

/*
 * Like cpuset_create(), but additionally marks the new 'set' as root.
 *
 * We are not going to reparent the td to it.  Use cpuset_setproc_update_set()
 * for that.
 *
 * On success, returns the set in *setp with a reference held.
 */
int
cpuset_create_root(struct prison *pr, struct cpuset **setp)
{
	struct cpuset *set;
	int error;

	KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__));
	KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));

	error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask);
	if (error)
		return (error);

	KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data",
	    __func__, __LINE__));

	/* Mark the set as root. */
	set = *setp;
	set->cs_flags |= CPU_SET_ROOT;

	return (0);
}

int
cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
{
	int error;

	KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
	KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));

	cpuset_ref(set);
	error = cpuset_setproc(p->p_pid, set, NULL);
	if (error)
		return (error);
	cpuset_rel(set);
	return (0);
}
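
/*
 * Hedged sketch of how the jail code is expected to combine the two
 * helpers above (cf. kern_jail.c; error handling elided):
 *
 *	error = cpuset_create_root(pr, &set);
 *	if (error == 0)
 *		error = cpuset_setproc_update_set(p, set);
 */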

/*
 * This is called once the final set of system cpus is known.  Modifies
 * the root set and all children and marks the root read-only.
 */
static void
cpuset_init(void *arg)
{
	cpuset_t mask;

	CPU_ZERO(&mask);
#ifdef SMP
	mask.__bits[0] = all_cpus;
#else
	mask.__bits[0] = 1;
#endif
	if (cpuset_modify(cpuset_zero, &mask))
		panic("Can't set initial cpuset mask.\n");
	cpuset_zero->cs_flags |= CPU_SET_RDONLY;
}
SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);

#ifndef _SYS_SYSPROTO_H_
struct cpuset_args {
	cpusetid_t	*setid;
};
#endif
int
cpuset(struct thread *td, struct cpuset_args *uap)
{
	struct cpuset *root;
	struct cpuset *set;
	int error;

	thread_lock(td);
	root = cpuset_refroot(td->td_cpuset);
	thread_unlock(td);
	error = cpuset_create(&set, root, &root->cs_mask);
	cpuset_rel(root);
	if (error)
		return (error);
	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
	if (error == 0)
		error = cpuset_setproc(-1, set, NULL);
	cpuset_rel(set);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_setid_args {
	cpuwhich_t	which;
	id_t		id;
	cpusetid_t	setid;
};
#endif
int
cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
{
	struct cpuset *set;
	int error;

	/*
	 * Presently we only support per-process sets.
	 */
	if (uap->which != CPU_WHICH_PID)
		return (EINVAL);
	set = cpuset_lookup(uap->setid, td);
	if (set == NULL)
		return (ESRCH);
	error = cpuset_setproc(uap->id, set, NULL);
	cpuset_rel(set);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_getid_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	cpusetid_t	*setid;
};
#endif
int
cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *ttd;
	struct proc *p;
	cpusetid_t id;
	int error;

	if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET)
		return (EINVAL);
	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
	if (error)
		return (error);
	switch (uap->which) {
	case CPU_WHICH_TID:
	case CPU_WHICH_PID:
		thread_lock(ttd);
		set = cpuset_refbase(ttd->td_cpuset);
		thread_unlock(ttd);
		PROC_UNLOCK(p);
		break;
	case CPU_WHICH_CPUSET:
	case CPU_WHICH_JAIL:
		break;
	case CPU_WHICH_IRQ:
		return (EINVAL);
	}
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
		nset = cpuset_refroot(set);
		cpuset_rel(set);
		set = nset;
		break;
	case CPU_LEVEL_CPUSET:
		break;
	case CPU_LEVEL_WHICH:
		break;
	}
	id = set->cs_id;
	cpuset_rel(set);
	if (error == 0)
		error = copyout(&id, uap->setid, sizeof(id));

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_getaffinity_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	size_t		cpusetsize;
	cpuset_t	*mask;
};
#endif
int
cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
{
	struct thread *ttd;
	struct cpuset *nset;
	struct cpuset *set;
	struct proc *p;
	cpuset_t *mask;
	int error;
	size_t size;

	if (uap->cpusetsize < sizeof(cpuset_t) ||
	    uap->cpusetsize > CPU_MAXSIZE / NBBY)
		return (ERANGE);
	size = uap->cpusetsize;
	mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
	if (error)
		goto out;
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
	case CPU_LEVEL_CPUSET:
		switch (uap->which) {
		case CPU_WHICH_TID:
		case CPU_WHICH_PID:
			thread_lock(ttd);
			set = cpuset_ref(ttd->td_cpuset);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_CPUSET:
		case CPU_WHICH_JAIL:
			break;
		case CPU_WHICH_IRQ:
			error = EINVAL;
			goto out;
		}
		if (uap->level == CPU_LEVEL_ROOT)
			nset = cpuset_refroot(set);
		else
			nset = cpuset_refbase(set);
		CPU_COPY(&nset->cs_mask, mask);
		cpuset_rel(nset);
		break;
	case CPU_LEVEL_WHICH:
		switch (uap->which) {
		case CPU_WHICH_TID:
			thread_lock(ttd);
			CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_PID:
			FOREACH_THREAD_IN_PROC(p, ttd) {
				thread_lock(ttd);
				CPU_OR(mask, &ttd->td_cpuset->cs_mask);
				thread_unlock(ttd);
			}
			break;
		case CPU_WHICH_CPUSET:
		case CPU_WHICH_JAIL:
			CPU_COPY(&set->cs_mask, mask);
			break;
		case CPU_WHICH_IRQ:
			error = intr_getaffinity(uap->id, mask);
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
	if (set)
		cpuset_rel(set);
	if (p)
		PROC_UNLOCK(p);
	if (error == 0)
		error = copyout(mask, uap->mask, size);
out:
	free(mask, M_TEMP);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_setaffinity_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	size_t		cpusetsize;
	const cpuset_t	*mask;
};
#endif
int
cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *ttd;
	struct proc *p;
	cpuset_t *mask;
	int error;

	if (uap->cpusetsize < sizeof(cpuset_t) ||
	    uap->cpusetsize > CPU_MAXSIZE / NBBY)
		return (ERANGE);
	mask = malloc(uap->cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
	error = copyin(uap->mask, mask, uap->cpusetsize);
	if (error)
		goto out;
	/*
	 * Verify that no high bits are set.
	 */
	if (uap->cpusetsize > sizeof(cpuset_t)) {
		char *end;
		char *cp;

		end = cp = (char *)&mask->__bits;
		end += uap->cpusetsize;
		cp += sizeof(cpuset_t);
		while (cp != end)
			if (*cp++ != 0) {
				error = EINVAL;
				goto out;
			}

	}
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
	case CPU_LEVEL_CPUSET:
		error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
		if (error)
			break;
		switch (uap->which) {
		case CPU_WHICH_TID:
		case CPU_WHICH_PID:
			thread_lock(ttd);
			set = cpuset_ref(ttd->td_cpuset);
			thread_unlock(ttd);
			PROC_UNLOCK(p);
			break;
		case CPU_WHICH_CPUSET:
		case CPU_WHICH_JAIL:
			break;
		case CPU_WHICH_IRQ:
			error = EINVAL;
			goto out;
		}
		if (uap->level == CPU_LEVEL_ROOT)
			nset = cpuset_refroot(set);
		else
			nset = cpuset_refbase(set);
		error = cpuset_modify(nset, mask);
		cpuset_rel(nset);
		cpuset_rel(set);
		break;
	case CPU_LEVEL_WHICH:
		switch (uap->which) {
		case CPU_WHICH_TID:
			error = cpuset_setthread(uap->id, mask);
			break;
		case CPU_WHICH_PID:
			error = cpuset_setproc(uap->id, NULL, mask);
			break;
		case CPU_WHICH_CPUSET:
		case CPU_WHICH_JAIL:
			error = cpuset_which(uap->which, uap->id, &p,
			    &ttd, &set);
			if (error == 0) {
				error = cpuset_modify(set, mask);
				cpuset_rel(set);
			}
			break;
		case CPU_WHICH_IRQ:
			error = intr_setaffinity(uap->id, mask);
			break;
		default:
			error = EINVAL;
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
out:
	free(mask, M_TEMP);
	return (error);
}

#ifdef DDB
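/*
 * Usage note: from the in-kernel debugger this command is invoked as
 * "db> show cpusets" and prints one entry per numbered set, with the
 * cpu mask rendered as a comma-separated list.
 */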
DB_SHOW_COMMAND(cpusets, db_show_cpusets)
{
	struct cpuset *set;
	int cpu, once;

	LIST_FOREACH(set, &cpuset_ids, cs_link) {
		db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
		    set, set->cs_id, set->cs_ref, set->cs_flags,
		    (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
		db_printf("  mask=");
		for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) {
			if (CPU_ISSET(cpu, &set->cs_mask)) {
				if (once == 0) {
					db_printf("%d", cpu);
					once = 1;
				} else
					db_printf(",%d", cpu);
			}
		}
		db_printf("\n");
		if (db_pager_quit)
			break;
	}
}
#endif /* DDB */