/*-
 * Copyright (c) 2008, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Copyright (c) 2008 Nokia Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_cpuset.c 180358 2008-07-07 21:32:02Z bz $");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/cpuset.h>
#include <sys/sx.h>
#include <sys/queue.h>
#include <sys/limits.h>
#include <sys/bus.h>
#include <sys/interrupt.h>

#include <vm/uma.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif /* DDB */

/*
 * cpusets provide a mechanism for creating and manipulating sets of
 * processors for the purpose of constraining the scheduling of threads to
 * specific processors.
 *
 * Each process belongs to an identified set; by default this is set 1.  Each
 * thread may further restrict the cpus it may run on to a subset of this
 * named set.  This creates an anonymous set which other threads and processes
 * may not join by number.
 *
 * The named set is referred to herein as the 'base' set to avoid ambiguity.
 * This set is usually a child of a 'root' set while the anonymous set may
 * simply be referred to as a mask.  In the syscall api these are referred to
 * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
 *
 * Threads inherit their set from their creator whether it be anonymous or
 * not.  This means that anonymous sets are immutable because they may be
 * shared.  To modify an anonymous set a new set is created with the desired
 * mask and the same parent as the existing anonymous set.  This gives the
 * illusion of each thread having a private mask.
 *
 * Via the syscall apis a user may ask to retrieve or modify the root, base,
 * or mask that is discovered via a pid, tid, or setid.  Modifying a set
 * modifies all numbered and anonymous child sets to comply with the new mask.
 * Modifying a pid or tid's mask applies only to that tid but must still
 * exist within the assigned parent set.
 *
 * A thread may not be assigned to a group separate from other threads in
 * the process.  This is to remove ambiguity when the setid is queried with
 * a pid argument.  There is no other technical limitation.
 *
 * This somewhat complex arrangement is intended to make it easy for
 * applications to query available processors and bind their threads to
 * specific processors while also allowing administrators to dynamically
 * reprovision by changing sets which apply to groups of processes.
 *
 * A simple application should not concern itself with sets at all and
 * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
 * meaning 'curthread'.  It may query available cpus for that tid with a
 * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
 */
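
/*
 * As an illustrative sketch (userland code, not part of this file), such an
 * application could query the cpus available to its current thread and then
 * bind that thread to a single cpu using the cpuset_getaffinity(2) and
 * cpuset_setaffinity(2) system calls and the CPU_* macros from
 * <sys/cpuset.h>; the choice of cpu 0 below is an arbitrary assumption made
 * only for the example:
 *
 *	#include <sys/param.h>
 *	#include <sys/cpuset.h>
 *	#include <err.h>
 *
 *	cpuset_t mask;
 *
 *	CPU_ZERO(&mask);
 *	if (cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_TID, -1,
 *	    sizeof(mask), &mask) != 0)
 *		err(1, "cpuset_getaffinity");
 *	if (CPU_ISSET(0, &mask)) {
 *		CPU_ZERO(&mask);
 *		CPU_SET(0, &mask);
 *		if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *		    sizeof(mask), &mask) != 0)
 *			err(1, "cpuset_setaffinity");
 *	}
 */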
static uma_zone_t cpuset_zone;
static struct mtx cpuset_lock;
static struct setlist cpuset_ids;
static struct unrhdr *cpuset_unr;
static struct cpuset *cpuset_zero;

cpuset_t *cpuset_root;

/*
 * Acquire a reference to a cpuset, all pointers must be tracked with refs.
 */
struct cpuset *
cpuset_ref(struct cpuset *set)
{

	refcount_acquire(&set->cs_ref);
	return (set);
}

/*
 * Walks up the tree from 'set' to find the root.  Returns the root
 * referenced.
 */
static struct cpuset *
cpuset_refroot(struct cpuset *set)
{

	for (; set->cs_parent != NULL; set = set->cs_parent)
		if (set->cs_flags & CPU_SET_ROOT)
			break;
	cpuset_ref(set);

	return (set);
}

/*
 * Find the first non-anonymous set starting from 'set'.  Returns this set
 * referenced.  May return the passed in set with an extra ref if it is
 * not anonymous.
 */
static struct cpuset *
cpuset_refbase(struct cpuset *set)
{

	if (set->cs_id == CPUSET_INVALID)
		set = set->cs_parent;
	cpuset_ref(set);

	return (set);
}

/*
 * Release a reference in a context where it is safe to allocate.
 */
void
cpuset_rel(struct cpuset *set)
{
	cpusetid_t id;

	if (refcount_release(&set->cs_ref) == 0)
		return;
	mtx_lock_spin(&cpuset_lock);
	LIST_REMOVE(set, cs_siblings);
	id = set->cs_id;
	if (id != CPUSET_INVALID)
		LIST_REMOVE(set, cs_link);
	mtx_unlock_spin(&cpuset_lock);
	cpuset_rel(set->cs_parent);
	uma_zfree(cpuset_zone, set);
	if (id != CPUSET_INVALID)
		free_unr(cpuset_unr, id);
}

/*
 * Deferred release must be used when in a context that is not safe to
 * allocate/free.  This places any unreferenced sets on the list 'head'.
 */
static void
cpuset_rel_defer(struct setlist *head, struct cpuset *set)
{

	if (refcount_release(&set->cs_ref) == 0)
		return;
	mtx_lock_spin(&cpuset_lock);
	LIST_REMOVE(set, cs_siblings);
	if (set->cs_id != CPUSET_INVALID)
		LIST_REMOVE(set, cs_link);
	LIST_INSERT_HEAD(head, set, cs_link);
	mtx_unlock_spin(&cpuset_lock);
}

/*
 * Complete a deferred release.  Removes the set from the list provided to
 * cpuset_rel_defer.
 */
static void
cpuset_rel_complete(struct cpuset *set)
{
	LIST_REMOVE(set, cs_link);
	cpuset_rel(set->cs_parent);
	uma_zfree(cpuset_zone, set);
}

/*
 * Find a set based on an id.  Returns it with a ref.
 */
static struct cpuset *
cpuset_lookup(cpusetid_t setid)
{
	struct cpuset *set;

	if (setid == CPUSET_INVALID)
		return (NULL);
	mtx_lock_spin(&cpuset_lock);
	LIST_FOREACH(set, &cpuset_ids, cs_link)
		if (set->cs_id == setid)
			break;
	if (set)
		cpuset_ref(set);
	mtx_unlock_spin(&cpuset_lock);
	return (set);
}

/*
 * Create a set in the space provided in 'set' with the provided parameters.
 * The set is returned with a single ref.  May return EDEADLK if the set
 * will have no valid cpu based on restrictions from the parent.
 */
static int
_cpuset_create(struct cpuset *set, struct cpuset *parent, cpuset_t *mask,
    cpusetid_t id)
{

	if (!CPU_OVERLAP(&parent->cs_mask, mask))
		return (EDEADLK);
	CPU_COPY(mask, &set->cs_mask);
	LIST_INIT(&set->cs_children);
	refcount_init(&set->cs_ref, 1);
	set->cs_flags = 0;
	mtx_lock_spin(&cpuset_lock);
	CPU_AND(mask, &parent->cs_mask);
	set->cs_id = id;
	set->cs_parent = cpuset_ref(parent);
	LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
	if (set->cs_id != CPUSET_INVALID)
		LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
	mtx_unlock_spin(&cpuset_lock);

	return (0);
}

/*
 * Create a new non-anonymous set with the requested parent and mask.  May
 * return failures if the mask is invalid or a new number can not be
 * allocated.
 */
static int
cpuset_create(struct cpuset **setp, struct cpuset *parent, cpuset_t *mask)
{
	struct cpuset *set;
	cpusetid_t id;
	int error;

	id = alloc_unr(cpuset_unr);
	if (id == -1)
		return (ENFILE);
	*setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
	error = _cpuset_create(set, parent, mask, id);
	if (error == 0)
		return (0);
	free_unr(cpuset_unr, id);
	uma_zfree(cpuset_zone, set);

	return (error);
}

/*
 * Recursively check for errors that would occur from applying mask to
 * the tree of sets starting at 'set'.  Checks for sets that would become
 * empty as well as RDONLY flags.
 */
static int
cpuset_testupdate(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *nset;
	cpuset_t newmask;
	int error;

	mtx_assert(&cpuset_lock, MA_OWNED);
	if (set->cs_flags & CPU_SET_RDONLY)
		return (EPERM);
	if (!CPU_OVERLAP(&set->cs_mask, mask))
		return (EDEADLK);
	CPU_COPY(&set->cs_mask, &newmask);
	CPU_AND(&newmask, mask);
	error = 0;
	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
		if ((error = cpuset_testupdate(nset, &newmask)) != 0)
			break;
	return (error);
}

/*
 * Applies the mask 'mask' without checking for empty sets or permissions.
 */
static void
cpuset_update(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *nset;

	mtx_assert(&cpuset_lock, MA_OWNED);
	CPU_AND(&set->cs_mask, mask);
	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
		cpuset_update(nset, &set->cs_mask);

	return;
}

/*
 * Modify the set 'set' to use a copy of the mask provided.  Apply this new
 * mask to restrict all children in the tree.  Checks for validity before
 * applying the changes.
 */
static int
cpuset_modify(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *root;
	int error;

	error = priv_check(curthread, PRIV_SCHED_CPUSET);
	if (error)
		return (error);
	/*
	 * Verify that we have access to this set of
	 * cpus.
	 */
	root = set->cs_parent;
	if (root && !CPU_SUBSET(&root->cs_mask, mask))
		return (EINVAL);
	mtx_lock_spin(&cpuset_lock);
	error = cpuset_testupdate(set, mask);
	if (error)
		goto out;
	cpuset_update(set, mask);
	CPU_COPY(mask, &set->cs_mask);
out:
	mtx_unlock_spin(&cpuset_lock);

	return (error);
}

/*
 * Resolve the 'which' parameter of several cpuset apis.
 *
 * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
 * checks for permission via p_cansched().
 *
 * For WHICH_SET returns a valid set with a new reference.
 *
 * -1 may be supplied for any argument to mean the current proc/thread or
 * the base set of the current thread.  May fail with ESRCH/EPERM.
 */
static int
cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
    struct cpuset **setp)
{
	struct cpuset *set;
	struct thread *td;
	struct proc *p;
	int error;

	*pp = p = NULL;
	*tdp = td = NULL;
	*setp = set = NULL;
	switch (which) {
	case CPU_WHICH_PID:
		if (id == -1) {
			PROC_LOCK(curproc);
			p = curproc;
			break;
		}
		if ((p = pfind(id)) == NULL)
			return (ESRCH);
		break;
	case CPU_WHICH_TID:
		if (id == -1) {
			PROC_LOCK(curproc);
			p = curproc;
			td = curthread;
			break;
		}
		sx_slock(&allproc_lock);
		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			FOREACH_THREAD_IN_PROC(p, td)
				if (td->td_tid == id)
					break;
			if (td != NULL)
				break;
			PROC_UNLOCK(p);
		}
		sx_sunlock(&allproc_lock);
		if (td == NULL)
			return (ESRCH);
		break;
	case CPU_WHICH_CPUSET:
		if (id == -1) {
			thread_lock(curthread);
			set = cpuset_refbase(curthread->td_cpuset);
			thread_unlock(curthread);
		} else
			set = cpuset_lookup(id);
		if (set) {
			*setp = set;
			return (0);
		}
		return (ESRCH);
	case CPU_WHICH_IRQ:
		return (0);
	default:
		return (EINVAL);
	}
	error = p_cansched(curthread, p);
	if (error) {
		PROC_UNLOCK(p);
		return (error);
	}
	if (td == NULL)
		td = FIRST_THREAD_IN_PROC(p);
	*pp = p;
	*tdp = td;
	return (0);
}

/*
 * Create an anonymous set with the provided mask in the space provided by
 * 'fset'.  If the passed in set is anonymous we use its parent; otherwise
 * the new set is a child of 'set'.
 */
static int
cpuset_shadow(struct cpuset *set, struct cpuset *fset, cpuset_t *mask)
{
	struct cpuset *parent;

	if (set->cs_id == CPUSET_INVALID)
		parent = set->cs_parent;
	else
		parent = set;
	if (!CPU_SUBSET(&parent->cs_mask, mask))
		return (EDEADLK);
	return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
}

/*
 * Handle two cases for replacing the base set or mask of an entire process.
 *
 * 1) Set is non-null and mask is null.  This reparents all anonymous sets
 *    to the provided set and replaces all non-anonymous td_cpusets with the
 *    provided set.
 * 2) Mask is non-null and set is null.  This replaces or creates anonymous
 *    sets for every thread with the existing base as a parent.
 *
 * This is overly complicated because we can't allocate while holding a
 * spinlock and spinlocks must be held while changing and examining thread
 * state.
 */
static int
cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
{
	struct setlist freelist;
	struct setlist droplist;
	struct cpuset *tdset;
	struct cpuset *nset;
	struct thread *td;
	struct proc *p;
	int threads;
	int nfree;
	int error;
	/*
	 * The algorithm requires two passes due to locking considerations.
	 *
	 * 1) Lookup the process and acquire the locks in the required order.
	 * 2) If enough cpusets have not been allocated release the locks and
	 *    allocate them.  Loop.
	 */
	LIST_INIT(&freelist);
	LIST_INIT(&droplist);
	nfree = 0;
	for (;;) {
		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
		if (error)
			goto out;
		if (nfree >= p->p_numthreads)
			break;
		threads = p->p_numthreads;
		PROC_UNLOCK(p);
		for (; nfree < threads; nfree++) {
			nset = uma_zalloc(cpuset_zone, M_WAITOK);
			LIST_INSERT_HEAD(&freelist, nset, cs_link);
		}
	}
	PROC_LOCK_ASSERT(p, MA_OWNED);
	/*
	 * Now that the appropriate locks are held and we have enough cpusets,
	 * make sure the operation will succeed before applying changes.  The
	 * proc lock prevents td_cpuset from changing between calls.
	 */
	error = 0;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		tdset = td->td_cpuset;
		/*
		 * Verify that a new mask doesn't specify cpus outside of
		 * the set the thread is a member of.
		 */
		if (mask) {
			if (tdset->cs_id == CPUSET_INVALID)
				tdset = tdset->cs_parent;
			if (!CPU_SUBSET(&tdset->cs_mask, mask))
				error = EDEADLK;
		/*
		 * Verify that a new set won't leave an existing thread
		 * mask without a cpu to run on.  It can, however, restrict
		 * the set.
		 */
		} else if (tdset->cs_id == CPUSET_INVALID) {
			if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
				error = EDEADLK;
		}
		thread_unlock(td);
		if (error)
			goto unlock_out;
	}
	/*
	 * Replace each thread's cpuset while using deferred release.  We
	 * must do this because the thread lock must be held while operating
	 * on the thread and this limits the type of operations allowed.
	 */
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		/*
		 * If we presently have an anonymous set or are applying a
		 * mask we must create an anonymous shadow set.  That is
		 * either parented to our existing base or the supplied set.
		 *
		 * If we have a base set with no anonymous shadow we simply
		 * replace it outright.
		 */
		tdset = td->td_cpuset;
		if (tdset->cs_id == CPUSET_INVALID || mask) {
			nset = LIST_FIRST(&freelist);
			LIST_REMOVE(nset, cs_link);
			if (mask)
				error = cpuset_shadow(tdset, nset, mask);
			else
				error = _cpuset_create(nset, set,
				    &tdset->cs_mask, CPUSET_INVALID);
			if (error) {
				LIST_INSERT_HEAD(&freelist, nset, cs_link);
				thread_unlock(td);
				break;
			}
		} else
			nset = cpuset_ref(set);
		cpuset_rel_defer(&droplist, tdset);
		td->td_cpuset = nset;
		sched_affinity(td);
		thread_unlock(td);
	}
unlock_out:
	PROC_UNLOCK(p);
out:
	while ((nset = LIST_FIRST(&droplist)) != NULL)
		cpuset_rel_complete(nset);
	while ((nset = LIST_FIRST(&freelist)) != NULL) {
		LIST_REMOVE(nset, cs_link);
		uma_zfree(cpuset_zone, nset);
	}
	return (error);
}

/*
 * Apply an anonymous mask to a single thread.
 */
int
cpuset_setthread(lwpid_t id, cpuset_t *mask)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *td;
	struct proc *p;
	int error;

	nset = uma_zalloc(cpuset_zone, M_WAITOK);
	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
	if (error)
		goto out;
	set = NULL;
	thread_lock(td);
	error = cpuset_shadow(td->td_cpuset, nset, mask);
	if (error == 0) {
		set = td->td_cpuset;
		td->td_cpuset = nset;
		sched_affinity(td);
		nset = NULL;
	}
	thread_unlock(td);
	PROC_UNLOCK(p);
	if (set)
		cpuset_rel(set);
out:
	if (nset)
		uma_zfree(cpuset_zone, nset);
	return (error);
}

/*
 * Creates the cpuset for thread0.  We make two sets:
 *
 * 0 - The root set which should represent all valid processors in the
 *     system.  It is initially created with a mask of all processors
 *     because we don't know what processors are valid until cpuset_init()
 *     runs.  This set is immutable.
 * 1 - The default set which all processes are a member of until changed.
 *     This allows an administrator to move all threads off of given cpus to
 *     dedicate them to high priority tasks or save power etc.
 */
struct cpuset *
cpuset_thread0(void)
{
	struct cpuset *set;
	int error;

	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
	/*
	 * Create the root system set for the whole machine.  Doesn't use
	 * cpuset_create() due to NULL parent.
	 */
	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
	set->cs_mask.__bits[0] = -1;
	LIST_INIT(&set->cs_children);
	LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
	set->cs_ref = 1;
	set->cs_flags = CPU_SET_ROOT;
	cpuset_zero = set;
	cpuset_root = &set->cs_mask;
	/*
	 * Now derive a default, modifiable set from that to give out.
	 */
	set = uma_zalloc(cpuset_zone, M_WAITOK);
	error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
	KASSERT(error == 0, ("Error creating default set: %d\n", error));
	/*
	 * Initialize the unit allocator. 0 and 1 are allocated above.
	 */
	cpuset_unr = new_unrhdr(2, INT_MAX, NULL);

	return (set);
}

/*
 * This is called once the final set of system cpus is known.  Modifies
 * the root set and all children and marks the root read-only.
 */
static void
cpuset_init(void *arg)
{
	cpuset_t mask;

	CPU_ZERO(&mask);
#ifdef SMP
	mask.__bits[0] = all_cpus;
#else
	mask.__bits[0] = 1;
#endif
	if (cpuset_modify(cpuset_zero, &mask))
		panic("Can't set initial cpuset mask.\n");
	cpuset_zero->cs_flags |= CPU_SET_RDONLY;
}
SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);

#ifndef _SYS_SYSPROTO_H_
struct cpuset_args {
	cpusetid_t	*setid;
};
#endif
int
cpuset(struct thread *td, struct cpuset_args *uap)
{
	struct cpuset *root;
	struct cpuset *set;
	int error;

	thread_lock(td);
	root = cpuset_refroot(td->td_cpuset);
	thread_unlock(td);
	error = cpuset_create(&set, root, &root->cs_mask);
	cpuset_rel(root);
	if (error)
		return (error);
	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
	if (error == 0)
		error = cpuset_setproc(-1, set, NULL);
	cpuset_rel(set);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_setid_args {
	cpuwhich_t	which;
	id_t		id;
	cpusetid_t	setid;
};
#endif
int
cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
{
	struct cpuset *set;
	int error;

	/*
	 * Presently we only support per-process sets.
	 */
	if (uap->which != CPU_WHICH_PID)
		return (EINVAL);
	set = cpuset_lookup(uap->setid);
	if (set == NULL)
		return (ESRCH);
	error = cpuset_setproc(uap->id, set, NULL);
	cpuset_rel(set);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_getid_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	cpusetid_t	*setid;
};
#endif
int
cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *ttd;
	struct proc *p;
	cpusetid_t id;
	int error;

	if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET)
		return (EINVAL);
	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
	if (error)
		return (error);
	switch (uap->which) {
	case CPU_WHICH_TID:
	case CPU_WHICH_PID:
		thread_lock(ttd);
		set = cpuset_refbase(ttd->td_cpuset);
		thread_unlock(ttd);
		PROC_UNLOCK(p);
		break;
	case CPU_WHICH_CPUSET:
		break;
	case CPU_WHICH_IRQ:
		return (EINVAL);
	}
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
		nset = cpuset_refroot(set);
		cpuset_rel(set);
		set = nset;
		break;
	case CPU_LEVEL_CPUSET:
		break;
	case CPU_LEVEL_WHICH:
		break;
	}
	id = set->cs_id;
	cpuset_rel(set);
	if (error == 0)
		error = copyout(&id, uap->setid, sizeof(id));

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_getaffinity_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	size_t		cpusetsize;
	cpuset_t	*mask;
};
#endif
int
cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
{
	struct thread *ttd;
	struct cpuset *nset;
	struct cpuset *set;
	struct proc *p;
	cpuset_t *mask;
	int error;
	size_t size;

	if (uap->cpusetsize < sizeof(cpuset_t) ||
	    uap->cpusetsize > CPU_MAXSIZE / NBBY)
		return (ERANGE);
	size = uap->cpusetsize;
	mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
	if (error)
		goto out;
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
	case CPU_LEVEL_CPUSET:
		switch (uap->which) {
		case CPU_WHICH_TID:
		case CPU_WHICH_PID:
			thread_lock(ttd);
			set = cpuset_ref(ttd->td_cpuset);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_CPUSET:
			break;
		case CPU_WHICH_IRQ:
			error = EINVAL;
			goto out;
		}
		if (uap->level == CPU_LEVEL_ROOT)
			nset = cpuset_refroot(set);
		else
			nset = cpuset_refbase(set);
		CPU_COPY(&nset->cs_mask, mask);
		cpuset_rel(nset);
		break;
	case CPU_LEVEL_WHICH:
		switch (uap->which) {
		case CPU_WHICH_TID:
			thread_lock(ttd);
			CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_PID:
			FOREACH_THREAD_IN_PROC(p, ttd) {
				thread_lock(ttd);
				CPU_OR(mask, &ttd->td_cpuset->cs_mask);
				thread_unlock(ttd);
			}
			break;
		case CPU_WHICH_CPUSET:
			CPU_COPY(&set->cs_mask, mask);
			break;
		case CPU_WHICH_IRQ:
			error = intr_getaffinity(uap->id, mask);
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
	if (set)
		cpuset_rel(set);
	if (p)
		PROC_UNLOCK(p);
	if (error == 0)
		error = copyout(mask, uap->mask, size);
out:
	free(mask, M_TEMP);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_setaffinity_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	size_t		cpusetsize;
	const cpuset_t	*mask;
};
#endif
int
cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *ttd;
	struct proc *p;
	cpuset_t *mask;
	int error;

	if (uap->cpusetsize < sizeof(cpuset_t) ||
	    uap->cpusetsize > CPU_MAXSIZE / NBBY)
		return (ERANGE);
	mask = malloc(uap->cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
	error = copyin(uap->mask, mask, uap->cpusetsize);
	if (error)
		goto out;
	/*
	 * Verify that no high bits are set.
	 */
	if (uap->cpusetsize > sizeof(cpuset_t)) {
		char *end;
		char *cp;

		end = cp = (char *)&mask->__bits;
		end += uap->cpusetsize;
		cp += sizeof(cpuset_t);
		while (cp != end)
			if (*cp++ != 0) {
				error = EINVAL;
				goto out;
			}

	}
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
	case CPU_LEVEL_CPUSET:
		error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
		if (error)
			break;
		switch (uap->which) {
		case CPU_WHICH_TID:
		case CPU_WHICH_PID:
			thread_lock(ttd);
			set = cpuset_ref(ttd->td_cpuset);
			thread_unlock(ttd);
			PROC_UNLOCK(p);
			break;
		case CPU_WHICH_CPUSET:
			break;
		case CPU_WHICH_IRQ:
			error = EINVAL;
			goto out;
		}
		if (uap->level == CPU_LEVEL_ROOT)
			nset = cpuset_refroot(set);
		else
			nset = cpuset_refbase(set);
		error = cpuset_modify(nset, mask);
		cpuset_rel(nset);
		cpuset_rel(set);
		break;
	case CPU_LEVEL_WHICH:
		switch (uap->which) {
		case CPU_WHICH_TID:
			error = cpuset_setthread(uap->id, mask);
			break;
		case CPU_WHICH_PID:
			error = cpuset_setproc(uap->id, NULL, mask);
			break;
		case CPU_WHICH_CPUSET:
			error = cpuset_which(CPU_WHICH_CPUSET, uap->id, &p,
			    &ttd, &set);
			if (error == 0) {
				error = cpuset_modify(set, mask);
				cpuset_rel(set);
			}
			break;
		case CPU_WHICH_IRQ:
			error = intr_setaffinity(uap->id, mask);
			break;
		default:
			error = EINVAL;
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
out:
	free(mask, M_TEMP);
	return (error);
}

#ifdef DDB
DB_SHOW_COMMAND(cpusets, db_show_cpusets)
{
	struct cpuset *set;
	int cpu, once;

	LIST_FOREACH(set, &cpuset_ids, cs_link) {
		db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
		    set, set->cs_id, set->cs_ref, set->cs_flags,
		    (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
		db_printf("  mask=");
		for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) {
			if (CPU_ISSET(cpu, &set->cs_mask)) {
				if (once == 0) {
					db_printf("%d", cpu);
					once = 1;
				} else
					db_printf(",%d", cpu);
			}
		}
		db_printf("\n");
		if (db_pager_quit)
			break;
	}
}
#endif /* DDB */