kern_cpuset.c revision 177904
/*-
 * Copyright (c) 2008, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Copyright (c) 2008 Nokia Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_cpuset.c 177904 2008-04-04 01:22:04Z jeff $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/cpuset.h>
#include <sys/sx.h>
#include <sys/queue.h>
#include <sys/limits.h>
#include <sys/bus.h>
#include <sys/interrupt.h>

#include <vm/uma.h>

/*
 * cpusets provide a mechanism for creating and manipulating sets of
 * processors for the purpose of constraining the scheduling of threads to
 * specific processors.
 *
 * Each process belongs to an identified set; by default this is set 1.  Each
 * thread may further restrict the cpus it may run on to a subset of this
 * named set.  This creates an anonymous set which other threads and processes
 * may not join by number.
 *
 * The named set is referred to herein as the 'base' set to avoid ambiguity.
 * This set is usually a child of a 'root' set while the anonymous set may
 * simply be referred to as a mask.  In the syscall api these are referred to
 * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
 *
 * Threads inherit their set from their creator whether it be anonymous or
 * not.  This means that anonymous sets are immutable because they may be
 * shared.  To modify an anonymous set a new set is created with the desired
 * mask and the same parent as the existing anonymous set.  This gives the
 * illusion of each thread having a private mask.
 *
 * Via the syscall apis a user may ask to retrieve or modify the root, base,
 * or mask that is discovered via a pid, tid, or setid.  Modifying a set
 * modifies all numbered and anonymous child sets to comply with the new mask.
 * Modifying a pid or tid's mask applies only to that tid but must still
 * exist within the assigned parent set.
 *
 * A thread may not be assigned to a group separate from other threads in
 * the process.  This is to remove ambiguity when the setid is queried with
 * a pid argument.  There is no other technical limitation.
 *
 * This somewhat complex arrangement is intended to make it easy for
 * applications to query available processors and bind their threads to
 * specific processors while also allowing administrators to dynamically
 * reprovision by changing sets which apply to groups of processes.
 *
 * A simple application should not concern itself with sets at all and
 * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
 * meaning 'curthread'.  It may query available cpus for that tid with a
 * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
 */
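/*
 * As an illustration of the simple-application pattern described above, a
 * minimal userland sketch (not kernel code; error handling omitted and cpu 0
 * chosen arbitrarily) might pin the current thread to one cpu and read the
 * resulting mask back through the same syscall interface:
 *
 *	#include <sys/param.h>
 *	#include <sys/cpuset.h>
 *
 *	cpuset_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);
 *	(void)cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *	    sizeof(mask), &mask);
 *	(void)cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *	    sizeof(mask), &mask);
 */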
static uma_zone_t cpuset_zone;
static struct mtx cpuset_lock;
static struct setlist cpuset_ids;
static struct unrhdr *cpuset_unr;
static struct cpuset *cpuset_zero;

cpuset_t *cpuset_root;

/*
 * Acquire a reference to a cpuset; all pointers must be tracked with refs.
 */
struct cpuset *
cpuset_ref(struct cpuset *set)
{

	refcount_acquire(&set->cs_ref);
	return (set);
}

/*
 * Release a reference in a context where it is safe to allocate.
 */
void
cpuset_rel(struct cpuset *set)
{
	cpusetid_t id;

	if (refcount_release(&set->cs_ref) == 0)
		return;
	mtx_lock_spin(&cpuset_lock);
	LIST_REMOVE(set, cs_siblings);
	id = set->cs_id;
	if (id != CPUSET_INVALID)
		LIST_REMOVE(set, cs_link);
	mtx_unlock_spin(&cpuset_lock);
	cpuset_rel(set->cs_parent);
	uma_zfree(cpuset_zone, set);
	if (id != CPUSET_INVALID)
		free_unr(cpuset_unr, id);
}

/*
 * Deferred release must be used when in a context that is not safe to
 * allocate/free.  This places any unreferenced sets on the list 'head'.
 */
static void
cpuset_rel_defer(struct setlist *head, struct cpuset *set)
{

	if (refcount_release(&set->cs_ref) == 0)
		return;
	mtx_lock_spin(&cpuset_lock);
	LIST_REMOVE(set, cs_siblings);
	if (set->cs_id != CPUSET_INVALID)
		LIST_REMOVE(set, cs_link);
	LIST_INSERT_HEAD(head, set, cs_link);
	mtx_unlock_spin(&cpuset_lock);
}

/*
 * Complete a deferred release.  Removes the set from the list provided to
 * cpuset_rel_defer.
 */
static void
cpuset_rel_complete(struct cpuset *set)
{
	LIST_REMOVE(set, cs_link);
	cpuset_rel(set->cs_parent);
	uma_zfree(cpuset_zone, set);
}

/*
 * Find a set based on an id.  Returns it with a ref.
 */
static struct cpuset *
cpuset_lookup(cpusetid_t setid)
{
	struct cpuset *set;

	if (setid == CPUSET_INVALID)
		return (NULL);
	mtx_lock_spin(&cpuset_lock);
	LIST_FOREACH(set, &cpuset_ids, cs_link)
		if (set->cs_id == setid)
			break;
	if (set)
		cpuset_ref(set);
	mtx_unlock_spin(&cpuset_lock);
	return (set);
}

/*
 * Create a set in the space provided in 'set' with the provided parameters.
 * The set is returned with a single ref.  May return EDEADLK if the set
 * will have no valid cpu based on restrictions from the parent.
 */
static int
_cpuset_create(struct cpuset *set, struct cpuset *parent, cpuset_t *mask,
    cpusetid_t id)
{

	if (!CPU_OVERLAP(&parent->cs_mask, mask))
		return (EDEADLK);
	CPU_COPY(mask, &set->cs_mask);
	LIST_INIT(&set->cs_children);
	refcount_init(&set->cs_ref, 1);
	set->cs_flags = 0;
	mtx_lock_spin(&cpuset_lock);
	CPU_AND(mask, &parent->cs_mask);
	set->cs_id = id;
	set->cs_parent = cpuset_ref(parent);
	LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
	if (set->cs_id != CPUSET_INVALID)
		LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
	mtx_unlock_spin(&cpuset_lock);

	return (0);
}

/*
 * Create a new non-anonymous set with the requested parent and mask.  May
 * return failures if the mask is invalid or a new number can not be
 * allocated.
 */
static int
cpuset_create(struct cpuset **setp, struct cpuset *parent, cpuset_t *mask)
{
	struct cpuset *set;
	cpusetid_t id;
	int error;

	id = alloc_unr(cpuset_unr);
	if (id == -1)
		return (ENFILE);
	*setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
	error = _cpuset_create(set, parent, mask, id);
	if (error == 0)
		return (0);
	free_unr(cpuset_unr, id);
	uma_zfree(cpuset_zone, set);

	return (error);
}

/*
 * Recursively check for errors that would occur from applying mask to
 * the tree of sets starting at 'set'.  Checks for sets that would become
 * empty as well as RDONLY flags.
 */
static int
cpuset_testupdate(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *nset;
	cpuset_t newmask;
	int error;

	mtx_assert(&cpuset_lock, MA_OWNED);
	if (set->cs_flags & CPU_SET_RDONLY)
		return (EPERM);
	if (!CPU_OVERLAP(&set->cs_mask, mask))
		return (EDEADLK);
	CPU_COPY(&set->cs_mask, &newmask);
	CPU_AND(&newmask, mask);
	error = 0;
	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
		if ((error = cpuset_testupdate(nset, &newmask)) != 0)
			break;
	return (error);
}

/*
 * Applies the mask 'mask' without checking for empty sets or permissions.
 */
static void
cpuset_update(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *nset;

	mtx_assert(&cpuset_lock, MA_OWNED);
	CPU_AND(&set->cs_mask, mask);
	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
		cpuset_update(nset, &set->cs_mask);

	return;
}

/*
 * Modify the set 'set' to use a copy of the mask provided.  Apply this new
 * mask to restrict all children in the tree.  Checks for validity before
 * applying the changes.
 */
static int
cpuset_modify(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *root;
	int error;

	error = priv_check(curthread, PRIV_SCHED_CPUSET);
	if (error)
		return (error);
	/*
	 * Verify that we have access to this set of
	 * cpus.
	 */
	root = set->cs_parent;
	if (root && !CPU_SUBSET(&root->cs_mask, mask))
		return (EINVAL);
	mtx_lock_spin(&cpuset_lock);
	error = cpuset_testupdate(set, mask);
	if (error)
		goto out;
	cpuset_update(set, mask);
	CPU_COPY(mask, &set->cs_mask);
out:
	mtx_unlock_spin(&cpuset_lock);

	return (error);
}

/*
 * Walks up the tree from 'set' to find the root.  Returns the root
 * referenced.
 */
static struct cpuset *
cpuset_refroot(struct cpuset *set)
{

	for (; set->cs_parent != NULL; set = set->cs_parent)
		if (set->cs_flags & CPU_SET_ROOT)
			break;
	cpuset_ref(set);

	return (set);
}

/*
 * Find the first non-anonymous set starting from 'set'.  Returns this set
 * referenced.  May return the passed in set with an extra ref if it is
 * not anonymous.
 */
static struct cpuset *
cpuset_refbase(struct cpuset *set)
{

	if (set->cs_id == CPUSET_INVALID)
		set = set->cs_parent;
	cpuset_ref(set);

	return (set);
}

/*
 * Resolve the 'which' parameter of several cpuset apis.
 *
 * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
 * checks for permission via p_cansched().
 *
 * For WHICH_SET returns a valid set with a new reference.
 *
 * -1 may be supplied for any argument to mean the current proc/thread or
 * the base set of the current thread.  May fail with ESRCH/EPERM.
 */
static int
cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
    struct cpuset **setp)
{
	struct cpuset *set;
	struct thread *td;
	struct proc *p;
	int error;

	*pp = p = NULL;
	*tdp = td = NULL;
	*setp = set = NULL;
	switch (which) {
	case CPU_WHICH_PID:
		if (id == -1) {
			PROC_LOCK(curproc);
			p = curproc;
			break;
		}
		if ((p = pfind(id)) == NULL)
			return (ESRCH);
		break;
	case CPU_WHICH_TID:
		if (id == -1) {
			PROC_LOCK(curproc);
			p = curproc;
			td = curthread;
			break;
		}
		sx_slock(&allproc_lock);
		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			FOREACH_THREAD_IN_PROC(p, td)
				if (td->td_tid == id)
					break;
			if (td != NULL)
				break;
			PROC_UNLOCK(p);
		}
		sx_sunlock(&allproc_lock);
		if (td == NULL)
			return (ESRCH);
		break;
	case CPU_WHICH_CPUSET:
		if (id == -1) {
			thread_lock(curthread);
			set = cpuset_refbase(curthread->td_cpuset);
			thread_unlock(curthread);
		} else
			set = cpuset_lookup(id);
		if (set) {
			*setp = set;
			return (0);
		}
		return (ESRCH);
	default:
		return (EINVAL);
	}
	error = p_cansched(curthread, p);
	if (error) {
		PROC_UNLOCK(p);
		return (error);
	}
	if (td == NULL)
		td = FIRST_THREAD_IN_PROC(p);
	*pp = p;
	*tdp = td;
	return (0);
}

/*
 * Create an anonymous set with the provided mask in the space provided by
 * 'fset'.  If the passed in set is anonymous we use its parent otherwise
 * the new set is a child of 'set'.
 */
static int
cpuset_shadow(struct cpuset *set, struct cpuset *fset, cpuset_t *mask)
{
	struct cpuset *parent;

	if (set->cs_id == CPUSET_INVALID)
		parent = set->cs_parent;
	else
		parent = set;
	if (!CPU_SUBSET(&parent->cs_mask, mask))
		return (EDEADLK);
	return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
}

/*
 * Handle two cases for replacing the base set or mask of an entire process.
 *
 * 1) Set is non-null and mask is null.  This reparents all anonymous sets
 *    to the provided set and replaces all non-anonymous td_cpusets with the
 *    provided set.
 * 2) Mask is non-null and set is null.  This replaces or creates anonymous
 *    sets for every thread with the existing base as a parent.
 *
 * This is overly complicated because we can't allocate while holding a
 * spinlock and spinlocks must be held while changing and examining thread
 * state.
 */
static int
cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
{
	struct setlist freelist;
	struct setlist droplist;
	struct cpuset *tdset;
	struct cpuset *nset;
	struct thread *td;
	struct proc *p;
	int threads;
	int nfree;
	int error;
	/*
	 * The algorithm requires two passes due to locking considerations.
	 *
	 * 1) Lookup the process and acquire the locks in the required order.
	 * 2) If enough cpusets have not been allocated release the locks and
	 *    allocate them.  Loop.
	 */
	LIST_INIT(&freelist);
	LIST_INIT(&droplist);
	nfree = 0;
	for (;;) {
		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
		if (error)
			goto out;
		if (nfree >= p->p_numthreads)
			break;
		threads = p->p_numthreads;
		PROC_UNLOCK(p);
		for (; nfree < threads; nfree++) {
			nset = uma_zalloc(cpuset_zone, M_WAITOK);
			LIST_INSERT_HEAD(&freelist, nset, cs_link);
		}
	}
	PROC_LOCK_ASSERT(p, MA_OWNED);
	/*
	 * Now that the appropriate locks are held and we have enough cpusets,
	 * make sure the operation will succeed before applying changes.  The
	 * proc lock prevents td_cpuset from changing between calls.
	 */
	error = 0;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		tdset = td->td_cpuset;
		/*
		 * Verify that a new mask doesn't specify cpus outside of
		 * the set the thread is a member of.
		 */
		if (mask) {
			if (tdset->cs_id == CPUSET_INVALID)
				tdset = tdset->cs_parent;
			if (!CPU_SUBSET(&tdset->cs_mask, mask))
				error = EDEADLK;
		/*
		 * Verify that a new set won't leave an existing thread
		 * mask without a cpu to run on.  It can, however, restrict
		 * the set.
		 */
		} else if (tdset->cs_id == CPUSET_INVALID) {
			if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
				error = EDEADLK;
		}
		thread_unlock(td);
		if (error)
			goto unlock_out;
	}
	/*
	 * Replace each thread's cpuset while using deferred release.  We
	 * must do this because the thread lock must be held while operating
	 * on the thread and this limits the type of operations allowed.
	 */
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		/*
		 * If we presently have an anonymous set or are applying a
		 * mask we must create an anonymous shadow set.  That is
		 * either parented to our existing base or the supplied set.
		 *
		 * If we have a base set with no anonymous shadow we simply
		 * replace it outright.
		 */
		tdset = td->td_cpuset;
		if (tdset->cs_id == CPUSET_INVALID || mask) {
			nset = LIST_FIRST(&freelist);
			LIST_REMOVE(nset, cs_link);
			if (mask)
				error = cpuset_shadow(tdset, nset, mask);
			else
				error = _cpuset_create(nset, set,
				    &tdset->cs_mask, CPUSET_INVALID);
			if (error) {
				LIST_INSERT_HEAD(&freelist, nset, cs_link);
				thread_unlock(td);
				break;
			}
		} else
			nset = cpuset_ref(set);
		cpuset_rel_defer(&droplist, tdset);
		td->td_cpuset = nset;
		sched_affinity(td);
		thread_unlock(td);
	}
unlock_out:
	PROC_UNLOCK(p);
out:
	while ((nset = LIST_FIRST(&droplist)) != NULL)
		cpuset_rel_complete(nset);
	while ((nset = LIST_FIRST(&freelist)) != NULL) {
		LIST_REMOVE(nset, cs_link);
		uma_zfree(cpuset_zone, nset);
	}
	return (error);
}

/*
 * Apply an anonymous mask to a single thread.
 */
int
cpuset_setthread(lwpid_t id, cpuset_t *mask)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *td;
	struct proc *p;
	int error;

	nset = uma_zalloc(cpuset_zone, M_WAITOK);
	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
	if (error)
		goto out;
	set = NULL;
	thread_lock(td);
	error = cpuset_shadow(td->td_cpuset, nset, mask);
	if (error == 0) {
		set = td->td_cpuset;
		td->td_cpuset = nset;
		sched_affinity(td);
		nset = NULL;
	}
	thread_unlock(td);
	PROC_UNLOCK(p);
	if (set)
		cpuset_rel(set);
out:
	if (nset)
		uma_zfree(cpuset_zone, nset);
	return (error);
}

/*
 * Creates the cpuset for thread0.  We make two sets:
 *
 * 0 - The root set which should represent all valid processors in the
 *     system.  It is initially created with a mask of all processors
 *     because we don't know what processors are valid until cpuset_init()
 *     runs.  This set is immutable.
 * 1 - The default set which all processes are a member of until changed.
 *     This allows an administrator to move all threads off of given cpus to
 *     dedicate them to high priority tasks or save power etc.
 */
struct cpuset *
cpuset_thread0(void)
{
	struct cpuset *set;
	int error;

	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
	/*
	 * Create the root system set for the whole machine.  Doesn't use
	 * cpuset_create() due to NULL parent.
	 */
	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
	set->cs_mask.__bits[0] = -1;
	LIST_INIT(&set->cs_children);
	LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
	set->cs_ref = 1;
	set->cs_flags = CPU_SET_ROOT;
	cpuset_zero = set;
	cpuset_root = &set->cs_mask;
	/*
	 * Now derive a default, modifiable set from that to give out.
	 */
	set = uma_zalloc(cpuset_zone, M_WAITOK);
	error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
	KASSERT(error == 0, ("Error creating default set: %d\n", error));
	/*
	 * Initialize the unit allocator. 0 and 1 are allocated above.
	 */
	cpuset_unr = new_unrhdr(2, INT_MAX, NULL);

	return (set);
}

/*
 * This is called once the final set of system cpus is known.  Modifies
 * the root set and all children and marks the root read-only.
 */
static void
cpuset_init(void *arg)
{
	cpuset_t mask;

	CPU_ZERO(&mask);
#ifdef SMP
	mask.__bits[0] = all_cpus;
#else
	mask.__bits[0] = 1;
#endif
	if (cpuset_modify(cpuset_zero, &mask))
		panic("Can't set initial cpuset mask.\n");
	cpuset_zero->cs_flags |= CPU_SET_RDONLY;
}
SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);

#ifndef _SYS_SYSPROTO_H_
struct cpuset_args {
	cpusetid_t	*setid;
};
#endif
int
cpuset(struct thread *td, struct cpuset_args *uap)
{
	struct cpuset *root;
	struct cpuset *set;
	int error;

	thread_lock(td);
	root = cpuset_refroot(td->td_cpuset);
	thread_unlock(td);
	error = cpuset_create(&set, root, &root->cs_mask);
	cpuset_rel(root);
	if (error)
		return (error);
	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
	if (error == 0)
		error = cpuset_setproc(-1, set, NULL);
	cpuset_rel(set);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_setid_args {
	cpuwhich_t	which;
	id_t		id;
	cpusetid_t	setid;
};
#endif
int
cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
{
	struct cpuset *set;
	int error;

	/*
	 * Presently we only support per-process sets.
	 */
	if (uap->which != CPU_WHICH_PID)
		return (EINVAL);
	set = cpuset_lookup(uap->setid);
	if (set == NULL)
		return (ESRCH);
	error = cpuset_setproc(uap->id, set, NULL);
	cpuset_rel(set);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_getid_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	cpusetid_t	*setid;
};
#endif
int
cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *ttd;
	struct proc *p;
	cpusetid_t id;
	int error;

	if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET)
		return (EINVAL);
	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
	if (error)
		return (error);
	switch (uap->which) {
	case CPU_WHICH_TID:
	case CPU_WHICH_PID:
		thread_lock(ttd);
		set = cpuset_refbase(ttd->td_cpuset);
		thread_unlock(ttd);
		PROC_UNLOCK(p);
		break;
	case CPU_WHICH_CPUSET:
		break;
	}
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
		nset = cpuset_refroot(set);
		cpuset_rel(set);
		set = nset;
		break;
	case CPU_LEVEL_CPUSET:
		break;
	case CPU_LEVEL_WHICH:
		break;
	}
	id = set->cs_id;
	cpuset_rel(set);
	if (error == 0)
		error = copyout(&id, uap->setid, sizeof(id));

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_getaffinity_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	size_t		cpusetsize;
	cpuset_t	*mask;
};
#endif
int
cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
{
	struct thread *ttd;
	struct cpuset *nset;
	struct cpuset *set;
	struct proc *p;
	cpuset_t *mask;
	int error;
	size_t size;

	if (uap->cpusetsize < sizeof(cpuset_t) ||
	    uap->cpusetsize * NBBY > CPU_MAXSIZE)
		return (ERANGE);
	size = uap->cpusetsize;
	mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
	if (error)
		goto out;
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
	case CPU_LEVEL_CPUSET:
		switch (uap->which) {
		case CPU_WHICH_TID:
		case CPU_WHICH_PID:
			thread_lock(ttd);
			set = cpuset_ref(ttd->td_cpuset);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_CPUSET:
			break;
		}
		if (uap->level == CPU_LEVEL_ROOT)
			nset = cpuset_refroot(set);
		else
			nset = cpuset_refbase(set);
		CPU_COPY(&nset->cs_mask, mask);
		cpuset_rel(nset);
		break;
	case CPU_LEVEL_WHICH:
		switch (uap->which) {
		case CPU_WHICH_TID:
			thread_lock(ttd);
			CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_PID:
			FOREACH_THREAD_IN_PROC(p, ttd) {
				thread_lock(ttd);
				CPU_OR(mask, &ttd->td_cpuset->cs_mask);
				thread_unlock(ttd);
			}
			break;
		case CPU_WHICH_CPUSET:
			CPU_COPY(&set->cs_mask, mask);
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
	if (set)
		cpuset_rel(set);
	if (p)
		PROC_UNLOCK(p);
	if (error == 0)
		error = copyout(mask, uap->mask, size);
out:
	free(mask, M_TEMP);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_setaffinity_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	size_t		cpusetsize;
	const cpuset_t	*mask;
};
#endif
int
cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *ttd;
	struct proc *p;
	cpuset_t *mask;
	int error;

	if (uap->cpusetsize < sizeof(cpuset_t) ||
	    uap->cpusetsize * NBBY > CPU_MAXSIZE)
		return (ERANGE);
	mask = malloc(uap->cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
	error = copyin(uap->mask, mask, uap->cpusetsize);
	if (error)
		goto out;
	/*
	 * Verify that no high bits are set.
	 */
	if (uap->cpusetsize > sizeof(cpuset_t)) {
		char *end;
		char *cp;

		end = cp = (char *)&mask->__bits;
		end += uap->cpusetsize;
		cp += sizeof(cpuset_t);
		while (cp != end)
			if (*cp++ != 0) {
				error = EINVAL;
				goto out;
			}

	}
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
	case CPU_LEVEL_CPUSET:
		error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
		if (error)
			break;
		switch (uap->which) {
		case CPU_WHICH_TID:
		case CPU_WHICH_PID:
			thread_lock(ttd);
			set = cpuset_ref(ttd->td_cpuset);
			thread_unlock(ttd);
			PROC_UNLOCK(p);
			break;
		case CPU_WHICH_CPUSET:
			break;
		}
		if (uap->level == CPU_LEVEL_ROOT)
			nset = cpuset_refroot(set);
		else
			nset = cpuset_refbase(set);
		error = cpuset_modify(nset, mask);
		cpuset_rel(nset);
		cpuset_rel(set);
		break;
	case CPU_LEVEL_WHICH:
		switch (uap->which) {
		case CPU_WHICH_TID:
			error = cpuset_setthread(uap->id, mask);
			break;
		case CPU_WHICH_PID:
			error = cpuset_setproc(uap->id, NULL, mask);
			break;
		case CPU_WHICH_CPUSET:
			error = cpuset_which(CPU_WHICH_CPUSET, uap->id, &p,
			    &ttd, &set);
			if (error == 0) {
				error = cpuset_modify(set, mask);
				cpuset_rel(set);
			}
			break;
		default:
			error = EINVAL;
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
out:
	free(mask, M_TEMP);
	return (error);
}