kern_cpuset.c revision 178092
/*-
 * Copyright (c) 2008, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Copyright (c) 2008 Nokia Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_cpuset.c 178092 2008-04-11 03:26:41Z jeff $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/cpuset.h>
#include <sys/sx.h>
#include <sys/queue.h>
#include <sys/limits.h>
#include <sys/bus.h>
#include <sys/interrupt.h>

#include <vm/uma.h>

/*
 * cpusets provide a mechanism for creating and manipulating sets of
 * processors for the purpose of constraining the scheduling of threads to
 * specific processors.
 *
 * Each process belongs to an identified set, by default this is set 1.  Each
 * thread may further restrict the cpus it may run on to a subset of this
 * named set.  This creates an anonymous set which other threads and processes
 * may not join by number.
 *
 * The named set is referred to herein as the 'base' set to avoid ambiguity.
 * This set is usually a child of a 'root' set while the anonymous set may
 * simply be referred to as a mask.  In the syscall api these are referred to
 * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
 *
 * Threads inherit their set from their creator whether it be anonymous or
 * not.  This means that anonymous sets are immutable because they may be
 * shared.  To modify an anonymous set a new set is created with the desired
 * mask and the same parent as the existing anonymous set.  This gives the
 * illusion of each thread having a private mask.
 *
 * Via the syscall apis a user may ask to retrieve or modify the root, base,
 * or mask that is discovered via a pid, tid, or setid.  Modifying a set
 * modifies all numbered and anonymous child sets to comply with the new mask.
 * Modifying a pid or tid's mask applies only to that tid but must still
 * exist within the assigned parent set.
 *
 * A thread may not be assigned to a group separate from other threads in
 * the process.  This is to remove ambiguity when the setid is queried with
 * a pid argument.  There is no other technical limitation.
 *
 * This somewhat complex arrangement is intended to make it easy for
 * applications to query available processors and bind their threads to
 * specific processors while also allowing administrators to dynamically
 * reprovision by changing sets which apply to groups of processes.
 *
 * A simple application should not concern itself with sets at all and
 * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
 * meaning 'curthread'.  It may query available cpus for that tid with a
 * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
 */
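
/*
 * Illustrative example (not compiled into the kernel): a minimal userland
 * sketch of the pattern described above, assuming the cpuset_getaffinity(2)
 * and cpuset_setaffinity(2) libc wrappers and the CPU_SETSIZE constant and
 * CPU_* macros from <sys/cpuset.h>.  Error handling is omitted.
 *
 *	cpuset_t mask;
 *	int cpu;
 *
 *	CPU_ZERO(&mask);
 *	(void)cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
 *	    sizeof(mask), &mask);
 *	for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
 *		if (CPU_ISSET(cpu, &mask))
 *			break;
 *	CPU_ZERO(&mask);
 *	CPU_SET(cpu, &mask);
 *	(void)cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *	    sizeof(mask), &mask);
 */
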
static uma_zone_t cpuset_zone;
static struct mtx cpuset_lock;
static struct setlist cpuset_ids;
static struct unrhdr *cpuset_unr;
static struct cpuset *cpuset_zero;

cpuset_t *cpuset_root;

/*
 * Acquire a reference to a cpuset, all pointers must be tracked with refs.
 */
struct cpuset *
cpuset_ref(struct cpuset *set)
{

	refcount_acquire(&set->cs_ref);
	return (set);
}

/*
 * Release a reference in a context where it is safe to allocate.
 */
void
cpuset_rel(struct cpuset *set)
{
	cpusetid_t id;

	if (refcount_release(&set->cs_ref) == 0)
		return;
	mtx_lock_spin(&cpuset_lock);
	LIST_REMOVE(set, cs_siblings);
	id = set->cs_id;
	if (id != CPUSET_INVALID)
		LIST_REMOVE(set, cs_link);
	mtx_unlock_spin(&cpuset_lock);
	cpuset_rel(set->cs_parent);
	uma_zfree(cpuset_zone, set);
	if (id != CPUSET_INVALID)
		free_unr(cpuset_unr, id);
}

/*
 * Deferred release must be used when in a context that is not safe to
 * allocate/free.  This places any unreferenced sets on the list 'head'.
 */
static void
cpuset_rel_defer(struct setlist *head, struct cpuset *set)
{

	if (refcount_release(&set->cs_ref) == 0)
		return;
	mtx_lock_spin(&cpuset_lock);
	LIST_REMOVE(set, cs_siblings);
	if (set->cs_id != CPUSET_INVALID)
		LIST_REMOVE(set, cs_link);
	LIST_INSERT_HEAD(head, set, cs_link);
	mtx_unlock_spin(&cpuset_lock);
}

/*
 * Complete a deferred release.  Removes the set from the list provided to
 * cpuset_rel_defer.
 */
static void
cpuset_rel_complete(struct cpuset *set)
{
	LIST_REMOVE(set, cs_link);
	cpuset_rel(set->cs_parent);
	uma_zfree(cpuset_zone, set);
}

/*
 * Find a set based on an id.  Returns it with a ref.
 */
static struct cpuset *
cpuset_lookup(cpusetid_t setid)
{
	struct cpuset *set;

	if (setid == CPUSET_INVALID)
		return (NULL);
	mtx_lock_spin(&cpuset_lock);
	LIST_FOREACH(set, &cpuset_ids, cs_link)
		if (set->cs_id == setid)
			break;
	if (set)
		cpuset_ref(set);
	mtx_unlock_spin(&cpuset_lock);
	return (set);
}
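
/*
 * Illustrative example (hypothetical caller, not part of this file): every
 * pointer obtained from cpuset_lookup() or cpuset_ref() must eventually be
 * balanced with cpuset_rel().  A consumer that only needs to inspect a set
 * might look roughly like this:
 *
 *	struct cpuset *set;
 *
 *	set = cpuset_lookup(id);
 *	if (set == NULL)
 *		return (ESRCH);
 *	... examine set->cs_mask ...
 *	cpuset_rel(set);
 */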

/*
 * Create a set in the space provided in 'set' with the provided parameters.
 * The set is returned with a single ref.  May return EDEADLK if the set
 * will have no valid cpu based on restrictions from the parent.
 */
static int
_cpuset_create(struct cpuset *set, struct cpuset *parent, cpuset_t *mask,
    cpusetid_t id)
{

	if (!CPU_OVERLAP(&parent->cs_mask, mask))
		return (EDEADLK);
	CPU_COPY(mask, &set->cs_mask);
	LIST_INIT(&set->cs_children);
	refcount_init(&set->cs_ref, 1);
	set->cs_flags = 0;
	mtx_lock_spin(&cpuset_lock);
	CPU_AND(mask, &parent->cs_mask);
	set->cs_id = id;
	set->cs_parent = cpuset_ref(parent);
	LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
	if (set->cs_id != CPUSET_INVALID)
		LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
	mtx_unlock_spin(&cpuset_lock);

	return (0);
}

/*
 * Create a new non-anonymous set with the requested parent and mask.  May
 * return failures if the mask is invalid or a new number can not be
 * allocated.
 */
static int
cpuset_create(struct cpuset **setp, struct cpuset *parent, cpuset_t *mask)
{
	struct cpuset *set;
	cpusetid_t id;
	int error;

	id = alloc_unr(cpuset_unr);
	if (id == -1)
		return (ENFILE);
	*setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
	error = _cpuset_create(set, parent, mask, id);
	if (error == 0)
		return (0);
	free_unr(cpuset_unr, id);
	uma_zfree(cpuset_zone, set);

	return (error);
}

/*
 * Recursively check for errors that would occur from applying mask to
 * the tree of sets starting at 'set'.  Checks for sets that would become
 * empty as well as RDONLY flags.
 */
static int
cpuset_testupdate(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *nset;
	cpuset_t newmask;
	int error;

	mtx_assert(&cpuset_lock, MA_OWNED);
	if (set->cs_flags & CPU_SET_RDONLY)
		return (EPERM);
	if (!CPU_OVERLAP(&set->cs_mask, mask))
		return (EDEADLK);
	CPU_COPY(&set->cs_mask, &newmask);
	CPU_AND(&newmask, mask);
	error = 0;
	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
		if ((error = cpuset_testupdate(nset, &newmask)) != 0)
			break;
	return (error);
}

/*
 * Applies the mask 'mask' without checking for empty sets or permissions.
 */
static void
cpuset_update(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *nset;

	mtx_assert(&cpuset_lock, MA_OWNED);
	CPU_AND(&set->cs_mask, mask);
	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
		cpuset_update(nset, &set->cs_mask);

	return;
}

/*
 * Modify the set 'set' to use a copy of the mask provided.  Apply this new
 * mask to restrict all children in the tree.  Checks for validity before
 * applying the changes.
 */
static int
cpuset_modify(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *root;
	int error;

	error = priv_check(curthread, PRIV_SCHED_CPUSET);
	if (error)
		return (error);
	/*
	 * Verify that we have access to this set of
	 * cpus.
	 */
	root = set->cs_parent;
	if (root && !CPU_SUBSET(&root->cs_mask, mask))
		return (EINVAL);
	mtx_lock_spin(&cpuset_lock);
	error = cpuset_testupdate(set, mask);
	if (error)
		goto out;
	cpuset_update(set, mask);
	CPU_COPY(mask, &set->cs_mask);
out:
	mtx_unlock_spin(&cpuset_lock);

	return (error);
}

/*
 * Walks up the tree from 'set' to find the root.  Returns the root
 * referenced.
 */
static struct cpuset *
cpuset_refroot(struct cpuset *set)
{

	for (; set->cs_parent != NULL; set = set->cs_parent)
		if (set->cs_flags & CPU_SET_ROOT)
			break;
	cpuset_ref(set);

	return (set);
}

/*
 * Find the first non-anonymous set starting from 'set'.  Returns this set
 * referenced.  May return the passed in set with an extra ref if it is
 * not anonymous.
 */
static struct cpuset *
cpuset_refbase(struct cpuset *set)
{

	if (set->cs_id == CPUSET_INVALID)
		set = set->cs_parent;
	cpuset_ref(set);

	return (set);
}

/*
 * Resolve the 'which' parameter of several cpuset apis.
 *
 * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
 * checks for permission via p_cansched().
 *
 * For WHICH_SET returns a valid set with a new reference.
 *
 * -1 may be supplied for any argument to mean the current proc/thread or
 * the base set of the current thread.  May fail with ESRCH/EPERM.
 */
static int
cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
    struct cpuset **setp)
{
	struct cpuset *set;
	struct thread *td;
	struct proc *p;
	int error;

	*pp = p = NULL;
	*tdp = td = NULL;
	*setp = set = NULL;
	switch (which) {
	case CPU_WHICH_PID:
		if (id == -1) {
			PROC_LOCK(curproc);
			p = curproc;
			break;
		}
		if ((p = pfind(id)) == NULL)
			return (ESRCH);
		break;
	case CPU_WHICH_TID:
		if (id == -1) {
			PROC_LOCK(curproc);
			p = curproc;
			td = curthread;
			break;
		}
		sx_slock(&allproc_lock);
		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			FOREACH_THREAD_IN_PROC(p, td)
				if (td->td_tid == id)
					break;
			if (td != NULL)
				break;
			PROC_UNLOCK(p);
		}
		sx_sunlock(&allproc_lock);
		if (td == NULL)
			return (ESRCH);
		break;
	case CPU_WHICH_CPUSET:
		if (id == -1) {
			thread_lock(curthread);
			set = cpuset_refbase(curthread->td_cpuset);
			thread_unlock(curthread);
		} else
			set = cpuset_lookup(id);
		if (set) {
			*setp = set;
			return (0);
		}
		return (ESRCH);
	case CPU_WHICH_IRQ:
		return (0);
	default:
		return (EINVAL);
	}
	error = p_cansched(curthread, p);
	if (error) {
		PROC_UNLOCK(p);
		return (error);
	}
	if (td == NULL)
		td = FIRST_THREAD_IN_PROC(p);
	*pp = p;
	*tdp = td;
	return (0);
}
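
/*
 * Illustrative example (hypothetical caller, not part of this file): for the
 * PID and TID cases cpuset_which() returns with the proc lock held, so the
 * caller must drop that lock once it is done with the thread.  Error
 * handling is abridged.
 *
 *	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
 *	if (error)
 *		return (error);
 *	... use td while the proc lock is held ...
 *	PROC_UNLOCK(p);
 */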

/*
 * Create an anonymous set with the provided mask in the space provided by
 * 'fset'.  If the passed in set is anonymous we use its parent, otherwise
 * the new set is a child of 'set'.
 */
static int
cpuset_shadow(struct cpuset *set, struct cpuset *fset, cpuset_t *mask)
{
	struct cpuset *parent;

	if (set->cs_id == CPUSET_INVALID)
		parent = set->cs_parent;
	else
		parent = set;
	if (!CPU_SUBSET(&parent->cs_mask, mask))
		return (EDEADLK);
	return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
}

/*
 * Handle two cases for replacing the base set or mask of an entire process.
 *
 * 1) Set is non-null and mask is null.  This reparents all anonymous sets
 *    to the provided set and replaces all non-anonymous td_cpusets with the
 *    provided set.
 * 2) Mask is non-null and set is null.  This replaces or creates anonymous
 *    sets for every thread with the existing base as a parent.
 *
 * This is overly complicated because we can't allocate while holding a
 * spinlock and spinlocks must be held while changing and examining thread
 * state.
 */
static int
cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
{
	struct setlist freelist;
	struct setlist droplist;
	struct cpuset *tdset;
	struct cpuset *nset;
	struct thread *td;
	struct proc *p;
	int threads;
	int nfree;
	int error;
	/*
	 * The algorithm requires two passes due to locking considerations.
	 *
	 * 1) Lookup the process and acquire the locks in the required order.
	 * 2) If enough cpusets have not been allocated release the locks and
	 *    allocate them.  Loop.
	 */
	LIST_INIT(&freelist);
	LIST_INIT(&droplist);
	nfree = 0;
	for (;;) {
		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
		if (error)
			goto out;
		if (nfree >= p->p_numthreads)
			break;
		threads = p->p_numthreads;
		PROC_UNLOCK(p);
		for (; nfree < threads; nfree++) {
			nset = uma_zalloc(cpuset_zone, M_WAITOK);
			LIST_INSERT_HEAD(&freelist, nset, cs_link);
		}
	}
	PROC_LOCK_ASSERT(p, MA_OWNED);
	/*
	 * Now that the appropriate locks are held and we have enough cpusets,
	 * make sure the operation will succeed before applying changes.  The
	 * proc lock prevents td_cpuset from changing between calls.
	 */
	error = 0;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		tdset = td->td_cpuset;
		/*
		 * Verify that a new mask doesn't specify cpus outside of
		 * the set the thread is a member of.
		 */
		if (mask) {
			if (tdset->cs_id == CPUSET_INVALID)
				tdset = tdset->cs_parent;
			if (!CPU_SUBSET(&tdset->cs_mask, mask))
				error = EDEADLK;
		/*
		 * Verify that a new set won't leave an existing thread
		 * mask without a cpu to run on.  It can, however, restrict
		 * the set.
		 */
		} else if (tdset->cs_id == CPUSET_INVALID) {
			if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
				error = EDEADLK;
		}
		thread_unlock(td);
		if (error)
			goto unlock_out;
	}
	/*
	 * Replace each thread's cpuset while using deferred release.  We
	 * must do this because the thread lock must be held while operating
	 * on the thread and this limits the type of operations allowed.
	 */
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		/*
		 * If we presently have an anonymous set or are applying a
		 * mask we must create an anonymous shadow set.  That is
		 * either parented to our existing base or the supplied set.
		 *
		 * If we have a base set with no anonymous shadow we simply
		 * replace it outright.
		 */
		tdset = td->td_cpuset;
		if (tdset->cs_id == CPUSET_INVALID || mask) {
			nset = LIST_FIRST(&freelist);
			LIST_REMOVE(nset, cs_link);
			if (mask)
				error = cpuset_shadow(tdset, nset, mask);
			else
				error = _cpuset_create(nset, set,
				    &tdset->cs_mask, CPUSET_INVALID);
			if (error) {
				LIST_INSERT_HEAD(&freelist, nset, cs_link);
				thread_unlock(td);
				break;
			}
		} else
			nset = cpuset_ref(set);
		cpuset_rel_defer(&droplist, tdset);
		td->td_cpuset = nset;
		sched_affinity(td);
		thread_unlock(td);
	}
unlock_out:
	PROC_UNLOCK(p);
out:
	while ((nset = LIST_FIRST(&droplist)) != NULL)
		cpuset_rel_complete(nset);
	while ((nset = LIST_FIRST(&freelist)) != NULL) {
		LIST_REMOVE(nset, cs_link);
		uma_zfree(cpuset_zone, nset);
	}
	return (error);
}

/*
 * Apply an anonymous mask to a single thread.
 */
int
cpuset_setthread(lwpid_t id, cpuset_t *mask)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *td;
	struct proc *p;
	int error;

	nset = uma_zalloc(cpuset_zone, M_WAITOK);
	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
	if (error)
		goto out;
	set = NULL;
	thread_lock(td);
	error = cpuset_shadow(td->td_cpuset, nset, mask);
	if (error == 0) {
		set = td->td_cpuset;
		td->td_cpuset = nset;
		sched_affinity(td);
		nset = NULL;
	}
	thread_unlock(td);
	PROC_UNLOCK(p);
	if (set)
		cpuset_rel(set);
out:
	if (nset)
		uma_zfree(cpuset_zone, nset);
	return (error);
}
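
/*
 * Illustrative example (hypothetical in-kernel consumer, not part of this
 * file): pinning a kernel worker thread to a single cpu.  cpuset_setthread()
 * copies the mask into a new anonymous set, so a stack-local cpuset_t is
 * sufficient.
 *
 *	cpuset_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(cpu, &mask);
 *	error = cpuset_setthread(td->td_tid, &mask);
 */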

/*
 * Creates the cpuset for thread0.  We make two sets:
 *
 * 0 - The root set which should represent all valid processors in the
 *     system.  It is initially created with a mask of all processors
 *     because we don't know what processors are valid until cpuset_init()
 *     runs.  This set is immutable.
 * 1 - The default set which all processes are a member of until changed.
 *     This allows an administrator to move all threads off of given cpus to
 *     dedicate them to high priority tasks or save power etc.
 */
struct cpuset *
cpuset_thread0(void)
{
	struct cpuset *set;
	int error;

	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
	/*
	 * Create the root system set for the whole machine.  Doesn't use
	 * cpuset_create() due to NULL parent.
	 */
	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
	set->cs_mask.__bits[0] = -1;
	LIST_INIT(&set->cs_children);
	LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
	set->cs_ref = 1;
	set->cs_flags = CPU_SET_ROOT;
	cpuset_zero = set;
	cpuset_root = &set->cs_mask;
	/*
	 * Now derive a default, modifiable set from that to give out.
	 */
	set = uma_zalloc(cpuset_zone, M_WAITOK);
	error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
	KASSERT(error == 0, ("Error creating default set: %d\n", error));
	/*
	 * Initialize the unit allocator. 0 and 1 are allocated above.
	 */
	cpuset_unr = new_unrhdr(2, INT_MAX, NULL);

	return (set);
}

/*
 * This is called once the final set of system cpus is known.  Modifies
 * the root set and all children and marks the root read-only.
 */
static void
cpuset_init(void *arg)
{
	cpuset_t mask;

	CPU_ZERO(&mask);
#ifdef SMP
	mask.__bits[0] = all_cpus;
#else
	mask.__bits[0] = 1;
#endif
	if (cpuset_modify(cpuset_zero, &mask))
		panic("Can't set initial cpuset mask.\n");
	cpuset_zero->cs_flags |= CPU_SET_RDONLY;
}
SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);

#ifndef _SYS_SYSPROTO_H_
struct cpuset_args {
	cpusetid_t	*setid;
};
#endif
int
cpuset(struct thread *td, struct cpuset_args *uap)
{
	struct cpuset *root;
	struct cpuset *set;
	int error;

	thread_lock(td);
	root = cpuset_refroot(td->td_cpuset);
	thread_unlock(td);
	error = cpuset_create(&set, root, &root->cs_mask);
	cpuset_rel(root);
	if (error)
		return (error);
	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
	if (error == 0)
		error = cpuset_setproc(-1, set, NULL);
	cpuset_rel(set);
	return (error);
}
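
/*
 * Illustrative example (hypothetical userland program, not part of this
 * file): creating a new set with cpuset(2) moves the calling process into
 * it and returns the new id, which can later be passed to cpuset_setid(2)
 * or the cpuset(1) utility.
 *
 *	cpusetid_t setid;
 *
 *	if (cpuset(&setid) == 0)
 *		printf("now running in set %d\n", (int)setid);
 */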

#ifndef _SYS_SYSPROTO_H_
struct cpuset_setid_args {
	cpuwhich_t	which;
	id_t		id;
	cpusetid_t	setid;
};
#endif
int
cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
{
	struct cpuset *set;
	int error;

	/*
	 * Presently we only support per-process sets.
	 */
	if (uap->which != CPU_WHICH_PID)
		return (EINVAL);
	set = cpuset_lookup(uap->setid);
	if (set == NULL)
		return (ESRCH);
	error = cpuset_setproc(uap->id, set, NULL);
	cpuset_rel(set);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_getid_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	cpusetid_t	*setid;
};
#endif
int
cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *ttd;
	struct proc *p;
	cpusetid_t id;
	int error;

	if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET)
		return (EINVAL);
	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
	if (error)
		return (error);
	switch (uap->which) {
	case CPU_WHICH_TID:
	case CPU_WHICH_PID:
		thread_lock(ttd);
		set = cpuset_refbase(ttd->td_cpuset);
		thread_unlock(ttd);
		PROC_UNLOCK(p);
		break;
	case CPU_WHICH_CPUSET:
		break;
	case CPU_WHICH_IRQ:
		return (EINVAL);
	}
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
		nset = cpuset_refroot(set);
		cpuset_rel(set);
		set = nset;
		break;
	case CPU_LEVEL_CPUSET:
		break;
	case CPU_LEVEL_WHICH:
		break;
	}
	id = set->cs_id;
	cpuset_rel(set);
	if (error == 0)
		error = copyout(&id, uap->setid, sizeof(id));

	return (error);
}
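
/*
 * Illustrative example (hypothetical userland program, not part of this
 * file): reading the set one process belongs to and assigning a second
 * process to that same set.  pid1 and pid2 are assumed pids and error
 * handling is omitted.
 *
 *	cpusetid_t setid;
 *
 *	(void)cpuset_getid(CPU_LEVEL_CPUSET, CPU_WHICH_PID, pid1, &setid);
 *	(void)cpuset_setid(CPU_WHICH_PID, pid2, setid);
 */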

#ifndef _SYS_SYSPROTO_H_
struct cpuset_getaffinity_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	size_t		cpusetsize;
	cpuset_t	*mask;
};
#endif
int
cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
{
	struct thread *ttd;
	struct cpuset *nset;
	struct cpuset *set;
	struct proc *p;
	cpuset_t *mask;
	int error;
	size_t size;

	if (uap->cpusetsize < sizeof(cpuset_t) ||
	    uap->cpusetsize * NBBY > CPU_MAXSIZE)
		return (ERANGE);
	size = uap->cpusetsize;
	mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
	if (error)
		goto out;
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
	case CPU_LEVEL_CPUSET:
		switch (uap->which) {
		case CPU_WHICH_TID:
		case CPU_WHICH_PID:
			thread_lock(ttd);
			set = cpuset_ref(ttd->td_cpuset);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_CPUSET:
			break;
		case CPU_WHICH_IRQ:
			error = EINVAL;
			goto out;
		}
		if (uap->level == CPU_LEVEL_ROOT)
			nset = cpuset_refroot(set);
		else
			nset = cpuset_refbase(set);
		CPU_COPY(&nset->cs_mask, mask);
		cpuset_rel(nset);
		break;
	case CPU_LEVEL_WHICH:
		switch (uap->which) {
		case CPU_WHICH_TID:
			thread_lock(ttd);
			CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_PID:
			FOREACH_THREAD_IN_PROC(p, ttd) {
				thread_lock(ttd);
				CPU_OR(mask, &ttd->td_cpuset->cs_mask);
				thread_unlock(ttd);
			}
			break;
		case CPU_WHICH_CPUSET:
			CPU_COPY(&set->cs_mask, mask);
			break;
		case CPU_WHICH_IRQ:
			error = intr_getaffinity(uap->id, mask);
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
	if (set)
		cpuset_rel(set);
	if (p)
		PROC_UNLOCK(p);
	if (error == 0)
		error = copyout(mask, uap->mask, size);
out:
	free(mask, M_TEMP);
	return (error);
}
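
/*
 * Illustrative example (hypothetical userland program, not part of this
 * file): at CPU_LEVEL_WHICH a pid query returns the logical OR of the
 * per-thread masks, i.e. every cpu that any thread of the process may run
 * on.  pid is an assumed pid and error handling is omitted.
 *
 *	cpuset_t mask;
 *
 *	CPU_ZERO(&mask);
 *	(void)cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid,
 *	    sizeof(mask), &mask);
 */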

#ifndef _SYS_SYSPROTO_H_
struct cpuset_setaffinity_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	size_t		cpusetsize;
	const cpuset_t	*mask;
};
#endif
int
cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *ttd;
	struct proc *p;
	cpuset_t *mask;
	int error;

	if (uap->cpusetsize < sizeof(cpuset_t) ||
	    uap->cpusetsize * NBBY > CPU_MAXSIZE)
		return (ERANGE);
	mask = malloc(uap->cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
	error = copyin(uap->mask, mask, uap->cpusetsize);
	if (error)
		goto out;
	/*
	 * Verify that no high bits are set.
	 */
	if (uap->cpusetsize > sizeof(cpuset_t)) {
		char *end;
		char *cp;

		end = cp = (char *)&mask->__bits;
		end += uap->cpusetsize;
		cp += sizeof(cpuset_t);
		while (cp != end)
			if (*cp++ != 0) {
				error = EINVAL;
				goto out;
			}

	}
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
	case CPU_LEVEL_CPUSET:
		error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
		if (error)
			break;
		switch (uap->which) {
		case CPU_WHICH_TID:
		case CPU_WHICH_PID:
			thread_lock(ttd);
			set = cpuset_ref(ttd->td_cpuset);
			thread_unlock(ttd);
			PROC_UNLOCK(p);
			break;
		case CPU_WHICH_CPUSET:
			break;
		case CPU_WHICH_IRQ:
			error = EINVAL;
			goto out;
		}
		if (uap->level == CPU_LEVEL_ROOT)
			nset = cpuset_refroot(set);
		else
			nset = cpuset_refbase(set);
		error = cpuset_modify(nset, mask);
		cpuset_rel(nset);
		cpuset_rel(set);
		break;
	case CPU_LEVEL_WHICH:
		switch (uap->which) {
		case CPU_WHICH_TID:
			error = cpuset_setthread(uap->id, mask);
			break;
		case CPU_WHICH_PID:
			error = cpuset_setproc(uap->id, NULL, mask);
			break;
		case CPU_WHICH_CPUSET:
			error = cpuset_which(CPU_WHICH_CPUSET, uap->id, &p,
			    &ttd, &set);
			if (error == 0) {
				error = cpuset_modify(set, mask);
				cpuset_rel(set);
			}
			break;
		case CPU_WHICH_IRQ:
			error = intr_setaffinity(uap->id, mask);
			break;
		default:
			error = EINVAL;
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
out:
	free(mask, M_TEMP);
	return (error);
}