kern_cpuset.c revision 177738
/*-
 * Copyright (c) 2008,  Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_cpuset.c 177738 2008-03-30 11:31:14Z jeff $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/cpuset.h>
#include <sys/sx.h>
#include <sys/queue.h>
#include <sys/limits.h>
#include <sys/bus.h>
#include <sys/interrupt.h>

#include <vm/uma.h>

/*
 * cpusets provide a mechanism for creating and manipulating sets of
 * processors for the purpose of constraining the scheduling of threads to
 * specific processors.
 *
 * Each process belongs to an identified set; by default this is set 1.  Each
 * thread may further restrict the cpus it may run on to a subset of this
 * named set.  This creates an anonymous set which other threads and processes
 * may not join by number.
 *
 * The named set is referred to herein as the 'base' set to avoid ambiguity.
 * This set is usually a child of a 'root' set while the anonymous set may
 * simply be referred to as a mask.  In the syscall api these are referred to
 * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
 *
 * Threads inherit their set from their creator whether it be anonymous or
 * not.  This means that anonymous sets are immutable because they may be
 * shared.  To modify an anonymous set a new set is created with the desired
 * mask and the same parent as the existing anonymous set.  This gives the
 * illusion of each thread having a private mask.
 *
 * Via the syscall apis a user may ask to retrieve or modify the root, base,
 * or mask that is discovered via a pid, tid, or setid.  Modifying a set
 * modifies all numbered and anonymous child sets to comply with the new mask.
 * Modifying a pid or tid's mask applies only to that tid but must still
 * exist within the assigned parent set.
 *
 * A thread may not be assigned to a group separate from other threads in
 * the process.  This is to remove ambiguity when the setid is queried with
 * a pid argument.  There is no other technical limitation.
 *
 * This somewhat complex arrangement is intended to make it easy for
 * applications to query available processors and bind their threads to
 * specific processors while also allowing administrators to dynamically
 * reprovision by changing sets which apply to groups of processes.
 *
 * A simple application should not concern itself with sets at all and
 * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
 * meaning 'curthread'.  It may query available cpus for that tid with a
 * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
 */
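/*
 * As an illustrative sketch of the simple-application case above (this is
 * userland code quoted in a comment, not part of the kernel build; it
 * assumes the cpuset_getaffinity(2)/cpuset_setaffinity(2) syscall wrappers,
 * the CPU_* macros from <sys/cpuset.h>, and err(3) from <err.h>):
 *
 *	cpuset_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);
 *	// Restrict the current thread's anonymous mask to CPU 0.
 *	if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *	    sizeof(mask), &mask) != 0)
 *		err(1, "cpuset_setaffinity");
 *	// Ask which cpus the current process's base set allows.
 *	if (cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
 *	    sizeof(mask), &mask) != 0)
 *		err(1, "cpuset_getaffinity");
 */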
static uma_zone_t cpuset_zone;
static struct mtx cpuset_lock;
static struct setlist cpuset_ids;
static struct unrhdr *cpuset_unr;
static struct cpuset *cpuset_zero;

cpuset_t *cpuset_root;

/*
 * Acquire a reference to a cpuset; all pointers must be tracked with refs.
 */
struct cpuset *
cpuset_ref(struct cpuset *set)
{

	refcount_acquire(&set->cs_ref);
	return (set);
}

/*
 * Release a reference in a context where it is safe to allocate.
 */
void
cpuset_rel(struct cpuset *set)
{
	cpusetid_t id;

	if (refcount_release(&set->cs_ref) == 0)
		return;
	mtx_lock_spin(&cpuset_lock);
	LIST_REMOVE(set, cs_siblings);
	id = set->cs_id;
	if (id != CPUSET_INVALID)
		LIST_REMOVE(set, cs_link);
	mtx_unlock_spin(&cpuset_lock);
	cpuset_rel(set->cs_parent);
	uma_zfree(cpuset_zone, set);
	if (id != CPUSET_INVALID)
		free_unr(cpuset_unr, id);
}

/*
 * Deferred release must be used when in a context that is not safe to
 * allocate/free.  This places any unreferenced sets on the list 'head'.
 */
static void
cpuset_rel_defer(struct setlist *head, struct cpuset *set)
{

	if (refcount_release(&set->cs_ref) == 0)
		return;
	mtx_lock_spin(&cpuset_lock);
	LIST_REMOVE(set, cs_siblings);
	if (set->cs_id != CPUSET_INVALID)
		LIST_REMOVE(set, cs_link);
	LIST_INSERT_HEAD(head, set, cs_link);
	mtx_unlock_spin(&cpuset_lock);
}

/*
 * Complete a deferred release.  Removes the set from the list provided to
 * cpuset_rel_defer.
 */
static void
cpuset_rel_complete(struct cpuset *set)
{
	LIST_REMOVE(set, cs_link);
	cpuset_rel(set->cs_parent);
	uma_zfree(cpuset_zone, set);
}

/*
 * Find a set based on an id.  Returns it with a ref.
 */
static struct cpuset *
cpuset_lookup(cpusetid_t setid)
{
	struct cpuset *set;

	if (setid == CPUSET_INVALID)
		return (NULL);
	mtx_lock_spin(&cpuset_lock);
	LIST_FOREACH(set, &cpuset_ids, cs_link)
		if (set->cs_id == setid)
			break;
	if (set)
		cpuset_ref(set);
	mtx_unlock_spin(&cpuset_lock);
	return (set);
}

/*
 * Create a set in the space provided in 'set' with the provided parameters.
 * The set is returned with a single ref.  May return EDEADLK if the set
 * will have no valid cpu based on restrictions from the parent.
 */
static int
_cpuset_create(struct cpuset *set, struct cpuset *parent, cpuset_t *mask,
    cpusetid_t id)
{

	if (!CPU_OVERLAP(&parent->cs_mask, mask))
		return (EDEADLK);
	CPU_COPY(mask, &set->cs_mask);
	LIST_INIT(&set->cs_children);
	refcount_init(&set->cs_ref, 1);
	set->cs_flags = 0;
	mtx_lock_spin(&cpuset_lock);
	CPU_AND(mask, &parent->cs_mask);
	set->cs_id = id;
	set->cs_parent = cpuset_ref(parent);
	LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
	if (set->cs_id != CPUSET_INVALID)
		LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
	mtx_unlock_spin(&cpuset_lock);

	return (0);
}

/*
 * Create a new non-anonymous set with the requested parent and mask.  May
 * return failures if the mask is invalid or a new number can not be
 * allocated.
 */
static int
cpuset_create(struct cpuset **setp, struct cpuset *parent, cpuset_t *mask)
{
	struct cpuset *set;
	cpusetid_t id;
	int error;

	id = alloc_unr(cpuset_unr);
	if (id == -1)
		return (ENFILE);
	*setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
	error = _cpuset_create(set, parent, mask, id);
	if (error == 0)
		return (0);
	free_unr(cpuset_unr, id);
	uma_zfree(cpuset_zone, set);

	return (error);
}

/*
 * Recursively check for errors that would occur from applying mask to
 * the tree of sets starting at 'set'.  Checks for sets that would become
 * empty as well as RDONLY flags.
 */
static int
cpuset_testupdate(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *nset;
	cpuset_t newmask;
	int error;

	mtx_assert(&cpuset_lock, MA_OWNED);
	if (set->cs_flags & CPU_SET_RDONLY)
		return (EPERM);
	if (!CPU_OVERLAP(&set->cs_mask, mask))
		return (EDEADLK);
	CPU_COPY(&set->cs_mask, &newmask);
	CPU_AND(&newmask, mask);
	error = 0;
	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
		if ((error = cpuset_testupdate(nset, &newmask)) != 0)
			break;
	return (error);
}

/*
 * Applies the mask 'mask' without checking for empty sets or permissions.
 */
static void
cpuset_update(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *nset;

	mtx_assert(&cpuset_lock, MA_OWNED);
	CPU_AND(&set->cs_mask, mask);
	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
		cpuset_update(nset, &set->cs_mask);

	return;
}

/*
 * Modify the set 'set' to use a copy of the mask provided.  Apply this new
 * mask to restrict all children in the tree.  Checks for validity before
 * applying the changes.
 */
static int
cpuset_modify(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *root;
	int error;

	error = priv_check(curthread, PRIV_SCHED_CPUSET);
	if (error)
		return (error);
	/*
	 * Verify that we have access to this set of
	 * cpus.
	 */
	root = set->cs_parent;
	if (root && !CPU_SUBSET(&root->cs_mask, mask))
		return (EINVAL);
	mtx_lock_spin(&cpuset_lock);
	error = cpuset_testupdate(set, mask);
	if (error)
		goto out;
	cpuset_update(set, mask);
	CPU_COPY(mask, &set->cs_mask);
out:
	mtx_unlock_spin(&cpuset_lock);

	return (error);
}

/*
 * Walks up the tree from 'set' to find the root.  Returns the root
 * referenced.
 */
static struct cpuset *
cpuset_refroot(struct cpuset *set)
{

	for (; set->cs_parent != NULL; set = set->cs_parent)
		if (set->cs_flags & CPU_SET_ROOT)
			break;
	cpuset_ref(set);

	return (set);
}

/*
 * Find the first non-anonymous set starting from 'set'.  Returns this set
 * referenced.  May return the passed in set with an extra ref if it is
 * not anonymous.
 */
static struct cpuset *
cpuset_refbase(struct cpuset *set)
{

	if (set->cs_id == CPUSET_INVALID)
		set = set->cs_parent;
	cpuset_ref(set);

	return (set);
}

/*
 * Resolve the 'which' parameter of several cpuset apis.
 *
 * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
 * checks for permission via p_cansched().
 *
 * For WHICH_SET returns a valid set with a new reference.
 *
 * -1 may be supplied for any argument to mean the current proc/thread or
 * the base set of the current thread.  May fail with ESRCH/EPERM.
 */
static int
cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
    struct cpuset **setp)
{
	struct cpuset *set;
	struct thread *td;
	struct proc *p;
	int error;

	*pp = p = NULL;
	*tdp = td = NULL;
	*setp = set = NULL;
	switch (which) {
	case CPU_WHICH_PID:
		if (id == -1) {
			PROC_LOCK(curproc);
			p = curproc;
			break;
		}
		if ((p = pfind(id)) == NULL)
			return (ESRCH);
		break;
	case CPU_WHICH_TID:
		if (id == -1) {
			PROC_LOCK(curproc);
			p = curproc;
			td = curthread;
			break;
		}
		sx_slock(&allproc_lock);
		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			FOREACH_THREAD_IN_PROC(p, td)
				if (td->td_tid == id)
					break;
			if (td != NULL)
				break;
			PROC_UNLOCK(p);
		}
		sx_sunlock(&allproc_lock);
		if (td == NULL)
			return (ESRCH);
		break;
	case CPU_WHICH_CPUSET:
		if (id == -1) {
			thread_lock(curthread);
			set = cpuset_refbase(curthread->td_cpuset);
			thread_unlock(curthread);
		} else
			set = cpuset_lookup(id);
		if (set) {
			*setp = set;
			return (0);
		}
		return (ESRCH);
	default:
		return (EINVAL);
	}
	error = p_cansched(curthread, p);
	if (error) {
		PROC_UNLOCK(p);
		return (error);
	}
	if (td == NULL)
		td = FIRST_THREAD_IN_PROC(p);
	*pp = p;
	*tdp = td;
	return (0);
}

/*
 * Create an anonymous set with the provided mask in the space provided by
 * 'fset'.  If the passed in set is anonymous we use its parent, otherwise
 * the new set is a child of 'set'.
 */
static int
cpuset_shadow(struct cpuset *set, struct cpuset *fset, cpuset_t *mask)
{
	struct cpuset *parent;

	if (set->cs_id == CPUSET_INVALID)
		parent = set->cs_parent;
	else
		parent = set;
	if (!CPU_SUBSET(&parent->cs_mask, mask))
		return (EDEADLK);
	return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
}

/*
 * Handle two cases for replacing the base set or mask of an entire process.
 *
 * 1) Set is non-null and mask is null.  This reparents all anonymous sets
 *    to the provided set and replaces all non-anonymous td_cpusets with the
 *    provided set.
 * 2) Mask is non-null and set is null.  This replaces or creates anonymous
 *    sets for every thread with the existing base as a parent.
 *
 * This is overly complicated because we can't allocate while holding a
 * spinlock and spinlocks must be held while changing and examining thread
 * state.
 */
static int
cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
{
	struct setlist freelist;
	struct setlist droplist;
	struct cpuset *tdset;
	struct cpuset *nset;
	struct thread *td;
	struct proc *p;
	int threads;
	int nfree;
	int error;
	/*
	 * The algorithm requires two passes due to locking considerations.
	 *
	 * 1) Lookup the process and acquire the locks in the required order.
	 * 2) If enough cpusets have not been allocated release the locks and
	 *    allocate them.  Loop.
	 */
	LIST_INIT(&freelist);
	LIST_INIT(&droplist);
	nfree = 0;
	for (;;) {
		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
		if (error)
			goto out;
		if (nfree >= p->p_numthreads)
			break;
		threads = p->p_numthreads;
		PROC_UNLOCK(p);
		for (; nfree < threads; nfree++) {
			nset = uma_zalloc(cpuset_zone, M_WAITOK);
			LIST_INSERT_HEAD(&freelist, nset, cs_link);
		}
	}
	PROC_LOCK_ASSERT(p, MA_OWNED);
	/*
	 * Now that the appropriate locks are held and we have enough cpusets,
	 * make sure the operation will succeed before applying changes.  The
	 * proc lock prevents td_cpuset from changing between calls.
	 */
	error = 0;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		tdset = td->td_cpuset;
		/*
		 * Verify that a new mask doesn't specify cpus outside of
		 * the set the thread is a member of.
		 */
		if (mask) {
			if (tdset->cs_id == CPUSET_INVALID)
				tdset = tdset->cs_parent;
			if (!CPU_SUBSET(&tdset->cs_mask, mask))
				error = EDEADLK;
		/*
		 * Verify that a new set won't leave an existing thread
		 * mask without a cpu to run on.  It can, however, restrict
		 * the set.
		 */
		} else if (tdset->cs_id == CPUSET_INVALID) {
			if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
				error = EDEADLK;
		}
		thread_unlock(td);
		if (error)
			goto unlock_out;
	}
	/*
	 * Replace each thread's cpuset while using deferred release.  We
	 * must do this because the thread lock must be held while operating
	 * on the thread and this limits the type of operations allowed.
	 */
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		/*
		 * If we presently have an anonymous set or are applying a
		 * mask we must create an anonymous shadow set.  That is
		 * either parented to our existing base or the supplied set.
		 *
		 * If we have a base set with no anonymous shadow we simply
		 * replace it outright.
		 */
		tdset = td->td_cpuset;
		if (tdset->cs_id == CPUSET_INVALID || mask) {
			nset = LIST_FIRST(&freelist);
			LIST_REMOVE(nset, cs_link);
			if (mask)
				error = cpuset_shadow(tdset, nset, mask);
			else
				error = _cpuset_create(nset, set,
				    &tdset->cs_mask, CPUSET_INVALID);
			if (error) {
				LIST_INSERT_HEAD(&freelist, nset, cs_link);
				thread_unlock(td);
				break;
			}
		} else
			nset = cpuset_ref(set);
		cpuset_rel_defer(&droplist, tdset);
		td->td_cpuset = nset;
		sched_affinity(td);
		thread_unlock(td);
	}
unlock_out:
	PROC_UNLOCK(p);
out:
	while ((nset = LIST_FIRST(&droplist)) != NULL)
		cpuset_rel_complete(nset);
	while ((nset = LIST_FIRST(&freelist)) != NULL) {
		LIST_REMOVE(nset, cs_link);
		uma_zfree(cpuset_zone, nset);
	}
	return (error);
}

/*
 * Apply an anonymous mask to a single thread.
 */
int
cpuset_setthread(lwpid_t id, cpuset_t *mask)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *td;
	struct proc *p;
	int error;

	nset = uma_zalloc(cpuset_zone, M_WAITOK);
	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
	if (error)
		goto out;
	set = NULL;
	thread_lock(td);
	error = cpuset_shadow(td->td_cpuset, nset, mask);
	if (error == 0) {
		set = td->td_cpuset;
		td->td_cpuset = nset;
		sched_affinity(td);
		nset = NULL;
	}
	thread_unlock(td);
	PROC_UNLOCK(p);
	if (set)
		cpuset_rel(set);
out:
	if (nset)
		uma_zfree(cpuset_zone, nset);
	return (error);
}

/*
 * Creates the cpuset for thread0.  We make two sets:
 *
 * 0 - The root set which should represent all valid processors in the
 *     system.  It is initially created with a mask of all processors
 *     because we don't know what processors are valid until cpuset_init()
 *     runs.  This set is immutable.
 * 1 - The default set which all processes are a member of until changed.
 *     This allows an administrator to move all threads off of given cpus to
 *     dedicate them to high priority tasks or save power etc.
 */
struct cpuset *
cpuset_thread0(void)
{
	struct cpuset *set;
	int error;

	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
	/*
	 * Create the root system set for the whole machine.  Doesn't use
	 * cpuset_create() due to NULL parent.
	 */
	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
	set->cs_mask.__bits[0] = -1;
	LIST_INIT(&set->cs_children);
	LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
	set->cs_ref = 1;
	set->cs_flags = CPU_SET_ROOT;
	cpuset_zero = set;
	cpuset_root = &set->cs_mask;
	/*
	 * Now derive a default, modifiable set from that to give out.
	 */
	set = uma_zalloc(cpuset_zone, M_WAITOK);
	error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
	KASSERT(error == 0, ("Error creating default set: %d\n", error));
	/*
	 * Initialize the unit allocator. 0 and 1 are allocated above.
	 */
	cpuset_unr = new_unrhdr(2, INT_MAX, NULL);

	return (set);
}

/*
 * This is called once the final set of system cpus is known.  Modifies
 * the root set and all children and marks the root read-only.
 */
static void
cpuset_init(void *arg)
{
	cpuset_t mask;

	CPU_ZERO(&mask);
#ifdef SMP
	mask.__bits[0] = all_cpus;
#else
	mask.__bits[0] = 1;
#endif
	if (cpuset_modify(cpuset_zero, &mask))
		panic("Can't set initial cpuset mask.\n");
	cpuset_zero->cs_flags |= CPU_SET_RDONLY;
}
SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);

#ifndef _SYS_SYSPROTO_H_
struct cpuset_args {
	cpusetid_t	*setid;
};
#endif
int
cpuset(struct thread *td, struct cpuset_args *uap)
{
	struct cpuset *root;
	struct cpuset *set;
	int error;

	thread_lock(td);
	root = cpuset_refroot(td->td_cpuset);
	thread_unlock(td);
	error = cpuset_create(&set, root, &root->cs_mask);
	cpuset_rel(root);
	if (error)
		return (error);
	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
	if (error == 0)
		error = cpuset_setproc(-1, set, NULL);
	cpuset_rel(set);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_setid_args {
	cpuwhich_t	which;
	id_t		id;
	cpusetid_t	setid;
};
#endif
int
cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
{
	struct cpuset *set;
	int error;

	/*
	 * Presently we only support per-process sets.
	 */
	if (uap->which != CPU_WHICH_PID)
		return (EINVAL);
	set = cpuset_lookup(uap->setid);
	if (set == NULL)
		return (ESRCH);
	error = cpuset_setproc(uap->id, set, NULL);
	cpuset_rel(set);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_getid_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	cpusetid_t	*setid;
};
#endif
int
cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *ttd;
	struct proc *p;
	cpusetid_t id;
	int error;

	if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET)
		return (EINVAL);
	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
	if (error)
		return (error);
	switch (uap->which) {
	case CPU_WHICH_TID:
	case CPU_WHICH_PID:
		thread_lock(ttd);
		set = cpuset_refbase(ttd->td_cpuset);
		thread_unlock(ttd);
		PROC_UNLOCK(p);
		break;
	case CPU_WHICH_CPUSET:
		break;
	}
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
		nset = cpuset_refroot(set);
		cpuset_rel(set);
		set = nset;
		break;
	case CPU_LEVEL_CPUSET:
		break;
	case CPU_LEVEL_WHICH:
		break;
	}
	id = set->cs_id;
	cpuset_rel(set);
	if (error == 0)
		error = copyout(&id, uap->setid, sizeof(id));

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_getaffinity_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	size_t		cpusetsize;
	cpuset_t	*mask;
};
#endif
int
cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
{
	struct thread *ttd;
	struct cpuset *nset;
	struct cpuset *set;
	struct proc *p;
	cpuset_t *mask;
	int error;
	size_t size;

	if (uap->cpusetsize < sizeof(cpuset_t) ||
	    uap->cpusetsize * NBBY > CPU_MAXSIZE)
		return (ERANGE);
	size = uap->cpusetsize;
	mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
	if (error)
		goto out;
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
	case CPU_LEVEL_CPUSET:
		switch (uap->which) {
		case CPU_WHICH_TID:
		case CPU_WHICH_PID:
			thread_lock(ttd);
			set = cpuset_ref(ttd->td_cpuset);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_CPUSET:
			break;
		}
		if (uap->level == CPU_LEVEL_ROOT)
			nset = cpuset_refroot(set);
		else
			nset = cpuset_refbase(set);
		CPU_COPY(&nset->cs_mask, mask);
		cpuset_rel(nset);
		break;
	case CPU_LEVEL_WHICH:
		switch (uap->which) {
		case CPU_WHICH_TID:
			thread_lock(ttd);
			CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_PID:
			FOREACH_THREAD_IN_PROC(p, ttd) {
				thread_lock(ttd);
				CPU_OR(mask, &ttd->td_cpuset->cs_mask);
				thread_unlock(ttd);
			}
			break;
		case CPU_WHICH_CPUSET:
			CPU_COPY(&set->cs_mask, mask);
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
	if (set)
		cpuset_rel(set);
	if (p)
		PROC_UNLOCK(p);
	if (error == 0)
		error = copyout(mask, uap->mask, size);
out:
	free(mask, M_TEMP);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_setaffinity_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	size_t		cpusetsize;
	const cpuset_t	*mask;
};
#endif
int
cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *ttd;
	struct proc *p;
	cpuset_t *mask;
	int error;

	if (uap->cpusetsize < sizeof(cpuset_t) ||
	    uap->cpusetsize * NBBY > CPU_MAXSIZE)
		return (ERANGE);
	mask = malloc(uap->cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
	error = copyin(uap->mask, mask, uap->cpusetsize);
	if (error)
		goto out;
	/*
	 * Verify that no high bits are set.
	 */
	if (uap->cpusetsize > sizeof(cpuset_t)) {
		char *end;
		char *cp;

		end = cp = (char *)&mask->__bits;
		end += uap->cpusetsize;
		cp += sizeof(cpuset_t);
		while (cp != end)
			if (*cp++ != 0) {
				error = EINVAL;
				goto out;
			}

	}
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
	case CPU_LEVEL_CPUSET:
		error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
		if (error)
			break;
		switch (uap->which) {
		case CPU_WHICH_TID:
		case CPU_WHICH_PID:
			thread_lock(ttd);
			set = cpuset_ref(ttd->td_cpuset);
			thread_unlock(ttd);
			PROC_UNLOCK(p);
			break;
		case CPU_WHICH_CPUSET:
			break;
		}
		if (uap->level == CPU_LEVEL_ROOT)
			nset = cpuset_refroot(set);
		else
			nset = cpuset_refbase(set);
		error = cpuset_modify(nset, mask);
		cpuset_rel(nset);
		cpuset_rel(set);
		break;
	case CPU_LEVEL_WHICH:
		switch (uap->which) {
		case CPU_WHICH_TID:
			error = cpuset_setthread(uap->id, mask);
			break;
		case CPU_WHICH_PID:
			error = cpuset_setproc(uap->id, NULL, mask);
			break;
		case CPU_WHICH_CPUSET:
			error = cpuset_which(CPU_WHICH_CPUSET, uap->id, &p,
			    &ttd, &set);
			if (error == 0) {
				error = cpuset_modify(set, mask);
				cpuset_rel(set);
			}
			break;
		default:
			error = EINVAL;
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
out:
	free(mask, M_TEMP);
	return (error);
}
959