/*-
 * Copyright (c) 2008, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_cpuset.c 176821 2008-03-05 08:08:32Z jeff $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/cpuset.h>
#include <sys/sx.h>
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/limits.h>

#include <vm/uma.h>

/*
 * cpusets provide a mechanism for creating and manipulating sets of
 * processors for the purpose of constraining the scheduling of threads to
 * specific processors.
 *
 * Each process belongs to an identified set; by default this is set 1.  Each
 * thread may further restrict the cpus it may run on to a subset of this
 * named set.  This creates an anonymous set which other threads and processes
 * may not join by number.
 *
 * The named set is referred to herein as the 'base' set to avoid ambiguity.
 * This set is usually a child of a 'root' set while the anonymous set may
 * simply be referred to as a mask.  In the syscall api these are referred to
 * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
 *
 * Threads inherit their set from their creator whether it be anonymous or
 * not.  This means that anonymous sets are immutable because they may be
 * shared.  To modify an anonymous set a new set is created with the desired
 * mask and the same parent as the existing anonymous set.  This gives the
 * illusion of each thread having a private mask.
 *
 * Via the syscall apis a user may ask to retrieve or modify the root, base,
 * or mask that is discovered via a pid, tid, or setid.  Modifying a set
 * modifies all numbered and anonymous child sets to comply with the new mask.
 * Modifying a pid or tid's mask applies only to that tid but must still
 * exist within the assigned parent set.
 *
 * A thread may not be assigned to a group separate from other threads in
 * the process.  This is to remove ambiguity when the setid is queried with
 * a pid argument.  There is no other technical limitation.
 *
 * This somewhat complex arrangement is intended to make it easy for
 * applications to query available processors and bind their threads to
 * specific processors while also allowing administrators to dynamically
 * reprovision by changing sets which apply to groups of processes.
 *
 * A simple application should not concern itself with sets at all and
 * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
 * meaning 'curthread'.  It may query available cpus for that tid with a
 * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...); an
 * illustrative userland sketch follows this comment.
 */
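/*
 * Illustrative userland sketch (hypothetical example, not kernel code): a
 * single-threaded program reads the mask of its base set and then pins
 * itself to the first cpu that set allows, using the cpuset_getaffinity()
 * and cpuset_setaffinity() system calls implemented below.  The helper name
 * pin_to_first_allowed_cpu() is invented for this sketch.
 *
 *	#include <sys/param.h>
 *	#include <sys/cpuset.h>
 *
 *	static int
 *	pin_to_first_allowed_cpu(void)
 *	{
 *		cpuset_t mask;
 *		int cpu;
 *
 *		if (cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
 *		    sizeof(mask), &mask) != 0)
 *			return (-1);
 *		for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
 *			if (CPU_ISSET(cpu, &mask))
 *				break;
 *		if (cpu == CPU_SETSIZE)
 *			return (-1);
 *		CPU_ZERO(&mask);
 *		CPU_SET(cpu, &mask);
 *		return (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *		    sizeof(mask), &mask));
 *	}
 */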
static uma_zone_t cpuset_zone;
static struct mtx cpuset_lock;
static struct setlist cpuset_ids;
struct cpuset *cpuset_zero;
static struct unrhdr *cpuset_unr;

/*
 * Acquire a reference to a cpuset; all pointers must be tracked with refs.
 */
struct cpuset *
cpuset_ref(struct cpuset *set)
{

	refcount_acquire(&set->cs_ref);
	return (set);
}

/*
 * Release a reference in a context where it is safe to allocate.
 */
void
cpuset_rel(struct cpuset *set)
{
	cpusetid_t id;

	if (refcount_release(&set->cs_ref) == 0)
		return;
	mtx_lock_spin(&cpuset_lock);
	LIST_REMOVE(set, cs_siblings);
	id = set->cs_id;
	if (id != CPUSET_INVALID)
		LIST_REMOVE(set, cs_link);
	mtx_unlock_spin(&cpuset_lock);
	cpuset_rel(set->cs_parent);
	uma_zfree(cpuset_zone, set);
	if (id != CPUSET_INVALID)
		free_unr(cpuset_unr, id);
}

/*
 * Deferred release must be used when in a context that is not safe to
 * allocate/free.  This places any unreferenced sets on the list 'head'.
 */
static void
cpuset_rel_defer(struct setlist *head, struct cpuset *set)
{

	if (refcount_release(&set->cs_ref) == 0)
		return;
	mtx_lock_spin(&cpuset_lock);
	LIST_REMOVE(set, cs_siblings);
	if (set->cs_id != CPUSET_INVALID)
		LIST_REMOVE(set, cs_link);
	LIST_INSERT_HEAD(head, set, cs_link);
	mtx_unlock_spin(&cpuset_lock);
}

/*
 * Complete a deferred release.  Removes the set from the list provided to
 * cpuset_rel_defer.
 */
static void
cpuset_rel_complete(struct cpuset *set)
{
	LIST_REMOVE(set, cs_link);
	cpuset_rel(set->cs_parent);
	uma_zfree(cpuset_zone, set);
}

/*
 * Find a set based on an id.  Returns it with a ref.
 */
static struct cpuset *
cpuset_lookup(cpusetid_t setid)
{
	struct cpuset *set;

	if (setid == CPUSET_INVALID)
		return (NULL);
	mtx_lock_spin(&cpuset_lock);
	LIST_FOREACH(set, &cpuset_ids, cs_link)
		if (set->cs_id == setid)
			break;
	if (set)
		cpuset_ref(set);
	mtx_unlock_spin(&cpuset_lock);
	return (set);
}

/*
 * Create a set in the space provided in 'set' with the provided parameters.
 * The set is returned with a single ref.  May return EDEADLK if the set
 * will have no valid cpu based on restrictions from the parent.
 */
static int
_cpuset_create(struct cpuset *set, struct cpuset *parent, cpuset_t *mask,
    cpusetid_t id)
{

	if (!CPU_OVERLAP(&parent->cs_mask, mask))
		return (EDEADLK);
	CPU_COPY(mask, &set->cs_mask);
	LIST_INIT(&set->cs_children);
	refcount_init(&set->cs_ref, 1);
	set->cs_flags = 0;
	mtx_lock_spin(&cpuset_lock);
	CPU_AND(mask, &parent->cs_mask);
	set->cs_id = id;
	set->cs_parent = cpuset_ref(parent);
	LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
	if (set->cs_id != CPUSET_INVALID)
		LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
	mtx_unlock_spin(&cpuset_lock);

	return (0);
}

/*
 * Create a new non-anonymous set with the requested parent and mask.  May
 * return failures if the mask is invalid or a new number can not be
 * allocated.
 */
static int
cpuset_create(struct cpuset **setp, struct cpuset *parent, cpuset_t *mask)
{
	struct cpuset *set;
	cpusetid_t id;
	int error;

	id = alloc_unr(cpuset_unr);
	if (id == -1)
		return (ENFILE);
	*setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
	error = _cpuset_create(set, parent, mask, id);
	if (error == 0)
		return (0);
	free_unr(cpuset_unr, id);
	uma_zfree(cpuset_zone, set);

	return (error);
}

/*
 * Recursively check for errors that would occur from applying mask to
 * the tree of sets starting at 'set'.  Checks for sets that would become
 * empty as well as RDONLY flags.
 */
static int
cpuset_testupdate(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *nset;
	cpuset_t newmask;
	int error;

	mtx_assert(&cpuset_lock, MA_OWNED);
	if (set->cs_flags & CPU_SET_RDONLY)
		return (EPERM);
	if (!CPU_OVERLAP(&set->cs_mask, mask))
		return (EDEADLK);
	CPU_COPY(&set->cs_mask, &newmask);
	CPU_AND(&newmask, mask);
	error = 0;
	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
		if ((error = cpuset_testupdate(nset, &newmask)) != 0)
			break;
	return (error);
}

/*
 * Applies the mask 'mask' without checking for empty sets or permissions.
 */
static void
cpuset_update(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *nset;

	mtx_assert(&cpuset_lock, MA_OWNED);
	CPU_AND(&set->cs_mask, mask);
	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
		cpuset_update(nset, &set->cs_mask);

	return;
}

/*
 * Modify the set 'set' to use a copy of the mask provided.  Apply this new
 * mask to restrict all children in the tree.  Checks for validity before
 * applying the changes.
 */
static int
cpuset_modify(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *root;
	int error;

	error = priv_check(curthread, PRIV_SCHED_CPUSET);
	if (error)
		return (error);
	/*
	 * Verify that we have access to this set of
	 * cpus.
	 */
	root = set->cs_parent;
	if (root && !CPU_SUBSET(&root->cs_mask, mask))
		return (EINVAL);
	mtx_lock_spin(&cpuset_lock);
	error = cpuset_testupdate(set, mask);
	if (error)
		goto out;
	cpuset_update(set, mask);
	CPU_COPY(mask, &set->cs_mask);
out:
	mtx_unlock_spin(&cpuset_lock);

	return (error);
}

/*
 * Walks up the tree from 'set' to find the root.  Returns the root
 * referenced.
 */
static struct cpuset *
cpuset_root(struct cpuset *set)
{

	for (; set->cs_parent != NULL; set = set->cs_parent)
		if (set->cs_flags & CPU_SET_ROOT)
			break;
	cpuset_ref(set);

	return (set);
}

/*
 * Find the first non-anonymous set starting from 'set'.  Returns this set
 * referenced.  May return the passed in set with an extra ref if it is
 * not anonymous.
 */
static struct cpuset *
cpuset_base(struct cpuset *set)
{

	if (set->cs_id == CPUSET_INVALID)
		set = set->cs_parent;
	cpuset_ref(set);

	return (set);
}

/*
 * Resolve the 'which' parameter of several cpuset apis.
 *
 * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
 * checks for permission via p_cansched().
 *
 * For WHICH_SET returns a valid set with a new reference.
 *
 * -1 may be supplied for any argument to mean the current proc/thread or
 * the base set of the current thread.  May fail with ESRCH/EPERM.
 */
static int
cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
    struct cpuset **setp)
{
	struct cpuset *set;
	struct thread *td;
	struct proc *p;
	int error;

	*pp = p = NULL;
	*tdp = td = NULL;
	*setp = set = NULL;
	switch (which) {
	case CPU_WHICH_PID:
		if (id == -1) {
			PROC_LOCK(curproc);
			p = curproc;
			break;
		}
		if ((p = pfind(id)) == NULL)
			return (ESRCH);
		break;
	case CPU_WHICH_TID:
		if (id == -1) {
			PROC_LOCK(curproc);
			p = curproc;
			td = curthread;
			break;
		}
		sx_slock(&allproc_lock);
		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			PROC_SLOCK(p);
			FOREACH_THREAD_IN_PROC(p, td)
				if (td->td_tid == id)
					break;
			PROC_SUNLOCK(p);
			if (td != NULL)
				break;
			PROC_UNLOCK(p);
		}
		sx_sunlock(&allproc_lock);
		if (td == NULL)
			return (ESRCH);
		break;
	case CPU_WHICH_CPUSET:
		if (id == -1) {
			thread_lock(curthread);
			set = cpuset_base(curthread->td_cpuset);
			thread_unlock(curthread);
		} else
			set = cpuset_lookup(id);
		if (set) {
			*setp = set;
			return (0);
		}
		return (ESRCH);
	default:
		return (EINVAL);
	}
	error = p_cansched(curthread, p);
	if (error) {
		PROC_UNLOCK(p);
		return (error);
	}
	if (td == NULL)
		td = FIRST_THREAD_IN_PROC(p);
	*pp = p;
	*tdp = td;
	return (0);
}

/*
 * Create an anonymous set with the provided mask in the space provided by
 * 'fset'.  If the passed in set is anonymous we use its parent otherwise
 * the new set is a child of 'set'.
 */
static int
cpuset_shadow(struct cpuset *set, struct cpuset *fset, cpuset_t *mask)
{
	struct cpuset *parent;

	if (set->cs_id == CPUSET_INVALID)
		parent = set->cs_parent;
	else
		parent = set;
	if (!CPU_SUBSET(&parent->cs_mask, mask))
		return (EINVAL);
	return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
}

/*
 * Handle two cases for replacing the base set or mask of an entire process.
 *
 * 1) Set is non-null and mask is null.  This reparents all anonymous sets
 *    to the provided set and replaces all non-anonymous td_cpusets with the
 *    provided set.
 * 2) Mask is non-null and set is null.  This replaces or creates anonymous
 *    sets for every thread with the existing base as a parent.
 *
 * This is overly complicated because we can't allocate while holding a
 * spinlock and spinlocks must be held while changing and examining thread
 * state.
 */
static int
cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
{
	struct setlist freelist;
	struct setlist droplist;
	struct cpuset *tdset;
	struct cpuset *nset;
	struct thread *td;
	struct proc *p;
	int threads;
	int nfree;
	int error;
	/*
	 * The algorithm requires two passes due to locking considerations.
	 *
	 * 1) Lookup the process and acquire the locks in the required order.
	 * 2) If enough cpusets have not been allocated release the locks and
	 *    allocate them.  Loop.
	 */
	LIST_INIT(&freelist);
	LIST_INIT(&droplist);
	nfree = 0;
	for (;;) {
		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
		if (error)
			goto out;
		PROC_SLOCK(p);
		if (nfree >= p->p_numthreads)
			break;
		threads = p->p_numthreads;
		PROC_SUNLOCK(p);
		PROC_UNLOCK(p);
		for (; nfree < threads; nfree++) {
			nset = uma_zalloc(cpuset_zone, M_WAITOK);
			LIST_INSERT_HEAD(&freelist, nset, cs_link);
		}
	}
	PROC_LOCK_ASSERT(p, MA_OWNED);
	PROC_SLOCK_ASSERT(p, MA_OWNED);
	/*
	 * Now that the appropriate locks are held and we have enough cpusets,
	 * make sure the operation will succeed before applying changes.  The
	 * proc lock prevents td_cpuset from changing between calls.
	 */
	error = 0;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		tdset = td->td_cpuset;
		/*
		 * Verify that a new mask doesn't specify cpus outside of
		 * the set the thread is a member of.
		 */
		if (mask) {
			if (tdset->cs_id == CPUSET_INVALID)
				tdset = tdset->cs_parent;
			if (!CPU_SUBSET(&tdset->cs_mask, mask))
				error = EINVAL;
		/*
		 * Verify that a new set won't leave an existing thread
		 * mask without a cpu to run on.  It can, however, restrict
		 * the set.
		 */
		} else if (tdset->cs_id == CPUSET_INVALID) {
			if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
				error = EINVAL;
		}
		thread_unlock(td);
		if (error)
			goto unlock_out;
	}
	/*
	 * Replace each thread's cpuset while using deferred release.  We
	 * must do this because the PROC_SLOCK has to be held while traversing
	 * the thread list and this limits the type of operations allowed.
	 */
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		/*
		 * If we presently have an anonymous set or are applying a
		 * mask we must create an anonymous shadow set.  That is
		 * either parented to our existing base or the supplied set.
		 *
		 * If we have a base set with no anonymous shadow we simply
		 * replace it outright.
		 */
		tdset = td->td_cpuset;
		if (tdset->cs_id == CPUSET_INVALID || mask) {
			nset = LIST_FIRST(&freelist);
			LIST_REMOVE(nset, cs_link);
			if (mask)
				error = cpuset_shadow(tdset, nset, mask);
			else
				error = _cpuset_create(nset, set,
				    &tdset->cs_mask, CPUSET_INVALID);
			if (error) {
				LIST_INSERT_HEAD(&freelist, nset, cs_link);
				thread_unlock(td);
				break;
			}
		} else
			nset = cpuset_ref(set);
		cpuset_rel_defer(&droplist, tdset);
		td->td_cpuset = nset;
		sched_affinity(td);
		thread_unlock(td);
	}
unlock_out:
	PROC_SUNLOCK(p);
	PROC_UNLOCK(p);
out:
	while ((nset = LIST_FIRST(&droplist)) != NULL)
		cpuset_rel_complete(nset);
	while ((nset = LIST_FIRST(&freelist)) != NULL) {
		LIST_REMOVE(nset, cs_link);
		uma_zfree(cpuset_zone, nset);
	}
	return (error);
}

/*
 * Apply an anonymous mask to a single thread.
 */
static int
cpuset_setthread(lwpid_t id, cpuset_t *mask)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *td;
	struct proc *p;
	int error;

	nset = uma_zalloc(cpuset_zone, M_WAITOK);
	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
	if (error)
		goto out;
	thread_lock(td);
	set = td->td_cpuset;
	error = cpuset_shadow(set, nset, mask);
	if (error == 0) {
		cpuset_rel(td->td_cpuset);
		td->td_cpuset = nset;
		sched_affinity(td);
		nset = NULL;
	}
	thread_unlock(td);
	PROC_UNLOCK(p);
out:
	if (nset)
		uma_zfree(cpuset_zone, nset);
	return (error);
}

/*
 * Creates the cpuset for thread0.  We make two sets:
 *
 * 0 - The root set which should represent all valid processors in the
 *     system.  It is initially created with a mask of all processors
 *     because we don't know what processors are valid until cpuset_init()
 *     runs.  This set is immutable.
 * 1 - The default set which all processes are a member of until changed.
 *     This allows an administrator to move all threads off of given cpus to
 *     dedicate them to high priority tasks or save power etc.
 */
struct cpuset *
cpuset_thread0(void)
{
	struct cpuset *set;
	int error;

	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
	/*
	 * Create the root system set for the whole machine.  Doesn't use
	 * cpuset_create() due to NULL parent.
	 */
	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
	set->cs_mask.__bits[0] = -1;
	LIST_INIT(&set->cs_children);
	LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
	set->cs_ref = 1;
	set->cs_flags = CPU_SET_ROOT;
	cpuset_zero = set;
	/*
	 * Now derive a default, modifiable set from that to give out.
	 */
	set = uma_zalloc(cpuset_zone, M_WAITOK);
	error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
	KASSERT(error == 0, ("Error creating default set: %d\n", error));
	/*
	 * Initialize the unit allocator. 0 and 1 are allocated above.
	 */
	cpuset_unr = new_unrhdr(2, INT_MAX, NULL);

	return (set);
}

/*
 * This is called once the final set of system cpus is known.  Modifies
 * the root set and all children and marks the root read-only.
 */
static void
cpuset_init(void *arg)
{
	cpuset_t mask;

	CPU_ZERO(&mask);
#ifdef SMP
	mask.__bits[0] = all_cpus;
#else
	mask.__bits[0] = 1;
#endif
	if (cpuset_modify(cpuset_zero, &mask))
		panic("Can't set initial cpuset mask.\n");
	cpuset_zero->cs_flags |= CPU_SET_RDONLY;
}
SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);

#ifndef _SYS_SYSPROTO_H_
struct cpuset_args {
	cpusetid_t	*setid;
};
#endif
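/*
 * cpuset(2): create a new set as a child of the calling thread's root set,
 * move the calling process into it, and copy the new set's id out to
 * userspace.
 */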
int
cpuset(struct thread *td, struct cpuset_args *uap)
{
	struct cpuset *root;
	struct cpuset *set;
	int error;

	thread_lock(td);
	root = cpuset_root(td->td_cpuset);
	thread_unlock(td);
	error = cpuset_create(&set, root, &root->cs_mask);
	cpuset_rel(root);
	if (error)
		return (error);
	error = cpuset_setproc(-1, set, NULL);
	if (error == 0)
		error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
	cpuset_rel(set);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_setid_args {
	cpuwhich_t	which;
	id_t		id;
	cpusetid_t	setid;
};
#endif
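/*
 * cpuset_setid(2): move the process identified by the pid in 'id' into the
 * existing numbered set 'setid'.  Only CPU_WHICH_PID is supported.
 */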
int
cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
{
	struct cpuset *set;
	int error;

	/*
	 * Presently we only support per-process sets.
	 */
	if (uap->which != CPU_WHICH_PID)
		return (EINVAL);
	set = cpuset_lookup(uap->setid);
	if (set == NULL)
		return (ESRCH);
	error = cpuset_setproc(uap->id, set, NULL);
	cpuset_rel(set);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_getid_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	cpusetid_t	*setid;
};
#endif
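/*
 * cpuset_getid(2): copy out the id of the base set (or of its root, for
 * CPU_LEVEL_ROOT) that contains the process, thread, or set named by
 * 'which' and 'id'.
 */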
int
cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *ttd;
	struct proc *p;
	cpusetid_t id;
	int error;

	if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET)
		return (EINVAL);
	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
	if (error)
		return (error);
	switch (uap->which) {
	case CPU_WHICH_TID:
	case CPU_WHICH_PID:
		thread_lock(ttd);
		set = cpuset_base(ttd->td_cpuset);
		thread_unlock(ttd);
		PROC_UNLOCK(p);
		break;
	case CPU_WHICH_CPUSET:
		break;
	}
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
		nset = cpuset_root(set);
		cpuset_rel(set);
		set = nset;
		break;
	case CPU_LEVEL_CPUSET:
		break;
	case CPU_LEVEL_WHICH:
		break;
	}
	id = set->cs_id;
	cpuset_rel(set);
	if (error == 0)
		error = copyout(&id, uap->setid, sizeof(id));

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_getaffinity_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	int		id;
	int		cpusetsize;
	long		*mask;
};
#endif
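/*
 * cpuset_getaffinity(2): copy out a cpu mask for the object named by
 * 'level', 'which', and 'id'; at CPU_LEVEL_WHICH this is the thread's mask,
 * the union of a process's thread masks, or a numbered set's mask.
 */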
int
cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
{
	struct thread *ttd;
	struct cpuset *nset;
	struct cpuset *set;
	struct proc *p;
	cpuset_t *mask;
	int error;
	int size;

	if (uap->cpusetsize < sizeof(cpuset_t) ||
	    uap->cpusetsize * NBBY > CPU_MAXSIZE)
		return (ERANGE);
	size = uap->cpusetsize;
	mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
	if (error)
		goto out;
	error = 0;
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
	case CPU_LEVEL_CPUSET:
		switch (uap->which) {
		case CPU_WHICH_TID:
		case CPU_WHICH_PID:
			thread_lock(ttd);
			set = cpuset_ref(ttd->td_cpuset);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_CPUSET:
			break;
		}
		if (uap->level == CPU_LEVEL_ROOT)
			nset = cpuset_root(set);
		else
			nset = cpuset_base(set);
		CPU_COPY(&nset->cs_mask, mask);
		cpuset_rel(nset);
		break;
	case CPU_LEVEL_WHICH:
		switch (uap->which) {
		case CPU_WHICH_TID:
			thread_lock(ttd);
			CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_PID:
			PROC_SLOCK(p);
			FOREACH_THREAD_IN_PROC(p, ttd) {
				thread_lock(ttd);
				CPU_OR(mask, &ttd->td_cpuset->cs_mask);
				thread_unlock(ttd);
			}
			PROC_SUNLOCK(p);
			break;
		case CPU_WHICH_CPUSET:
			CPU_COPY(&set->cs_mask, mask);
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
	if (set)
		cpuset_rel(set);
	if (p)
		PROC_UNLOCK(p);
	if (error == 0)
		error = copyout(mask, uap->mask, size);
out:
	free(mask, M_TEMP);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_setaffinity_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	int		id;
	int		cpusetsize;
	long		*mask;
};
#endif
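/*
 * cpuset_setaffinity(2): validate a user-supplied mask and apply it to the
 * root or base set, a numbered set, a whole process, or a single thread
 * named by 'level', 'which', and 'id'.
 */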
int
cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *ttd;
	struct proc *p;
	cpuset_t *mask;
	int error;

	if (uap->cpusetsize < sizeof(cpuset_t) ||
	    uap->cpusetsize * NBBY > CPU_MAXSIZE)
		return (ERANGE);
	mask = malloc(uap->cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
	error = copyin(uap->mask, mask, uap->cpusetsize);
	if (error)
		goto out;
	/*
	 * Verify that no high bits are set.
	 */
	if (uap->cpusetsize > sizeof(cpuset_t)) {
		char *end;
		char *cp;

		end = cp = (char *)&mask->__bits;
		end += uap->cpusetsize;
		cp += sizeof(cpuset_t);
		while (cp != end)
			if (*cp++ != 0) {
				error = EINVAL;
				goto out;
			}

	}
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
	case CPU_LEVEL_CPUSET:
		error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
		if (error)
			break;
		switch (uap->which) {
		case CPU_WHICH_TID:
		case CPU_WHICH_PID:
			thread_lock(ttd);
			set = cpuset_ref(ttd->td_cpuset);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_CPUSET:
			break;
		}
		if (uap->level == CPU_LEVEL_ROOT)
			nset = cpuset_root(set);
		else
			nset = cpuset_base(set);
		error = cpuset_modify(nset, mask);
		cpuset_rel(nset);
		cpuset_rel(set);
		break;
	case CPU_LEVEL_WHICH:
		switch (uap->which) {
		case CPU_WHICH_TID:
			error = cpuset_setthread(uap->id, mask);
			break;
		case CPU_WHICH_PID:
			error = cpuset_setproc(uap->id, NULL, mask);
			break;
		case CPU_WHICH_CPUSET:
			error = cpuset_which(CPU_WHICH_CPUSET, uap->id, &p,
			    &ttd, &set);
			if (error == 0) {
				error = cpuset_modify(set, mask);
				cpuset_rel(set);
			}
			break;
		default:
			error = EINVAL;
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
out:
	free(mask, M_TEMP);
	return (error);
}