/*-
 * Copyright (c) 2008,  Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Copyright (c) 2008 Nokia Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/kern/kern_cpuset.c 333338 2018-05-07 21:42:22Z shurd $");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/capsicum.h>
#include <sys/cpuset.h>
#include <sys/sx.h>
#include <sys/queue.h>
#include <sys/libkern.h>
#include <sys/limits.h>
#include <sys/bus.h>
#include <sys/interrupt.h>

#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_param.h>
#include <vm/vm_phys.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif /* DDB */
/*
 * cpusets provide a mechanism for creating and manipulating sets of
 * processors for the purpose of constraining the scheduling of threads to
 * specific processors.
 *
 * Each process belongs to an identified set; by default this is set 1.  Each
 * thread may further restrict the cpus it may run on to a subset of this
 * named set.  This creates an anonymous set which other threads and processes
 * may not join by number.
 *
 * The named set is referred to herein as the 'base' set to avoid ambiguity.
 * This set is usually a child of a 'root' set while the anonymous set may
 * simply be referred to as a mask.  In the syscall api these are referred to
 * as the ROOT, CPUSET, and MASK levels, where CPUSET is called 'base' here.
 *
 * Threads inherit their set from their creator whether it be anonymous or
 * not.  This means that anonymous sets are immutable because they may be
 * shared.  To modify an anonymous set a new set is created with the desired
 * mask and the same parent as the existing anonymous set.  This gives the
 * illusion of each thread having a private mask.
 *
 * Via the syscall apis a user may ask to retrieve or modify the root, base,
 * or mask that is discovered via a pid, tid, or setid.  Modifying a set
 * modifies all numbered and anonymous child sets to comply with the new mask.
 * Modifying a pid or tid's mask applies only to that tid, but the mask must
 * still fall within the assigned parent set.
 *
 * A thread may not be assigned to a group separate from other threads in
 * the process.  This is to remove ambiguity when the setid is queried with
 * a pid argument.  There is no other technical limitation.
 *
 * This somewhat complex arrangement is intended to make it easy for
 * applications to query available processors and bind their threads to
 * specific processors while also allowing administrators to dynamically
 * reprovision by changing sets which apply to groups of processes.
 *
 * A simple application should not concern itself with sets at all; it
 * should instead apply masks to its own threads via CPU_WHICH_TID and a -1
 * id meaning 'curthread'.  It may query the available cpus for that tid
 * with a getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
 */
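/*
 * For illustration only, a minimal userland sketch of that simple usage
 * (not compiled as part of this file; it assumes <sys/cpuset.h> and elides
 * error handling).  First fetch the base set's cpus for the current
 * process, then pin the current thread to cpu 0 via an anonymous mask:
 *
 *	cpuset_t mask;
 *
 *	(void)cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
 *	    sizeof(mask), &mask);
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);
 *	(void)cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *	    sizeof(mask), &mask);
 */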
static uma_zone_t cpuset_zone;
static struct mtx cpuset_lock;
static struct setlist cpuset_ids;
static struct unrhdr *cpuset_unr;
static struct cpuset *cpuset_zero, *cpuset_default;

/* Return the size of cpuset_t at the kernel level */
SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD,
    SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)");
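/*
 * Illustrative userland sketch (an assumed example, not part of this
 * file's code): size an affinity buffer from the value exported above.
 *
 *	int setsize;
 *	size_t len = sizeof(setsize);
 *	void *mask;
 *
 *	if (sysctlbyname("kern.sched.cpusetsize", &setsize, &len,
 *	    NULL, 0) == 0)
 *		mask = malloc(setsize);
 */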

cpuset_t *cpuset_root;
cpuset_t cpuset_domain[MAXMEMDOM];

/*
 * Acquire a reference to a cpuset; all pointers must be tracked with refs.
 */
struct cpuset *
cpuset_ref(struct cpuset *set)
{

	refcount_acquire(&set->cs_ref);
	return (set);
}

/*
 * Walks up the tree from 'set' to find the root.  Returns the root
 * referenced.
 */
static struct cpuset *
cpuset_refroot(struct cpuset *set)
{

	for (; set->cs_parent != NULL; set = set->cs_parent)
		if (set->cs_flags & CPU_SET_ROOT)
			break;
	cpuset_ref(set);

	return (set);
}

/*
 * Find the first non-anonymous set starting from 'set'.  Returns this set
 * referenced.  May return the passed in set with an extra ref if it is
 * not anonymous.
 */
static struct cpuset *
cpuset_refbase(struct cpuset *set)
{

	if (set->cs_id == CPUSET_INVALID)
		set = set->cs_parent;
	cpuset_ref(set);

	return (set);
}

/*
 * Release a reference in a context where it is safe to allocate.
 */
void
cpuset_rel(struct cpuset *set)
{
	cpusetid_t id;

	if (refcount_release(&set->cs_ref) == 0)
		return;
	mtx_lock_spin(&cpuset_lock);
	LIST_REMOVE(set, cs_siblings);
	id = set->cs_id;
	if (id != CPUSET_INVALID)
		LIST_REMOVE(set, cs_link);
	mtx_unlock_spin(&cpuset_lock);
	cpuset_rel(set->cs_parent);
	uma_zfree(cpuset_zone, set);
	if (id != CPUSET_INVALID)
		free_unr(cpuset_unr, id);
}

/*
 * Deferred release must be used when in a context that is not safe to
 * allocate/free.  This places any unreferenced sets on the list 'head'.
 */
static void
cpuset_rel_defer(struct setlist *head, struct cpuset *set)
{

	if (refcount_release(&set->cs_ref) == 0)
		return;
	mtx_lock_spin(&cpuset_lock);
	LIST_REMOVE(set, cs_siblings);
	if (set->cs_id != CPUSET_INVALID)
		LIST_REMOVE(set, cs_link);
	LIST_INSERT_HEAD(head, set, cs_link);
	mtx_unlock_spin(&cpuset_lock);
}

/*
 * Complete a deferred release.  Removes the set from the list provided to
 * cpuset_rel_defer.
 */
static void
cpuset_rel_complete(struct cpuset *set)
{
	LIST_REMOVE(set, cs_link);
	cpuset_rel(set->cs_parent);
	uma_zfree(cpuset_zone, set);
}

/*
 * Find a set based on an id.  Returns it with a ref.
 */
static struct cpuset *
cpuset_lookup(cpusetid_t setid, struct thread *td)
{
	struct cpuset *set;

	if (setid == CPUSET_INVALID)
		return (NULL);
	mtx_lock_spin(&cpuset_lock);
	LIST_FOREACH(set, &cpuset_ids, cs_link)
		if (set->cs_id == setid)
			break;
	if (set)
		cpuset_ref(set);
	mtx_unlock_spin(&cpuset_lock);

	KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
	if (set != NULL && jailed(td->td_ucred)) {
		struct cpuset *jset, *tset;

		jset = td->td_ucred->cr_prison->pr_cpuset;
		for (tset = set; tset != NULL; tset = tset->cs_parent)
			if (tset == jset)
				break;
		if (tset == NULL) {
			cpuset_rel(set);
			set = NULL;
		}
	}

	return (set);
}

/*
 * Create a set in the space provided in 'set' with the provided parameters.
 * The set is returned with a single ref.  May return EDEADLK if the set
 * will have no valid cpu based on restrictions from the parent.
 */
static int
_cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask,
    cpusetid_t id)
{

	if (!CPU_OVERLAP(&parent->cs_mask, mask))
		return (EDEADLK);
	CPU_COPY(mask, &set->cs_mask);
	LIST_INIT(&set->cs_children);
	refcount_init(&set->cs_ref, 1);
	set->cs_flags = 0;
	mtx_lock_spin(&cpuset_lock);
	CPU_AND(&set->cs_mask, &parent->cs_mask);
	set->cs_id = id;
	set->cs_parent = cpuset_ref(parent);
	LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
	if (set->cs_id != CPUSET_INVALID)
		LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
	mtx_unlock_spin(&cpuset_lock);

	return (0);
}

/*
 * Create a new non-anonymous set with the requested parent and mask.  May
 * return failures if the mask is invalid or a new number can not be
 * allocated.
 */
static int
cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask)
{
	struct cpuset *set;
	cpusetid_t id;
	int error;

	id = alloc_unr(cpuset_unr);
	if (id == -1)
		return (ENFILE);
	*setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
	error = _cpuset_create(set, parent, mask, id);
	if (error == 0)
		return (0);
	free_unr(cpuset_unr, id);
	uma_zfree(cpuset_zone, set);

	return (error);
}

/*
 * Recursively check for errors that would occur from applying mask to
 * the tree of sets starting at 'set'.  Checks for sets that would become
 * empty as well as RDONLY flags.
 */
static int
cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int check_mask)
{
	struct cpuset *nset;
	cpuset_t newmask;
	int error;

	mtx_assert(&cpuset_lock, MA_OWNED);
	if (set->cs_flags & CPU_SET_RDONLY)
		return (EPERM);
	if (check_mask) {
		if (!CPU_OVERLAP(&set->cs_mask, mask))
			return (EDEADLK);
		CPU_COPY(&set->cs_mask, &newmask);
		CPU_AND(&newmask, mask);
	} else
		CPU_COPY(mask, &newmask);
	error = 0;
	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
		if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0)
			break;
	return (error);
}

/*
 * Applies the mask 'mask' without checking for empty sets or permissions.
 */
static void
cpuset_update(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *nset;

	mtx_assert(&cpuset_lock, MA_OWNED);
	CPU_AND(&set->cs_mask, mask);
	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
		cpuset_update(nset, &set->cs_mask);

	return;
}

/*
 * Modify the set 'set' to use a copy of the mask provided.  Apply this new
 * mask to restrict all children in the tree.  Checks for validity before
 * applying the changes.
 */
static int
cpuset_modify(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *root;
	int error;

	error = priv_check(curthread, PRIV_SCHED_CPUSET);
	if (error)
		return (error);
	/*
	 * In case we are called from within the jail,
	 * we do not allow modifying the dedicated root
	 * cpuset of the jail, but we may still allow
	 * changing child sets.
	 */
	if (jailed(curthread->td_ucred) &&
	    set->cs_flags & CPU_SET_ROOT)
		return (EPERM);
	/*
	 * Verify that we have access to this set of
	 * cpus.
	 */
	root = set->cs_parent;
	if (root && !CPU_SUBSET(&root->cs_mask, mask))
		return (EINVAL);
	mtx_lock_spin(&cpuset_lock);
	error = cpuset_testupdate(set, mask, 0);
	if (error)
		goto out;
	CPU_COPY(mask, &set->cs_mask);
	cpuset_update(set, mask);
out:
	mtx_unlock_spin(&cpuset_lock);

	return (error);
}

/*
 * Resolve the 'which' parameter of several cpuset apis.
 *
 * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
 * checks for permission via p_cansched().
 *
 * For WHICH_SET returns a valid set with a new reference.
 *
 * -1 may be supplied for any argument to mean the current proc/thread or
 * the base set of the current thread.  May fail with ESRCH/EPERM.
 */
int
cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
    struct cpuset **setp)
{
	struct cpuset *set;
	struct thread *td;
	struct proc *p;
	int error;

	*pp = p = NULL;
	*tdp = td = NULL;
	*setp = set = NULL;
	switch (which) {
	case CPU_WHICH_PID:
		if (id == -1) {
			PROC_LOCK(curproc);
			p = curproc;
			break;
		}
		if ((p = pfind(id)) == NULL)
			return (ESRCH);
		break;
	case CPU_WHICH_TID:
		if (id == -1) {
			PROC_LOCK(curproc);
			p = curproc;
			td = curthread;
			break;
		}
		td = tdfind(id, -1);
		if (td == NULL)
			return (ESRCH);
		p = td->td_proc;
		break;
	case CPU_WHICH_CPUSET:
		if (id == -1) {
			thread_lock(curthread);
			set = cpuset_refbase(curthread->td_cpuset);
			thread_unlock(curthread);
		} else
			set = cpuset_lookup(id, curthread);
		if (set) {
			*setp = set;
			return (0);
		}
		return (ESRCH);
	case CPU_WHICH_JAIL:
	{
		/* Find `set' for prison with given id. */
		struct prison *pr;

		sx_slock(&allprison_lock);
		pr = prison_find_child(curthread->td_ucred->cr_prison, id);
		sx_sunlock(&allprison_lock);
		if (pr == NULL)
			return (ESRCH);
		cpuset_ref(pr->pr_cpuset);
		*setp = pr->pr_cpuset;
		mtx_unlock(&pr->pr_mtx);
		return (0);
	}
	case CPU_WHICH_IRQ:
	case CPU_WHICH_DOMAIN:
		return (0);
	default:
		return (EINVAL);
	}
	error = p_cansched(curthread, p);
	if (error) {
		PROC_UNLOCK(p);
		return (error);
	}
	if (td == NULL)
		td = FIRST_THREAD_IN_PROC(p);
	*pp = p;
	*tdp = td;
	return (0);
}

/*
 * Create an anonymous set with the provided mask in the space provided by
 * 'fset'.  If the passed-in set is anonymous we use its parent; otherwise
 * the new set is a child of 'set'.
 */
static int
cpuset_shadow(struct cpuset *set, struct cpuset *fset, const cpuset_t *mask)
{
	struct cpuset *parent;

	if (set->cs_id == CPUSET_INVALID)
		parent = set->cs_parent;
	else
		parent = set;
	if (!CPU_SUBSET(&parent->cs_mask, mask))
		return (EDEADLK);
	return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
}

/*
 * Handle two cases for replacing the base set or mask of an entire process.
 *
 * 1) Set is non-null and mask is null.  This reparents all anonymous sets
 *    to the provided set and replaces all non-anonymous td_cpusets with the
 *    provided set.
 * 2) Mask is non-null and set is null.  This replaces or creates anonymous
 *    sets for every thread with the existing base as a parent.
 *
 * This is overly complicated because we can't allocate while holding a
 * spinlock and spinlocks must be held while changing and examining thread
 * state.
 */
static int
cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
{
	struct setlist freelist;
	struct setlist droplist;
	struct cpuset *tdset;
	struct cpuset *nset;
	struct thread *td;
	struct proc *p;
	int threads;
	int nfree;
	int error;

	/*
	 * The algorithm requires two passes due to locking considerations.
	 *
	 * 1) Lookup the process and acquire the locks in the required order.
	 * 2) If enough cpusets have not been allocated release the locks and
	 *    allocate them.  Loop.
	 */
	LIST_INIT(&freelist);
	LIST_INIT(&droplist);
	nfree = 0;
	for (;;) {
		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
		if (error)
			goto out;
		if (nfree >= p->p_numthreads)
			break;
		threads = p->p_numthreads;
		PROC_UNLOCK(p);
		for (; nfree < threads; nfree++) {
			nset = uma_zalloc(cpuset_zone, M_WAITOK);
			LIST_INSERT_HEAD(&freelist, nset, cs_link);
		}
	}
	PROC_LOCK_ASSERT(p, MA_OWNED);
	/*
	 * Now that the appropriate locks are held and we have enough cpusets,
	 * make sure the operation will succeed before applying changes.  The
	 * proc lock prevents td_cpuset from changing between calls.
	 */
	error = 0;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		tdset = td->td_cpuset;
		/*
		 * Verify that a new mask doesn't specify cpus outside of
		 * the set the thread is a member of.
		 */
		if (mask) {
			if (tdset->cs_id == CPUSET_INVALID)
				tdset = tdset->cs_parent;
			if (!CPU_SUBSET(&tdset->cs_mask, mask))
				error = EDEADLK;
		/*
		 * Verify that a new set won't leave an existing thread
		 * mask without a cpu to run on.  It can, however, restrict
		 * the set.
		 */
		} else if (tdset->cs_id == CPUSET_INVALID) {
			if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
				error = EDEADLK;
		}
		thread_unlock(td);
		if (error)
			goto unlock_out;
	}
	/*
	 * Replace each thread's cpuset while using deferred release.  We
	 * must do this because the thread lock must be held while operating
	 * on the thread and this limits the type of operations allowed.
	 */
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		/*
		 * If we presently have an anonymous set or are applying a
		 * mask we must create an anonymous shadow set.  That is
		 * either parented to our existing base or the supplied set.
		 *
		 * If we have a base set with no anonymous shadow we simply
		 * replace it outright.
		 */
		tdset = td->td_cpuset;
		if (tdset->cs_id == CPUSET_INVALID || mask) {
			nset = LIST_FIRST(&freelist);
			LIST_REMOVE(nset, cs_link);
			if (mask)
				error = cpuset_shadow(tdset, nset, mask);
			else
				error = _cpuset_create(nset, set,
				    &tdset->cs_mask, CPUSET_INVALID);
			if (error) {
				LIST_INSERT_HEAD(&freelist, nset, cs_link);
				thread_unlock(td);
				break;
			}
		} else
			nset = cpuset_ref(set);
		cpuset_rel_defer(&droplist, tdset);
		td->td_cpuset = nset;
		sched_affinity(td);
		thread_unlock(td);
	}
unlock_out:
	PROC_UNLOCK(p);
out:
	while ((nset = LIST_FIRST(&droplist)) != NULL)
		cpuset_rel_complete(nset);
	while ((nset = LIST_FIRST(&freelist)) != NULL) {
		LIST_REMOVE(nset, cs_link);
		uma_zfree(cpuset_zone, nset);
	}
	return (error);
}

/*
 * Return a string representing a valid layout for a cpuset_t object.
 * It expects an incoming buffer of at least CPUSETBUFSIZ bytes.
 */
char *
cpusetobj_strprint(char *buf, const cpuset_t *set)
{
	char *tbuf;
	size_t i, bytesp, bufsiz;

	tbuf = buf;
	bytesp = 0;
	bufsiz = CPUSETBUFSIZ;

	for (i = 0; i < (_NCPUWORDS - 1); i++) {
		bytesp = snprintf(tbuf, bufsiz, "%lx,", set->__bits[i]);
		bufsiz -= bytesp;
		tbuf += bytesp;
	}
	snprintf(tbuf, bufsiz, "%lx", set->__bits[_NCPUWORDS - 1]);
	return (buf);
}

/*
 * Build a valid cpuset_t object from a string representation.
 * It expects an incoming buffer of at least CPUSETBUFSIZ bytes.
 */
int
cpusetobj_strscan(cpuset_t *set, const char *buf)
{
	u_int nwords;
	int i, ret;

	if (strlen(buf) > CPUSETBUFSIZ - 1)
		return (-1);

	/* Allow a shorter version of the mask to be passed when necessary. */
	nwords = 1;
	for (i = 0; buf[i] != '\0'; i++)
		if (buf[i] == ',')
			nwords++;
	if (nwords > _NCPUWORDS)
		return (-1);

	CPU_ZERO(set);
	for (i = 0; i < (nwords - 1); i++) {
		ret = sscanf(buf, "%lx,", &set->__bits[i]);
		if (ret == 0 || ret == -1)
			return (-1);
		buf = strstr(buf, ",");
		if (buf == NULL)
			return (-1);
		buf++;
	}
	ret = sscanf(buf, "%lx", &set->__bits[nwords - 1]);
	if (ret == 0 || ret == -1)
		return (-1);
	return (0);
}
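/*
 * For illustration (an assumed example, with 64-bit longs and
 * _NCPUWORDS == 2): a set containing CPUs 0-3 and CPU 64 has
 * __bits[0] == 0xf and __bits[1] == 0x1, so cpusetobj_strprint()
 * produces "f,1".  cpusetobj_strscan() accepts the same form back;
 * a shorter string such as "f" fills only the low word (CPUs 0-3)
 * and leaves the remaining words zeroed by CPU_ZERO().
 */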

/*
 * Apply an anonymous mask to a single thread.
 */
int
cpuset_setthread(lwpid_t id, cpuset_t *mask)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *td;
	struct proc *p;
	int error;

	nset = uma_zalloc(cpuset_zone, M_WAITOK);
	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
	if (error)
		goto out;
	set = NULL;
	thread_lock(td);
	error = cpuset_shadow(td->td_cpuset, nset, mask);
	if (error == 0) {
		set = td->td_cpuset;
		td->td_cpuset = nset;
		sched_affinity(td);
		nset = NULL;
	}
	thread_unlock(td);
	PROC_UNLOCK(p);
	if (set)
		cpuset_rel(set);
out:
	if (nset)
		uma_zfree(cpuset_zone, nset);
	return (error);
}

/*
 * Apply new cpumask to the ithread.
 */
int
cpuset_setithread(lwpid_t id, int cpu)
{
	struct cpuset *nset, *rset;
	struct cpuset *parent, *old_set;
	struct thread *td;
	struct proc *p;
	cpusetid_t cs_id;
	cpuset_t mask;
	int error;

	nset = uma_zalloc(cpuset_zone, M_WAITOK);
	rset = uma_zalloc(cpuset_zone, M_WAITOK);
	cs_id = CPUSET_INVALID;

	CPU_ZERO(&mask);
	if (cpu == NOCPU)
		CPU_COPY(cpuset_root, &mask);
	else
		CPU_SET(cpu, &mask);

	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &old_set);
	if (error != 0 || ((cs_id = alloc_unr(cpuset_unr)) == CPUSET_INVALID))
		goto out;

	/* cpuset_which() returns with PROC_LOCK held. */
	old_set = td->td_cpuset;

	if (cpu == NOCPU) {

		/*
		 * Roll back to the default set.  We're not using
		 * cpuset_shadow() here because it can fail the CPU_SUBSET()
		 * check; this can happen if the default set does not contain
		 * all CPUs.
		 */
		error = _cpuset_create(nset, cpuset_default, &mask,
		    CPUSET_INVALID);

		goto applyset;
	}

	if (old_set->cs_id == 1 || (old_set->cs_id == CPUSET_INVALID &&
	    old_set->cs_parent->cs_id == 1)) {

		/*
		 * Current set is either default (1) or
		 * shadowed version of default set.
		 *
		 * Allocate new root set to be able to shadow it
		 * with any mask.
		 */
		error = _cpuset_create(rset, cpuset_zero,
		    &cpuset_zero->cs_mask, cs_id);
		if (error != 0) {
			PROC_UNLOCK(p);
			goto out;
		}
		rset->cs_flags |= CPU_SET_ROOT;
		parent = rset;
		rset = NULL;
		cs_id = CPUSET_INVALID;
	} else {
		/* Assume existing set was already allocated by previous call */
		parent = old_set;
		old_set = NULL;
	}

	error = cpuset_shadow(parent, nset, &mask);
applyset:
	if (error == 0) {
		thread_lock(td);
		td->td_cpuset = nset;
		sched_affinity(td);
		thread_unlock(td);
		nset = NULL;
	} else
		old_set = NULL;
	PROC_UNLOCK(p);
	if (old_set != NULL)
		cpuset_rel(old_set);
out:
	if (nset != NULL)
		uma_zfree(cpuset_zone, nset);
	if (rset != NULL)
		uma_zfree(cpuset_zone, rset);
	if (cs_id != CPUSET_INVALID)
		free_unr(cpuset_unr, cs_id);
	return (error);
}


/*
 * Creates system-wide cpusets and the cpuset for thread0 including two
 * sets:
 *
 * 0 - The root set which should represent all valid processors in the
 *     system.  It is initially created with a mask of all processors
 *     because we don't know what processors are valid until cpuset_init()
 *     runs.  This set is immutable.
 * 1 - The default set which all processes are a member of until changed.
 *     This allows an administrator to move all threads off of given cpus to
 *     dedicate them to high priority tasks or save power etc.
 */
struct cpuset *
cpuset_thread0(void)
{
	struct cpuset *set;
	int error, i;

	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);

	/*
	 * Create the root system set for the whole machine.  Doesn't use
	 * cpuset_create() due to NULL parent.
	 */
	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
	CPU_FILL(&set->cs_mask);
	LIST_INIT(&set->cs_children);
	LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
	set->cs_ref = 1;
	set->cs_flags = CPU_SET_ROOT;
	cpuset_zero = set;
	cpuset_root = &set->cs_mask;

	/*
	 * Now derive a default, modifiable set from that to give out.
	 */
	set = uma_zalloc(cpuset_zone, M_WAITOK);
	error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
	KASSERT(error == 0, ("Error creating default set: %d\n", error));
	cpuset_default = set;

	/*
	 * Initialize the unit allocator. 0 and 1 are allocated above.
	 */
	cpuset_unr = new_unrhdr(2, INT_MAX, NULL);

	/*
	 * If MD code has not initialized per-domain cpusets, place all
	 * CPUs in domain 0.
	 */
	for (i = 0; i < MAXMEMDOM; i++)
		if (!CPU_EMPTY(&cpuset_domain[i]))
			goto domains_set;
	CPU_COPY(&all_cpus, &cpuset_domain[0]);
domains_set:

	return (set);
}

/*
 * Create a cpuset as cpuset_create() would, but additionally mark the
 * new 'set' as root.
 *
 * We are not going to reparent the td to it.  Use cpuset_setproc_update_set()
 * for that.
 *
 * In case of no error, returns the set in *setp locked with a reference.
 */
int
cpuset_create_root(struct prison *pr, struct cpuset **setp)
{
	struct cpuset *set;
	int error;

	KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__));
	KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));

	error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask);
	if (error)
		return (error);

	KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data",
	    __func__, __LINE__));

	/* Mark the set as root. */
	set = *setp;
	set->cs_flags |= CPU_SET_ROOT;

	return (0);
}

int
cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
{
	int error;

	KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
	KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));

	cpuset_ref(set);
	error = cpuset_setproc(p->p_pid, set, NULL);
	if (error)
		return (error);
	cpuset_rel(set);
	return (0);
}

/*
 * This is called once the final set of system cpus is known.  Modifies
 * the root set and all children and marks the root read-only.
 */
static void
cpuset_init(void *arg)
{
	cpuset_t mask;

	mask = all_cpus;
	if (cpuset_modify(cpuset_zero, &mask))
		panic("Can't set initial cpuset mask.\n");
	cpuset_zero->cs_flags |= CPU_SET_RDONLY;
}
SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);

#ifndef _SYS_SYSPROTO_H_
struct cpuset_args {
	cpusetid_t	*setid;
};
#endif
int
sys_cpuset(struct thread *td, struct cpuset_args *uap)
{
	struct cpuset *root;
	struct cpuset *set;
	int error;

	thread_lock(td);
	root = cpuset_refroot(td->td_cpuset);
	thread_unlock(td);
	error = cpuset_create(&set, root, &root->cs_mask);
	cpuset_rel(root);
	if (error)
		return (error);
	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
	if (error == 0)
		error = cpuset_setproc(-1, set, NULL);
	cpuset_rel(set);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_setid_args {
	cpuwhich_t	which;
	id_t		id;
	cpusetid_t	setid;
};
#endif
int
sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
{

	return (kern_cpuset_setid(td, uap->which, uap->id, uap->setid));
}

int
kern_cpuset_setid(struct thread *td, cpuwhich_t which,
    id_t id, cpusetid_t setid)
{
	struct cpuset *set;
	int error;

	/*
	 * Presently we only support per-process sets.
	 */
	if (which != CPU_WHICH_PID)
		return (EINVAL);
	set = cpuset_lookup(setid, td);
	if (set == NULL)
		return (ESRCH);
	error = cpuset_setproc(id, set, NULL);
	cpuset_rel(set);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_getid_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	cpusetid_t	*setid;
};
#endif
int
sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
{

	return (kern_cpuset_getid(td, uap->level, uap->which, uap->id,
	    uap->setid));
}

int
kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which,
    id_t id, cpusetid_t *setid)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *ttd;
	struct proc *p;
	cpusetid_t tmpid;
	int error;

	if (level == CPU_LEVEL_WHICH && which != CPU_WHICH_CPUSET)
		return (EINVAL);
	error = cpuset_which(which, id, &p, &ttd, &set);
	if (error)
		return (error);
	switch (which) {
	case CPU_WHICH_TID:
	case CPU_WHICH_PID:
		thread_lock(ttd);
		set = cpuset_refbase(ttd->td_cpuset);
		thread_unlock(ttd);
		PROC_UNLOCK(p);
		break;
	case CPU_WHICH_CPUSET:
	case CPU_WHICH_JAIL:
		break;
	case CPU_WHICH_IRQ:
	case CPU_WHICH_DOMAIN:
		return (EINVAL);
	}
	switch (level) {
	case CPU_LEVEL_ROOT:
		nset = cpuset_refroot(set);
		cpuset_rel(set);
		set = nset;
		break;
	case CPU_LEVEL_CPUSET:
		break;
	case CPU_LEVEL_WHICH:
		break;
	}
	tmpid = set->cs_id;
	cpuset_rel(set);
	if (error == 0)
		error = copyout(&tmpid, setid, sizeof(tmpid));

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_getaffinity_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	size_t		cpusetsize;
	cpuset_t	*mask;
};
#endif
int
sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
{

	return (kern_cpuset_getaffinity(td, uap->level, uap->which,
	    uap->id, uap->cpusetsize, uap->mask));
}

int
kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
    id_t id, size_t cpusetsize, cpuset_t *maskp)
{
	struct thread *ttd;
	struct cpuset *nset;
	struct cpuset *set;
	struct proc *p;
	cpuset_t *mask;
	int error;
	size_t size;

	if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
		return (ERANGE);
	/* In Capability mode, you can only get your own CPU set. */
	if (IN_CAPABILITY_MODE(td)) {
	    if (level != CPU_LEVEL_WHICH)
		return (ECAPMODE);
	    if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
		return (ECAPMODE);
	    if (id != -1)
		return (ECAPMODE);
	}
	size = cpusetsize;
	mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
	error = cpuset_which(which, id, &p, &ttd, &set);
	if (error)
		goto out;
	switch (level) {
	case CPU_LEVEL_ROOT:
	case CPU_LEVEL_CPUSET:
		switch (which) {
		case CPU_WHICH_TID:
		case CPU_WHICH_PID:
			thread_lock(ttd);
			set = cpuset_ref(ttd->td_cpuset);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_CPUSET:
		case CPU_WHICH_JAIL:
			break;
		case CPU_WHICH_IRQ:
		case CPU_WHICH_INTRHANDLER:
		case CPU_WHICH_ITHREAD:
		case CPU_WHICH_DOMAIN:
			error = EINVAL;
			goto out;
		}
		if (level == CPU_LEVEL_ROOT)
			nset = cpuset_refroot(set);
		else
			nset = cpuset_refbase(set);
		CPU_COPY(&nset->cs_mask, mask);
		cpuset_rel(nset);
		break;
	case CPU_LEVEL_WHICH:
		switch (which) {
		case CPU_WHICH_TID:
			thread_lock(ttd);
			CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_PID:
			FOREACH_THREAD_IN_PROC(p, ttd) {
				thread_lock(ttd);
				CPU_OR(mask, &ttd->td_cpuset->cs_mask);
				thread_unlock(ttd);
			}
			break;
		case CPU_WHICH_CPUSET:
		case CPU_WHICH_JAIL:
			CPU_COPY(&set->cs_mask, mask);
			break;
		case CPU_WHICH_IRQ:
		case CPU_WHICH_INTRHANDLER:
		case CPU_WHICH_ITHREAD:
			error = intr_getaffinity(id, which, mask);
			break;
		case CPU_WHICH_DOMAIN:
			if (id < 0 || id >= MAXMEMDOM)
				error = ESRCH;
			else
				CPU_COPY(&cpuset_domain[id], mask);
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
	if (set)
		cpuset_rel(set);
	if (p)
		PROC_UNLOCK(p);
	if (error == 0)
		error = copyout(mask, maskp, size);
out:
	free(mask, M_TEMP);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_setaffinity_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	size_t		cpusetsize;
	const cpuset_t	*mask;
};
#endif
int
sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
{

	return (kern_cpuset_setaffinity(td, uap->level, uap->which,
	    uap->id, uap->cpusetsize, uap->mask));
}

int
kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
    id_t id, size_t cpusetsize, const cpuset_t *maskp)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *ttd;
	struct proc *p;
	cpuset_t *mask;
	int error;

	if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
		return (ERANGE);
	/* In Capability mode, you can only set your own CPU set. */
	if (IN_CAPABILITY_MODE(td)) {
	    if (level != CPU_LEVEL_WHICH)
		return (ECAPMODE);
	    if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
		return (ECAPMODE);
	    if (id != -1)
		return (ECAPMODE);
	}
	mask = malloc(cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
	error = copyin(maskp, mask, cpusetsize);
	if (error)
		goto out;
	/*
	 * Verify that no high bits are set.
	 */
	if (cpusetsize > sizeof(cpuset_t)) {
		char *end;
		char *cp;

		end = cp = (char *)&mask->__bits;
		end += cpusetsize;
		cp += sizeof(cpuset_t);
		while (cp != end)
			if (*cp++ != 0) {
				error = EINVAL;
				goto out;
			}

	}
	switch (level) {
	case CPU_LEVEL_ROOT:
	case CPU_LEVEL_CPUSET:
		error = cpuset_which(which, id, &p, &ttd, &set);
		if (error)
			break;
		switch (which) {
		case CPU_WHICH_TID:
		case CPU_WHICH_PID:
			thread_lock(ttd);
			set = cpuset_ref(ttd->td_cpuset);
			thread_unlock(ttd);
			PROC_UNLOCK(p);
			break;
		case CPU_WHICH_CPUSET:
		case CPU_WHICH_JAIL:
			break;
		case CPU_WHICH_IRQ:
		case CPU_WHICH_INTRHANDLER:
		case CPU_WHICH_ITHREAD:
		case CPU_WHICH_DOMAIN:
			error = EINVAL;
			goto out;
		}
		if (level == CPU_LEVEL_ROOT)
			nset = cpuset_refroot(set);
		else
			nset = cpuset_refbase(set);
		error = cpuset_modify(nset, mask);
		cpuset_rel(nset);
		cpuset_rel(set);
		break;
	case CPU_LEVEL_WHICH:
		switch (which) {
		case CPU_WHICH_TID:
			error = cpuset_setthread(id, mask);
			break;
		case CPU_WHICH_PID:
			error = cpuset_setproc(id, NULL, mask);
			break;
		case CPU_WHICH_CPUSET:
		case CPU_WHICH_JAIL:
			error = cpuset_which(which, id, &p, &ttd, &set);
			if (error == 0) {
				error = cpuset_modify(set, mask);
				cpuset_rel(set);
			}
			break;
		case CPU_WHICH_IRQ:
		case CPU_WHICH_INTRHANDLER:
		case CPU_WHICH_ITHREAD:
			error = intr_setaffinity(id, which, mask);
			break;
		default:
			error = EINVAL;
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
out:
	free(mask, M_TEMP);
	return (error);
}

#ifdef DDB
void
ddb_display_cpuset(const cpuset_t *set)
{
	int cpu, once;

	for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) {
		if (CPU_ISSET(cpu, set)) {
			if (once == 0) {
				db_printf("%d", cpu);
				once = 1;
			} else
				db_printf(",%d", cpu);
		}
	}
	if (once == 0)
		db_printf("<none>");
}

DB_SHOW_COMMAND(cpusets, db_show_cpusets)
{
	struct cpuset *set;

	LIST_FOREACH(set, &cpuset_ids, cs_link) {
		db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
		    set, set->cs_id, set->cs_ref, set->cs_flags,
		    (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
		db_printf("  mask=");
		ddb_display_cpuset(&set->cs_mask);
		db_printf("\n");
		if (db_pager_quit)
			break;
	}
}
#endif /* DDB */
