1139804Simp/*-
2184601Sjhb * Copyright (c) 2001, John Baldwin <jhb@FreeBSD.org>.
3184601Sjhb * All rights reserved.
425164Speter *
525164Speter * Redistribution and use in source and binary forms, with or without
625164Speter * modification, are permitted provided that the following conditions
725164Speter * are met:
825164Speter * 1. Redistributions of source code must retain the above copyright
925164Speter *    notice, this list of conditions and the following disclaimer.
1076078Sjhb * 2. Redistributions in binary form must reproduce the above copyright
1176078Sjhb *    notice, this list of conditions and the following disclaimer in the
1276078Sjhb *    documentation and/or other materials provided with the distribution.
1325164Speter *
14184601Sjhb * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
1525164Speter * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1625164Speter * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17184601Sjhb * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18184601Sjhb * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19184601Sjhb * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20184601Sjhb * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21184601Sjhb * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22184601Sjhb * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23184601Sjhb * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24184601Sjhb * SUCH DAMAGE.
2525164Speter */
2625164Speter
2776078Sjhb/*
2876078Sjhb * This module holds the global variables and machine independent functions
2976440Sjhb * used for the kernel SMP support.
3076078Sjhb */
3125164Speter
32116182Sobrien#include <sys/cdefs.h>
33116182Sobrien__FBSDID("$FreeBSD: stable/10/sys/kern/subr_smp.c 331910 2018-04-03 07:52:06Z avg $");
34116182Sobrien
3528743Sbde#include <sys/param.h>
3625164Speter#include <sys/systm.h>
3776440Sjhb#include <sys/kernel.h>
3876078Sjhb#include <sys/ktr.h>
3928808Speter#include <sys/proc.h>
40126763Snjl#include <sys/bus.h>
4176078Sjhb#include <sys/lock.h>
4267365Sjhb#include <sys/mutex.h>
4376440Sjhb#include <sys/pcpu.h>
44243046Sjeff#include <sys/sched.h>
4576078Sjhb#include <sys/smp.h>
4676078Sjhb#include <sys/sysctl.h>
4725164Speter
48171191Sjhb#include <machine/cpu.h>
4991778Sjake#include <machine/smp.h>
5091778Sjake
51134591Sjulian#include "opt_sched.h"
52134591Sjulian
#ifdef SMP
/* CPUs that have acknowledged a stop request (IPI_STOP/IPI_STOP_HARD). */
volatile cpuset_t stopped_cpus;
/* CPUs that have been told to restart; each clears its own bit on resume. */
volatile cpuset_t started_cpus;
/* CPUs parked by IPI_SUSPEND (x86 suspend/resume path). */
volatile cpuset_t suspended_cpus;
/* NOTE(review): masks below are maintained by MD code — confirm semantics. */
cpuset_t hlt_cpus_mask;
cpuset_t logical_cpus_mask;

/* Optional hook run by a stopped CPU when it is restarted. */
void (*cpustop_restartfunc)(void);
#endif

static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS);

/* This is used in modules that need to work in both SMP and UP. */
cpuset_t all_cpus;

int mp_ncpus;			/* number of CPUs detected */
/* export this for libkvm consumers. */
int mp_maxcpus = MAXCPU;

volatile int smp_started;	/* nonzero once SMP startup has completed */
u_int mp_maxid;			/* highest valid CPU ID */

static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL,
    "Kernel SMP");

SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0,
    "Max CPU ID.");

SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus,
    0, "Max number of CPUs that the system was compiled for.");

SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD | CTLTYPE_INT, NULL, 0,
    sysctl_kern_smp_active, "I", "Indicates system is running in SMP mode");

int smp_disabled = 0;	/* has smp been disabled? */
SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD,
    &smp_disabled, 0, "SMP has been disabled from the loader");
TUNABLE_INT("kern.smp.disabled", &smp_disabled);

int smp_cpus = 1;	/* how many cpu's running */
SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0,
    "Number of CPUs online");

int smp_topology = 0;	/* Which topology we're using. */
SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RD, &smp_topology, 0,
    "Topology override setting; 0 is default provided by hardware.");
TUNABLE_INT("kern.smp.topology", &smp_topology);
100176734Sjeff
#ifdef SMP
/* Enable forwarding of a signal to a process running on a different CPU */
static int forward_signal_enabled = 1;
SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
	   &forward_signal_enabled, 0,
	   "Forwarding of a signal to a process on a different CPU");

/*
 * Variables needed for SMP rendezvous.  These are written by the master
 * CPU under smp_ipi_mtx and snapshotted by the slaves in
 * smp_rendezvous_action() after an acquire barrier.
 */
static volatile int smp_rv_ncpus;
static void (*volatile smp_rv_setup_func)(void *arg);
static void (*volatile smp_rv_action_func)(void *arg);
static void (*volatile smp_rv_teardown_func)(void *arg);
static void *volatile smp_rv_func_arg;
/* Per-phase arrival counters: entry, post-setup, post-action, fully done. */
static volatile int smp_rv_waiters[4];

/*
 * Shared mutex to restrict busywaits between smp_rendezvous() and
 * smp(_targeted)_tlb_shootdown().  A deadlock occurs if both of these
 * functions trigger at once and cause multiple CPUs to busywait with
 * interrupts disabled.
 */
struct mtx smp_ipi_mtx;
123134227Speter
/*
 * Let the MD SMP code initialize mp_maxid very early if it can.
 */
static void
mp_setmaxid(void *dummy)
{
	/* Defer entirely to the machine-dependent code. */
	cpu_mp_setmaxid();
}
/* Runs at SI_SUB_TUNABLES so mp_maxid is valid for later SYSINITs. */
SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL);
13391673Sjeff
/*
 * Call the MD SMP initialization code.
 */
static void
mp_start(void *dummy)
{

	mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN);

	/* Probe for MP hardware. */
	if (smp_disabled != 0 || cpu_mp_probe() == 0) {
		/* UP fallback: only the boot CPU is present. */
		mp_ncpus = 1;
		CPU_SETOF(PCPU_GET(cpuid), &all_cpus);
		return;
	}

	cpu_mp_start();
	printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
	    mp_ncpus);
	cpu_mp_announce();
}
SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL);
15671525Sjhb
/*
 * Poke a thread that is currently running on another CPU so that it
 * notices a freshly posted signal.  Caller must hold the thread lock.
 */
void
forward_signal(struct thread *td)
{
	int id;

	/*
	 * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHECK on
	 * this thread, so all we need to do is poke it if it is currently
	 * executing so that it executes ast().
	 */
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT(TD_IS_RUNNING(td),
	    ("forward_signal: thread is not TDS_RUNNING"));

	CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc);

	/* IPIs are not useful before SMP is up, while cold, or in panic. */
	if (!smp_started || cold || panicstr)
		return;
	if (!forward_signal_enabled)
		return;

	/* No need to IPI ourself. */
	if (td == curthread)
		return;

	/* Thread may have been switched out since the check above. */
	id = td->td_oncpu;
	if (id == NOCPU)
		return;
	ipi_cpu(id, IPI_AST);
}
18734021Stegge
/*
 * When called the executing CPU will send an IPI to all other CPUs
 *  requesting that they halt execution.
 *
 * Usually (but not necessarily) called with 'other_cpus' as its arg.
 *
 *  - Signals all CPUs in map to stop.
 *  - Waits for each to stop.
 *
 * Returns:
 *  -1: error
 *   0: NA
 *   1: ok
 *
 */
static int
generic_stop_cpus(cpuset_t map, u_int type)
{
#ifdef KTR
	char cpusetbuf[CPUSETBUFSIZ];
#endif
	/* CPU ID of the single in-flight stopper, or NOCPU when idle. */
	static volatile u_int stopping_cpu = NOCPU;
	int i;
	volatile cpuset_t *cpus;

	KASSERT(
#if defined(__amd64__) || defined(__i386__)
	    type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND,
#else
	    type == IPI_STOP || type == IPI_STOP_HARD,
#endif
	    ("%s: invalid stop type", __func__));

	if (!smp_started)
		return (0);

	CTR2(KTR_SMP, "stop_cpus(%s) with %u type",
	    cpusetobj_strprint(cpusetbuf, &map), type);

#if defined(__amd64__) || defined(__i386__)
	/*
	 * When suspending, ensure there are no IPIs in progress.
	 * IPIs that have been issued, but not yet delivered (e.g.
	 * not pending on a vCPU when running under virtualization)
	 * will be lost, violating FreeBSD's assumption of reliable
	 * IPI delivery.
	 */
	if (type == IPI_SUSPEND)
		mtx_lock_spin(&smp_ipi_mtx);
#endif

	/*
	 * Serialize stoppers: only one CPU may be stopping others at a
	 * time.  Re-entry by the same CPU (e.g. from a nested panic)
	 * proceeds without waiting.
	 */
	if (stopping_cpu != PCPU_GET(cpuid))
		while (atomic_cmpset_int(&stopping_cpu, NOCPU,
		    PCPU_GET(cpuid)) == 0)
			while (stopping_cpu != NOCPU)
				cpu_spinwait(); /* spin */

	/* send the stop IPI to all CPUs in map */
	ipi_selected(map, type);

#if defined(__amd64__) || defined(__i386__)
	if (type == IPI_SUSPEND)
		cpus = &suspended_cpus;
	else
#endif
		cpus = &stopped_cpus;

	/* Spin until every target acknowledges, with a crude timeout. */
	i = 0;
	while (!CPU_SUBSET(cpus, &map)) {
		/* spin */
		cpu_spinwait();
		i++;
		if (i == 100000000) {
			printf("timeout stopping cpus\n");
			break;
		}
	}

#if defined(__amd64__) || defined(__i386__)
	if (type == IPI_SUSPEND)
		mtx_unlock_spin(&smp_ipi_mtx);
#endif

	stopping_cpu = NOCPU;
	return (1);
}
27471525Sjhb
/* Stop the CPUs in 'map' with a maskable IPI_STOP. */
int
stop_cpus(cpuset_t map)
{

	return (generic_stop_cpus(map, IPI_STOP));
}
281196196Sattilio
/* Stop the CPUs in 'map' with a non-maskable IPI_STOP_HARD. */
int
stop_cpus_hard(cpuset_t map)
{

	return (generic_stop_cpus(map, IPI_STOP_HARD));
}
288196196Sattilio
#if defined(__amd64__) || defined(__i386__)
/* Park the CPUs in 'map' for suspend (x86 only). */
int
suspend_cpus(cpuset_t map)
{

	return (generic_stop_cpus(map, IPI_SUSPEND));
}
#endif
297189903Sjkim
/*
 * Called by a CPU to restart stopped CPUs.
 *
 * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
 *
 *  - Signals all CPUs in map to restart.
 *  - Waits for each to restart.
 *
 * Returns:
 *  -1: error
 *   0: NA
 *   1: ok
 */
static int
generic_restart_cpus(cpuset_t map, u_int type)
{
#ifdef KTR
	char cpusetbuf[CPUSETBUFSIZ];
#endif
	volatile cpuset_t *cpus;

	KASSERT(
#if defined(__amd64__) || defined(__i386__)
	    type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND,
#else
	    type == IPI_STOP || type == IPI_STOP_HARD,
#endif
	    ("%s: invalid stop type", __func__));

	if (!smp_started)
		return 0;

	CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));

	/* Select the acknowledgement mask the targets will clear. */
#if defined(__amd64__) || defined(__i386__)
	if (type == IPI_SUSPEND)
		cpus = &resuming_cpus;
	else
#endif
		cpus = &stopped_cpus;

	/* signal other cpus to restart */
#if defined(__amd64__) || defined(__i386__)
	if (type == IPI_SUSPEND)
		CPU_COPY_STORE_REL(&map, &toresume_cpus);
	else
#endif
		CPU_COPY_STORE_REL(&map, &started_cpus);

	/* wait for each to clear its bit */
	while (CPU_OVERLAP(cpus, &map))
		cpu_spinwait();

	return 1;
}
35371525Sjhb
/* Restart CPUs previously stopped with stop_cpus()/stop_cpus_hard(). */
int
restart_cpus(cpuset_t map)
{

	return (generic_restart_cpus(map, IPI_STOP));
}
360255726Sgibbs
#if defined(__amd64__) || defined(__i386__)
/* Resume CPUs previously parked with suspend_cpus() (x86 only). */
int
resume_cpus(cpuset_t map)
{

	return (generic_restart_cpus(map, IPI_SUSPEND));
}
#endif
369255726Sgibbs
37034021Stegge/*
37148924Smsmith * All-CPU rendezvous.  CPUs are signalled, all execute the setup function
37248924Smsmith * (if specified), rendezvous, execute the action function (if specified),
37348924Smsmith * rendezvous again, execute the teardown function (if specified), and then
37448924Smsmith * resume.
37548924Smsmith *
37648924Smsmith * Note that the supplied external functions _must_ be reentrant and aware
37748924Smsmith * that they are running in parallel and in an unknown lock context.
37848924Smsmith */
37948924Smsmithvoid
38048924Smsmithsmp_rendezvous_action(void)
38148924Smsmith{
382222254Sjhb	struct thread *td;
383222032Sjhb	void *local_func_arg;
384222032Sjhb	void (*local_setup_func)(void*);
385222032Sjhb	void (*local_action_func)(void*);
386222032Sjhb	void (*local_teardown_func)(void*);
387222254Sjhb#ifdef INVARIANTS
388222254Sjhb	int owepreempt;
389222254Sjhb#endif
390175057Sjhb
391171191Sjhb	/* Ensure we have up-to-date values. */
392171191Sjhb	atomic_add_acq_int(&smp_rv_waiters[0], 1);
393182292Sjhb	while (smp_rv_waiters[0] < smp_rv_ncpus)
394171191Sjhb		cpu_spinwait();
395171191Sjhb
396222032Sjhb	/* Fetch rendezvous parameters after acquire barrier. */
397222032Sjhb	local_func_arg = smp_rv_func_arg;
398222032Sjhb	local_setup_func = smp_rv_setup_func;
399222032Sjhb	local_action_func = smp_rv_action_func;
400222032Sjhb	local_teardown_func = smp_rv_teardown_func;
401222032Sjhb
402222032Sjhb	/*
403222254Sjhb	 * Use a nested critical section to prevent any preemptions
404222254Sjhb	 * from occurring during a rendezvous action routine.
405222254Sjhb	 * Specifically, if a rendezvous handler is invoked via an IPI
406222254Sjhb	 * and the interrupted thread was in the critical_exit()
407222254Sjhb	 * function after setting td_critnest to 0 but before
408222254Sjhb	 * performing a deferred preemption, this routine can be
409222254Sjhb	 * invoked with td_critnest set to 0 and td_owepreempt true.
410222254Sjhb	 * In that case, a critical_exit() during the rendezvous
411222254Sjhb	 * action would trigger a preemption which is not permitted in
412222254Sjhb	 * a rendezvous action.  To fix this, wrap all of the
413222254Sjhb	 * rendezvous action handlers in a critical section.  We
414222254Sjhb	 * cannot use a regular critical section however as having
415222254Sjhb	 * critical_exit() preempt from this routine would also be
416222254Sjhb	 * problematic (the preemption must not occur before the IPI
417222266Sjhb	 * has been acknowledged via an EOI).  Instead, we
418222254Sjhb	 * intentionally ignore td_owepreempt when leaving the
419222266Sjhb	 * critical section.  This should be harmless because we do
420222266Sjhb	 * not permit rendezvous action routines to schedule threads,
421222266Sjhb	 * and thus td_owepreempt should never transition from 0 to 1
422222254Sjhb	 * during this routine.
423222254Sjhb	 */
424222254Sjhb	td = curthread;
425222254Sjhb	td->td_critnest++;
426222254Sjhb#ifdef INVARIANTS
427222254Sjhb	owepreempt = td->td_owepreempt;
428222254Sjhb#endif
429222254Sjhb
430222254Sjhb	/*
431222032Sjhb	 * If requested, run a setup function before the main action
432222032Sjhb	 * function.  Ensure all CPUs have completed the setup
433222032Sjhb	 * function before moving on to the action function.
434222032Sjhb	 */
435173444Sups	if (local_setup_func != smp_no_rendevous_barrier) {
436173444Sups		if (smp_rv_setup_func != NULL)
437173444Sups			smp_rv_setup_func(smp_rv_func_arg);
438173444Sups		atomic_add_int(&smp_rv_waiters[1], 1);
439182292Sjhb		while (smp_rv_waiters[1] < smp_rv_ncpus)
440173444Sups                	cpu_spinwait();
441173444Sups	}
442171191Sjhb
443173444Sups	if (local_action_func != NULL)
444173444Sups		local_action_func(local_func_arg);
445171191Sjhb
446222254Sjhb	if (local_teardown_func != smp_no_rendevous_barrier) {
447224527Savg		/*
448224527Savg		 * Signal that the main action has been completed.  If a
449224527Savg		 * full exit rendezvous is requested, then all CPUs will
450224527Savg		 * wait here until all CPUs have finished the main action.
451224527Savg		 */
452224527Savg		atomic_add_int(&smp_rv_waiters[2], 1);
453224527Savg		while (smp_rv_waiters[2] < smp_rv_ncpus)
454222254Sjhb			cpu_spinwait();
455175057Sjhb
456222254Sjhb		if (local_teardown_func != NULL)
457222254Sjhb			local_teardown_func(local_func_arg);
458222254Sjhb	}
459222254Sjhb
460224527Savg	/*
461224527Savg	 * Signal that the rendezvous is fully completed by this CPU.
462224527Savg	 * This means that no member of smp_rv_* pseudo-structure will be
463224527Savg	 * accessed by this target CPU after this point; in particular,
464224527Savg	 * memory pointed by smp_rv_func_arg.
465224527Savg	 */
466224527Savg	atomic_add_int(&smp_rv_waiters[3], 1);
467224527Savg
468222254Sjhb	td->td_critnest--;
469222254Sjhb	KASSERT(owepreempt == td->td_owepreempt,
470222254Sjhb	    ("rendezvous action changed td_owepreempt"));
47148924Smsmith}
47248924Smsmith
/*
 * Master side of the rendezvous: publish the handlers, IPI the target
 * CPUs, participate if the current CPU is in 'map', and wait until every
 * participant has fully finished before releasing smp_ipi_mtx.
 */
void
smp_rendezvous_cpus(cpuset_t map,
	void (* setup_func)(void *),
	void (* action_func)(void *),
	void (* teardown_func)(void *),
	void *arg)
{
	int curcpumap, i, ncpus = 0;

	/* See the comments in the !SMP case. */
	if (!smp_started) {
		spinlock_enter();
		if (setup_func != NULL)
			setup_func(arg);
		if (action_func != NULL)
			action_func(arg);
		if (teardown_func != NULL)
			teardown_func(arg);
		spinlock_exit();
		return;
	}

	/* Count the participating CPUs. */
	CPU_FOREACH(i) {
		if (CPU_ISSET(i, &map))
			ncpus++;
	}
	if (ncpus == 0)
		panic("ncpus is 0 with non-zero map");

	mtx_lock_spin(&smp_ipi_mtx);

	/* Pass rendezvous parameters via global variables. */
	smp_rv_ncpus = ncpus;
	smp_rv_setup_func = setup_func;
	smp_rv_action_func = action_func;
	smp_rv_teardown_func = teardown_func;
	smp_rv_func_arg = arg;
	smp_rv_waiters[1] = 0;
	smp_rv_waiters[2] = 0;
	smp_rv_waiters[3] = 0;
	/* Release barrier: publish all of the above before waking slaves. */
	atomic_store_rel_int(&smp_rv_waiters[0], 0);

	/*
	 * Signal other processors, which will enter the IPI with
	 * interrupts off.
	 */
	curcpumap = CPU_ISSET(curcpu, &map);
	CPU_CLR(curcpu, &map);
	ipi_selected(map, IPI_RENDEZVOUS);

	/* Check if the current CPU is in the map */
	if (curcpumap != 0)
		smp_rendezvous_action();

	/*
	 * Ensure that the master CPU waits for all the other
	 * CPUs to finish the rendezvous, so that smp_rv_*
	 * pseudo-structure and the arg are guaranteed to not
	 * be in use.
	 */
	while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus)
		cpu_spinwait();

	mtx_unlock_spin(&smp_ipi_mtx);
}
538123125Sjhb
/* Rendezvous across every CPU in the system. */
void
smp_rendezvous(void (* setup_func)(void *),
	       void (* action_func)(void *),
	       void (* teardown_func)(void *),
	       void *arg)
{
	smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg);
}
547179230Sjb
/* Backing storage for the cpu_group topology tree built below. */
static struct cpu_group group[MAXCPU];

/*
 * Build (or fetch from the MD code) the scheduler CPU topology tree,
 * honoring the kern.smp.topology debug override, and sanity-check it.
 */
struct cpu_group *
smp_topo(void)
{
	char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
	struct cpu_group *top;

	/*
	 * Check for a fake topology request for debugging purposes.
	 */
	switch (smp_topology) {
	case 1:
		/* Dual core with no sharing.  */
		top = smp_topo_1level(CG_SHARE_NONE, 2, 0);
		break;
	case 2:
		/* No topology, all cpus are equal. */
		top = smp_topo_none();
		break;
	case 3:
		/* Dual core with shared L2.  */
		top = smp_topo_1level(CG_SHARE_L2, 2, 0);
		break;
	case 4:
		/* quad core, shared l3 among each package, private l2.  */
		top = smp_topo_1level(CG_SHARE_L3, 4, 0);
		break;
	case 5:
		/* quad core,  2 dualcore parts on each package share l2.  */
		top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0);
		break;
	case 6:
		/* Single-core 2xHTT */
		top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT);
		break;
	case 7:
		/* quad core with a shared l3, 8 threads sharing L2.  */
		top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8,
		    CG_FLAG_SMT);
		break;
	default:
		/* Default, ask the system what it wants. */
		top = cpu_topo();
		break;
	}
	/*
	 * Verify the returned topology: the root must cover every CPU
	 * exactly once.
	 */
	if (top->cg_count != mp_ncpus)
		panic("Built bad topology at %p.  CPU count %d != %d",
		    top, top->cg_count, mp_ncpus);
	if (CPU_CMP(&top->cg_mask, &all_cpus))
		panic("Built bad topology at %p.  CPU mask (%s) != (%s)",
		    top, cpusetobj_strprint(cpusetbuf, &top->cg_mask),
		    cpusetobj_strprint(cpusetbuf2, &all_cpus));
	return (top);
}
606123125Sjhb
/* Build a flat topology: one root group containing every CPU. */
struct cpu_group *
smp_topo_none(void)
{
	struct cpu_group *top;

	top = &group[0];
	top->cg_parent = NULL;
	top->cg_child = NULL;
	top->cg_mask = all_cpus;
	top->cg_count = mp_ncpus;
	top->cg_children = 0;
	top->cg_level = CG_SHARE_NONE;
	top->cg_flags = 0;

	return (top);
}
623176734Sjeff
/*
 * Attach a leaf group of 'count' consecutive CPUs beginning at 'start'
 * to 'parent', propagating the new mask and count up the ancestor
 * chain.  Returns the first CPU ID after the leaf.
 */
static int
smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share,
    int count, int flags, int start)
{
	char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
	cpuset_t mask;
	int i;

	CPU_ZERO(&mask);
	for (i = 0; i < count; i++, start++)
		CPU_SET(start, &mask);
	child->cg_parent = parent;
	child->cg_child = NULL;
	child->cg_children = 0;
	child->cg_level = share;
	child->cg_count = count;
	child->cg_flags = flags;
	child->cg_mask = mask;
	parent->cg_children++;
	/* Merge the leaf's CPUs into every ancestor, checking overlap. */
	for (; parent != NULL; parent = parent->cg_parent) {
		if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask))
			panic("Duplicate children in %p.  mask (%s) child (%s)",
			    parent,
			    cpusetobj_strprint(cpusetbuf, &parent->cg_mask),
			    cpusetobj_strprint(cpusetbuf2, &child->cg_mask));
		CPU_OR(&parent->cg_mask, &child->cg_mask);
		parent->cg_count += child->cg_count;
	}

	return (start);
}
655176734Sjeff
656176734Sjeffstruct cpu_group *
657176734Sjeffsmp_topo_1level(int share, int count, int flags)
658176734Sjeff{
659176734Sjeff	struct cpu_group *child;
660176734Sjeff	struct cpu_group *top;
661176734Sjeff	int packages;
662176734Sjeff	int cpu;
663176734Sjeff	int i;
664176734Sjeff
665176734Sjeff	cpu = 0;
666176734Sjeff	top = &group[0];
667176734Sjeff	packages = mp_ncpus / count;
668176734Sjeff	top->cg_child = child = &group[1];
669176734Sjeff	top->cg_level = CG_SHARE_NONE;
670176734Sjeff	for (i = 0; i < packages; i++, child++)
671176734Sjeff		cpu = smp_topo_addleaf(top, child, share, count, flags, cpu);
672176734Sjeff	return (top);
673176734Sjeff}
674176734Sjeff
/*
 * Build a three-tier topology: root -> L2 groups -> L1 leaf groups,
 * with 'l2count' L1 groups per L2 group and 'l1count' CPUs per leaf.
 */
struct cpu_group *
smp_topo_2level(int l2share, int l2count, int l1share, int l1count,
    int l1flags)
{
	struct cpu_group *top;
	struct cpu_group *l1g;
	struct cpu_group *l2g;
	int cpu;
	int i;
	int j;

	cpu = 0;
	top = &group[0];
	l2g = &group[1];
	top->cg_child = l2g;
	top->cg_level = CG_SHARE_NONE;
	top->cg_children = mp_ncpus / (l2count * l1count);
	/* L1 groups are laid out in group[] right after the L2 groups. */
	l1g = l2g + top->cg_children;
	for (i = 0; i < top->cg_children; i++, l2g++) {
		l2g->cg_parent = top;
		l2g->cg_child = l1g;
		l2g->cg_level = l2share;
		for (j = 0; j < l2count; j++, l1g++)
			cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count,
			    l1flags, cpu);
	}
	return (top);
}
703176734Sjeff
704176734Sjeff
/*
 * Descend the topology tree from 'top' to the leaf group containing
 * 'cpu'.  Returns NULL if the CPU is not covered by the tree.
 */
struct cpu_group *
smp_topo_find(struct cpu_group *top, int cpu)
{
	struct cpu_group *cg;
	cpuset_t mask;
	int children;
	int i;

	CPU_SETOF(cpu, &mask);
	cg = top;
	for (;;) {
		if (!CPU_OVERLAP(&cg->cg_mask, &mask))
			return (NULL);
		if (cg->cg_children == 0)
			return (cg);
		children = cg->cg_children;
		/* Descend into whichever child covers the CPU. */
		for (i = 0, cg = cg->cg_child; i < children; cg++, i++)
			if (CPU_OVERLAP(&cg->cg_mask, &mask))
				break;
	}
	return (NULL);	/* NOTREACHED */
}
727176734Sjeff#else /* !SMP */
728176734Sjeff
/* UP stub: run the handlers in order on the only CPU. */
void
smp_rendezvous_cpus(cpuset_t map,
	void (*setup_func)(void *),
	void (*action_func)(void *),
	void (*teardown_func)(void *),
	void *arg)
{
	/*
	 * In the !SMP case we just need to ensure the same initial conditions
	 * as the SMP case.
	 */
	spinlock_enter();
	if (setup_func != NULL)
		setup_func(arg);
	if (action_func != NULL)
		action_func(arg);
	if (teardown_func != NULL)
		teardown_func(arg);
	spinlock_exit();
}
749179230Sjb
/* UP stub: run the handlers in order on the only CPU. */
void
smp_rendezvous(void (*setup_func)(void *),
	       void (*action_func)(void *),
	       void (*teardown_func)(void *),
	       void *arg)
{

	/* See the comments in the smp_rendezvous_cpus() case. */
	spinlock_enter();
	if (setup_func != NULL)
		setup_func(arg);
	if (action_func != NULL)
		action_func(arg);
	if (teardown_func != NULL)
		teardown_func(arg);
	spinlock_exit();
}
767176734Sjeff
/*
 * Provide dummy SMP support for UP kernels.  Modules that need to use SMP
 * APIs will still work using this dummy support.
 */
static void
mp_setvariables_for_up(void *dummy)
{
	mp_ncpus = 1;
	mp_maxid = PCPU_GET(cpuid);
	CPU_SETOF(mp_maxid, &all_cpus);
	KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero"));
}
SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST,
    mp_setvariables_for_up, NULL);
782123125Sjhb#endif /* SMP */
783179230Sjb
/*
 * Sentinel setup/teardown handler: its address tells
 * smp_rendezvous_action() to skip the corresponding barrier entirely.
 * It is never actually invoked once SMP is running.
 */
void
smp_no_rendevous_barrier(void *dummy)
{
#ifdef SMP
	KASSERT((!smp_started),("smp_no_rendevous called and smp is started"));
#endif
}
791243046Sjeff
/*
 * Wait specified idle threads to switch once.  This ensures that even
 * preempted threads have cycled through the switch function once,
 * exiting their codepaths.  This allows us to change global pointers
 * with no other synchronization.
 */
int
quiesce_cpus(cpuset_t map, const char *wmesg, int prio)
{
	struct pcpu *pcpu;
	u_int gen[MAXCPU];
	int error;
	int cpu;

	error = 0;
	/* Snapshot each idle thread's context-switch generation count. */
	for (cpu = 0; cpu <= mp_maxid; cpu++) {
		if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
			continue;
		pcpu = pcpu_find(cpu);
		gen[cpu] = pcpu->pc_idlethread->td_generation;
	}
	/* Bind to each CPU in turn and wait for its generation to advance. */
	for (cpu = 0; cpu <= mp_maxid; cpu++) {
		if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
			continue;
		pcpu = pcpu_find(cpu);
		thread_lock(curthread);
		sched_bind(curthread, cpu);
		thread_unlock(curthread);
		while (gen[cpu] == pcpu->pc_idlethread->td_generation) {
			error = tsleep(quiesce_cpus, prio, wmesg, 1);
			/* Timeout (EWOULDBLOCK) just means "poll again". */
			if (error != EWOULDBLOCK)
				goto out;
			error = 0;
		}
	}
out:
	thread_lock(curthread);
	sched_unbind(curthread);
	thread_unlock(curthread);

	return (error);
}
834243046Sjeff
835243046Sjeffint
836243046Sjeffquiesce_all_cpus(const char *wmesg, int prio)
837243046Sjeff{
838243046Sjeff
839243046Sjeff	return quiesce_cpus(all_cpus, wmesg, prio);
840243046Sjeff}
841265606Sscottl
/* Extra care is taken with this sysctl because the data type is volatile */
static int
sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS)
{
	int error, active;

	/* Copy the volatile into a plain int before handing it to SYSCTL_OUT. */
	active = smp_started;
	error = SYSCTL_OUT(req, &active, sizeof(active));
	return (error);
}
852265606Sscottl
853