mp_machdep.c revision 130224
/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2003, by Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/amd64/amd64/mp_machdep.c 130224 2004-06-08 01:02:52Z peter $");

#include "opt_cpu.h"
#include "opt_kstack_pages.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#include <machine/apicreg.h>
#include <machine/clock.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/tss.h>

#define WARMBOOT_TARGET		0
#define WARMBOOT_OFF		(KERNBASE + 0x0467)
#define WARMBOOT_SEG		(KERNBASE + 0x0469)

#define CMOS_REG		(0x70)
#define CMOS_DATA		(0x71)
#define BIOS_RESET		(0x0f)
#define BIOS_WARM		(0x0a)
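
/*
 * Background on the constants above (standard PC-compatible BIOS
 * behavior): when a CPU comes out of reset, the BIOS inspects the
 * shutdown status byte, CMOS register 0x0f.  A value of 0x0a requests
 * a "warm start": the BIOS skips the usual POST and jumps through the
 * warm-reset vector, a far pointer kept at 0040:0067 (offset word at
 * 0x467, segment word at 0x469).  start_all_aps() below points that
 * vector at the trampoline; on systems with local APICs the STARTUP
 * IPI is what actually starts an AP, and this path is retained mainly
 * as the traditional MP-specification fallback.
 */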

/* lock region used by kernel profiling */
int	mcount_lock;

int	mp_naps;		/* # of application processors */
int	boot_cpu_id = -1;	/* designated BSP */
extern	int nkpt;

/*
 * CPU topology map data structures for HTT.
 */
static struct cpu_group mp_groups[MAXCPU];
static struct cpu_top mp_top;

/* AP uses this during bootstrap.  Do not staticize.  */
char *bootSTK;
static int bootAP;

/* Free these after use */
void *bootstacks[MAXCPU];

/* Hotwire a 0->4MB V==P mapping */
extern pt_entry_t *KPTphys;

/* SMP page table page */
extern pt_entry_t *SMPpt;

struct pcb stoppcbs[MAXCPU];

/* Variables needed for SMP TLB shootdown. */
vm_offset_t smp_tlb_addr1;
vm_offset_t smp_tlb_addr2;
volatile int smp_tlb_wait;
struct mtx smp_tlb_mtx;

extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);

/*
 * Local data and functions.
 */

static u_int logical_cpus;
static u_int logical_cpus_mask;

/* used to hold the APs until we are ready to release them */
static struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
static volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually
 * set up the APs.
 */
struct cpu_info {
	int	cpu_present:1;
	int	cpu_bsp:1;
} static cpu_info[MAXCPU];
static int cpu_apic_ids[MAXCPU];

static u_int boot_address;

static void	set_logical_apic_ids(void);
static int	start_all_aps(void);
static int	start_ap(int apic_id);
static void	release_aps(void *dummy);

static int	hlt_cpus_mask;
static int	hlt_logical_cpus;
static struct	sysctl_ctx_list logical_cpu_clist;
static u_int	bootMP_size;

void
mp_topology(void)
{
	struct cpu_group *group;
	int logical_cpus;
	int apic_id;
	int groups;
	int cpu;

	/* Build the smp_topology map. */
	/* Nothing to do if there is no HTT support. */
	if ((cpu_feature & CPUID_HTT) == 0)
		return;
	logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
	if (logical_cpus <= 1)
		return;
	group = &mp_groups[0];
	groups = 1;
	for (cpu = 0, apic_id = 0; apic_id < MAXCPU; apic_id++) {
		if (!cpu_info[apic_id].cpu_present)
			continue;
		/*
		 * If the current group has members and we're not a logical
		 * cpu, create a new group.
		 */
		if (group->cg_count != 0 && (apic_id % logical_cpus) == 0) {
			group++;
			groups++;
		}
		group->cg_count++;
		group->cg_mask |= 1 << cpu;
		cpu++;
	}

	mp_top.ct_count = groups;
	mp_top.ct_group = mp_groups;
	smp_topology = &mp_top;
}
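
/*
 * For example (a hypothetical configuration): a two-package HTT system
 * reporting two logical CPUs per package and present APIC IDs 0-3
 * yields two groups from the loop above: {cpu0, cpu1} with cg_mask 0x3
 * and {cpu2, cpu3} with cg_mask 0xc.
 */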


/*
 * Calculate usable address in base memory for AP trampoline code.
 */
u_int
mp_bootaddress(u_int basemem)
{

	bootMP_size = mptramp_end - mptramp_start;
	boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */
	if (((basemem * 1024) - boot_address) < bootMP_size)
		boot_address -= PAGE_SIZE;	/* not enough, lower by 4k */
	/* 3 levels of page table pages */
	mptramp_pagetables = boot_address - (PAGE_SIZE * 3);

	return mptramp_pagetables;
}
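
/*
 * The resulting layout at the top of base memory, per the arithmetic
 * above (physical addresses, highest first):
 *
 *	boot_address + bootMP_size	end of the trampoline code
 *	boot_address			mptramp_start copied here
 *	boot_address - 1 page		level 2 page table (2MB entries)
 *	boot_address - 2 pages		level 3 page table
 *	boot_address - 3 pages		level 4 page table; this is the
 *					return value, so the caller can
 *					exclude everything from here up.
 */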

void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id >= MAXCPU) {
		printf("SMP: CPU %d exceeds maximum CPU %d, ignoring\n",
		    apic_id, MAXCPU - 1);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	mp_ncpus++;
	if (apic_id > mp_maxid)
		mp_maxid = apic_id;
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

void
cpu_mp_setmaxid(void)
{

	/*
	 * mp_maxid should be already set by calls to cpu_add().
	 * Just sanity check its value here.
	 */
	if (mp_ncpus == 0)
		KASSERT(mp_maxid == 0,
		    ("%s: mp_ncpus is zero, but mp_maxid is not", __func__));
	else if (mp_ncpus == 1)
		mp_maxid = 0;
	else
		KASSERT(mp_maxid >= mp_ncpus - 1,
		    ("%s: counters out of sync: max %d, count %d", __func__,
			mp_maxid, mp_ncpus));
}

int
cpu_mp_probe(void)
{

	/*
	 * Always record BSP in CPU map so that the mbuf init code works
	 * correctly.
	 */
	all_cpus = 1;
	if (mp_ncpus == 0) {
		/*
		 * No CPUs were found, so this must be a UP system.  Set up
		 * the variables to represent a system with a single CPU
		 * whose id is 0.
		 */
		mp_ncpus = 1;
		return (0);
	}

	/* At least one CPU was found. */
	if (mp_ncpus == 1) {
		/*
		 * One CPU was found, so this must be a UP system with
		 * an I/O APIC.
		 */
		mp_maxid = 0;
		return (0);
	}

	/* At least two CPUs were found. */
	return (1);
}

/*
 * Initialize the IPI handlers and start up the APs.
 */
void
cpu_mp_start(void)
{
	int i;

	/* Initialize the logical ID to APIC ID table. */
	for (i = 0; i < MAXCPU; i++)
		cpu_apic_ids[i] = -1;

	/* Install an inter-CPU IPI for TLB invalidation */
	setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for forwarding hardclock() */
	setidt(IPI_HARDCLOCK, IDTVEC(hardclock), SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for forwarding statclock() */
	setidt(IPI_STATCLOCK, IDTVEC(statclock), SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for all-CPU rendezvous */
	setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for forcing an additional software trap */
	setidt(IPI_AST, IDTVEC(cpuast), SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for CPU stop/restart */
	setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0);

	mtx_init(&smp_tlb_mtx, "tlb", NULL, MTX_SPIN);

	/* Set boot_cpu_id if needed. */
	if (boot_cpu_id == -1) {
		boot_cpu_id = PCPU_GET(apic_id);
		cpu_info[boot_cpu_id].cpu_bsp = 1;
	} else
		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
		    ("BSP's APIC ID doesn't match boot_cpu_id"));
	cpu_apic_ids[0] = boot_cpu_id;

	/* Start each Application Processor */
	start_all_aps();

	/* Set up the initial logical CPUs info. */
	logical_cpus = logical_cpus_mask = 0;
	if (cpu_feature & CPUID_HTT)
		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;

	set_logical_apic_ids();
}


/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	int i, x;

	/* List CPUs */
	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
	for (i = 1, x = 0; x < MAXCPU; x++) {
		if (cpu_info[x].cpu_present && !cpu_info[x].cpu_bsp) {
			KASSERT(i < mp_ncpus,
			    ("mp_ncpus and actual cpus are out of whack"));
			printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
		}
	}
}

/*
 * AP CPUs call this to initialize themselves.
 */
void
init_secondary(void)
{
	struct pcpu *pc;
	u_int64_t msr, cr0;
	int cpu, gsel_tss;

	/* Set by the startup code for us to use */
	cpu = bootAP;

	/* Init tss */
	common_tss[cpu] = common_tss[0];
	common_tss[cpu].tss_rsp0 = 0;   /* not used until after switch */

	gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	   (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	lgdt(&r_gdt);			/* does magic intra-segment return */

	/* Get per-cpu data */
	pc = &__pcpu[cpu];

	/* prime data page for it to use */
	pcpu_init(pc, cpu, sizeof(struct pcpu));
	pc->pc_apic_id = cpu_apic_ids[cpu];
	pc->pc_prvspace = pc;
	pc->pc_curthread = 0;
	pc->pc_tssp = &common_tss[cpu];
	pc->pc_rsp0 = 0;

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, (u_int64_t)pc);	/* XXX User value while we're in the kernel */

	lidt(&r_idt);

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	/*
	 * Set to a known state:
	 * Set by mpboot.s: CR0_PG, CR0_PE
	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	cr0 = rcr0();
	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
	load_cr0(cr0);

	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
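
	/*
	 * A note on the two MSRs above (AMD64 architectural behavior):
	 * SYSCALL loads CS from bits 47:32 of MSR_STAR (here the kernel
	 * code selector) and clears every RFLAGS bit that is set in
	 * MSR_SF_MASK, so the kernel is entered with interrupts and
	 * traps disabled.  SYSRET loads CS from bits 63:48 directly for
	 * a return to 32-bit mode, or from bits 63:48 plus 16 for a
	 * return to 64-bit mode, which is why the 32-bit user code
	 * selector is the one installed in the high word.
	 */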

	/* Disable local apic just to be sure. */
	lapic_disable();

	/* signal our startup to the BSP. */
	mp_naps++;

	/* Spin until the BSP releases the APs. */
	while (!aps_ready)
		ia32_pause();

	/* set up CPU registers and state */
	cpu_setregs();

	/* set up FPU state on the AP */
	fpuinit();

	/* set up SSE/NX registers */
	initializecpu();

	/* A quick check from sanity claus */
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}

	mtx_lock_spin(&ap_boot_mtx);

	/* Init local apic for IRQs */
	lapic_setup();

	/* Set memory range attributes for this CPU to match the BSP */
	mem_range_AP_init();

	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));

	/* Determine if we are a logical CPU. */
	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
		logical_cpus_mask |= PCPU_GET(cpumask);

	/* Build our map of 'other' CPUs. */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	if (bootverbose)
		lapic_dump("AP");

	if (smp_cpus == mp_ncpus) {
		/* enable IPIs, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
		smp_active = 1;	 /* historic */
	}

	mtx_unlock_spin(&ap_boot_mtx);

	/* wait until all the APs are up */
	while (smp_started == 0)
		ia32_pause();

	/* ok, now grab sched_lock and enter the scheduler */
	mtx_lock_spin(&sched_lock);

	binuptime(PCPU_PTR(switchtime));
	PCPU_SET(switchticks, ticks);

	cpu_throw(NULL, choosethread());	/* doesn't return */

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}

/*******************************************************************
 * local functions and data
 */

/*
 * Set the APIC logical IDs.
 *
 * We want to cluster logical CPUs within the same APIC ID cluster.
 * Since logical CPUs are aligned, simply filling in the clusters in
 * APIC ID order works fine.  Note that this does not try to balance
 * the number of CPUs in each cluster. (XXX?)
 */
static void
set_logical_apic_ids(void)
{
	u_int apic_id, cluster, cluster_id;

	/* Force us to allocate cluster 0 at the start. */
	cluster = -1;
	cluster_id = APIC_MAX_INTRACLUSTER_ID;
	for (apic_id = 0; apic_id < MAXCPU; apic_id++) {
		if (!cpu_info[apic_id].cpu_present)
			continue;
		if (cluster_id == APIC_MAX_INTRACLUSTER_ID) {
			cluster = ioapic_next_logical_cluster();
			cluster_id = 0;
		} else
			cluster_id++;
		if (bootverbose)
			printf("APIC ID: physical %u, logical %u:%u\n",
			    apic_id, cluster, cluster_id);
		lapic_set_logical_id(apic_id, cluster, cluster_id);
	}
}
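
/*
 * For example, assuming APIC_MAX_INTRACLUSTER_ID is 3 (the xAPIC
 * cluster model allows four members per cluster), six present CPUs
 * would fill one cluster with intra-cluster IDs 0 through 3 and
 * spill the remaining two into the next cluster as 0 and 1, with
 * cluster numbers handed out by ioapic_next_logical_cluster().
 */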

/*
 * start each AP in our list
 */
static int
start_all_aps(void)
{
	u_char mpbiosreason;
	u_int32_t mpbioswarmvec;
	int apic_id, cpu, i;
	u_int64_t *pt4, *pt3, *pt2;

	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

	/* install the AP 1st level boot code */
	pmap_kenter(boot_address + KERNBASE, boot_address);
	bcopy(mptramp_start, (void *)((uintptr_t)boot_address + KERNBASE), bootMP_size);

	/* Locate the page tables; they'll be below the trampoline */
	pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE);
	pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
	pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);

	/* Create the initial 1GB replicated page tables */
	for (i = 0; i < 512; i++) {
		/* Each slot of the level 4 pages points to the same level 3 page */
		pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
		pt4[i] |= PG_V | PG_RW | PG_U;

		/* Each slot of the level 3 pages points to the same level 2 page */
		pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
		pt3[i] |= PG_V | PG_RW | PG_U;

		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
		pt2[i] = i * (2 * 1024 * 1024);
		pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
	}
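
	/*
	 * The net effect is an address space in which virtual and
	 * physical addresses coincide for the first 1GB, replicated
	 * into every 1GB slot of the whole address space.  The
	 * trampoline turns on paging while still executing at its low
	 * physical address, so its program counter must stay valid
	 * until it can jump to the kernel's normal high mappings.
	 */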

	/* save the current value of the warm-start vector */
	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
	outb(CMOS_REG, BIOS_RESET);
	mpbiosreason = inb(CMOS_DATA);

	/* set up a vector to our boot code */
	*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
	*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */

	/* start each AP */
	cpu = 0;
	for (apic_id = 0; apic_id < MAXCPU; apic_id++) {
		if (!cpu_info[apic_id].cpu_present ||
		    cpu_info[apic_id].cpu_bsp)
			continue;
		cpu++;

		/* save APIC ID for this logical ID */
		cpu_apic_ids[cpu] = apic_id;

		/* allocate and set up a bootstrap stack */
		bootstacks[cpu] = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);

		bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8;
		bootAP = cpu;

		/* attempt to start the Application Processor */
		if (!start_ap(apic_id)) {
			/* restore the warmstart vector */
			*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
			panic("AP #%d (PHY# %d) failed!", cpu, apic_id);
		}

		all_cpus |= (1 << cpu);		/* record AP in CPU map */
	}

	/* build our map of 'other' CPUs */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	/* restore the warmstart vector */
	*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;

	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, mpbiosreason);

	/* number of APs actually started */
	return mp_naps;
}


/*
 * This function starts the AP (application processor) identified
 * by the APIC ID 'apic_id'.  It does quite a "song and dance"
 * to accomplish this.  This is necessary because of the nuances
 * of the different hardware we might encounter.  It isn't pretty,
 * but it seems to work.
 */
static int
start_ap(int apic_id)
{
	int vector, ms;
	int cpus;

	/* calculate the vector */
	vector = (boot_address >> 12) & 0xff;

	/* used as a watchpoint to signal AP startup */
	cpus = mp_naps;

	/*
	 * First we do an INIT IPI: this INIT IPI might be run, resetting
	 * and running the target CPU.  OR this INIT IPI might be latched
	 * (P5 bug), with the CPU waiting for a STARTUP IPI.  OR this INIT
	 * IPI might be ignored.
	 */

	/* do an INIT IPI: assert RESET */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);

	/* wait for pending status end */
	lapic_ipi_wait(-1);

	/* do an INIT IPI: deassert RESET */
	lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);

	/* wait for pending status end */
	DELAY(10000);		/* wait ~10ms */
	lapic_ipi_wait(-1);

	/*
	 * Next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched (P5 bug), in which case this first STARTUP would
	 * terminate immediately and the previously started INIT IPI would
	 * continue.  OR the previous INIT IPI has already run, and this
	 * STARTUP IPI will run.  OR the previous INIT IPI was ignored,
	 * and this STARTUP IPI will run.
	 */

	/* do a STARTUP IPI */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	lapic_ipi_wait(-1);
	DELAY(200);		/* wait ~200us */

	/*
	 * Finally we do a second STARTUP IPI: this second STARTUP IPI
	 * should run IF the previous STARTUP IPI was cancelled by a
	 * latched INIT IPI.  OR this STARTUP IPI will be ignored, as only
	 * ONE STARTUP IPI is recognized after hardware RESET or an INIT
	 * IPI.
	 */

	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	lapic_ipi_wait(-1);
	DELAY(200);		/* wait ~200us */

	/* Wait up to 5 seconds for it to start. */
	for (ms = 0; ms < 50; ms++) {
		if (mp_naps > cpus)
			return 1;	/* return SUCCESS */
		DELAY(100000);
	}
	return 0;		/* return FAILURE */
}

/*
 * Flush the TLB on all other CPUs
 */
static void
smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	u_int ncpu;

	ncpu = mp_ncpus - 1;	/* does not shoot down self */
	if (ncpu < 1)
		return;		/* no other cpus */
	mtx_assert(&smp_tlb_mtx, MA_OWNED);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	ipi_all_but_self(vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
}
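
/*
 * The wait loop above is one half of a handshake: the
 * IDTVEC(invltlb)/IDTVEC(invlpg)/IDTVEC(invlrng) handlers installed
 * in cpu_mp_start() perform the invalidation on each target CPU and
 * then increment smp_tlb_wait, so the caller does not return until
 * every other CPU has flushed.
 */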

/*
 * This is about as magic as it gets.  fortune(1) has similar code
 * for reversing bits in a word.  Who thinks up this stuff??
 *
 * Yes, it does appear to be consistently faster than:
 * while (i = ffs(m)) {
 *	m >>= i;
 *	bits++;
 * }
 * and
 * while (lsb = (m & -m)) {	// This is magic too
 * 	m &= ~lsb;		// or: m ^= lsb
 *	bits++;
 * }
 * Both of these latter forms do some very strange things on gcc-3.1 with
 * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2.
 * There is probably an SSE or MMX popcnt instruction.
 *
 * I wonder if this should be in libkern?
 *
 * XXX Stop the presses!  Another one:
 * static __inline u_int32_t
 * popcnt1(u_int32_t v)
 * {
 *	v -= ((v >> 1) & 0x55555555);
 *	v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
 *	v = (v + (v >> 4)) & 0x0F0F0F0F;
 *	return (v * 0x01010101) >> 24;
 * }
 * The downside is that it has a multiply.  With a pentium3 with
 * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use
 * an imull, and in that case it is faster.  In most other cases
 * it appears slightly slower.
 *
 * Another variant (also from fortune):
 * #define BITCOUNT(x) (((BX_(x)+(BX_(x)>>4)) & 0x0F0F0F0F) % 255)
 * #define  BX_(x)     ((x) - (((x)>>1)&0x77777777)            \
 *                          - (((x)>>2)&0x33333333)            \
 *                          - (((x)>>3)&0x11111111))
 */
static __inline u_int32_t
popcnt(u_int32_t m)
{

	m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1);
	m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2);
	m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4);
	m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8);
	m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16);
	return m;
}
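
/*
 * Worked example of the ladder above: m = 0x0000000b (binary 1011,
 * three bits set).
 *	after pass 1: 0x00000006	(2-bit partial sums 01, 10)
 *	after pass 2: 0x00000003	(4-bit partial sum 0011)
 *	passes 3-5 only widen the field; the result is 3.
 */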

static void
smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	int ncpu, othercpus;

	othercpus = mp_ncpus - 1;
	if (mask == (u_int)-1) {
		ncpu = othercpus;
		if (ncpu < 1)
			return;
	} else {
		mask &= ~PCPU_GET(cpumask);
		if (mask == 0)
			return;
		ncpu = popcnt(mask);
		if (ncpu > othercpus) {
			/* XXX this should be a panic offence */
			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
			    ncpu, othercpus);
			ncpu = othercpus;
		}
		/* XXX should be a panic, implied by mask == 0 above */
		if (ncpu < 1)
			return;
	}
	mtx_assert(&smp_tlb_mtx, MA_OWNED);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	if (mask == (u_int)-1)
		ipi_all_but_self(vector);
	else
		ipi_selected(mask, vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
}
804
805void
806smp_invltlb(void)
807{
808
809	if (smp_started)
810		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
811}
812
813void
814smp_invlpg(vm_offset_t addr)
815{
816
817	if (smp_started)
818		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
819}
820
821void
822smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
823{
824
825	if (smp_started)
826		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
827}
828
829void
830smp_masked_invltlb(u_int mask)
831{
832
833	if (smp_started)
834		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
835}
836
837void
838smp_masked_invlpg(u_int mask, vm_offset_t addr)
839{
840
841	if (smp_started)
842		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
843}
844
845void
846smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
847{
848
849	if (smp_started)
850		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
851}
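/*
 * These wrappers are the entry points the pmap layer is expected to
 * use.  A sketch of a hypothetical caller (not the actual pmap code)
 * invalidating one mapping everywhere:
 *
 *	mtx_lock_spin(&smp_tlb_mtx);
 *	invlpg(va);			-- flush the local TLB
 *	smp_invlpg(va);			-- then everyone else's
 *	mtx_unlock_spin(&smp_tlb_mtx);
 *
 * The spin lock matters: smp_tlb_shootdown() asserts that it is held,
 * so only one shootdown can be in flight at a time.
 */
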
/*
 * For statclock, we send an IPI to all CPUs to have them call this
 * function.
 */
void
forwarded_statclock(struct clockframe frame)
{
	struct thread *td;

	CTR0(KTR_SMP, "forwarded_statclock");
	td = curthread;
	td->td_intr_nesting_level++;
	if (profprocs != 0)
		profclock(&frame);
	if (pscnt == psdiv)
		statclock(&frame);
	td->td_intr_nesting_level--;
}

void
forward_statclock(void)
{
	int map;

	CTR0(KTR_SMP, "forward_statclock");

	if (!smp_started || cold || panicstr)
		return;

	map = PCPU_GET(other_cpus) & ~(stopped_cpus|hlt_cpus_mask);
	if (map != 0)
		ipi_selected(map, IPI_STATCLOCK);
}

/*
 * For each hardclock(), we send an IPI to all other CPUs to have them
 * execute this function.  It would be nice to reduce contention on
 * sched_lock if we could simply peek at the CPU to determine the user/kernel
 * state and call hardclock_process() on the CPU receiving the clock interrupt
 * and then just use a simple IPI to handle any ASTs if needed.
 */
void
forwarded_hardclock(struct clockframe frame)
{
	struct thread *td;

	CTR0(KTR_SMP, "forwarded_hardclock");
	td = curthread;
	td->td_intr_nesting_level++;
	hardclock_process(&frame);
	td->td_intr_nesting_level--;
}

void
forward_hardclock(void)
{
	u_int map;

	CTR0(KTR_SMP, "forward_hardclock");

	if (!smp_started || cold || panicstr)
		return;

	map = PCPU_GET(other_cpus) & ~(stopped_cpus|hlt_cpus_mask);
	if (map != 0)
		ipi_selected(map, IPI_HARDCLOCK);
}

/*
 * send an IPI to a set of cpus.
 */
void
ipi_selected(u_int32_t cpus, u_int ipi)
{
	int cpu;

	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		KASSERT(cpu_apic_ids[cpu] != -1,
		    ("IPI to non-existent CPU %d", cpu));
		lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
		cpus &= ~(1 << cpu);
	}
}

/*
 * send an IPI containing 'ipi' to all CPUs, including myself
 */
void
ipi_all(u_int ipi)
{

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_ALL);
}

/*
 * send an IPI to all CPUs EXCEPT myself
 */
void
ipi_all_but_self(u_int ipi)
{

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
}

/*
 * send an IPI to myself
 */
void
ipi_self(u_int ipi)
{

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_SELF);
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the APs out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	mtx_lock_spin(&sched_lock);
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
	mtx_unlock_spin(&sched_lock);
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);

static int
sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS)
{
	u_int mask;
	int error;

	mask = hlt_cpus_mask;
	error = sysctl_handle_int(oidp, &mask, 0, req);
	if (error || !req->newptr)
		return (error);

	if (logical_cpus_mask != 0 &&
	    (mask & logical_cpus_mask) == logical_cpus_mask)
		hlt_logical_cpus = 1;
	else
		hlt_logical_cpus = 0;

	if ((mask & all_cpus) == all_cpus)
		mask &= ~(1<<0);
	hlt_cpus_mask = mask;
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW,
    0, 0, sysctl_hlt_cpus, "IU",
    "Bitmap of CPUs to halt.  101 (binary) will halt CPUs 0 and 2.");
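
/*
 * Example: "sysctl machdep.hlt_cpus=5" (binary 101) halts CPUs 0 and
 * 2.  Note that the handler above never lets the whole system halt:
 * if the new mask would cover all_cpus, it clears bit 0 so the BSP
 * keeps running.
 */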

static int
sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS)
{
	int disable, error;

	disable = hlt_logical_cpus;
	error = sysctl_handle_int(oidp, &disable, 0, req);
	if (error || !req->newptr)
		return (error);

	if (disable)
		hlt_cpus_mask |= logical_cpus_mask;
	else
		hlt_cpus_mask &= ~logical_cpus_mask;

	if ((hlt_cpus_mask & all_cpus) == all_cpus)
		hlt_cpus_mask &= ~(1<<0);

	hlt_logical_cpus = disable;
	return (error);
}

static void
cpu_hlt_setup(void *dummy __unused)
{

	if (logical_cpus_mask != 0) {
		TUNABLE_INT_FETCH("machdep.hlt_logical_cpus",
		    &hlt_logical_cpus);
		sysctl_ctx_init(&logical_cpu_clist);
		SYSCTL_ADD_PROC(&logical_cpu_clist,
		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
		    "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0,
		    sysctl_hlt_logical_cpus, "IU", "");
		SYSCTL_ADD_UINT(&logical_cpu_clist,
		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
		    "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD,
		    &logical_cpus_mask, 0, "");

		if (hlt_logical_cpus)
			hlt_cpus_mask |= logical_cpus_mask;
	}
}
SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL);

int
mp_grab_cpu_hlt(void)
{
	u_int mask = PCPU_GET(cpumask);
	int retval;

	retval = mask & hlt_cpus_mask;
	while (mask & hlt_cpus_mask)
		__asm __volatile("sti; hlt" : : : "memory");
	return (retval);
}