1/*-
2 * Copyright (c) 2015 Nathan Whitehorn
3 * Copyright (c) 2017-2018 Semihalf
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD$");
30
31#include <sys/param.h>
32#include <sys/systm.h>
33#include <sys/kernel.h>
34#include <sys/bus.h>
35#include <sys/pcpu.h>
36#include <sys/proc.h>
37#include <sys/smp.h>
38#include <vm/vm.h>
39#include <vm/pmap.h>
40
41#include <machine/bus.h>
42#include <machine/cpu.h>
43#include <machine/hid.h>
44#include <machine/platformvar.h>
45#include <machine/pmap.h>
46#include <machine/rtas.h>
47#include <machine/smp.h>
48#include <machine/spr.h>
49#include <machine/trap.h>
50
51#include <dev/ofw/openfirm.h>
52#include <dev/ofw/ofw_bus.h>
53#include <dev/ofw/ofw_bus_subr.h>
54#include <machine/ofw_machdep.h>
55#include <powerpc/aim/mmu_oea64.h>
56
57#include "platform_if.h"
58#include "opal.h"
59
60#ifdef SMP
61extern void *ap_pcpu;
62#endif
63
64void (*powernv_smp_ap_extra_init)(void);
65
66static int powernv_probe(platform_t);
67static int powernv_attach(platform_t);
68void powernv_mem_regions(platform_t, struct mem_region *phys, int *physsz,
69    struct mem_region *avail, int *availsz);
70static void powernv_numa_mem_regions(platform_t plat, struct numa_mem_region *phys, int *physsz);
71static u_long powernv_timebase_freq(platform_t, struct cpuref *cpuref);
72static int powernv_smp_first_cpu(platform_t, struct cpuref *cpuref);
73static int powernv_smp_next_cpu(platform_t, struct cpuref *cpuref);
74static int powernv_smp_get_bsp(platform_t, struct cpuref *cpuref);
75static void powernv_smp_ap_init(platform_t);
76#ifdef SMP
77static int powernv_smp_start_cpu(platform_t, struct pcpu *cpu);
78static void powernv_smp_probe_threads(platform_t);
79static struct cpu_group *powernv_smp_topo(platform_t plat);
80#endif
81static void powernv_reset(platform_t);
82static void powernv_cpu_idle(sbintime_t sbt);
83static int powernv_cpuref_init(void);
84static int powernv_node_numa_domain(platform_t platform, phandle_t node);
85
/*
 * Dispatch table binding the platform_if methods to this driver's
 * implementations.  SMP start/probe/topology methods are only provided
 * on SMP kernels; the remaining methods are always available.
 */
static platform_method_t powernv_methods[] = {
	PLATFORMMETHOD(platform_probe, 		powernv_probe),
	PLATFORMMETHOD(platform_attach,		powernv_attach),
	PLATFORMMETHOD(platform_mem_regions,	powernv_mem_regions),
	PLATFORMMETHOD(platform_numa_mem_regions,	powernv_numa_mem_regions),
	PLATFORMMETHOD(platform_timebase_freq,	powernv_timebase_freq),

	PLATFORMMETHOD(platform_smp_ap_init,	powernv_smp_ap_init),
	PLATFORMMETHOD(platform_smp_first_cpu,	powernv_smp_first_cpu),
	PLATFORMMETHOD(platform_smp_next_cpu,	powernv_smp_next_cpu),
	PLATFORMMETHOD(platform_smp_get_bsp,	powernv_smp_get_bsp),
#ifdef SMP
	PLATFORMMETHOD(platform_smp_start_cpu,	powernv_smp_start_cpu),
	PLATFORMMETHOD(platform_smp_probe_threads,	powernv_smp_probe_threads),
	PLATFORMMETHOD(platform_smp_topo,	powernv_smp_topo),
#endif
	PLATFORMMETHOD(platform_node_numa_domain,	powernv_node_numa_domain),

	PLATFORMMETHOD(platform_reset,		powernv_reset),
	/* Terminator entry. */
	{ 0, 0 }
};
107
/* Platform definition registered with the platform framework below. */
static platform_def_t powernv_platform = {
	"powernv",
	powernv_methods,
	0
};

/* CPU table built by powernv_cpuref_init(); one entry per HW thread. */
static struct cpuref platform_cpuref[MAXCPU];
/* Number of valid entries in platform_cpuref[]. */
static int platform_cpuref_cnt;
/* Non-zero once platform_cpuref[] has been populated. */
static int platform_cpuref_valid;
/* Index into "ibm,associativity" used to tell NUMA domains apart. */
static int platform_associativity;

PLATFORM_DEF(powernv_platform);

/* PIR (hardware CPU ID) of the boot processor, captured at attach. */
static uint64_t powernv_boot_pir;
122
123static int
124powernv_probe(platform_t plat)
125{
126	if (opal_check() == 0)
127		return (BUS_PROBE_SPECIFIC);
128
129	return (ENXIO);
130}
131
/*
 * Platform attach: early PowerNV bring-up, run once on the boot CPU.
 * Re-initializes the secondary CPUs via OPAL for the kernel's endianness,
 * resets hypervisor state (LPID/LPCR), enumerates CPUs, and parses the
 * first CPU node's SLB and segment/page-size properties for the MOEA64
 * pmap.  Always returns 0; missing device-tree nodes just skip the
 * page-size scan.
 */
static int
powernv_attach(platform_t plat)
{
	uint32_t nptlp, shift = 0, slb_encoding = 0;
	int32_t lp_size, lp_encoding;
	char buf[255];
	pcell_t refpoints[3];
	pcell_t prop;
	phandle_t cpu;
	phandle_t opal;
	int res, len, idx;
	register_t msr;
	bool has_lp;

	/* Ping OPAL again just to make sure */
	opal_check();

	/* Ask OPAL to restart the other CPUs matching our byte order. */
#if BYTE_ORDER == LITTLE_ENDIAN
	opal_call(OPAL_REINIT_CPUS, 2 /* Little endian */);
#else
	opal_call(OPAL_REINIT_CPUS, 1 /* Big endian */);
#endif
	opal = OF_finddevice("/ibm,opal");

	/*
	 * The first associativity reference point is the level used by
	 * powernv_node_numa_domain() to distinguish NUMA domains.
	 */
	platform_associativity = 4; /* Skiboot default. */
	if (OF_getencprop(opal, "ibm,associativity-reference-points", refpoints,
	    sizeof(refpoints)) > 0) {
		platform_associativity = refpoints[0];
	}

       if (cpu_idle_hook == NULL)
                cpu_idle_hook = powernv_cpu_idle;

	/* Remember the boot CPU's hardware ID; it will be given CPUID 0. */
	powernv_boot_pir = mfspr(SPR_PIR);

	/* LPID must not be altered when PSL_DR or PSL_IR is set */
	msr = mfmsr();
	mtmsr(msr & ~(PSL_DR | PSL_IR));

	/* Direct interrupts to SRR instead of HSRR and reset LPCR otherwise */
	mtspr(SPR_LPID, 0);
	isync();

	/* On ISA 3.0 (POWER9) enable the HV virtualization interrupt. */
	if (cpu_features2 & PPC_FEATURE2_ARCH_3_00)
		lpcr |= LPCR_HVICE;

#if BYTE_ORDER == LITTLE_ENDIAN
	/* Take interrupts in little-endian mode. */
	lpcr |= LPCR_ILE;
#endif

	mtspr(SPR_LPCR, lpcr);
	isync();

	mtmsr(msr);

	powernv_cpuref_init();

	/* Set SLB count from device tree */
	cpu = OF_peer(0);
	cpu = OF_child(cpu);
	while (cpu != 0) {
		res = OF_getprop(cpu, "name", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpus") == 0)
			break;
		cpu = OF_peer(cpu);
	}
	if (cpu == 0)
		goto out;

	/* Use the first "cpu" child; all CPUs are assumed identical. */
	cpu = OF_child(cpu);
	while (cpu != 0) {
		res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpu") == 0)
			break;
		cpu = OF_peer(cpu);
	}
	if (cpu == 0)
		goto out;

	res = OF_getencprop(cpu, "ibm,slb-size", &prop, sizeof(prop));
	if (res > 0)
		n_slbs = prop;

	/*
	 * Scan the large page size property for PAPR compatible machines.
	 * See PAPR D.5 Changes to Section 5.1.4, 'CPU Node Properties'
	 * for the encoding of the property.
	 */

	len = OF_getproplen(cpu, "ibm,segment-page-sizes");
	if (len > 0) {
		/*
		 * We have to use a variable length array on the stack
		 * since we have very limited stack space.
		 */
		pcell_t arr[len/sizeof(cell_t)];
		res = OF_getencprop(cpu, "ibm,segment-page-sizes", arr,
		    sizeof(arr));
		/* Walk the property in units of 4-byte cells. */
		len /= 4;
		idx = 0;
		has_lp = false;
		while (len > 0) {
			/* Segment header: base shift, SLB encoding, count. */
			shift = arr[idx];
			slb_encoding = arr[idx + 1];
			nptlp = arr[idx + 2];
			idx += 3;
			len -= 3;
			/* Then nptlp (page-size shift, PTE encoding) pairs. */
			while (len > 0 && nptlp) {
				lp_size = arr[idx];
				lp_encoding = arr[idx+1];
				if (slb_encoding == SLBV_L && lp_encoding == 0)
					has_lp = true;

				if (slb_encoding == SLB_PGSZ_4K_4K &&
				    lp_encoding == LP_4K_16M)
					moea64_has_lp_4k_16m = true;

				idx += 2;
				len -= 2;
				nptlp--;
			}
			/* Stop once both facts we care about are known. */
			if (has_lp && moea64_has_lp_4k_16m)
				break;
		}

		if (!has_lp)
			panic("Standard large pages (SLB[L] = 1, PTE[LP] = 0) "
			    "not supported by this system.");

		moea64_large_page_shift = shift;
		moea64_large_page_size = 1ULL << lp_size;
	}

out:
	return (0);
}
268
/*
 * Report physical and available memory regions; delegates entirely to
 * the generic Open Firmware memory-map parser.
 */
void
powernv_mem_regions(platform_t plat, struct mem_region *phys, int *physsz,
    struct mem_region *avail, int *availsz)
{

	ofw_mem_regions(phys, physsz, avail, availsz);
}
276
/*
 * Report per-NUMA-domain memory regions; delegates to the generic Open
 * Firmware NUMA memory-map parser.
 */
static void
powernv_numa_mem_regions(platform_t plat, struct numa_mem_region *phys, int *physsz)
{

	ofw_numa_mem_regions(phys, physsz);
}
283
/*
 * Return the timebase frequency in Hz, read from the first CPU node's
 * "timebase-frequency" property.  Falls back to 512 MHz (the POWER8/9
 * value) if no CPU node is found, and panics if the property is absent
 * or non-positive on a node that does exist.
 */
static u_long
powernv_timebase_freq(platform_t plat, struct cpuref *cpuref)
{
	char buf[8];
	phandle_t cpu, dev, root;
	int res;
	int32_t ticks = -1;

	/* Locate the "/cpus" container node. */
	root = OF_peer(0);
	dev = OF_child(root);
	while (dev != 0) {
		res = OF_getprop(dev, "name", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpus") == 0)
			break;
		dev = OF_peer(dev);
	}

	/* First child with device_type "cpu" is taken as representative. */
	for (cpu = OF_child(dev); cpu != 0; cpu = OF_peer(cpu)) {
		res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpu") == 0)
			break;
	}
	if (cpu == 0)
		return (512000000);

	OF_getencprop(cpu, "timebase-frequency", &ticks, sizeof(ticks));

	if (ticks <= 0)
		panic("Unable to determine timebase frequency!");

	return (ticks);

}
317
/*
 * Populate platform_cpuref[] from the device tree, one entry per
 * hardware thread ("ibm,ppc-interrupt-server#s" cell), then renumber so
 * the boot CPU (matched against powernv_boot_pir) becomes CPUID 0.
 * Idempotent: returns immediately once platform_cpuref_valid is set.
 * Always returns 0.
 */
static int
powernv_cpuref_init(void)
{
	phandle_t cpu, dev;
	char buf[32];
	int a, res, tmp_cpuref_cnt;
	/* static: too large for the early boot stack. */
	static struct cpuref tmp_cpuref[MAXCPU];
	cell_t interrupt_servers[32];
	uint64_t bsp;

	if (platform_cpuref_valid)
		return (0);

	/* Locate the "/cpus" container node. */
	dev = OF_peer(0);
	dev = OF_child(dev);
	while (dev != 0) {
		res = OF_getprop(dev, "name", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpus") == 0)
			break;
		dev = OF_peer(dev);
	}

	/* First pass: record every thread of every enabled CPU node. */
	bsp = 0;
	tmp_cpuref_cnt = 0;
	for (cpu = OF_child(dev); cpu != 0; cpu = OF_peer(cpu)) {
		res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpu") == 0) {
			/* Skip CPUs the firmware marked disabled. */
			if (!ofw_bus_node_status_okay(cpu))
				continue;
			res = OF_getproplen(cpu, "ibm,ppc-interrupt-server#s");
			if (res > 0) {
				OF_getencprop(cpu, "ibm,ppc-interrupt-server#s",
				    interrupt_servers, res);

				for (a = 0; a < res/sizeof(cell_t); a++) {
					tmp_cpuref[tmp_cpuref_cnt].cr_hwref = interrupt_servers[a];
					tmp_cpuref[tmp_cpuref_cnt].cr_cpuid = tmp_cpuref_cnt;
					tmp_cpuref[tmp_cpuref_cnt].cr_domain =
					    powernv_node_numa_domain(NULL, cpu);
					/* Note which entry is the boot CPU. */
					if (interrupt_servers[a] == (uint32_t)powernv_boot_pir)
						bsp = tmp_cpuref_cnt;

					tmp_cpuref_cnt++;
				}
			}
		}
	}

	/* Map IDs, so BSP has CPUID 0 regardless of hwref */
	for (a = bsp; a < tmp_cpuref_cnt; a++) {
		platform_cpuref[platform_cpuref_cnt].cr_hwref = tmp_cpuref[a].cr_hwref;
		platform_cpuref[platform_cpuref_cnt].cr_cpuid = platform_cpuref_cnt;
		platform_cpuref[platform_cpuref_cnt].cr_domain = tmp_cpuref[a].cr_domain;
		platform_cpuref_cnt++;
	}
	/* Wrap around: entries that preceded the BSP come last. */
	for (a = 0; a < bsp; a++) {
		platform_cpuref[platform_cpuref_cnt].cr_hwref = tmp_cpuref[a].cr_hwref;
		platform_cpuref[platform_cpuref_cnt].cr_cpuid = platform_cpuref_cnt;
		platform_cpuref[platform_cpuref_cnt].cr_domain = tmp_cpuref[a].cr_domain;
		platform_cpuref_cnt++;
	}

	platform_cpuref_valid = 1;

	return (0);
}
384
385static int
386powernv_smp_first_cpu(platform_t plat, struct cpuref *cpuref)
387{
388	if (platform_cpuref_valid == 0)
389		return (EINVAL);
390
391	cpuref->cr_cpuid = 0;
392	cpuref->cr_hwref = platform_cpuref[0].cr_hwref;
393	cpuref->cr_domain = platform_cpuref[0].cr_domain;
394
395	return (0);
396}
397
398static int
399powernv_smp_next_cpu(platform_t plat, struct cpuref *cpuref)
400{
401	int id;
402
403	if (platform_cpuref_valid == 0)
404		return (EINVAL);
405
406	id = cpuref->cr_cpuid + 1;
407	if (id >= platform_cpuref_cnt)
408		return (ENOENT);
409
410	cpuref->cr_cpuid = platform_cpuref[id].cr_cpuid;
411	cpuref->cr_hwref = platform_cpuref[id].cr_hwref;
412	cpuref->cr_domain = platform_cpuref[id].cr_domain;
413
414	return (0);
415}
416
417static int
418powernv_smp_get_bsp(platform_t plat, struct cpuref *cpuref)
419{
420
421	cpuref->cr_cpuid = platform_cpuref[0].cr_cpuid;
422	cpuref->cr_hwref = platform_cpuref[0].cr_hwref;
423	cpuref->cr_domain = platform_cpuref[0].cr_domain;
424	return (0);
425}
426
427#ifdef SMP
428static int
429powernv_smp_start_cpu(platform_t plat, struct pcpu *pc)
430{
431	int result;
432
433	ap_pcpu = pc;
434	powerpc_sync();
435
436	result = opal_call(OPAL_START_CPU, pc->pc_hwref, EXC_RST);
437	if (result != OPAL_SUCCESS) {
438		printf("OPAL error (%d): unable to start AP %d\n",
439		    result, (int)pc->pc_hwref);
440		return (ENXIO);
441	}
442
443	return (0);
444}
445
446static void
447powernv_smp_probe_threads(platform_t plat)
448{
449	char buf[8];
450	phandle_t cpu, dev, root;
451	int res, nthreads;
452
453	root = OF_peer(0);
454
455	dev = OF_child(root);
456	while (dev != 0) {
457		res = OF_getprop(dev, "name", buf, sizeof(buf));
458		if (res > 0 && strcmp(buf, "cpus") == 0)
459			break;
460		dev = OF_peer(dev);
461	}
462
463	nthreads = 1;
464	for (cpu = OF_child(dev); cpu != 0; cpu = OF_peer(cpu)) {
465		res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
466		if (res <= 0 || strcmp(buf, "cpu") != 0)
467			continue;
468
469		res = OF_getproplen(cpu, "ibm,ppc-interrupt-server#s");
470
471		if (res >= 0)
472			nthreads = res / sizeof(cell_t);
473		else
474			nthreads = 1;
475		break;
476	}
477
478	smp_threads_per_core = nthreads;
479	if (mp_ncpus % nthreads == 0)
480		mp_ncores = mp_ncpus / nthreads;
481}
482
483static struct cpu_group *
484cpu_group_init(struct cpu_group *group, struct cpu_group *parent,
485    const cpuset_t *cpus, int children, int level, int flags)
486{
487	struct cpu_group *child;
488
489	child = children != 0 ? smp_topo_alloc(children) : NULL;
490
491	group->cg_parent = parent;
492	group->cg_child = child;
493	CPU_COPY(cpus, &group->cg_mask);
494	group->cg_count = CPU_COUNT(cpus);
495	group->cg_children = children;
496	group->cg_level = level;
497	group->cg_flags = flags;
498
499	return (child);
500}
501
/*
 * Build the scheduler topology: a root spanning all CPUs, one group per
 * NUMA domain (sharing nothing), one group per core within a domain
 * (sharing L3), and SMT leaf groups of sibling threads (sharing L1).
 * Falls back to a flat topology if the thread count doesn't divide
 * evenly.
 */
static struct cpu_group *
powernv_smp_topo(platform_t plat)
{
	struct cpu_group *core, *dom, *root;
	cpuset_t corecpus, domcpus;
	int cpuid, i, j, k, ncores;

	if (mp_ncpus % smp_threads_per_core != 0) {
		printf("%s: irregular SMP topology (%d threads, %d per core)\n",
		    __func__, mp_ncpus, smp_threads_per_core);
		return (smp_topo_none());
	}

	root = smp_topo_alloc(1);
	dom = cpu_group_init(root, NULL, &all_cpus, vm_ndomains, CG_SHARE_NONE,
	    0);

	/*
	 * Redundant layers will be collapsed by the caller so we don't need a
	 * special case for a single domain.
	 */
	for (i = 0; i < vm_ndomains; i++, dom++) {
		CPU_COPY(&cpuset_domain[i], &domcpus);
		ncores = CPU_COUNT(&domcpus) / smp_threads_per_core;
		KASSERT(CPU_COUNT(&domcpus) % smp_threads_per_core == 0,
		    ("%s: domain %d core count not divisible by thread count",
		    __func__, i));

		core = cpu_group_init(dom, root, &domcpus, ncores, CG_SHARE_L3,
		    0);
		for (j = 0; j < ncores; j++, core++) {
			/*
			 * Assume that consecutive CPU IDs correspond to sibling
			 * threads.
			 */
			CPU_ZERO(&corecpus);
			/* Peel the lowest-numbered CPUs off domcpus. */
			for (k = 0; k < smp_threads_per_core; k++) {
				cpuid = CPU_FFS(&domcpus) - 1;
				CPU_CLR(cpuid, &domcpus);
				CPU_SET(cpuid, &corecpus);
			}
			(void)cpu_group_init(core, dom, &corecpus, 0,
			    CG_SHARE_L1, CG_FLAG_SMT);
		}
	}

	return (root);
}
550
551#endif
552
/*
 * Reboot the machine by asking OPAL firmware to perform a CEC reboot.
 * Does not return on success.
 */
static void
powernv_reset(platform_t platform)
{

	opal_call(OPAL_CEC_REBOOT);
}
559
560static void
561powernv_smp_ap_init(platform_t platform)
562{
563
564	if (powernv_smp_ap_extra_init != NULL)
565		powernv_smp_ap_extra_init();
566}
567
/*
 * Deliberately empty idle hook, installed at attach when no other hook
 * is present.  NOTE(review): presumably this exists to keep the idle
 * loop from using a default idle method unsuitable on PowerNV — confirm
 * against the generic powerpc idle code.
 */
static void
powernv_cpu_idle(sbintime_t sbt)
{
}
572
/*
 * Map a device-tree node to a small NUMA domain index.  The cell at
 * position platform_associativity in the node's "ibm,associativity"
 * property identifies the domain; distinct values are assigned
 * consecutive indices via the static numa_domains[] memo table.  Nodes
 * without a (long enough) property inherit their parent's domain.
 * Returns 0 when NUMA is compiled out, disabled by tunable, or the
 * root is reached without an answer.
 */
static int
powernv_node_numa_domain(platform_t platform, phandle_t node)
{
	/* XXX: Is locking necessary in here? */
	static int numa_domains[MAXMEMDOM];
	static int numa_max_domain;
	cell_t associativity[5];
	int i, res;

#ifndef NUMA
	return (0);
#endif
	i = 0;
	TUNABLE_INT_FETCH("vm.numa.disabled", &i);
	if (i)
		return (0);

	res = OF_getencprop(node, "ibm,associativity",
		associativity, sizeof(associativity));

	/*
	 * If this node doesn't have associativity, or if there are not
	 * enough elements in it, check its parent.
	 */
	if (res < (int)(sizeof(cell_t) * (platform_associativity + 1))) {
		node = OF_parent(node);
		/* If already at the root, use default domain. */
		if (node == 0)
			return (0);
		return (powernv_node_numa_domain(platform, node));
	}

	/* Reuse the index already assigned to this associativity value. */
	for (i = 0; i < numa_max_domain; i++) {
		if (numa_domains[i] == associativity[platform_associativity])
			return (i);
	}
	/* New value: assign the next free index, or fall back to 0. */
	if (i < MAXMEMDOM)
		numa_domains[numa_max_domain++] =
		    associativity[platform_associativity];
	else
		i = 0;

	return (i);
}
617
618/* Set up the Nest MMU on POWER9 relatively early, but after pmap is setup. */
619static void
620powernv_setup_nmmu(void *unused)
621{
622	if (opal_check() != 0)
623		return;
624	opal_call(OPAL_NMMU_SET_PTCR, -1, mfspr(SPR_PTCR));
625}
626
627SYSINIT(powernv_setup_nmmu, SI_SUB_CPU, SI_ORDER_ANY, powernv_setup_nmmu, NULL);
628