/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 */

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
#include <linux/bootmem.h>
#include <linux/mm.h>
#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>

int acpi_numa __initdata;

static struct acpi_table_slit *acpi_slit;

static nodemask_t nodes_parsed __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
static struct bootnode nodes_add[MAX_NUMNODES];
static int found_add_area __initdata;
int hotadd_percent __initdata = 0;

/* Nodes smaller than this confuse the VM badly; they usually
   result from BIOS bugs. */
#define NODE_MIN_SIZE (4*1024*1024)

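/* Map an ACPI proximity domain to a logical node id, allocating a
   new id on first sight of the PXM. */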
static __init int setup_node(int pxm)
{
	return acpi_map_pxm_to_node(pxm);
}

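/*
 * Check whether [start, end) overlaps any node parsed so far.
 * Returns the id of a conflicting node, or -1 if there is no overlap.
 */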
static __init int conflicting_nodes(unsigned long start, unsigned long end)
{
	int i;
	for_each_node_mask(i, nodes_parsed) {
		struct bootnode *nd = &nodes[i];
		if (nd->start == nd->end)
			continue;
		if (nd->end > start && nd->start < end)
			return i;
		if (nd->end == end && nd->start == start)
			return i;
	}
	return -1;
}

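/*
 * Clamp node i's range to [start, end), collapsing it to an empty
 * range if the two are disjoint. No-op once a hot-add area has been
 * found, so hot-add ranges are never clipped.
 */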
static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
	struct bootnode *nd = &nodes[i];

	if (found_add_area)
		return;

	if (nd->start < start) {
		nd->start = start;
		if (nd->end < nd->start)
			nd->start = nd->end;
	}
	if (nd->end > end) {
		nd->end = end;
		if (nd->start > nd->end)
			nd->start = nd->end;
	}
}

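/*
 * Give up on the SRAT: reset all state derived from it so the caller
 * falls back to the other NUMA discovery methods.
 */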
static __init void bad_srat(void)
{
	int i;
	printk(KERN_ERR "SRAT: SRAT not used.\n");
	acpi_numa = -1;
	found_add_area = 0;
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		apicid_to_node[i] = NUMA_NO_NODE;
	for (i = 0; i < MAX_NUMNODES; i++) {
		nodes[i].start = nodes[i].end = 0;
		nodes_add[i].start = nodes_add[i].end = 0;
	}
	remove_all_active_ranges();
}

static __init inline int srat_disabled(void)
{
	return numa_off || acpi_numa < 0;
}

/*
 * Many BIOSes fill in 10 (the local distance) everywhere. This messes
 * up the NUMA heuristics, which want the local node to have a smaller
 * distance than the others.
 * Do some quick checks here and only use the SLIT if it passes.
 */
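/* For example, a sane SLIT for two nodes is { 10, 20, 20, 10 }:
   10 on the diagonal (local), something larger everywhere else. */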
static __init int slit_valid(struct acpi_table_slit *slit)
{
	int i, j;
	int d = slit->locality_count;
	for (i = 0; i < d; i++) {
		for (j = 0; j < d; j++)  {
			u8 val = slit->entry[d*i + j];
			if (i == j) {
				if (val != 10)
					return 0;
			} else if (val <= 10)
				return 0;
		}
	}
	return 1;
}

/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	if (!slit_valid(slit)) {
		printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
		return;
	}
	acpi_slit = slit;
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
{
	int pxm, node;
	if (srat_disabled())
		return;
	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
		bad_srat();
		return;
	}
	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
		return;
	pxm = pa->proximity_domain_lo;	/* only the low 8 bits of the domain */
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
		bad_srat();
		return;
	}
	apicid_to_node[pa->apic_id] = node;
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
	       pxm, pa->apic_id, node);
}

#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
/*
 * Protect against too large hotadd areas that would fill up memory.
 */
static int hotadd_enough_memory(struct bootnode *nd)
{
	static unsigned long allocated;
	static unsigned long last_area_end;
	unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
	long mem = pages * sizeof(struct page);
	unsigned long addr;
	unsigned long allowed;
	unsigned long oldpages = pages;

	if (mem < 0)
		return 0;
	allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
	allowed = (allowed / 100) * hotadd_percent;
	if (allocated + mem > allowed) {
		unsigned long range;
		/* Give them at least part of their hotadd memory, up to
		   hotadd_percent. It would be better to spread the limit
		   out over multiple hotplug areas, but that is too
		   complicated right now. */
		if (allocated >= allowed)
			return 0;
		range = allowed - allocated;
		pages = (range / PAGE_SIZE);
		mem = pages * sizeof(struct page);
		nd->end = nd->start + range;
	}
	/* Not completely fool proof, but a good sanity check */
	addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
	if (addr == -1UL)
		return 0;
	if (pages != oldpages)
		printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
			pages << PAGE_SHIFT);
	last_area_end = addr + mem;
	allocated += mem;
	return 1;
}

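/* Note that a hot-add area was seen and extend end_pfn so the area
   is covered by the mem_map and the zones. */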
static int update_end_of_memory(unsigned long end)
{
	found_add_area = 1;
	if ((end >> PAGE_SHIFT) > end_pfn)
		end_pfn = end >> PAGE_SHIFT;
	return 1;
}

static inline int save_add_info(void)
{
	return hotadd_percent > 0;
}
#else
static int update_end_of_memory(unsigned long end) {return -1;}
static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static inline int save_add_info(void) {return 1;}
#else
static inline int save_add_info(void) {return 0;}
#endif
#endif
/*
 * Update nodes_add and decide whether to include the add area in the zone.
 * Both SPARSE and RESERVE need the nodes_add information.
 * This code supports one contiguous hot-add area per node.
 */
static int reserve_hotadd(int node, unsigned long start, unsigned long end)
{
	unsigned long s_pfn = start >> PAGE_SHIFT;
	unsigned long e_pfn = end >> PAGE_SHIFT;
	int ret = 0, changed = 0;
	struct bootnode *nd = &nodes_add[node];

	/* I had some trouble with strange memory hotadd regions breaking
	   the boot. Be very strict here and reject anything unexpected.
	   If you want working memory hotadd, write correct SRATs.

	   The node size check is a basic sanity check to guard against
	   mistakes */
	if ((signed long)(end - start) < NODE_MIN_SIZE) {
		printk(KERN_ERR "SRAT: Hotplug area too small\n");
		return -1;
	}

	/* This check might be a bit too strict, but I'm keeping it for now. */
	if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
		printk(KERN_ERR
			"SRAT: Hotplug area %lu -> %lu has existing memory\n",
			s_pfn, e_pfn);
		return -1;
	}

	if (!hotadd_enough_memory(nd)) {
		printk(KERN_ERR "SRAT: Hotplug area too large\n");
		return -1;
	}

	/* Looks good */

	if (nd->start == nd->end) {
		nd->start = start;
		nd->end = end;
		changed = 1;
	} else {
		if (nd->start == end) {
			nd->start = start;
			changed = 1;
		}
		if (nd->end == start) {
			nd->end = end;
			changed = 1;
		}
		if (!changed)
			printk(KERN_ERR "SRAT: Hotplug zone not contiguous. Partly ignored\n");
	}

	ret = update_end_of_memory(nd->end);

	if (changed)
		printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
			nd->start, nd->end);
	return ret;
}

/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
{
	struct bootnode *nd, oldnode;
	unsigned long start, end;
	int node, pxm;
	int i;

	if (srat_disabled())
		return;
	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
		bad_srat();
		return;
	}
	if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
		return;

	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
		return;
	start = ma->base_address;
	end = start + ma->length;
	pxm = ma->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
		bad_srat();
		return;
	}
	i = conflicting_nodes(start, end);
	if (i == node) {
		printk(KERN_WARNING
		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
			pxm, start, end, nodes[i].start, nodes[i].end);
	} else if (i >= 0) {
		printk(KERN_ERR
		       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
		       pxm, start, end, node_to_pxm(i),
		       nodes[i].start, nodes[i].end);
		bad_srat();
		return;
	}
	nd = &nodes[node];
	oldnode = *nd;
	/* First range seen for this node is taken verbatim; later ranges
	   grow the node to the union, holes included. */
	if (!node_test_and_set(node, nodes_parsed)) {
		nd->start = start;
		nd->end = end;
	} else {
		if (start < nd->start)
			nd->start = start;
		if (nd->end < end)
			nd->end = end;
	}

	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
	       nd->start, nd->end);
	e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
						nd->end >> PAGE_SHIFT);
	push_node_boundaries(node, nd->start >> PAGE_SHIFT,
						nd->end >> PAGE_SHIFT);

	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
	    (reserve_hotadd(node, start, end) < 0)) {
		/* Ignore the hotadd region; undo the damage */
		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
		*nd = oldnode;
		if ((nd->start | nd->end) == 0)
			node_clear(node, nodes_parsed);
	}
}

/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
static int nodes_cover_memory(void)
{
	int i;
	unsigned long pxmram, e820ram;

	pxmram = 0;
	for_each_node_mask(i, nodes_parsed) {
		unsigned long s = nodes[i].start >> PAGE_SHIFT;
		unsigned long e = nodes[i].end >> PAGE_SHIFT;
		pxmram += e - s;
		pxmram -= absent_pages_in_range(s, e);
		if ((long)pxmram < 0)
			pxmram = 0;
	}

	e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
	/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
	if ((long)(e820ram - pxmram) >= 1*1024*1024) {
		printk(KERN_ERR
	"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
			(pxmram << PAGE_SHIFT) >> 20,
			(e820ram << PAGE_SHIFT) >> 20);
		return 0;
	}
	return 1;
}

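/* Forget everything learned about a node that turned out to be unusable. */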
static void unparse_node(int node)
{
	int i;
	node_clear(node, nodes_parsed);
	for (i = 0; i < MAX_LOCAL_APIC; i++) {
		if (apicid_to_node[i] == node)
			apicid_to_node[i] = NUMA_NO_NODE;
	}
}

void __init acpi_numa_arch_fixup(void) {}

/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
	int i;

	/* First clean up the node list */
	for (i = 0; i < MAX_NUMNODES; i++) {
		cutoff_node(i, start, end);
		if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
			unparse_node(i);
			node_set_offline(i);
		}
	}

	if (acpi_numa <= 0)
		return -1;

	if (!nodes_cover_memory()) {
		bad_srat();
		return -1;
	}

	memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
	if (memnode_shift < 0) {
		printk(KERN_ERR
		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
		bad_srat();
		return -1;
	}

	node_possible_map = nodes_parsed;

	/* Finally register nodes */
	for_each_node_mask(i, node_possible_map)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	/* Try again in case setup_node_bootmem missed one due
	   to missing bootmem */
	for_each_node_mask(i, node_possible_map)
		if (!node_online(i))
			setup_node_bootmem(i, nodes[i].start, nodes[i].end);

	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_to_node[i] == NUMA_NO_NODE)
			continue;
		if (!node_isset(cpu_to_node[i], node_possible_map))
			numa_set_node(i, NUMA_NO_NODE);
	}
	numa_init_array();
	return 0;
}

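/*
 * Reserve the tracked hot-add range in the node's bootmem map so the
 * boot allocator never hands it out; the reported cost is the mem_map
 * pre-allocated to cover the not-yet-present memory.
 */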
void __init srat_reserve_add_area(int nodeid)
{
	if (found_add_area && nodes_add[nodeid].end) {
		u64 total_mb;

		printk(KERN_INFO "SRAT: Reserving hot-add memory space "
				"for node %d at %Lx-%Lx\n",
			nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
		total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
					>> PAGE_SHIFT;
		total_mb *= sizeof(struct page);
		total_mb >>= 20;
		printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
				"pre-allocated memory.\n", (unsigned long long)total_mb);
		reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
			       nodes_add[nodeid].end - nodes_add[nodeid].start);
	}
}

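/*
 * SLIT distances are indexed by proximity domain, not logical node id,
 * hence the node_to_pxm() translations. Without a valid SLIT, fall
 * back to the ACPI defaults: 10 for local, 20 for remote.
 */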
int __node_distance(int a, int b)
{
	int index;

	if (!acpi_slit)
		return a == b ? 10 : 20;
	index = acpi_slit->locality_count * node_to_pxm(a);
	return acpi_slit->entry[index + node_to_pxm(b)];
}

EXPORT_SYMBOL(__node_distance);

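/*
 * Find the node whose recorded hot-add window contains the newly added
 * physical address; fall back to node 0 if nothing matches.
 */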
int memory_add_physaddr_to_nid(u64 start)
{
	int i, ret = 0;

	for_each_node(i)
		if (nodes_add[i].start <= start && nodes_add[i].end > start)
			ret = i;

	return ret;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);