memnode.c revision 4769:291956cbfc21
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/systm.h>
#include <sys/platform_module.h>
#include <sys/sysmacros.h>
#include <sys/atomic.h>
#include <sys/memlist.h>
#include <sys/memnode.h>
#include <vm/vm_dep.h>

int max_mem_nodes = 1;		/* max memory nodes on this system */

struct mem_node_conf mem_node_config[MAX_MEM_NODES];
int mem_node_pfn_shift;
/*
 * num_memnodes must be updated atomically and must always be >=
 * the number of bits set in memnodes_mask, or the algorithm may fail.
 */
uint16_t num_memnodes;
mnodeset_t memnodes_mask; /* assumes 8*(sizeof(mnodeset_t)) >= MAX_MEM_NODES */

/*
 * If set, mem_node_physalign should be a power of two, and
 * should reflect the minimum address alignment of each node.
 */
uint64_t mem_node_physalign;

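/*
 * For example (hypothetical values), a platform whose memory
 * controllers decode 4GB slices might set mem_node_physalign to
 * (1ULL << 32); mem_node_add_slice() and mem_node_post_del_slice()
 * then round slice boundaries to multiples of btop(4GB) pages.
 */
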
/*
 * Platform hooks we will need.
 */

#pragma weak plat_build_mem_nodes
#pragma weak plat_slice_add
#pragma weak plat_slice_del
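
/*
 * These hooks are weak so that platforms which do not supply them
 * still link: the address of an undefined weak symbol evaluates to
 * NULL, which is why each call below is guarded by a test such as
 * "if (&plat_slice_add != NULL)".
 */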

/*
 * Adjust the memnode config after a DR operation.
 *
 * It is rather tricky to do these updates since we can't
 * protect the memnode structures with locks, so we must
 * be mindful of the order in which updates to and reads of
 * these values can occur.
 */
void
mem_node_add_slice(pfn_t start, pfn_t end)
{
	int mnode;
	mnodeset_t newmask, oldmask;

	/*
	 * DR will pass us the first pfn that is allocatable.
	 * We need to round down to get the real start of
	 * the slice.
	 */
	if (mem_node_physalign) {
		start &= ~(btop(mem_node_physalign) - 1);
		end = roundup(end, btop(mem_node_physalign)) - 1;
	}

	if (&plat_slice_add != NULL)
		plat_slice_add(start, end);

	mnode = PFN_2_MEM_NODE(start);
	ASSERT(mnode < max_mem_nodes);

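	/*
	 * cas32() returns the old value of "exists": a nonzero return
	 * means the node already existed and this slice only extends
	 * its span; zero means this CAS just claimed the node, so it
	 * must be initialized from scratch.
	 */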
	if (cas32((uint32_t *)&mem_node_config[mnode].exists, 0, 1)) {
		/*
		 * Add slice to existing node.
		 */
		if (start < mem_node_config[mnode].physbase)
			mem_node_config[mnode].physbase = start;
		if (end > mem_node_config[mnode].physmax)
			mem_node_config[mnode].physmax = end;
	} else {
		mem_node_config[mnode].physbase = start;
		mem_node_config[mnode].physmax = end;
		atomic_add_16(&num_memnodes, 1);
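		/*
		 * num_memnodes is bumped before the node's bit is set
		 * in memnodes_mask, so the invariant noted at the top
		 * of this file (num_memnodes >= bits set in the mask)
		 * holds at every point.  The CAS loop retries if
		 * another CPU changed the mask in the meantime.
		 */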
		do {
			oldmask = memnodes_mask;
			newmask = memnodes_mask | (1ull << mnode);
		} while (cas64(&memnodes_mask, oldmask, newmask) != oldmask);
	}
	/*
	 * Let the common lgrp framework know about the new memory
	 */
	lgrp_config(LGRP_CONFIG_MEM_ADD, mnode, MEM_NODE_2_LGRPHAND(mnode));
}

/* ARGSUSED */
void
mem_node_pre_del_slice(pfn_t start, pfn_t end)
{
	int mnode = PFN_2_MEM_NODE(start);

	ASSERT(mnode < max_mem_nodes);
	ASSERT(mem_node_config[mnode].exists == 1);
}

/*
 * Remove a PFN range from a memnode.  On some platforms,
 * the memnode will be created with physbase at the first
 * allocatable PFN, but later deleted with the MC slice
 * base address converted to a PFN, in which case we need
 * to assume physbase and up.
 */
void
mem_node_post_del_slice(pfn_t start, pfn_t end, int cancelled)
{
	int mnode;
	pgcnt_t delta_pgcnt, node_size;
	mnodeset_t omask, nmask;

	if (mem_node_physalign) {
		start &= ~(btop(mem_node_physalign) - 1);
		end = roundup(end, btop(mem_node_physalign)) - 1;
	}
	mnode = PFN_2_MEM_NODE(start);

	ASSERT(mnode < max_mem_nodes);
	ASSERT(mem_node_config[mnode].exists == 1);

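	/*
	 * A cancelled DR delete requires no rollback here: the
	 * pre-delete step only asserted that the node exists, so the
	 * memnode config is still intact.
	 */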
	if (!cancelled) {
		delta_pgcnt = end - start;
		node_size = mem_node_config[mnode].physmax -
		    mem_node_config[mnode].physbase;

		if (node_size > delta_pgcnt) {
			/*
			 * Subtract the slice from the memnode.
			 */
			if (start <= mem_node_config[mnode].physbase)
				mem_node_config[mnode].physbase = end + 1;
			ASSERT(end <= mem_node_config[mnode].physmax);
			if (end == mem_node_config[mnode].physmax)
				mem_node_config[mnode].physmax = start - 1;
		} else {

			/*
			 * Let the common lgrp framework know the mnode is
			 * leaving
			 */
			lgrp_config(LGRP_CONFIG_MEM_DEL, mnode,
			    MEM_NODE_2_LGRPHAND(mnode));

			/*
			 * Delete the whole node.
			 */
			ASSERT(MNODE_PGCNT(mnode) == 0);
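			/*
			 * Tear down in the reverse of the creation
			 * order: clear the node's bit in memnodes_mask
			 * first, then decrement num_memnodes, then
			 * clear "exists", so num_memnodes never drops
			 * below the number of bits set in the mask.
			 */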
			do {
				omask = memnodes_mask;
				nmask = omask & ~(1ull << mnode);
			} while (cas64(&memnodes_mask, omask, nmask) != omask);
			atomic_add_16(&num_memnodes, -1);
			mem_node_config[mnode].exists = 0;
		}

		if (&plat_slice_del != NULL)
			plat_slice_del(start, end);
	}
}

void
startup_build_mem_nodes(u_longlong_t *list, size_t nelems)
{
	size_t	elem;
	pfn_t	basepfn;
	pgcnt_t	npgs;

	/* LINTED: ASSERT will always be true or false */
	ASSERT(NBBY * sizeof (mnodeset_t) >= max_mem_nodes);

	if (&plat_build_mem_nodes != NULL) {
		plat_build_mem_nodes(list, nelems);
	} else {
		/*
		 * Boot install lists are arranged <addr, len>, ...
		 */
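		/*
		 * For example (hypothetical values): list[0] = 0x0 and
		 * list[1] = 0x40000000 describe a 1GB slice starting at
		 * physical address 0, i.e. pfns 0 through btop(1GB) - 1.
		 */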
		for (elem = 0; elem < nelems; elem += 2) {
			basepfn = btop(list[elem]);
			npgs = btop(list[elem+1]);
			mem_node_add_slice(basepfn, basepfn + npgs - 1);
		}
		mem_node_physalign = 0;
		mem_node_pfn_shift = 0;
	}
}

/*
 * Allocate an unassigned memnode.
 */
int
mem_node_alloc()
{
	int mnode;
	mnodeset_t newmask, oldmask;

	/*
	 * Find an unused memnode.  Update it atomically to prevent
	 * a first time memnode creation race.
	 */
	for (mnode = 0; mnode < max_mem_nodes; mnode++)
		if (cas32((uint32_t *)&mem_node_config[mnode].exists,
		    0, 1) == 0)
			break;

	if (mnode >= max_mem_nodes)
		panic("Out of free memnodes\n");

	mem_node_config[mnode].physbase = (uint64_t)-1;
	mem_node_config[mnode].physmax = 0;
	atomic_add_16(&num_memnodes, 1);
	do {
		oldmask = memnodes_mask;
		newmask = memnodes_mask | (1ull << mnode);
	} while (cas64(&memnodes_mask, oldmask, newmask) != oldmask);

	return (mnode);
}

/*
 * Find the intersection between a memnode and a memlist
 * and return the number of pages that overlap.
 *
 * Grab the memlist lock to protect the list from DR operations.
 */
pgcnt_t
mem_node_memlist_pages(int mnode, struct memlist *mlist)
{
	pfn_t		base, end;
	pfn_t		cur_base, cur_end;
	pgcnt_t		npgs = 0;
	pgcnt_t		pages;
	struct memlist	*pmem;

	if (&plat_mem_node_intersect_range != NULL) {
		memlist_read_lock();

		for (pmem = mlist; pmem; pmem = pmem->next) {
			plat_mem_node_intersect_range(btop(pmem->address),
			    btop(pmem->size), mnode, &pages);
			npgs += pages;
		}

		memlist_read_unlock();
		return (npgs);
	}

	base = mem_node_config[mnode].physbase;
	end = mem_node_config[mnode].physmax;

	memlist_read_lock();

	for (pmem = mlist; pmem; pmem = pmem->next) {
		cur_base = btop(pmem->address);
		cur_end = cur_base + btop(pmem->size) - 1;
		if (end < cur_base || base > cur_end)
			continue;
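		/*
		 * The ranges overlap; the intersection runs from
		 * MAX(cur_base, base) to MIN(cur_end, end) inclusive,
		 * hence the + 1 in the page count below.
		 */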
		npgs = npgs + (MIN(cur_end, end) -
		    MAX(cur_base, base)) + 1;
	}

	memlist_read_unlock();

	return (npgs);
}

/*
 * Find MIN(physbase) and MAX(physmax) over all mnodes
 *
 * Called during startup and DR to find hpm_counters limits when
 * interleaved_mnodes is set.
 * NOTE: there is a race condition with DR if it tries to change more than
 * one mnode in parallel. Sizing shared hpm_counters depends on finding the
 * min(physbase) and max(physmax) across all mnodes. Therefore, the caller of
 * page_ctrs_adjust must ensure that mem_node_config does not change while it
 * is running.
 */
void
mem_node_max_range(pfn_t *basep, pfn_t *maxp)
{
	int mnode;
	pfn_t max = 0;
	pfn_t base = (pfn_t)-1;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;
		if (max < mem_node_config[mnode].physmax)
			max = mem_node_config[mnode].physmax;
		if (base > mem_node_config[mnode].physbase)
			base = mem_node_config[mnode].physbase;
	}
	ASSERT(base != (pfn_t)-1 && max != 0);
	*basep = base;
	*maxp = max;
}