vm_dep.c revision 11173:87f3734e64df
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/vm.h>
#include <sys/exec.h>

#include <sys/exechdr.h>
#include <vm/seg_kmem.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kdi.h>
#include <sys/cpu_module.h>

#include <vm/hat_sfmmu.h>

#include <sys/memnode.h>

#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <vm/vm_dep.h>
#include <vm/page.h>
#include <sys/platform_module.h>

/*
 * These variables are set by module specific config routines.
 * They are only set by modules which will use physical cache page coloring.
 */
int do_pg_coloring = 0;

/*
 * These variables can be conveniently patched at kernel load time to
 * prevent do_pg_coloring from being enabled by
 * module specific config routines.
 */

int use_page_coloring = 1;

/*
 * initialized by page_coloring_init()
 */
extern uint_t page_colors;
extern uint_t page_colors_mask;
extern uint_t page_coloring_shift;
int cpu_page_colors;
uint_t vac_colors = 0;
uint_t vac_colors_mask = 0;

/* cpu specific coloring initialization */
extern void page_coloring_init_cpu();
#pragma weak page_coloring_init_cpu

/*
 * get the ecache setsize for the current cpu.
 */
#define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)

plcnt_t		plcnt;		/* page list count */

/*
 * This variable is set by the cpu module to contain the lowest
 * address not affected by the SF_ERRATA_57 workaround.  It should
 * remain 0 if the workaround is not needed.
 */
#if defined(SF_ERRATA_57)
caddr_t errata57_limit;
#endif

extern void page_relocate_hash(page_t *, page_t *);

/*
 * these must be defined in platform specific areas
 */
extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
	struct proc *, uint_t);
extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
	caddr_t, size_t, uint_t, struct lgrp *);
/*
 * Convert page frame number to an OBMEM page frame number
 * (i.e. put in the type bits -- zero for this implementation)
 */
pfn_t
impl_obmem_pfnum(pfn_t pf)
{
	return (pf);
}

/*
 * Use physmax to determine the highest physical page of DRAM memory.
 * It is assumed that any physical address above physmax is in IO space.
 * We don't bother checking the low end because we assume that memory space
 * begins at physical page frame 0.
 *
 * Return 1 if the page frame is onboard DRAM memory, else 0.
 * Returns 0 for nvram so it won't be cached.
 */
int
pf_is_memory(pfn_t pf)
{
	/* We must be IO space */
	if (pf > physmax)
		return (0);

	/* We must be memory space */
	return (1);
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
{
	struct as *as;
	struct proc *p;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	if (iskernel) {
		as = &kas;
	} else {
		p = curproc;
		as = p->p_as;
#if defined(SF_ERRATA_57)
		/*
		 * Prevent infinite loops due to a segment driver
		 * setting the execute permissions and the sfmmu hat
		 * silently ignoring them.
		 */
		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
		    addr < errata57_limit) {
			res = FC_NOMAP;
			goto out;
		}
#endif
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(as->a_hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (!(res == FC_NOMAP && iskernel == 0))
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {		/* data seg? */
		base = (caddr_t)(p->p_usrstack - p->p_stksize);
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * The rest of this function implements SunOS 3.x/4.x/5.x
	 * compatibility.  This code is probably not needed anymore.
	 */

	/* expand the gap to the page boundaries on each side */
	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
	    ((uintptr_t)base & PAGEMASK);
	base = (caddr_t)((uintptr_t)base & PAGEMASK);

	as_rangelock(as);
	as_purge(as);
	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
		err = as_map(as, base, len, segvn_create, zfod_argsp);
		as_rangeunlock(as);
		if (err) {
			res = FC_MAKE_ERR(err);
			goto out;
		}
	} else {
		/*
		 * This page is already mapped by another thread after we
		 * returned from as_fault() above.  We just fall through to
		 * as_fault() below.
		 */
		as_rangeunlock(as);
	}

	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);

out:

	return (res);
}

/*
 * This is the routine which defines the address limit implied
 * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
 * mappable address in a 32-bit process on this platform (though
 * perhaps we should make it be UINT32_MAX here?)
 */
void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = flags & _MAP_LOW32 ?
	    (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
}

/*
 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
 */
caddr_t	hole_start, hole_end;

/*
 * kpm mapping window
 */
caddr_t kpm_vbase;
size_t  kpm_size;
uchar_t kpm_size_shift;

int valid_va_range_aligned_wraparound;
/*
 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 * addresses at least "minlen" long, where the base of the range is at "off"
 * phase from an "align" boundary and there is space for a "redzone"-sized
 * redzone on either side of the range.  On success, 1 is returned and *basep
 * and *lenp are adjusted to describe the acceptable range (including
 * the redzone).  On failure, 0 is returned.
 */
int
valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
    size_t align, size_t redzone, size_t off)
{
	caddr_t hi, lo;
	size_t tot_len;

	ASSERT(align == 0 ? off == 0 : off < align);
	ASSERT(ISP2(align));
	ASSERT(align == 0 || align >= PAGESIZE);

	lo = *basep;
	hi = lo + *lenp;
	tot_len = minlen + 2 * redzone;	/* need at least this much space */

	/* If hi rolled over the top try cutting back. */
	if (hi < lo) {
		*lenp = 0UL - (uintptr_t)lo - 1UL;
		/* Trying to see if this really happens, and then if so, why */
		valid_va_range_aligned_wraparound++;
		hi = lo + *lenp;
	}
	if (*lenp < tot_len) {
		return (0);
	}

	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped by the MMU.
	 */

	if (lo < hole_start) {
		if (hi > hole_start)
			if (hi < hole_end)
				hi = hole_start;
			else
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= tot_len)
						hi = hole_start;
					else if (hi - hole_end >= tot_len)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= tot_len)
						lo = hole_end;
					else if (hole_start - lo >= tot_len)
						hi = hole_start;
					else
						return (0);
				}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	/* Check if remaining length is too small */
	if (hi - lo < tot_len) {
		return (0);
	}
	if (align > 1) {
		caddr_t tlo = lo + redzone;
		caddr_t thi = hi - redzone;
		tlo = (caddr_t)P2PHASEUP((uintptr_t)tlo, align, off);
		if (tlo < lo + redzone) {
			return (0);
		}
		if (thi < tlo || thi - tlo < minlen) {
			return (0);
		}
	}
	*basep = lo;
	*lenp = hi - lo;
	return (1);
}

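/*
 * Worked example of the hole carving above (hypothetical hole bounds,
 * not any particular CPU's): with hole_start = 0x10000000, hole_end =
 * 0x20000000, *basep = 0x0f000000, *lenp = 0x13000000, minlen =
 * 0x02000000, dir = AH_LO, and no alignment or redzone, the preferred
 * low side only offers hole_start - lo = 0x01000000 bytes, which is
 * less than tot_len, so the range is carved to the high side instead:
 * on return *basep = 0x20000000 (hole_end) and *lenp = 0x02000000.
 */
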
/*
 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 * addresses at least "minlen" long.  On success, 1 is returned and *basep
 * and *lenp are adjusted to describe the acceptable range.  On failure, 0
 * is returned.
 */
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
}

/*
 * Determine whether [addr, addr + len) with protections `prot' is valid
 * for a user address space.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

	/*
	 * Determine if the address range falls within an illegal
	 * range of the MMU.
	 */
	if (eaddr > hole_start && addr < hole_end)
		return (RANGE_BADADDR);

#if defined(SF_ERRATA_57)
	/*
	 * Make sure USERLIMIT isn't raised too high
	 */
	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
	    errata57_limit == 0);

	if (AS_TYPE_64BIT(as) &&
	    (addr < errata57_limit) &&
	    (prot & PROT_EXEC))
		return (RANGE_BADPROT);
#endif /* SF_ERRATA_57 */
	return (RANGE_OKAY);
}

/*
 * Routine used to check whether an a.out can be executed
 * by the current machine/architecture.
 */
int
chkaout(struct exdata *exp)
{
	if (exp->ux_mach == M_SPARC)
		return (0);
	else
		return (ENOEXEC);
}

/*
 * The following functions return information about an a.out
 * which is used when a program is executed.
 */

/*
 * Return the load memory address for the data segment.
 */
caddr_t
getdmem(struct exec *exp)
{
	/*
	 * XXX - Sparc Reference Hack approaching
	 * Remember that we are loading
	 * 8k executables into a 4k machine
	 * DATA_ALIGN == 2 * PAGESIZE
	 */
	if (exp->a_text)
		return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
	else
		return ((caddr_t)USRTEXT);
}

/*
 * Return the starting disk address for the data segment.
 */
ulong_t
getdfile(struct exec *exp)
{
	if (exp->a_magic == ZMAGIC)
		return (exp->a_text);
	else
		return (sizeof (struct exec) + exp->a_text);
}

/*
 * Return the load memory address for the text segment.
 */

/*ARGSUSED*/
caddr_t
gettmem(struct exec *exp)
{
	return ((caddr_t)USRTEXT);
}

/*
 * Return the file byte offset for the text segment.
 */
uint_t
gettfile(struct exec *exp)
{
	if (exp->a_magic == ZMAGIC)
		return (0);
	else
		return (sizeof (struct exec));
}

void
getexinfo(
	struct exdata *edp_in,
	struct exdata *edp_out,
	int *pagetext,
	int *pagedata)
{
	*edp_out = *edp_in;	/* structure copy */

	if ((edp_in->ux_mag == ZMAGIC) &&
	    ((edp_in->vp->v_flag & VNOMAP) == 0)) {
		*pagetext = 1;
		*pagedata = 1;
	} else {
		*pagetext = 0;
		*pagedata = 0;
	}
}

492/*
493 * Return non 0 value if the address may cause a VAC alias with KPM mappings.
494 * KPM selects an address such that it's equal offset modulo shm_alignment and
495 * assumes it can't be in VAC conflict with any larger than PAGESIZE mapping.
496 */
497int
498map_addr_vacalign_check(caddr_t addr, u_offset_t off)
499{
500	if (vac) {
501		return (((uintptr_t)addr ^ off) & shm_alignment - 1);
502	} else {
503		return (0);
504	}
505}
506
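/*
 * Example of the check above (hypothetical sizes): with 8K pages and a
 * 64K virtually-addressed cache, shm_alignment would be 64K.  Then an
 * addr of 0x10000 paired with an off of 0x2000 gives
 * (0x10000 ^ 0x2000) & 0xffff == 0x2000, a nonzero result, i.e. the
 * mapping could alias in the VAC; an off of 0x0 gives 0 and is safe.
 */
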
/*
 * Sanity control.  Don't use large pages regardless of user
 * settings if there's less than privm_lpg_min_physmem or
 * shm_lpg_min_physmem memory installed.  The units for these
 * variables are 8K pages.
 */
pgcnt_t shm_lpg_min_physmem = 131072;			/* 1GB */
pgcnt_t privm_lpg_min_physmem = 131072;			/* 1GB */

static size_t
map_pgszheap(struct proc *p, caddr_t addr, size_t len)
{
	size_t		pgsz = MMU_PAGESIZE;
	int		szc;

	/*
	 * If len is zero, retrieve from proc and don't demote the page size.
	 * Use at least the default pagesize.
	 */
	if (len == 0) {
		len = p->p_brkbase + p->p_brksize - p->p_bssbase;
	}
	len = MAX(len, default_uheap_lpsize);

	for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
		pgsz = hw_page_array[szc].hp_size;
		if ((disable_auto_data_large_pages & (1 << szc)) ||
		    pgsz > max_uheap_lpsize)
			continue;
		if (len >= pgsz) {
			break;
		}
	}

	/*
	 * If addr == 0 we were called by memcntl() when the
	 * size code is 0.  Don't set pgsz less than current size.
	 */
	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
	}

	return (pgsz);
}

static size_t
map_pgszstk(struct proc *p, caddr_t addr, size_t len)
{
	size_t		pgsz = MMU_PAGESIZE;
	int		szc;

	/*
	 * If len is zero, retrieve from proc and don't demote the page size.
	 * Use at least the default pagesize.
	 */
	if (len == 0) {
		len = p->p_stksize;
	}
	len = MAX(len, default_ustack_lpsize);

	for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
		pgsz = hw_page_array[szc].hp_size;
		if ((disable_auto_data_large_pages & (1 << szc)) ||
		    pgsz > max_ustack_lpsize)
			continue;
		if (len >= pgsz) {
			break;
		}
	}

	/*
	 * If addr == 0 we were called by memcntl() or exec_args() when the
	 * size code is 0.  Don't set pgsz less than current size.
	 */
	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
	}

	return (pgsz);
}

587static size_t
588map_pgszism(caddr_t addr, size_t len)
589{
590	uint_t szc;
591	size_t pgsz;
592
593	for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
594		if (disable_ism_large_pages & (1 << szc))
595			continue;
596
597		pgsz = hw_page_array[szc].hp_size;
598		if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
599			return (pgsz);
600	}
601
602	return (DEFAULT_ISM_PAGESIZE);
603}
604
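/*
 * Example: a 16MB ISM segment whose base address is 4M-aligned picks
 * the largest enabled size that both fits and aligns -- e.g. 4M pages
 * when TTE4M is enabled and the larger sizes are disabled or too big;
 * a segment smaller than 4M, or one at an unaligned address, falls
 * back to DEFAULT_ISM_PAGESIZE.
 */
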
/*
 * Suggest a page size to be used to map a segment of type maptype and length
 * len.  Returns a page size (not a size code).
 */
/* ARGSUSED */
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
{
	size_t	pgsz = MMU_PAGESIZE;

	ASSERT(maptype != MAPPGSZ_VA);

	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
		return (MMU_PAGESIZE);
	}

	switch (maptype) {
	case MAPPGSZ_ISM:
		pgsz = map_pgszism(addr, len);
		break;

	case MAPPGSZ_STK:
		if (max_ustack_lpsize > MMU_PAGESIZE) {
			pgsz = map_pgszstk(p, addr, len);
		}
		break;

	case MAPPGSZ_HEAP:
		if (max_uheap_lpsize > MMU_PAGESIZE) {
			pgsz = map_pgszheap(p, addr, len);
		}
		break;
	}
	return (pgsz);
}


/* assumes TTE8K...TTE4M == szc */

static uint_t
map_szcvec(caddr_t addr, size_t size, uintptr_t off, int disable_lpgs,
    size_t max_lpsize, size_t min_physmem)
{
	caddr_t eaddr = addr + size;
	uint_t szcvec = 0;
	caddr_t raddr;
	caddr_t readdr;
	size_t pgsz;
	int i;

	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
		return (0);
	}
	for (i = mmu_page_sizes - 1; i > 0; i--) {
		if (disable_lpgs & (1 << i)) {
			continue;
		}
		pgsz = page_get_pagesize(i);
		if (pgsz > max_lpsize) {
			continue;
		}
		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		if (raddr < addr || raddr >= readdr) {
			continue;
		}
		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
			continue;
		}
		szcvec |= (1 << i);
		/*
		 * Also OR in the remaining enabled smaller page sizes.
		 */
		szcvec |= P2PHASE(~disable_lpgs, (1 << i));
		szcvec &= ~1; /* no need to return 8K pagesize */
		break;
	}
	return (szcvec);
}

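/*
 * Example (hypothetical disable mask): suppose only szc 1 (64K) and
 * szc 3 (4M) are enabled, i.e. disable_lpgs has bits 2, 4, and up set.
 * For a 4M-aligned, 8MB request the loop stops at i == 3, sets bit 3,
 * then ORs in P2PHASE(~disable_lpgs, 1 << 3) == 0x3 and clears the 8K
 * bit, returning szcvec == 0x0a (4M and 64K).
 */
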
/*
 * Return a bit vector of large page size codes that
 * can be used to map the [addr, addr + size) region.
 */
/* ARGSUSED */
uint_t
map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
    int memcntl)
{
	if (flags & MAP_TEXT) {
		return (map_szcvec(addr, size, off,
		    disable_auto_text_large_pages,
		    max_utext_lpsize, shm_lpg_min_physmem));

	} else if (flags & MAP_INITDATA) {
		return (map_szcvec(addr, size, off,
		    disable_auto_data_large_pages,
		    max_uidata_lpsize, privm_lpg_min_physmem));

	} else if (type == MAPPGSZC_SHM) {
		return (map_szcvec(addr, size, off,
		    disable_auto_data_large_pages,
		    max_shm_lpsize, shm_lpg_min_physmem));

	} else if (type == MAPPGSZC_HEAP) {
		return (map_szcvec(addr, size, off,
		    disable_auto_data_large_pages,
		    max_uheap_lpsize, privm_lpg_min_physmem));

	} else if (type == MAPPGSZC_STACK) {
		return (map_szcvec(addr, size, off,
		    disable_auto_data_large_pages,
		    max_ustack_lpsize, privm_lpg_min_physmem));

	} else {
		return (map_szcvec(addr, size, off,
		    disable_auto_data_large_pages,
		    max_privmap_lpsize, privm_lpg_min_physmem));
	}
}

/*
 * Anchored in the table below are counters used to keep track
 * of free contiguous physical memory.  Each element of the table contains
 * the array of counters, the size of the array (which is allocated during
 * startup based on physmax), and a shift value used to convert a pagenum
 * into a counter array index or vice versa.  The table has page size
 * for rows and region size for columns:
 *
 *	page_counters[page_size][region_size]
 *
 *	page_size: 	TTE size code of pages on page_size freelist.
 *
 *	region_size:	TTE size code of a candidate larger page made up
 *			of contiguous free page_size pages.
 *
 * As you go across a page_size row increasing region_size, each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page.  Yuck!  Let's try an example:
 *
 * 	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M.  It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages.	 So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
 */

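/*
 * In code, the indexing sketched above is plain shift arithmetic; a
 * minimal sketch (illustrative names, not the real accessor macros):
 *
 *	counters cover 4M spans, and 4M is 2^22 bytes, so
 *	idx = pnum >> (22 - MMU_PAGESHIFT);
 *	if (counters[idx] == 8)
 *		pnum lies within a candidate 4M page
 */
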
/*
 * Per page size free lists.  3rd (max_mem_nodes) and 4th (page coloring bins)
 * dimensions are allocated dynamically.
 */
page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];

/*
 * For now there is only a single size cache list.
 * Allocated dynamically.
 */
page_t ***page_cachelists[MAX_MEM_TYPES];

kmutex_t *fpc_mutex[NPC_MUTEX];
kmutex_t *cpc_mutex[NPC_MUTEX];

/*
 * Calculate space needed for page freelists and counters
 */
size_t
calc_free_pagelist_sz(void)
{
	int szc;
	size_t alloc_sz, cache_sz, free_sz;

	/*
	 * one cachelist per color, node, and type
	 */
	cache_sz = (page_get_pagecolors(0) * sizeof (page_t *)) +
	    sizeof (page_t **);
	cache_sz *= max_mem_nodes * MAX_MEM_TYPES;

	/*
	 * one freelist per size, color, node, and type
	 */
	free_sz = sizeof (page_t **);
	for (szc = 0; szc < mmu_page_sizes; szc++)
		free_sz += sizeof (page_t *) * page_get_pagecolors(szc);
	free_sz *= max_mem_nodes * MAX_MEM_TYPES;

	alloc_sz = cache_sz + free_sz + page_ctrs_sz();
	return (alloc_sz);
}

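/*
 * Sizing example (hypothetical configuration): with one memnode, two
 * memory types, 128 8K colors, and 64-bit pointers, the cachelist
 * portion is (128 * 8 + 8) * 1 * 2 = 2064 bytes; each freelist row
 * similarly contributes 8 bytes per color at each page size, plus the
 * page_ctrs_sz() counter space on top.
 */
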
caddr_t
alloc_page_freelists(caddr_t alloc_base)
{
	int	mnode, mtype;
	int	szc, clrs;

	/*
	 * We only support small pages in the cachelist.
	 */
	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
		page_cachelists[mtype] = (page_t ***)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (page_t **));
		for (mnode = 0; mnode < max_mem_nodes; mnode++) {
			page_cachelists[mtype][mnode] = (page_t **)alloc_base;
			alloc_base +=
			    (page_get_pagecolors(0) * sizeof (page_t *));
		}
	}

	/*
	 * Allocate freelists bins for all
	 * supported page sizes.
	 */
	for (szc = 0; szc < mmu_page_sizes; szc++) {
		clrs = page_get_pagecolors(szc);
		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
			page_freelists[szc][mtype] = (page_t ***)alloc_base;
			alloc_base += (max_mem_nodes * sizeof (page_t **));
			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
				page_freelists[szc][mtype][mnode] =
				    (page_t **)alloc_base;
				alloc_base += (clrs * (sizeof (page_t *)));
			}
		}
	}

	alloc_base = page_ctrs_alloc(alloc_base);
	return (alloc_base);
}

/*
 * Allocate page_freelists locks for a memnode from the nucleus data
 * area.  This is the first time that mmu_page_sizes is used during
 * bootup, so check mmu_page_sizes initialization.
 */
int
ndata_alloc_page_mutexs(struct memlist *ndata)
{
	size_t alloc_sz;
	caddr_t alloc_base;
	int	i;
	void	page_coloring_init();

	page_coloring_init();
	if (&mmu_init_mmu_page_sizes) {
		if (!mmu_init_mmu_page_sizes(0)) {
			cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
			    mmu_page_sizes);
		}
	}
	ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);

	/* fpc_mutex and cpc_mutex */
	alloc_sz = 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);

	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
	if (alloc_base == NULL)
		return (-1);

	ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);

	for (i = 0; i < NPC_MUTEX; i++) {
		fpc_mutex[i] = (kmutex_t *)alloc_base;
		alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
		cpc_mutex[i] = (kmutex_t *)alloc_base;
		alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
	}
	return (0);
}

/*
 * To select our starting bin, we stride through the bins with a stride
 * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
 * in simulation and practice for different workloads on varying cache sizes.
 */
uint32_t color_start_current = 0;
uint32_t color_start_stride = 337;
int color_start_random = 0;

/* ARGSUSED */
uint_t
get_color_start(struct as *as)
{
	uint32_t old, new;

	if (consistent_coloring == 2 || color_start_random) {
		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
		    (hw_page_array[0].hp_colors - 1)));
	}

	do {
		old = color_start_current;
		new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
	} while (cas32(&color_start_current, old, new) != old);

	return ((uint_t)(new));
}

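/*
 * The stride works because bin counts are powers of two: 337 is odd,
 * so gcd(337, 2^n) == 1 and successive starting bins (taken modulo the
 * bin count by the callers) visit every bin before repeating.  The
 * cas32() loop just makes the read-increment-update atomic without a
 * lock, since several threads may request starting colors at once.
 */
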
/*
 * Called once at startup from kphysm_init() -- before memialloc()
 * is invoked to do the 1st page_free()/page_freelist_add().
 *
 * Initializes page_colors and page_colors_mask based on ecache_setsize.
 *
 * Also initializes the counter locks.
 */
void
page_coloring_init()
{
	int	a, i;
	uint_t colors;

	if (do_pg_coloring == 0) {
		page_colors = 1;
		for (i = 0; i < mmu_page_sizes; i++) {
			colorequivszc[i] = 0;
			hw_page_array[i].hp_colors = 1;
		}
		return;
	}

	/*
	 * Calculate page_colors from ecache_setsize.  ecache_setsize contains
	 * the max ecache setsize of all cpus configured in the system or, for
	 * cheetah+ systems, the max possible ecache setsize for all possible
	 * cheetah+ cpus.
	 */
	page_colors = ecache_setsize / MMU_PAGESIZE;
	page_colors_mask = page_colors - 1;

	vac_colors = vac_size / MMU_PAGESIZE;
	vac_colors_mask = vac_colors - 1;

	page_coloring_shift = 0;
	a = ecache_setsize;
	while (a >>= 1) {
		page_coloring_shift++;
	}

	/* initialize number of colors per page size */
	for (i = 0; i < mmu_page_sizes; i++) {
		hw_page_array[i].hp_colors = (page_colors_mask >>
		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
		    + 1;
		colorequivszc[i] = 0;
	}

	/*
	 * Initialize cpu_page_colors if ecache setsizes are homogeneous.
	 * cpu_page_colors is set to -1 during DR operation or during startup
	 * if setsizes are heterogeneous.
	 *
	 * The value of cpu_page_colors determines if additional color bins
	 * need to be checked for a particular color in the page_get routines.
	 */
	if (cpu_setsize > 0 && cpu_page_colors == 0 &&
	    cpu_setsize < ecache_setsize) {
		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
		a = lowbit(page_colors) - lowbit(cpu_page_colors);
		ASSERT(a > 0);
		ASSERT(a < 16);

		for (i = 0; i < mmu_page_sizes; i++) {
			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			ASSERT(a >= 0);

			/* higher 4 bits encode color equiv mask */
			colorequivszc[i] = (a << 4);
		}
	}

	/* do cpu specific color initialization */
	if (&page_coloring_init_cpu) {
		page_coloring_init_cpu();
	}
}

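/*
 * Worked numbers (hypothetical machine): a 1MB direct-mapped external
 * cache has a 1MB setsize, so with 8K base pages page_colors = 1MB/8K
 * = 128 and page_colors_mask = 127; a 64K VAC gives vac_colors = 8.
 * A 64K page size (hp_shift 16 vs. 13 for 8K) then gets
 * (127 >> (16 - 13)) + 1 = 16 colors.
 */
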
int
bp_color(struct buf *bp)
{
	int color = -1;

	if (vac) {
		if ((bp->b_flags & B_PAGEIO) != 0) {
			color = sfmmu_get_ppvcolor(bp->b_pages);
		} else if (bp->b_un.b_addr != NULL) {
			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
		}
	}
	return (color < 0 ? 0 : ptob(color));
}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Stubbed out on all platforms except sun4u,
 * at least for now.
 */
void
dcache_flushall()
{
	sfmmu_cache_flushall();
}

static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	if (va1 < va2 && va1 + sz1 <= va2)
		return (0);

	if (va2 < va1 && va2 + sz2 <= va1)
		return (0);

	return (1);
}

/*
 * Return the number of bytes, relative to the beginning of a given range, that
 * are non-toxic (can be read from and written to with relative impunity).
 */
size_t
kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
{
	/* OBP reads are harmless, but we don't want people writing there */
	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
	    OFW_START_ADDR + 1))
		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);

	if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
		return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);

	return (sz); /* no overlap */
}

/*
 * Minimum physmem required for enabling large pages for kernel heap.
 * Currently we do not enable lp for kmem on systems with less
 * than 1GB of memory.  This value can be changed via /etc/system.
 */
size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */

/*
 * This function chooses the large page size for the kernel heap.
 */
size_t
get_segkmem_lpsize(size_t lpsize)
{
	size_t memtotal = physmem * PAGESIZE;
	size_t mmusz;
	uint_t szc;

	if (memtotal < segkmem_lpminphysmem)
		return (PAGESIZE);

	if (plat_lpkmem_is_supported != NULL &&
	    plat_lpkmem_is_supported() == 0)
		return (PAGESIZE);

	mmusz = mmu_get_kernel_lpsize(lpsize);
	szc = page_szc(mmusz);

	while (szc) {
		if (!(disable_large_pages & (1 << szc)))
			return (page_get_pagesize(szc));
		szc--;
	}
	return (PAGESIZE);
}