/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/errno.h>
#include <sys/var.h>
#include <sys/proc.h>
#include <sys/tuneable.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vm.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/vmparam.h>
#include <sys/fcntl.h>
#include <sys/lwpchan_impl.h>
#include <sys/nbmlock.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_dev.h>
#include <vm/seg_vn.h>

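/*
 * Tunables: when set (and the process has automatic large pages enabled
 * via SAUTOLPG), large page sizes are selected automatically for heap
 * (brk) and stack growth.
 */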
int use_brk_lpg = 1;
int use_stk_lpg = 1;

static int brk_lpg(caddr_t nva);
static int grow_lpg(caddr_t sp);

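/*
 * brk(2): resize the process heap (data segment) so that it ends at nva.
 * Returns 0 on success; otherwise returns the result of set_errno().
 */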
int
brk(caddr_t nva)
{
	int error;
	proc_t *p = curproc;

	/*
	 * Serialize brk operations on an address space.
	 * This also serves as the lock protecting p_brksize
	 * and p_brkpageszc.
	 */
	as_rangelock(p->p_as);
	if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
		error = brk_lpg(nva);
	} else {
		error = brk_internal(nva, p->p_brkpageszc);
	}
	as_rangeunlock(p->p_as);
	return ((error != 0 ? set_errno(error) : 0));
}

/*
 * Algorithm: call arch-specific map_pgsz to get best page size to use,
 * then call brk_internal().
 * Returns 0 on success.
 */
static int
brk_lpg(caddr_t nva)
{
	struct proc *p = curproc;
	size_t pgsz, len;
	caddr_t addr, brkend;
	caddr_t bssbase = p->p_bssbase;
	caddr_t brkbase = p->p_brkbase;
	int oszc, szc;
	int err;

	oszc = p->p_brkpageszc;

	/*
	 * If p_brkbase has not yet been set, the first call
	 * to brk_internal() will initialize it.
	 */
	if (brkbase == 0) {
		return (brk_internal(nva, oszc));
	}

	len = nva - bssbase;

	pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
	szc = page_szc(pgsz);

	/*
	 * Covers two cases:
	 * 1. page_szc() returns -1 for an invalid page size, so we
	 * ignore it in that case.
	 * 2. By design we never decrease the page size, as that is more
	 * stable.
	 */
	if (szc <= oszc) {
		err = brk_internal(nva, oszc);
		/* If that failed, back off to the base page size. */
		if (err != 0 && oszc != 0) {
			err = brk_internal(nva, 0);
		}
		return (err);
	}

	err = brk_internal(nva, szc);
	/* Mapping with szc failed; map with the base page size and return. */
	if (err != 0) {
		if (szc != 0) {
			err = brk_internal(nva, 0);
		}
		return (err);
	}

	/*
	 * Round up brk base to a large page boundary and remap
	 * anything in the segment already faulted in beyond that
	 * point.
	 */
	addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
	brkend = brkbase + p->p_brksize;
	len = brkend - addr;
	/* Check that len is not negative. Update page size code for heap. */
	if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
		p->p_brkpageszc = szc;
	}

	ASSERT(err == 0);
	return (err);		/* should always be 0 */
}

/*
 * Returns 0 on success.
 */
int
brk_internal(caddr_t nva, uint_t brkszc)
{
	caddr_t ova;			/* current break address */
	size_t size;
	int	error;
	struct proc *p = curproc;
	struct as *as = p->p_as;
	size_t pgsz;
	uint_t szc;
	rctl_qty_t as_rctl;

	/*
	 * extend heap to brkszc alignment but use current p->p_brkpageszc
	 * for the newly created segment. This allows the new extension
	 * segment to be concatenated successfully with the existing brk
	 * segment.
	 */
	if ((szc = brkszc) != 0) {
		pgsz = page_get_pagesize(szc);
		ASSERT(pgsz > PAGESIZE);
	} else {
		pgsz = PAGESIZE;
	}

	mutex_enter(&p->p_lock);
	as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
	    p->p_rctls, p);
	mutex_exit(&p->p_lock);

	/*
	 * If p_brkbase has not yet been set, the first call
	 * to brk() will initialize it.
	 */
	if (p->p_brkbase == 0)
		p->p_brkbase = nva;

	/*
	 * Before multiple page size support existed, p_brksize was the
	 * exact user-requested heap size, not rounded to the pagesize.
	 * If pgsz is greater than PAGESIZE, compute the new heap size by
	 * rounding it up to pgsz.  This is useful because we may want to
	 * know where the heap ends without knowing the heap pagesize
	 * (e.g. some old code), and if the heap pagesize changes we can
	 * update p_brkpageszc but delay adding the new mapping while still
	 * knowing from p_brksize where the heap really ends.  The
	 * user-requested heap end is stored in a libc variable.
	 */
	if (pgsz > PAGESIZE) {
		caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
		size = tnva - p->p_brkbase;
		if (tnva < p->p_brkbase || (size > p->p_brksize &&
		    size > (size_t)as_rctl)) {
			szc = 0;
			pgsz = PAGESIZE;
			size = nva - p->p_brkbase;
		}
	} else {
		size = nva - p->p_brkbase;
	}

	/*
	 * Use PAGESIZE to round up ova because we want to know the real
	 * value of the current heap end in case p_brkpageszc has changed
	 * since the last time p_brksize was computed.
	 */
	nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
	ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
	    PAGESIZE);

	if ((nva < p->p_brkbase) || (size > p->p_brksize &&
	    size > as_rctl)) {
		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
		    RCA_SAFE);
		mutex_exit(&p->p_lock);
		return (ENOMEM);
	}

	if (nva > ova) {
		struct segvn_crargs crargs =
		    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

		if (!(p->p_datprot & PROT_EXEC)) {
			crargs.prot &= ~PROT_EXEC;
		}

		/*
		 * Add a new zfod mapping to extend the UNIX data segment.
		 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
		 * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
		 * page sizes if ova is not aligned to szc's pgsz.
		 */
		if (szc > 0) {
			caddr_t rbss;

			rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
			    pgsz);
			if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
				crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
				    AS_MAP_NO_LPOOB;
			} else if (ova == rbss) {
				crargs.szc = szc;
			} else {
				crargs.szc = AS_MAP_HEAP;
			}
		} else {
			crargs.szc = AS_MAP_NO_LPOOB;
		}
		crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
		error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
		    &crargs);
		if (error) {
			return (error);
		}

	} else if (nva < ova) {
		/*
		 * Release mapping to shrink UNIX data segment.
		 */
		(void) as_unmap(as, nva, (size_t)(ova - nva));
	}
	p->p_brksize = size;
	return (0);
}

/*
 * Grow the stack to include sp.  Return 1 if successful, 0 otherwise.
 * This routine assumes that the stack grows downward.
 */
int
grow(caddr_t sp)
{
	struct proc *p = curproc;
	struct as *as = p->p_as;
	size_t oldsize = p->p_stksize;
	size_t newsize;
	int err;

	/*
	 * Serialize grow operations on an address space.
	 * This also serves as the lock protecting p_stksize
	 * and p_stkpageszc.
	 */
	as_rangelock(as);
	if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
		err = grow_lpg(sp);
	} else {
		err = grow_internal(sp, p->p_stkpageszc);
	}
	as_rangeunlock(as);

	if (err == 0 && (newsize = p->p_stksize) > oldsize) {
		ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
		ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
		/*
		 * Set up translations so the process doesn't have to fault in
		 * the stack pages we just gave it.
		 */
		(void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
		    newsize - oldsize, F_INVAL, S_WRITE);
	}
	return ((err == 0 ? 1 : 0));
}

/*
 * Algorithm: call arch-specific map_pgsz to get best page size to use,
 * then call grow_internal().
 * Returns 0 on success.
 */
static int
grow_lpg(caddr_t sp)
{
	struct proc *p = curproc;
	size_t pgsz;
	size_t len, newsize;
	caddr_t addr, saddr;
	caddr_t growend;
	int oszc, szc;
	int err;

	newsize = p->p_usrstack - sp;

	oszc = p->p_stkpageszc;
	pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
	szc = page_szc(pgsz);

	/*
	 * Covers two cases:
	 * 1. page_szc() returns -1 for an invalid page size, so we
	 * ignore it in that case.
	 * 2. By design we never decrease the page size, as that is more
	 * stable.  This shouldn't happen anyway, as the stack never shrinks.
	 */
	if (szc <= oszc) {
		err = grow_internal(sp, oszc);
		/* If that failed, fall back to the base page size. */
		if (err != 0 && oszc != 0) {
			err = grow_internal(sp, 0);
		}
		return (err);
	}

	/*
	 * We've grown sufficiently to switch to a new page size, so
	 * remap the whole segment with the new page size.
	 */
	err = grow_internal(sp, szc);
	/* If growing with szc failed, fall back to the base page size. */
	if (err != 0) {
		if (szc != 0) {
			err = grow_internal(sp, 0);
		}
		return (err);
	}

	/*
	 * Round up stack pointer to a large page boundary and remap
	 * any pgsz pages in the segment already faulted in beyond that
	 * point.
	 */
	saddr = p->p_usrstack - p->p_stksize;
	addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
	growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
	len = growend - addr;
	/* Check that len is not negative. Update page size code for stack. */
	if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
		p->p_stkpageszc = szc;
	}

	ASSERT(err == 0);
	return (err);		/* should always be 0 */
}

/*
 * This routine assumes that the stack grows downward.
 * Returns 0 on success, errno on failure.
 */
int
grow_internal(caddr_t sp, uint_t growszc)
{
	struct proc *p = curproc;
	size_t newsize;
	size_t oldsize;
	int    error;
	size_t pgsz;
	uint_t szc;
	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

	ASSERT(sp < p->p_usrstack);
	sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);

	/*
	 * Grow to growszc alignment, but use the current p->p_stkpageszc
	 * for the segvn_crargs szc passed to segvn_create.  When memcntl
	 * is used to increase the szc, this allows the new extension
	 * segment to be concatenated successfully with the existing stack
	 * segment.
	 */
	if ((szc = growszc) != 0) {
		pgsz = page_get_pagesize(szc);
		ASSERT(pgsz > PAGESIZE);
		newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
		if (newsize > (size_t)p->p_stk_ctl) {
			szc = 0;
			pgsz = PAGESIZE;
			newsize = p->p_usrstack - sp;
		}
	} else {
		pgsz = PAGESIZE;
		newsize = p->p_usrstack - sp;
	}

	if (newsize > (size_t)p->p_stk_ctl) {
		(void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
		    RCA_UNSAFE_ALL);

		return (ENOMEM);
	}

	oldsize = p->p_stksize;
	ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);

	if (newsize <= oldsize) {	/* prevent the stack from shrinking */
		return (0);
	}

	if (!(p->p_stkprot & PROT_EXEC)) {
		crargs.prot &= ~PROT_EXEC;
	}
	/*
	 * Extend the stack with the proposed new growszc, which differs
	 * from p_stkpageszc only on a memcntl to increase the stack
	 * pagesize.  AS_MAP_NO_LPOOB means use 0, and don't reapply OOB
	 * policies via map_pgszcvec().  Use AS_MAP_STACK to get
	 * intermediate page sizes if not aligned to szc's pgsz.
	 */
	if (szc > 0) {
		caddr_t oldsp = p->p_usrstack - oldsize;
		caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
		    pgsz);

		if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
			crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
			    AS_MAP_NO_LPOOB;
		} else if (oldsp == austk) {
			crargs.szc = szc;
		} else {
			crargs.szc = AS_MAP_STACK;
		}
	} else {
		crargs.szc = AS_MAP_NO_LPOOB;
	}
	crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;

	if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
	    segvn_create, &crargs)) != 0) {
		if (error == EAGAIN) {
			cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
			    "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
		}
		return (error);
	}
	p->p_stksize = newsize;
	return (0);
}

/*
 * Find address for user to map.
 * If MAP_FIXED is not specified, we can pick any address we want, but we will
 * first try the value in *addrp if it is non-NULL.  Thus this is implementing
 * a way to try to get a preferred address.
 */
int
choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
    int vacalign, uint_t flags)
{
	caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
	size_t lenp = len;

	ASSERT(AS_ISCLAIMGAP(as));	/* searches should be serialized */
	if (flags & MAP_FIXED) {
		(void) as_unmap(as, *addrp, len);
		return (0);
	} else if (basep != NULL && ((flags & MAP_ALIGN) == 0) &&
	    !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
		/* User supplied address was available */
		*addrp = basep;
	} else {
		/*
		 * No user supplied address or the address supplied was not
		 * available.
		 */
		map_addr(addrp, len, off, vacalign, flags);
	}
	if (*addrp == NULL)
		return (ENOMEM);
	return (0);
}

/*
 * Used for MAP_ANON - fast way to get anonymous pages
 */
static int
zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
    offset_t pos)
{
	struct segvn_crargs vn_a;
	int error;

	if ((PROT_ALL & uprot) != uprot)
		return (EACCES);

	if ((flags & MAP_FIXED) != 0) {
		caddr_t userlimit;

		/*
		 * Use the user address.  First verify that
		 * the address to be used is page aligned.
		 * Then make some simple bounds checks.
		 */
		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
			return (EINVAL);

		userlimit = flags & _MAP_LOW32 ?
		    (caddr_t)USERLIMIT32 : as->a_userlimit;
		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
		case RANGE_OKAY:
			break;
		case RANGE_BADPROT:
			return (ENOTSUP);
		case RANGE_BADADDR:
		default:
			return (ENOMEM);
		}
	}
	/*
	 * No need to worry about vac alignment for anonymous
	 * pages since this is a "clone" object that doesn't
	 * yet exist.
	 */
	error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
	if (error != 0) {
		return (error);
	}

	/*
	 * Use the seg_vn segment driver; passing in the NULL amp
	 * gives the desired "cloning" effect.
	 */
	vn_a.vp = NULL;
	vn_a.offset = 0;
	vn_a.type = flags & MAP_TYPE;
	vn_a.prot = uprot;
	vn_a.maxprot = PROT_ALL;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.cred = CRED();
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	return (as_map(as, *addrp, len, segvn_create, &vn_a));
}

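/*
 * Common guts of mmap(2) and its variants: validate the flags and
 * protections, resolve the mapping address, and hand the work off to
 * zmap() for anonymous mappings or to VOP_MAP() for mappings backed
 * by a file.
 */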
static int
smmap_common(caddr_t *addrp, size_t len,
    int prot, int flags, struct file *fp, offset_t pos)
{
	struct vnode *vp;
	struct as *as = curproc->p_as;
	uint_t uprot, maxprot, type;
	int error;
	int in_crit = 0;

	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
	    _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
	    MAP_TEXT | MAP_INITDATA)) != 0) {
		/* | MAP_RENAME */	/* not implemented, let user know */
		return (EINVAL);
	}

	if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
		return (EINVAL);
	}

	if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
		return (EINVAL);
	}

#if defined(__sparc)
	/*
	 * See if this is an "old mmap call".  If so, remember this
	 * fact and convert the flags value given to mmap to indicate
	 * the specified address in the system call must be used.
	 * _MAP_NEW is set by all new uses of mmap.
	 */
	if ((flags & _MAP_NEW) == 0)
		flags |= MAP_FIXED;
#endif
	flags &= ~_MAP_NEW;

	type = flags & MAP_TYPE;
	if (type != MAP_PRIVATE && type != MAP_SHARED)
		return (EINVAL);

	if (flags & MAP_ALIGN) {

		if (flags & MAP_FIXED)
			return (EINVAL);

		/* alignment needs to be a power of 2 >= page size */
		if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
		    !ISP2((uintptr_t)*addrp))
			return (EINVAL);
	}
	/*
	 * Check for bad lengths and file position.
	 * We let the VOP_MAP routine check for negative lengths
	 * since on some vnode types this might be appropriate.
	 */
	if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
		return (EINVAL);

	maxprot = PROT_ALL;		/* start out allowing all accesses */
	uprot = prot | PROT_USER;

	if (fp == NULL) {
		ASSERT(flags & MAP_ANON);
		/* discard lwpchan mappings, like munmap() */
		if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
			lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
		as_rangelock(as);
		error = zmap(as, addrp, len, uprot, flags, pos);
		as_rangeunlock(as);
		/*
		 * Tell machine specific code that lwp has mapped shared memory
		 */
		if (error == 0 && (flags & MAP_SHARED)) {
			/* EMPTY */
			LWP_MMODEL_SHARED_AS(*addrp, len);
		}
		return (error);
	} else if ((flags & MAP_ANON) != 0)
		return (EINVAL);

	vp = fp->f_vnode;

	/* Can't execute code from "noexec" mounted filesystem. */
	if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
		maxprot &= ~PROT_EXEC;

	/*
	 * These checks were added as part of large file support.
	 *
	 * Return ENXIO if the initial position is negative; return EOVERFLOW
	 * if (offset + len) would overflow the maximum allowed offset for the
	 * type of file descriptor being used.
	 */
	if (vp->v_type == VREG) {
		if (pos < 0)
			return (ENXIO);
		if ((offset_t)len > (OFFSET_MAX(fp) - pos))
			return (EOVERFLOW);
	}

	if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
		/* no write access allowed */
		maxprot &= ~PROT_WRITE;
	}

	/*
	 * XXX - Do we also adjust maxprot based on protections
	 * of the vnode?  E.g. if no execute permission is given
	 * on the vnode for the current user, maxprot probably
	 * should disallow PROT_EXEC also?  This is different
	 * from the write access as this would be a per vnode
	 * test as opposed to a per fd test for writability.
	 */

	/*
	 * Verify that the specified protections are not greater than
	 * the maximum allowable protections.  Also test to make sure
	 * that the file descriptor allows read access, since "write only"
	 * mappings are hard to do because normally we read from the file
	 * before the page can be written.
	 */
	if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
		return (EACCES);

	/*
	 * If the user specified an address, do some simple checks here
	 */
	if ((flags & MAP_FIXED) != 0) {
		caddr_t userlimit;

		/*
		 * Use the user address.  First verify that
		 * the address to be used is page aligned.
		 * Then make some simple bounds checks.
		 */
		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
			return (EINVAL);

		userlimit = flags & _MAP_LOW32 ?
		    (caddr_t)USERLIMIT32 : as->a_userlimit;
		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
		case RANGE_OKAY:
			break;
		case RANGE_BADPROT:
			return (ENOTSUP);
		case RANGE_BADADDR:
		default:
			return (ENOMEM);
		}
	}

	if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
	    nbl_need_check(vp)) {
		int svmand;
		nbl_op_t nop;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto done;
		if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
			if (prot & (PROT_READ | PROT_EXEC)) {
				nop = NBL_READWRITE;
			} else {
				nop = NBL_WRITE;
			}
		} else {
			nop = NBL_READ;
		}
		if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
			error = EACCES;
			goto done;
		}
	}

	/* discard lwpchan mappings, like munmap() */
	if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
		lwpchan_delete_mapping(curproc, *addrp, *addrp + len);

	/*
	 * Ok, now let the vnode map routine do its thing to set things up.
	 */
	error = VOP_MAP(vp, pos, as,
	    addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);

	if (error == 0) {
		/*
		 * Tell machine specific code that lwp has mapped shared memory
		 */
		if (flags & MAP_SHARED) {
			/* EMPTY */
			LWP_MMODEL_SHARED_AS(*addrp, len);
		}
		if (vp->v_type == VREG &&
		    (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
			/*
			 * Mark this as an executable vnode
			 */
			mutex_enter(&vp->v_lock);
			vp->v_flag |= VVMEXEC;
			mutex_exit(&vp->v_lock);
		}
	}

done:
	if (in_crit)
		nbl_end_crit(vp);
	return (error);
}

#ifdef _LP64
/*
 * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
 *
 * The "large file" mmap routine mmap64(2) is also mapped to this routine
 * by the 64-bit version of libc.
 *
 * Eventually, this should be the only version, and have smmap_common()
 * folded back into it again.  Some day.
 */
caddr_t
smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
{
	struct file *fp;
	int error;

	if (flags & _MAP_LOW32)
		error = EINVAL;
	else if (fd == -1 && (flags & MAP_ANON) != 0)
		error = smmap_common(&addr, len, prot, flags,
		    NULL, (offset_t)pos);
	else if ((fp = getf(fd)) != NULL) {
		error = smmap_common(&addr, len, prot, flags,
		    fp, (offset_t)pos);
		releasef(fd);
	} else
		error = EBADF;

	return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
}
#endif	/* _LP64 */

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)

/*
 * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
 */
caddr_t
smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
{
	struct file *fp;
	int error;
	caddr_t a = (caddr_t)(uintptr_t)addr;

	if (flags & _MAP_LOW32)
		error = EINVAL;
	else if (fd == -1 && (flags & MAP_ANON) != 0)
		error = smmap_common(&a, (size_t)len, prot,
		    flags | _MAP_LOW32, NULL, (offset_t)pos);
	else if ((fp = getf(fd)) != NULL) {
		error = smmap_common(&a, (size_t)len, prot,
		    flags | _MAP_LOW32, fp, (offset_t)pos);
		releasef(fd);
	} else
		error = EBADF;

	ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);

	return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
}

/*
 * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
 *
 * Now things really get ugly because we can't use the C-style
 * calling convention for more than 6 args, and 64-bit parameter
 * passing on 32-bit systems is less than clean.
 */

struct mmaplf32a {
	caddr_t addr;
	size_t len;
#ifdef _LP64
	/*
	 * 32-bit contents, 64-bit cells
	 */
	uint64_t prot;
	uint64_t flags;
	uint64_t fd;
	uint64_t offhi;
	uint64_t offlo;
#else
	/*
	 * 32-bit contents, 32-bit cells
	 */
	uint32_t prot;
	uint32_t flags;
	uint32_t fd;
	uint32_t offhi;
	uint32_t offlo;
#endif
};

int
smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
{
	struct file *fp;
	int error;
	caddr_t a = uap->addr;
	int flags = (int)uap->flags;
	int fd = (int)uap->fd;
#ifdef _BIG_ENDIAN
	offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
#else
	offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
#endif

	if (flags & _MAP_LOW32)
		error = EINVAL;
	else if (fd == -1 && (flags & MAP_ANON) != 0)
		error = smmap_common(&a, uap->len, (int)uap->prot,
		    flags | _MAP_LOW32, NULL, off);
	else if ((fp = getf(fd)) != NULL) {
		error = smmap_common(&a, uap->len, (int)uap->prot,
		    flags | _MAP_LOW32, fp, off);
		releasef(fd);
	} else
		error = EBADF;

	if (error == 0)
		rvp->r_val1 = (uintptr_t)a;
	return (error);
}

#endif	/* _SYSCALL32_IMPL || _ILP32 */

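/*
 * munmap(2): remove any mappings in the range [addr, addr + len).
 */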
int
munmap(caddr_t addr, size_t len)
{
	struct proc *p = curproc;
	struct as *as = p->p_as;

	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
		return (set_errno(EINVAL));

	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
		return (set_errno(EINVAL));

	/*
	 * Discard lwpchan mappings.
	 */
	if (p->p_lcp != NULL)
		lwpchan_delete_mapping(p, addr, addr + len);
	if (as_unmap(as, addr, len) != 0)
		return (set_errno(EINVAL));

	return (0);
}

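/*
 * mprotect(2): change the protections of mappings in [addr, addr + len).
 */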
int
mprotect(caddr_t addr, size_t len, int prot)
{
	struct as *as = curproc->p_as;
	uint_t uprot = prot | PROT_USER;
	int error;

	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
		return (set_errno(EINVAL));

	switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
	case RANGE_OKAY:
		break;
	case RANGE_BADPROT:
		return (set_errno(ENOTSUP));
	case RANGE_BADADDR:
	default:
		return (set_errno(ENOMEM));
	}

	error = as_setprot(as, addr, len, uprot);
	if (error)
		return (set_errno(error));
	return (0);
}

#define	MC_CACHE	128			/* internal result buffer */
#define	MC_QUANTUM	(MC_CACHE * PAGESIZE)	/* addresses covered in loop */

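/*
 * mincore(2): report residency of the pages mapped at [addr, addr + len),
 * one byte per page copied out to vecp.
 */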
int
mincore(caddr_t addr, size_t len, char *vecp)
{
	struct as *as = curproc->p_as;
	caddr_t ea;			/* end address of loop */
	size_t rl;			/* inner result length */
	char vec[MC_CACHE];		/* local vector cache */
	int error;
	model_t model;
	long	llen;

	model = get_udatamodel();
	/*
	 * Validate form of address parameters.
	 */
	if (model == DATAMODEL_NATIVE) {
		llen = (long)len;
	} else {
		llen = (int32_t)(size32_t)len;
	}
	if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
		return (set_errno(EINVAL));

	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
		return (set_errno(ENOMEM));

	/*
	 * Loop over subranges of the interval [addr : addr + len),
	 * recovering results internally and then copying them out to
	 * the caller.  Each subrange is based on the size of MC_CACHE,
	 * defined above.
	 */
	for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
		error = as_incore(as, addr,
		    (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
		if (rl != 0) {
			rl = (rl + PAGESIZE - 1) / PAGESIZE;
			if (copyout(vec, vecp, rl) != 0)
				return (set_errno(EFAULT));
			vecp += rl;
		}
		if (error != 0)
			return (set_errno(ENOMEM));
	}
	return (0);
}