/*	$NetBSD: uvm_mmap.c,v 1.143 2012/01/05 15:19:53 reinoud Exp $	*/

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993 The Regents of the University of California.
 * Copyright (c) 1988 University of Utah.
 *
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *      @(#)vm_mmap.c   8.5 (Berkeley) 5/19/94
 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
 */

/*
 * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
 * function.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_mmap.c,v 1.143 2012/01/05 15:19:53 reinoud Exp $");

#include "opt_compat_netbsd.h"
#include "opt_pax.h"
#include "veriexec.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/resourcevar.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/stat.h>

#if NVERIEXEC > 0
#include <sys/verified_exec.h>
#endif /* NVERIEXEC > 0 */

#if defined(PAX_ASLR) || defined(PAX_MPROTECT)
#include <sys/pax.h>
#endif /* PAX_ASLR || PAX_MPROTECT */

#include <miscfs/specfs/specdev.h>

#include <sys/syscallargs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_device.h>

#ifndef COMPAT_ZERODEV
#define COMPAT_ZERODEV(dev)	(0)
#endif

static int
range_test(vaddr_t addr, vsize_t size, bool ismmap)
{
	vaddr_t vm_min_address = VM_MIN_ADDRESS;
	vaddr_t vm_max_address = VM_MAXUSER_ADDRESS;
	vaddr_t eaddr = addr + size;

	if (addr < vm_min_address)
		return EINVAL;
	if (eaddr > vm_max_address)
		return ismmap ? EFBIG : EINVAL;
	if (addr > eaddr) /* no wrapping! */
		return ismmap ? EOVERFLOW : EINVAL;
	return 0;
}

/*
 * unimplemented VM system calls:
 */

/*
 * sys_sbrk: sbrk system call.
 */

/* ARGSUSED */
int
sys_sbrk(struct lwp *l, const struct sys_sbrk_args *uap, register_t *retval)
{
	/* {
		syscallarg(intptr_t) incr;
	} */

	return (ENOSYS);
}

/*
 * sys_sstk: sstk system call.
 */

/* ARGSUSED */
int
sys_sstk(struct lwp *l, const struct sys_sstk_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) incr;
	} */

	return (ENOSYS);
}

/*
 * sys_mincore: determine if pages are in core or not.
 */
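
/*
 * Illustrative userland sketch (not from the original sources) of how
 * the per-page status bytes written below are typically consumed;
 * "base" and "npages" are assumed to describe an existing mapping:
 *
 *	size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
 *	char vec[npages];
 *
 *	if (mincore(base, npages * pgsz, vec) == 0) {
 *		for (size_t i = 0; i < npages; i++)
 *			if (vec[i] & 1)
 *				printf("page %zu is resident\n", i);
 *	}
 */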

/* ARGSUSED */
int
sys_mincore(struct lwp *l, const struct sys_mincore_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(char *) vec;
	} */
	struct proc *p = l->l_proc;
	struct vm_page *pg;
	char *vec, pgi;
	struct uvm_object *uobj;
	struct vm_amap *amap;
	struct vm_anon *anon;
	struct vm_map_entry *entry;
	vaddr_t start, end, lim;
	struct vm_map *map;
	vsize_t len;
	int error = 0, npgs;

	map = &p->p_vmspace->vm_map;

	start = (vaddr_t)SCARG(uap, addr);
	len = SCARG(uap, len);
	vec = SCARG(uap, vec);

	if (start & PAGE_MASK)
		return (EINVAL);
	len = round_page(len);
	end = start + len;
	if (end <= start)
		return (EINVAL);

	/*
	 * Lock down vec, so our returned status isn't outdated by
	 * storing the status byte for a page.
	 */

	npgs = len >> PAGE_SHIFT;
	error = uvm_vslock(p->p_vmspace, vec, npgs, VM_PROT_WRITE);
	if (error) {
		return error;
	}
	vm_map_lock_read(map);

	if (uvm_map_lookup_entry(map, start, &entry) == false) {
		error = ENOMEM;
		goto out;
	}

	for (/* nothing */;
	     entry != &map->header && entry->start < end;
	     entry = entry->next) {
		KASSERT(!UVM_ET_ISSUBMAP(entry));
		KASSERT(start >= entry->start);

		/* Make sure there are no holes. */
		if (entry->end < end &&
		     (entry->next == &map->header ||
		      entry->next->start > entry->end)) {
			error = ENOMEM;
			goto out;
		}

		lim = end < entry->end ? end : entry->end;

		/*
		 * Special case for objects with no "real" pages.  Those
		 * are always considered resident (mapped devices).
		 */

		if (UVM_ET_ISOBJ(entry)) {
			KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
			if (UVM_OBJ_IS_DEVICE(entry->object.uvm_obj)) {
				for (/* nothing */; start < lim;
				     start += PAGE_SIZE, vec++)
					subyte(vec, 1);
				continue;
			}
		}

		amap = entry->aref.ar_amap;	/* upper layer */
		uobj = entry->object.uvm_obj;	/* lower layer */

		if (amap != NULL)
			amap_lock(amap);
		if (uobj != NULL)
			mutex_enter(uobj->vmobjlock);

		for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
			pgi = 0;
			if (amap != NULL) {
				/* Check the upper layer first. */
				anon = amap_lookup(&entry->aref,
				    start - entry->start);
				/* Don't need to lock anon here. */
				if (anon != NULL && anon->an_page != NULL) {

					/*
					 * Anon has the page for this entry
					 * offset.
					 */

					pgi = 1;
				}
			}
			if (uobj != NULL && pgi == 0) {
				/* Check the lower layer. */
				pg = uvm_pagelookup(uobj,
				    entry->offset + (start - entry->start));
				if (pg != NULL) {

					/*
					 * Object has the page for this entry
					 * offset.
					 */

					pgi = 1;
				}
			}
			(void) subyte(vec, pgi);
		}
		if (uobj != NULL)
			mutex_exit(uobj->vmobjlock);
		if (amap != NULL)
			amap_unlock(amap);
	}

 out:
	vm_map_unlock_read(map);
	uvm_vsunlock(p->p_vmspace, SCARG(uap, vec), npgs);
	return (error);
}

/*
 * sys_mmap: mmap system call.
 *
 * => file offset and address need not be page aligned
 *    - if MAP_FIXED, the offset and address must have the same remainder
 *      modulo PAGE_SIZE
 *    - if the address isn't page aligned, the mapping starts at
 *      trunc_page(addr) and the return value is adjusted up by the
 *      page offset.
 */
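
/*
 * Worked example (illustrative only, not from the original sources):
 * with PAGE_SIZE = 0x1000, a hypothetical call such as
 *
 *	mmap(NULL, 100, PROT_READ, MAP_SHARED, fd, 0x1234);
 *
 * is handled below by splitting the offset into pos = 0x1000 and
 * pageoff = 0x234, growing the request to round_page(100 + 0x234) =
 * 0x1000, mapping one page of the file starting at offset 0x1000, and
 * returning the chosen address plus 0x234.
 */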

int
sys_mmap(struct lwp *l, const struct sys_mmap_args *uap, register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
		syscallarg(int) flags;
		syscallarg(int) fd;
		syscallarg(long) pad;
		syscallarg(off_t) pos;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	struct vattr va;
	off_t pos;
	vsize_t size, pageoff;
	vm_prot_t prot, maxprot;
	int flags, fd;
	vaddr_t defaddr;
	struct file *fp = NULL;
	struct vnode *vp;
	void *handle;
	int error;
#ifdef PAX_ASLR
	vaddr_t orig_addr;
#endif /* PAX_ASLR */

	/*
	 * first, extract syscall args from the uap.
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot) & VM_PROT_ALL;
	flags = SCARG(uap, flags);
	fd = SCARG(uap, fd);
	pos = SCARG(uap, pos);

#ifdef PAX_ASLR
	orig_addr = addr;
#endif /* PAX_ASLR */

	/*
	 * Fixup the old deprecated MAP_COPY into MAP_PRIVATE, and
	 * validate the flags.
	 */
	if (flags & MAP_COPY)
		flags = (flags & ~MAP_COPY) | MAP_PRIVATE;
	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
		return (EINVAL);

	/*
	 * align file position and save offset.  adjust size.
	 */

	pageoff = (pos & PAGE_MASK);
	pos  -= pageoff;
	size += pageoff;			/* add offset */
	size = (vsize_t)round_page(size);	/* round up */

	/*
	 * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
	 */
	if (flags & MAP_FIXED) {

		/* ensure address and file offset are aligned properly */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		error = range_test(addr, size, true);
		if (error)
			return error;
	} else if (addr == 0 || !(flags & MAP_TRYFIXED)) {

		/*
		 * not fixed: make sure we skip over the largest
		 * possible heap for non-topdown mapping arrangements.
		 * we will refine our guess later (e.g. to account for
		 * VAC, etc)
		 */

		defaddr = p->p_emul->e_vm_default_addr(p,
		    (vaddr_t)p->p_vmspace->vm_daddr, size);

		if (addr == 0 ||
		    !(p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN))
			addr = MAX(addr, defaddr);
		else
			addr = MIN(addr, defaddr);
	}

	/*
	 * check for file mappings (i.e. not anonymous) and verify file.
	 */

	if ((flags & MAP_ANON) == 0) {
		if ((fp = fd_getfile(fd)) == NULL)
			return (EBADF);
		if (fp->f_type != DTYPE_VNODE) {
			fd_putfile(fd);
			return (ENODEV);		/* only mmap vnodes! */
		}
		vp = fp->f_data;		/* convert to vnode */
		if (vp->v_type != VREG && vp->v_type != VCHR &&
		    vp->v_type != VBLK) {
			fd_putfile(fd);
			return (ENODEV);  /* only REG/CHR/BLK support mmap */
		}
		if (vp->v_type != VCHR && pos < 0) {
			fd_putfile(fd);
			return (EINVAL);
		}
		if (vp->v_type != VCHR && (off_t)(pos + size) < pos) {
			fd_putfile(fd);
			return (EOVERFLOW);		/* no offset wrapping */
		}

		/* special case: catch SunOS style /dev/zero */
		if (vp->v_type == VCHR
		    && (vp->v_rdev == zerodev || COMPAT_ZERODEV(vp->v_rdev))) {
			flags |= MAP_ANON;
			fd_putfile(fd);
			fp = NULL;
			goto is_anon;
		}

		/*
		 * Old programs may not select a specific sharing type, so
		 * default to an appropriate one.
		 *
		 * XXX: how does MAP_ANON fit in the picture?
		 */
		if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
#if defined(DEBUG)
			printf("WARNING: defaulted mmap() share type to "
			   "%s (pid %d command %s)\n", vp->v_type == VCHR ?
			   "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
			    p->p_comm);
#endif
			if (vp->v_type == VCHR)
				flags |= MAP_SHARED;	/* for a device */
			else
				flags |= MAP_PRIVATE;	/* for a file */
		}

		/*
		 * MAP_PRIVATE device mappings don't make sense (and aren't
		 * supported anyway).  However, some programs rely on this,
		 * so just change it to MAP_SHARED.
		 */
		if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
			flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
		}

		/*
		 * now check protection
		 */

		maxprot = VM_PROT_EXECUTE;

		/* check read access */
		if (fp->f_flag & FREAD)
			maxprot |= VM_PROT_READ;
		else if (prot & PROT_READ) {
			fd_putfile(fd);
			return (EACCES);
		}

		/* check write access, shared case first */
		if (flags & MAP_SHARED) {
			/*
			 * If the file is writable, only add PROT_WRITE to
			 * maxprot if the file is not immutable, append-only,
			 * or a snapshot.  Otherwise, if PROT_WRITE was
			 * requested, return EPERM.
			 */
			if (fp->f_flag & FWRITE) {
				vn_lock(vp, LK_SHARED | LK_RETRY);
				error = VOP_GETATTR(vp, &va, l->l_cred);
				VOP_UNLOCK(vp);
				if (error) {
					fd_putfile(fd);
					return (error);
				}
				if ((va.va_flags &
				    (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0)
					maxprot |= VM_PROT_WRITE;
				else if (prot & PROT_WRITE) {
					fd_putfile(fd);
					return (EPERM);
				}
			}
			else if (prot & PROT_WRITE) {
				fd_putfile(fd);
				return (EACCES);
			}
		} else {
			/* MAP_PRIVATE mappings can always be written to */
			maxprot |= VM_PROT_WRITE;
		}
		handle = vp;

	} else {		/* MAP_ANON case */
		/*
		 * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
		 */
		if (fd != -1)
			return (EINVAL);

 is_anon:		/* label for SunOS style /dev/zero */
		handle = NULL;
		maxprot = VM_PROT_ALL;
		pos = 0;
	}

#if NVERIEXEC > 0
	if (handle != NULL) {
		/*
		 * Check if the file can be executed indirectly.
		 *
		 * XXX: This gives false warnings about "Incorrect access type"
		 * XXX: if the mapping is not executable. Harmless, but will be
		 * XXX: fixed as part of other changes.
		 */
		if (veriexec_verify(l, handle, "(mmap)", VERIEXEC_INDIRECT,
		    NULL)) {
			/*
			 * Don't allow executable mappings if we can't
			 * indirectly execute the file.
			 */
			if (prot & VM_PROT_EXECUTE) {
				if (fp != NULL)
					fd_putfile(fd);
				return (EPERM);
			}

			/*
			 * Strip the executable bit from 'maxprot' to make sure
			 * it can't be made executable later.
			 */
			maxprot &= ~VM_PROT_EXECUTE;
		}
	}
#endif /* NVERIEXEC > 0 */

#ifdef PAX_MPROTECT
	pax_mprotect(l, &prot, &maxprot);
#endif /* PAX_MPROTECT */

#ifdef PAX_ASLR
	pax_aslr(l, &addr, orig_addr, flags);
#endif /* PAX_ASLR */

	/*
	 * now let kernel internal function uvm_mmap do the work.
	 */

	error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
	    flags, handle, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);

	if (error == 0)
		/* remember to add offset */
		*retval = (register_t)(addr + pageoff);

	if (fp != NULL)
		fd_putfile(fd);

	return (error);
}

/*
 * sys___msync13: the msync system call (a front-end for flush)
 */
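
/*
 * Illustrative userland sketch (not from the original sources): a
 * typical caller flushes a dirty shared file mapping back to its
 * backing vnode with something like
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	memcpy(p, buf, len);
 *	msync(p, len, MS_SYNC);
 *
 * where "buf", "len" and "fd" are assumed to be set up elsewhere.
 * Below, MS_SYNC maps to PGO_CLEANIT | PGO_SYNCIO, MS_ASYNC to
 * PGO_CLEANIT alone, and MS_INVALIDATE additionally sets PGO_FREE.
 */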

int
sys___msync13(struct lwp *l, const struct sys___msync13_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) flags;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	struct vm_map *map;
	int error, rv, flags, uvmflags;

	/*
	 * extract syscall args from the uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	flags = SCARG(uap, flags);

	/* sanity check flags */
	if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
	    (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
	    (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
		return (EINVAL);
	if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
		flags |= MS_SYNC;

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	/*
	 * get map
	 */

	map = &p->p_vmspace->vm_map;

	/*
	 * XXXCDC: do we really need this semantic?
	 *
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages within the region containing addr".  Unfortunately, we
	 * don't really keep track of individual mmaps so we approximate
	 * by flushing the range of the map entry containing addr.
	 * This can be incorrect if the region splits or is coalesced
	 * with a neighbor.
	 */

	if (size == 0) {
		struct vm_map_entry *entry;

		vm_map_lock_read(map);
		rv = uvm_map_lookup_entry(map, addr, &entry);
		if (rv == true) {
			addr = entry->start;
			size = entry->end - entry->start;
		}
		vm_map_unlock_read(map);
		if (rv == false)
			return (EINVAL);
	}

	/*
	 * translate MS_ flags into PGO_ flags
	 */

	uvmflags = PGO_CLEANIT;
	if (flags & MS_INVALIDATE)
		uvmflags |= PGO_FREE;
	if (flags & MS_SYNC)
		uvmflags |= PGO_SYNCIO;

	error = uvm_map_clean(map, addr, addr+size, uvmflags);
	return error;
}

/*
 * sys_munmap: unmap a user's memory
 */

int
sys_munmap(struct lwp *l, const struct sys_munmap_args *uap, register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	struct vm_map *map;
	struct vm_map_entry *dead_entries;
	int error;

	/*
	 * get syscall args.
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	if (size == 0)
		return (0);

	error = range_test(addr, size, false);
	if (error)
		return error;

	map = &p->p_vmspace->vm_map;

	/*
	 * interesting system call semantic: make sure entire range is
	 * allocated before allowing an unmap.
	 */

	vm_map_lock(map);
#if 0
	if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
		vm_map_unlock(map);
		return (EINVAL);
	}
#endif
	uvm_unmap_remove(map, addr, addr + size, &dead_entries, 0);
	vm_map_unlock(map);
	if (dead_entries != NULL)
		uvm_unmap_detach(dead_entries, 0);
	return (0);
}

/*
 * sys_mprotect: the mprotect system call
 */

int
sys_mprotect(struct lwp *l, const struct sys_mprotect_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_prot_t prot;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot) & VM_PROT_ALL;

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	error = uvm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot,
				false);
	return error;
}

/*
 * sys_minherit: the minherit system call
 */

int
sys_minherit(struct lwp *l, const struct sys_minherit_args *uap,
   register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(int) len;
		syscallarg(int) inherit;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_inherit_t inherit;
	int error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	inherit = SCARG(uap, inherit);

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	error = uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size,
				inherit);
	return error;
}

/*
 * sys_madvise: give advice about memory usage.
 */
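
/*
 * Illustrative summary (not from the original sources) of how the
 * advice values accepted below are carried out, e.g. for a
 * hypothetical madvise(addr, len, MADV_SEQUENTIAL) call:
 *
 *	MADV_NORMAL/RANDOM/SEQUENTIAL -> uvm_map_advice()
 *	MADV_WILLNEED                 -> uvm_map_willneed()
 *	MADV_DONTNEED                 -> uvm_map_clean(..., PGO_DEACTIVATE)
 *	MADV_FREE                     -> uvm_map_clean(..., PGO_FREE)
 *	MADV_SPACEAVAIL               -> EINVAL (not implemented)
 */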

/* ARGSUSED */
int
sys_madvise(struct lwp *l, const struct sys_madvise_args *uap,
   register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) behav;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int advice, error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	advice = SCARG(uap, behav);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	switch (advice) {
	case MADV_NORMAL:
	case MADV_RANDOM:
	case MADV_SEQUENTIAL:
		error = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size,
		    advice);
		break;

	case MADV_WILLNEED:

		/*
		 * Activate all these pages, pre-faulting them in if
		 * necessary.
		 */
		error = uvm_map_willneed(&p->p_vmspace->vm_map,
		    addr, addr + size);
		break;

	case MADV_DONTNEED:

		/*
		 * Deactivate all these pages.  We don't need them
		 * any more.  We don't, however, toss the data in
		 * the pages.
		 */

		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_DEACTIVATE);
		break;

	case MADV_FREE:

		/*
		 * These pages contain no valid data, and may be
		 * garbage-collected.  Toss all resources, including
		 * any swap space in use.
		 */

		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_FREE);
		break;

	case MADV_SPACEAVAIL:

		/*
		 * XXXMRG What is this?  I think it's:
		 *
		 *	Ensure that we have allocated backing-store
		 *	for these pages.
		 *
		 * This is going to require changes to the page daemon,
		 * as it will free swap space allocated to pages in core.
		 * There's also what to do for device/file/anonymous memory.
		 */

		return (EINVAL);

	default:
		return (EINVAL);
	}

	return error;
}

/*
 * sys_mlock: memory lock
 */

int
sys_mlock(struct lwp *l, const struct sys_mlock_args *uap, register_t *retval)
{
	/* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
		return (EAGAIN);

	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
			p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
		return (EAGAIN);

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, false,
	    0);
	if (error == EFAULT)
		error = ENOMEM;
	return error;
}

/*
 * sys_munlock: unlock wired pages
 */

int
sys_munlock(struct lwp *l, const struct sys_munlock_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, true,
	    0);
	if (error == EFAULT)
		error = ENOMEM;
	return error;
}

/*
 * sys_mlockall: lock all pages mapped into an address space.
 */

int
sys_mlockall(struct lwp *l, const struct sys_mlockall_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) flags;
	} */
	struct proc *p = l->l_proc;
	int error, flags;

	flags = SCARG(uap, flags);

	if (flags == 0 ||
	    (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
		return (EINVAL);

	error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
	return (error);
}

/*
 * sys_munlockall: unlock all pages mapped into an address space.
 */

int
sys_munlockall(struct lwp *l, const void *v, register_t *retval)
{
	struct proc *p = l->l_proc;

	(void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
	return (0);
}

/*
 * uvm_mmap: internal version of mmap
 *
 * - used by sys_mmap and various framebuffers
 * - handle is a vnode pointer or NULL for MAP_ANON
 * - caller must page-align the file offset
 */
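
/*
 * Illustrative sketch (not from the original sources): a hypothetical
 * in-kernel caller that wants a shared, read/write mapping of a vnode
 * "vp" at a kernel-chosen address in the user map "map" could do
 *
 *	vaddr_t va = 0;
 *
 *	error = uvm_mmap(map, &va, round_page(len), VM_PROT_READ |
 *	    VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, MAP_SHARED,
 *	    vp, 0, 0);
 *
 * note the page-aligned file offset (0) and size; a locklimit of 0
 * means no per-process wiring limit is applied below.
 */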

int
uvm_mmap(struct vm_map *map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, void *handle, voff_t foff, vsize_t locklimit)
{
	struct uvm_object *uobj;
	struct vnode *vp;
	vaddr_t align = 0;
	int error;
	int advice = UVM_ADV_NORMAL;
	uvm_flag_t uvmflag = 0;
	bool needwritemap;

	/*
	 * check params
	 */

	if (size == 0)
		return(0);
	if (foff & PAGE_MASK)
		return(EINVAL);
	if ((prot & maxprot) != prot)
		return(EINVAL);

	/*
	 * for non-fixed mappings, round off the suggested address.
	 * for fixed mappings, check alignment and zap old mappings.
	 */

	if ((flags & MAP_FIXED) == 0) {
		*addr = round_page(*addr);
	} else {
		if (*addr & PAGE_MASK)
			return(EINVAL);
		uvmflag |= UVM_FLAG_FIXED;
		(void) uvm_unmap(map, *addr, *addr + size);
	}

	/*
	 * See whether the requested alignment can even be attempted.
	 * Make sure we can express the alignment (asking for a >= 4GB
	 * alignment on an ILP32 architecture makes no sense) and that
	 * the alignment is at least a page-sized quantity.  If the
	 * request was for a fixed mapping, make sure the supplied
	 * address adheres to the requested alignment.
	 */
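	/*
	 * Worked example (illustrative only): an alignment field of 16
	 * in the flags, e.g. from MAP_ALIGNED(16), yields
	 * align = 1L << 16 = 64KB, which passes the checks below on a
	 * port with 4KB pages; a field of 64 or more is rejected with
	 * EINVAL on LP64 (32 or more on ILP32), since such an alignment
	 * cannot be expressed in a vaddr_t.
	 */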
	align = (flags & MAP_ALIGNMENT_MASK) >> MAP_ALIGNMENT_SHIFT;
	if (align) {
		if (align >= sizeof(vaddr_t) * NBBY)
			return(EINVAL);
		align = 1L << align;
		if (align < PAGE_SIZE)
			return(EINVAL);
		if (align >= vm_map_max(map))
			return(ENOMEM);
		if (flags & MAP_FIXED) {
			if ((*addr & (align-1)) != 0)
				return(EINVAL);
			align = 0;
		}
	}

	/*
	 * check resource limits
	 */

	if (!VM_MAP_IS_KERNEL(map) &&
	    (((rlim_t)curproc->p_vmspace->vm_map.size + (rlim_t)size) >
	    curproc->p_rlimit[RLIMIT_AS].rlim_cur))
		return ENOMEM;

	/*
	 * handle anon vs. non-anon mappings.   for non-anon mappings attach
	 * to underlying vm object.
	 */

	if (flags & MAP_ANON) {
		KASSERT(handle == NULL);
		foff = UVM_UNKNOWN_OFFSET;
		uobj = NULL;
		if ((flags & MAP_SHARED) == 0)
			/* XXX: defer amap create */
			uvmflag |= UVM_FLAG_COPYONW;
		else
			/* shared: create amap now */
			uvmflag |= UVM_FLAG_OVERLAY;

	} else {
		KASSERT(handle != NULL);
		vp = (struct vnode *)handle;

		/*
		 * Don't allow mmap for EXEC if the file system
		 * is mounted NOEXEC.
		 */
		if ((prot & PROT_EXEC) != 0 &&
		    (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0)
			return (EACCES);

		if (vp->v_type != VCHR) {
			error = VOP_MMAP(vp, prot, curlwp->l_cred);
			if (error) {
				return error;
			}
			vref(vp);
			uobj = &vp->v_uobj;

			/*
			 * If the vnode is being mapped with PROT_EXEC,
			 * then mark it as text.
			 */
			if (prot & PROT_EXEC) {
				vn_markexec(vp);
			}
		} else {
			int i = maxprot;

			/*
			 * XXX Some devices don't like to be mapped with
			 * XXX PROT_EXEC or PROT_WRITE, but we don't really
			 * XXX have a better way of handling this, right now
			 */
			do {
				uobj = udv_attach((void *) &vp->v_rdev,
				    (flags & MAP_SHARED) ? i :
				    (i & ~VM_PROT_WRITE), foff, size);
				i--;
			} while ((uobj == NULL) && (i > 0));
			if (uobj == NULL)
				return EINVAL;
			advice = UVM_ADV_RANDOM;
		}
		if ((flags & MAP_SHARED) == 0) {
			uvmflag |= UVM_FLAG_COPYONW;
		}

		/*
		 * Set vnode flags to indicate the new kinds of mapping.
		 * We take the vnode lock in exclusive mode here to serialize
		 * with direct I/O.
		 *
		 * Safe to check for these flag values without a lock, as
		 * long as a reference to the vnode is held.
		 */
		needwritemap = (vp->v_iflag & VI_WRMAP) == 0 &&
			(flags & MAP_SHARED) != 0 &&
			(maxprot & VM_PROT_WRITE) != 0;
		if ((vp->v_vflag & VV_MAPPED) == 0 || needwritemap) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			vp->v_vflag |= VV_MAPPED;
			if (needwritemap) {
				mutex_enter(vp->v_interlock);
				vp->v_iflag |= VI_WRMAP;
				mutex_exit(vp->v_interlock);
			}
			VOP_UNLOCK(vp);
		}
	}

	uvmflag = UVM_MAPFLAG(prot, maxprot,
			(flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY,
			advice, uvmflag);
	error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);
	if (error) {
		if (uobj)
			uobj->pgops->pgo_detach(uobj);
		return error;
	}

	/*
	 * POSIX 1003.1b -- if our address space was configured
	 * to lock all future mappings, wire the one we just made.
	 *
	 * Also handle the MAP_WIRED flag here.
	 */

	if (prot == VM_PROT_NONE) {

		/*
		 * No more work to do in this case.
		 */

		return (0);
	}
	if ((flags & MAP_WIRED) != 0 || (map->flags & VM_MAP_WIREFUTURE) != 0) {
		vm_map_lock(map);
		if (atop(size) + uvmexp.wired > uvmexp.wiredmax ||
		    (locklimit != 0 &&
		     size + ptoa(pmap_wired_count(vm_map_pmap(map))) >
		     locklimit)) {
			vm_map_unlock(map);
			uvm_unmap(map, *addr, *addr + size);
			return ENOMEM;
		}

		/*
		 * uvm_map_pageable() always returns the map unlocked.
		 */

		error = uvm_map_pageable(map, *addr, *addr + size,
					 false, UVM_LK_ENTER);
		if (error) {
			uvm_unmap(map, *addr, *addr + size);
			return error;
		}
		return (0);
	}
	return 0;
}

vaddr_t
uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz)
{

	return VM_DEFAULT_ADDRESS(base, sz);
}