1/*-
2 * Copyright (c) 1988 University of Utah.
3 * Copyright (c) 1991, 1993
4 *	The Regents of the University of California.  All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * the Systems Programming Group of the University of Utah Computer
8 * Science Department.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
35 *
36 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
37 */
38
39/*
40 * Mapped file (mmap) interface to VM
41 */
42
43#include <sys/cdefs.h>
44__FBSDID("$FreeBSD: releng/11.0/sys/vm/vm_mmap.c 285180 2015-07-05 22:26:19Z markj $");
45
46#include "opt_compat.h"
47#include "opt_hwpmc_hooks.h"
48#include "opt_vm.h"
49
50#include <sys/param.h>
51#include <sys/systm.h>
52#include <sys/capsicum.h>
53#include <sys/kernel.h>
54#include <sys/lock.h>
55#include <sys/mutex.h>
56#include <sys/sysproto.h>
57#include <sys/filedesc.h>
58#include <sys/priv.h>
59#include <sys/proc.h>
60#include <sys/procctl.h>
61#include <sys/racct.h>
62#include <sys/resource.h>
63#include <sys/resourcevar.h>
64#include <sys/rwlock.h>
65#include <sys/sysctl.h>
66#include <sys/vnode.h>
67#include <sys/fcntl.h>
68#include <sys/file.h>
69#include <sys/mman.h>
70#include <sys/mount.h>
71#include <sys/conf.h>
72#include <sys/stat.h>
73#include <sys/syscallsubr.h>
74#include <sys/sysent.h>
75#include <sys/vmmeter.h>
76
77#include <security/mac/mac_framework.h>
78
79#include <vm/vm.h>
80#include <vm/vm_param.h>
81#include <vm/pmap.h>
82#include <vm/vm_map.h>
83#include <vm/vm_object.h>
84#include <vm/vm_page.h>
85#include <vm/vm_pager.h>
86#include <vm/vm_pageout.h>
87#include <vm/vm_extern.h>
89#include <vm/vnode_pager.h>
90
91#ifdef HWPMC_HOOKS
92#include <sys/pmckern.h>
93#endif
94
95int old_mlock = 0;
96SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
97    "Do not apply RLIMIT_MEMLOCK on mlockall");
98
99#ifdef MAP_32BIT
100#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
101#endif
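
/*
 * MAP_32BIT constrains a mapping to the low 2GB of the address space;
 * MAP_32BIT_MAX_ADDR is the upper bound enforced on MAP_FIXED requests
 * and used to cap the placement hint in sys_mmap() and vm_mmap_object()
 * below.
 */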
102
103#ifndef _SYS_SYSPROTO_H_
104struct sbrk_args {
105	int incr;
106};
107#endif
108
109/*
110 * MPSAFE
111 */
112/* ARGSUSED */
113int
114sys_sbrk(td, uap)
115	struct thread *td;
116	struct sbrk_args *uap;
117{
118	/* Not yet implemented */
119	return (EOPNOTSUPP);
120}
121
122#ifndef _SYS_SYSPROTO_H_
123struct sstk_args {
124	int incr;
125};
126#endif
127
128/*
129 * MPSAFE
130 */
131/* ARGSUSED */
132int
133sys_sstk(td, uap)
134	struct thread *td;
135	struct sstk_args *uap;
136{
137	/* Not yet implemented */
138	return (EOPNOTSUPP);
139}
140
141#if defined(COMPAT_43)
142#ifndef _SYS_SYSPROTO_H_
143struct getpagesize_args {
144	int dummy;
145};
146#endif
147
148int
149ogetpagesize(td, uap)
150	struct thread *td;
151	struct getpagesize_args *uap;
152{
153	/* MP SAFE */
154	td->td_retval[0] = PAGE_SIZE;
155	return (0);
156}
157#endif				/* COMPAT_43 */
158
159
160/*
161 * Memory Map (mmap) system call.  Note that the file offset
162 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
164 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
165 * page-aligned, the actual mapping starts at trunc_page(addr)
166 * and the return value is adjusted up by the page offset.
167 *
168 * Generally speaking, only character devices which are themselves
169 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
170 * there would be no cache coherency between a descriptor and a VM mapping
171 * both to the same character device.
172 */
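
/*
 * A worked example, assuming a 4KB page size: a request such as
 *
 *	mmap(NULL, 100, PROT_READ, MAP_PRIVATE, fd, 0x11234)
 *
 * is handled as if the offset were 0x11000; pageoff is 0x234, the
 * length is rounded up to a single page, and the address returned to
 * the caller is the start of the new mapping plus 0x234, so that it
 * still refers to file offset 0x11234.
 */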
173#ifndef _SYS_SYSPROTO_H_
174struct mmap_args {
175	void *addr;
176	size_t len;
177	int prot;
178	int flags;
179	int fd;
180	long pad;
181	off_t pos;
182};
183#endif
184
185/*
186 * MPSAFE
187 */
188int
189sys_mmap(td, uap)
190	struct thread *td;
191	struct mmap_args *uap;
192{
193	struct file *fp;
194	vm_offset_t addr;
195	vm_size_t size, pageoff;
196	vm_prot_t cap_maxprot;
197	int align, error, flags, prot;
198	off_t pos;
199	struct vmspace *vms = td->td_proc->p_vmspace;
200	cap_rights_t rights;
201
202	addr = (vm_offset_t) uap->addr;
203	size = uap->len;
204	prot = uap->prot;
205	flags = uap->flags;
206	pos = uap->pos;
207
208	fp = NULL;
209
210	/*
211	 * Ignore old flags that used to be defined but did not do anything.
212	 */
213	flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);
214
215	/*
216	 * Enforce the constraints.
217	 * Mapping of length 0 is only allowed for old binaries.
218	 * Anonymous mapping shall specify -1 as filedescriptor and
219	 * zero position for new code. Be nice to ancient a.out
220	 * binaries and correct pos for anonymous mapping, since old
221	 * ld.so sometimes issues anonymous map requests with non-zero
222	 * pos.
223	 */
224	if (!SV_CURPROC_FLAG(SV_AOUT)) {
225		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
226		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
227			return (EINVAL);
228	} else {
229		if ((flags & MAP_ANON) != 0)
230			pos = 0;
231	}
232
233	if (flags & MAP_STACK) {
234		if ((uap->fd != -1) ||
235		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
236			return (EINVAL);
237		flags |= MAP_ANON;
238		pos = 0;
239	}
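	/*
	 * Any flag outside of the supported set below is rejected
	 * outright rather than silently ignored, as are contradictory
	 * combinations such as MAP_EXCL without MAP_FIXED or
	 * MAP_SHARED together with MAP_PRIVATE.
	 */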
240	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
241	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
242	    MAP_PREFAULT_READ |
243#ifdef MAP_32BIT
244	    MAP_32BIT |
245#endif
246	    MAP_ALIGNMENT_MASK)) != 0)
247		return (EINVAL);
248	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
249		return (EINVAL);
250	if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
251		return (EINVAL);
252	if (prot != PROT_NONE &&
253	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
254		return (EINVAL);
255
256	/*
257	 * Align the file position to a page boundary,
258	 * and save its page offset component.
259	 */
260	pageoff = (pos & PAGE_MASK);
261	pos -= pageoff;
262
263	/* Adjust size for rounding (on both ends). */
264	size += pageoff;			/* low end... */
265	size = (vm_size_t) round_page(size);	/* hi end */
266
267	/* Ensure alignment is at least a page and fits in a pointer. */
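	/*
	 * The MAP_ALIGNED() argument is the base-2 logarithm of the
	 * requested alignment, so e.g. MAP_ALIGNED(21) asks for a 2MB
	 * boundary.  Requests below PAGE_SHIFT or too wide to express
	 * in a pointer are rejected; MAP_ALIGNED_SUPER leaves the
	 * superpage-friendly alignment up to vm_map_find().
	 */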
268	align = flags & MAP_ALIGNMENT_MASK;
269	if (align != 0 && align != MAP_ALIGNED_SUPER &&
270	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
271	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
272		return (EINVAL);
273
274	/*
275	 * Check for illegal addresses.  Watch out for address wrap... Note
276	 * that VM_*_ADDRESS are not constants due to casts (argh).
277	 */
278	if (flags & MAP_FIXED) {
279		/*
280		 * The specified address must have the same remainder
281		 * as the file offset taken modulo PAGE_SIZE, so it
282		 * should be aligned after adjustment by pageoff.
283		 */
284		addr -= pageoff;
285		if (addr & PAGE_MASK)
286			return (EINVAL);
287
288		/* Address range must be all in user VM space. */
289		if (addr < vm_map_min(&vms->vm_map) ||
290		    addr + size > vm_map_max(&vms->vm_map))
291			return (EINVAL);
292		if (addr + size < addr)
293			return (EINVAL);
294#ifdef MAP_32BIT
295		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
296			return (EINVAL);
297	} else if (flags & MAP_32BIT) {
298		/*
299		 * For MAP_32BIT, override the hint if it is too high and
300		 * do not bother moving the mapping past the heap (since
301		 * the heap is usually above 2GB).
302		 */
303		if (addr + size > MAP_32BIT_MAX_ADDR)
304			addr = 0;
305#endif
306	} else {
307		/*
308		 * XXX for non-fixed mappings where no hint is provided or
309		 * the hint would fall in the potential heap space,
310		 * place it after the end of the largest possible heap.
311		 *
312		 * There should really be a pmap call to determine a reasonable
313		 * location.
314		 */
315		if (addr == 0 ||
316		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
317		    addr < round_page((vm_offset_t)vms->vm_daddr +
318		    lim_max(td, RLIMIT_DATA))))
319			addr = round_page((vm_offset_t)vms->vm_daddr +
320			    lim_max(td, RLIMIT_DATA));
321	}
322	if (size == 0) {
323		/*
324		 * Return success without mapping anything for old
325		 * binaries that request a page-aligned mapping of
326		 * length 0.  For modern binaries, this function
327		 * returns an error earlier.
328		 */
329		error = 0;
330	} else if (flags & MAP_ANON) {
331		/*
332		 * Mapping blank space is trivial.
333		 *
334		 * This relies on VM_PROT_* matching PROT_*.
335		 */
336		error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
337		    VM_PROT_ALL, flags, NULL, pos, FALSE, td);
338	} else {
339		/*
340		 * Mapping file, get fp for validation and don't let the
341		 * descriptor disappear on us if we block. Check capability
342		 * rights, but also return the maximum rights to be combined
343		 * with maxprot later.
344		 */
345		cap_rights_init(&rights, CAP_MMAP);
346		if (prot & PROT_READ)
347			cap_rights_set(&rights, CAP_MMAP_R);
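		/*
		 * CAP_MMAP_W is only required for MAP_SHARED mappings;
		 * writes through a MAP_PRIVATE mapping go to anonymous
		 * copy-on-write pages and never reach the file.
		 */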
348		if ((flags & MAP_SHARED) != 0) {
349			if (prot & PROT_WRITE)
350				cap_rights_set(&rights, CAP_MMAP_W);
351		}
352		if (prot & PROT_EXEC)
353			cap_rights_set(&rights, CAP_MMAP_X);
354		error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
355		if (error != 0)
356			goto done;
357		if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
358		    td->td_proc->p_osrel >= P_OSREL_MAP_FSTRICT) {
359			error = EINVAL;
360			goto done;
361		}
362
363		/* This relies on VM_PROT_* matching PROT_*. */
364		error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
365		    cap_maxprot, flags, pos, td);
366	}
367
368	if (error == 0)
369		td->td_retval[0] = (register_t) (addr + pageoff);
370done:
371	if (fp)
372		fdrop(fp, td);
373
374	return (error);
375}
376
377#if defined(COMPAT_FREEBSD6)
378int
379freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
380{
381	struct mmap_args oargs;
382
383	oargs.addr = uap->addr;
384	oargs.len = uap->len;
385	oargs.prot = uap->prot;
386	oargs.flags = uap->flags;
387	oargs.fd = uap->fd;
388	oargs.pos = uap->pos;
389	return (sys_mmap(td, &oargs));
390}
391#endif
392
393#ifdef COMPAT_43
394#ifndef _SYS_SYSPROTO_H_
395struct ommap_args {
396	caddr_t addr;
397	int len;
398	int prot;
399	int flags;
400	int fd;
401	long pos;
402};
403#endif
404int
405ommap(td, uap)
406	struct thread *td;
407	struct ommap_args *uap;
408{
409	struct mmap_args nargs;
410	static const char cvtbsdprot[8] = {
411		0,
412		PROT_EXEC,
413		PROT_WRITE,
414		PROT_EXEC | PROT_WRITE,
415		PROT_READ,
416		PROT_EXEC | PROT_READ,
417		PROT_WRITE | PROT_READ,
418		PROT_EXEC | PROT_WRITE | PROT_READ,
419	};
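	/*
	 * The table index is the old 4.3BSD protection value, in which
	 * 0x1 requested execute, 0x2 write and 0x4 read access, as the
	 * entries above spell out.
	 */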
420
421#define	OMAP_ANON	0x0002
422#define	OMAP_COPY	0x0020
423#define	OMAP_SHARED	0x0010
424#define	OMAP_FIXED	0x0100
425
426	nargs.addr = uap->addr;
427	nargs.len = uap->len;
428	nargs.prot = cvtbsdprot[uap->prot & 0x7];
429#ifdef COMPAT_FREEBSD32
430#if defined(__amd64__)
431	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
432	    nargs.prot != 0)
433		nargs.prot |= PROT_EXEC;
434#endif
435#endif
436	nargs.flags = 0;
437	if (uap->flags & OMAP_ANON)
438		nargs.flags |= MAP_ANON;
439	if (uap->flags & OMAP_COPY)
440		nargs.flags |= MAP_COPY;
441	if (uap->flags & OMAP_SHARED)
442		nargs.flags |= MAP_SHARED;
443	else
444		nargs.flags |= MAP_PRIVATE;
445	if (uap->flags & OMAP_FIXED)
446		nargs.flags |= MAP_FIXED;
447	nargs.fd = uap->fd;
448	nargs.pos = uap->pos;
449	return (sys_mmap(td, &nargs));
450}
451#endif				/* COMPAT_43 */
452
453
454#ifndef _SYS_SYSPROTO_H_
455struct msync_args {
456	void *addr;
457	size_t len;
458	int flags;
459};
460#endif
461/*
462 * MPSAFE
463 */
464int
465sys_msync(td, uap)
466	struct thread *td;
467	struct msync_args *uap;
468{
469	vm_offset_t addr;
470	vm_size_t size, pageoff;
471	int flags;
472	vm_map_t map;
473	int rv;
474
475	addr = (vm_offset_t) uap->addr;
476	size = uap->len;
477	flags = uap->flags;
478
479	pageoff = (addr & PAGE_MASK);
480	addr -= pageoff;
481	size += pageoff;
482	size = (vm_size_t) round_page(size);
483	if (addr + size < addr)
484		return (EINVAL);
485
486	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
487		return (EINVAL);
488
489	map = &td->td_proc->p_vmspace->vm_map;
490
491	/*
492	 * Clean the pages and interpret the return value.
493	 */
494	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
495	    (flags & MS_INVALIDATE) != 0);
496	switch (rv) {
497	case KERN_SUCCESS:
498		return (0);
499	case KERN_INVALID_ADDRESS:
500		return (ENOMEM);
501	case KERN_INVALID_ARGUMENT:
502		return (EBUSY);
503	case KERN_FAILURE:
504		return (EIO);
505	default:
506		return (EINVAL);
507	}
508}
509
510#ifndef _SYS_SYSPROTO_H_
511struct munmap_args {
512	void *addr;
513	size_t len;
514};
515#endif
516/*
517 * MPSAFE
518 */
519int
520sys_munmap(td, uap)
521	struct thread *td;
522	struct munmap_args *uap;
523{
524#ifdef HWPMC_HOOKS
525	struct pmckern_map_out pkm;
526	vm_map_entry_t entry;
527#endif
528	vm_offset_t addr;
529	vm_size_t size, pageoff;
530	vm_map_t map;
531
532	addr = (vm_offset_t) uap->addr;
533	size = uap->len;
534	if (size == 0)
535		return (EINVAL);
536
537	pageoff = (addr & PAGE_MASK);
538	addr -= pageoff;
539	size += pageoff;
540	size = (vm_size_t) round_page(size);
541	if (addr + size < addr)
542		return (EINVAL);
543
544	/*
545	 * Check for illegal addresses.  Watch out for address wrap...
546	 */
547	map = &td->td_proc->p_vmspace->vm_map;
548	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
549		return (EINVAL);
550	vm_map_lock(map);
551#ifdef HWPMC_HOOKS
552	/*
553	 * Inform hwpmc if the address range being unmapped contains
554	 * an executable region.
555	 */
556	pkm.pm_address = (uintptr_t) NULL;
557	if (vm_map_lookup_entry(map, addr, &entry)) {
558		for (;
559		     entry != &map->header && entry->start < addr + size;
560		     entry = entry->next) {
561			if (vm_map_check_protection(map, entry->start,
562				entry->end, VM_PROT_EXECUTE) == TRUE) {
563				pkm.pm_address = (uintptr_t) addr;
564				pkm.pm_size = (size_t) size;
565				break;
566			}
567		}
568	}
569#endif
570	vm_map_delete(map, addr, addr + size);
571
572#ifdef HWPMC_HOOKS
573	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
574	vm_map_lock_downgrade(map);
575	if (pkm.pm_address != (uintptr_t) NULL)
576		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
577	vm_map_unlock_read(map);
578#else
579	vm_map_unlock(map);
580#endif
581	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
582	return (0);
583}
584
585#ifndef _SYS_SYSPROTO_H_
586struct mprotect_args {
587	const void *addr;
588	size_t len;
589	int prot;
590};
591#endif
592/*
593 * MPSAFE
594 */
595int
596sys_mprotect(td, uap)
597	struct thread *td;
598	struct mprotect_args *uap;
599{
600	vm_offset_t addr;
601	vm_size_t size, pageoff;
602	vm_prot_t prot;
603
604	addr = (vm_offset_t) uap->addr;
605	size = uap->len;
606	prot = uap->prot & VM_PROT_ALL;
607
608	pageoff = (addr & PAGE_MASK);
609	addr -= pageoff;
610	size += pageoff;
611	size = (vm_size_t) round_page(size);
612	if (addr + size < addr)
613		return (EINVAL);
614
615	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
616	    addr + size, prot, FALSE)) {
617	case KERN_SUCCESS:
618		return (0);
619	case KERN_PROTECTION_FAILURE:
620		return (EACCES);
621	case KERN_RESOURCE_SHORTAGE:
622		return (ENOMEM);
623	}
624	return (EINVAL);
625}
626
627#ifndef _SYS_SYSPROTO_H_
628struct minherit_args {
629	void *addr;
630	size_t len;
631	int inherit;
632};
633#endif
634/*
635 * MPSAFE
636 */
637int
638sys_minherit(td, uap)
639	struct thread *td;
640	struct minherit_args *uap;
641{
642	vm_offset_t addr;
643	vm_size_t size, pageoff;
644	vm_inherit_t inherit;
645
646	addr = (vm_offset_t)uap->addr;
647	size = uap->len;
648	inherit = uap->inherit;
649
650	pageoff = (addr & PAGE_MASK);
651	addr -= pageoff;
652	size += pageoff;
653	size = (vm_size_t) round_page(size);
654	if (addr + size < addr)
655		return (EINVAL);
656
657	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
658	    addr + size, inherit)) {
659	case KERN_SUCCESS:
660		return (0);
661	case KERN_PROTECTION_FAILURE:
662		return (EACCES);
663	}
664	return (EINVAL);
665}
666
667#ifndef _SYS_SYSPROTO_H_
668struct madvise_args {
669	void *addr;
670	size_t len;
671	int behav;
672};
673#endif
674
675/*
676 * MPSAFE
677 */
678int
679sys_madvise(td, uap)
680	struct thread *td;
681	struct madvise_args *uap;
682{
683	vm_offset_t start, end;
684	vm_map_t map;
685	int flags;
686
687	/*
688	 * Check for our special case, advising the swap pager we are
689	 * "immortal."
690	 */
691	if (uap->behav == MADV_PROTECT) {
692		flags = PPROT_SET;
693		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
694		    PROC_SPROTECT, &flags));
695	}
696
697	/*
698	 * Check for illegal behavior
699	 */
700	if (uap->behav < 0 || uap->behav > MADV_CORE)
701		return (EINVAL);
702	/*
703	 * Check for illegal addresses.  Watch out for address wrap... Note
704	 * that VM_*_ADDRESS are not constants due to casts (argh).
705	 */
706	map = &td->td_proc->p_vmspace->vm_map;
707	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
708	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
709		return (EINVAL);
710	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
711		return (EINVAL);
712
713	/*
714	 * Since this routine is only advisory, we default to conservative
715	 * behavior.
716	 */
717	start = trunc_page((vm_offset_t) uap->addr);
718	end = round_page((vm_offset_t) uap->addr + uap->len);
719
720	if (vm_map_madvise(map, start, end, uap->behav))
721		return (EINVAL);
722	return (0);
723}
724
725#ifndef _SYS_SYSPROTO_H_
726struct mincore_args {
727	const void *addr;
728	size_t len;
729	char *vec;
730};
731#endif
732
733/*
734 * MPSAFE
735 */
736int
737sys_mincore(td, uap)
738	struct thread *td;
739	struct mincore_args *uap;
740{
741	vm_offset_t addr, first_addr;
742	vm_offset_t end, cend;
743	pmap_t pmap;
744	vm_map_t map;
745	char *vec;
746	int error = 0;
747	int vecindex, lastvecindex;
748	vm_map_entry_t current;
749	vm_map_entry_t entry;
750	vm_object_t object;
751	vm_paddr_t locked_pa;
752	vm_page_t m;
753	vm_pindex_t pindex;
754	int mincoreinfo;
755	unsigned int timestamp;
756	boolean_t locked;
757
758	/*
759	 * Make sure that the addresses presented are valid for user
760	 * mode.
761	 */
762	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
763	end = addr + (vm_size_t)round_page(uap->len);
764	map = &td->td_proc->p_vmspace->vm_map;
765	if (end > vm_map_max(map) || end < addr)
766		return (ENOMEM);
767
768	/*
769	 * Address of byte vector
770	 */
771	vec = uap->vec;
772
773	pmap = vmspace_pmap(td->td_proc->p_vmspace);
774
775	vm_map_lock_read(map);
776RestartScan:
777	timestamp = map->timestamp;
778
779	if (!vm_map_lookup_entry(map, addr, &entry)) {
780		vm_map_unlock_read(map);
781		return (ENOMEM);
782	}
783
784	/*
785	 * Do this on a map entry basis so that if the pages are not
 * in the current process's address space, we can easily look
787	 * up the pages elsewhere.
788	 */
789	lastvecindex = -1;
790	for (current = entry;
791	    (current != &map->header) && (current->start < end);
792	    current = current->next) {
793
794		/*
795		 * check for contiguity
796		 */
797		if (current->end < end &&
798		    (entry->next == &map->header ||
799		     current->next->start > current->end)) {
800			vm_map_unlock_read(map);
801			return (ENOMEM);
802		}
803
804		/*
805		 * ignore submaps (for now) or null objects
806		 */
807		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
808			current->object.vm_object == NULL)
809			continue;
810
811		/*
812		 * limit this scan to the current map entry and the
813		 * limits for the mincore call
814		 */
815		if (addr < current->start)
816			addr = current->start;
817		cend = current->end;
818		if (cend > end)
819			cend = end;
820
821		/*
822		 * scan this entry one page at a time
823		 */
824		while (addr < cend) {
825			/*
826			 * Check pmap first, it is likely faster, also
827			 * it can provide info as to whether we are the
828			 * one referencing or modifying the page.
829			 */
830			object = NULL;
831			locked_pa = 0;
832		retry:
833			m = NULL;
834			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
835			if (locked_pa != 0) {
836				/*
837				 * The page is mapped by this process but not
838				 * both accessed and modified.  It is also
839				 * managed.  Acquire the object lock so that
840				 * other mappings might be examined.
841				 */
842				m = PHYS_TO_VM_PAGE(locked_pa);
843				if (m->object != object) {
844					if (object != NULL)
845						VM_OBJECT_WUNLOCK(object);
846					object = m->object;
847					locked = VM_OBJECT_TRYWLOCK(object);
848					vm_page_unlock(m);
849					if (!locked) {
850						VM_OBJECT_WLOCK(object);
851						vm_page_lock(m);
852						goto retry;
853					}
854				} else
855					vm_page_unlock(m);
856				KASSERT(m->valid == VM_PAGE_BITS_ALL,
857				    ("mincore: page %p is mapped but invalid",
858				    m));
859			} else if (mincoreinfo == 0) {
860				/*
861				 * The page is not mapped by this process.  If
862				 * the object implements managed pages, then
863				 * determine if the page is resident so that
864				 * the mappings might be examined.
865				 */
866				if (current->object.vm_object != object) {
867					if (object != NULL)
868						VM_OBJECT_WUNLOCK(object);
869					object = current->object.vm_object;
870					VM_OBJECT_WLOCK(object);
871				}
872				if (object->type == OBJT_DEFAULT ||
873				    object->type == OBJT_SWAP ||
874				    object->type == OBJT_VNODE) {
875					pindex = OFF_TO_IDX(current->offset +
876					    (addr - current->start));
877					m = vm_page_lookup(object, pindex);
878					if (m == NULL &&
879					    vm_page_is_cached(object, pindex))
880						mincoreinfo = MINCORE_INCORE;
881					if (m != NULL && m->valid == 0)
882						m = NULL;
883					if (m != NULL)
884						mincoreinfo = MINCORE_INCORE;
885				}
886			}
887			if (m != NULL) {
888				/* Examine other mappings to the page. */
889				if (m->dirty == 0 && pmap_is_modified(m))
890					vm_page_dirty(m);
891				if (m->dirty != 0)
892					mincoreinfo |= MINCORE_MODIFIED_OTHER;
893				/*
894				 * The first test for PGA_REFERENCED is an
895				 * optimization.  The second test is
896				 * required because a concurrent pmap
897				 * operation could clear the last reference
898				 * and set PGA_REFERENCED before the call to
899				 * pmap_is_referenced().
900				 */
901				if ((m->aflags & PGA_REFERENCED) != 0 ||
902				    pmap_is_referenced(m) ||
903				    (m->aflags & PGA_REFERENCED) != 0)
904					mincoreinfo |= MINCORE_REFERENCED_OTHER;
905			}
906			if (object != NULL)
907				VM_OBJECT_WUNLOCK(object);
908
909			/*
910			 * subyte may page fault.  In case it needs to modify
911			 * the map, we release the lock.
912			 */
913			vm_map_unlock_read(map);
914
915			/*
916			 * calculate index into user supplied byte vector
917			 */
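			/*
			 * OFF_TO_IDX() is a right shift by PAGE_SHIFT,
			 * so the vector receives one status byte per
			 * page of the original request.
			 */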
918			vecindex = OFF_TO_IDX(addr - first_addr);
919
920			/*
921			 * If we have skipped map entries, we need to make sure that
922			 * the byte vector is zeroed for those skipped entries.
923			 */
924			while ((lastvecindex + 1) < vecindex) {
925				++lastvecindex;
926				error = subyte(vec + lastvecindex, 0);
927				if (error) {
928					error = EFAULT;
929					goto done2;
930				}
931			}
932
933			/*
934			 * Pass the page information to the user
935			 */
936			error = subyte(vec + vecindex, mincoreinfo);
937			if (error) {
938				error = EFAULT;
939				goto done2;
940			}
941
			/*
			 * If the map has changed due to the subyte, the
			 * previous output may be invalid.
			 */
946			vm_map_lock_read(map);
947			if (timestamp != map->timestamp)
948				goto RestartScan;
949
950			lastvecindex = vecindex;
951			addr += PAGE_SIZE;
952		}
953	}
954
955	/*
956	 * subyte may page fault.  In case it needs to modify
957	 * the map, we release the lock.
958	 */
959	vm_map_unlock_read(map);
960
961	/*
962	 * Zero the last entries in the byte vector.
963	 */
964	vecindex = OFF_TO_IDX(end - first_addr);
965	while ((lastvecindex + 1) < vecindex) {
966		++lastvecindex;
967		error = subyte(vec + lastvecindex, 0);
968		if (error) {
969			error = EFAULT;
970			goto done2;
971		}
972	}
973
	/*
	 * If the map has changed due to the subyte, the previous
	 * output may be invalid.
	 */
978	vm_map_lock_read(map);
979	if (timestamp != map->timestamp)
980		goto RestartScan;
981	vm_map_unlock_read(map);
982done2:
983	return (error);
984}
985
986#ifndef _SYS_SYSPROTO_H_
987struct mlock_args {
988	const void *addr;
989	size_t len;
990};
991#endif
992/*
993 * MPSAFE
994 */
995int
996sys_mlock(td, uap)
997	struct thread *td;
998	struct mlock_args *uap;
999{
1000
1001	return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
1002}
1003
1004int
1005vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
1006{
1007	vm_offset_t addr, end, last, start;
1008	vm_size_t npages, size;
1009	vm_map_t map;
1010	unsigned long nsize;
1011	int error;
1012
1013	error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
1014	if (error)
1015		return (error);
1016	addr = (vm_offset_t)addr0;
1017	size = len;
1018	last = addr + size;
1019	start = trunc_page(addr);
1020	end = round_page(last);
1021	if (last < addr || end < addr)
1022		return (EINVAL);
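	/*
	 * The request is checked against both the per-process
	 * RLIMIT_MEMLOCK limit and the global cap on wired pages,
	 * vm_page_max_wired, before any page is actually wired.
	 */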
1023	npages = atop(end - start);
1024	if (npages > vm_page_max_wired)
1025		return (ENOMEM);
1026	map = &proc->p_vmspace->vm_map;
1027	PROC_LOCK(proc);
1028	nsize = ptoa(npages + pmap_wired_count(map->pmap));
1029	if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
1030		PROC_UNLOCK(proc);
1031		return (ENOMEM);
1032	}
1033	PROC_UNLOCK(proc);
1034	if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
1035		return (EAGAIN);
1036#ifdef RACCT
1037	if (racct_enable) {
1038		PROC_LOCK(proc);
1039		error = racct_set(proc, RACCT_MEMLOCK, nsize);
1040		PROC_UNLOCK(proc);
1041		if (error != 0)
1042			return (ENOMEM);
1043	}
1044#endif
1045	error = vm_map_wire(map, start, end,
1046	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
1047#ifdef RACCT
1048	if (racct_enable && error != KERN_SUCCESS) {
1049		PROC_LOCK(proc);
1050		racct_set(proc, RACCT_MEMLOCK,
1051		    ptoa(pmap_wired_count(map->pmap)));
1052		PROC_UNLOCK(proc);
1053	}
1054#endif
1055	return (error == KERN_SUCCESS ? 0 : ENOMEM);
1056}
1057
1058#ifndef _SYS_SYSPROTO_H_
1059struct mlockall_args {
1060	int	how;
1061};
1062#endif
1063
1064/*
1065 * MPSAFE
1066 */
1067int
1068sys_mlockall(td, uap)
1069	struct thread *td;
1070	struct mlockall_args *uap;
1071{
1072	vm_map_t map;
1073	int error;
1074
1075	map = &td->td_proc->p_vmspace->vm_map;
1076	error = priv_check(td, PRIV_VM_MLOCK);
1077	if (error)
1078		return (error);
1079
1080	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
1081		return (EINVAL);
1082
1083	/*
1084	 * If wiring all pages in the process would cause it to exceed
1085	 * a hard resource limit, return ENOMEM.
1086	 */
1087	if (!old_mlock && uap->how & MCL_CURRENT) {
1088		PROC_LOCK(td->td_proc);
1089		if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) {
1090			PROC_UNLOCK(td->td_proc);
1091			return (ENOMEM);
1092		}
1093		PROC_UNLOCK(td->td_proc);
1094	}
1095#ifdef RACCT
1096	if (racct_enable) {
1097		PROC_LOCK(td->td_proc);
1098		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
1099		PROC_UNLOCK(td->td_proc);
1100		if (error != 0)
1101			return (ENOMEM);
1102	}
1103#endif
1104
1105	if (uap->how & MCL_FUTURE) {
1106		vm_map_lock(map);
1107		vm_map_modflags(map, MAP_WIREFUTURE, 0);
1108		vm_map_unlock(map);
1109		error = 0;
1110	}
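	/*
	 * Once MAP_WIREFUTURE is set, vm_mmap_object() wires each new
	 * mapping as soon as it is entered into the map, so MCL_FUTURE
	 * requires no further work here.
	 */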
1111
1112	if (uap->how & MCL_CURRENT) {
1113		/*
1114		 * P1003.1-2001 mandates that all currently mapped pages
1115		 * will be memory resident and locked (wired) upon return
1116		 * from mlockall(). vm_map_wire() will wire pages, by
1117		 * calling vm_fault_wire() for each page in the region.
1118		 */
1119		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
1120		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1121		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
1122	}
1123#ifdef RACCT
1124	if (racct_enable && error != KERN_SUCCESS) {
1125		PROC_LOCK(td->td_proc);
1126		racct_set(td->td_proc, RACCT_MEMLOCK,
1127		    ptoa(pmap_wired_count(map->pmap)));
1128		PROC_UNLOCK(td->td_proc);
1129	}
1130#endif
1131
1132	return (error);
1133}
1134
1135#ifndef _SYS_SYSPROTO_H_
1136struct munlockall_args {
1137	register_t dummy;
1138};
1139#endif
1140
1141/*
1142 * MPSAFE
1143 */
1144int
1145sys_munlockall(td, uap)
1146	struct thread *td;
1147	struct munlockall_args *uap;
1148{
1149	vm_map_t map;
1150	int error;
1151
1152	map = &td->td_proc->p_vmspace->vm_map;
1153	error = priv_check(td, PRIV_VM_MUNLOCK);
1154	if (error)
1155		return (error);
1156
1157	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
1158	vm_map_lock(map);
1159	vm_map_modflags(map, 0, MAP_WIREFUTURE);
1160	vm_map_unlock(map);
1161
1162	/* Forcibly unwire all pages. */
1163	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
1164	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1165#ifdef RACCT
1166	if (racct_enable && error == KERN_SUCCESS) {
1167		PROC_LOCK(td->td_proc);
1168		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
1169		PROC_UNLOCK(td->td_proc);
1170	}
1171#endif
1172
1173	return (error);
1174}
1175
1176#ifndef _SYS_SYSPROTO_H_
1177struct munlock_args {
1178	const void *addr;
1179	size_t len;
1180};
1181#endif
1182/*
1183 * MPSAFE
1184 */
1185int
1186sys_munlock(td, uap)
1187	struct thread *td;
1188	struct munlock_args *uap;
1189{
1190	vm_offset_t addr, end, last, start;
1191	vm_size_t size;
1192#ifdef RACCT
1193	vm_map_t map;
1194#endif
1195	int error;
1196
1197	error = priv_check(td, PRIV_VM_MUNLOCK);
1198	if (error)
1199		return (error);
1200	addr = (vm_offset_t)uap->addr;
1201	size = uap->len;
1202	last = addr + size;
1203	start = trunc_page(addr);
1204	end = round_page(last);
1205	if (last < addr || end < addr)
1206		return (EINVAL);
1207	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
1208	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
1209#ifdef RACCT
1210	if (racct_enable && error == KERN_SUCCESS) {
1211		PROC_LOCK(td->td_proc);
1212		map = &td->td_proc->p_vmspace->vm_map;
1213		racct_set(td->td_proc, RACCT_MEMLOCK,
1214		    ptoa(pmap_wired_count(map->pmap)));
1215		PROC_UNLOCK(td->td_proc);
1216	}
1217#endif
1218	return (error == KERN_SUCCESS ? 0 : ENOMEM);
1219}
1220
1221/*
1222 * vm_mmap_vnode()
1223 *
1224 * Helper function for vm_mmap.  Perform sanity check specific for mmap
1225 * operations on vnodes.
1226 */
1227int
1228vm_mmap_vnode(struct thread *td, vm_size_t objsize,
1229    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
1230    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
1231    boolean_t *writecounted)
1232{
1233	struct vattr va;
1234	vm_object_t obj;
1235	vm_offset_t foff;
1236	struct ucred *cred;
1237	int error, flags, locktype;
1238
1239	cred = td->td_ucred;
1240	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
1241		locktype = LK_EXCLUSIVE;
1242	else
1243		locktype = LK_SHARED;
1244	if ((error = vget(vp, locktype, td)) != 0)
1245		return (error);
1246	foff = *foffp;
1247	flags = *flagsp;
1248	obj = vp->v_object;
1249	if (vp->v_type == VREG) {
1250		/*
1251		 * Get the proper underlying object
1252		 */
1253		if (obj == NULL) {
1254			error = EINVAL;
1255			goto done;
1256		}
1257		if (obj->type == OBJT_VNODE && obj->handle != vp) {
1258			vput(vp);
1259			vp = (struct vnode *)obj->handle;
1260			/*
1261			 * Bypass filesystems obey the mpsafety of the
1262			 * underlying fs.  Tmpfs never bypasses.
1263			 */
1264			error = vget(vp, locktype, td);
1265			if (error != 0)
1266				return (error);
1267		}
1268		if (locktype == LK_EXCLUSIVE) {
1269			*writecounted = TRUE;
1270			vnode_pager_update_writecount(obj, 0, objsize);
1271		}
1272	} else {
1273		error = EINVAL;
1274		goto done;
1275	}
1276	if ((error = VOP_GETATTR(vp, &va, cred)))
1277		goto done;
1278#ifdef MAC
1279	/* This relies on VM_PROT_* matching PROT_*. */
1280	error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
1281	if (error != 0)
1282		goto done;
1283#endif
1284	if ((flags & MAP_SHARED) != 0) {
1285		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
1286			if (prot & VM_PROT_WRITE) {
1287				error = EPERM;
1288				goto done;
1289			}
1290			*maxprotp &= ~VM_PROT_WRITE;
1291		}
1292	}
	/*
	 * If it is a regular file without any references,
	 * we do not need to sync it.
	 * Adjust the object size to be the size of the actual file.
	 */
1298	objsize = round_page(va.va_size);
1299	if (va.va_nlink == 0)
1300		flags |= MAP_NOSYNC;
1301	if (obj->type == OBJT_VNODE) {
1302		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
1303		    cred);
1304		if (obj == NULL) {
1305			error = ENOMEM;
1306			goto done;
1307		}
1308	} else {
1309		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
1310		    ("wrong object type"));
1311		VM_OBJECT_WLOCK(obj);
1312		vm_object_reference_locked(obj);
1313#if VM_NRESERVLEVEL > 0
1314		vm_object_color(obj, 0);
1315#endif
1316		VM_OBJECT_WUNLOCK(obj);
1317	}
1318	*objp = obj;
1319	*flagsp = flags;
1320
1321	vfs_mark_atime(vp, cred);
1322
1323done:
1324	if (error != 0 && *writecounted) {
1325		*writecounted = FALSE;
1326		vnode_pager_update_writecount(obj, objsize, 0);
1327	}
1328	vput(vp);
1329	return (error);
1330}
1331
1332/*
1333 * vm_mmap_cdev()
1334 *
1335 * MPSAFE
1336 *
1337 * Helper function for vm_mmap.  Perform sanity check specific for mmap
1338 * operations on cdevs.
1339 */
1340int
1341vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
1342    vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
1343    vm_ooffset_t *foff, vm_object_t *objp)
1344{
1345	vm_object_t obj;
1346	int error, flags;
1347
1348	flags = *flagsp;
1349
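	/*
	 * Devices that set D_MMAP_ANON (/dev/zero, for instance) are
	 * mapped as plain anonymous memory rather than through the
	 * device pager.
	 */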
1350	if (dsw->d_flags & D_MMAP_ANON) {
1351		*objp = NULL;
1352		*foff = 0;
1353		*maxprotp = VM_PROT_ALL;
1354		*flagsp |= MAP_ANON;
1355		return (0);
1356	}
1357	/*
1358	 * cdevs do not provide private mappings of any kind.
1359	 */
1360	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
1361	    (prot & VM_PROT_WRITE) != 0)
1362		return (EACCES);
1363	if (flags & (MAP_PRIVATE|MAP_COPY))
1364		return (EINVAL);
1365	/*
1366	 * Force device mappings to be shared.
1367	 */
1368	flags |= MAP_SHARED;
1369#ifdef MAC_XXX
1370	error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
1371	if (error != 0)
1372		return (error);
1373#endif
1374	/*
1375	 * First, try d_mmap_single().  If that is not implemented
1376	 * (returns ENODEV), fall back to using the device pager.
1377	 * Note that d_mmap_single() must return a reference to the
1378	 * object (it needs to bump the reference count of the object
1379	 * it returns somehow).
1380	 *
1381	 * XXX assumes VM_PROT_* == PROT_*
1382	 */
1383	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
1384	if (error != ENODEV)
1385		return (error);
1386	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
1387	    td->td_ucred);
1388	if (obj == NULL)
1389		return (EINVAL);
1390	*objp = obj;
1391	*flagsp = flags;
1392	return (0);
1393}
1394
1395/*
1396 * vm_mmap()
1397 *
1398 * Internal version of mmap used by exec, sys5 shared memory, and
1399 * various device drivers.  Handle is either a vnode pointer, a
1400 * character device, or NULL for MAP_ANON.
1401 */
1402int
1403vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
1404	vm_prot_t maxprot, int flags,
1405	objtype_t handle_type, void *handle,
1406	vm_ooffset_t foff)
1407{
1408	vm_object_t object;
1409	struct thread *td = curthread;
1410	int error;
1411	boolean_t writecounted;
1412
1413	if (size == 0)
1414		return (EINVAL);
1415
1416	size = round_page(size);
1417	object = NULL;
1418	writecounted = FALSE;
1419
1420	/*
1421	 * Lookup/allocate object.
1422	 */
1423	switch (handle_type) {
1424	case OBJT_DEVICE: {
1425		struct cdevsw *dsw;
1426		struct cdev *cdev;
1427		int ref;
1428
1429		cdev = handle;
1430		dsw = dev_refthread(cdev, &ref);
1431		if (dsw == NULL)
1432			return (ENXIO);
1433		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
1434		    dsw, &foff, &object);
1435		dev_relthread(cdev, ref);
1436		break;
1437	}
1438	case OBJT_VNODE:
1439		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
1440		    handle, &foff, &object, &writecounted);
1441		break;
1442	case OBJT_DEFAULT:
1443		if (handle == NULL) {
1444			error = 0;
1445			break;
1446		}
1447		/* FALLTHROUGH */
1448	default:
1449		error = EINVAL;
1450		break;
1451	}
1452	if (error)
1453		return (error);
1454
1455	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
1456	    foff, writecounted, td);
1457	if (error != 0 && object != NULL) {
1458		/*
1459		 * If this mapping was accounted for in the vnode's
1460		 * writecount, then undo that now.
1461		 */
1462		if (writecounted)
1463			vnode_pager_release_writecount(object, 0, size);
1464		vm_object_deallocate(object);
1465	}
1466	return (error);
1467}
1468
1469/*
 * Internal version of mmap that maps a specific VM object into a
1471 * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
1472 */
1473int
1474vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
1475    vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
1476    boolean_t writecounted, struct thread *td)
1477{
1478	boolean_t fitit;
1479	int docow, error, findspace, rv;
1480
1481	if (map == &td->td_proc->p_vmspace->vm_map) {
1482		PROC_LOCK(td->td_proc);
1483		if (map->size + size > lim_cur_proc(td->td_proc, RLIMIT_VMEM)) {
1484			PROC_UNLOCK(td->td_proc);
1485			return (ENOMEM);
1486		}
1487		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
1488			PROC_UNLOCK(td->td_proc);
1489			return (ENOMEM);
1490		}
1491		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
1492			if (ptoa(pmap_wired_count(map->pmap)) + size >
1493			    lim_cur_proc(td->td_proc, RLIMIT_MEMLOCK)) {
1494				racct_set_force(td->td_proc, RACCT_VMEM,
1495				    map->size);
1496				PROC_UNLOCK(td->td_proc);
1497				return (ENOMEM);
1498			}
1499			error = racct_set(td->td_proc, RACCT_MEMLOCK,
1500			    ptoa(pmap_wired_count(map->pmap)) + size);
1501			if (error != 0) {
1502				racct_set_force(td->td_proc, RACCT_VMEM,
1503				    map->size);
1504				PROC_UNLOCK(td->td_proc);
1505				return (error);
1506			}
1507		}
1508		PROC_UNLOCK(td->td_proc);
1509	}
1510
1511	/*
1512	 * We currently can only deal with page aligned file offsets.
1513	 * The mmap() system call already enforces this by subtracting
1514	 * the page offset from the file offset, but checking here
1515	 * catches errors in device drivers (e.g. d_single_mmap()
1516	 * callbacks) and other internal mapping requests (such as in
1517	 * exec).
1518	 */
1519	if (foff & PAGE_MASK)
1520		return (EINVAL);
1521
1522	if ((flags & MAP_FIXED) == 0) {
1523		fitit = TRUE;
1524		*addr = round_page(*addr);
1525	} else {
1526		if (*addr != trunc_page(*addr))
1527			return (EINVAL);
1528		fitit = FALSE;
1529	}
1530
1531	if (flags & MAP_ANON) {
1532		if (object != NULL || foff != 0)
1533			return (EINVAL);
1534		docow = 0;
1535	} else if (flags & MAP_PREFAULT_READ)
1536		docow = MAP_PREFAULT;
1537	else
1538		docow = MAP_PREFAULT_PARTIAL;
1539
1540	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
1541		docow |= MAP_COPY_ON_WRITE;
1542	if (flags & MAP_NOSYNC)
1543		docow |= MAP_DISABLE_SYNCER;
1544	if (flags & MAP_NOCORE)
1545		docow |= MAP_DISABLE_COREDUMP;
1546	/* Shared memory is also shared with children. */
1547	if (flags & MAP_SHARED)
1548		docow |= MAP_INHERIT_SHARE;
1549	if (writecounted)
1550		docow |= MAP_VN_WRITECOUNT;
1551	if (flags & MAP_STACK) {
1552		if (object != NULL)
1553			return (EINVAL);
1554		docow |= MAP_STACK_GROWS_DOWN;
1555	}
1556	if ((flags & MAP_EXCL) != 0)
1557		docow |= MAP_CHECK_EXCL;
1558
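	/*
	 * For non-fixed mappings, choose a placement policy: an
	 * explicit MAP_ALIGNED() request becomes VMFS_ALIGNED_SPACE
	 * with that alignment, MAP_ALIGNED_SUPER becomes
	 * VMFS_SUPER_SPACE, and the default VMFS_OPTIMAL_SPACE leaves
	 * the choice to vm_map_find(), which favors superpage-friendly
	 * placement.
	 */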
1559	if (fitit) {
1560		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
1561			findspace = VMFS_SUPER_SPACE;
1562		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
1563			findspace = VMFS_ALIGNED_SPACE(flags >>
1564			    MAP_ALIGNMENT_SHIFT);
1565		else
1566			findspace = VMFS_OPTIMAL_SPACE;
1567		rv = vm_map_find(map, object, foff, addr, size,
1568#ifdef MAP_32BIT
1569		    flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR :
1570#endif
1571		    0, findspace, prot, maxprot, docow);
1572	} else {
1573		rv = vm_map_fixed(map, object, foff, *addr, size,
1574		    prot, maxprot, docow);
1575	}
1576
1577	if (rv == KERN_SUCCESS) {
1578		/*
1579		 * If the process has requested that all future mappings
1580		 * be wired, then heed this.
1581		 */
1582		if (map->flags & MAP_WIREFUTURE) {
1583			vm_map_wire(map, *addr, *addr + size,
1584			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
1585			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
1586		}
1587	}
1588	return (vm_mmap_to_errno(rv));
1589}
1590
1591/*
1592 * Translate a Mach VM return code to zero on success or the appropriate errno
1593 * on failure.
1594 */
1595int
1596vm_mmap_to_errno(int rv)
1597{
1598
1599	switch (rv) {
1600	case KERN_SUCCESS:
1601		return (0);
1602	case KERN_INVALID_ADDRESS:
1603	case KERN_NO_SPACE:
1604		return (ENOMEM);
1605	case KERN_PROTECTION_FAILURE:
1606		return (EACCES);
1607	default:
1608		return (EINVAL);
1609	}
1610}
1611