vm_map.c revision 316073
1/*-
2 * Copyright (c) 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * The Mach Operating System project at Carnegie-Mellon University.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	from: @(#)vm_map.c	8.3 (Berkeley) 1/12/94
33 *
34 *
35 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36 * All rights reserved.
37 *
38 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39 *
40 * Permission to use, copy, modify and distribute this software and
41 * its documentation is hereby granted, provided that both the copyright
42 * notice and this permission notice appear in all copies of the
43 * software, derivative works or modified versions, and any portions
44 * thereof, and that both notices appear in supporting documentation.
45 *
46 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49 *
50 * Carnegie Mellon requests users of this software to return to
51 *
52 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53 *  School of Computer Science
54 *  Carnegie Mellon University
55 *  Pittsburgh PA 15213-3890
56 *
57 * any improvements or extensions that they make and grant Carnegie the
58 * rights to redistribute these changes.
59 */
60
61/*
62 *	Virtual memory mapping module.
63 */
64
65#include <sys/cdefs.h>
66__FBSDID("$FreeBSD: stable/11/sys/vm/vm_map.c 316073 2017-03-28 06:07:59Z kib $");
67
68#include <sys/param.h>
69#include <sys/systm.h>
70#include <sys/kernel.h>
71#include <sys/ktr.h>
72#include <sys/lock.h>
73#include <sys/mutex.h>
74#include <sys/proc.h>
75#include <sys/vmmeter.h>
76#include <sys/mman.h>
77#include <sys/vnode.h>
78#include <sys/racct.h>
79#include <sys/resourcevar.h>
80#include <sys/rwlock.h>
81#include <sys/file.h>
82#include <sys/sysctl.h>
83#include <sys/sysent.h>
84#include <sys/shm.h>
85
86#include <vm/vm.h>
87#include <vm/vm_param.h>
88#include <vm/pmap.h>
89#include <vm/vm_map.h>
90#include <vm/vm_page.h>
91#include <vm/vm_object.h>
92#include <vm/vm_pager.h>
93#include <vm/vm_kern.h>
94#include <vm/vm_extern.h>
95#include <vm/vnode_pager.h>
96#include <vm/swap_pager.h>
97#include <vm/uma.h>
98
99/*
100 *	Virtual memory maps provide for the mapping, protection,
101 *	and sharing of virtual memory objects.  In addition,
102 *	this module provides for an efficient virtual copy of
103 *	memory from one map to another.
104 *
105 *	Synchronization is required prior to most operations.
106 *
107 *	Maps consist of an ordered doubly-linked list of simple
108 *	entries; a self-adjusting binary search tree of these
109 *	entries is used to speed up lookups.
110 *
111 *	Since portions of maps are specified by start/end addresses,
112 *	which may not align with existing map entries, all
113 *	routines merely "clip" entries to these start/end values.
114 *	[That is, an entry is split into two, bordering at a
115 *	start or end value.]  Note that these clippings may not
116 *	always be necessary (as the two resulting entries are then
117 *	not changed); however, the clipping is done for convenience.
118 *
119 *	As mentioned above, virtual copy operations are performed
120 *	by copying VM object references from one map to
121 *	another, and then marking both regions as copy-on-write.
122 */
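
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * how the clipping described above splits an existing entry when an
 * operation targets the sub-range [start, end).  The addresses are made up
 * for the example.
 *
 *	before:   [ 0x1000 ............................................ 0x9000 )
 *	request:             [ start = 0x3000 ....... end = 0x6000 )
 *	after:    [ 0x1000 .. 0x3000 )[ 0x3000 .. 0x6000 )[ 0x6000 .. 0x9000 )
 *
 * The middle entry now exactly covers the requested range, so the operation
 * (protection change, wiring, deletion, ...) is applied to whole entries only.
 */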
123
124static struct mtx map_sleep_mtx;
125static uma_zone_t mapentzone;
126static uma_zone_t kmapentzone;
127static uma_zone_t mapzone;
128static uma_zone_t vmspace_zone;
129static int vmspace_zinit(void *mem, int size, int flags);
130static int vm_map_zinit(void *mem, int size, int flags);
131static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
132    vm_offset_t max);
133static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
134static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
135static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
136static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
137    vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
138#ifdef INVARIANTS
139static void vm_map_zdtor(void *mem, int size, void *arg);
140static void vmspace_zdtor(void *mem, int size, void *arg);
141#endif
142static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
143    vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
144    int cow);
145static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
146    vm_offset_t failed_addr);
147
148#define	ENTRY_CHARGED(e) ((e)->cred != NULL || \
149    ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
150     !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
151
152/*
153 * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
154 * stable.
155 */
156#define PROC_VMSPACE_LOCK(p) do { } while (0)
157#define PROC_VMSPACE_UNLOCK(p) do { } while (0)
158
159/*
160 *	VM_MAP_RANGE_CHECK:	[ internal use only ]
161 *
162 *	Asserts that the starting and ending region
163 *	addresses fall within the valid range of the map.
164 */
165#define	VM_MAP_RANGE_CHECK(map, start, end)		\
166		{					\
167		if (start < vm_map_min(map))		\
168			start = vm_map_min(map);	\
169		if (end > vm_map_max(map))		\
170			end = vm_map_max(map);		\
171		if (start > end)			\
172			start = end;			\
173		}
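
/*
 * Illustrative note (editorial addition): for a map whose valid range is
 * [0x1000, 0x9000), VM_MAP_RANGE_CHECK clamps a request of start = 0x0800,
 * end = 0xa000 to start = 0x1000, end = 0x9000, and a request entirely
 * outside the map collapses to an empty range (start == end), so the
 * per-entry loops that follow simply do nothing.
 */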
174
175/*
176 *	vm_map_startup:
177 *
178 *	Initialize the vm_map module.  Must be called before
179 *	any other vm_map routines.
180 *
181 *	Map and entry structures are allocated from the general
182 *	purpose memory pool with some exceptions:
183 *
184 *	- The kernel map and kmem submap are allocated statically.
185 *	- Kernel map entries are allocated out of a static pool.
186 *
187 *	These restrictions are necessary since malloc() uses the
188 *	maps and requires map entries.
189 */
190
191void
192vm_map_startup(void)
193{
194	mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
195	mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
196#ifdef INVARIANTS
197	    vm_map_zdtor,
198#else
199	    NULL,
200#endif
201	    vm_map_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
202	uma_prealloc(mapzone, MAX_KMAP);
203	kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
204	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
205	    UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
206	mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
207	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
208	vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
209#ifdef INVARIANTS
210	    vmspace_zdtor,
211#else
212	    NULL,
213#endif
214	    vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
215}
216
217static int
218vmspace_zinit(void *mem, int size, int flags)
219{
220	struct vmspace *vm;
221
222	vm = (struct vmspace *)mem;
223
224	vm->vm_map.pmap = NULL;
225	(void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
226	PMAP_LOCK_INIT(vmspace_pmap(vm));
227	return (0);
228}
229
230static int
231vm_map_zinit(void *mem, int size, int flags)
232{
233	vm_map_t map;
234
235	map = (vm_map_t)mem;
236	memset(map, 0, sizeof(*map));
237	mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK);
238	sx_init(&map->lock, "vm map (user)");
239	return (0);
240}
241
242#ifdef INVARIANTS
243static void
244vmspace_zdtor(void *mem, int size, void *arg)
245{
246	struct vmspace *vm;
247
248	vm = (struct vmspace *)mem;
249
250	vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg);
251}
252static void
253vm_map_zdtor(void *mem, int size, void *arg)
254{
255	vm_map_t map;
256
257	map = (vm_map_t)mem;
258	KASSERT(map->nentries == 0,
259	    ("map %p nentries == %d on free.",
260	    map, map->nentries));
261	KASSERT(map->size == 0,
262	    ("map %p size == %lu on free.",
263	    map, (unsigned long)map->size));
264}
265#endif	/* INVARIANTS */
266
267/*
268 * Allocate a vmspace structure, including a vm_map and pmap,
269 * and initialize those structures.  The refcnt is set to 1.
270 *
271 * If 'pinit' is NULL then the embedded pmap is initialized via pmap_pinit().
272 */
273struct vmspace *
274vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
275{
276	struct vmspace *vm;
277
278	vm = uma_zalloc(vmspace_zone, M_WAITOK);
279
280	KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
281
282	if (pinit == NULL)
283		pinit = &pmap_pinit;
284
285	if (!pinit(vmspace_pmap(vm))) {
286		uma_zfree(vmspace_zone, vm);
287		return (NULL);
288	}
289	CTR1(KTR_VM, "vmspace_alloc: %p", vm);
290	_vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
291	vm->vm_refcnt = 1;
292	vm->vm_shm = NULL;
293	vm->vm_swrss = 0;
294	vm->vm_tsize = 0;
295	vm->vm_dsize = 0;
296	vm->vm_ssize = 0;
297	vm->vm_taddr = 0;
298	vm->vm_daddr = 0;
299	vm->vm_maxsaddr = 0;
300	return (vm);
301}
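
/*
 * Illustrative sketch (editorial addition): typical creation and release of
 * a vmspace.  "minuser" and "maxuser" stand for the ABI-specific user
 * address bounds supplied by the caller; they are placeholders, not
 * identifiers from this file.
 *
 *	struct vmspace *vm;
 *
 *	vm = vmspace_alloc(minuser, maxuser, NULL);	(NULL selects pmap_pinit())
 *	if (vm == NULL)
 *		return (ENOMEM);
 *	...use vm...
 *	vmspace_free(vm);			(drops the refcnt that was set to 1)
 */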
302
303#ifdef RACCT
304static void
305vmspace_container_reset(struct proc *p)
306{
307
308	PROC_LOCK(p);
309	racct_set(p, RACCT_DATA, 0);
310	racct_set(p, RACCT_STACK, 0);
311	racct_set(p, RACCT_RSS, 0);
312	racct_set(p, RACCT_MEMLOCK, 0);
313	racct_set(p, RACCT_VMEM, 0);
314	PROC_UNLOCK(p);
315}
316#endif
317
318static inline void
319vmspace_dofree(struct vmspace *vm)
320{
321
322	CTR1(KTR_VM, "vmspace_free: %p", vm);
323
324	/*
325	 * Make sure any SysV shm is freed, it might not have been in
326	 * exit1().
327	 */
328	shmexit(vm);
329
330	/*
331	 * Lock the map, to wait out all other references to it.
332	 * Delete all of the mappings and pages they hold, then call
333	 * the pmap module to reclaim anything left.
334	 */
335	(void)vm_map_remove(&vm->vm_map, vm->vm_map.min_offset,
336	    vm->vm_map.max_offset);
337
338	pmap_release(vmspace_pmap(vm));
339	vm->vm_map.pmap = NULL;
340	uma_zfree(vmspace_zone, vm);
341}
342
343void
344vmspace_free(struct vmspace *vm)
345{
346
347	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
348	    "vmspace_free() called");
349
350	if (vm->vm_refcnt == 0)
351		panic("vmspace_free: attempt to free already freed vmspace");
352
353	if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1)
354		vmspace_dofree(vm);
355}
356
357void
358vmspace_exitfree(struct proc *p)
359{
360	struct vmspace *vm;
361
362	PROC_VMSPACE_LOCK(p);
363	vm = p->p_vmspace;
364	p->p_vmspace = NULL;
365	PROC_VMSPACE_UNLOCK(p);
366	KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
367	vmspace_free(vm);
368}
369
370void
371vmspace_exit(struct thread *td)
372{
373	int refcnt;
374	struct vmspace *vm;
375	struct proc *p;
376
377	/*
378	 * Release user portion of address space.
379	 * This releases references to vnodes,
380	 * which could cause I/O if the file has been unlinked.
381	 * Need to do this early enough that we can still sleep.
382	 *
383	 * The last exiting process to reach this point releases as
384	 * much of the environment as it can. vmspace_dofree() is the
385	 * slower fallback in case another process had a temporary
386	 * reference to the vmspace.
387	 */
388
389	p = td->td_proc;
390	vm = p->p_vmspace;
391	atomic_add_int(&vmspace0.vm_refcnt, 1);
392	do {
393		refcnt = vm->vm_refcnt;
394		if (refcnt > 1 && p->p_vmspace != &vmspace0) {
395			/* Switch now since other proc might free vmspace */
396			PROC_VMSPACE_LOCK(p);
397			p->p_vmspace = &vmspace0;
398			PROC_VMSPACE_UNLOCK(p);
399			pmap_activate(td);
400		}
401	} while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
402	if (refcnt == 1) {
403		if (p->p_vmspace != vm) {
404			/* vmspace not yet freed, switch back */
405			PROC_VMSPACE_LOCK(p);
406			p->p_vmspace = vm;
407			PROC_VMSPACE_UNLOCK(p);
408			pmap_activate(td);
409		}
410		pmap_remove_pages(vmspace_pmap(vm));
411		/* Switch now since this proc will free vmspace */
412		PROC_VMSPACE_LOCK(p);
413		p->p_vmspace = &vmspace0;
414		PROC_VMSPACE_UNLOCK(p);
415		pmap_activate(td);
416		vmspace_dofree(vm);
417	}
418#ifdef RACCT
419	if (racct_enable)
420		vmspace_container_reset(p);
421#endif
422}
423
424/* Acquire reference to vmspace owned by another process. */
425
426struct vmspace *
427vmspace_acquire_ref(struct proc *p)
428{
429	struct vmspace *vm;
430	int refcnt;
431
432	PROC_VMSPACE_LOCK(p);
433	vm = p->p_vmspace;
434	if (vm == NULL) {
435		PROC_VMSPACE_UNLOCK(p);
436		return (NULL);
437	}
438	do {
439		refcnt = vm->vm_refcnt;
440		if (refcnt <= 0) { 	/* Avoid 0->1 transition */
441			PROC_VMSPACE_UNLOCK(p);
442			return (NULL);
443		}
444	} while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt + 1));
445	if (vm != p->p_vmspace) {
446		PROC_VMSPACE_UNLOCK(p);
447		vmspace_free(vm);
448		return (NULL);
449	}
450	PROC_VMSPACE_UNLOCK(p);
451	return (vm);
452}
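
/*
 * Illustrative sketch (editorial addition): borrowing another process's
 * vmspace, e.g. from a debugger-style consumer.  "p" is a placeholder for a
 * properly held struct proc pointer.
 *
 *	struct vmspace *vm;
 *
 *	vm = vmspace_acquire_ref(p);
 *	if (vm == NULL)
 *		return (ESRCH);		(the process no longer has a vmspace)
 *	...inspect vm or its map...
 *	vmspace_free(vm);		(drop the reference taken above)
 */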
453
454/*
455 * Switch between vmspaces in an AIO kernel process.
456 *
457 * The AIO kernel processes switch to and from a user process's
458 * vmspace while performing an I/O operation on behalf of a user
459 * process.  The new vmspace is either the vmspace of a user process
460 * obtained from an active AIO request or the initial vmspace of the
461 * AIO kernel process (when it is idling).  Because user processes
462 * will block to drain any active AIO requests before proceeding in
463 * exit() or execve(), the vmspace reference count for these vmspaces
464 * can never be 0.  This allows for a much simpler implementation than
465 * the loop in vmspace_acquire_ref() above.  Similarly, AIO kernel
466 * processes hold an extra reference on their initial vmspace for the
467 * life of the process so that this guarantee is true for any vmspace
468 * passed as 'newvm'.
469 */
470void
471vmspace_switch_aio(struct vmspace *newvm)
472{
473	struct vmspace *oldvm;
474
475	/* XXX: Need some way to assert that this is an aio daemon. */
476
477	KASSERT(newvm->vm_refcnt > 0,
478	    ("vmspace_switch_aio: newvm unreferenced"));
479
480	oldvm = curproc->p_vmspace;
481	if (oldvm == newvm)
482		return;
483
484	/*
485	 * Point to the new address space and refer to it.
486	 */
487	curproc->p_vmspace = newvm;
488	atomic_add_int(&newvm->vm_refcnt, 1);
489
490	/* Activate the new mapping. */
491	pmap_activate(curthread);
492
493	/* Remove the daemon's reference to the old address space. */
494	KASSERT(oldvm->vm_refcnt > 1,
495	    ("vmspace_switch_aio: oldvm dropping last reference"));
496	vmspace_free(oldvm);
497}
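
/*
 * Illustrative sketch (editorial addition): how an AIO kernel process might
 * bracket an I/O with vmspace_switch_aio().  "uservm" stands for the vmspace
 * obtained from an active AIO request and "myvm" for the daemon's own
 * initial vmspace; both are guaranteed referenced per the comment above.
 *
 *	myvm = curproc->p_vmspace;
 *	vmspace_switch_aio(uservm);
 *	...copyin()/copyout() on behalf of the user process...
 *	vmspace_switch_aio(myvm);
 */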
498
499void
500_vm_map_lock(vm_map_t map, const char *file, int line)
501{
502
503	if (map->system_map)
504		mtx_lock_flags_(&map->system_mtx, 0, file, line);
505	else
506		sx_xlock_(&map->lock, file, line);
507	map->timestamp++;
508}
509
510static void
511vm_map_process_deferred(void)
512{
513	struct thread *td;
514	vm_map_entry_t entry, next;
515	vm_object_t object;
516
517	td = curthread;
518	entry = td->td_map_def_user;
519	td->td_map_def_user = NULL;
520	while (entry != NULL) {
521		next = entry->next;
522		if ((entry->eflags & MAP_ENTRY_VN_WRITECNT) != 0) {
523			/*
524			 * Decrement the object's writemappings and
525			 * possibly the vnode's v_writecount.
526			 */
527			KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
528			    ("Submap with writecount"));
529			object = entry->object.vm_object;
530			KASSERT(object != NULL, ("No object for writecount"));
531			vnode_pager_release_writecount(object, entry->start,
532			    entry->end);
533		}
534		vm_map_entry_deallocate(entry, FALSE);
535		entry = next;
536	}
537}
538
539void
540_vm_map_unlock(vm_map_t map, const char *file, int line)
541{
542
543	if (map->system_map)
544		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
545	else {
546		sx_xunlock_(&map->lock, file, line);
547		vm_map_process_deferred();
548	}
549}
550
551void
552_vm_map_lock_read(vm_map_t map, const char *file, int line)
553{
554
555	if (map->system_map)
556		mtx_lock_flags_(&map->system_mtx, 0, file, line);
557	else
558		sx_slock_(&map->lock, file, line);
559}
560
561void
562_vm_map_unlock_read(vm_map_t map, const char *file, int line)
563{
564
565	if (map->system_map)
566		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
567	else {
568		sx_sunlock_(&map->lock, file, line);
569		vm_map_process_deferred();
570	}
571}
572
573int
574_vm_map_trylock(vm_map_t map, const char *file, int line)
575{
576	int error;
577
578	error = map->system_map ?
579	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
580	    !sx_try_xlock_(&map->lock, file, line);
581	if (error == 0)
582		map->timestamp++;
583	return (error == 0);
584}
585
586int
587_vm_map_trylock_read(vm_map_t map, const char *file, int line)
588{
589	int error;
590
591	error = map->system_map ?
592	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
593	    !sx_try_slock_(&map->lock, file, line);
594	return (error == 0);
595}
596
597/*
598 *	_vm_map_lock_upgrade:	[ internal use only ]
599 *
600 *	Tries to upgrade a read (shared) lock on the specified map to a write
601 *	(exclusive) lock.  Returns the value "0" if the upgrade succeeds and a
602 *	non-zero value if the upgrade fails.  If the upgrade fails, the map is
603 *	returned without a read or write lock held.
604 *
605 *	Requires that the map be read locked.
606 */
607int
608_vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
609{
610	unsigned int last_timestamp;
611
612	if (map->system_map) {
613		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
614	} else {
615		if (!sx_try_upgrade_(&map->lock, file, line)) {
616			last_timestamp = map->timestamp;
617			sx_sunlock_(&map->lock, file, line);
618			vm_map_process_deferred();
619			/*
620			 * If the map's timestamp does not change while the
621			 * map is unlocked, then the upgrade succeeds.
622			 */
623			sx_xlock_(&map->lock, file, line);
624			if (last_timestamp != map->timestamp) {
625				sx_xunlock_(&map->lock, file, line);
626				return (1);
627			}
628		}
629	}
630	map->timestamp++;
631	return (0);
632}
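
/*
 * Illustrative sketch (editorial addition): the retry pattern implied by the
 * return convention above.  On failure the caller holds no lock at all and
 * must restart, because the map may have changed while it was unlocked.
 *
 *	vm_map_lock_read(map);
 *	...read-only inspection...
 *	if (vm_map_lock_upgrade(map) != 0) {
 *		vm_map_lock(map);
 *		...re-validate the earlier inspection...
 *	}
 *	...modify the map...
 *	vm_map_unlock(map);
 */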
633
634void
635_vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
636{
637
638	if (map->system_map) {
639		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
640	} else
641		sx_downgrade_(&map->lock, file, line);
642}
643
644/*
645 *	vm_map_locked:
646 *
647 *	Returns a non-zero value if the caller holds a write (exclusive) lock
648 *	on the specified map and the value "0" otherwise.
649 */
650int
651vm_map_locked(vm_map_t map)
652{
653
654	if (map->system_map)
655		return (mtx_owned(&map->system_mtx));
656	else
657		return (sx_xlocked(&map->lock));
658}
659
660#ifdef INVARIANTS
661static void
662_vm_map_assert_locked(vm_map_t map, const char *file, int line)
663{
664
665	if (map->system_map)
666		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
667	else
668		sx_assert_(&map->lock, SA_XLOCKED, file, line);
669}
670
671#define	VM_MAP_ASSERT_LOCKED(map) \
672    _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
673#else
674#define	VM_MAP_ASSERT_LOCKED(map)
675#endif
676
677/*
678 *	_vm_map_unlock_and_wait:
679 *
680 *	Atomically releases the lock on the specified map and puts the calling
681 *	thread to sleep.  The calling thread will remain asleep until either
682 *	vm_map_wakeup() is performed on the map or the specified timeout is
683 *	exceeded.
684 *
685 *	WARNING!  This function does not perform deferred deallocations of
686 *	objects and map	entries.  Therefore, the calling thread is expected to
687 *	reacquire the map lock after reawakening and later perform an ordinary
688 *	unlock operation, such as vm_map_unlock(), before completing its
689 *	operation on the map.
690 */
691int
692_vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
693{
694
695	mtx_lock(&map_sleep_mtx);
696	if (map->system_map)
697		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
698	else
699		sx_xunlock_(&map->lock, file, line);
700	return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
701	    timo));
702}
703
704/*
705 *	vm_map_wakeup:
706 *
707 *	Awaken any threads that have slept on the map using
708 *	vm_map_unlock_and_wait().
709 */
710void
711vm_map_wakeup(vm_map_t map)
712{
713
714	/*
715	 * Acquire and release map_sleep_mtx to prevent a wakeup()
716	 * from being performed (and lost) between the map unlock
717	 * and the msleep() in _vm_map_unlock_and_wait().
718	 */
719	mtx_lock(&map_sleep_mtx);
720	mtx_unlock(&map_sleep_mtx);
721	wakeup(&map->root);
722}
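
/*
 * Illustrative sketch (editorial addition): the waiter side of the
 * vm_map_unlock_and_wait()/vm_map_wakeup() pairing.  The thread that changes
 * the map calls vm_map_wakeup(map); map->needs_wakeup is the flag used
 * elsewhere in this file to avoid unnecessary wakeups.  Per the WARNING
 * above, the waiter must relock, re-check its condition, and eventually
 * perform a normal unlock so that deferred deallocations run.
 *
 *	vm_map_lock(map);
 *	while (...condition not yet met...) {
 *		map->needs_wakeup = TRUE;
 *		(void)vm_map_unlock_and_wait(map, 0);
 *		vm_map_lock(map);		(reacquire and re-check)
 *	}
 *	...operate on the map...
 *	vm_map_unlock(map);
 */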
723
724void
725vm_map_busy(vm_map_t map)
726{
727
728	VM_MAP_ASSERT_LOCKED(map);
729	map->busy++;
730}
731
732void
733vm_map_unbusy(vm_map_t map)
734{
735
736	VM_MAP_ASSERT_LOCKED(map);
737	KASSERT(map->busy, ("vm_map_unbusy: not busy"));
738	if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
739		vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
740		wakeup(&map->busy);
741	}
742}
743
744void
745vm_map_wait_busy(vm_map_t map)
746{
747
748	VM_MAP_ASSERT_LOCKED(map);
749	while (map->busy) {
750		vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
751		if (map->system_map)
752			msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
753		else
754			sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
755	}
756	map->timestamp++;
757}
758
759long
760vmspace_resident_count(struct vmspace *vmspace)
761{
762	return pmap_resident_count(vmspace_pmap(vmspace));
763}
764
765/*
766 *	vm_map_create:
767 *
768 *	Creates and returns a new empty VM map with
769 *	the given physical map structure, and having
770 *	the given lower and upper address bounds.
771 */
772vm_map_t
773vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
774{
775	vm_map_t result;
776
777	result = uma_zalloc(mapzone, M_WAITOK);
778	CTR1(KTR_VM, "vm_map_create: %p", result);
779	_vm_map_init(result, pmap, min, max);
780	return (result);
781}
782
783/*
784 * Initialize an existing vm_map structure
785 * such as that in the vmspace structure.
786 */
787static void
788_vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
789{
790
791	map->header.next = map->header.prev = &map->header;
792	map->needs_wakeup = FALSE;
793	map->system_map = 0;
794	map->pmap = pmap;
795	map->min_offset = min;
796	map->max_offset = max;
797	map->flags = 0;
798	map->root = NULL;
799	map->timestamp = 0;
800	map->busy = 0;
801}
802
803void
804vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
805{
806
807	_vm_map_init(map, pmap, min, max);
808	mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
809	sx_init(&map->lock, "user map");
810}
811
812/*
813 *	vm_map_entry_dispose:	[ internal use only ]
814 *
815 *	Inverse of vm_map_entry_create.
816 */
817static void
818vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
819{
820	uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
821}
822
823/*
824 *	vm_map_entry_create:	[ internal use only ]
825 *
826 *	Allocates a VM map entry for insertion.
827 *	No entry fields are filled in.
828 */
829static vm_map_entry_t
830vm_map_entry_create(vm_map_t map)
831{
832	vm_map_entry_t new_entry;
833
834	if (map->system_map)
835		new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
836	else
837		new_entry = uma_zalloc(mapentzone, M_WAITOK);
838	if (new_entry == NULL)
839		panic("vm_map_entry_create: kernel resources exhausted");
840	return (new_entry);
841}
842
843/*
844 *	vm_map_entry_set_behavior:
845 *
846 *	Set the expected access behavior, either normal, random, or
847 *	sequential.
848 */
849static inline void
850vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
851{
852	entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
853	    (behavior & MAP_ENTRY_BEHAV_MASK);
854}
855
856/*
857 *	vm_map_entry_set_max_free:
858 *
859 *	Set the max_free field in a vm_map_entry.
860 */
861static inline void
862vm_map_entry_set_max_free(vm_map_entry_t entry)
863{
864
865	entry->max_free = entry->adj_free;
866	if (entry->left != NULL && entry->left->max_free > entry->max_free)
867		entry->max_free = entry->left->max_free;
868	if (entry->right != NULL && entry->right->max_free > entry->max_free)
869		entry->max_free = entry->right->max_free;
870}
871
872/*
873 *	vm_map_entry_splay:
874 *
875 *	The Sleator and Tarjan top-down splay algorithm with the
876 *	following variation.  Max_free must be computed bottom-up, so
877 *	on the downward pass, maintain the left and right spines in
878 *	reverse order.  Then, make a second pass up each side to fix
879 *	the pointers and compute max_free.  The time bound is O(log n)
880 *	amortized.
881 *
882 *	The new root is the vm_map_entry containing "addr", or else an
883 *	adjacent entry (lower or higher) if addr is not in the tree.
884 *
885 *	The map must be locked, and leaves it so.
886 *
887 *	Returns: the new root.
888 */
889static vm_map_entry_t
890vm_map_entry_splay(vm_offset_t addr, vm_map_entry_t root)
891{
892	vm_map_entry_t llist, rlist;
893	vm_map_entry_t ltree, rtree;
894	vm_map_entry_t y;
895
896	/* Special case of empty tree. */
897	if (root == NULL)
898		return (root);
899
900	/*
901	 * Pass One: Splay down the tree until we find addr or a NULL
902	 * pointer where addr would go.  llist and rlist are the two
903	 * sides in reverse order (bottom-up), with llist linked by
904	 * the right pointer and rlist linked by the left pointer in
905	 * the vm_map_entry.  Wait until Pass Two to set max_free on
906	 * the two spines.
907	 */
908	llist = NULL;
909	rlist = NULL;
910	for (;;) {
911		/* root is never NULL in here. */
912		if (addr < root->start) {
913			y = root->left;
914			if (y == NULL)
915				break;
916			if (addr < y->start && y->left != NULL) {
917				/* Rotate right and put y on rlist. */
918				root->left = y->right;
919				y->right = root;
920				vm_map_entry_set_max_free(root);
921				root = y->left;
922				y->left = rlist;
923				rlist = y;
924			} else {
925				/* Put root on rlist. */
926				root->left = rlist;
927				rlist = root;
928				root = y;
929			}
930		} else if (addr >= root->end) {
931			y = root->right;
932			if (y == NULL)
933				break;
934			if (addr >= y->end && y->right != NULL) {
935				/* Rotate left and put y on llist. */
936				root->right = y->left;
937				y->left = root;
938				vm_map_entry_set_max_free(root);
939				root = y->right;
940				y->right = llist;
941				llist = y;
942			} else {
943				/* Put root on llist. */
944				root->right = llist;
945				llist = root;
946				root = y;
947			}
948		} else
949			break;
950	}
951
952	/*
953	 * Pass Two: Walk back up the two spines, flip the pointers
954	 * and set max_free.  The subtrees of the root go at the
955	 * bottom of llist and rlist.
956	 */
957	ltree = root->left;
958	while (llist != NULL) {
959		y = llist->right;
960		llist->right = ltree;
961		vm_map_entry_set_max_free(llist);
962		ltree = llist;
963		llist = y;
964	}
965	rtree = root->right;
966	while (rlist != NULL) {
967		y = rlist->left;
968		rlist->left = rtree;
969		vm_map_entry_set_max_free(rlist);
970		rtree = rlist;
971		rlist = y;
972	}
973
974	/*
975	 * Final assembly: add ltree and rtree as subtrees of root.
976	 */
977	root->left = ltree;
978	root->right = rtree;
979	vm_map_entry_set_max_free(root);
980
981	return (root);
982}
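
/*
 * Illustrative sketch (editorial addition): how callers in this file use the
 * splay.  After the call the returned entry is the new root and is either
 * the entry containing "addr" or an adjacent one, so a single comparison
 * finishes the lookup.
 *
 *	map->root = vm_map_entry_splay(addr, map->root);
 *	if (addr >= map->root->start && addr < map->root->end)
 *		...addr is mapped by map->root...
 */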
983
984/*
985 *	vm_map_entry_{un,}link:
986 *
987 *	Insert/remove entries from maps.
988 */
989static void
990vm_map_entry_link(vm_map_t map,
991		  vm_map_entry_t after_where,
992		  vm_map_entry_t entry)
993{
994
995	CTR4(KTR_VM,
996	    "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
997	    map->nentries, entry, after_where);
998	VM_MAP_ASSERT_LOCKED(map);
999	KASSERT(after_where == &map->header ||
1000	    after_where->end <= entry->start,
1001	    ("vm_map_entry_link: prev end %jx new start %jx overlap",
1002	    (uintmax_t)after_where->end, (uintmax_t)entry->start));
1003	KASSERT(after_where->next == &map->header ||
1004	    entry->end <= after_where->next->start,
1005	    ("vm_map_entry_link: new end %jx next start %jx overlap",
1006	    (uintmax_t)entry->end, (uintmax_t)after_where->next->start));
1007
1008	map->nentries++;
1009	entry->prev = after_where;
1010	entry->next = after_where->next;
1011	entry->next->prev = entry;
1012	after_where->next = entry;
1013
1014	if (after_where != &map->header) {
1015		if (after_where != map->root)
1016			vm_map_entry_splay(after_where->start, map->root);
1017		entry->right = after_where->right;
1018		entry->left = after_where;
1019		after_where->right = NULL;
1020		after_where->adj_free = entry->start - after_where->end;
1021		vm_map_entry_set_max_free(after_where);
1022	} else {
1023		entry->right = map->root;
1024		entry->left = NULL;
1025	}
1026	entry->adj_free = (entry->next == &map->header ? map->max_offset :
1027	    entry->next->start) - entry->end;
1028	vm_map_entry_set_max_free(entry);
1029	map->root = entry;
1030}
1031
1032static void
1033vm_map_entry_unlink(vm_map_t map,
1034		    vm_map_entry_t entry)
1035{
1036	vm_map_entry_t next, prev, root;
1037
1038	VM_MAP_ASSERT_LOCKED(map);
1039	if (entry != map->root)
1040		vm_map_entry_splay(entry->start, map->root);
1041	if (entry->left == NULL)
1042		root = entry->right;
1043	else {
1044		root = vm_map_entry_splay(entry->start, entry->left);
1045		root->right = entry->right;
1046		root->adj_free = (entry->next == &map->header ? map->max_offset :
1047		    entry->next->start) - root->end;
1048		vm_map_entry_set_max_free(root);
1049	}
1050	map->root = root;
1051
1052	prev = entry->prev;
1053	next = entry->next;
1054	next->prev = prev;
1055	prev->next = next;
1056	map->nentries--;
1057	CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
1058	    map->nentries, entry);
1059}
1060
1061/*
1062 *	vm_map_entry_resize_free:
1063 *
1064 *	Recompute the amount of free space following a vm_map_entry
1065 *	and propagate that value up the tree.  Call this function after
1066 *	resizing a map entry in-place, that is, without a call to
1067 *	vm_map_entry_link() or _unlink().
1068 *
1069 *	The map must be locked, and leaves it so.
1070 */
1071static void
1072vm_map_entry_resize_free(vm_map_t map, vm_map_entry_t entry)
1073{
1074
1075	/*
1076	 * Using splay trees without parent pointers, propagating
1077	 * max_free up the tree is done by moving the entry to the
1078	 * root and making the change there.
1079	 */
1080	if (entry != map->root)
1081		map->root = vm_map_entry_splay(entry->start, map->root);
1082
1083	entry->adj_free = (entry->next == &map->header ? map->max_offset :
1084	    entry->next->start) - entry->end;
1085	vm_map_entry_set_max_free(entry);
1086}
1087
1088/*
1089 *	vm_map_lookup_entry:	[ internal use only ]
1090 *
1091 *	Finds the map entry containing (or
1092 *	immediately preceding) the specified address
1093 *	in the given map; the entry is returned
1094 *	in the "entry" parameter.  The boolean
1095 *	result indicates whether the address is
1096 *	actually contained in the map.
1097 */
1098boolean_t
1099vm_map_lookup_entry(
1100	vm_map_t map,
1101	vm_offset_t address,
1102	vm_map_entry_t *entry)	/* OUT */
1103{
1104	vm_map_entry_t cur;
1105	boolean_t locked;
1106
1107	/*
1108	 * If the map is empty, then the map entry immediately preceding
1109	 * "address" is the map's header.
1110	 */
1111	cur = map->root;
1112	if (cur == NULL)
1113		*entry = &map->header;
1114	else if (address >= cur->start && cur->end > address) {
1115		*entry = cur;
1116		return (TRUE);
1117	} else if ((locked = vm_map_locked(map)) ||
1118	    sx_try_upgrade(&map->lock)) {
1119		/*
1120		 * Splay requires a write lock on the map.  However, it only
1121		 * restructures the binary search tree; it does not otherwise
1122		 * change the map.  Thus, the map's timestamp need not change
1123		 * on a temporary upgrade.
1124		 */
1125		map->root = cur = vm_map_entry_splay(address, cur);
1126		if (!locked)
1127			sx_downgrade(&map->lock);
1128
1129		/*
1130		 * If "address" is contained within a map entry, the new root
1131		 * is that map entry.  Otherwise, the new root is a map entry
1132		 * immediately before or after "address".
1133		 */
1134		if (address >= cur->start) {
1135			*entry = cur;
1136			if (cur->end > address)
1137				return (TRUE);
1138		} else
1139			*entry = cur->prev;
1140	} else
1141		/*
1142		 * Since the map is only locked for read access, perform a
1143		 * standard binary search tree lookup for "address".
1144		 */
1145		for (;;) {
1146			if (address < cur->start) {
1147				if (cur->left == NULL) {
1148					*entry = cur->prev;
1149					break;
1150				}
1151				cur = cur->left;
1152			} else if (cur->end > address) {
1153				*entry = cur;
1154				return (TRUE);
1155			} else {
1156				if (cur->right == NULL) {
1157					*entry = cur;
1158					break;
1159				}
1160				cur = cur->right;
1161			}
1162		}
1163	return (FALSE);
1164}
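
/*
 * Illustrative sketch (editorial addition): the two possible outcomes for a
 * caller of vm_map_lookup_entry().
 *
 *	vm_map_entry_t entry;
 *
 *	if (vm_map_lookup_entry(map, addr, &entry)) {
 *		...addr lies within [entry->start, entry->end)...
 *	} else {
 *		...entry is the preceding entry, possibly &map->header;
 *	 	   the first entry at or above addr is entry->next...
 *	}
 */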
1165
1166/*
1167 *	vm_map_insert:
1168 *
1169 *	Inserts the given whole VM object into the target
1170 *	map at the specified address range.  The object's
1171 *	size should match that of the address range.
1172 *
1173 *	Requires that the map be locked, and leaves it so.
1174 *
1175 *	If object is non-NULL, ref count must be bumped by caller
1176 *	prior to making call to account for the new entry.
1177 */
1178int
1179vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1180    vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
1181{
1182	vm_map_entry_t new_entry, prev_entry, temp_entry;
1183	struct ucred *cred;
1184	vm_eflags_t protoeflags;
1185	vm_inherit_t inheritance;
1186
1187	VM_MAP_ASSERT_LOCKED(map);
1188	KASSERT((object != kmem_object && object != kernel_object) ||
1189	    (cow & MAP_COPY_ON_WRITE) == 0,
1190	    ("vm_map_insert: kmem or kernel object and COW"));
1191	KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
1192	    ("vm_map_insert: paradoxical MAP_NOFAULT request"));
1193
1194	/*
1195	 * Check that the start and end points are not bogus.
1196	 */
1197	if (start < map->min_offset || end > map->max_offset || start >= end)
1198		return (KERN_INVALID_ADDRESS);
1199
1200	/*
1201	 * Find the entry prior to the proposed starting address; if it's part
1202	 * of an existing entry, this range is bogus.
1203	 */
1204	if (vm_map_lookup_entry(map, start, &temp_entry))
1205		return (KERN_NO_SPACE);
1206
1207	prev_entry = temp_entry;
1208
1209	/*
1210	 * Assert that the next entry doesn't overlap the end point.
1211	 */
1212	if (prev_entry->next != &map->header && prev_entry->next->start < end)
1213		return (KERN_NO_SPACE);
1214
1215	protoeflags = 0;
1216	if (cow & MAP_COPY_ON_WRITE)
1217		protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
1218	if (cow & MAP_NOFAULT)
1219		protoeflags |= MAP_ENTRY_NOFAULT;
1220	if (cow & MAP_DISABLE_SYNCER)
1221		protoeflags |= MAP_ENTRY_NOSYNC;
1222	if (cow & MAP_DISABLE_COREDUMP)
1223		protoeflags |= MAP_ENTRY_NOCOREDUMP;
1224	if (cow & MAP_STACK_GROWS_DOWN)
1225		protoeflags |= MAP_ENTRY_GROWS_DOWN;
1226	if (cow & MAP_STACK_GROWS_UP)
1227		protoeflags |= MAP_ENTRY_GROWS_UP;
1228	if (cow & MAP_VN_WRITECOUNT)
1229		protoeflags |= MAP_ENTRY_VN_WRITECNT;
1230	if (cow & MAP_INHERIT_SHARE)
1231		inheritance = VM_INHERIT_SHARE;
1232	else
1233		inheritance = VM_INHERIT_DEFAULT;
1234
1235	cred = NULL;
1236	if (cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT))
1237		goto charged;
1238	if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
1239	    ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
1240		if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
1241			return (KERN_RESOURCE_SHORTAGE);
1242		KASSERT(object == NULL ||
1243		    (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
1244		    object->cred == NULL,
1245		    ("overcommit: vm_map_insert o %p", object));
1246		cred = curthread->td_ucred;
1247	}
1248
1249charged:
1250	/* Expand the kernel pmap, if necessary. */
1251	if (map == kernel_map && end > kernel_vm_end)
1252		pmap_growkernel(end);
1253	if (object != NULL) {
1254		/*
1255		 * OBJ_ONEMAPPING must be cleared unless this mapping
1256		 * is trivially proven to be the only mapping for any
1257		 * of the object's pages.  (Object granularity
1258		 * reference counting is insufficient to recognize
1259		 * aliases with precision.)
1260		 */
1261		VM_OBJECT_WLOCK(object);
1262		if (object->ref_count > 1 || object->shadow_count != 0)
1263			vm_object_clear_flag(object, OBJ_ONEMAPPING);
1264		VM_OBJECT_WUNLOCK(object);
1265	} else if (prev_entry != &map->header &&
1266	    prev_entry->eflags == protoeflags &&
1267	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 &&
1268	    prev_entry->end == start && prev_entry->wired_count == 0 &&
1269	    (prev_entry->cred == cred ||
1270	    (prev_entry->object.vm_object != NULL &&
1271	    prev_entry->object.vm_object->cred == cred)) &&
1272	    vm_object_coalesce(prev_entry->object.vm_object,
1273	    prev_entry->offset,
1274	    (vm_size_t)(prev_entry->end - prev_entry->start),
1275	    (vm_size_t)(end - prev_entry->end), cred != NULL &&
1276	    (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
1277		/*
1278		 * We were able to extend the object.  Determine if we
1279		 * can extend the previous map entry to include the
1280		 * new range as well.
1281		 */
1282		if (prev_entry->inheritance == inheritance &&
1283		    prev_entry->protection == prot &&
1284		    prev_entry->max_protection == max) {
1285			map->size += end - prev_entry->end;
1286			prev_entry->end = end;
1287			vm_map_entry_resize_free(map, prev_entry);
1288			vm_map_simplify_entry(map, prev_entry);
1289			return (KERN_SUCCESS);
1290		}
1291
1292		/*
1293		 * If we can extend the object but cannot extend the
1294		 * map entry, we have to create a new map entry.  We
1295		 * must bump the ref count on the extended object to
1296		 * account for it.  object may be NULL.
1297		 */
1298		object = prev_entry->object.vm_object;
1299		offset = prev_entry->offset +
1300		    (prev_entry->end - prev_entry->start);
1301		vm_object_reference(object);
1302		if (cred != NULL && object != NULL && object->cred != NULL &&
1303		    !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
1304			/* Object already accounts for this uid. */
1305			cred = NULL;
1306		}
1307	}
1308	if (cred != NULL)
1309		crhold(cred);
1310
1311	/*
1312	 * Create a new entry
1313	 */
1314	new_entry = vm_map_entry_create(map);
1315	new_entry->start = start;
1316	new_entry->end = end;
1317	new_entry->cred = NULL;
1318
1319	new_entry->eflags = protoeflags;
1320	new_entry->object.vm_object = object;
1321	new_entry->offset = offset;
1322	new_entry->avail_ssize = 0;
1323
1324	new_entry->inheritance = inheritance;
1325	new_entry->protection = prot;
1326	new_entry->max_protection = max;
1327	new_entry->wired_count = 0;
1328	new_entry->wiring_thread = NULL;
1329	new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
1330	new_entry->next_read = start;
1331
1332	KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
1333	    ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
1334	new_entry->cred = cred;
1335
1336	/*
1337	 * Insert the new entry into the list
1338	 */
1339	vm_map_entry_link(map, prev_entry, new_entry);
1340	map->size += new_entry->end - new_entry->start;
1341
1342	/*
1343	 * Try to coalesce the new entry with both the previous and next
1344	 * entries in the list.  Previously, we only attempted to coalesce
1345	 * with the previous entry when object is NULL.  Here, we handle the
1346	 * other cases, which are less common.
1347	 */
1348	vm_map_simplify_entry(map, new_entry);
1349
1350	if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
1351		vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
1352		    end - start, cow & MAP_PREFAULT_PARTIAL);
1353	}
1354
1355	return (KERN_SUCCESS);
1356}
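
/*
 * Illustrative sketch (editorial addition): a minimal caller of
 * vm_map_insert() honoring its stated requirements: the map is locked and
 * the object's reference count was bumped beforehand.  The variable names
 * are placeholders.  On failure the caller still owns that reference and
 * must drop it.
 *
 *	vm_object_reference(object);
 *	vm_map_lock(map);
 *	rv = vm_map_insert(map, object, 0, start, start + size,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, 0);
 *	vm_map_unlock(map);
 *	if (rv != KERN_SUCCESS)
 *		vm_object_deallocate(object);
 */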
1357
1358/*
1359 *	vm_map_findspace:
1360 *
1361 *	Find the first fit (lowest VM address) for "length" free bytes
1362 *	beginning at address >= start in the given map.
1363 *
1364 *	In a vm_map_entry, "adj_free" is the amount of free space
1365 *	adjacent (higher address) to this entry, and "max_free" is the
1366 *	maximum amount of contiguous free space in its subtree.  This
1367 *	allows finding a free region in one path down the tree, so
1368 *	O(log n) amortized with splay trees.
1369 *
1370 *	The map must be locked, and leaves it so.
1371 *
1372 *	Returns: 0 on success, and starting address in *addr,
1373 *		 1 if insufficient space.
1374 */
1375int
1376vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1377    vm_offset_t *addr)	/* OUT */
1378{
1379	vm_map_entry_t entry;
1380	vm_offset_t st;
1381
1382	/*
1383	 * Request must fit within min/max VM address and must avoid
1384	 * address wrap.
1385	 */
1386	if (start < map->min_offset)
1387		start = map->min_offset;
1388	if (start + length > map->max_offset || start + length < start)
1389		return (1);
1390
1391	/* Empty tree means wide open address space. */
1392	if (map->root == NULL) {
1393		*addr = start;
1394		return (0);
1395	}
1396
1397	/*
1398	 * After splay, if start comes before root node, then there
1399	 * must be a gap from start to the root.
1400	 */
1401	map->root = vm_map_entry_splay(start, map->root);
1402	if (start + length <= map->root->start) {
1403		*addr = start;
1404		return (0);
1405	}
1406
1407	/*
1408	 * Root is the last node that might begin its gap before
1409	 * start, and this is the last comparison where address
1410	 * wrap might be a problem.
1411	 */
1412	st = (start > map->root->end) ? start : map->root->end;
1413	if (length <= map->root->end + map->root->adj_free - st) {
1414		*addr = st;
1415		return (0);
1416	}
1417
1418	/* With max_free, can immediately tell if no solution. */
1419	entry = map->root->right;
1420	if (entry == NULL || length > entry->max_free)
1421		return (1);
1422
1423	/*
1424	 * Search the right subtree in the order: left subtree, root,
1425	 * right subtree (first fit).  The previous splay implies that
1426	 * all regions in the right subtree have addresses > start.
1427	 */
1428	while (entry != NULL) {
1429		if (entry->left != NULL && entry->left->max_free >= length)
1430			entry = entry->left;
1431		else if (entry->adj_free >= length) {
1432			*addr = entry->end;
1433			return (0);
1434		} else
1435			entry = entry->right;
1436	}
1437
1438	/* Can't get here, so panic if we do. */
1439	panic("vm_map_findspace: max_free corrupt");
1440}
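
/*
 * Illustrative sketch (editorial addition): pairing vm_map_findspace() with
 * vm_map_insert() under a single lock hold, which is essentially what
 * vm_map_find() below does for the VMFS_ANY_SPACE case.
 *
 *	vm_map_lock(map);
 *	if (vm_map_findspace(map, vm_map_min(map), length, &addr) == 0)
 *		rv = vm_map_insert(map, NULL, 0, addr, addr + length,
 *		    prot, max, 0);
 *	else
 *		rv = KERN_NO_SPACE;
 *	vm_map_unlock(map);
 */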
1441
1442int
1443vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1444    vm_offset_t start, vm_size_t length, vm_prot_t prot,
1445    vm_prot_t max, int cow)
1446{
1447	vm_offset_t end;
1448	int result;
1449
1450	end = start + length;
1451	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1452	    object == NULL,
1453	    ("vm_map_fixed: non-NULL backing object for stack"));
1454	vm_map_lock(map);
1455	VM_MAP_RANGE_CHECK(map, start, end);
1456	if ((cow & MAP_CHECK_EXCL) == 0)
1457		vm_map_delete(map, start, end);
1458	if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1459		result = vm_map_stack_locked(map, start, length, sgrowsiz,
1460		    prot, max, cow);
1461	} else {
1462		result = vm_map_insert(map, object, offset, start, end,
1463		    prot, max, cow);
1464	}
1465	vm_map_unlock(map);
1466	return (result);
1467}
1468
1469/*
1470 *	vm_map_find finds an unallocated region in the target address
1471 *	map with the given length.  The search is defined to be
1472 *	first-fit from the specified address; the region found is
1473 *	returned in the same parameter.
1474 *
1475 *	If object is non-NULL, ref count must be bumped by caller
1476 *	prior to making call to account for the new entry.
1477 */
1478int
1479vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1480	    vm_offset_t *addr,	/* IN/OUT */
1481	    vm_size_t length, vm_offset_t max_addr, int find_space,
1482	    vm_prot_t prot, vm_prot_t max, int cow)
1483{
1484	vm_offset_t alignment, initial_addr, start;
1485	int result;
1486
1487	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1488	    object == NULL,
1489	    ("vm_map_find: non-NULL backing object for stack"));
1490	if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
1491	    (object->flags & OBJ_COLORED) == 0))
1492		find_space = VMFS_ANY_SPACE;
1493	if (find_space >> 8 != 0) {
1494		KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
1495		alignment = (vm_offset_t)1 << (find_space >> 8);
1496	} else
1497		alignment = 0;
1498	initial_addr = *addr;
1499again:
1500	start = initial_addr;
1501	vm_map_lock(map);
1502	do {
1503		if (find_space != VMFS_NO_SPACE) {
1504			if (vm_map_findspace(map, start, length, addr) ||
1505			    (max_addr != 0 && *addr + length > max_addr)) {
1506				vm_map_unlock(map);
1507				if (find_space == VMFS_OPTIMAL_SPACE) {
1508					find_space = VMFS_ANY_SPACE;
1509					goto again;
1510				}
1511				return (KERN_NO_SPACE);
1512			}
1513			switch (find_space) {
1514			case VMFS_SUPER_SPACE:
1515			case VMFS_OPTIMAL_SPACE:
1516				pmap_align_superpage(object, offset, addr,
1517				    length);
1518				break;
1519			case VMFS_ANY_SPACE:
1520				break;
1521			default:
1522				if ((*addr & (alignment - 1)) != 0) {
1523					*addr &= ~(alignment - 1);
1524					*addr += alignment;
1525				}
1526				break;
1527			}
1528
1529			start = *addr;
1530		}
1531		if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1532			result = vm_map_stack_locked(map, start, length,
1533			    sgrowsiz, prot, max, cow);
1534		} else {
1535			result = vm_map_insert(map, object, offset, start,
1536			    start + length, prot, max, cow);
1537		}
1538	} while (result == KERN_NO_SPACE && find_space != VMFS_NO_SPACE &&
1539	    find_space != VMFS_ANY_SPACE);
1540	vm_map_unlock(map);
1541	return (result);
1542}
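
/*
 * Illustrative sketch (editorial addition): a typical anonymous mapping
 * through vm_map_find(), which takes the map lock itself.  The hint in
 * *addr is only a starting point for the first-fit search.
 *
 *	addr = vm_map_min(map);
 *	rv = vm_map_find(map, NULL, 0, &addr, size, 0, VMFS_ANY_SPACE,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, 0);
 *	if (rv == KERN_SUCCESS)
 *		...the mapping starts at addr...
 */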
1543
1544/*
1545 *	vm_map_simplify_entry:
1546 *
1547 *	Simplify the given map entry by merging with either neighbor.  This
1548 *	routine also has the ability to merge with both neighbors.
1549 *
1550 *	The map must be locked.
1551 *
1552 *	This routine guarantees that the passed entry remains valid (though
1553 *	possibly extended).  When merging, this routine may delete one or
1554 *	both neighbors.
1555 */
1556void
1557vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
1558{
1559	vm_map_entry_t next, prev;
1560	vm_size_t prevsize, esize;
1561
1562	if ((entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP |
1563	    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)) != 0)
1564		return;
1565
1566	prev = entry->prev;
1567	if (prev != &map->header) {
1568		prevsize = prev->end - prev->start;
1569		if ( (prev->end == entry->start) &&
1570		     (prev->object.vm_object == entry->object.vm_object) &&
1571		     (!prev->object.vm_object ||
1572			(prev->offset + prevsize == entry->offset)) &&
1573		     (prev->eflags == entry->eflags) &&
1574		     (prev->protection == entry->protection) &&
1575		     (prev->max_protection == entry->max_protection) &&
1576		     (prev->inheritance == entry->inheritance) &&
1577		     (prev->wired_count == entry->wired_count) &&
1578		     (prev->cred == entry->cred)) {
1579			vm_map_entry_unlink(map, prev);
1580			entry->start = prev->start;
1581			entry->offset = prev->offset;
1582			if (entry->prev != &map->header)
1583				vm_map_entry_resize_free(map, entry->prev);
1584
1585			/*
1586			 * If the backing object is a vnode object,
1587			 * vm_object_deallocate() calls vrele().
1588			 * However, vrele() does not lock the vnode
1589			 * because the vnode has additional
1590			 * references.  Thus, the map lock can be kept
1591			 * without causing a lock-order reversal with
1592			 * the vnode lock.
1593			 *
1594			 * Since we count the number of virtual page
1595			 * mappings in object->un_pager.vnp.writemappings,
1596			 * the writemappings value should not be adjusted
1597			 * when the entry is disposed of.
1598			 */
1599			if (prev->object.vm_object)
1600				vm_object_deallocate(prev->object.vm_object);
1601			if (prev->cred != NULL)
1602				crfree(prev->cred);
1603			vm_map_entry_dispose(map, prev);
1604		}
1605	}
1606
1607	next = entry->next;
1608	if (next != &map->header) {
1609		esize = entry->end - entry->start;
1610		if ((entry->end == next->start) &&
1611		    (next->object.vm_object == entry->object.vm_object) &&
1612		     (!entry->object.vm_object ||
1613			(entry->offset + esize == next->offset)) &&
1614		    (next->eflags == entry->eflags) &&
1615		    (next->protection == entry->protection) &&
1616		    (next->max_protection == entry->max_protection) &&
1617		    (next->inheritance == entry->inheritance) &&
1618		    (next->wired_count == entry->wired_count) &&
1619		    (next->cred == entry->cred)) {
1620			vm_map_entry_unlink(map, next);
1621			entry->end = next->end;
1622			vm_map_entry_resize_free(map, entry);
1623
1624			/*
1625			 * See comment above.
1626			 */
1627			if (next->object.vm_object)
1628				vm_object_deallocate(next->object.vm_object);
1629			if (next->cred != NULL)
1630				crfree(next->cred);
1631			vm_map_entry_dispose(map, next);
1632		}
1633	}
1634}
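
/*
 * Illustrative sketch (editorial addition): the merge performed above.  Two
 * neighbors coalesce only when they are virtually and physically contiguous
 * and agree on every attribute that is compared.
 *
 *	before:  [ 0x1000, 0x3000 )  obj A, offset 0x0000, prot RW
 *	         [ 0x3000, 0x5000 )  obj A, offset 0x2000, prot RW
 *	after:   [ 0x1000, 0x5000 )  obj A, offset 0x0000, prot RW
 *
 * If any compared field differed (protection, inheritance, wiring,
 * credentials, ...), both entries would be left untouched.
 */
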
1635/*
1636 *	vm_map_clip_start:	[ internal use only ]
1637 *
1638 *	Asserts that the given entry begins at or after
1639 *	the specified address; if necessary,
1640 *	it splits the entry into two.
1641 */
1642#define vm_map_clip_start(map, entry, startaddr) \
1643{ \
1644	if (startaddr > entry->start) \
1645		_vm_map_clip_start(map, entry, startaddr); \
1646}
1647
1648/*
1649 *	This routine is called only when it is known that
1650 *	the entry must be split.
1651 */
1652static void
1653_vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
1654{
1655	vm_map_entry_t new_entry;
1656
1657	VM_MAP_ASSERT_LOCKED(map);
1658
1659	/*
1660	 * Split off the front portion -- note that we must insert the new
1661	 * entry BEFORE this one, so that this entry has the specified
1662	 * starting address.
1663	 */
1664	vm_map_simplify_entry(map, entry);
1665
1666	/*
1667	 * If there is no object backing this entry, we might as well create
1668	 * one now.  If we defer it, an object can get created after the map
1669	 * is clipped, and individual objects will be created for the split-up
1670	 * map.  This is a bit of a hack, but is also about the best place to
1671	 * put this improvement.
1672	 */
1673	if (entry->object.vm_object == NULL && !map->system_map) {
1674		vm_object_t object;
1675		object = vm_object_allocate(OBJT_DEFAULT,
1676				atop(entry->end - entry->start));
1677		entry->object.vm_object = object;
1678		entry->offset = 0;
1679		if (entry->cred != NULL) {
1680			object->cred = entry->cred;
1681			object->charge = entry->end - entry->start;
1682			entry->cred = NULL;
1683		}
1684	} else if (entry->object.vm_object != NULL &&
1685		   ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
1686		   entry->cred != NULL) {
1687		VM_OBJECT_WLOCK(entry->object.vm_object);
1688		KASSERT(entry->object.vm_object->cred == NULL,
1689		    ("OVERCOMMIT: vm_entry_clip_start: both cred e %p", entry));
1690		entry->object.vm_object->cred = entry->cred;
1691		entry->object.vm_object->charge = entry->end - entry->start;
1692		VM_OBJECT_WUNLOCK(entry->object.vm_object);
1693		entry->cred = NULL;
1694	}
1695
1696	new_entry = vm_map_entry_create(map);
1697	*new_entry = *entry;
1698
1699	new_entry->end = start;
1700	entry->offset += (start - entry->start);
1701	entry->start = start;
1702	if (new_entry->cred != NULL)
1703		crhold(entry->cred);
1704
1705	vm_map_entry_link(map, entry->prev, new_entry);
1706
1707	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1708		vm_object_reference(new_entry->object.vm_object);
1709		/*
1710		 * The object->un_pager.vnp.writemappings for the
1711		 * object of MAP_ENTRY_VN_WRITECNT type entry shall be
1712		 * kept as is here.  The virtual pages are
1713		 * re-distributed among the clipped entries, so the sum is
1714		 * left the same.
1715		 */
1716	}
1717}
1718
1719/*
1720 *	vm_map_clip_end:	[ internal use only ]
1721 *
1722 *	Asserts that the given entry ends at or before
1723 *	the specified address; if necessary,
1724 *	it splits the entry into two.
1725 */
1726#define vm_map_clip_end(map, entry, endaddr) \
1727{ \
1728	if ((endaddr) < (entry->end)) \
1729		_vm_map_clip_end((map), (entry), (endaddr)); \
1730}
1731
1732/*
1733 *	This routine is called only when it is known that
1734 *	the entry must be split.
1735 */
1736static void
1737_vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
1738{
1739	vm_map_entry_t new_entry;
1740
1741	VM_MAP_ASSERT_LOCKED(map);
1742
1743	/*
1744	 * If there is no object backing this entry, we might as well create
1745	 * one now.  If we defer it, an object can get created after the map
1746	 * is clipped, and individual objects will be created for the split-up
1747	 * map.  This is a bit of a hack, but is also about the best place to
1748	 * put this improvement.
1749	 */
1750	if (entry->object.vm_object == NULL && !map->system_map) {
1751		vm_object_t object;
1752		object = vm_object_allocate(OBJT_DEFAULT,
1753				atop(entry->end - entry->start));
1754		entry->object.vm_object = object;
1755		entry->offset = 0;
1756		if (entry->cred != NULL) {
1757			object->cred = entry->cred;
1758			object->charge = entry->end - entry->start;
1759			entry->cred = NULL;
1760		}
1761	} else if (entry->object.vm_object != NULL &&
1762		   ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
1763		   entry->cred != NULL) {
1764		VM_OBJECT_WLOCK(entry->object.vm_object);
1765		KASSERT(entry->object.vm_object->cred == NULL,
1766		    ("OVERCOMMIT: vm_entry_clip_end: both cred e %p", entry));
1767		entry->object.vm_object->cred = entry->cred;
1768		entry->object.vm_object->charge = entry->end - entry->start;
1769		VM_OBJECT_WUNLOCK(entry->object.vm_object);
1770		entry->cred = NULL;
1771	}
1772
1773	/*
1774	 * Create a new entry and insert it AFTER the specified entry
1775	 */
1776	new_entry = vm_map_entry_create(map);
1777	*new_entry = *entry;
1778
1779	new_entry->start = entry->end = end;
1780	new_entry->offset += (end - entry->start);
1781	if (new_entry->cred != NULL)
1782		crhold(entry->cred);
1783
1784	vm_map_entry_link(map, entry, new_entry);
1785
1786	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1787		vm_object_reference(new_entry->object.vm_object);
1788	}
1789}
1790
1791/*
1792 *	vm_map_submap:		[ kernel use only ]
1793 *
1794 *	Mark the given range as handled by a subordinate map.
1795 *
1796 *	This range must have been created with vm_map_find,
1797 *	and no other operations may have been performed on this
1798 *	range prior to calling vm_map_submap.
1799 *
1800 *	Only a limited number of operations can be performed
1801 *	within this range after calling vm_map_submap:
1802 *		vm_fault
1803 *	[Don't try vm_map_copy!]
1804 *
1805 *	To remove a submapping, one must first remove the
1806 *	range from the superior map, and then destroy the
1807 *	submap (if desired).  [Better yet, don't try it.]
1808 */
1809int
1810vm_map_submap(
1811	vm_map_t map,
1812	vm_offset_t start,
1813	vm_offset_t end,
1814	vm_map_t submap)
1815{
1816	vm_map_entry_t entry;
1817	int result = KERN_INVALID_ARGUMENT;
1818
1819	vm_map_lock(map);
1820
1821	VM_MAP_RANGE_CHECK(map, start, end);
1822
1823	if (vm_map_lookup_entry(map, start, &entry)) {
1824		vm_map_clip_start(map, entry, start);
1825	} else
1826		entry = entry->next;
1827
1828	vm_map_clip_end(map, entry, end);
1829
1830	if ((entry->start == start) && (entry->end == end) &&
1831	    ((entry->eflags & MAP_ENTRY_COW) == 0) &&
1832	    (entry->object.vm_object == NULL)) {
1833		entry->object.sub_map = submap;
1834		entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
1835		result = KERN_SUCCESS;
1836	}
1837	vm_map_unlock(map);
1838
1839	return (result);
1840}
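
/*
 * Example (illustrative sketch, not from the original source): a kernel
 * component that has already reserved [start, end) with vm_map_find() and
 * built "submap" separately could hand the range over as shown below.  The
 * wrapper name and its caller are assumptions made for illustration only.
 */
#if 0
static int
example_install_submap(vm_map_t map, vm_offset_t start, vm_offset_t end,
    vm_map_t submap)
{

	/* The range must be untouched since vm_map_find(); see above. */
	return (vm_map_submap(map, start, end, submap));
}
#endif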
1841
1842/*
1843 * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
1844 */
1845#define	MAX_INIT_PT	96
1846
1847/*
1848 *	vm_map_pmap_enter:
1849 *
1850 *	Preload the specified map's pmap with mappings to the specified
1851 *	object's memory-resident pages.  No further physical pages are
1852 *	allocated, and no further virtual pages are retrieved from secondary
1853 *	storage.  If the specified flags include MAP_PREFAULT_PARTIAL, then a
1854 *	limited number of page mappings are created at the low-end of the
1855 *	specified address range.  (For this purpose, a superpage mapping
1856 *	counts as one page mapping.)  Otherwise, all resident pages within
1857 *	the specified address range are mapped.  Because these mappings are
1858 *	being created speculatively, cached pages are not reactivated and
1859 *	mapped.
1860 */
1861static void
1862vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
1863    vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
1864{
1865	vm_offset_t start;
1866	vm_page_t p, p_start;
1867	vm_pindex_t mask, psize, threshold, tmpidx;
1868
1869	if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
1870		return;
1871	VM_OBJECT_RLOCK(object);
1872	if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
1873		VM_OBJECT_RUNLOCK(object);
1874		VM_OBJECT_WLOCK(object);
1875		if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
1876			pmap_object_init_pt(map->pmap, addr, object, pindex,
1877			    size);
1878			VM_OBJECT_WUNLOCK(object);
1879			return;
1880		}
1881		VM_OBJECT_LOCK_DOWNGRADE(object);
1882	}
1883
1884	psize = atop(size);
1885	if (psize + pindex > object->size) {
1886		if (object->size < pindex) {
1887			VM_OBJECT_RUNLOCK(object);
1888			return;
1889		}
1890		psize = object->size - pindex;
1891	}
1892
1893	start = 0;
1894	p_start = NULL;
1895	threshold = MAX_INIT_PT;
1896
1897	p = vm_page_find_least(object, pindex);
1898	/*
1899	 * Assert: the variable p is either (1) the page with the
1900	 * least pindex greater than or equal to the parameter pindex
1901	 * or (2) NULL.
1902	 */
1903	for (;
1904	     p != NULL && (tmpidx = p->pindex - pindex) < psize;
1905	     p = TAILQ_NEXT(p, listq)) {
1906		/*
1907		 * Don't let prefaulting on behalf of madvise consume the last
1908		 * truly free pages by allocating pv entries.
1909		 */
1910		if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
1911		    vm_cnt.v_free_count < vm_cnt.v_free_reserved) ||
1912		    ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
1913		    tmpidx >= threshold)) {
1914			psize = tmpidx;
1915			break;
1916		}
1917		if (p->valid == VM_PAGE_BITS_ALL) {
1918			if (p_start == NULL) {
1919				start = addr + ptoa(tmpidx);
1920				p_start = p;
1921			}
1922			/* Jump ahead if a superpage mapping is possible. */
1923			if (p->psind > 0 && ((addr + ptoa(tmpidx)) &
1924			    (pagesizes[p->psind] - 1)) == 0) {
1925				mask = atop(pagesizes[p->psind]) - 1;
1926				if (tmpidx + mask < psize &&
1927				    vm_page_ps_is_valid(p)) {
1928					p += mask;
1929					threshold += mask;
1930				}
1931			}
1932		} else if (p_start != NULL) {
1933			pmap_enter_object(map->pmap, start, addr +
1934			    ptoa(tmpidx), p_start, prot);
1935			p_start = NULL;
1936		}
1937	}
1938	if (p_start != NULL)
1939		pmap_enter_object(map->pmap, start, addr + ptoa(psize),
1940		    p_start, prot);
1941	VM_OBJECT_RUNLOCK(object);
1942}
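
/*
 * Example (illustrative sketch, not from the original source): the call
 * below mirrors the MAP_PREFAULT_PARTIAL case described above, preloading
 * at most MAX_INIT_PT page mappings at the low end of an entry.  The helper
 * name is an assumption; the map is assumed to be locked at least for read
 * so that *entry stays valid across the call.
 */
#if 0
static void
example_prefault_entry(vm_map_t map, vm_map_entry_t entry)
{

	vm_map_pmap_enter(map, entry->start, entry->protection,
	    entry->object.vm_object, OFF_TO_IDX(entry->offset),
	    entry->end - entry->start, MAP_PREFAULT_PARTIAL);
}
#endif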
1943
1944/*
1945 *	vm_map_protect:
1946 *
1947 *	Sets the protection of the specified address
1948 *	region in the target map.  If "set_max" is
1949 *	specified, the maximum protection is to be set;
1950 *	otherwise, only the current protection is affected.
1951 */
1952int
1953vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
1954	       vm_prot_t new_prot, boolean_t set_max)
1955{
1956	vm_map_entry_t current, entry;
1957	vm_object_t obj;
1958	struct ucred *cred;
1959	vm_prot_t old_prot;
1960
1961	if (start == end)
1962		return (KERN_SUCCESS);
1963
1964	vm_map_lock(map);
1965
1966	VM_MAP_RANGE_CHECK(map, start, end);
1967
1968	if (vm_map_lookup_entry(map, start, &entry)) {
1969		vm_map_clip_start(map, entry, start);
1970	} else {
1971		entry = entry->next;
1972	}
1973
1974	/*
1975	 * Make a first pass to check for protection violations.
1976	 */
1977	current = entry;
1978	while ((current != &map->header) && (current->start < end)) {
1979		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
1980			vm_map_unlock(map);
1981			return (KERN_INVALID_ARGUMENT);
1982		}
1983		if ((new_prot & current->max_protection) != new_prot) {
1984			vm_map_unlock(map);
1985			return (KERN_PROTECTION_FAILURE);
1986		}
1987		current = current->next;
1988	}
1989
1990
1991	/*
1992	 * Do an accounting pass for private read-only mappings that
1993	 * now will do cow due to allowed write (e.g. debugger sets
1994	 * breakpoint on text segment)
1995	 */
1996	for (current = entry; (current != &map->header) &&
1997	     (current->start < end); current = current->next) {
1998
1999		vm_map_clip_end(map, current, end);
2000
2001		if (set_max ||
2002		    ((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 ||
2003		    ENTRY_CHARGED(current)) {
2004			continue;
2005		}
2006
2007		cred = curthread->td_ucred;
2008		obj = current->object.vm_object;
2009
2010		if (obj == NULL || (current->eflags & MAP_ENTRY_NEEDS_COPY)) {
2011			if (!swap_reserve(current->end - current->start)) {
2012				vm_map_unlock(map);
2013				return (KERN_RESOURCE_SHORTAGE);
2014			}
2015			crhold(cred);
2016			current->cred = cred;
2017			continue;
2018		}
2019
2020		VM_OBJECT_WLOCK(obj);
2021		if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) {
2022			VM_OBJECT_WUNLOCK(obj);
2023			continue;
2024		}
2025
2026		/*
2027		 * Charge for the whole object allocation now, since
2028		 * we cannot distinguish between non-charged and
2029		 * charged clipped mapping of the same object later.
2030		 */
2031		KASSERT(obj->charge == 0,
2032		    ("vm_map_protect: object %p overcharged (entry %p)",
2033		    obj, current));
2034		if (!swap_reserve(ptoa(obj->size))) {
2035			VM_OBJECT_WUNLOCK(obj);
2036			vm_map_unlock(map);
2037			return (KERN_RESOURCE_SHORTAGE);
2038		}
2039
2040		crhold(cred);
2041		obj->cred = cred;
2042		obj->charge = ptoa(obj->size);
2043		VM_OBJECT_WUNLOCK(obj);
2044	}
2045
2046	/*
2047	 * Go back and fix up protections. [Note that clipping is not
2048	 * necessary the second time.]
2049	 */
2050	current = entry;
2051	while ((current != &map->header) && (current->start < end)) {
2052		old_prot = current->protection;
2053
2054		if (set_max)
2055			current->protection =
2056			    (current->max_protection = new_prot) &
2057			    old_prot;
2058		else
2059			current->protection = new_prot;
2060
2061		/*
2062		 * For user wired map entries, the normal lazy evaluation of
2063		 * write access upgrades through soft page faults is
2064		 * undesirable.  Instead, immediately copy any pages that are
2065		 * copy-on-write and enable write access in the physical map.
2066		 */
2067		if ((current->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
2068		    (current->protection & VM_PROT_WRITE) != 0 &&
2069		    (old_prot & VM_PROT_WRITE) == 0)
2070			vm_fault_copy_entry(map, map, current, current, NULL);
2071
2072		/*
2073		 * When restricting access, update the physical map.  Worry
2074		 * about copy-on-write here.
2075		 */
2076		if ((old_prot & ~current->protection) != 0) {
2077#define MASK(entry)	(((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2078							VM_PROT_ALL)
2079			pmap_protect(map->pmap, current->start,
2080			    current->end,
2081			    current->protection & MASK(current));
2082#undef	MASK
2083		}
2084		vm_map_simplify_entry(map, current);
2085		current = current->next;
2086	}
2087	vm_map_unlock(map);
2088	return (KERN_SUCCESS);
2089}
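
/*
 * Example (illustrative sketch, not from the original source): revoking
 * write access on a range while leaving the maximum protection untouched,
 * which is the set_max == FALSE case handled above.  The wrapper name is an
 * assumption made for illustration only.
 */
#if 0
static int
example_make_readonly(vm_map_t map, vm_offset_t start, vm_offset_t end)
{

	return (vm_map_protect(map, start, end, VM_PROT_READ, FALSE));
}
#endif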
2090
2091/*
2092 *	vm_map_madvise:
2093 *
2094 *	This routine traverses a process's map handling the madvise
2095 *	system call.  Advisories are classified as either those affecting
2096 *	the vm_map_entry structure or those affecting the underlying
2097 *	objects.
2098 */
2099int
2100vm_map_madvise(
2101	vm_map_t map,
2102	vm_offset_t start,
2103	vm_offset_t end,
2104	int behav)
2105{
2106	vm_map_entry_t current, entry;
2107	int modify_map = 0;
2108
2109	/*
2110	 * Some madvise calls directly modify the vm_map_entry, in which case
2111	 * we need to use an exclusive lock on the map and we need to perform
2112	 * various clipping operations.  Otherwise we only need a read-lock
2113	 * on the map.
2114	 */
2115	switch (behav) {
2116	case MADV_NORMAL:
2117	case MADV_SEQUENTIAL:
2118	case MADV_RANDOM:
2119	case MADV_NOSYNC:
2120	case MADV_AUTOSYNC:
2121	case MADV_NOCORE:
2122	case MADV_CORE:
2123		if (start == end)
2124			return (KERN_SUCCESS);
2125		modify_map = 1;
2126		vm_map_lock(map);
2127		break;
2128	case MADV_WILLNEED:
2129	case MADV_DONTNEED:
2130	case MADV_FREE:
2131		if (start == end)
2132			return (KERN_SUCCESS);
2133		vm_map_lock_read(map);
2134		break;
2135	default:
2136		return (KERN_INVALID_ARGUMENT);
2137	}
2138
2139	/*
2140	 * Locate starting entry and clip if necessary.
2141	 */
2142	VM_MAP_RANGE_CHECK(map, start, end);
2143
2144	if (vm_map_lookup_entry(map, start, &entry)) {
2145		if (modify_map)
2146			vm_map_clip_start(map, entry, start);
2147	} else {
2148		entry = entry->next;
2149	}
2150
2151	if (modify_map) {
2152		/*
2153		 * madvise behaviors that are implemented in the vm_map_entry.
2154		 *
2155		 * We clip the vm_map_entry so that behavioral changes are
2156		 * limited to the specified address range.
2157		 */
2158		for (current = entry;
2159		     (current != &map->header) && (current->start < end);
2160		     current = current->next
2161		) {
2162			if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
2163				continue;
2164
2165			vm_map_clip_end(map, current, end);
2166
2167			switch (behav) {
2168			case MADV_NORMAL:
2169				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2170				break;
2171			case MADV_SEQUENTIAL:
2172				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2173				break;
2174			case MADV_RANDOM:
2175				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2176				break;
2177			case MADV_NOSYNC:
2178				current->eflags |= MAP_ENTRY_NOSYNC;
2179				break;
2180			case MADV_AUTOSYNC:
2181				current->eflags &= ~MAP_ENTRY_NOSYNC;
2182				break;
2183			case MADV_NOCORE:
2184				current->eflags |= MAP_ENTRY_NOCOREDUMP;
2185				break;
2186			case MADV_CORE:
2187				current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2188				break;
2189			default:
2190				break;
2191			}
2192			vm_map_simplify_entry(map, current);
2193		}
2194		vm_map_unlock(map);
2195	} else {
2196		vm_pindex_t pstart, pend;
2197
2198		/*
2199		 * madvise behaviors that are implemented in the underlying
2200		 * vm_object.
2201		 *
2202		 * Since we don't clip the vm_map_entry, we have to clip
2203		 * the vm_object pindex and count.
2204		 */
2205		for (current = entry;
2206		     (current != &map->header) && (current->start < end);
2207		     current = current->next
2208		) {
2209			vm_offset_t useEnd, useStart;
2210
2211			if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
2212				continue;
2213
2214			pstart = OFF_TO_IDX(current->offset);
2215			pend = pstart + atop(current->end - current->start);
2216			useStart = current->start;
2217			useEnd = current->end;
2218
2219			if (current->start < start) {
2220				pstart += atop(start - current->start);
2221				useStart = start;
2222			}
2223			if (current->end > end) {
2224				pend -= atop(current->end - end);
2225				useEnd = end;
2226			}
2227
2228			if (pstart >= pend)
2229				continue;
2230
2231			/*
2232			 * Perform the pmap_advise() before clearing
2233			 * PGA_REFERENCED in vm_page_advise().  Otherwise, a
2234			 * concurrent pmap operation, such as pmap_remove(),
2235			 * could clear a reference in the pmap and set
2236			 * PGA_REFERENCED on the page before the pmap_advise()
2237			 * had completed.  Consequently, the page would appear
2238			 * referenced based upon an old reference that
2239			 * occurred before this pmap_advise() ran.
2240			 */
2241			if (behav == MADV_DONTNEED || behav == MADV_FREE)
2242				pmap_advise(map->pmap, useStart, useEnd,
2243				    behav);
2244
2245			vm_object_madvise(current->object.vm_object, pstart,
2246			    pend, behav);
2247
2248			/*
2249			 * Pre-populate paging structures in the
2250			 * WILLNEED case.  For wired entries, the
2251			 * paging structures are already populated.
2252			 */
2253			if (behav == MADV_WILLNEED &&
2254			    current->wired_count == 0) {
2255				vm_map_pmap_enter(map,
2256				    useStart,
2257				    current->protection,
2258				    current->object.vm_object,
2259				    pstart,
2260				    ptoa(pend - pstart),
2261				    MAP_PREFAULT_MADVISE
2262				);
2263			}
2264		}
2265		vm_map_unlock_read(map);
2266	}
2267	return (0);
2268}
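
/*
 * Example (illustrative sketch, not from the original source): advising
 * MADV_WILLNEED on a range takes the read-locked map path above and, for
 * unwired entries, pre-populates mappings via vm_map_pmap_enter().  The
 * wrapper name is an assumption made for illustration only.
 */
#if 0
static int
example_willneed(vm_map_t map, vm_offset_t start, vm_offset_t end)
{

	return (vm_map_madvise(map, start, end, MADV_WILLNEED));
}
#endif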
2269
2270
2271/*
2272 *	vm_map_inherit:
2273 *
2274 *	Sets the inheritance of the specified address
2275 *	range in the target map.  Inheritance
2276 *	affects how the map will be shared with
2277 *	child maps at the time of vmspace_fork.
2278 */
2279int
2280vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2281	       vm_inherit_t new_inheritance)
2282{
2283	vm_map_entry_t entry;
2284	vm_map_entry_t temp_entry;
2285
2286	switch (new_inheritance) {
2287	case VM_INHERIT_NONE:
2288	case VM_INHERIT_COPY:
2289	case VM_INHERIT_SHARE:
2290		break;
2291	default:
2292		return (KERN_INVALID_ARGUMENT);
2293	}
2294	if (start == end)
2295		return (KERN_SUCCESS);
2296	vm_map_lock(map);
2297	VM_MAP_RANGE_CHECK(map, start, end);
2298	if (vm_map_lookup_entry(map, start, &temp_entry)) {
2299		entry = temp_entry;
2300		vm_map_clip_start(map, entry, start);
2301	} else
2302		entry = temp_entry->next;
2303	while ((entry != &map->header) && (entry->start < end)) {
2304		vm_map_clip_end(map, entry, end);
2305		entry->inheritance = new_inheritance;
2306		vm_map_simplify_entry(map, entry);
2307		entry = entry->next;
2308	}
2309	vm_map_unlock(map);
2310	return (KERN_SUCCESS);
2311}
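
/*
 * Example (illustrative sketch, not from the original source): marking a
 * range VM_INHERIT_NONE makes vmspace_fork() below skip it entirely, so a
 * child process starts without that mapping.  The wrapper name is an
 * assumption made for illustration only.
 */
#if 0
static int
example_do_not_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end)
{

	return (vm_map_inherit(map, start, end, VM_INHERIT_NONE));
}
#endif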
2312
2313/*
2314 *	vm_map_unwire:
2315 *
2316 *	Implements both kernel and user unwiring.
2317 */
2318int
2319vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
2320    int flags)
2321{
2322	vm_map_entry_t entry, first_entry, tmp_entry;
2323	vm_offset_t saved_start;
2324	unsigned int last_timestamp;
2325	int rv;
2326	boolean_t need_wakeup, result, user_unwire;
2327
2328	if (start == end)
2329		return (KERN_SUCCESS);
2330	user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
2331	vm_map_lock(map);
2332	VM_MAP_RANGE_CHECK(map, start, end);
2333	if (!vm_map_lookup_entry(map, start, &first_entry)) {
2334		if (flags & VM_MAP_WIRE_HOLESOK)
2335			first_entry = first_entry->next;
2336		else {
2337			vm_map_unlock(map);
2338			return (KERN_INVALID_ADDRESS);
2339		}
2340	}
2341	last_timestamp = map->timestamp;
2342	entry = first_entry;
2343	while (entry != &map->header && entry->start < end) {
2344		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2345			/*
2346			 * We have not yet clipped the entry.
2347			 */
2348			saved_start = (start >= entry->start) ? start :
2349			    entry->start;
2350			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2351			if (vm_map_unlock_and_wait(map, 0)) {
2352				/*
2353				 * Allow interruption of user unwiring?
2354				 */
2355			}
2356			vm_map_lock(map);
2357			if (last_timestamp + 1 != map->timestamp) {
2358				/*
2359				 * Look again for the entry because the map was
2360				 * modified while it was unlocked.
2361				 * Specifically, the entry may have been
2362				 * clipped, merged, or deleted.
2363				 */
2364				if (!vm_map_lookup_entry(map, saved_start,
2365				    &tmp_entry)) {
2366					if (flags & VM_MAP_WIRE_HOLESOK)
2367						tmp_entry = tmp_entry->next;
2368					else {
2369						if (saved_start == start) {
2370							/*
2371							 * First_entry has been deleted.
2372							 */
2373							vm_map_unlock(map);
2374							return (KERN_INVALID_ADDRESS);
2375						}
2376						end = saved_start;
2377						rv = KERN_INVALID_ADDRESS;
2378						goto done;
2379					}
2380				}
2381				if (entry == first_entry)
2382					first_entry = tmp_entry;
2383				else
2384					first_entry = NULL;
2385				entry = tmp_entry;
2386			}
2387			last_timestamp = map->timestamp;
2388			continue;
2389		}
2390		vm_map_clip_start(map, entry, start);
2391		vm_map_clip_end(map, entry, end);
2392		/*
2393		 * Mark the entry in case the map lock is released.  (See
2394		 * above.)
2395		 */
2396		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
2397		    entry->wiring_thread == NULL,
2398		    ("owned map entry %p", entry));
2399		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
2400		entry->wiring_thread = curthread;
2401		/*
2402		 * Check the map for holes in the specified region.
2403		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
2404		 */
2405		if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
2406		    (entry->end < end && (entry->next == &map->header ||
2407		    entry->next->start > entry->end))) {
2408			end = entry->end;
2409			rv = KERN_INVALID_ADDRESS;
2410			goto done;
2411		}
2412		/*
2413		 * If system unwiring, require that the entry is system wired.
2414		 */
2415		if (!user_unwire &&
2416		    vm_map_entry_system_wired_count(entry) == 0) {
2417			end = entry->end;
2418			rv = KERN_INVALID_ARGUMENT;
2419			goto done;
2420		}
2421		entry = entry->next;
2422	}
2423	rv = KERN_SUCCESS;
2424done:
2425	need_wakeup = FALSE;
2426	if (first_entry == NULL) {
2427		result = vm_map_lookup_entry(map, start, &first_entry);
2428		if (!result && (flags & VM_MAP_WIRE_HOLESOK))
2429			first_entry = first_entry->next;
2430		else
2431			KASSERT(result, ("vm_map_unwire: lookup failed"));
2432	}
2433	for (entry = first_entry; entry != &map->header && entry->start < end;
2434	    entry = entry->next) {
2435		/*
2436		 * If VM_MAP_WIRE_HOLESOK was specified, an empty
2437		 * space in the unwired region could have been mapped
2438		 * while the map lock was dropped for draining
2439		 * MAP_ENTRY_IN_TRANSITION.  Moreover, another thread
2440		 * could be simultaneously wiring this new mapping
2441		 * entry.  Detect these cases and skip any entries
2442		 * marked as in transition by us.
2443		 */
2444		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
2445		    entry->wiring_thread != curthread) {
2446			KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0,
2447			    ("vm_map_unwire: !HOLESOK and new/changed entry"));
2448			continue;
2449		}
2450
2451		if (rv == KERN_SUCCESS && (!user_unwire ||
2452		    (entry->eflags & MAP_ENTRY_USER_WIRED))) {
2453			if (user_unwire)
2454				entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2455			if (entry->wired_count == 1)
2456				vm_map_entry_unwire(map, entry);
2457			else
2458				entry->wired_count--;
2459		}
2460		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
2461		    ("vm_map_unwire: in-transition flag missing %p", entry));
2462		KASSERT(entry->wiring_thread == curthread,
2463		    ("vm_map_unwire: alien wire %p", entry));
2464		entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
2465		entry->wiring_thread = NULL;
2466		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2467			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2468			need_wakeup = TRUE;
2469		}
2470		vm_map_simplify_entry(map, entry);
2471	}
2472	vm_map_unlock(map);
2473	if (need_wakeup)
2474		vm_map_wakeup(map);
2475	return (rv);
2476}
2477
2478/*
2479 *	vm_map_wire_entry_failure:
2480 *
2481 *	Handle a wiring failure on the given entry.
2482 *
2483 *	The map should be locked.
2484 */
2485static void
2486vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
2487    vm_offset_t failed_addr)
2488{
2489
2490	VM_MAP_ASSERT_LOCKED(map);
2491	KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
2492	    entry->wired_count == 1,
2493	    ("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
2494	KASSERT(failed_addr < entry->end,
2495	    ("vm_map_wire_entry_failure: entry %p was fully wired", entry));
2496
2497	/*
2498	 * If any pages at the start of this entry were successfully wired,
2499	 * then unwire them.
2500	 */
2501	if (failed_addr > entry->start) {
2502		pmap_unwire(map->pmap, entry->start, failed_addr);
2503		vm_object_unwire(entry->object.vm_object, entry->offset,
2504		    failed_addr - entry->start, PQ_ACTIVE);
2505	}
2506
2507	/*
2508	 * Assign an out-of-range value to represent the failure to wire this
2509	 * entry.
2510	 */
2511	entry->wired_count = -1;
2512}
2513
2514/*
2515 *	vm_map_wire:
2516 *
2517 *	Implements both kernel and user wiring.
2518 */
2519int
2520vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
2521    int flags)
2522{
2523	vm_map_entry_t entry, first_entry, tmp_entry;
2524	vm_offset_t faddr, saved_end, saved_start;
2525	unsigned int last_timestamp;
2526	int rv;
2527	boolean_t need_wakeup, result, user_wire;
2528	vm_prot_t prot;
2529
2530	if (start == end)
2531		return (KERN_SUCCESS);
2532	prot = 0;
2533	if (flags & VM_MAP_WIRE_WRITE)
2534		prot |= VM_PROT_WRITE;
2535	user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
2536	vm_map_lock(map);
2537	VM_MAP_RANGE_CHECK(map, start, end);
2538	if (!vm_map_lookup_entry(map, start, &first_entry)) {
2539		if (flags & VM_MAP_WIRE_HOLESOK)
2540			first_entry = first_entry->next;
2541		else {
2542			vm_map_unlock(map);
2543			return (KERN_INVALID_ADDRESS);
2544		}
2545	}
2546	last_timestamp = map->timestamp;
2547	entry = first_entry;
2548	while (entry != &map->header && entry->start < end) {
2549		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2550			/*
2551			 * We have not yet clipped the entry.
2552			 */
2553			saved_start = (start >= entry->start) ? start :
2554			    entry->start;
2555			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2556			if (vm_map_unlock_and_wait(map, 0)) {
2557				/*
2558				 * Allow interruption of user wiring?
2559				 */
2560			}
2561			vm_map_lock(map);
2562			if (last_timestamp + 1 != map->timestamp) {
2563				/*
2564				 * Look again for the entry because the map was
2565				 * modified while it was unlocked.
2566				 * Specifically, the entry may have been
2567				 * clipped, merged, or deleted.
2568				 */
2569				if (!vm_map_lookup_entry(map, saved_start,
2570				    &tmp_entry)) {
2571					if (flags & VM_MAP_WIRE_HOLESOK)
2572						tmp_entry = tmp_entry->next;
2573					else {
2574						if (saved_start == start) {
2575							/*
2576							 * first_entry has been deleted.
2577							 */
2578							vm_map_unlock(map);
2579							return (KERN_INVALID_ADDRESS);
2580						}
2581						end = saved_start;
2582						rv = KERN_INVALID_ADDRESS;
2583						goto done;
2584					}
2585				}
2586				if (entry == first_entry)
2587					first_entry = tmp_entry;
2588				else
2589					first_entry = NULL;
2590				entry = tmp_entry;
2591			}
2592			last_timestamp = map->timestamp;
2593			continue;
2594		}
2595		vm_map_clip_start(map, entry, start);
2596		vm_map_clip_end(map, entry, end);
2597		/*
2598		 * Mark the entry in case the map lock is released.  (See
2599		 * above.)
2600		 */
2601		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
2602		    entry->wiring_thread == NULL,
2603		    ("owned map entry %p", entry));
2604		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
2605		entry->wiring_thread = curthread;
2606		if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
2607		    || (entry->protection & prot) != prot) {
2608			entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
2609			if ((flags & VM_MAP_WIRE_HOLESOK) == 0) {
2610				end = entry->end;
2611				rv = KERN_INVALID_ADDRESS;
2612				goto done;
2613			}
2614			goto next_entry;
2615		}
2616		if (entry->wired_count == 0) {
2617			entry->wired_count++;
2618			saved_start = entry->start;
2619			saved_end = entry->end;
2620
2621			/*
2622			 * Release the map lock, relying on the in-transition
2623			 * mark.  Mark the map busy for fork.
2624			 */
2625			vm_map_busy(map);
2626			vm_map_unlock(map);
2627
2628			faddr = saved_start;
2629			do {
2630				/*
2631				 * Simulate a fault to get the page and enter
2632				 * it into the physical map.
2633				 */
2634				if ((rv = vm_fault(map, faddr, VM_PROT_NONE,
2635				    VM_FAULT_WIRE)) != KERN_SUCCESS)
2636					break;
2637			} while ((faddr += PAGE_SIZE) < saved_end);
2638			vm_map_lock(map);
2639			vm_map_unbusy(map);
2640			if (last_timestamp + 1 != map->timestamp) {
2641				/*
2642				 * Look again for the entry because the map was
2643				 * modified while it was unlocked.  The entry
2644				 * may have been clipped, but NOT merged or
2645				 * deleted.
2646				 */
2647				result = vm_map_lookup_entry(map, saved_start,
2648				    &tmp_entry);
2649				KASSERT(result, ("vm_map_wire: lookup failed"));
2650				if (entry == first_entry)
2651					first_entry = tmp_entry;
2652				else
2653					first_entry = NULL;
2654				entry = tmp_entry;
2655				while (entry->end < saved_end) {
2656					/*
2657					 * In case of failure, handle entries
2658					 * that were not fully wired here;
2659					 * fully wired entries are handled
2660					 * later.
2661					 */
2662					if (rv != KERN_SUCCESS &&
2663					    faddr < entry->end)
2664						vm_map_wire_entry_failure(map,
2665						    entry, faddr);
2666					entry = entry->next;
2667				}
2668			}
2669			last_timestamp = map->timestamp;
2670			if (rv != KERN_SUCCESS) {
2671				vm_map_wire_entry_failure(map, entry, faddr);
2672				end = entry->end;
2673				goto done;
2674			}
2675		} else if (!user_wire ||
2676			   (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2677			entry->wired_count++;
2678		}
2679		/*
2680		 * Check the map for holes in the specified region.
2681		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
2682		 */
2683	next_entry:
2684		if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
2685		    (entry->end < end && (entry->next == &map->header ||
2686		    entry->next->start > entry->end))) {
2687			end = entry->end;
2688			rv = KERN_INVALID_ADDRESS;
2689			goto done;
2690		}
2691		entry = entry->next;
2692	}
2693	rv = KERN_SUCCESS;
2694done:
2695	need_wakeup = FALSE;
2696	if (first_entry == NULL) {
2697		result = vm_map_lookup_entry(map, start, &first_entry);
2698		if (!result && (flags & VM_MAP_WIRE_HOLESOK))
2699			first_entry = first_entry->next;
2700		else
2701			KASSERT(result, ("vm_map_wire: lookup failed"));
2702	}
2703	for (entry = first_entry; entry != &map->header && entry->start < end;
2704	    entry = entry->next) {
2705		if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0)
2706			goto next_entry_done;
2707
2708		/*
2709		 * If VM_MAP_WIRE_HOLESOK was specified, an empty
2710		 * space in the unwired region could have been mapped
2711		 * while the map lock was dropped for faulting in the
2712		 * pages or draining MAP_ENTRY_IN_TRANSITION.
2713		 * Moreover, another thread could be simultaneously
2714		 * wiring this new mapping entry.  Detect these cases
2715		 * and skip any entries marked as in transition by us.
2716		 */
2717		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
2718		    entry->wiring_thread != curthread) {
2719			KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0,
2720			    ("vm_map_wire: !HOLESOK and new/changed entry"));
2721			continue;
2722		}
2723
2724		if (rv == KERN_SUCCESS) {
2725			if (user_wire)
2726				entry->eflags |= MAP_ENTRY_USER_WIRED;
2727		} else if (entry->wired_count == -1) {
2728			/*
2729			 * Wiring failed on this entry.  Thus, unwiring is
2730			 * unnecessary.
2731			 */
2732			entry->wired_count = 0;
2733		} else if (!user_wire ||
2734		    (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2735			/*
2736			 * Undo the wiring.  Wiring succeeded on this entry
2737			 * but failed on a later entry.
2738			 */
2739			if (entry->wired_count == 1)
2740				vm_map_entry_unwire(map, entry);
2741			else
2742				entry->wired_count--;
2743		}
2744	next_entry_done:
2745		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
2746		    ("vm_map_wire: in-transition flag missing %p", entry));
2747		KASSERT(entry->wiring_thread == curthread,
2748		    ("vm_map_wire: alien wire %p", entry));
2749		entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
2750		    MAP_ENTRY_WIRE_SKIPPED);
2751		entry->wiring_thread = NULL;
2752		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2753			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2754			need_wakeup = TRUE;
2755		}
2756		vm_map_simplify_entry(map, entry);
2757	}
2758	vm_map_unlock(map);
2759	if (need_wakeup)
2760		vm_map_wakeup(map);
2761	return (rv);
2762}
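
/*
 * Example (illustrative sketch, not from the original source): wiring and
 * later unwiring a user range with the flags handled above.  Passing
 * VM_MAP_WIRE_HOLESOK tolerates unmapped gaps in the range; the wrapper
 * names are assumptions made for illustration only.
 */
#if 0
static int
example_user_wire(vm_map_t map, vm_offset_t start, vm_offset_t end)
{

	return (vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_HOLESOK));
}

static int
example_user_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end)
{

	return (vm_map_unwire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_HOLESOK));
}
#endif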
2763
2764/*
2765 * vm_map_sync
2766 *
2767 * Push any dirty cached pages in the address range to their pager.
2768 * If syncio is TRUE, dirty pages are written synchronously.
2769 * If invalidate is TRUE, any cached pages are freed as well.
2770 *
2771 * If the size of the region from start to end is zero, we are
2772 * supposed to flush all modified pages within the region containing
2773 * start.  Unfortunately, a region can be split or coalesced with
2774 * neighboring regions, making it difficult to determine what the
2775 * original region was.  Therefore, we approximate this requirement by
2776 * flushing the current region containing start.
2777 *
2778 * Returns an error if any part of the specified range is not mapped.
2779 */
2780int
2781vm_map_sync(
2782	vm_map_t map,
2783	vm_offset_t start,
2784	vm_offset_t end,
2785	boolean_t syncio,
2786	boolean_t invalidate)
2787{
2788	vm_map_entry_t current;
2789	vm_map_entry_t entry;
2790	vm_size_t size;
2791	vm_object_t object;
2792	vm_ooffset_t offset;
2793	unsigned int last_timestamp;
2794	boolean_t failed;
2795
2796	vm_map_lock_read(map);
2797	VM_MAP_RANGE_CHECK(map, start, end);
2798	if (!vm_map_lookup_entry(map, start, &entry)) {
2799		vm_map_unlock_read(map);
2800		return (KERN_INVALID_ADDRESS);
2801	} else if (start == end) {
2802		start = entry->start;
2803		end = entry->end;
2804	}
2805	/*
2806	 * Make a first pass to check for user-wired memory and holes.
2807	 */
2808	for (current = entry; current != &map->header && current->start < end;
2809	    current = current->next) {
2810		if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) {
2811			vm_map_unlock_read(map);
2812			return (KERN_INVALID_ARGUMENT);
2813		}
2814		if (end > current->end &&
2815		    (current->next == &map->header ||
2816			current->end != current->next->start)) {
2817			vm_map_unlock_read(map);
2818			return (KERN_INVALID_ADDRESS);
2819		}
2820	}
2821
2822	if (invalidate)
2823		pmap_remove(map->pmap, start, end);
2824	failed = FALSE;
2825
2826	/*
2827	 * Make a second pass, cleaning/uncaching pages from the indicated
2828	 * objects as we go.
2829	 */
2830	for (current = entry; current != &map->header && current->start < end;) {
2831		offset = current->offset + (start - current->start);
2832		size = (end <= current->end ? end : current->end) - start;
2833		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
2834			vm_map_t smap;
2835			vm_map_entry_t tentry;
2836			vm_size_t tsize;
2837
2838			smap = current->object.sub_map;
2839			vm_map_lock_read(smap);
2840			(void) vm_map_lookup_entry(smap, offset, &tentry);
2841			tsize = tentry->end - offset;
2842			if (tsize < size)
2843				size = tsize;
2844			object = tentry->object.vm_object;
2845			offset = tentry->offset + (offset - tentry->start);
2846			vm_map_unlock_read(smap);
2847		} else {
2848			object = current->object.vm_object;
2849		}
2850		vm_object_reference(object);
2851		last_timestamp = map->timestamp;
2852		vm_map_unlock_read(map);
2853		if (!vm_object_sync(object, offset, size, syncio, invalidate))
2854			failed = TRUE;
2855		start += size;
2856		vm_object_deallocate(object);
2857		vm_map_lock_read(map);
2858		if (last_timestamp == map->timestamp ||
2859		    !vm_map_lookup_entry(map, start, &current))
2860			current = current->next;
2861	}
2862
2863	vm_map_unlock_read(map);
2864	return (failed ? KERN_FAILURE : KERN_SUCCESS);
2865}
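
/*
 * Example (illustrative sketch, not from the original source): an
 * msync(2)-style flush of a range, writing dirty pages synchronously
 * without invalidating cached pages.  The wrapper name is an assumption
 * made for illustration only.
 */
#if 0
static int
example_flush_range(vm_map_t map, vm_offset_t start, vm_offset_t end)
{

	return (vm_map_sync(map, start, end, TRUE, FALSE));
}
#endif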
2866
2867/*
2868 *	vm_map_entry_unwire:	[ internal use only ]
2869 *
2870 *	Make the region specified by this entry pageable.
2871 *
2872 *	The map in question should be locked.
2873 *	[This is the reason for this routine's existence.]
2874 */
2875static void
2876vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
2877{
2878
2879	VM_MAP_ASSERT_LOCKED(map);
2880	KASSERT(entry->wired_count > 0,
2881	    ("vm_map_entry_unwire: entry %p isn't wired", entry));
2882	pmap_unwire(map->pmap, entry->start, entry->end);
2883	vm_object_unwire(entry->object.vm_object, entry->offset, entry->end -
2884	    entry->start, PQ_ACTIVE);
2885	entry->wired_count = 0;
2886}
2887
2888static void
2889vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
2890{
2891
2892	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
2893		vm_object_deallocate(entry->object.vm_object);
2894	uma_zfree(system_map ? kmapentzone : mapentzone, entry);
2895}
2896
2897/*
2898 *	vm_map_entry_delete:	[ internal use only ]
2899 *
2900 *	Deallocate the given entry from the target map.
2901 */
2902static void
2903vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
2904{
2905	vm_object_t object;
2906	vm_pindex_t offidxstart, offidxend, count, size1;
2907	vm_size_t size;
2908
2909	vm_map_entry_unlink(map, entry);
2910	object = entry->object.vm_object;
2911	size = entry->end - entry->start;
2912	map->size -= size;
2913
2914	if (entry->cred != NULL) {
2915		swap_release_by_cred(size, entry->cred);
2916		crfree(entry->cred);
2917	}
2918
2919	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
2920	    (object != NULL)) {
2921		KASSERT(entry->cred == NULL || object->cred == NULL ||
2922		    (entry->eflags & MAP_ENTRY_NEEDS_COPY),
2923		    ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
2924		count = atop(size);
2925		offidxstart = OFF_TO_IDX(entry->offset);
2926		offidxend = offidxstart + count;
2927		VM_OBJECT_WLOCK(object);
2928		if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT |
2929		    OBJ_ONEMAPPING)) == OBJ_ONEMAPPING ||
2930		    object == kernel_object || object == kmem_object)) {
2931			vm_object_collapse(object);
2932
2933			/*
2934			 * The option OBJPR_NOTMAPPED can be passed here
2935			 * because vm_map_delete() already performed
2936			 * pmap_remove() on the only mapping to this range
2937			 * of pages.
2938			 */
2939			vm_object_page_remove(object, offidxstart, offidxend,
2940			    OBJPR_NOTMAPPED);
2941			if (object->type == OBJT_SWAP)
2942				swap_pager_freespace(object, offidxstart,
2943				    count);
2944			if (offidxend >= object->size &&
2945			    offidxstart < object->size) {
2946				size1 = object->size;
2947				object->size = offidxstart;
2948				if (object->cred != NULL) {
2949					size1 -= object->size;
2950					KASSERT(object->charge >= ptoa(size1),
2951					    ("object %p charge < 0", object));
2952					swap_release_by_cred(ptoa(size1),
2953					    object->cred);
2954					object->charge -= ptoa(size1);
2955				}
2956			}
2957		}
2958		VM_OBJECT_WUNLOCK(object);
2959	} else
2960		entry->object.vm_object = NULL;
2961	if (map->system_map)
2962		vm_map_entry_deallocate(entry, TRUE);
2963	else {
2964		entry->next = curthread->td_map_def_user;
2965		curthread->td_map_def_user = entry;
2966	}
2967}
2968
2969/*
2970 *	vm_map_delete:	[ internal use only ]
2971 *
2972 *	Deallocates the given address range from the target
2973 *	map.
2974 */
2975int
2976vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
2977{
2978	vm_map_entry_t entry;
2979	vm_map_entry_t first_entry;
2980
2981	VM_MAP_ASSERT_LOCKED(map);
2982	if (start == end)
2983		return (KERN_SUCCESS);
2984
2985	/*
2986	 * Find the start of the region, and clip it
2987	 */
2988	if (!vm_map_lookup_entry(map, start, &first_entry))
2989		entry = first_entry->next;
2990	else {
2991		entry = first_entry;
2992		vm_map_clip_start(map, entry, start);
2993	}
2994
2995	/*
2996	 * Step through all entries in this region
2997	 */
2998	while ((entry != &map->header) && (entry->start < end)) {
2999		vm_map_entry_t next;
3000
3001		/*
3002		 * Wait for wiring or unwiring of an entry to complete.
3003		 * Also wait for any system wirings to disappear on
3004		 * user maps.
3005		 */
3006		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
3007		    (vm_map_pmap(map) != kernel_pmap &&
3008		    vm_map_entry_system_wired_count(entry) != 0)) {
3009			unsigned int last_timestamp;
3010			vm_offset_t saved_start;
3011			vm_map_entry_t tmp_entry;
3012
3013			saved_start = entry->start;
3014			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3015			last_timestamp = map->timestamp;
3016			(void) vm_map_unlock_and_wait(map, 0);
3017			vm_map_lock(map);
3018			if (last_timestamp + 1 != map->timestamp) {
3019				/*
3020				 * Look again for the entry because the map was
3021				 * modified while it was unlocked.
3022				 * Specifically, the entry may have been
3023				 * clipped, merged, or deleted.
3024				 */
3025				if (!vm_map_lookup_entry(map, saved_start,
3026							 &tmp_entry))
3027					entry = tmp_entry->next;
3028				else {
3029					entry = tmp_entry;
3030					vm_map_clip_start(map, entry,
3031							  saved_start);
3032				}
3033			}
3034			continue;
3035		}
3036		vm_map_clip_end(map, entry, end);
3037
3038		next = entry->next;
3039
3040		/*
3041		 * Unwire before removing addresses from the pmap; otherwise,
3042		 * unwiring will put the entries back in the pmap.
3043		 */
3044		if (entry->wired_count != 0) {
3045			vm_map_entry_unwire(map, entry);
3046		}
3047
3048		pmap_remove(map->pmap, entry->start, entry->end);
3049
3050		/*
3051		 * Delete the entry only after removing all pmap
3052		 * entries pointing to its pages.  (Otherwise, its
3053		 * page frames may be reallocated, and any modify bits
3054		 * will be set in the wrong object!)
3055		 */
3056		vm_map_entry_delete(map, entry);
3057		entry = next;
3058	}
3059	return (KERN_SUCCESS);
3060}
3061
3062/*
3063 *	vm_map_remove:
3064 *
3065 *	Remove the given address range from the target map.
3066 *	This is the exported form of vm_map_delete.
3067 */
3068int
3069vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
3070{
3071	int result;
3072
3073	vm_map_lock(map);
3074	VM_MAP_RANGE_CHECK(map, start, end);
3075	result = vm_map_delete(map, start, end);
3076	vm_map_unlock(map);
3077	return (result);
3078}
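
/*
 * Example (illustrative sketch, not from the original source): the two
 * routines above differ only in locking; a caller that already holds the
 * map lock uses vm_map_delete(), while an unlocked caller uses
 * vm_map_remove() as below.  The wrapper name is an assumption made for
 * illustration only.
 */
#if 0
static int
example_remove_range(vm_map_t map, vm_offset_t start, vm_offset_t end)
{

	/* Takes and drops the map lock internally. */
	return (vm_map_remove(map, start, end));
}
#endif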
3079
3080/*
3081 *	vm_map_check_protection:
3082 *
3083 *	Assert that the target map allows the specified privilege on the
3084 *	entire address region given.  The entire region must be allocated.
3085 *
3086 *	WARNING!  This code does not and should not check whether the
3087 *	contents of the region is accessible.  For example a smaller file
3088 *	contents of the region are accessible.  For example, a smaller file
3089 *
3090 *	NOTE!  This code is also called by munmap().
3091 *
3092 *	The map must be locked.  A read lock is sufficient.
3093 */
3094boolean_t
3095vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
3096			vm_prot_t protection)
3097{
3098	vm_map_entry_t entry;
3099	vm_map_entry_t tmp_entry;
3100
3101	if (!vm_map_lookup_entry(map, start, &tmp_entry))
3102		return (FALSE);
3103	entry = tmp_entry;
3104
3105	while (start < end) {
3106		if (entry == &map->header)
3107			return (FALSE);
3108		/*
3109		 * No holes allowed!
3110		 */
3111		if (start < entry->start)
3112			return (FALSE);
3113		/*
3114		 * Check protection associated with entry.
3115		 */
3116		if ((entry->protection & protection) != protection)
3117			return (FALSE);
3118		/* go to next entry */
3119		start = entry->end;
3120		entry = entry->next;
3121	}
3122	return (TRUE);
3123}
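
/*
 * Example (illustrative sketch, not from the original source): checking
 * read permission on a range under a read lock, which the header comment
 * above states is sufficient.  The wrapper name is an assumption made for
 * illustration only.
 */
#if 0
static boolean_t
example_range_is_readable(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	boolean_t ok;

	vm_map_lock_read(map);
	ok = vm_map_check_protection(map, start, end, VM_PROT_READ);
	vm_map_unlock_read(map);
	return (ok);
}
#endif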
3124
3125/*
3126 *	vm_map_copy_entry:
3127 *
3128 *	Copies the contents of the source entry to the destination
3129 *	entry.  The entries *must* be aligned properly.
3130 */
3131static void
3132vm_map_copy_entry(
3133	vm_map_t src_map,
3134	vm_map_t dst_map,
3135	vm_map_entry_t src_entry,
3136	vm_map_entry_t dst_entry,
3137	vm_ooffset_t *fork_charge)
3138{
3139	vm_object_t src_object;
3140	vm_map_entry_t fake_entry;
3141	vm_offset_t size;
3142	struct ucred *cred;
3143	int charged;
3144
3145	VM_MAP_ASSERT_LOCKED(dst_map);
3146
3147	if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
3148		return;
3149
3150	if (src_entry->wired_count == 0 ||
3151	    (src_entry->protection & VM_PROT_WRITE) == 0) {
3152		/*
3153		 * If the source entry is marked needs_copy, it is already
3154		 * write-protected.
3155		 */
3156		if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
3157		    (src_entry->protection & VM_PROT_WRITE) != 0) {
3158			pmap_protect(src_map->pmap,
3159			    src_entry->start,
3160			    src_entry->end,
3161			    src_entry->protection & ~VM_PROT_WRITE);
3162		}
3163
3164		/*
3165		 * Make a copy of the object.
3166		 */
3167		size = src_entry->end - src_entry->start;
3168		if ((src_object = src_entry->object.vm_object) != NULL) {
3169			VM_OBJECT_WLOCK(src_object);
3170			charged = ENTRY_CHARGED(src_entry);
3171			if (src_object->handle == NULL &&
3172			    (src_object->type == OBJT_DEFAULT ||
3173			    src_object->type == OBJT_SWAP)) {
3174				vm_object_collapse(src_object);
3175				if ((src_object->flags & (OBJ_NOSPLIT |
3176				    OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
3177					vm_object_split(src_entry);
3178					src_object =
3179					    src_entry->object.vm_object;
3180				}
3181			}
3182			vm_object_reference_locked(src_object);
3183			vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
3184			if (src_entry->cred != NULL &&
3185			    !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
3186				KASSERT(src_object->cred == NULL,
3187				    ("OVERCOMMIT: vm_map_copy_entry: cred %p",
3188				     src_object));
3189				src_object->cred = src_entry->cred;
3190				src_object->charge = size;
3191			}
3192			VM_OBJECT_WUNLOCK(src_object);
3193			dst_entry->object.vm_object = src_object;
3194			if (charged) {
3195				cred = curthread->td_ucred;
3196				crhold(cred);
3197				dst_entry->cred = cred;
3198				*fork_charge += size;
3199				if (!(src_entry->eflags &
3200				      MAP_ENTRY_NEEDS_COPY)) {
3201					crhold(cred);
3202					src_entry->cred = cred;
3203					*fork_charge += size;
3204				}
3205			}
3206			src_entry->eflags |= MAP_ENTRY_COW |
3207			    MAP_ENTRY_NEEDS_COPY;
3208			dst_entry->eflags |= MAP_ENTRY_COW |
3209			    MAP_ENTRY_NEEDS_COPY;
3210			dst_entry->offset = src_entry->offset;
3211			if (src_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
3212				/*
3213				 * MAP_ENTRY_VN_WRITECNT cannot
3214				 * indicate write reference from
3215				 * src_entry, since the entry is
3216				 * marked as needs copy.  Allocate a
3217				 * fake entry that is used to
3218				 * decrement object->un_pager.vnp.writecount
3219				 * at the appropriate time.  Attach
3220				 * fake_entry to the deferred list.
3221				 */
3222				fake_entry = vm_map_entry_create(dst_map);
3223				fake_entry->eflags = MAP_ENTRY_VN_WRITECNT;
3224				src_entry->eflags &= ~MAP_ENTRY_VN_WRITECNT;
3225				vm_object_reference(src_object);
3226				fake_entry->object.vm_object = src_object;
3227				fake_entry->start = src_entry->start;
3228				fake_entry->end = src_entry->end;
3229				fake_entry->next = curthread->td_map_def_user;
3230				curthread->td_map_def_user = fake_entry;
3231			}
3232		} else {
3233			dst_entry->object.vm_object = NULL;
3234			dst_entry->offset = 0;
3235			if (src_entry->cred != NULL) {
3236				dst_entry->cred = curthread->td_ucred;
3237				crhold(dst_entry->cred);
3238				*fork_charge += size;
3239			}
3240		}
3241
3242		pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
3243		    dst_entry->end - dst_entry->start, src_entry->start);
3244	} else {
3245		/*
3246		 * We don't want to make writeable wired pages copy-on-write.
3247		 * Immediately copy these pages into the new map by simulating
3248		 * page faults.  The new pages are pageable.
3249		 */
3250		vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
3251		    fork_charge);
3252	}
3253}
3254
3255/*
3256 * vmspace_map_entry_forked:
3257 * Update the newly-forked vmspace each time a map entry is inherited
3258 * or copied.  The values for vm_dsize and vm_tsize are approximate
3259 * (and mostly-obsolete ideas in the face of mmap(2) et al.)
3260 */
3261static void
3262vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
3263    vm_map_entry_t entry)
3264{
3265	vm_size_t entrysize;
3266	vm_offset_t newend;
3267
3268	entrysize = entry->end - entry->start;
3269	vm2->vm_map.size += entrysize;
3270	if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
3271		vm2->vm_ssize += btoc(entrysize);
3272	} else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
3273	    entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
3274		newend = MIN(entry->end,
3275		    (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
3276		vm2->vm_dsize += btoc(newend - entry->start);
3277	} else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
3278	    entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
3279		newend = MIN(entry->end,
3280		    (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
3281		vm2->vm_tsize += btoc(newend - entry->start);
3282	}
3283}
3284
3285/*
3286 * vmspace_fork:
3287 * Create a new process vmspace structure and vm_map
3288 * based on those of an existing process.  The new map
3289 * is based on the old map, according to the inheritance
3290 * values on the regions in that map.
3291 *
3292 * XXX It might be worth coalescing the entries added to the new vmspace.
3293 *
3294 * The source map must not be locked.
3295 */
3296struct vmspace *
3297vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
3298{
3299	struct vmspace *vm2;
3300	vm_map_t new_map, old_map;
3301	vm_map_entry_t new_entry, old_entry;
3302	vm_object_t object;
3303	int locked;
3304
3305	old_map = &vm1->vm_map;
3306	/* Copy immutable fields of vm1 to vm2. */
3307	vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset, NULL);
3308	if (vm2 == NULL)
3309		return (NULL);
3310	vm2->vm_taddr = vm1->vm_taddr;
3311	vm2->vm_daddr = vm1->vm_daddr;
3312	vm2->vm_maxsaddr = vm1->vm_maxsaddr;
3313	vm_map_lock(old_map);
3314	if (old_map->busy)
3315		vm_map_wait_busy(old_map);
3316	new_map = &vm2->vm_map;
3317	locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
3318	KASSERT(locked, ("vmspace_fork: lock failed"));
3319
3320	old_entry = old_map->header.next;
3321
3322	while (old_entry != &old_map->header) {
3323		if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
3324			panic("vm_map_fork: encountered a submap");
3325
3326		switch (old_entry->inheritance) {
3327		case VM_INHERIT_NONE:
3328			break;
3329
3330		case VM_INHERIT_SHARE:
3331			/*
3332			 * Clone the entry, creating the shared object if necessary.
3333			 */
3334			object = old_entry->object.vm_object;
3335			if (object == NULL) {
3336				object = vm_object_allocate(OBJT_DEFAULT,
3337					atop(old_entry->end - old_entry->start));
3338				old_entry->object.vm_object = object;
3339				old_entry->offset = 0;
3340				if (old_entry->cred != NULL) {
3341					object->cred = old_entry->cred;
3342					object->charge = old_entry->end -
3343					    old_entry->start;
3344					old_entry->cred = NULL;
3345				}
3346			}
3347
3348			/*
3349			 * Add the reference before calling vm_object_shadow
3350			 * to ensure that a shadow object is created.
3351			 */
3352			vm_object_reference(object);
3353			if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3354				vm_object_shadow(&old_entry->object.vm_object,
3355				    &old_entry->offset,
3356				    old_entry->end - old_entry->start);
3357				old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
3358				/* Transfer the second reference too. */
3359				vm_object_reference(
3360				    old_entry->object.vm_object);
3361
3362				/*
3363				 * As in vm_map_simplify_entry(), the
3364				 * vnode lock will not be acquired in
3365				 * this call to vm_object_deallocate().
3366				 */
3367				vm_object_deallocate(object);
3368				object = old_entry->object.vm_object;
3369			}
3370			VM_OBJECT_WLOCK(object);
3371			vm_object_clear_flag(object, OBJ_ONEMAPPING);
3372			if (old_entry->cred != NULL) {
3373				KASSERT(object->cred == NULL, ("vmspace_fork both cred"));
3374				object->cred = old_entry->cred;
3375				object->charge = old_entry->end - old_entry->start;
3376				old_entry->cred = NULL;
3377			}
3378
3379			/*
3380			 * Assert the correct state of the vnode
3381			 * v_writecount while the object is locked, to
3382			 * not relock it later for the assertion
3383			 * correctness.
3384			 */
3385			if (old_entry->eflags & MAP_ENTRY_VN_WRITECNT &&
3386			    object->type == OBJT_VNODE) {
3387				KASSERT(((struct vnode *)object->handle)->
3388				    v_writecount > 0,
3389				    ("vmspace_fork: v_writecount %p", object));
3390				KASSERT(object->un_pager.vnp.writemappings > 0,
3391				    ("vmspace_fork: vnp.writecount %p",
3392				    object));
3393			}
3394			VM_OBJECT_WUNLOCK(object);
3395
3396			/*
3397			 * Clone the entry, referencing the shared object.
3398			 */
3399			new_entry = vm_map_entry_create(new_map);
3400			*new_entry = *old_entry;
3401			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
3402			    MAP_ENTRY_IN_TRANSITION);
3403			new_entry->wiring_thread = NULL;
3404			new_entry->wired_count = 0;
3405			if (new_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
3406				vnode_pager_update_writecount(object,
3407				    new_entry->start, new_entry->end);
3408			}
3409
3410			/*
3411			 * Insert the entry into the new map -- we know we're
3412			 * inserting at the end of the new map.
3413			 */
3414			vm_map_entry_link(new_map, new_map->header.prev,
3415			    new_entry);
3416			vmspace_map_entry_forked(vm1, vm2, new_entry);
3417
3418			/*
3419			 * Update the physical map
3420			 */
3421			pmap_copy(new_map->pmap, old_map->pmap,
3422			    new_entry->start,
3423			    (old_entry->end - old_entry->start),
3424			    old_entry->start);
3425			break;
3426
3427		case VM_INHERIT_COPY:
3428			/*
3429			 * Clone the entry and link into the map.
3430			 */
3431			new_entry = vm_map_entry_create(new_map);
3432			*new_entry = *old_entry;
3433			/*
3434			 * Copied entry is COW over the old object.
3435			 */
3436			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
3437			    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT);
3438			new_entry->wiring_thread = NULL;
3439			new_entry->wired_count = 0;
3440			new_entry->object.vm_object = NULL;
3441			new_entry->cred = NULL;
3442			vm_map_entry_link(new_map, new_map->header.prev,
3443			    new_entry);
3444			vmspace_map_entry_forked(vm1, vm2, new_entry);
3445			vm_map_copy_entry(old_map, new_map, old_entry,
3446			    new_entry, fork_charge);
3447			break;
3448		}
3449		old_entry = old_entry->next;
3450	}
3451	/*
3452	 * Use inlined vm_map_unlock() to postpone handling the deferred
3453	 * map entries, which cannot be done until both old_map and
3454	 * new_map locks are released.
3455	 */
3456	sx_xunlock(&old_map->lock);
3457	sx_xunlock(&new_map->lock);
3458	vm_map_process_deferred();
3459
3460	return (vm2);
3461}
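
/*
 * Example (illustrative sketch, not from the original source): a
 * fork(2)-time caller passes the parent's unlocked vmspace and charges the
 * returned swap reservation to the child.  The wrapper and variable names
 * are assumptions made for illustration only.
 */
#if 0
static struct vmspace *
example_fork_vmspace(struct vmspace *parent_vm)
{
	struct vmspace *child_vm;
	vm_ooffset_t fork_charge;

	fork_charge = 0;
	child_vm = vmspace_fork(parent_vm, &fork_charge);
	/* On success, fork_charge holds the swap charged to the child. */
	return (child_vm);
}
#endif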
3462
3463int
3464vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3465    vm_prot_t prot, vm_prot_t max, int cow)
3466{
3467	vm_size_t growsize, init_ssize;
3468	rlim_t lmemlim, vmemlim;
3469	int rv;
3470
3471	growsize = sgrowsiz;
3472	init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
3473	vm_map_lock(map);
3474	lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
3475	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
3476	if (!old_mlock && map->flags & MAP_WIREFUTURE) {
3477		if (ptoa(pmap_wired_count(map->pmap)) + init_ssize > lmemlim) {
3478			rv = KERN_NO_SPACE;
3479			goto out;
3480		}
3481	}
3482	/* If we would blow our VMEM resource limit, no go */
3483	if (map->size + init_ssize > vmemlim) {
3484		rv = KERN_NO_SPACE;
3485		goto out;
3486	}
3487	rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
3488	    max, cow);
3489out:
3490	vm_map_unlock(map);
3491	return (rv);
3492}
3493
3494static int
3495vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3496    vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
3497{
3498	vm_map_entry_t new_entry, prev_entry;
3499	vm_offset_t bot, top;
3500	vm_size_t init_ssize;
3501	int orient, rv;
3502
3503	/*
3504	 * The stack orientation is piggybacked with the cow argument.
3505	 * Extract it into orient and mask the cow argument so that we
3506	 * don't pass it around further.
3507	 * NOTE: We explicitly allow bi-directional stacks.
3508	 */
3509	orient = cow & (MAP_STACK_GROWS_DOWN|MAP_STACK_GROWS_UP);
3510	KASSERT(orient != 0, ("No stack grow direction"));
3511
3512	if (addrbos < vm_map_min(map) ||
3513	    addrbos > vm_map_max(map) ||
3514	    addrbos + max_ssize < addrbos)
3515		return (KERN_NO_SPACE);
3516
3517	init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
3518
3519	/* If addr is already mapped, no go */
3520	if (vm_map_lookup_entry(map, addrbos, &prev_entry))
3521		return (KERN_NO_SPACE);
3522
3523	/*
3524	 * If we can't accommodate max_ssize in the current mapping, no go.
3525	 * However, we need to be aware that subsequent user mappings might
3526	 * map into the space we have reserved for stack, and currently this
3527	 * space is not protected.
3528	 *
3529	 * Hopefully we will at least detect this condition when we try to
3530	 * grow the stack.
3531	 */
3532	if ((prev_entry->next != &map->header) &&
3533	    (prev_entry->next->start < addrbos + max_ssize))
3534		return (KERN_NO_SPACE);
3535
3536	/*
3537	 * We initially map a stack of only init_ssize.  We will grow as
3538	 * needed later.  Depending on the orientation of the stack (i.e.
3539	 * the grow direction) we either map at the top of the range, the
3540	 * bottom of the range or in the middle.
3541	 *
3542	 * Note: we would normally expect prot and max to be VM_PROT_ALL,
3543	 * and cow to be 0.  Possibly we should eliminate these as input
3544	 * parameters, and just pass these values here in the insert call.
3545	 */
3546	if (orient == MAP_STACK_GROWS_DOWN)
3547		bot = addrbos + max_ssize - init_ssize;
3548	else if (orient == MAP_STACK_GROWS_UP)
3549		bot = addrbos;
3550	else
3551		bot = round_page(addrbos + max_ssize/2 - init_ssize/2);
3552	top = bot + init_ssize;
3553	rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
3554
3555	/* Now set the avail_ssize amount. */
3556	if (rv == KERN_SUCCESS) {
3557		new_entry = prev_entry->next;
3558		if (new_entry->end != top || new_entry->start != bot)
3559			panic("Bad entry start/end for new stack entry");
3560
3561		new_entry->avail_ssize = max_ssize - init_ssize;
3562		KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 ||
3563		    (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
3564		    ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
3565		KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
3566		    (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
3567		    ("new entry lacks MAP_ENTRY_GROWS_UP"));
3568	}
3569
3570	return (rv);
3571}
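
/*
 * Example (illustrative sketch, not from the original source): reserving
 * [addrbos, addrbos + max_ssize) for a downward-growing stack.  Only the
 * initial sgrowsiz-sized piece is mapped; the rest is grown on demand by
 * vm_map_growstack() below.  The wrapper name is an assumption made for
 * illustration only.
 */
#if 0
static int
example_create_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize)
{

	return (vm_map_stack(map, addrbos, max_ssize, VM_PROT_ALL, VM_PROT_ALL,
	    MAP_STACK_GROWS_DOWN));
}
#endif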
3572
3573static int stack_guard_page = 0;
3574SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
3575    &stack_guard_page, 0,
3576    "Insert stack guard page ahead of the growable segments.");
3577
3578/*
3579 * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the desired
3580 * address is already mapped, or if we successfully grow the stack.  Also
3581 * returns KERN_SUCCESS if addr is outside the stack range (this is strange,
3582 * but preserves compatibility with the grow function in vm_machdep.c).
3583 */
3584int
3585vm_map_growstack(struct proc *p, vm_offset_t addr)
3586{
3587	vm_map_entry_t next_entry, prev_entry;
3588	vm_map_entry_t new_entry, stack_entry;
3589	struct vmspace *vm = p->p_vmspace;
3590	vm_map_t map = &vm->vm_map;
3591	vm_offset_t end;
3592	vm_size_t growsize;
3593	size_t grow_amount, max_grow;
3594	rlim_t lmemlim, stacklim, vmemlim;
3595	int is_procstack, rv;
3596	struct ucred *cred;
3597#ifdef notyet
3598	uint64_t limit;
3599#endif
3600#ifdef RACCT
3601	int error;
3602#endif
3603
3604	lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
3605	stacklim = lim_cur(curthread, RLIMIT_STACK);
3606	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
3607Retry:
3608
3609	vm_map_lock_read(map);
3610
3611	/* If addr is already in the entry range, no need to grow. */
3612	if (vm_map_lookup_entry(map, addr, &prev_entry)) {
3613		vm_map_unlock_read(map);
3614		return (KERN_SUCCESS);
3615	}
3616
3617	next_entry = prev_entry->next;
3618	if (!(prev_entry->eflags & MAP_ENTRY_GROWS_UP)) {
3619		/*
3620		 * This entry does not grow upwards. Since the address lies
3621		 * beyond this entry, the next entry (if one exists) has to
3622		 * be a downward growable entry. The entry list header is
3623		 * never a growable entry, so it suffices to check the flags.
3624		 */
3625		if (!(next_entry->eflags & MAP_ENTRY_GROWS_DOWN)) {
3626			vm_map_unlock_read(map);
3627			return (KERN_SUCCESS);
3628		}
3629		stack_entry = next_entry;
3630	} else {
3631		/*
3632		 * This entry grows upward. If the next entry does not at
3633		 * least grow downwards, this is the entry we need to grow.
3634		 * Otherwise we have two possible choices and we have to
3635		 * select one.
3636		 */
3637		if (next_entry->eflags & MAP_ENTRY_GROWS_DOWN) {
3638			/*
3639			 * We have two choices; grow the entry closest to
3640			 * the address to minimize the amount of growth.
3641			 */
3642			if (addr - prev_entry->end <= next_entry->start - addr)
3643				stack_entry = prev_entry;
3644			else
3645				stack_entry = next_entry;
3646		} else
3647			stack_entry = prev_entry;
3648	}
3649
3650	if (stack_entry == next_entry) {
3651		KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_DOWN, ("foo"));
3652		KASSERT(addr < stack_entry->start, ("foo"));
3653		end = (prev_entry != &map->header) ? prev_entry->end :
3654		    stack_entry->start - stack_entry->avail_ssize;
3655		grow_amount = roundup(stack_entry->start - addr, PAGE_SIZE);
3656		max_grow = stack_entry->start - end;
3657	} else {
3658		KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_UP, ("foo"));
3659		KASSERT(addr >= stack_entry->end, ("foo"));
3660		end = (next_entry != &map->header) ? next_entry->start :
3661		    stack_entry->end + stack_entry->avail_ssize;
3662		grow_amount = roundup(addr + 1 - stack_entry->end, PAGE_SIZE);
3663		max_grow = end - stack_entry->end;
3664	}
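	/*
	 * Illustrative arithmetic (downward growth, 4 KB pages): a fault
	 * 5000 bytes below stack_entry->start yields grow_amount =
	 * roundup(5000, PAGE_SIZE) = 8192, i.e. two pages, before the
	 * sgrowsiz rounding applied further below.
	 */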
3665
3666	if (grow_amount > stack_entry->avail_ssize) {
3667		vm_map_unlock_read(map);
3668		return (KERN_NO_SPACE);
3669	}
3670
3671	/*
3672	 * If there is no longer enough space between the entries, refuse
3673	 * to grow and adjust the available space.  Note: this should only
3674	 * happen if the user has mapped into the stack area after the
3675	 * stack was created, and is probably an error.
3676	 *
3677	 * This also effectively destroys any guard page the user might have
3678	 * intended by limiting the stack size.
3679	 */
3680	if (grow_amount + (stack_guard_page ? PAGE_SIZE : 0) > max_grow) {
3681		if (vm_map_lock_upgrade(map))
3682			goto Retry;
3683
3684		stack_entry->avail_ssize = max_grow;
3685
3686		vm_map_unlock(map);
3687		return (KERN_NO_SPACE);
3688	}
3689
3690	is_procstack = (addr >= (vm_offset_t)vm->vm_maxsaddr &&
3691	    addr < (vm_offset_t)p->p_sysent->sv_usrstack) ? 1 : 0;
3692
3693	/*
3694	 * If this is the main process stack, see if we're over the stack
3695	 * limit.
3696	 */
3697	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
3698		vm_map_unlock_read(map);
3699		return (KERN_NO_SPACE);
3700	}
3701#ifdef RACCT
3702	if (racct_enable) {
3703		PROC_LOCK(p);
3704		if (is_procstack && racct_set(p, RACCT_STACK,
3705		    ctob(vm->vm_ssize) + grow_amount)) {
3706			PROC_UNLOCK(p);
3707			vm_map_unlock_read(map);
3708			return (KERN_NO_SPACE);
3709		}
3710		PROC_UNLOCK(p);
3711	}
3712#endif
3713
3714	/* Round up the grow amount to a multiple of sgrowsiz. */
3715	growsize = sgrowsiz;
3716	grow_amount = roundup(grow_amount, growsize);
3717	if (grow_amount > stack_entry->avail_ssize)
3718		grow_amount = stack_entry->avail_ssize;
3719	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
3720		grow_amount = trunc_page((vm_size_t)stacklim) -
3721		    ctob(vm->vm_ssize);
3722	}
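	/*
	 * Example (assuming the common sgrowsiz default of 128 KB): the
	 * two-page request from above is rounded up to 128 KB and then
	 * clamped to avail_ssize and, for the main process stack, to the
	 * headroom left under RLIMIT_STACK.
	 */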
3723#ifdef notyet
3724	PROC_LOCK(p);
3725	limit = racct_get_available(p, RACCT_STACK);
3726	PROC_UNLOCK(p);
3727	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
3728		grow_amount = limit - ctob(vm->vm_ssize);
3729#endif
3730	if (!old_mlock && map->flags & MAP_WIREFUTURE) {
3731		if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
3732			vm_map_unlock_read(map);
3733			rv = KERN_NO_SPACE;
3734			goto out;
3735		}
3736#ifdef RACCT
3737		if (racct_enable) {
3738			PROC_LOCK(p);
3739			if (racct_set(p, RACCT_MEMLOCK,
3740			    ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
3741				PROC_UNLOCK(p);
3742				vm_map_unlock_read(map);
3743				rv = KERN_NO_SPACE;
3744				goto out;
3745			}
3746			PROC_UNLOCK(p);
3747		}
3748#endif
3749	}
3750	/* If we would blow our VMEM resource limit, no go */
3751	if (map->size + grow_amount > vmemlim) {
3752		vm_map_unlock_read(map);
3753		rv = KERN_NO_SPACE;
3754		goto out;
3755	}
3756#ifdef RACCT
3757	if (racct_enable) {
3758		PROC_LOCK(p);
3759		if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
3760			PROC_UNLOCK(p);
3761			vm_map_unlock_read(map);
3762			rv = KERN_NO_SPACE;
3763			goto out;
3764		}
3765		PROC_UNLOCK(p);
3766	}
3767#endif
3768
3769	if (vm_map_lock_upgrade(map))
3770		goto Retry;
3771
3772	if (stack_entry == next_entry) {
3773		/*
3774		 * Growing downward.
3775		 */
3776		/* Get the preliminary new entry start value */
3777		addr = stack_entry->start - grow_amount;
3778
3779		/*
3780		 * If this puts us into the previous entry, cut back our
3781		 * growth to the available space. Also, see the note above.
3782		 */
3783		if (addr < end) {
3784			stack_entry->avail_ssize = max_grow;
3785			addr = end;
3786			if (stack_guard_page)
3787				addr += PAGE_SIZE;
3788		}
3789
3790		rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start,
3791		    next_entry->protection, next_entry->max_protection,
3792		    MAP_STACK_GROWS_DOWN);
3793
3794		/* Adjust the available stack space by the amount we grew. */
3795		if (rv == KERN_SUCCESS) {
3796			new_entry = prev_entry->next;
3797			KASSERT(new_entry == stack_entry->prev, ("foo"));
3798			KASSERT(new_entry->end == stack_entry->start, ("foo"));
3799			KASSERT(new_entry->start == addr, ("foo"));
3800			KASSERT((new_entry->eflags & MAP_ENTRY_GROWS_DOWN) !=
3801			    0, ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
3802			grow_amount = new_entry->end - new_entry->start;
3803			new_entry->avail_ssize = stack_entry->avail_ssize -
3804			    grow_amount;
3805			stack_entry->eflags &= ~MAP_ENTRY_GROWS_DOWN;
3806		}
3807	} else {
3808		/*
3809		 * Growing upward.
3810		 */
3811		addr = stack_entry->end + grow_amount;
3812
3813		/*
3814		 * If this puts us into the next entry, cut back our growth
3815		 * to the available space. Also, see the note above.
3816		 */
3817		if (addr > end) {
3818			stack_entry->avail_ssize = end - stack_entry->end;
3819			addr = end;
3820			if (stack_guard_page)
3821				addr -= PAGE_SIZE;
3822		}
3823
3824		grow_amount = addr - stack_entry->end;
3825		cred = stack_entry->cred;
3826		if (cred == NULL && stack_entry->object.vm_object != NULL)
3827			cred = stack_entry->object.vm_object->cred;
3828		if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
3829			rv = KERN_NO_SPACE;
3830		/* Grow the underlying object if applicable. */
3831		else if (stack_entry->object.vm_object == NULL ||
3832		    vm_object_coalesce(stack_entry->object.vm_object,
3833		    stack_entry->offset,
3834		    (vm_size_t)(stack_entry->end - stack_entry->start),
3835		    (vm_size_t)grow_amount, cred != NULL)) {
3836			map->size += (addr - stack_entry->end);
3837			/* Update the current entry. */
3838			stack_entry->end = addr;
3839			stack_entry->avail_ssize -= grow_amount;
3840			vm_map_entry_resize_free(map, stack_entry);
3841			rv = KERN_SUCCESS;
3842		} else
3843			rv = KERN_FAILURE;
3844	}
3845
3846	if (rv == KERN_SUCCESS && is_procstack)
3847		vm->vm_ssize += btoc(grow_amount);
3848
3849	vm_map_unlock(map);
3850
3851	/*
3852	 * Heed the MAP_WIREFUTURE flag if it was set for this process.
3853	 */
3854	if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE)) {
3855		vm_map_wire(map,
3856		    (stack_entry == next_entry) ? addr : addr - grow_amount,
3857		    (stack_entry == next_entry) ? stack_entry->start : addr,
3858		    (p->p_flag & P_SYSTEM)
3859		    ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES
3860		    : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
3861	}
3862
3863out:
3864#ifdef RACCT
3865	if (racct_enable && rv != KERN_SUCCESS) {
3866		PROC_LOCK(p);
3867		error = racct_set(p, RACCT_VMEM, map->size);
3868		KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
3869		if (!old_mlock) {
3870			error = racct_set(p, RACCT_MEMLOCK,
3871			    ptoa(pmap_wired_count(map->pmap)));
3872			KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
3873		}
3874		error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
3875		KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
3876		PROC_UNLOCK(p);
3877	}
3878#endif
3879
3880	return (rv);
3881}
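/*
 * Hedged usage sketch (hypothetical caller, not taken from this file): a
 * machine-dependent page-fault handler would typically try to grow the
 * stack before treating the fault as fatal, along the lines of
 *
 *	if (vm_map_growstack(p, fault_addr) != KERN_SUCCESS)
 *		return (SIGSEGV);
 *
 * relying on the function returning KERN_SUCCESS both when it grows the
 * stack and when the address is simply outside any stack range.
 */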
3882
3883/*
3884 * Unshare the specified VM space for exec: allocate and install a new,
3885 * initially empty vmspace, leaving the old one to its remaining users.
3886 */
3887int
3888vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
3889{
3890	struct vmspace *oldvmspace = p->p_vmspace;
3891	struct vmspace *newvmspace;
3892
3893	KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
3894	    ("vmspace_exec recursed"));
3895	newvmspace = vmspace_alloc(minuser, maxuser, NULL);
3896	if (newvmspace == NULL)
3897		return (ENOMEM);
3898	newvmspace->vm_swrss = oldvmspace->vm_swrss;
3899	/*
3900	 * This code is written like this for prototype purposes.  The
3901	 * goal is to avoid running down the vmspace here, but let the
3902	 * other processes that are still using the vmspace finally
3903	 * run it down.  Even though there is little or no chance of blocking
3904	 * here, it is a good idea to keep this form for future mods.
3905	 */
3906	PROC_VMSPACE_LOCK(p);
3907	p->p_vmspace = newvmspace;
3908	PROC_VMSPACE_UNLOCK(p);
3909	if (p == curthread->td_proc)
3910		pmap_activate(curthread);
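	/*
	 * Note (assumption, based on the comment above): the flag lets the
	 * exec path see that the vmspace was replaced here so that the old
	 * one can be released once it is safe to do so.
	 */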
3911	curthread->td_pflags |= TDP_EXECVMSPC;
3912	return (0);
3913}
3914
3915/*
3916 * Unshare the specified VM space for forcing COW.  This
3917 * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
3918 */
3919int
3920vmspace_unshare(struct proc *p)
3921{
3922	struct vmspace *oldvmspace = p->p_vmspace;
3923	struct vmspace *newvmspace;
3924	vm_ooffset_t fork_charge;
3925
3926	if (oldvmspace->vm_refcnt == 1)
3927		return (0);
3928	fork_charge = 0;
3929	newvmspace = vmspace_fork(oldvmspace, &fork_charge);
3930	if (newvmspace == NULL)
3931		return (ENOMEM);
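	/*
	 * fork_charge now holds the swap reservation that the copied
	 * entries will need; charge it to the process credential before
	 * committing to the new vmspace.
	 */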
3932	if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
3933		vmspace_free(newvmspace);
3934		return (ENOMEM);
3935	}
3936	PROC_VMSPACE_LOCK(p);
3937	p->p_vmspace = newvmspace;
3938	PROC_VMSPACE_UNLOCK(p);
3939	if (p == curthread->td_proc)
3940		pmap_activate(curthread);
3941	vmspace_free(oldvmspace);
3942	return (0);
3943}
3944
3945/*
3946 *	vm_map_lookup:
3947 *
3948 *	Finds the VM object, offset, and
3949 *	protection for a given virtual address in the
3950 *	specified map, assuming a page fault of the
3951 *	type specified.
3952 *
3953 *	Leaves the map in question locked for read; return
3954 *	values are guaranteed until a vm_map_lookup_done
3955 *	call is performed.  Note that the map argument
3956 *	is in/out; the returned map must be used in
3957 *	the call to vm_map_lookup_done.
3958 *
3959 *	A handle (out_entry) is returned for use in
3960 *	vm_map_lookup_done, to make that fast.
3961 *
3962 *	If a lookup is requested with "write protection"
3963 *	specified, the map may be changed to perform virtual
3964 *	copying operations, although the data referenced will
3965 *	remain the same.
3966 */
3967int
3968vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
3969	      vm_offset_t vaddr,
3970	      vm_prot_t fault_typea,
3971	      vm_map_entry_t *out_entry,	/* OUT */
3972	      vm_object_t *object,		/* OUT */
3973	      vm_pindex_t *pindex,		/* OUT */
3974	      vm_prot_t *out_prot,		/* OUT */
3975	      boolean_t *wired)			/* OUT */
3976{
3977	vm_map_entry_t entry;
3978	vm_map_t map = *var_map;
3979	vm_prot_t prot;
3980	vm_prot_t fault_type = fault_typea;
3981	vm_object_t eobject;
3982	vm_size_t size;
3983	struct ucred *cred;
3984
3985RetryLookup:;
3986
3987	vm_map_lock_read(map);
3988
3989	/*
3990	 * Lookup the faulting address.
3991	 */
3992	if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
3993		vm_map_unlock_read(map);
3994		return (KERN_INVALID_ADDRESS);
3995	}
3996
3997	entry = *out_entry;
3998
3999	/*
4000	 * Handle submaps.
4001	 */
4002	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
4003		vm_map_t old_map = map;
4004
4005		*var_map = map = entry->object.sub_map;
4006		vm_map_unlock_read(old_map);
4007		goto RetryLookup;
4008	}
4009
4010	/*
4011	 * Check whether this task is allowed to have this page.
4012	 */
4013	prot = entry->protection;
4014	fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
4015	if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
4016		vm_map_unlock_read(map);
4017		return (KERN_PROTECTION_FAILURE);
4018	}
4019	KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
4020	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
4021	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
4022	    ("entry %p flags %x", entry, entry->eflags));
4023	if ((fault_typea & VM_PROT_COPY) != 0 &&
4024	    (entry->max_protection & VM_PROT_WRITE) == 0 &&
4025	    (entry->eflags & MAP_ENTRY_COW) == 0) {
4026		vm_map_unlock_read(map);
4027		return (KERN_PROTECTION_FAILURE);
4028	}
4029
4030	/*
4031	 * If this page is not pageable, we have to get it for all possible
4032	 * accesses.
4033	 */
4034	*wired = (entry->wired_count != 0);
4035	if (*wired)
4036		fault_type = entry->protection;
4037	size = entry->end - entry->start;
4038	/*
4039	 * If the entry was copy-on-write, we either shadow it or demote access.
4040	 */
4041	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4042		/*
4043		 * If we want to write the page, we may as well handle that
4044		 * now since we've got the map locked.
4045		 *
4046		 * If we don't need to write the page, we just demote the
4047		 * permissions allowed.
4048		 */
4049		if ((fault_type & VM_PROT_WRITE) != 0 ||
4050		    (fault_typea & VM_PROT_COPY) != 0) {
4051			/*
4052			 * Make a new object, and place it in the object
4053			 * chain.  Note that no new references have appeared
4054			 * -- one just moved from the map to the new
4055			 * object.
4056			 */
4057			if (vm_map_lock_upgrade(map))
4058				goto RetryLookup;
4059
4060			if (entry->cred == NULL) {
4061				/*
4062				 * The debugger owner is charged for
4063				 * the memory.
4064				 */
4065				cred = curthread->td_ucred;
4066				crhold(cred);
4067				if (!swap_reserve_by_cred(size, cred)) {
4068					crfree(cred);
4069					vm_map_unlock(map);
4070					return (KERN_RESOURCE_SHORTAGE);
4071				}
4072				entry->cred = cred;
4073			}
4074			vm_object_shadow(&entry->object.vm_object,
4075			    &entry->offset, size);
4076			entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
4077			eobject = entry->object.vm_object;
4078			if (eobject->cred != NULL) {
4079				/*
4080				 * The object was not shadowed.
4081				 */
4082				swap_release_by_cred(size, entry->cred);
4083				crfree(entry->cred);
4084				entry->cred = NULL;
4085			} else if (entry->cred != NULL) {
4086				VM_OBJECT_WLOCK(eobject);
4087				eobject->cred = entry->cred;
4088				eobject->charge = size;
4089				VM_OBJECT_WUNLOCK(eobject);
4090				entry->cred = NULL;
4091			}
4092
4093			vm_map_lock_downgrade(map);
4094		} else {
4095			/*
4096			 * We're attempting to read a copy-on-write page --
4097			 * don't allow writes.
4098			 */
4099			prot &= ~VM_PROT_WRITE;
4100		}
4101	}
4102
4103	/*
4104	 * Create an object if necessary.
4105	 */
4106	if (entry->object.vm_object == NULL &&
4107	    !map->system_map) {
4108		if (vm_map_lock_upgrade(map))
4109			goto RetryLookup;
4110		entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
4111		    atop(size));
4112		entry->offset = 0;
4113		if (entry->cred != NULL) {
4114			VM_OBJECT_WLOCK(entry->object.vm_object);
4115			entry->object.vm_object->cred = entry->cred;
4116			entry->object.vm_object->charge = size;
4117			VM_OBJECT_WUNLOCK(entry->object.vm_object);
4118			entry->cred = NULL;
4119		}
4120		vm_map_lock_downgrade(map);
4121	}
4122
4123	/*
4124	 * Return the object/offset from this entry.  If the entry was
4125	 * copy-on-write or empty, it has been fixed up.
4126	 */
4127	*pindex = UOFF_TO_IDX((vaddr - entry->start) + entry->offset);
4128	*object = entry->object.vm_object;
4129
4130	*out_prot = prot;
4131	return (KERN_SUCCESS);
4132}
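/*
 * Hedged usage sketch (illustrative caller): vm_map_lookup() and
 * vm_map_lookup_done() bracket the period during which the returned
 * object/offset may be used, e.g.
 *
 *	vm_map_entry_t entry;
 *	vm_object_t object;
 *	vm_pindex_t pindex;
 *	vm_prot_t prot;
 *	boolean_t wired;
 *	int rv;
 *
 *	rv = vm_map_lookup(&map, vaddr, VM_PROT_READ, &entry, &object,
 *	    &pindex, &prot, &wired);
 *	if (rv != KERN_SUCCESS)
 *		return (rv);
 *	... access the page at (object, pindex) under the read lock ...
 *	vm_map_lookup_done(map, entry);
 *
 * Note that "map" must be the (possibly sub-)map returned through the
 * IN/OUT argument, per the header comment above.
 */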
4133
4134/*
4135 *	vm_map_lookup_locked:
4136 *
4137 *	Lookup the faulting address.  A version of vm_map_lookup that returns
4138 *	KERN_FAILURE instead of blocking on map lock or memory allocation.
4139 */
4140int
4141vm_map_lookup_locked(vm_map_t *var_map,		/* IN/OUT */
4142		     vm_offset_t vaddr,
4143		     vm_prot_t fault_typea,
4144		     vm_map_entry_t *out_entry,	/* OUT */
4145		     vm_object_t *object,	/* OUT */
4146		     vm_pindex_t *pindex,	/* OUT */
4147		     vm_prot_t *out_prot,	/* OUT */
4148		     boolean_t *wired)		/* OUT */
4149{
4150	vm_map_entry_t entry;
4151	vm_map_t map = *var_map;
4152	vm_prot_t prot;
4153	vm_prot_t fault_type = fault_typea;
4154
4155	/*
4156	 * Lookup the faulting address.
4157	 */
4158	if (!vm_map_lookup_entry(map, vaddr, out_entry))
4159		return (KERN_INVALID_ADDRESS);
4160
4161	entry = *out_entry;
4162
4163	/*
4164	 * Fail if the entry refers to a submap.
4165	 */
4166	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
4167		return (KERN_FAILURE);
4168
4169	/*
4170	 * Check whether this task is allowed to have this page.
4171	 */
4172	prot = entry->protection;
4173	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
4174	if ((fault_type & prot) != fault_type)
4175		return (KERN_PROTECTION_FAILURE);
4176
4177	/*
4178	 * If this page is not pageable, we have to get it for all possible
4179	 * accesses.
4180	 */
4181	*wired = (entry->wired_count != 0);
4182	if (*wired)
4183		fault_type = entry->protection;
4184
4185	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4186		/*
4187		 * Fail if the entry was copy-on-write for a write fault.
4188		 */
4189		if (fault_type & VM_PROT_WRITE)
4190			return (KERN_FAILURE);
4191		/*
4192		 * We're attempting to read a copy-on-write page --
4193		 * don't allow writes.
4194		 */
4195		prot &= ~VM_PROT_WRITE;
4196	}
4197
4198	/*
4199	 * Fail if an object should be created.
4200	 */
4201	if (entry->object.vm_object == NULL && !map->system_map)
4202		return (KERN_FAILURE);
4203
4204	/*
4205	 * Return the object/offset from this entry.  If the entry was
4206	 * copy-on-write or empty, it has been fixed up.
4207	 */
4208	*pindex = UOFF_TO_IDX((vaddr - entry->start) + entry->offset);
4209	*object = entry->object.vm_object;
4210
4211	*out_prot = prot;
4212	return (KERN_SUCCESS);
4213}
4214
4215/*
4216 *	vm_map_lookup_done:
4217 *
4218 *	Releases locks acquired by a vm_map_lookup
4219 *	(according to the handle returned by that lookup).
4220 */
4221void
4222vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
4223{
4224	/*
4225	 * Unlock the main-level map
4226	 */
4227	vm_map_unlock_read(map);
4228}
4229
4230#include "opt_ddb.h"
4231#ifdef DDB
4232#include <sys/kernel.h>
4233
4234#include <ddb/ddb.h>
4235
4236static void
4237vm_map_print(vm_map_t map)
4238{
4239	vm_map_entry_t entry;
4240
4241	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4242	    (void *)map,
4243	    (void *)map->pmap, map->nentries, map->timestamp);
4244
4245	db_indent += 2;
4246	for (entry = map->header.next; entry != &map->header;
4247	    entry = entry->next) {
4248		db_iprintf("map entry %p: start=%p, end=%p\n",
4249		    (void *)entry, (void *)entry->start, (void *)entry->end);
4250		{
4251			static char *inheritance_name[4] =
4252			{"share", "copy", "none", "donate_copy"};
4253
4254			db_iprintf(" prot=%x/%x/%s",
4255			    entry->protection,
4256			    entry->max_protection,
4257			    inheritance_name[(int)(unsigned char)entry->inheritance]);
4258			if (entry->wired_count != 0)
4259				db_printf(", wired");
4260		}
4261		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
4262			db_printf(", share=%p, offset=0x%jx\n",
4263			    (void *)entry->object.sub_map,
4264			    (uintmax_t)entry->offset);
4265			if ((entry->prev == &map->header) ||
4266			    (entry->prev->object.sub_map !=
4267				entry->object.sub_map)) {
4268				db_indent += 2;
4269				vm_map_print((vm_map_t)entry->object.sub_map);
4270				db_indent -= 2;
4271			}
4272		} else {
4273			if (entry->cred != NULL)
4274				db_printf(", ruid %d", entry->cred->cr_ruid);
4275			db_printf(", object=%p, offset=0x%jx",
4276			    (void *)entry->object.vm_object,
4277			    (uintmax_t)entry->offset);
4278			if (entry->object.vm_object && entry->object.vm_object->cred)
4279				db_printf(", obj ruid %d charge %jx",
4280				    entry->object.vm_object->cred->cr_ruid,
4281				    (uintmax_t)entry->object.vm_object->charge);
4282			if (entry->eflags & MAP_ENTRY_COW)
4283				db_printf(", copy (%s)",
4284				    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4285			db_printf("\n");
4286
4287			if ((entry->prev == &map->header) ||
4288			    (entry->prev->object.vm_object !=
4289				entry->object.vm_object)) {
4290				db_indent += 2;
4291				vm_object_print((db_expr_t)(intptr_t)
4292						entry->object.vm_object,
4293						0, 0, (char *)0);
4294				db_indent -= 2;
4295			}
4296		}
4297	}
4298	db_indent -= 2;
4299}
4300
4301DB_SHOW_COMMAND(map, map)
4302{
4303
4304	if (!have_addr) {
4305		db_printf("usage: show map <addr>\n");
4306		return;
4307	}
4308	vm_map_print((vm_map_t)addr);
4309}
4310
4311DB_SHOW_COMMAND(procvm, procvm)
4312{
4313	struct proc *p;
4314
4315	if (have_addr) {
4316		p = db_lookup_proc(addr);
4317	} else {
4318		p = curproc;
4319	}
4320
4321	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4322	    (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4323	    (void *)vmspace_pmap(p->p_vmspace));
4324
4325	vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
4326}
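/*
 * Example ddb usage (illustrative): from the debugger prompt,
 *
 *	db> show map <address of a struct vm_map>
 *	db> show procvm <address of a struct proc>
 *
 * "show procvm" without an address defaults to curproc.
 */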
4327
4328#endif /* DDB */
4329