1/*
2 * Copyright 2009-2011, Ingo Weinhold, ingo_weinhold@gmx.de.
 * Copyright 2002-2010, Axel Dörfler, axeld@pinc-software.de.
4 * Distributed under the terms of the MIT License.
5 *
6 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
7 * Distributed under the terms of the NewOS License.
8 */
9
10
11#include <vm/vm.h>
12
13#include <ctype.h>
14#include <stdlib.h>
15#include <stdio.h>
16#include <string.h>
17#include <sys/mman.h>
18
19#include <algorithm>
20
21#include <OS.h>
22#include <KernelExport.h>
23
24#include <AutoDeleterDrivers.h>
25
26#include <symbol_versioning.h>
27
28#include <arch/cpu.h>
29#include <arch/vm.h>
30#include <arch/user_memory.h>
31#include <boot/elf.h>
32#include <boot/stage2.h>
33#include <condition_variable.h>
34#include <console.h>
35#include <debug.h>
36#include <file_cache.h>
37#include <fs/fd.h>
38#include <heap.h>
39#include <kernel.h>
40#include <int.h>
41#include <lock.h>
42#include <low_resource_manager.h>
43#include <slab/Slab.h>
44#include <smp.h>
45#include <system_info.h>
46#include <thread.h>
47#include <team.h>
48#include <tracing.h>
49#include <util/AutoLock.h>
50#include <util/BitUtils.h>
51#include <util/ThreadAutoLock.h>
52#include <vm/vm_page.h>
53#include <vm/vm_priv.h>
54#include <vm/VMAddressSpace.h>
55#include <vm/VMArea.h>
56#include <vm/VMCache.h>
57
58#include "VMAddressSpaceLocking.h"
59#include "VMAnonymousCache.h"
60#include "VMAnonymousNoSwapCache.h"
61#include "IORequest.h"
62
63
64//#define TRACE_VM
65//#define TRACE_FAULTS
66#ifdef TRACE_VM
67#	define TRACE(x) dprintf x
68#else
69#	define TRACE(x) ;
70#endif
71#ifdef TRACE_FAULTS
72#	define FTRACE(x) dprintf x
73#else
74#	define FTRACE(x) ;
75#endif
76
77
78namespace {
79
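// Locking policy used by AreaCacheLocker below. Lock() deliberately returns
// false: the cache is expected to have been locked (and referenced) already
// via vm_area_get_locked_cache(), which AreaCacheLocker::SetTo() passes in
// with alreadyLocked == true. Unlock() releases it again via
// vm_area_put_locked_cache().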
80class AreaCacheLocking {
81public:
82	inline bool Lock(VMCache* lockable)
83	{
84		return false;
85	}
86
87	inline void Unlock(VMCache* lockable)
88	{
89		vm_area_put_locked_cache(lockable);
90	}
91};
92
93class AreaCacheLocker : public AutoLocker<VMCache, AreaCacheLocking> {
94public:
95	inline AreaCacheLocker(VMCache* cache = NULL)
96		: AutoLocker<VMCache, AreaCacheLocking>(cache, true)
97	{
98	}
99
100	inline AreaCacheLocker(VMArea* area)
101		: AutoLocker<VMCache, AreaCacheLocking>()
102	{
103		SetTo(area);
104	}
105
106	inline void SetTo(VMCache* cache, bool alreadyLocked)
107	{
108		AutoLocker<VMCache, AreaCacheLocking>::SetTo(cache, alreadyLocked);
109	}
110
111	inline void SetTo(VMArea* area)
112	{
113		return AutoLocker<VMCache, AreaCacheLocking>::SetTo(
114			area != NULL ? vm_area_get_locked_cache(area) : NULL, true, true);
115	}
116};
117
118
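// Locks a chain of caches from a given top cache down to its bottommost
// source cache. Each cache's UserData() pointer is used to remember its
// consumer in the chain, so that Unlock() can release the caches in
// source -> consumer order again (see the comment in Unlock()).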
119class VMCacheChainLocker {
120public:
121	VMCacheChainLocker()
122		:
123		fTopCache(NULL),
124		fBottomCache(NULL)
125	{
126	}
127
128	VMCacheChainLocker(VMCache* topCache)
129		:
130		fTopCache(topCache),
131		fBottomCache(topCache)
132	{
133	}
134
135	~VMCacheChainLocker()
136	{
137		Unlock();
138	}
139
140	void SetTo(VMCache* topCache)
141	{
142		fTopCache = topCache;
143		fBottomCache = topCache;
144
145		if (topCache != NULL)
146			topCache->SetUserData(NULL);
147	}
148
149	VMCache* LockSourceCache()
150	{
151		if (fBottomCache == NULL || fBottomCache->source == NULL)
152			return NULL;
153
154		VMCache* previousCache = fBottomCache;
155
156		fBottomCache = fBottomCache->source;
157		fBottomCache->Lock();
158		fBottomCache->AcquireRefLocked();
159		fBottomCache->SetUserData(previousCache);
160
161		return fBottomCache;
162	}
163
164	void LockAllSourceCaches()
165	{
166		while (LockSourceCache() != NULL) {
167		}
168	}
169
170	void Unlock(VMCache* exceptCache = NULL)
171	{
172		if (fTopCache == NULL)
173			return;
174
175		// Unlock caches in source -> consumer direction. This is important to
176		// avoid double-locking and a reversal of locking order in case a cache
		// is eligible for merging.
178		VMCache* cache = fBottomCache;
179		while (cache != NULL) {
180			VMCache* nextCache = (VMCache*)cache->UserData();
181			if (cache != exceptCache)
182				cache->ReleaseRefAndUnlock(cache != fTopCache);
183
184			if (cache == fTopCache)
185				break;
186
187			cache = nextCache;
188		}
189
190		fTopCache = NULL;
191		fBottomCache = NULL;
192	}
193
194	void UnlockKeepRefs(bool keepTopCacheLocked)
195	{
196		if (fTopCache == NULL)
197			return;
198
199		VMCache* nextCache = fBottomCache;
200		VMCache* cache = NULL;
201
202		while (keepTopCacheLocked
203				? nextCache != fTopCache : cache != fTopCache) {
204			cache = nextCache;
205			nextCache = (VMCache*)cache->UserData();
206			cache->Unlock(cache != fTopCache);
207		}
208	}
209
210	void RelockCaches(bool topCacheLocked)
211	{
212		if (fTopCache == NULL)
213			return;
214
215		VMCache* nextCache = fTopCache;
216		VMCache* cache = NULL;
217		if (topCacheLocked) {
218			cache = nextCache;
219			nextCache = cache->source;
220		}
221
222		while (cache != fBottomCache && nextCache != NULL) {
223			VMCache* consumer = cache;
224			cache = nextCache;
225			nextCache = cache->source;
226			cache->Lock();
227			cache->SetUserData(consumer);
228		}
229	}
230
231private:
232	VMCache*	fTopCache;
233	VMCache*	fBottomCache;
234};
235
236} // namespace
237
238
// The memory reserve that an allocation of a certain priority must not touch.
240static const size_t kMemoryReserveForPriority[] = {
241	VM_MEMORY_RESERVE_USER,		// user
242	VM_MEMORY_RESERVE_SYSTEM,	// system
243	0							// VIP
244};
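// Presumably indexed by the VM_PRIORITY_* constants (user, system, VIP), as
// hinted by the comments above, e.g.:
//	const size_t reserve = kMemoryReserveForPriority[VM_PRIORITY_SYSTEM];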
245
246
247ObjectCache* gPageMappingsObjectCache;
248
249static rw_lock sAreaCacheLock = RW_LOCK_INITIALIZER("area->cache");
250
251static off_t sAvailableMemory;
252static off_t sNeededMemory;
253static mutex sAvailableMemoryLock = MUTEX_INITIALIZER("available memory lock");
254static uint32 sPageFaults;
255
256static VMPhysicalPageMapper* sPhysicalPageMapper;
257
258#if DEBUG_CACHE_LIST
259
260struct cache_info {
261	VMCache*	cache;
262	addr_t		page_count;
263	addr_t		committed;
264};
265
266static const int kCacheInfoTableCount = 100 * 1024;
267static cache_info* sCacheInfoTable;
268
269#endif	// DEBUG_CACHE_LIST
270
271
272// function declarations
273static void delete_area(VMAddressSpace* addressSpace, VMArea* area,
274	bool addressSpaceCleanup);
275static status_t vm_soft_fault(VMAddressSpace* addressSpace, addr_t address,
276	bool isWrite, bool isExecute, bool isUser, vm_page** wirePage);
277static status_t map_backing_store(VMAddressSpace* addressSpace,
278	VMCache* cache, off_t offset, const char* areaName, addr_t size, int wiring,
279	int protection, int protectionMax, int mapping, uint32 flags,
280	const virtual_address_restrictions* addressRestrictions, bool kernel,
281	VMArea** _area, void** _virtualAddress);
282static void fix_protection(uint32* protection);
283
284
285//	#pragma mark -
286
287
288#if VM_PAGE_FAULT_TRACING
289
290namespace VMPageFaultTracing {
291
292class PageFaultStart : public AbstractTraceEntry {
293public:
294	PageFaultStart(addr_t address, bool write, bool user, addr_t pc)
295		:
296		fAddress(address),
297		fPC(pc),
298		fWrite(write),
299		fUser(user)
300	{
301		Initialized();
302	}
303
304	virtual void AddDump(TraceOutput& out)
305	{
306		out.Print("page fault %#lx %s %s, pc: %#lx", fAddress,
307			fWrite ? "write" : "read", fUser ? "user" : "kernel", fPC);
308	}
309
310private:
311	addr_t	fAddress;
312	addr_t	fPC;
313	bool	fWrite;
314	bool	fUser;
315};
316
317
318// page fault errors
319enum {
320	PAGE_FAULT_ERROR_NO_AREA		= 0,
321	PAGE_FAULT_ERROR_KERNEL_ONLY,
322	PAGE_FAULT_ERROR_WRITE_PROTECTED,
323	PAGE_FAULT_ERROR_READ_PROTECTED,
324	PAGE_FAULT_ERROR_EXECUTE_PROTECTED,
325	PAGE_FAULT_ERROR_KERNEL_BAD_USER_MEMORY,
326	PAGE_FAULT_ERROR_NO_ADDRESS_SPACE
327};
328
329
330class PageFaultError : public AbstractTraceEntry {
331public:
332	PageFaultError(area_id area, status_t error)
333		:
334		fArea(area),
335		fError(error)
336	{
337		Initialized();
338	}
339
340	virtual void AddDump(TraceOutput& out)
341	{
342		switch (fError) {
343			case PAGE_FAULT_ERROR_NO_AREA:
344				out.Print("page fault error: no area");
345				break;
346			case PAGE_FAULT_ERROR_KERNEL_ONLY:
347				out.Print("page fault error: area: %ld, kernel only", fArea);
348				break;
349			case PAGE_FAULT_ERROR_WRITE_PROTECTED:
350				out.Print("page fault error: area: %ld, write protected",
351					fArea);
352				break;
353			case PAGE_FAULT_ERROR_READ_PROTECTED:
354				out.Print("page fault error: area: %ld, read protected", fArea);
355				break;
356			case PAGE_FAULT_ERROR_EXECUTE_PROTECTED:
357				out.Print("page fault error: area: %ld, execute protected",
358					fArea);
359				break;
360			case PAGE_FAULT_ERROR_KERNEL_BAD_USER_MEMORY:
361				out.Print("page fault error: kernel touching bad user memory");
362				break;
363			case PAGE_FAULT_ERROR_NO_ADDRESS_SPACE:
364				out.Print("page fault error: no address space");
365				break;
366			default:
367				out.Print("page fault error: area: %ld, error: %s", fArea,
368					strerror(fError));
369				break;
370		}
371	}
372
373private:
374	area_id		fArea;
375	status_t	fError;
376};
377
378
379class PageFaultDone : public AbstractTraceEntry {
380public:
381	PageFaultDone(area_id area, VMCache* topCache, VMCache* cache,
382			vm_page* page)
383		:
384		fArea(area),
385		fTopCache(topCache),
386		fCache(cache),
387		fPage(page)
388	{
389		Initialized();
390	}
391
392	virtual void AddDump(TraceOutput& out)
393	{
394		out.Print("page fault done: area: %ld, top cache: %p, cache: %p, "
395			"page: %p", fArea, fTopCache, fCache, fPage);
396	}
397
398private:
399	area_id		fArea;
400	VMCache*	fTopCache;
401	VMCache*	fCache;
402	vm_page*	fPage;
403};
404
405}	// namespace VMPageFaultTracing
406
407#	define TPF(x) new(std::nothrow) VMPageFaultTracing::x;
408#else
409#	define TPF(x) ;
410#endif	// VM_PAGE_FAULT_TRACING
411
412
413//	#pragma mark -
414
415
416/*!	The page's cache must be locked.
417*/
418static inline void
419increment_page_wired_count(vm_page* page)
420{
421	if (!page->IsMapped())
422		atomic_add(&gMappedPagesCount, 1);
423	page->IncrementWiredCount();
424}
425
426
427/*!	The page's cache must be locked.
428*/
429static inline void
430decrement_page_wired_count(vm_page* page)
431{
432	page->DecrementWiredCount();
433	if (!page->IsMapped())
434		atomic_add(&gMappedPagesCount, -1);
435}
436
437
438static inline addr_t
439virtual_page_address(VMArea* area, vm_page* page)
440{
441	return area->Base()
442		+ ((page->cache_offset << PAGE_SHIFT) - area->cache_offset);
443}
444
445
446static inline bool
447is_page_in_area(VMArea* area, vm_page* page)
448{
449	off_t pageCacheOffsetBytes = (off_t)(page->cache_offset << PAGE_SHIFT);
450	return pageCacheOffsetBytes >= area->cache_offset
451		&& pageCacheOffsetBytes < area->cache_offset + (off_t)area->Size();
452}
453
454
455//! You need to have the address space locked when calling this function
456static VMArea*
457lookup_area(VMAddressSpace* addressSpace, area_id id)
458{
459	VMAreas::ReadLock();
460
461	VMArea* area = VMAreas::LookupLocked(id);
462	if (area != NULL && area->address_space != addressSpace)
463		area = NULL;
464
465	VMAreas::ReadUnlock();
466
467	return area;
468}
469
470
471static inline size_t
472area_page_protections_size(size_t areaSize)
473{
474	// In the page protections we store only the three user protections,
475	// so we use 4 bits per page.
476	return (areaSize / B_PAGE_SIZE + 1) / 2;
477}
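// Illustrative arithmetic: two pages share one byte (4 bits each), so a
// 16-page area needs (16 + 1) / 2 = 8 bytes. The low nibble of each byte
// holds the protection of the even page, the high nibble that of the
// following odd page (see set_area_page_protection() below).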
478
479
480static status_t
481allocate_area_page_protections(VMArea* area)
482{
483	size_t bytes = area_page_protections_size(area->Size());
484	area->page_protections = (uint8*)malloc_etc(bytes,
485		area->address_space == VMAddressSpace::Kernel()
486			? HEAP_DONT_LOCK_KERNEL_SPACE : 0);
487	if (area->page_protections == NULL)
488		return B_NO_MEMORY;
489
490	// init the page protections for all pages to that of the area
491	uint32 areaProtection = area->protection
492		& (B_READ_AREA | B_WRITE_AREA | B_EXECUTE_AREA);
493	memset(area->page_protections, areaProtection | (areaProtection << 4),
494		bytes);
495	return B_OK;
496}
497
498
499static inline void
500set_area_page_protection(VMArea* area, addr_t pageAddress, uint32 protection)
501{
502	protection &= B_READ_AREA | B_WRITE_AREA | B_EXECUTE_AREA;
503	addr_t pageIndex = (pageAddress - area->Base()) / B_PAGE_SIZE;
504	uint8& entry = area->page_protections[pageIndex / 2];
505	if (pageIndex % 2 == 0)
506		entry = (entry & 0xf0) | protection;
507	else
508		entry = (entry & 0x0f) | (protection << 4);
509}
510
511
512static inline uint32
513get_area_page_protection(VMArea* area, addr_t pageAddress)
514{
515	if (area->page_protections == NULL)
516		return area->protection;
517
518	uint32 pageIndex = (pageAddress - area->Base()) / B_PAGE_SIZE;
519	uint32 protection = area->page_protections[pageIndex / 2];
520	if (pageIndex % 2 == 0)
521		protection &= 0x0f;
522	else
523		protection >>= 4;
524
525	uint32 kernelProtection = 0;
526	if ((protection & B_READ_AREA) != 0)
527		kernelProtection |= B_KERNEL_READ_AREA;
528	if ((protection & B_WRITE_AREA) != 0)
529		kernelProtection |= B_KERNEL_WRITE_AREA;
530
531	// If this is a kernel area we return only the kernel flags.
532	if (area->address_space == VMAddressSpace::Kernel())
533		return kernelProtection;
534
535	return protection | kernelProtection;
536}
537
538
539static inline uint8*
540realloc_page_protections(uint8* pageProtections, size_t areaSize,
541	uint32 allocationFlags)
542{
543	size_t bytes = area_page_protections_size(areaSize);
544	return (uint8*)realloc_etc(pageProtections, bytes, allocationFlags);
545}
546
547
/*!	The caller must have reserved as many pages as the translation map
	implementation might need to map this page.
550	The page's cache must be locked.
551*/
552static status_t
553map_page(VMArea* area, vm_page* page, addr_t address, uint32 protection,
554	vm_page_reservation* reservation)
555{
556	VMTranslationMap* map = area->address_space->TranslationMap();
557
558	bool wasMapped = page->IsMapped();
559
560	if (area->wiring == B_NO_LOCK) {
561		DEBUG_PAGE_ACCESS_CHECK(page);
562
563		bool isKernelSpace = area->address_space == VMAddressSpace::Kernel();
564		vm_page_mapping* mapping = (vm_page_mapping*)object_cache_alloc(
565			gPageMappingsObjectCache,
566			CACHE_DONT_WAIT_FOR_MEMORY
567				| (isKernelSpace ? CACHE_DONT_LOCK_KERNEL_SPACE : 0));
568		if (mapping == NULL)
569			return B_NO_MEMORY;
570
571		mapping->page = page;
572		mapping->area = area;
573
574		map->Lock();
575
576		map->Map(address, page->physical_page_number * B_PAGE_SIZE, protection,
577			area->MemoryType(), reservation);
578
579		// insert mapping into lists
580		if (!page->IsMapped())
581			atomic_add(&gMappedPagesCount, 1);
582
583		page->mappings.Add(mapping);
584		area->mappings.Add(mapping);
585
586		map->Unlock();
587	} else {
588		DEBUG_PAGE_ACCESS_CHECK(page);
589
590		map->Lock();
591		map->Map(address, page->physical_page_number * B_PAGE_SIZE, protection,
592			area->MemoryType(), reservation);
593		map->Unlock();
594
595		increment_page_wired_count(page);
596	}
597
598	if (!wasMapped) {
		// The page is mapped now, so it must not remain in the cached queue.
600		// It also makes sense to move it from the inactive to the active, since
601		// otherwise the page daemon wouldn't come to keep track of it (in idle
602		// mode) -- if the page isn't touched, it will be deactivated after a
603		// full iteration through the queue at the latest.
604		if (page->State() == PAGE_STATE_CACHED
605				|| page->State() == PAGE_STATE_INACTIVE) {
606			vm_page_set_state(page, PAGE_STATE_ACTIVE);
607		}
608	}
609
610	return B_OK;
611}
612
613
/*!	The caller must hold the lock of the page's cache.
*/
617static inline bool
618unmap_page(VMArea* area, addr_t virtualAddress)
619{
620	return area->address_space->TranslationMap()->UnmapPage(area,
621		virtualAddress, true);
622}
623
624
/*!	The caller must hold the locks of all mapped pages' caches.
*/
628static inline void
629unmap_pages(VMArea* area, addr_t base, size_t size)
630{
631	area->address_space->TranslationMap()->UnmapPages(area, base, size, true);
632}
633
634
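/*!	Clips the given address range to the given area. On success \a address and
	\a size are adjusted to the intersection of the range and the area, and
	\a offset is set to the start of the (clipped) range relative to the
	area's base. Returns \c false, if the range does not intersect the area.
*/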
635static inline bool
636intersect_area(VMArea* area, addr_t& address, addr_t& size, addr_t& offset)
637{
638	if (address < area->Base()) {
639		offset = area->Base() - address;
640		if (offset >= size)
641			return false;
642
643		address = area->Base();
644		size -= offset;
645		offset = 0;
646		if (size > area->Size())
647			size = area->Size();
648
649		return true;
650	}
651
652	offset = address - area->Base();
653	if (offset >= area->Size())
654		return false;
655
656	if (size >= area->Size() - offset)
657		size = area->Size() - offset;
658
659	return true;
660}
661
662
663/*!	Cuts a piece out of an area. If the given cut range covers the complete
664	area, it is deleted. If it covers the beginning or the end, the area is
665	resized accordingly. If the range covers some part in the middle of the
666	area, it is split in two; in this case the second area is returned via
667	\a _secondArea (the variable is left untouched in the other cases).
668	The address space must be write locked.
669	The caller must ensure that no part of the given range is wired.
670*/
671static status_t
672cut_area(VMAddressSpace* addressSpace, VMArea* area, addr_t address,
673	addr_t size, VMArea** _secondArea, bool kernel)
674{
675	addr_t offset;
676	if (!intersect_area(area, address, size, offset))
677		return B_OK;
678
679	// Is the area fully covered?
680	if (address == area->Base() && size == area->Size()) {
681		delete_area(addressSpace, area, false);
682		return B_OK;
683	}
684
685	int priority;
686	uint32 allocationFlags;
687	if (addressSpace == VMAddressSpace::Kernel()) {
688		priority = VM_PRIORITY_SYSTEM;
689		allocationFlags = HEAP_DONT_WAIT_FOR_MEMORY
690			| HEAP_DONT_LOCK_KERNEL_SPACE;
691	} else {
692		priority = VM_PRIORITY_USER;
693		allocationFlags = 0;
694	}
695
696	VMCache* cache = vm_area_get_locked_cache(area);
697	VMCacheChainLocker cacheChainLocker(cache);
698	cacheChainLocker.LockAllSourceCaches();
699
700	// If no one else uses the area's cache and it's an anonymous cache, we can
701	// resize or split it, too.
702	bool onlyCacheUser = cache->areas == area && area->cache_next == NULL
703		&& cache->consumers.IsEmpty() && area->cache_type == CACHE_TYPE_RAM;
704
705	const addr_t oldSize = area->Size();
706
707	// Cut the end only?
708	if (offset > 0 && size == area->Size() - offset) {
709		status_t error = addressSpace->ShrinkAreaTail(area, offset,
710			allocationFlags);
711		if (error != B_OK)
712			return error;
713
714		if (area->page_protections != NULL) {
715			uint8* newProtections = realloc_page_protections(
716				area->page_protections, area->Size(), allocationFlags);
717
718			if (newProtections == NULL) {
719				addressSpace->ShrinkAreaTail(area, oldSize, allocationFlags);
720				return B_NO_MEMORY;
721			}
722
723			area->page_protections = newProtections;
724		}
725
726		// unmap pages
727		unmap_pages(area, address, size);
728
729		if (onlyCacheUser) {
730			// Since VMCache::Resize() can temporarily drop the lock, we must
731			// unlock all lower caches to prevent locking order inversion.
732			cacheChainLocker.Unlock(cache);
733			cache->Resize(cache->virtual_base + offset, priority);
734			cache->ReleaseRefAndUnlock();
735		}
736
737		return B_OK;
738	}
739
740	// Cut the beginning only?
741	if (area->Base() == address) {
742		uint8* newProtections = NULL;
743		if (area->page_protections != NULL) {
744			// Allocate all memory before shifting as the shift might lose some
745			// bits.
746			newProtections = realloc_page_protections(NULL, area->Size(),
747				allocationFlags);
748
749			if (newProtections == NULL)
750				return B_NO_MEMORY;
751		}
752
753		// resize the area
754		status_t error = addressSpace->ShrinkAreaHead(area, area->Size() - size,
755			allocationFlags);
756		if (error != B_OK) {
757			if (newProtections != NULL)
758				free_etc(newProtections, allocationFlags);
759			return error;
760		}
761
762		if (area->page_protections != NULL) {
763			size_t oldBytes = area_page_protections_size(oldSize);
764			ssize_t pagesShifted = (oldSize - area->Size()) / B_PAGE_SIZE;
765			bitmap_shift<uint8>(area->page_protections, oldBytes * 8, -(pagesShifted * 4));
766
767			size_t bytes = area_page_protections_size(area->Size());
768			memcpy(newProtections, area->page_protections, bytes);
769			free_etc(area->page_protections, allocationFlags);
770			area->page_protections = newProtections;
771		}
772
773		// unmap pages
774		unmap_pages(area, address, size);
775
776		if (onlyCacheUser) {
777			// Since VMCache::Rebase() can temporarily drop the lock, we must
778			// unlock all lower caches to prevent locking order inversion.
779			cacheChainLocker.Unlock(cache);
780			cache->Rebase(cache->virtual_base + size, priority);
781			cache->ReleaseRefAndUnlock();
782		}
783		area->cache_offset += size;
784
785		return B_OK;
786	}
787
788	// The tough part -- cut a piece out of the middle of the area.
789	// We do that by shrinking the area to the begin section and creating a
790	// new area for the end section.
791	addr_t firstNewSize = offset;
792	addr_t secondBase = address + size;
793	addr_t secondSize = area->Size() - offset - size;
794
795	// unmap pages
796	unmap_pages(area, address, area->Size() - firstNewSize);
797
798	// resize the area
799	status_t error = addressSpace->ShrinkAreaTail(area, firstNewSize,
800		allocationFlags);
801	if (error != B_OK)
802		return error;
803
804	uint8* areaNewProtections = NULL;
805	uint8* secondAreaNewProtections = NULL;
806
807	// Try to allocate the new memory before making some hard to reverse
808	// changes.
809	if (area->page_protections != NULL) {
810		areaNewProtections = realloc_page_protections(NULL, area->Size(),
811			allocationFlags);
812		secondAreaNewProtections = realloc_page_protections(NULL, secondSize,
813			allocationFlags);
814
815		if (areaNewProtections == NULL || secondAreaNewProtections == NULL) {
816			addressSpace->ShrinkAreaTail(area, oldSize, allocationFlags);
817			free_etc(areaNewProtections, allocationFlags);
818			free_etc(secondAreaNewProtections, allocationFlags);
819			return B_NO_MEMORY;
820		}
821	}
822
823	virtual_address_restrictions addressRestrictions = {};
824	addressRestrictions.address = (void*)secondBase;
825	addressRestrictions.address_specification = B_EXACT_ADDRESS;
826	VMArea* secondArea;
827
828	if (onlyCacheUser) {
829		// Create a new cache for the second area.
830		VMCache* secondCache;
831		error = VMCacheFactory::CreateAnonymousCache(secondCache,
832			area->protection & B_OVERCOMMITTING_AREA, 0, 0,
833			dynamic_cast<VMAnonymousNoSwapCache*>(cache) == NULL, priority);
834		if (error != B_OK) {
835			addressSpace->ShrinkAreaTail(area, oldSize, allocationFlags);
836			free_etc(areaNewProtections, allocationFlags);
837			free_etc(secondAreaNewProtections, allocationFlags);
838			return error;
839		}
840
841		secondCache->Lock();
842		secondCache->temporary = cache->temporary;
843		secondCache->virtual_base = area->cache_offset;
844		secondCache->virtual_end = area->cache_offset + secondSize;
845
846		// Transfer the concerned pages from the first cache.
847		off_t adoptOffset = area->cache_offset + secondBase - area->Base();
848		error = secondCache->Adopt(cache, adoptOffset, secondSize,
849			area->cache_offset);
850
851		if (error == B_OK) {
852			// Since VMCache::Resize() can temporarily drop the lock, we must
853			// unlock all lower caches to prevent locking order inversion.
854			cacheChainLocker.Unlock(cache);
855			cache->Resize(cache->virtual_base + firstNewSize, priority);
856			// Don't unlock the cache yet because we might have to resize it
857			// back.
858
859			// Map the second area.
860			error = map_backing_store(addressSpace, secondCache,
861				area->cache_offset, area->name, secondSize, area->wiring,
862				area->protection, area->protection_max, REGION_NO_PRIVATE_MAP, 0,
863				&addressRestrictions, kernel, &secondArea, NULL);
864		}
865
866		if (error != B_OK) {
867			// Restore the original cache.
868			cache->Resize(cache->virtual_base + oldSize, priority);
869
870			// Move the pages back.
871			status_t readoptStatus = cache->Adopt(secondCache,
872				area->cache_offset, secondSize, adoptOffset);
873			if (readoptStatus != B_OK) {
874				// Some (swap) pages have not been moved back and will be lost
875				// once the second cache is deleted.
876				panic("failed to restore cache range: %s",
877					strerror(readoptStatus));
878
879				// TODO: Handle out of memory cases by freeing memory and
880				// retrying.
881			}
882
883			cache->ReleaseRefAndUnlock();
884			secondCache->ReleaseRefAndUnlock();
885			addressSpace->ShrinkAreaTail(area, oldSize, allocationFlags);
886			free_etc(areaNewProtections, allocationFlags);
887			free_etc(secondAreaNewProtections, allocationFlags);
888			return error;
889		}
890
891		// Now we can unlock it.
892		cache->ReleaseRefAndUnlock();
893		secondCache->Unlock();
894	} else {
895		error = map_backing_store(addressSpace, cache, area->cache_offset
896			+ (secondBase - area->Base()),
897			area->name, secondSize, area->wiring, area->protection,
898			area->protection_max, REGION_NO_PRIVATE_MAP, 0,
899			&addressRestrictions, kernel, &secondArea, NULL);
900		if (error != B_OK) {
901			addressSpace->ShrinkAreaTail(area, oldSize, allocationFlags);
902			free_etc(areaNewProtections, allocationFlags);
903			free_etc(secondAreaNewProtections, allocationFlags);
904			return error;
905		}
906		// We need a cache reference for the new area.
907		cache->AcquireRefLocked();
908	}
909
910	if (area->page_protections != NULL) {
911		// Copy the protection bits of the first area.
912		size_t areaBytes = area_page_protections_size(area->Size());
913		memcpy(areaNewProtections, area->page_protections, areaBytes);
914		uint8* areaOldProtections = area->page_protections;
915		area->page_protections = areaNewProtections;
916
917		// Shift the protection bits of the second area to the start of
918		// the old array.
919		size_t oldBytes = area_page_protections_size(oldSize);
920		addr_t secondAreaOffset = secondBase - area->Base();
921		ssize_t secondAreaPagesShifted = secondAreaOffset / B_PAGE_SIZE;
922		bitmap_shift<uint8>(areaOldProtections, oldBytes * 8, -(secondAreaPagesShifted * 4));
923
924		// Copy the protection bits of the second area.
925		size_t secondAreaBytes = area_page_protections_size(secondSize);
926		memcpy(secondAreaNewProtections, areaOldProtections, secondAreaBytes);
927		secondArea->page_protections = secondAreaNewProtections;
928
929		// We don't need this anymore.
930		free_etc(areaOldProtections, allocationFlags);
931
932		// Set the correct page protections for the second area.
933		VMTranslationMap* map = addressSpace->TranslationMap();
934		map->Lock();
935		for (VMCachePagesTree::Iterator it
936				= secondArea->cache->pages.GetIterator();
937				vm_page* page = it.Next();) {
938			if (is_page_in_area(secondArea, page)) {
939				addr_t address = virtual_page_address(secondArea, page);
940				uint32 pageProtection
941					= get_area_page_protection(secondArea, address);
942				map->ProtectPage(secondArea, address, pageProtection);
943			}
944		}
945		map->Unlock();
946	}
947
948	if (_secondArea != NULL)
949		*_secondArea = secondArea;
950
951	return B_OK;
952}
953
954
955/*!	Deletes or cuts all areas in the given address range.
956	The address space must be write-locked.
957	The caller must ensure that no part of the given range is wired.
958*/
959static status_t
960unmap_address_range(VMAddressSpace* addressSpace, addr_t address, addr_t size,
961	bool kernel)
962{
963	size = PAGE_ALIGN(size);
964
	// Check whether the caller is allowed to modify the concerned areas.
966	if (!kernel) {
967		for (VMAddressSpace::AreaRangeIterator it
968				= addressSpace->GetAreaRangeIterator(address, size);
969			VMArea* area = it.Next();) {
970
971			if ((area->protection & B_KERNEL_AREA) != 0) {
972				dprintf("unmap_address_range: team %" B_PRId32 " tried to "
973					"unmap range of kernel area %" B_PRId32 " (%s)\n",
974					team_get_current_team_id(), area->id, area->name);
975				return B_NOT_ALLOWED;
976			}
977		}
978	}
979
980	for (VMAddressSpace::AreaRangeIterator it
981			= addressSpace->GetAreaRangeIterator(address, size);
982		VMArea* area = it.Next();) {
983
984		status_t error = cut_area(addressSpace, area, address, size, NULL,
985			kernel);
986		if (error != B_OK)
987			return error;
988			// Failing after already messing with areas is ugly, but we
989			// can't do anything about it.
990	}
991
992	return B_OK;
993}
994
995
996static status_t
997discard_area_range(VMArea* area, addr_t address, addr_t size)
998{
999	addr_t offset;
1000	if (!intersect_area(area, address, size, offset))
1001		return B_OK;
1002
1003	// If someone else uses the area's cache or it's not an anonymous cache, we
1004	// can't discard.
1005	VMCache* cache = vm_area_get_locked_cache(area);
1006	if (cache->areas != area || area->cache_next != NULL
1007		|| !cache->consumers.IsEmpty() || cache->type != CACHE_TYPE_RAM) {
1008		return B_OK;
1009	}
1010
1011	VMCacheChainLocker cacheChainLocker(cache);
1012	cacheChainLocker.LockAllSourceCaches();
1013
1014	unmap_pages(area, address, size);
1015
1016	// Since VMCache::Discard() can temporarily drop the lock, we must
1017	// unlock all lower caches to prevent locking order inversion.
1018	cacheChainLocker.Unlock(cache);
1019	cache->Discard(cache->virtual_base + offset, size);
1020	cache->ReleaseRefAndUnlock();
1021
1022	return B_OK;
1023}
1024
1025
1026static status_t
1027discard_address_range(VMAddressSpace* addressSpace, addr_t address, addr_t size,
1028	bool kernel)
1029{
1030	for (VMAddressSpace::AreaRangeIterator it
1031		= addressSpace->GetAreaRangeIterator(address, size);
1032			VMArea* area = it.Next();) {
1033		status_t error = discard_area_range(area, address, size);
1034		if (error != B_OK)
1035			return error;
1036	}
1037
1038	return B_OK;
1039}
1040
1041
1042/*! You need to hold the lock of the cache and the write lock of the address
1043	space when calling this function.
	Note that in case of error your cache will be temporarily unlocked.
1045	If \a addressSpec is \c B_EXACT_ADDRESS and the
1046	\c CREATE_AREA_UNMAP_ADDRESS_RANGE flag is specified, the caller must ensure
1047	that no part of the specified address range (base \c *_virtualAddress, size
1048	\a size) is wired. The cache will also be temporarily unlocked.
1049*/
1050static status_t
1051map_backing_store(VMAddressSpace* addressSpace, VMCache* cache, off_t offset,
1052	const char* areaName, addr_t size, int wiring, int protection,
1053	int protectionMax, int mapping,
1054	uint32 flags, const virtual_address_restrictions* addressRestrictions,
1055	bool kernel, VMArea** _area, void** _virtualAddress)
1056{
1057	TRACE(("map_backing_store: aspace %p, cache %p, virtual %p, offset 0x%"
1058		B_PRIx64 ", size %" B_PRIuADDR ", addressSpec %" B_PRIu32 ", wiring %d"
1059		", protection %d, protectionMax %d, area %p, areaName '%s'\n",
1060		addressSpace, cache, addressRestrictions->address, offset, size,
1061		addressRestrictions->address_specification, wiring, protection,
1062		protectionMax, _area, areaName));
1063	cache->AssertLocked();
1064
1065	if (size == 0) {
1066#if KDEBUG
1067		panic("map_backing_store(): called with size=0 for area '%s'!",
1068			areaName);
1069#endif
1070		return B_BAD_VALUE;
1071	}
1072	if (offset < 0)
1073		return B_BAD_VALUE;
1074
1075	uint32 allocationFlags = HEAP_DONT_WAIT_FOR_MEMORY
1076		| HEAP_DONT_LOCK_KERNEL_SPACE;
1077	int priority;
1078	if (addressSpace != VMAddressSpace::Kernel()) {
1079		priority = VM_PRIORITY_USER;
1080	} else if ((flags & CREATE_AREA_PRIORITY_VIP) != 0) {
1081		priority = VM_PRIORITY_VIP;
1082		allocationFlags |= HEAP_PRIORITY_VIP;
1083	} else
1084		priority = VM_PRIORITY_SYSTEM;
1085
	VMArea* area = addressSpace->CreateArea(areaName, wiring, protection,
		allocationFlags);
	if (area == NULL)
		return B_NO_MEMORY;
	if (mapping != REGION_PRIVATE_MAP)
		area->protection_max = protectionMax & B_USER_PROTECTION;
1092
1093	status_t status;
1094
1095	// if this is a private map, we need to create a new cache
1096	// to handle the private copies of pages as they are written to
1097	VMCache* sourceCache = cache;
1098	if (mapping == REGION_PRIVATE_MAP) {
1099		VMCache* newCache;
1100
1101		// create an anonymous cache
1102		status = VMCacheFactory::CreateAnonymousCache(newCache,
1103			(protection & B_STACK_AREA) != 0
1104				|| (protection & B_OVERCOMMITTING_AREA) != 0, 0,
1105			cache->GuardSize() / B_PAGE_SIZE, true, VM_PRIORITY_USER);
1106		if (status != B_OK)
1107			goto err1;
1108
1109		newCache->Lock();
1110		newCache->temporary = 1;
1111		newCache->virtual_base = offset;
1112		newCache->virtual_end = offset + size;
1113
1114		cache->AddConsumer(newCache);
1115
1116		cache = newCache;
1117	}
1118
1119	if ((flags & CREATE_AREA_DONT_COMMIT_MEMORY) == 0) {
1120		status = cache->SetMinimalCommitment(size, priority);
1121		if (status != B_OK)
1122			goto err2;
1123	}
1124
1125	// check to see if this address space has entered DELETE state
1126	if (addressSpace->IsBeingDeleted()) {
1127		// okay, someone is trying to delete this address space now, so we can't
1128		// insert the area, so back out
1129		status = B_BAD_TEAM_ID;
1130		goto err2;
1131	}
1132
1133	if (addressRestrictions->address_specification == B_EXACT_ADDRESS
1134			&& (flags & CREATE_AREA_UNMAP_ADDRESS_RANGE) != 0) {
1135		// temporarily unlock the current cache since it might be mapped to
1136		// some existing area, and unmap_address_range also needs to lock that
1137		// cache to delete the area.
1138		cache->Unlock();
1139		status = unmap_address_range(addressSpace,
1140			(addr_t)addressRestrictions->address, size, kernel);
1141		cache->Lock();
1142		if (status != B_OK)
1143			goto err2;
1144	}
1145
1146	status = addressSpace->InsertArea(area, size, addressRestrictions,
1147		allocationFlags, _virtualAddress);
1148	if (status == B_NO_MEMORY
1149			&& addressRestrictions->address_specification == B_ANY_KERNEL_ADDRESS) {
1150		// Due to how many locks are held, we cannot wait here for space to be
1151		// freed up, but we can at least notify the low_resource handler.
1152		low_resource(B_KERNEL_RESOURCE_ADDRESS_SPACE, size, B_RELATIVE_TIMEOUT, 0);
1153	}
1154	if (status != B_OK)
1155		goto err2;
1156
1157	// attach the cache to the area
1158	area->cache = cache;
1159	area->cache_offset = offset;
1160
1161	// point the cache back to the area
1162	cache->InsertAreaLocked(area);
1163	if (mapping == REGION_PRIVATE_MAP)
1164		cache->Unlock();
1165
1166	// insert the area in the global areas map
1167	VMAreas::Insert(area);
1168
1169	// grab a ref to the address space (the area holds this)
1170	addressSpace->Get();
1171
1172//	ktrace_printf("map_backing_store: cache: %p (source: %p), \"%s\" -> %p",
1173//		cache, sourceCache, areaName, area);
1174
1175	*_area = area;
1176	return B_OK;
1177
1178err2:
1179	if (mapping == REGION_PRIVATE_MAP) {
		// We created this cache, so we must delete it again. Note that we
1181		// need to temporarily unlock the source cache or we'll otherwise
1182		// deadlock, since VMCache::_RemoveConsumer() will try to lock it, too.
1183		sourceCache->Unlock();
1184		cache->ReleaseRefAndUnlock();
1185		sourceCache->Lock();
1186	}
1187err1:
1188	addressSpace->DeleteArea(area, allocationFlags);
1189	return status;
1190}
1191
1192
1193/*!	Equivalent to wait_if_area_range_is_wired(area, area->Base(), area->Size(),
1194	  locker1, locker2).
1195*/
1196template<typename LockerType1, typename LockerType2>
1197static inline bool
1198wait_if_area_is_wired(VMArea* area, LockerType1* locker1, LockerType2* locker2)
1199{
1200	area->cache->AssertLocked();
1201
1202	VMAreaUnwiredWaiter waiter;
1203	if (!area->AddWaiterIfWired(&waiter))
1204		return false;
1205
1206	// unlock everything and wait
1207	if (locker1 != NULL)
1208		locker1->Unlock();
1209	if (locker2 != NULL)
1210		locker2->Unlock();
1211
1212	waiter.waitEntry.Wait();
1213
1214	return true;
1215}
1216
1217
1218/*!	Checks whether the given area has any wired ranges intersecting with the
1219	specified range and waits, if so.
1220
1221	When it has to wait, the function calls \c Unlock() on both \a locker1
1222	and \a locker2, if given.
1223	The area's top cache must be locked and must be unlocked as a side effect
1224	of calling \c Unlock() on either \a locker1 or \a locker2.
1225
1226	If the function does not have to wait it does not modify or unlock any
1227	object.
1228
1229	\param area The area to be checked.
1230	\param base The base address of the range to check.
1231	\param size The size of the address range to check.
	\param locker1 An object to be unlocked before starting to wait (may
		be \c NULL).
	\param locker2 An object to be unlocked before starting to wait (may
		be \c NULL).
1236	\return \c true, if the function had to wait, \c false otherwise.
1237*/
1238template<typename LockerType1, typename LockerType2>
1239static inline bool
1240wait_if_area_range_is_wired(VMArea* area, addr_t base, size_t size,
1241	LockerType1* locker1, LockerType2* locker2)
1242{
1243	area->cache->AssertLocked();
1244
1245	VMAreaUnwiredWaiter waiter;
1246	if (!area->AddWaiterIfWired(&waiter, base, size))
1247		return false;
1248
1249	// unlock everything and wait
1250	if (locker1 != NULL)
1251		locker1->Unlock();
1252	if (locker2 != NULL)
1253		locker2->Unlock();
1254
1255	waiter.waitEntry.Wait();
1256
1257	return true;
1258}
1259
1260
1261/*!	Checks whether the given address space has any wired ranges intersecting
1262	with the specified range and waits, if so.
1263
1264	Similar to wait_if_area_range_is_wired(), with the following differences:
1265	- All areas intersecting with the range are checked (respectively all until
1266	  one is found that contains a wired range intersecting with the given
1267	  range).
1268	- The given address space must at least be read-locked and must be unlocked
1269	  when \c Unlock() is called on \a locker.
1270	- None of the areas' caches are allowed to be locked.
1271*/
1272template<typename LockerType>
1273static inline bool
1274wait_if_address_range_is_wired(VMAddressSpace* addressSpace, addr_t base,
1275	size_t size, LockerType* locker)
1276{
1277	for (VMAddressSpace::AreaRangeIterator it
1278		= addressSpace->GetAreaRangeIterator(base, size);
1279			VMArea* area = it.Next();) {
1280
1281		AreaCacheLocker cacheLocker(vm_area_get_locked_cache(area));
1282
1283		if (wait_if_area_range_is_wired(area, base, size, locker, &cacheLocker))
1284			return true;
1285	}
1286
1287	return false;
1288}
1289
1290
1291/*!	Prepares an area to be used for vm_set_kernel_area_debug_protection().
1292	It must be called in a situation where the kernel address space may be
1293	locked.
1294*/
1295status_t
1296vm_prepare_kernel_area_debug_protection(area_id id, void** cookie)
1297{
1298	AddressSpaceReadLocker locker;
1299	VMArea* area;
1300	status_t status = locker.SetFromArea(id, area);
1301	if (status != B_OK)
1302		return status;
1303
1304	if (area->page_protections == NULL) {
1305		status = allocate_area_page_protections(area);
1306		if (status != B_OK)
1307			return status;
1308	}
1309
1310	*cookie = (void*)area;
1311	return B_OK;
1312}
1313
1314
1315/*!	This is a debug helper function that can only be used with very specific
1316	use cases.
1317	Sets protection for the given address range to the protection specified.
1318	If \a protection is 0 then the involved pages will be marked non-present
1319	in the translation map to cause a fault on access. The pages aren't
1320	actually unmapped however so that they can be marked present again with
1321	additional calls to this function. For this to work the area must be
1322	fully locked in memory so that the pages aren't otherwise touched.
1323	This function does not lock the kernel address space and needs to be
1324	supplied with a \a cookie retrieved from a successful call to
1325	vm_prepare_kernel_area_debug_protection().
1326*/
1327status_t
1328vm_set_kernel_area_debug_protection(void* cookie, void* _address, size_t size,
1329	uint32 protection)
1330{
1331	// check address range
1332	addr_t address = (addr_t)_address;
1333	size = PAGE_ALIGN(size);
1334
1335	if ((address % B_PAGE_SIZE) != 0
1336		|| (addr_t)address + size < (addr_t)address
1337		|| !IS_KERNEL_ADDRESS(address)
1338		|| !IS_KERNEL_ADDRESS((addr_t)address + size)) {
1339		return B_BAD_VALUE;
1340	}
1341
1342	// Translate the kernel protection to user protection as we only store that.
1343	if ((protection & B_KERNEL_READ_AREA) != 0)
1344		protection |= B_READ_AREA;
1345	if ((protection & B_KERNEL_WRITE_AREA) != 0)
1346		protection |= B_WRITE_AREA;
1347
1348	VMAddressSpace* addressSpace = VMAddressSpace::GetKernel();
1349	VMTranslationMap* map = addressSpace->TranslationMap();
1350	VMArea* area = (VMArea*)cookie;
1351
1352	addr_t offset = address - area->Base();
1353	if (area->Size() - offset < size) {
1354		panic("protect range not fully within supplied area");
1355		return B_BAD_VALUE;
1356	}
1357
1358	if (area->page_protections == NULL) {
1359		panic("area has no page protections");
1360		return B_BAD_VALUE;
1361	}
1362
1363	// Invalidate the mapping entries so any access to them will fault or
	// restore the mapping entries unchanged so that lookup will succeed again.
1365	map->Lock();
1366	map->DebugMarkRangePresent(address, address + size, protection != 0);
1367	map->Unlock();
1368
1369	// And set the proper page protections so that the fault case will actually
1370	// fail and not simply try to map a new page.
1371	for (addr_t pageAddress = address; pageAddress < address + size;
1372			pageAddress += B_PAGE_SIZE) {
1373		set_area_page_protection(area, pageAddress, protection);
1374	}
1375
1376	return B_OK;
1377}
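/*	A hypothetical usage sketch (variable names are made up, not from actual
	callers): make a wired kernel area fault on any access for debugging, then
	restore it later.

		void* cookie;
		if (vm_prepare_kernel_area_debug_protection(areaID, &cookie) == B_OK) {
			// mark the range non-present: any access faults
			vm_set_kernel_area_debug_protection(cookie, address, size, 0);
			// ... later: restore full kernel access
			vm_set_kernel_area_debug_protection(cookie, address, size,
				B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
		}
*/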
1378
1379
1380status_t
1381vm_block_address_range(const char* name, void* address, addr_t size)
1382{
1383	if (!arch_vm_supports_protection(0))
1384		return B_NOT_SUPPORTED;
1385
1386	AddressSpaceWriteLocker locker;
1387	status_t status = locker.SetTo(VMAddressSpace::KernelID());
1388	if (status != B_OK)
1389		return status;
1390
1391	VMAddressSpace* addressSpace = locker.AddressSpace();
1392
1393	// create an anonymous cache
1394	VMCache* cache;
1395	status = VMCacheFactory::CreateAnonymousCache(cache, false, 0, 0, false,
1396		VM_PRIORITY_SYSTEM);
1397	if (status != B_OK)
1398		return status;
1399
1400	cache->temporary = 1;
1401	cache->virtual_end = size;
1402	cache->Lock();
1403
1404	VMArea* area;
1405	virtual_address_restrictions addressRestrictions = {};
1406	addressRestrictions.address = address;
1407	addressRestrictions.address_specification = B_EXACT_ADDRESS;
1408	status = map_backing_store(addressSpace, cache, 0, name, size,
1409		B_ALREADY_WIRED, 0, REGION_NO_PRIVATE_MAP, 0, 0, &addressRestrictions,
1410		true, &area, NULL);
1411	if (status != B_OK) {
1412		cache->ReleaseRefAndUnlock();
1413		return status;
1414	}
1415
1416	cache->Unlock();
1417	area->cache_type = CACHE_TYPE_RAM;
1418	return area->id;
1419}
1420
1421
1422status_t
1423vm_unreserve_address_range(team_id team, void* address, addr_t size)
1424{
1425	AddressSpaceWriteLocker locker(team);
1426	if (!locker.IsLocked())
1427		return B_BAD_TEAM_ID;
1428
1429	VMAddressSpace* addressSpace = locker.AddressSpace();
1430	return addressSpace->UnreserveAddressRange((addr_t)address, size,
1431		addressSpace == VMAddressSpace::Kernel()
1432			? HEAP_DONT_WAIT_FOR_MEMORY | HEAP_DONT_LOCK_KERNEL_SPACE : 0);
1433}
1434
1435
1436status_t
1437vm_reserve_address_range(team_id team, void** _address, uint32 addressSpec,
1438	addr_t size, uint32 flags)
1439{
1440	if (size == 0)
1441		return B_BAD_VALUE;
1442
1443	AddressSpaceWriteLocker locker(team);
1444	if (!locker.IsLocked())
1445		return B_BAD_TEAM_ID;
1446
1447	virtual_address_restrictions addressRestrictions = {};
1448	addressRestrictions.address = *_address;
1449	addressRestrictions.address_specification = addressSpec;
1450	VMAddressSpace* addressSpace = locker.AddressSpace();
1451	return addressSpace->ReserveAddressRange(size, &addressRestrictions, flags,
1452		addressSpace == VMAddressSpace::Kernel()
1453			? HEAP_DONT_WAIT_FOR_MEMORY | HEAP_DONT_LOCK_KERNEL_SPACE : 0,
1454		_address);
1455}
1456
1457
1458area_id
1459vm_create_anonymous_area(team_id team, const char *name, addr_t size,
1460	uint32 wiring, uint32 protection, uint32 flags, addr_t guardSize,
1461	const virtual_address_restrictions* virtualAddressRestrictions,
1462	const physical_address_restrictions* physicalAddressRestrictions,
1463	bool kernel, void** _address)
1464{
1465	VMArea* area;
1466	VMCache* cache;
1467	vm_page* page = NULL;
1468	bool isStack = (protection & B_STACK_AREA) != 0;
1469	page_num_t guardPages;
1470	bool canOvercommit = false;
1471	uint32 pageAllocFlags = (flags & CREATE_AREA_DONT_CLEAR) == 0
1472		? VM_PAGE_ALLOC_CLEAR : 0;
1473
1474	TRACE(("create_anonymous_area [%" B_PRId32 "] %s: size 0x%" B_PRIxADDR "\n",
1475		team, name, size));
1476
1477	size = PAGE_ALIGN(size);
1478	guardSize = PAGE_ALIGN(guardSize);
1479	guardPages = guardSize / B_PAGE_SIZE;
1480
1481	if (size == 0 || size < guardSize)
1482		return B_BAD_VALUE;
1483	if (!arch_vm_supports_protection(protection))
1484		return B_NOT_SUPPORTED;
1485
1486	if (team == B_CURRENT_TEAM)
1487		team = VMAddressSpace::CurrentID();
1488	if (team < 0)
1489		return B_BAD_TEAM_ID;
1490
1491	if (isStack || (protection & B_OVERCOMMITTING_AREA) != 0)
1492		canOvercommit = true;
1493
1494#ifdef DEBUG_KERNEL_STACKS
1495	if ((protection & B_KERNEL_STACK_AREA) != 0)
1496		isStack = true;
1497#endif
1498
1499	// check parameters
1500	switch (virtualAddressRestrictions->address_specification) {
1501		case B_ANY_ADDRESS:
1502		case B_EXACT_ADDRESS:
1503		case B_BASE_ADDRESS:
1504		case B_ANY_KERNEL_ADDRESS:
1505		case B_ANY_KERNEL_BLOCK_ADDRESS:
1506		case B_RANDOMIZED_ANY_ADDRESS:
1507		case B_RANDOMIZED_BASE_ADDRESS:
1508			break;
1509
1510		default:
1511			return B_BAD_VALUE;
1512	}
1513
1514	// If low or high physical address restrictions are given, we force
1515	// B_CONTIGUOUS wiring, since only then we'll use
1516	// vm_page_allocate_page_run() which deals with those restrictions.
1517	if (physicalAddressRestrictions->low_address != 0
1518		|| physicalAddressRestrictions->high_address != 0) {
1519		wiring = B_CONTIGUOUS;
1520	}
1521
1522	physical_address_restrictions stackPhysicalRestrictions;
1523	bool doReserveMemory = false;
1524	switch (wiring) {
1525		case B_NO_LOCK:
1526			break;
1527		case B_FULL_LOCK:
1528		case B_LAZY_LOCK:
1529		case B_CONTIGUOUS:
1530			doReserveMemory = true;
1531			break;
1532		case B_ALREADY_WIRED:
1533			break;
1534		case B_LOMEM:
1535			stackPhysicalRestrictions = *physicalAddressRestrictions;
1536			stackPhysicalRestrictions.high_address = 16 * 1024 * 1024;
1537			physicalAddressRestrictions = &stackPhysicalRestrictions;
1538			wiring = B_CONTIGUOUS;
1539			doReserveMemory = true;
1540			break;
1541		case B_32_BIT_FULL_LOCK:
1542			if (B_HAIKU_PHYSICAL_BITS <= 32
1543				|| (uint64)vm_page_max_address() < (uint64)1 << 32) {
1544				wiring = B_FULL_LOCK;
1545				doReserveMemory = true;
1546				break;
1547			}
1548			// TODO: We don't really support this mode efficiently. Just fall
1549			// through for now ...
1550		case B_32_BIT_CONTIGUOUS:
1551			#if B_HAIKU_PHYSICAL_BITS > 32
1552				if (vm_page_max_address() >= (phys_addr_t)1 << 32) {
1553					stackPhysicalRestrictions = *physicalAddressRestrictions;
1554					stackPhysicalRestrictions.high_address
1555						= (phys_addr_t)1 << 32;
1556					physicalAddressRestrictions = &stackPhysicalRestrictions;
1557				}
1558			#endif
1559			wiring = B_CONTIGUOUS;
1560			doReserveMemory = true;
1561			break;
1562		default:
1563			return B_BAD_VALUE;
1564	}
1565
1566	// Optimization: For a single-page contiguous allocation without low/high
1567	// memory restriction B_FULL_LOCK wiring suffices.
1568	if (wiring == B_CONTIGUOUS && size == B_PAGE_SIZE
1569		&& physicalAddressRestrictions->low_address == 0
1570		&& physicalAddressRestrictions->high_address == 0) {
1571		wiring = B_FULL_LOCK;
1572	}
1573
1574	// For full lock or contiguous areas we're also going to map the pages and
1575	// thus need to reserve pages for the mapping backend upfront.
1576	addr_t reservedMapPages = 0;
1577	if (wiring == B_FULL_LOCK || wiring == B_CONTIGUOUS) {
1578		AddressSpaceWriteLocker locker;
1579		status_t status = locker.SetTo(team);
1580		if (status != B_OK)
1581			return status;
1582
1583		VMTranslationMap* map = locker.AddressSpace()->TranslationMap();
1584		reservedMapPages = map->MaxPagesNeededToMap(0, size - 1);
1585	}
1586
1587	int priority;
1588	if (team != VMAddressSpace::KernelID())
1589		priority = VM_PRIORITY_USER;
1590	else if ((flags & CREATE_AREA_PRIORITY_VIP) != 0)
1591		priority = VM_PRIORITY_VIP;
1592	else
1593		priority = VM_PRIORITY_SYSTEM;
1594
1595	// Reserve memory before acquiring the address space lock. This reduces the
1596	// chances of failure, since while holding the write lock to the address
1597	// space (if it is the kernel address space that is), the low memory handler
1598	// won't be able to free anything for us.
1599	addr_t reservedMemory = 0;
1600	if (doReserveMemory) {
1601		bigtime_t timeout = (flags & CREATE_AREA_DONT_WAIT) != 0 ? 0 : 1000000;
1602		if (vm_try_reserve_memory(size, priority, timeout) != B_OK)
1603			return B_NO_MEMORY;
1604		reservedMemory = size;
		// TODO: We don't reserve the memory for the pages for the page
		// directories/tables. We actually need to, since we currently don't
		// reclaim them (and probably can't reclaim all of them anyway). Thus
		// there are actually fewer physical pages than there should be, which
		// can get the VM into trouble in low memory situations.
1610	}
1611
1612	AddressSpaceWriteLocker locker;
1613	VMAddressSpace* addressSpace;
1614	status_t status;
1615
1616	// For full lock areas reserve the pages before locking the address
1617	// space. E.g. block caches can't release their memory while we hold the
1618	// address space lock.
1619	page_num_t reservedPages = reservedMapPages;
1620	if (wiring == B_FULL_LOCK)
1621		reservedPages += size / B_PAGE_SIZE;
1622
1623	vm_page_reservation reservation;
1624	if (reservedPages > 0) {
1625		if ((flags & CREATE_AREA_DONT_WAIT) != 0) {
1626			if (!vm_page_try_reserve_pages(&reservation, reservedPages,
1627					priority)) {
1628				reservedPages = 0;
1629				status = B_WOULD_BLOCK;
1630				goto err0;
1631			}
1632		} else
1633			vm_page_reserve_pages(&reservation, reservedPages, priority);
1634	}
1635
1636	if (wiring == B_CONTIGUOUS) {
1637		// we try to allocate the page run here upfront as this may easily
1638		// fail for obvious reasons
1639		page = vm_page_allocate_page_run(PAGE_STATE_WIRED | pageAllocFlags,
1640			size / B_PAGE_SIZE, physicalAddressRestrictions, priority);
1641		if (page == NULL) {
1642			status = B_NO_MEMORY;
1643			goto err0;
1644		}
1645	}
1646
1647	// Lock the address space and, if B_EXACT_ADDRESS and
1648	// CREATE_AREA_UNMAP_ADDRESS_RANGE were specified, ensure the address range
1649	// is not wired.
1650	do {
1651		status = locker.SetTo(team);
1652		if (status != B_OK)
1653			goto err1;
1654
1655		addressSpace = locker.AddressSpace();
1656	} while (virtualAddressRestrictions->address_specification
1657			== B_EXACT_ADDRESS
1658		&& (flags & CREATE_AREA_UNMAP_ADDRESS_RANGE) != 0
1659		&& wait_if_address_range_is_wired(addressSpace,
1660			(addr_t)virtualAddressRestrictions->address, size, &locker));
1661
1662	// create an anonymous cache
1663	// if it's a stack, make sure that two pages are available at least
1664	status = VMCacheFactory::CreateAnonymousCache(cache, canOvercommit,
1665		isStack ? (min_c(2, size / B_PAGE_SIZE - guardPages)) : 0, guardPages,
1666		wiring == B_NO_LOCK, priority);
1667	if (status != B_OK)
1668		goto err1;
1669
1670	cache->temporary = 1;
1671	cache->virtual_end = size;
1672	cache->committed_size = reservedMemory;
1673		// TODO: This should be done via a method.
1674	reservedMemory = 0;
1675
1676	cache->Lock();
1677
1678	status = map_backing_store(addressSpace, cache, 0, name, size, wiring,
1679		protection, 0, REGION_NO_PRIVATE_MAP, flags,
1680		virtualAddressRestrictions, kernel, &area, _address);
1681
1682	if (status != B_OK) {
1683		cache->ReleaseRefAndUnlock();
1684		goto err1;
1685	}
1686
1687	locker.DegradeToReadLock();
1688
1689	switch (wiring) {
1690		case B_NO_LOCK:
1691		case B_LAZY_LOCK:
1692			// do nothing - the pages are mapped in as needed
1693			break;
1694
1695		case B_FULL_LOCK:
1696		{
1697			// Allocate and map all pages for this area
1698
1699			off_t offset = 0;
1700			for (addr_t address = area->Base();
1701					address < area->Base() + (area->Size() - 1);
1702					address += B_PAGE_SIZE, offset += B_PAGE_SIZE) {
1703#ifdef DEBUG_KERNEL_STACKS
1704#	ifdef STACK_GROWS_DOWNWARDS
1705				if (isStack && address < area->Base()
1706						+ KERNEL_STACK_GUARD_PAGES * B_PAGE_SIZE)
1707#	else
1708				if (isStack && address >= area->Base() + area->Size()
1709						- KERNEL_STACK_GUARD_PAGES * B_PAGE_SIZE)
1710#	endif
1711					continue;
1712#endif
1713				vm_page* page = vm_page_allocate_page(&reservation,
1714					PAGE_STATE_WIRED | pageAllocFlags);
1715				cache->InsertPage(page, offset);
1716				map_page(area, page, address, protection, &reservation);
1717
1718				DEBUG_PAGE_ACCESS_END(page);
1719			}
1720
1721			break;
1722		}
1723
1724		case B_ALREADY_WIRED:
1725		{
1726			// The pages should already be mapped. This is only really useful
1727			// during boot time. Find the appropriate vm_page objects and stick
1728			// them in the cache object.
1729			VMTranslationMap* map = addressSpace->TranslationMap();
1730			off_t offset = 0;
1731
1732			if (!gKernelStartup)
1733				panic("ALREADY_WIRED flag used outside kernel startup\n");
1734
1735			map->Lock();
1736
1737			for (addr_t virtualAddress = area->Base();
1738					virtualAddress < area->Base() + (area->Size() - 1);
1739					virtualAddress += B_PAGE_SIZE, offset += B_PAGE_SIZE) {
1740				phys_addr_t physicalAddress;
1741				uint32 flags;
1742				status = map->Query(virtualAddress, &physicalAddress, &flags);
1743				if (status < B_OK) {
1744					panic("looking up mapping failed for va 0x%lx\n",
1745						virtualAddress);
1746				}
1747				page = vm_lookup_page(physicalAddress / B_PAGE_SIZE);
1748				if (page == NULL) {
1749					panic("looking up page failed for pa %#" B_PRIxPHYSADDR
1750						"\n", physicalAddress);
1751				}
1752
1753				DEBUG_PAGE_ACCESS_START(page);
1754
1755				cache->InsertPage(page, offset);
1756				increment_page_wired_count(page);
1757				vm_page_set_state(page, PAGE_STATE_WIRED);
1758				page->busy = false;
1759
1760				DEBUG_PAGE_ACCESS_END(page);
1761			}
1762
1763			map->Unlock();
1764			break;
1765		}
1766
1767		case B_CONTIGUOUS:
1768		{
			// We have already allocated our contiguous page run, so we can now
			// just map the pages in the address space
1771			VMTranslationMap* map = addressSpace->TranslationMap();
1772			phys_addr_t physicalAddress
1773				= (phys_addr_t)page->physical_page_number * B_PAGE_SIZE;
1774			addr_t virtualAddress = area->Base();
1775			off_t offset = 0;
1776
1777			map->Lock();
1778
1779			for (virtualAddress = area->Base(); virtualAddress < area->Base()
1780					+ (area->Size() - 1); virtualAddress += B_PAGE_SIZE,
1781					offset += B_PAGE_SIZE, physicalAddress += B_PAGE_SIZE) {
1782				page = vm_lookup_page(physicalAddress / B_PAGE_SIZE);
1783				if (page == NULL)
1784					panic("couldn't lookup physical page just allocated\n");
1785
1786				status = map->Map(virtualAddress, physicalAddress, protection,
1787					area->MemoryType(), &reservation);
1788				if (status < B_OK)
1789					panic("couldn't map physical page in page run\n");
1790
1791				cache->InsertPage(page, offset);
1792				increment_page_wired_count(page);
1793
1794				DEBUG_PAGE_ACCESS_END(page);
1795			}
1796
1797			map->Unlock();
1798			break;
1799		}
1800
1801		default:
1802			break;
1803	}
1804
1805	cache->Unlock();
1806
1807	if (reservedPages > 0)
1808		vm_page_unreserve_pages(&reservation);
1809
1810	TRACE(("vm_create_anonymous_area: done\n"));
1811
1812	area->cache_type = CACHE_TYPE_RAM;
1813	return area->id;
1814
1815err1:
1816	if (wiring == B_CONTIGUOUS) {
1817		// we had reserved the area space upfront...
1818		phys_addr_t pageNumber = page->physical_page_number;
1819		int32 i;
1820		for (i = size / B_PAGE_SIZE; i-- > 0; pageNumber++) {
1821			page = vm_lookup_page(pageNumber);
1822			if (page == NULL)
1823				panic("couldn't lookup physical page just allocated\n");
1824
1825			vm_page_set_state(page, PAGE_STATE_FREE);
1826		}
1827	}
1828
1829err0:
1830	if (reservedPages > 0)
1831		vm_page_unreserve_pages(&reservation);
1832	if (reservedMemory > 0)
1833		vm_unreserve_memory(reservedMemory);
1834
1835	return status;
1836}
1837
1838
1839area_id
1840vm_map_physical_memory(team_id team, const char* name, void** _address,
1841	uint32 addressSpec, addr_t size, uint32 protection,
1842	phys_addr_t physicalAddress, bool alreadyWired)
1843{
1844	VMArea* area;
1845	VMCache* cache;
1846	addr_t mapOffset;
1847
1848	TRACE(("vm_map_physical_memory(aspace = %" B_PRId32 ", \"%s\", virtual = %p"
1849		", spec = %" B_PRIu32 ", size = %" B_PRIxADDR ", protection = %"
1850		B_PRIu32 ", phys = %#" B_PRIxPHYSADDR ")\n", team, name, *_address,
1851		addressSpec, size, protection, physicalAddress));
1852
1853	if (!arch_vm_supports_protection(protection))
1854		return B_NOT_SUPPORTED;
1855
1856	AddressSpaceWriteLocker locker(team);
1857	if (!locker.IsLocked())
1858		return B_BAD_TEAM_ID;
1859
1860	// if the physical address is not page-aligned, move the area start down
1861	// so that it begins on a page boundary
1862	mapOffset = physicalAddress % B_PAGE_SIZE;
1863	size += mapOffset;
1864	physicalAddress -= mapOffset;
1865
1866	size = PAGE_ALIGN(size);
1867
1868	// create a device cache
1869	status_t status = VMCacheFactory::CreateDeviceCache(cache, physicalAddress);
1870	if (status != B_OK)
1871		return status;
1872
1873	cache->virtual_end = size;
1874
1875	cache->Lock();
1876
1877	virtual_address_restrictions addressRestrictions = {};
1878	addressRestrictions.address = *_address;
1879	addressRestrictions.address_specification = addressSpec & ~B_MTR_MASK;
1880	status = map_backing_store(locker.AddressSpace(), cache, 0, name, size,
1881		B_FULL_LOCK, protection, 0, REGION_NO_PRIVATE_MAP, 0, &addressRestrictions,
1882		true, &area, _address);
1883
1884	if (status < B_OK)
1885		cache->ReleaseRefLocked();
1886
1887	cache->Unlock();
1888
1889	if (status == B_OK) {
1890		// set requested memory type -- use uncached if none was given
1891		uint32 memoryType = addressSpec & B_MTR_MASK;
1892		if (memoryType == 0)
1893			memoryType = B_MTR_UC;
1894
1895		area->SetMemoryType(memoryType);
1896
1897		status = arch_vm_set_memory_type(area, physicalAddress, memoryType);
1898		if (status != B_OK)
1899			delete_area(locker.AddressSpace(), area, false);
1900	}
1901
1902	if (status != B_OK)
1903		return status;
1904
1905	VMTranslationMap* map = locker.AddressSpace()->TranslationMap();
1906
1907	if (alreadyWired) {
1908		// The area is already mapped, but possibly not with the right
1909		// memory type.
1910		map->Lock();
1911		map->ProtectArea(area, area->protection);
1912		map->Unlock();
1913	} else {
1914		// Map the area completely.
1915
1916		// reserve pages needed for the mapping
1917		size_t reservePages = map->MaxPagesNeededToMap(area->Base(),
1918			area->Base() + (size - 1));
1919		vm_page_reservation reservation;
1920		vm_page_reserve_pages(&reservation, reservePages,
1921			team == VMAddressSpace::KernelID()
1922				? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
1923
1924		map->Lock();
1925
1926		for (addr_t offset = 0; offset < size; offset += B_PAGE_SIZE) {
1927			map->Map(area->Base() + offset, physicalAddress + offset,
1928				protection, area->MemoryType(), &reservation);
1929		}
1930
1931		map->Unlock();
1932
1933		vm_page_unreserve_pages(&reservation);
1934	}
1935
1936	// offset the returned pointer into the new area by the same amount
1937	// by which the physical address was offset above
1938	*_address = (void*)((addr_t)*_address + mapOffset);
1939
1940	area->cache_type = CACHE_TYPE_DEVICE;
1941	return area->id;
1942}
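
// A minimal usage sketch (hypothetical driver code, kept compiled out):
// mapping a device's register window into the kernel address space. The
// physical base address and the area name are made-up placeholders.
#if 0
static area_id
example_map_mmio_registers()
{
	void* virtualBase = NULL;
	area_id area = vm_map_physical_memory(VMAddressSpace::KernelID(),
		"example mmio registers", &virtualBase, B_ANY_KERNEL_ADDRESS,
		B_PAGE_SIZE, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA,
		(phys_addr_t)0xfed00000, false);
	if (area < 0)
		return area;

	// virtualBase now points into the new area, adjusted by the same
	// sub-page offset the physical address had.
	return area;
}
#endif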
1943
1944
1945/*!	Don't use!
1946	TODO: This function was introduced to map physical page vecs to
1947	contiguous virtual memory in IOBuffer::GetNextVirtualVec(). It uses a
1948	device cache and therefore does not track vm_page::wired_count!
1949*/
1950area_id
1951vm_map_physical_memory_vecs(team_id team, const char* name, void** _address,
1952	uint32 addressSpec, addr_t* _size, uint32 protection,
1953	struct generic_io_vec* vecs, uint32 vecCount)
1954{
1955	TRACE(("vm_map_physical_memory_vecs(team = %" B_PRId32 ", \"%s\", virtual "
1956		"= %p, spec = %" B_PRIu32 ", _size = %p, protection = %" B_PRIu32 ", "
1957		"vecs = %p, vecCount = %" B_PRIu32 ")\n", team, name, *_address,
1958		addressSpec, _size, protection, vecs, vecCount));
1959
1960	if (!arch_vm_supports_protection(protection)
1961		|| (addressSpec & B_MTR_MASK) != 0) {
1962		return B_NOT_SUPPORTED;
1963	}
1964
1965	AddressSpaceWriteLocker locker(team);
1966	if (!locker.IsLocked())
1967		return B_BAD_TEAM_ID;
1968
1969	if (vecCount == 0)
1970		return B_BAD_VALUE;
1971
1972	addr_t size = 0;
1973	for (uint32 i = 0; i < vecCount; i++) {
1974		if (vecs[i].base % B_PAGE_SIZE != 0
1975			|| vecs[i].length % B_PAGE_SIZE != 0) {
1976			return B_BAD_VALUE;
1977		}
1978
1979		size += vecs[i].length;
1980	}
1981
1982	// create a device cache
1983	VMCache* cache;
1984	status_t result = VMCacheFactory::CreateDeviceCache(cache, vecs[0].base);
1985	if (result != B_OK)
1986		return result;
1987
1988	cache->virtual_end = size;
1989
1990	cache->Lock();
1991
1992	VMArea* area;
1993	virtual_address_restrictions addressRestrictions = {};
1994	addressRestrictions.address = *_address;
1995	addressRestrictions.address_specification = addressSpec & ~B_MTR_MASK;
1996	result = map_backing_store(locker.AddressSpace(), cache, 0, name,
1997		size, B_FULL_LOCK, protection, 0, REGION_NO_PRIVATE_MAP, 0,
1998		&addressRestrictions, true, &area, _address);
1999
2000	if (result != B_OK)
2001		cache->ReleaseRefLocked();
2002
2003	cache->Unlock();
2004
2005	if (result != B_OK)
2006		return result;
2007
2008	VMTranslationMap* map = locker.AddressSpace()->TranslationMap();
2009	size_t reservePages = map->MaxPagesNeededToMap(area->Base(),
2010		area->Base() + (size - 1));
2011
2012	vm_page_reservation reservation;
2013	vm_page_reserve_pages(&reservation, reservePages,
2014			team == VMAddressSpace::KernelID()
2015				? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
2016	map->Lock();
2017
2018	uint32 vecIndex = 0;
2019	size_t vecOffset = 0;
2020	for (addr_t offset = 0; offset < size; offset += B_PAGE_SIZE) {
2021		while (vecIndex < vecCount && vecOffset >= vecs[vecIndex].length) {
2022			vecOffset = 0;
2023			vecIndex++;
2024		}
2025
2026		if (vecIndex >= vecCount)
2027			break;
2028
2029		map->Map(area->Base() + offset, vecs[vecIndex].base + vecOffset,
2030			protection, area->MemoryType(), &reservation);
2031
2032		vecOffset += B_PAGE_SIZE;
2033	}
2034
2035	map->Unlock();
2036	vm_page_unreserve_pages(&reservation);
2037
2038	if (_size != NULL)
2039		*_size = size;
2040
2041	area->cache_type = CACHE_TYPE_DEVICE;
2042	return area->id;
2043}
2044
2045
2046area_id
2047vm_create_null_area(team_id team, const char* name, void** address,
2048	uint32 addressSpec, addr_t size, uint32 flags)
2049{
2050	size = PAGE_ALIGN(size);
2051
2052	// Lock the address space and, if B_EXACT_ADDRESS and
2053	// CREATE_AREA_UNMAP_ADDRESS_RANGE were specified, ensure the address range
2054	// is not wired.
2055	AddressSpaceWriteLocker locker;
2056	do {
2057		if (locker.SetTo(team) != B_OK)
2058			return B_BAD_TEAM_ID;
2059	} while (addressSpec == B_EXACT_ADDRESS
2060		&& (flags & CREATE_AREA_UNMAP_ADDRESS_RANGE) != 0
2061		&& wait_if_address_range_is_wired(locker.AddressSpace(),
2062			(addr_t)*address, size, &locker));
2063
2064	// create a null cache
2065	int priority = (flags & CREATE_AREA_PRIORITY_VIP) != 0
2066		? VM_PRIORITY_VIP : VM_PRIORITY_SYSTEM;
2067	VMCache* cache;
2068	status_t status = VMCacheFactory::CreateNullCache(priority, cache);
2069	if (status != B_OK)
2070		return status;
2071
2072	cache->temporary = 1;
2073	cache->virtual_end = size;
2074
2075	cache->Lock();
2076
2077	VMArea* area;
2078	virtual_address_restrictions addressRestrictions = {};
2079	addressRestrictions.address = *address;
2080	addressRestrictions.address_specification = addressSpec;
2081	status = map_backing_store(locker.AddressSpace(), cache, 0, name, size,
2082		B_LAZY_LOCK, B_KERNEL_READ_AREA, B_KERNEL_READ_AREA,
2083		REGION_NO_PRIVATE_MAP, flags,
2084		&addressRestrictions, true, &area, address);
2085
2086	if (status < B_OK) {
2087		cache->ReleaseRefAndUnlock();
2088		return status;
2089	}
2090
2091	cache->Unlock();
2092
2093	area->cache_type = CACHE_TYPE_NULL;
2094	return area->id;
2095}
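
// A minimal usage sketch (hypothetical, kept compiled out): reserving an
// exact kernel address range with a null-backed area so that nothing else
// can be mapped there; the range provides no usable memory.
#if 0
static area_id
example_reserve_address_range(void* address, size_t size)
{
	return vm_create_null_area(VMAddressSpace::KernelID(),
		"example reserved range", &address, B_EXACT_ADDRESS, size, 0);
}
#endif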
2096
2097
2098/*!	Creates the vnode cache for the specified \a vnode.
2099	The vnode has to be marked busy when calling this function.
2100*/
2101status_t
2102vm_create_vnode_cache(struct vnode* vnode, struct VMCache** cache)
2103{
2104	return VMCacheFactory::CreateVnodeCache(*cache, vnode);
2105}
2106
2107
2108/*!	\a cache must be locked. The area's address space must be read-locked.
2109*/
2110static void
2111pre_map_area_pages(VMArea* area, VMCache* cache,
2112	vm_page_reservation* reservation)
2113{
2114	addr_t baseAddress = area->Base();
2115	addr_t cacheOffset = area->cache_offset;
2116	page_num_t firstPage = cacheOffset / B_PAGE_SIZE;
2117	page_num_t endPage = firstPage + area->Size() / B_PAGE_SIZE;
2118
2119	for (VMCachePagesTree::Iterator it
2120				= cache->pages.GetIterator(firstPage, true, true);
2121			vm_page* page = it.Next();) {
2122		if (page->cache_offset >= endPage)
2123			break;
2124
2125		// skip busy and inactive pages
2126		if (page->busy || page->usage_count == 0)
2127			continue;
2128
2129		DEBUG_PAGE_ACCESS_START(page);
2130		map_page(area, page,
2131			baseAddress + (page->cache_offset * B_PAGE_SIZE - cacheOffset),
2132			B_READ_AREA | B_KERNEL_READ_AREA, reservation);
2133		DEBUG_PAGE_ACCESS_END(page);
2134	}
2135}
2136
2137
2138/*!	Will map the file specified by \a fd to an area in memory.
2139	The file will be mirrored beginning at the specified \a offset. The
2140	\a offset and \a size arguments have to be page aligned.
2141*/
2142static area_id
2143_vm_map_file(team_id team, const char* name, void** _address,
2144	uint32 addressSpec, size_t size, uint32 protection, uint32 mapping,
2145	bool unmapAddressRange, int fd, off_t offset, bool kernel)
2146{
2147	// TODO: For binary files we want to make sure that they get a
2148	//	snapshot of the file at mapping time, i.e. later changes should not
2149	//	make it into the mapped copy -- doing this in a clean way will
2150	//	require quite a few changes.
2151	TRACE(("_vm_map_file(fd = %d, offset = %" B_PRIdOFF ", size = %lu, mapping "
2152		"%" B_PRIu32 ")\n", fd, offset, size, mapping));
2153
2154	offset = ROUNDDOWN(offset, B_PAGE_SIZE);
2155	size = PAGE_ALIGN(size);
2156
2157	if (mapping == REGION_NO_PRIVATE_MAP)
2158		protection |= B_SHARED_AREA;
2159	if (addressSpec != B_EXACT_ADDRESS)
2160		unmapAddressRange = false;
2161
2162	uint32 mappingFlags = 0;
2163	if (unmapAddressRange)
2164		mappingFlags |= CREATE_AREA_UNMAP_ADDRESS_RANGE;
2165
2166	if (fd < 0) {
2167		virtual_address_restrictions virtualRestrictions = {};
2168		virtualRestrictions.address = *_address;
2169		virtualRestrictions.address_specification = addressSpec;
2170		physical_address_restrictions physicalRestrictions = {};
2171		return vm_create_anonymous_area(team, name, size, B_NO_LOCK, protection,
2172			mappingFlags, 0, &virtualRestrictions, &physicalRestrictions, kernel,
2173			_address);
2174	}
2175
2176	// get the open flags of the FD
2177	file_descriptor* descriptor = get_fd(get_current_io_context(kernel), fd);
2178	if (descriptor == NULL)
2179		return EBADF;
2180	int32 openMode = descriptor->open_mode;
2181	put_fd(descriptor);
2182
2183	// The FD must be open for reading in any case. For a shared mapping with
2184	// write access, the FD must additionally be open for writing.
2185	if ((openMode & O_ACCMODE) == O_WRONLY
2186		|| (mapping == REGION_NO_PRIVATE_MAP
2187			&& (protection & (B_WRITE_AREA | B_KERNEL_WRITE_AREA)) != 0
2188			&& (openMode & O_ACCMODE) == O_RDONLY)) {
2189		return EACCES;
2190	}
2191
2192	uint32 protectionMax = 0;
2193	if (mapping == REGION_NO_PRIVATE_MAP) {
2194		if ((openMode & O_ACCMODE) == O_RDWR)
2195			protectionMax = protection | B_USER_PROTECTION;
2196		else
2197			protectionMax = protection | (B_USER_PROTECTION & ~B_WRITE_AREA);
2198	} else if (mapping == REGION_PRIVATE_MAP) {
2199		// For privately mapped read-only regions, skip committing memory.
2200		// (If protections are changed later on, memory will be committed then.)
2201		if ((protection & B_WRITE_AREA) == 0)
2202			mappingFlags |= CREATE_AREA_DONT_COMMIT_MEMORY;
2203	}
2204
2205	// get the vnode for the object; this also grabs a reference to it
2206	struct vnode* vnode = NULL;
2207	status_t status = vfs_get_vnode_from_fd(fd, kernel, &vnode);
2208	if (status < B_OK)
2209		return status;
2210	VnodePutter vnodePutter(vnode);
2211
2212	// If we're going to pre-map pages, we need to reserve the pages needed by
2213	// the mapping backend upfront.
2214	page_num_t reservedPreMapPages = 0;
2215	vm_page_reservation reservation;
2216	if ((protection & B_READ_AREA) != 0) {
2217		AddressSpaceWriteLocker locker;
2218		status = locker.SetTo(team);
2219		if (status != B_OK)
2220			return status;
2221
2222		VMTranslationMap* map = locker.AddressSpace()->TranslationMap();
2223		reservedPreMapPages = map->MaxPagesNeededToMap(0, size - 1);
2224
2225		locker.Unlock();
2226
2227		vm_page_reserve_pages(&reservation, reservedPreMapPages,
2228			team == VMAddressSpace::KernelID()
2229				? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
2230	}
2231
2232	struct PageUnreserver {
2233		PageUnreserver(vm_page_reservation* reservation)
2234			:
2235			fReservation(reservation)
2236		{
2237		}
2238
2239		~PageUnreserver()
2240		{
2241			if (fReservation != NULL)
2242				vm_page_unreserve_pages(fReservation);
2243		}
2244
2245		vm_page_reservation* fReservation;
2246	} pageUnreserver(reservedPreMapPages > 0 ? &reservation : NULL);
2247
2248	// Lock the address space and, if the specified address range shall be
2249	// unmapped, ensure it is not wired.
2250	AddressSpaceWriteLocker locker;
2251	do {
2252		if (locker.SetTo(team) != B_OK)
2253			return B_BAD_TEAM_ID;
2254	} while (unmapAddressRange
2255		&& wait_if_address_range_is_wired(locker.AddressSpace(),
2256			(addr_t)*_address, size, &locker));
2257
2258	// TODO: this only works for file systems that use the file cache
2259	VMCache* cache;
2260	status = vfs_get_vnode_cache(vnode, &cache, false);
2261	if (status < B_OK)
2262		return status;
2263
2264	cache->Lock();
2265
2266	VMArea* area;
2267	virtual_address_restrictions addressRestrictions = {};
2268	addressRestrictions.address = *_address;
2269	addressRestrictions.address_specification = addressSpec;
2270	status = map_backing_store(locker.AddressSpace(), cache, offset, name, size,
2271		0, protection, protectionMax, mapping, mappingFlags,
2272		&addressRestrictions, kernel, &area, _address);
2273
2274	if (status != B_OK || mapping == REGION_PRIVATE_MAP) {
2275		// map_backing_store() cannot know we no longer need the ref
2276		cache->ReleaseRefLocked();
2277	}
2278
2279	if (status == B_OK && (protection & B_READ_AREA) != 0)
2280		pre_map_area_pages(area, cache, &reservation);
2281
2282	cache->Unlock();
2283
2284	if (status == B_OK) {
2285		// TODO: this probably deserves a smarter solution, i.e. don't always
2286		// prefetch, and probably don't trigger it at this place either.
2287		cache_prefetch_vnode(vnode, offset, min_c(size, 10LL * 1024 * 1024));
2288			// prefetches at max 10 MB starting from "offset"
2289	}
2290
2291	if (status != B_OK)
2292		return status;
2293
2294	area->cache_type = CACHE_TYPE_VNODE;
2295	return area->id;
2296}
2297
2298
2299area_id
2300vm_map_file(team_id aid, const char* name, void** address, uint32 addressSpec,
2301	addr_t size, uint32 protection, uint32 mapping, bool unmapAddressRange,
2302	int fd, off_t offset)
2303{
2304	if (!arch_vm_supports_protection(protection))
2305		return B_NOT_SUPPORTED;
2306
2307	return _vm_map_file(aid, name, address, addressSpec, size, protection,
2308		mapping, unmapAddressRange, fd, offset, true);
2309}
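
// A minimal usage sketch (hypothetical, kept compiled out): privately
// mapping the first 64 KB of an already opened file read-only into the
// kernel address space. The size and the area name are placeholders.
#if 0
static area_id
example_map_file_read_only(int fd)
{
	void* address = NULL;
	return vm_map_file(VMAddressSpace::KernelID(), "example file mapping",
		&address, B_ANY_KERNEL_ADDRESS, 64 * 1024, B_KERNEL_READ_AREA,
		REGION_PRIVATE_MAP, false, fd, 0);
}
#endif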
2310
2311
2312VMCache*
2313vm_area_get_locked_cache(VMArea* area)
2314{
2315	rw_lock_read_lock(&sAreaCacheLock);
2316
2317	while (true) {
2318		VMCache* cache = area->cache;
2319
2320		if (!cache->SwitchFromReadLock(&sAreaCacheLock)) {
2321			// cache has been deleted
2322			rw_lock_read_lock(&sAreaCacheLock);
2323			continue;
2324		}
2325
2326		rw_lock_read_lock(&sAreaCacheLock);
2327
2328		if (cache == area->cache) {
2329			cache->AcquireRefLocked();
2330			rw_lock_read_unlock(&sAreaCacheLock);
2331			return cache;
2332		}
2333
2334		// the cache changed in the meantime
2335		cache->Unlock();
2336	}
2337}
2338
2339
2340void
2341vm_area_put_locked_cache(VMCache* cache)
2342{
2343	cache->ReleaseRefAndUnlock();
2344}
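
// A minimal sketch of the expected pairing (kept compiled out): every
// vm_area_get_locked_cache() has to be balanced by a
// vm_area_put_locked_cache(), which drops both the reference and the lock
// acquired above.
#if 0
static void
example_with_locked_area_cache(VMArea* area)
{
	VMCache* cache = vm_area_get_locked_cache(area);
	// ... inspect or modify the cache while it is locked ...
	vm_area_put_locked_cache(cache);
}
#endif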
2345
2346
2347area_id
2348vm_clone_area(team_id team, const char* name, void** address,
2349	uint32 addressSpec, uint32 protection, uint32 mapping, area_id sourceID,
2350	bool kernel)
2351{
2352	VMArea* newArea = NULL;
2353	VMArea* sourceArea;
2354
2355	// Check whether the source area exists and is cloneable. If so, mark it
2356	// B_SHARED_AREA, so that we don't get problems with copy-on-write.
2357	{
2358		AddressSpaceWriteLocker locker;
2359		status_t status = locker.SetFromArea(sourceID, sourceArea);
2360		if (status != B_OK)
2361			return status;
2362
2363		if (!kernel && (sourceArea->protection & B_KERNEL_AREA) != 0)
2364			return B_NOT_ALLOWED;
2365
2366		sourceArea->protection |= B_SHARED_AREA;
2367		protection |= B_SHARED_AREA;
2368	}
2369
2370	// Now lock both address spaces and actually do the cloning.
2371
2372	MultiAddressSpaceLocker locker;
2373	VMAddressSpace* sourceAddressSpace;
2374	status_t status = locker.AddArea(sourceID, false, &sourceAddressSpace);
2375	if (status != B_OK)
2376		return status;
2377
2378	VMAddressSpace* targetAddressSpace;
2379	status = locker.AddTeam(team, true, &targetAddressSpace);
2380	if (status != B_OK)
2381		return status;
2382
2383	status = locker.Lock();
2384	if (status != B_OK)
2385		return status;
2386
2387	sourceArea = lookup_area(sourceAddressSpace, sourceID);
2388	if (sourceArea == NULL)
2389		return B_BAD_VALUE;
2390
2391	if (!kernel && (sourceArea->protection & B_KERNEL_AREA) != 0)
2392		return B_NOT_ALLOWED;
2393
2394	VMCache* cache = vm_area_get_locked_cache(sourceArea);
2395
2396	if (!kernel && sourceAddressSpace != targetAddressSpace
2397		&& (sourceArea->protection & B_CLONEABLE_AREA) == 0) {
2398#if KDEBUG
2399		Team* team = thread_get_current_thread()->team;
2400		dprintf("team \"%s\" (%" B_PRId32 ") attempted to clone area \"%s\" (%"
2401			B_PRId32 ")!\n", team->Name(), team->id, sourceArea->name, sourceID);
2402#endif
2403		status = B_NOT_ALLOWED;
2404	} else if (sourceArea->cache_type == CACHE_TYPE_NULL) {
2405		status = B_NOT_ALLOWED;
2406	} else {
2407		virtual_address_restrictions addressRestrictions = {};
2408		addressRestrictions.address = *address;
2409		addressRestrictions.address_specification = addressSpec;
2410		status = map_backing_store(targetAddressSpace, cache,
2411			sourceArea->cache_offset, name, sourceArea->Size(),
2412			sourceArea->wiring, protection, sourceArea->protection_max,
2413			mapping, 0, &addressRestrictions,
2414			kernel, &newArea, address);
2415	}
2416	if (status == B_OK && mapping != REGION_PRIVATE_MAP) {
2417		// If the mapping is REGION_PRIVATE_MAP, map_backing_store() needed
2418		// to create a new cache, and has therefore already acquired a reference
2419		// to the source cache - but otherwise it has no idea that we need
2420		// one.
2421		cache->AcquireRefLocked();
2422	}
2423	if (status == B_OK && newArea->wiring == B_FULL_LOCK) {
2424		// we need to map in everything at this point
2425		if (sourceArea->cache_type == CACHE_TYPE_DEVICE) {
2426			// we don't have actual pages to map but a physical area
2427			VMTranslationMap* map
2428				= sourceArea->address_space->TranslationMap();
2429			map->Lock();
2430
2431			phys_addr_t physicalAddress;
2432			uint32 oldProtection;
2433			map->Query(sourceArea->Base(), &physicalAddress, &oldProtection);
2434
2435			map->Unlock();
2436
2437			map = targetAddressSpace->TranslationMap();
2438			size_t reservePages = map->MaxPagesNeededToMap(newArea->Base(),
2439				newArea->Base() + (newArea->Size() - 1));
2440
2441			vm_page_reservation reservation;
2442			vm_page_reserve_pages(&reservation, reservePages,
2443				targetAddressSpace == VMAddressSpace::Kernel()
2444					? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
2445			map->Lock();
2446
2447			for (addr_t offset = 0; offset < newArea->Size();
2448					offset += B_PAGE_SIZE) {
2449				map->Map(newArea->Base() + offset, physicalAddress + offset,
2450					protection, newArea->MemoryType(), &reservation);
2451			}
2452
2453			map->Unlock();
2454			vm_page_unreserve_pages(&reservation);
2455		} else {
2456			VMTranslationMap* map = targetAddressSpace->TranslationMap();
2457			size_t reservePages = map->MaxPagesNeededToMap(
2458				newArea->Base(), newArea->Base() + (newArea->Size() - 1));
2459			vm_page_reservation reservation;
2460			vm_page_reserve_pages(&reservation, reservePages,
2461				targetAddressSpace == VMAddressSpace::Kernel()
2462					? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
2463
2464			// map in all pages from source
2465			for (VMCachePagesTree::Iterator it = cache->pages.GetIterator();
2466					vm_page* page  = it.Next();) {
2467				if (!page->busy) {
2468					DEBUG_PAGE_ACCESS_START(page);
2469					map_page(newArea, page,
2470						newArea->Base() + ((page->cache_offset << PAGE_SHIFT)
2471							- newArea->cache_offset),
2472						protection, &reservation);
2473					DEBUG_PAGE_ACCESS_END(page);
2474				}
2475			}
2476			// TODO: B_FULL_LOCK means that all pages are locked. We are not
2477			// ensuring that!
2478
2479			vm_page_unreserve_pages(&reservation);
2480		}
2481	}
2482	if (status == B_OK)
2483		newArea->cache_type = sourceArea->cache_type;
2484
2485	vm_area_put_locked_cache(cache);
2486
2487	if (status < B_OK)
2488		return status;
2489
2490	return newArea->id;
2491}
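
// A minimal usage sketch (hypothetical, kept compiled out): a kernel
// component cloning one of its areas into a userland team read/write, the
// way e.g. frame buffer areas are typically shared.
#if 0
static area_id
example_clone_area_into_team(team_id team, area_id sourceArea)
{
	void* address = NULL;
	return vm_clone_area(team, "example clone", &address, B_ANY_ADDRESS,
		B_READ_AREA | B_WRITE_AREA, REGION_NO_PRIVATE_MAP, sourceArea, true);
}
#endif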
2492
2493
2494/*!	Deletes the specified area of the given address space.
2495
2496	The address space must be write-locked.
2497	The caller must ensure that the area does not have any wired ranges.
2498
2499	\param addressSpace The address space containing the area.
2500	\param area The area to be deleted.
2501	\param deletingAddressSpace \c true, if the address space is in the process
2502		of being deleted.
2503*/
2504static void
2505delete_area(VMAddressSpace* addressSpace, VMArea* area,
2506	bool deletingAddressSpace)
2507{
2508	ASSERT(!area->IsWired());
2509
2510	VMAreas::Remove(area);
2511
2512	// At this point the area is removed from the global hash table, but
2513	// still exists in the area list.
2514
2515	// Unmap the virtual address space the area occupied.
2516	{
2517		// We need to lock the complete cache chain.
2518		VMCache* topCache = vm_area_get_locked_cache(area);
2519		VMCacheChainLocker cacheChainLocker(topCache);
2520		cacheChainLocker.LockAllSourceCaches();
2521
2522		// If the area's top cache is a temporary cache and the area is the only
2523		// one referencing it (besides us currently holding a second reference),
2524		// the unmapping code doesn't need to care about preserving the accessed
2525		// and dirty flags of the top cache page mappings.
2526		bool ignoreTopCachePageFlags
2527			= topCache->temporary && topCache->RefCount() == 2;
2528
2529		area->address_space->TranslationMap()->UnmapArea(area,
2530			deletingAddressSpace, ignoreTopCachePageFlags);
2531	}
2532
2533	if (!area->cache->temporary)
2534		area->cache->WriteModified();
2535
2536	uint32 allocationFlags = addressSpace == VMAddressSpace::Kernel()
2537		? HEAP_DONT_WAIT_FOR_MEMORY | HEAP_DONT_LOCK_KERNEL_SPACE : 0;
2538
2539	arch_vm_unset_memory_type(area);
2540	addressSpace->RemoveArea(area, allocationFlags);
2541	addressSpace->Put();
2542
2543	area->cache->RemoveArea(area);
2544	area->cache->ReleaseRef();
2545
2546	addressSpace->DeleteArea(area, allocationFlags);
2547}
2548
2549
2550status_t
2551vm_delete_area(team_id team, area_id id, bool kernel)
2552{
2553	TRACE(("vm_delete_area(team = 0x%" B_PRIx32 ", area = 0x%" B_PRIx32 ")\n",
2554		team, id));
2555
2556	// lock the address space and make sure the area isn't wired
2557	AddressSpaceWriteLocker locker;
2558	VMArea* area;
2559	AreaCacheLocker cacheLocker;
2560
2561	do {
2562		status_t status = locker.SetFromArea(team, id, area);
2563		if (status != B_OK)
2564			return status;
2565
2566		cacheLocker.SetTo(area);
2567	} while (wait_if_area_is_wired(area, &locker, &cacheLocker));
2568
2569	cacheLocker.Unlock();
2570
2571	if (!kernel && (area->protection & B_KERNEL_AREA) != 0)
2572		return B_NOT_ALLOWED;
2573
2574	delete_area(locker.AddressSpace(), area, false);
2575	return B_OK;
2576}
2577
2578
2579/*!	Creates a new cache on top of given cache, moves all areas from
2580	the old cache to the new one, and changes the protection of all affected
2581	areas' pages to read-only. If requested, wired pages are moved up to the
2582	new cache and copies are added to the old cache in their place.
2583	Preconditions:
2584	- The given cache must be locked.
2585	- All of the cache's areas' address spaces must be read locked.
2586	- Either the cache must not have any wired ranges or a page reservation for
2587	  all wired pages must be provided, so they can be copied.
2588
2589	\param lowerCache The cache on top of which a new cache shall be created.
2590	\param wiredPagesReservation If \c NULL there must not be any wired pages
2591		in \a lowerCache. Otherwise as many pages must be reserved as the cache
2592		has wired pages. The wired pages are copied in this case.
2593*/
2594static status_t
2595vm_copy_on_write_area(VMCache* lowerCache,
2596	vm_page_reservation* wiredPagesReservation)
2597{
2598	VMCache* upperCache;
2599
2600	TRACE(("vm_copy_on_write_area(cache = %p)\n", lowerCache));
2601
2602	// We need to separate the cache from its areas. The cache goes one level
2603	// deeper and we create a new cache in between.
2604
2605	// create an anonymous cache
2606	status_t status = VMCacheFactory::CreateAnonymousCache(upperCache, false, 0,
2607		lowerCache->GuardSize() / B_PAGE_SIZE,
2608		dynamic_cast<VMAnonymousNoSwapCache*>(lowerCache) == NULL,
2609		VM_PRIORITY_USER);
2610	if (status != B_OK)
2611		return status;
2612
2613	upperCache->Lock();
2614
2615	upperCache->temporary = 1;
2616	upperCache->virtual_base = lowerCache->virtual_base;
2617	upperCache->virtual_end = lowerCache->virtual_end;
2618
2619	// transfer the lower cache areas to the upper cache
2620	rw_lock_write_lock(&sAreaCacheLock);
2621	upperCache->TransferAreas(lowerCache);
2622	rw_lock_write_unlock(&sAreaCacheLock);
2623
2624	lowerCache->AddConsumer(upperCache);
2625
2626	// We now need to remap all pages from all of the cache's areas read-only,
2627	// so that a copy will be created on next write access. If there are wired
2628	// pages, we keep their protection, move them to the upper cache and create
2629	// copies for the lower cache.
2630	if (wiredPagesReservation != NULL) {
2631		// We need to handle wired pages -- iterate through the cache's pages.
2632		for (VMCachePagesTree::Iterator it = lowerCache->pages.GetIterator();
2633				vm_page* page = it.Next();) {
2634			if (page->WiredCount() > 0) {
2635				// allocate a new page and copy the wired one
2636				vm_page* copiedPage = vm_page_allocate_page(
2637					wiredPagesReservation, PAGE_STATE_ACTIVE);
2638
2639				vm_memcpy_physical_page(
2640					copiedPage->physical_page_number * B_PAGE_SIZE,
2641					page->physical_page_number * B_PAGE_SIZE);
2642
2643				// move the wired page to the upper cache (note: removing is OK
2644				// with the SplayTree iterator) and insert the copy
2645				upperCache->MovePage(page);
2646				lowerCache->InsertPage(copiedPage,
2647					page->cache_offset * B_PAGE_SIZE);
2648
2649				DEBUG_PAGE_ACCESS_END(copiedPage);
2650			} else {
2651				// Change the protection of this page in all areas.
2652				for (VMArea* tempArea = upperCache->areas; tempArea != NULL;
2653						tempArea = tempArea->cache_next) {
2654					if (!is_page_in_area(tempArea, page))
2655						continue;
2656
2657					// The area must be readable in the same way it was
2658					// previously writable.
2659					addr_t address = virtual_page_address(tempArea, page);
2660					uint32 protection = 0;
2661					uint32 pageProtection = get_area_page_protection(tempArea, address);
2662					if ((pageProtection & B_KERNEL_READ_AREA) != 0)
2663						protection |= B_KERNEL_READ_AREA;
2664					if ((pageProtection & B_READ_AREA) != 0)
2665						protection |= B_READ_AREA;
2666
2667					VMTranslationMap* map
2668						= tempArea->address_space->TranslationMap();
2669					map->Lock();
2670					map->ProtectPage(tempArea, address, protection);
2671					map->Unlock();
2672				}
2673			}
2674		}
2675	} else {
2676		ASSERT(lowerCache->WiredPagesCount() == 0);
2677
2678		// just change the protection of all areas
2679		for (VMArea* tempArea = upperCache->areas; tempArea != NULL;
2680				tempArea = tempArea->cache_next) {
2681			if (tempArea->page_protections != NULL) {
2682				// Change the protection of all pages in this area.
2683				VMTranslationMap* map = tempArea->address_space->TranslationMap();
2684				map->Lock();
2685				for (VMCachePagesTree::Iterator it = lowerCache->pages.GetIterator();
2686					vm_page* page = it.Next();) {
2687					if (!is_page_in_area(tempArea, page))
2688						continue;
2689
2690					// The area must be readable in the same way it was
2691					// previously writable.
2692					addr_t address = virtual_page_address(tempArea, page);
2693					uint32 protection = 0;
2694					uint32 pageProtection = get_area_page_protection(tempArea, address);
2695					if ((pageProtection & B_KERNEL_READ_AREA) != 0)
2696						protection |= B_KERNEL_READ_AREA;
2697					if ((pageProtection & B_READ_AREA) != 0)
2698						protection |= B_READ_AREA;
2699
2700					map->ProtectPage(tempArea, address, protection);
2701				}
2702				map->Unlock();
2703				continue;
2704			}
2705			// The area must be readable in the same way it was previously
2706			// writable.
2707			uint32 protection = 0;
2708			if ((tempArea->protection & B_KERNEL_READ_AREA) != 0)
2709				protection |= B_KERNEL_READ_AREA;
2710			if ((tempArea->protection & B_READ_AREA) != 0)
2711				protection |= B_READ_AREA;
2712
2713			VMTranslationMap* map = tempArea->address_space->TranslationMap();
2714			map->Lock();
2715			map->ProtectArea(tempArea, protection);
2716			map->Unlock();
2717		}
2718	}
2719
2720	vm_area_put_locked_cache(upperCache);
2721
2722	return B_OK;
2723}
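
// Informal illustration of the resulting cache chain (sketch only): before
// the call the areas map lowerCache directly; afterwards a new, initially
// empty temporary cache sits in between, so the first write access to a
// page faults and copies that page into the upper cache:
//
//   before:  areas --> lowerCache (pages)
//   after:   areas --> upperCache (empty) --> lowerCache (pages)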
2724
2725
2726area_id
2727vm_copy_area(team_id team, const char* name, void** _address,
2728	uint32 addressSpec, area_id sourceID)
2729{
2730	// Do the locking: target address space, all address spaces associated with
2731	// the source cache, and the cache itself.
2732	MultiAddressSpaceLocker locker;
2733	VMAddressSpace* targetAddressSpace;
2734	VMCache* cache;
2735	VMArea* source;
2736	AreaCacheLocker cacheLocker;
2737	status_t status;
2738	bool sharedArea;
2739
2740	page_num_t wiredPages = 0;
2741	vm_page_reservation wiredPagesReservation;
2742
2743	bool restart;
2744	do {
2745		restart = false;
2746
2747		locker.Unset();
2748		status = locker.AddTeam(team, true, &targetAddressSpace);
2749		if (status == B_OK) {
2750			status = locker.AddAreaCacheAndLock(sourceID, false, false, source,
2751				&cache);
2752		}
2753		if (status != B_OK)
2754			return status;
2755
2756		cacheLocker.SetTo(cache, true);	// already locked
2757
2758		sharedArea = (source->protection & B_SHARED_AREA) != 0;
2759
2760		page_num_t oldWiredPages = wiredPages;
2761		wiredPages = 0;
2762
2763		// If the source area isn't shared, count the number of wired pages in
2764		// the cache and reserve as many pages.
2765		if (!sharedArea) {
2766			wiredPages = cache->WiredPagesCount();
2767
2768			if (wiredPages > oldWiredPages) {
2769				cacheLocker.Unlock();
2770				locker.Unlock();
2771
2772				if (oldWiredPages > 0)
2773					vm_page_unreserve_pages(&wiredPagesReservation);
2774
2775				vm_page_reserve_pages(&wiredPagesReservation, wiredPages,
2776					VM_PRIORITY_USER);
2777
2778				restart = true;
2779			}
2780		} else if (oldWiredPages > 0)
2781			vm_page_unreserve_pages(&wiredPagesReservation);
2782	} while (restart);
2783
2784	// unreserve pages later
2785	struct PagesUnreserver {
2786		PagesUnreserver(vm_page_reservation* reservation)
2787			:
2788			fReservation(reservation)
2789		{
2790		}
2791
2792		~PagesUnreserver()
2793		{
2794			if (fReservation != NULL)
2795				vm_page_unreserve_pages(fReservation);
2796		}
2797
2798	private:
2799		vm_page_reservation*	fReservation;
2800	} pagesUnreserver(wiredPages > 0 ? &wiredPagesReservation : NULL);
2801
2802	bool writableCopy
2803		= (source->protection & (B_KERNEL_WRITE_AREA | B_WRITE_AREA)) != 0;
2804	uint8* targetPageProtections = NULL;
2805
2806	if (source->page_protections != NULL) {
2807		size_t bytes = area_page_protections_size(source->Size());
2808		targetPageProtections = (uint8*)malloc_etc(bytes,
2809			(source->address_space == VMAddressSpace::Kernel()
2810					|| targetAddressSpace == VMAddressSpace::Kernel())
2811				? HEAP_DONT_LOCK_KERNEL_SPACE : 0);
2812		if (targetPageProtections == NULL)
2813			return B_NO_MEMORY;
2814
2815		memcpy(targetPageProtections, source->page_protections, bytes);
2816
2817		if (!writableCopy) {
2818			for (size_t i = 0; i < bytes; i++) {
2819				if ((targetPageProtections[i]
2820						& (B_WRITE_AREA | B_WRITE_AREA << 4)) != 0) {
2821					writableCopy = true;
2822					break;
2823				}
2824			}
2825		}
2826	}
2827
2828	if (addressSpec == B_CLONE_ADDRESS) {
2829		addressSpec = B_EXACT_ADDRESS;
2830		*_address = (void*)source->Base();
2831	}
2832
2833	// First, create a cache on top of the source area, or use the existing
2834	// one if this is a shared area.
2835
2836	VMArea* target;
2837	virtual_address_restrictions addressRestrictions = {};
2838	addressRestrictions.address = *_address;
2839	addressRestrictions.address_specification = addressSpec;
2840	status = map_backing_store(targetAddressSpace, cache, source->cache_offset,
2841		name, source->Size(), source->wiring, source->protection,
2842		source->protection_max,
2843		sharedArea ? REGION_NO_PRIVATE_MAP : REGION_PRIVATE_MAP,
2844		writableCopy ? 0 : CREATE_AREA_DONT_COMMIT_MEMORY,
2845		&addressRestrictions, true, &target, _address);
2846	if (status < B_OK) {
2847		free_etc(targetPageProtections, HEAP_DONT_LOCK_KERNEL_SPACE);
2848		return status;
2849	}
2850
2851	if (targetPageProtections != NULL)
2852		target->page_protections = targetPageProtections;
2853
2854	if (sharedArea) {
2855		// The new area uses the old area's cache, but map_backing_store()
2856		// hasn't acquired a ref. So we have to do that now.
2857		cache->AcquireRefLocked();
2858	}
2859
2860	// If the source area is writable, we need to move it one layer up as well
2861
2862	if (!sharedArea) {
2863		if (writableCopy) {
2864			// TODO: do something more useful if this fails!
2865			if (vm_copy_on_write_area(cache,
2866					wiredPages > 0 ? &wiredPagesReservation : NULL) < B_OK) {
2867				panic("vm_copy_on_write_area() failed!\n");
2868			}
2869		}
2870	}
2871
2872	// we return the ID of the newly created area
2873	return target->id;
2874}
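
// A minimal usage sketch (hypothetical, kept compiled out): copying an area
// into the kernel address space; if the source is not shared, it becomes
// copy-on-write as implemented above.
#if 0
static area_id
example_copy_area_into_kernel(area_id source)
{
	void* address = NULL;
	return vm_copy_area(VMAddressSpace::KernelID(), "example copy",
		&address, B_ANY_KERNEL_ADDRESS, source);
}
#endif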
2875
2876
2877status_t
2878vm_set_area_protection(team_id team, area_id areaID, uint32 newProtection,
2879	bool kernel)
2880{
2881	fix_protection(&newProtection);
2882
2883	TRACE(("vm_set_area_protection(team = %#" B_PRIx32 ", area = %#" B_PRIx32
2884		", protection = %#" B_PRIx32 ")\n", team, areaID, newProtection));
2885
2886	if (!arch_vm_supports_protection(newProtection))
2887		return B_NOT_SUPPORTED;
2888
2889	bool becomesWritable
2890		= (newProtection & (B_WRITE_AREA | B_KERNEL_WRITE_AREA)) != 0;
2891
2892	// lock address spaces and cache
2893	MultiAddressSpaceLocker locker;
2894	VMCache* cache;
2895	VMArea* area;
2896	status_t status;
2897	AreaCacheLocker cacheLocker;
2898	bool isWritable;
2899
2900	bool restart;
2901	do {
2902		restart = false;
2903
2904		locker.Unset();
2905		status = locker.AddAreaCacheAndLock(areaID, true, false, area, &cache);
2906		if (status != B_OK)
2907			return status;
2908
2909		cacheLocker.SetTo(cache, true);	// already locked
2910
2911		if (!kernel && (area->address_space == VMAddressSpace::Kernel()
2912				|| (area->protection & B_KERNEL_AREA) != 0)) {
2913			dprintf("vm_set_area_protection: team %" B_PRId32 " tried to "
2914				"set protection %#" B_PRIx32 " on kernel area %" B_PRId32
2915				" (%s)\n", team, newProtection, areaID, area->name);
2916			return B_NOT_ALLOWED;
2917		}
2918		if (!kernel && area->protection_max != 0
2919			&& (newProtection & area->protection_max)
2920				!= (newProtection & B_USER_PROTECTION)) {
2921			dprintf("vm_set_area_protection: team %" B_PRId32 " tried to "
2922				"set protection %#" B_PRIx32 " (max %#" B_PRIx32 ") on kernel "
2923				"area %" B_PRId32 " (%s)\n", team, newProtection,
2924				area->protection_max, areaID, area->name);
2925			return B_NOT_ALLOWED;
2926		}
2927
2928		if (team != VMAddressSpace::KernelID()
2929			&& area->address_space->ID() != team) {
2930			// unless you're the kernel, you are only allowed to set
2931			// the protection of your own areas
2932			return B_NOT_ALLOWED;
2933		}
2934
2935		if (area->protection == newProtection)
2936			return B_OK;
2937
2938		isWritable
2939			= (area->protection & (B_WRITE_AREA | B_KERNEL_WRITE_AREA)) != 0;
2940
2941		// Make sure the area (or, if we're going to call
2942		// vm_copy_on_write_area(), all areas of the cache) doesn't have any
2943		// wired ranges.
2944		if (!isWritable && becomesWritable && !cache->consumers.IsEmpty()) {
2945			for (VMArea* otherArea = cache->areas; otherArea != NULL;
2946					otherArea = otherArea->cache_next) {
2947				if (wait_if_area_is_wired(otherArea, &locker, &cacheLocker)) {
2948					restart = true;
2949					break;
2950				}
2951			}
2952		} else {
2953			if (wait_if_area_is_wired(area, &locker, &cacheLocker))
2954				restart = true;
2955		}
2956	} while (restart);
2957
2958	bool changePageProtection = true;
2959	bool changeTopCachePagesOnly = false;
2960
2961	if (isWritable && !becomesWritable) {
2962		// writable -> !writable
2963
2964		if (cache->source != NULL && cache->temporary) {
2965			if (cache->CountWritableAreas(area) == 0) {
2966				// Since this cache is now backed by the pages in its source cache,
2967				// we can change the cache's commitment to take only those pages
2968				// into account that really are in this cache.
2969
2970				status = cache->Commit(cache->page_count * B_PAGE_SIZE,
2971					team == VMAddressSpace::KernelID()
2972						? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
2973
2974				// TODO: we may be able to join with our source cache, if
2975				// count == 0
2976			}
2977		}
2978
2979		// If only the writability changes, we can just remap the pages of the
2980		// top cache, since the pages of lower caches are mapped read-only
2981		// anyway. That's only advantageous if the number of pages in the cache
2982		// is significantly smaller than the number of pages in the area,
2983		// though.
2984		if (newProtection
2985				== (area->protection & ~(B_WRITE_AREA | B_KERNEL_WRITE_AREA))
2986			&& cache->page_count * 2 < area->Size() / B_PAGE_SIZE) {
2987			changeTopCachePagesOnly = true;
2988		}
2989	} else if (!isWritable && becomesWritable) {
2990		// !writable -> writable
2991
2992		if (!cache->consumers.IsEmpty()) {
2993			// There are consumers -- we have to insert a new cache. Fortunately
2994			// vm_copy_on_write_area() does everything that's needed.
2995			changePageProtection = false;
2996			status = vm_copy_on_write_area(cache, NULL);
2997		} else {
2998			// No consumers, so we don't need to insert a new one.
2999			if (cache->source != NULL && cache->temporary) {
3000				// the cache's commitment must contain all possible pages
3001				status = cache->Commit(cache->virtual_end - cache->virtual_base,
3002					team == VMAddressSpace::KernelID()
3003						? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
3004			}
3005
3006			if (status == B_OK && cache->source != NULL) {
3007				// There's a source cache, hence we can't just change all pages'
3008				// protection or we might allow writing into pages belonging to
3009				// a lower cache.
3010				changeTopCachePagesOnly = true;
3011			}
3012		}
3013	} else {
3014		// we don't have anything special to do in all other cases
3015	}
3016
3017	if (status == B_OK) {
3018		// remap existing pages in this cache
3019		if (changePageProtection) {
3020			VMTranslationMap* map = area->address_space->TranslationMap();
3021			map->Lock();
3022
3023			if (changeTopCachePagesOnly) {
3024				page_num_t firstPageOffset = area->cache_offset / B_PAGE_SIZE;
3025				page_num_t lastPageOffset
3026					= firstPageOffset + area->Size() / B_PAGE_SIZE;
3027				for (VMCachePagesTree::Iterator it = cache->pages.GetIterator();
3028						vm_page* page = it.Next();) {
3029					if (page->cache_offset >= firstPageOffset
3030						&& page->cache_offset <= lastPageOffset) {
3031						addr_t address = virtual_page_address(area, page);
3032						map->ProtectPage(area, address, newProtection);
3033					}
3034				}
3035			} else
3036				map->ProtectArea(area, newProtection);
3037
3038			map->Unlock();
3039		}
3040
3041		area->protection = newProtection;
3042	}
3043
3044	return status;
3045}
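
// A minimal usage sketch (hypothetical, kept compiled out): making a team's
// area read-only for userland while keeping kernel read/write access.
#if 0
static status_t
example_make_area_user_read_only(team_id team, area_id area)
{
	return vm_set_area_protection(team, area,
		B_READ_AREA | B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA, true);
}
#endif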
3046
3047
3048status_t
3049vm_get_page_mapping(team_id team, addr_t vaddr, phys_addr_t* paddr)
3050{
3051	VMAddressSpace* addressSpace = VMAddressSpace::Get(team);
3052	if (addressSpace == NULL)
3053		return B_BAD_TEAM_ID;
3054
3055	VMTranslationMap* map = addressSpace->TranslationMap();
3056
3057	map->Lock();
3058	uint32 dummyFlags;
3059	status_t status = map->Query(vaddr, paddr, &dummyFlags);
3060	map->Unlock();
3061
3062	addressSpace->Put();
3063	return status;
3064}
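
// A minimal usage sketch (hypothetical, kept compiled out): looking up the
// physical address backing a kernel virtual address, e.g. for diagnostics.
#if 0
static void
example_print_physical_address(addr_t virtualAddress)
{
	phys_addr_t physicalAddress;
	if (vm_get_page_mapping(VMAddressSpace::KernelID(), virtualAddress,
			&physicalAddress) == B_OK) {
		dprintf("va %#" B_PRIxADDR " -> pa %#" B_PRIxPHYSADDR "\n",
			virtualAddress, physicalAddress);
	}
}
#endif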
3065
3066
3067/*!	The page's cache must be locked.
3068*/
3069bool
3070vm_test_map_modification(vm_page* page)
3071{
3072	if (page->modified)
3073		return true;
3074
3075	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
3076	vm_page_mapping* mapping;
3077	while ((mapping = iterator.Next()) != NULL) {
3078		VMArea* area = mapping->area;
3079		VMTranslationMap* map = area->address_space->TranslationMap();
3080
3081		phys_addr_t physicalAddress;
3082		uint32 flags;
3083		map->Lock();
3084		map->Query(virtual_page_address(area, page), &physicalAddress, &flags);
3085		map->Unlock();
3086
3087		if ((flags & PAGE_MODIFIED) != 0)
3088			return true;
3089	}
3090
3091	return false;
3092}
3093
3094
3095/*!	The page's cache must be locked.
3096*/
3097void
3098vm_clear_map_flags(vm_page* page, uint32 flags)
3099{
3100	if ((flags & PAGE_ACCESSED) != 0)
3101		page->accessed = false;
3102	if ((flags & PAGE_MODIFIED) != 0)
3103		page->modified = false;
3104
3105	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
3106	vm_page_mapping* mapping;
3107	while ((mapping = iterator.Next()) != NULL) {
3108		VMArea* area = mapping->area;
3109		VMTranslationMap* map = area->address_space->TranslationMap();
3110
3111		map->Lock();
3112		map->ClearFlags(virtual_page_address(area, page), flags);
3113		map->Unlock();
3114	}
3115}
3116
3117
3118/*!	Removes all mappings from a page.
3119	After you've called this function, the page is unmapped from memory and
3120	the page's \c accessed and \c modified flags have been updated according
3121	to the state of the mappings.
3122	The page's cache must be locked.
3123*/
3124void
3125vm_remove_all_page_mappings(vm_page* page)
3126{
3127	while (vm_page_mapping* mapping = page->mappings.Head()) {
3128		VMArea* area = mapping->area;
3129		VMTranslationMap* map = area->address_space->TranslationMap();
3130		addr_t address = virtual_page_address(area, page);
3131		map->UnmapPage(area, address, false);
3132	}
3133}
3134
3135
3136int32
3137vm_clear_page_mapping_accessed_flags(struct vm_page *page)
3138{
3139	int32 count = 0;
3140
3141	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
3142	vm_page_mapping* mapping;
3143	while ((mapping = iterator.Next()) != NULL) {
3144		VMArea* area = mapping->area;
3145		VMTranslationMap* map = area->address_space->TranslationMap();
3146
3147		bool modified;
3148		if (map->ClearAccessedAndModified(area,
3149				virtual_page_address(area, page), false, modified)) {
3150			count++;
3151		}
3152
3153		page->modified |= modified;
3154	}
3155
3156
3157	if (page->accessed) {
3158		count++;
3159		page->accessed = false;
3160	}
3161
3162	return count;
3163}
3164
3165
3166/*!	Removes all mappings of a page and/or clears the accessed bits of the
3167	mappings.
3168	The function iterates through the page mappings and removes them until
3169	encountering one that has been accessed. From then on it will continue to
3170	iterate, but only clear the accessed flag of the mapping. The page's
3171	\c modified bit will be updated accordingly; the \c accessed bit will be
3172	cleared.
3173	\return The number of mapping accessed bits encountered, including the
3174		\c accessed bit of the page itself. If \c 0 is returned, all mappings
3175		of the page have been removed.
3176*/
3177int32
3178vm_remove_all_page_mappings_if_unaccessed(struct vm_page *page)
3179{
3180	ASSERT(page->WiredCount() == 0);
3181
3182	if (page->accessed)
3183		return vm_clear_page_mapping_accessed_flags(page);
3184
3185	while (vm_page_mapping* mapping = page->mappings.Head()) {
3186		VMArea* area = mapping->area;
3187		VMTranslationMap* map = area->address_space->TranslationMap();
3188		addr_t address = virtual_page_address(area, page);
3189		bool modified = false;
3190		if (map->ClearAccessedAndModified(area, address, true, modified)) {
3191			page->accessed = true;
3192			page->modified |= modified;
3193			return vm_clear_page_mapping_accessed_flags(page);
3194		}
3195		page->modified |= modified;
3196	}
3197
3198	return 0;
3199}
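
// A minimal sketch of the intended use (assumption, kept compiled out): a
// page daemon style check that unmaps a page only if none of its mappings
// have been accessed since the last pass; a return value of 0 means the
// page is now completely unmapped and may be repurposed.
#if 0
static bool
example_try_unmap_idle_page(vm_page* page)
{
	// The page's cache must be locked and the page must not be wired.
	return vm_remove_all_page_mappings_if_unaccessed(page) == 0;
}
#endif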
3200
3201
3202static int
3203display_mem(int argc, char** argv)
3204{
3205	bool physical = false;
3206	addr_t copyAddress;
3207	int32 displayWidth;
3208	int32 itemSize;
3209	int32 num = -1;
3210	addr_t address;
3211	int i = 1, j;
3212
3213	if (argc > 1 && argv[1][0] == '-') {
3214		if (!strcmp(argv[1], "-p") || !strcmp(argv[1], "--physical")) {
3215			physical = true;
3216			i++;
3217		} else
3218			i = 99;
3219	}
3220
3221	if (argc < i + 1 || argc > i + 2) {
3222		kprintf("usage: dl/dw/ds/db/string [-p|--physical] <address> [num]\n"
3223			"\tdl - 8 bytes\n"
3224			"\tdw - 4 bytes\n"
3225			"\tds - 2 bytes\n"
3226			"\tdb - 1 byte\n"
3227			"\tstring - a whole string\n"
3228			"  -p or --physical only allows memory from a single page to be "
3229			"displayed.\n");
3230		return 0;
3231	}
3232
3233	address = parse_expression(argv[i]);
3234
3235	if (argc > i + 1)
3236		num = parse_expression(argv[i + 1]);
3237
3238	// build the format string
3239	if (strcmp(argv[0], "db") == 0) {
3240		itemSize = 1;
3241		displayWidth = 16;
3242	} else if (strcmp(argv[0], "ds") == 0) {
3243		itemSize = 2;
3244		displayWidth = 8;
3245	} else if (strcmp(argv[0], "dw") == 0) {
3246		itemSize = 4;
3247		displayWidth = 4;
3248	} else if (strcmp(argv[0], "dl") == 0) {
3249		itemSize = 8;
3250		displayWidth = 2;
3251	} else if (strcmp(argv[0], "string") == 0) {
3252		itemSize = 1;
3253		displayWidth = -1;
3254	} else {
3255		kprintf("display_mem called in an invalid way!\n");
3256		return 0;
3257	}
3258
3259	if (num <= 0)
3260		num = displayWidth;
3261
3262	void* physicalPageHandle = NULL;
3263
3264	if (physical) {
3265		int32 offset = address & (B_PAGE_SIZE - 1);
3266		if (num * itemSize + offset > B_PAGE_SIZE) {
3267			num = (B_PAGE_SIZE - offset) / itemSize;
3268			kprintf("NOTE: number of bytes has been cut to page size\n");
3269		}
3270
3271		address = ROUNDDOWN(address, B_PAGE_SIZE);
3272
3273		if (vm_get_physical_page_debug(address, &copyAddress,
3274				&physicalPageHandle) != B_OK) {
3275			kprintf("getting the hardware page failed.\n");
3276			return 0;
3277		}
3278
3279		address += offset;
3280		copyAddress += offset;
3281	} else
3282		copyAddress = address;
3283
3284	if (!strcmp(argv[0], "string")) {
3285		kprintf("%p \"", (char*)copyAddress);
3286
3287		// string mode
3288		for (i = 0; true; i++) {
3289			char c;
3290			if (debug_memcpy(B_CURRENT_TEAM, &c, (char*)copyAddress + i, 1)
3291					!= B_OK
3292				|| c == '\0') {
3293				break;
3294			}
3295
3296			if (c == '\n')
3297				kprintf("\\n");
3298			else if (c == '\t')
3299				kprintf("\\t");
3300			else {
3301				if (!isprint(c))
3302					c = '.';
3303
3304				kprintf("%c", c);
3305			}
3306		}
3307
3308		kprintf("\"\n");
3309	} else {
3310		// number mode
3311		for (i = 0; i < num; i++) {
3312			uint64 value;
3313
3314			if ((i % displayWidth) == 0) {
3315				int32 displayed = min_c(displayWidth, (num-i)) * itemSize;
3316				if (i != 0)
3317					kprintf("\n");
3318
3319				kprintf("[0x%lx]  ", address + i * itemSize);
3320
3321				for (j = 0; j < displayed; j++) {
3322					char c;
3323					if (debug_memcpy(B_CURRENT_TEAM, &c,
3324							(char*)copyAddress + i * itemSize + j, 1) != B_OK) {
3325						displayed = j;
3326						break;
3327					}
3328					if (!isprint(c))
3329						c = '.';
3330
3331					kprintf("%c", c);
3332				}
3333				if (num > displayWidth) {
3334					// make sure the spacing in the last line is correct
3335					for (j = displayed; j < displayWidth * itemSize; j++)
3336						kprintf(" ");
3337				}
3338				kprintf("  ");
3339			}
3340
3341			if (debug_memcpy(B_CURRENT_TEAM, &value,
3342					(uint8*)copyAddress + i * itemSize, itemSize) != B_OK) {
3343				kprintf("read fault");
3344				break;
3345			}
3346
3347			switch (itemSize) {
3348				case 1:
3349					kprintf(" %02" B_PRIx8, *(uint8*)&value);
3350					break;
3351				case 2:
3352					kprintf(" %04" B_PRIx16, *(uint16*)&value);
3353					break;
3354				case 4:
3355					kprintf(" %08" B_PRIx32, *(uint32*)&value);
3356					break;
3357				case 8:
3358					kprintf(" %016" B_PRIx64, *(uint64*)&value);
3359					break;
3360			}
3361		}
3362
3363		kprintf("\n");
3364	}
3365
3366	if (physical) {
3367		copyAddress = ROUNDDOWN(copyAddress, B_PAGE_SIZE);
3368		vm_put_physical_page_debug(copyAddress, physicalPageHandle);
3369	}
3370	return 0;
3371}
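
// Example invocations from the kernel debugger (illustrative addresses):
//   dw 0x80001000 8      - dump eight 32-bit words starting at the address
//   string 0x80001000    - print the NUL-terminated string at the address
//   db -p 0x9f000 16     - dump 16 bytes from the given physical page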
3372
3373
3374static void
3375dump_cache_tree_recursively(VMCache* cache, int level,
3376	VMCache* highlightCache)
3377{
3378	// print this cache
3379	for (int i = 0; i < level; i++)
3380		kprintf("  ");
3381	if (cache == highlightCache)
3382		kprintf("%p <--\n", cache);
3383	else
3384		kprintf("%p\n", cache);
3385
3386	// recursively print its consumers
3387	for (VMCache::ConsumerList::Iterator it = cache->consumers.GetIterator();
3388			VMCache* consumer = it.Next();) {
3389		dump_cache_tree_recursively(consumer, level + 1, highlightCache);
3390	}
3391}
3392
3393
3394static int
3395dump_cache_tree(int argc, char** argv)
3396{
3397	if (argc != 2 || !strcmp(argv[1], "--help")) {
3398		kprintf("usage: %s <address>\n", argv[0]);
3399		return 0;
3400	}
3401
3402	addr_t address = parse_expression(argv[1]);
3403	if (address == 0)
3404		return 0;
3405
3406	VMCache* cache = (VMCache*)address;
3407	VMCache* root = cache;
3408
3409	// find the root cache (the transitive source)
3410	while (root->source != NULL)
3411		root = root->source;
3412
3413	dump_cache_tree_recursively(root, 0, cache);
3414
3415	return 0;
3416}
3417
3418
3419const char*
3420vm_cache_type_to_string(int32 type)
3421{
3422	switch (type) {
3423		case CACHE_TYPE_RAM:
3424			return "RAM";
3425		case CACHE_TYPE_DEVICE:
3426			return "device";
3427		case CACHE_TYPE_VNODE:
3428			return "vnode";
3429		case CACHE_TYPE_NULL:
3430			return "null";
3431
3432		default:
3433			return "unknown";
3434	}
3435}
3436
3437
3438#if DEBUG_CACHE_LIST
3439
3440static void
3441update_cache_info_recursively(VMCache* cache, cache_info& info)
3442{
3443	info.page_count += cache->page_count;
3444	if (cache->type == CACHE_TYPE_RAM)
3445		info.committed += cache->committed_size;
3446
3447	// recurse
3448	for (VMCache::ConsumerList::Iterator it = cache->consumers.GetIterator();
3449			VMCache* consumer = it.Next();) {
3450		update_cache_info_recursively(consumer, info);
3451	}
3452}
3453
3454
3455static int
3456cache_info_compare_page_count(const void* _a, const void* _b)
3457{
3458	const cache_info* a = (const cache_info*)_a;
3459	const cache_info* b = (const cache_info*)_b;
3460	if (a->page_count == b->page_count)
3461		return 0;
3462	return a->page_count < b->page_count ? 1 : -1;
3463}
3464
3465
3466static int
3467cache_info_compare_committed(const void* _a, const void* _b)
3468{
3469	const cache_info* a = (const cache_info*)_a;
3470	const cache_info* b = (const cache_info*)_b;
3471	if (a->committed == b->committed)
3472		return 0;
3473	return a->committed < b->committed ? 1 : -1;
3474}
3475
3476
3477static void
3478dump_caches_recursively(VMCache* cache, cache_info& info, int level)
3479{
3480	for (int i = 0; i < level; i++)
3481		kprintf("  ");
3482
3483	kprintf("%p: type: %s, base: %" B_PRIdOFF ", size: %" B_PRIdOFF ", "
3484		"pages: %" B_PRIu32, cache, vm_cache_type_to_string(cache->type),
3485		cache->virtual_base, cache->virtual_end, cache->page_count);
3486
3487	if (level == 0)
3488		kprintf("/%lu", info.page_count);
3489
3490	if (cache->type == CACHE_TYPE_RAM || (level == 0 && info.committed > 0)) {
3491		kprintf(", committed: %" B_PRIdOFF, cache->committed_size);
3492
3493		if (level == 0)
3494			kprintf("/%lu", info.committed);
3495	}
3496
3497	// areas
3498	if (cache->areas != NULL) {
3499		VMArea* area = cache->areas;
3500		kprintf(", areas: %" B_PRId32 " (%s, team: %" B_PRId32 ")", area->id,
3501			area->name, area->address_space->ID());
3502
3503		while (area->cache_next != NULL) {
3504			area = area->cache_next;
3505			kprintf(", %" B_PRId32, area->id);
3506		}
3507	}
3508
3509	kputs("\n");
3510
3511	// recurse
3512	for (VMCache::ConsumerList::Iterator it = cache->consumers.GetIterator();
3513			VMCache* consumer = it.Next();) {
3514		dump_caches_recursively(consumer, info, level + 1);
3515	}
3516}
3517
3518
3519static int
3520dump_caches(int argc, char** argv)
3521{
3522	if (sCacheInfoTable == NULL) {
3523		kprintf("No cache info table!\n");
3524		return 0;
3525	}
3526
3527	bool sortByPageCount = true;
3528
3529	for (int32 i = 1; i < argc; i++) {
3530		if (strcmp(argv[i], "-c") == 0) {
3531			sortByPageCount = false;
3532		} else {
3533			print_debugger_command_usage(argv[0]);
3534			return 0;
3535		}
3536	}
3537
3538	uint32 totalCount = 0;
3539	uint32 rootCount = 0;
3540	off_t totalCommitted = 0;
3541	page_num_t totalPages = 0;
3542
3543	VMCache* cache = gDebugCacheList;
3544	while (cache) {
3545		totalCount++;
3546		if (cache->source == NULL) {
3547			cache_info stackInfo;
3548			cache_info& info = rootCount < (uint32)kCacheInfoTableCount
3549				? sCacheInfoTable[rootCount] : stackInfo;
3550			rootCount++;
3551			info.cache = cache;
3552			info.page_count = 0;
3553			info.committed = 0;
3554			update_cache_info_recursively(cache, info);
3555			totalCommitted += info.committed;
3556			totalPages += info.page_count;
3557		}
3558
3559		cache = cache->debug_next;
3560	}
3561
3562	if (rootCount <= (uint32)kCacheInfoTableCount) {
3563		qsort(sCacheInfoTable, rootCount, sizeof(cache_info),
3564			sortByPageCount
3565				? &cache_info_compare_page_count
3566				: &cache_info_compare_committed);
3567	}
3568
3569	kprintf("total committed memory: %" B_PRIdOFF ", total used pages: %"
3570		B_PRIuPHYSADDR "\n", totalCommitted, totalPages);
3571	kprintf("%" B_PRIu32 " caches (%" B_PRIu32 " root caches), sorted by %s "
3572		"per cache tree...\n\n", totalCount, rootCount, sortByPageCount ?
3573			"page count" : "committed size");
3574
3575	if (rootCount <= (uint32)kCacheInfoTableCount) {
3576		for (uint32 i = 0; i < rootCount; i++) {
3577			cache_info& info = sCacheInfoTable[i];
3578			dump_caches_recursively(info.cache, info, 0);
3579		}
3580	} else
3581		kprintf("Cache info table too small! Can't sort and print caches!\n");
3582
3583	return 0;
3584}
3585
3586#endif	// DEBUG_CACHE_LIST
3587
3588
3589static int
3590dump_cache(int argc, char** argv)
3591{
3592	VMCache* cache;
3593	bool showPages = false;
3594	int i = 1;
3595
3596	if (argc < 2 || !strcmp(argv[1], "--help")) {
3597		kprintf("usage: %s [-ps] <address>\n"
3598			"  if -p is specified, all pages are shown; if -s is used,\n"
3599			"  only the cache info is shown.\n", argv[0]);
3600		return 0;
3601	}
3602	while (argv[i][0] == '-') {
3603		char* arg = argv[i] + 1;
3604		while (arg[0]) {
3605			if (arg[0] == 'p')
3606				showPages = true;
3607			arg++;
3608		}
3609		i++;
3610	}
3611	if (argv[i] == NULL) {
3612		kprintf("%s: invalid argument, pass address\n", argv[0]);
3613		return 0;
3614	}
3615
3616	addr_t address = parse_expression(argv[i]);
3617	if (address == 0)
3618		return 0;
3619
3620	cache = (VMCache*)address;
3621
3622	cache->Dump(showPages);
3623
3624	set_debug_variable("_sourceCache", (addr_t)cache->source);
3625
3626	return 0;
3627}
3628
3629
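/*!	Prints the fields of the given VMArea. If \a mappings is \c true, the
	area's page mappings are listed individually, otherwise only their count
	is printed.
*/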
3630static void
3631dump_area_struct(VMArea* area, bool mappings)
3632{
3633	kprintf("AREA: %p\n", area);
3634	kprintf("name:\t\t'%s'\n", area->name);
3635	kprintf("owner:\t\t0x%" B_PRIx32 "\n", area->address_space->ID());
3636	kprintf("id:\t\t0x%" B_PRIx32 "\n", area->id);
3637	kprintf("base:\t\t0x%lx\n", area->Base());
3638	kprintf("size:\t\t0x%lx\n", area->Size());
3639	kprintf("protection:\t0x%" B_PRIx32 "\n", area->protection);
3640	kprintf("page_protection:%p\n", area->page_protections);
3641	kprintf("wiring:\t\t0x%x\n", area->wiring);
3642	kprintf("memory_type:\t%#" B_PRIx32 "\n", area->MemoryType());
3643	kprintf("cache:\t\t%p\n", area->cache);
3644	kprintf("cache_type:\t%s\n", vm_cache_type_to_string(area->cache_type));
3645	kprintf("cache_offset:\t0x%" B_PRIx64 "\n", area->cache_offset);
3646	kprintf("cache_next:\t%p\n", area->cache_next);
3647	kprintf("cache_prev:\t%p\n", area->cache_prev);
3648
3649	VMAreaMappings::Iterator iterator = area->mappings.GetIterator();
3650	if (mappings) {
3651		kprintf("page mappings:\n");
3652		while (iterator.HasNext()) {
3653			vm_page_mapping* mapping = iterator.Next();
3654			kprintf("  %p", mapping->page);
3655		}
3656		kprintf("\n");
3657	} else {
3658		uint32 count = 0;
3659		while (iterator.Next() != NULL) {
3660			count++;
3661		}
3662		kprintf("page mappings:\t%" B_PRIu32 "\n", count);
3663	}
3664}
3665
3666
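/*!	Debugger command: dumps all areas matching the given id, address, or name.
	The search can be restricted to a single criterion by prefixing the
	specifier with one of the "id", "contains", "address", or "name" keywords.
*/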
3667static int
3668dump_area(int argc, char** argv)
3669{
3670	bool mappings = false;
3671	bool found = false;
3672	int32 index = 1;
3673	VMArea* area;
3674	addr_t num;
3675
3676	if (argc < 2 || !strcmp(argv[1], "--help")) {
3677		kprintf("usage: area [-m] [id|contains|address|name] <id|address|name>\n"
3678			"All areas matching either id/address/name are listed. You can\n"
3679			"restrict the search to a specific item by prefixing the\n"
3680			"specifier with the id/contains/address/name keywords.\n"
3681			"-m shows the area's mappings as well.\n");
3682		return 0;
3683	}
3684
3685	if (!strcmp(argv[1], "-m")) {
3686		mappings = true;
3687		index++;
3688	}
3689
3690	int32 mode = 0xf;
3691	if (!strcmp(argv[index], "id"))
3692		mode = 1;
3693	else if (!strcmp(argv[index], "contains"))
3694		mode = 2;
3695	else if (!strcmp(argv[index], "name"))
3696		mode = 4;
3697	else if (!strcmp(argv[index], "address"))
3698		mode = 0;
3699	if (mode != 0xf)
3700		index++;
3701
3702	if (index >= argc) {
3703		kprintf("No area specifier given.\n");
3704		return 0;
3705	}
3706
3707	num = parse_expression(argv[index]);
3708
3709	if (mode == 0) {
3710		dump_area_struct((struct VMArea*)num, mappings);
3711	} else {
3712		// walk through the area list, looking for the arguments as a name
3713
3714		VMAreasTree::Iterator it = VMAreas::GetIterator();
3715		while ((area = it.Next()) != NULL) {
3716			if (((mode & 4) != 0
3717					&& !strcmp(argv[index], area->name))
3718				|| (num != 0 && (((mode & 1) != 0 && (addr_t)area->id == num)
3719					|| (((mode & 2) != 0 && area->Base() <= num
3720						&& area->Base() + area->Size() > num))))) {
3721				dump_area_struct(area, mappings);
3722				found = true;
3723			}
3724		}
3725
3726		if (!found)
3727			kprintf("could not find area %s (%ld)\n", argv[index], num);
3728	}
3729
3730	return 0;
3731}
3732
3733
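/*!	Debugger command: lists all areas, optionally filtered by team ID or by a
	substring of the area name.
*/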
3734static int
3735dump_area_list(int argc, char** argv)
3736{
3737	VMArea* area;
3738	const char* name = NULL;
3739	int32 id = 0;
3740
3741	if (argc > 1) {
3742		id = parse_expression(argv[1]);
3743		if (id == 0)
3744			name = argv[1];
3745	}
3746
3747	kprintf("%-*s      id  %-*s    %-*sprotect lock  name\n",
3748		B_PRINTF_POINTER_WIDTH, "addr", B_PRINTF_POINTER_WIDTH, "base",
3749		B_PRINTF_POINTER_WIDTH, "size");
3750
3751	VMAreasTree::Iterator it = VMAreas::GetIterator();
3752	while ((area = it.Next()) != NULL) {
3753		if ((id != 0 && area->address_space->ID() != id)
3754			|| (name != NULL && strstr(area->name, name) == NULL))
3755			continue;
3756
3757		kprintf("%p %5" B_PRIx32 "  %p  %p %4" B_PRIx32 " %4d  %s\n", area,
3758			area->id, (void*)area->Base(), (void*)area->Size(),
3759			area->protection, area->wiring, area->name);
3760	}
3761	return 0;
3762}
3763
3764
3765static int
3766dump_available_memory(int argc, char** argv)
3767{
3768	kprintf("Available memory: %" B_PRIdOFF "/%" B_PRIuPHYSADDR " bytes\n",
3769		sAvailableMemory, (phys_addr_t)vm_page_num_pages() * B_PAGE_SIZE);
3770	return 0;
3771}
3772
3773
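/*!	Debugger command: prints low-level page mapping information for a virtual
	address, or -- with "-r" respectively "-p" -- performs a reverse lookup of
	a physical address respectively a vm_page in the translation maps of the
	given team or of all teams.
*/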
3774static int
3775dump_mapping_info(int argc, char** argv)
3776{
3777	bool reverseLookup = false;
3778	bool pageLookup = false;
3779
3780	int argi = 1;
3781	for (; argi < argc && argv[argi][0] == '-'; argi++) {
3782		const char* arg = argv[argi];
3783		if (strcmp(arg, "-r") == 0) {
3784			reverseLookup = true;
3785		} else if (strcmp(arg, "-p") == 0) {
3786			reverseLookup = true;
3787			pageLookup = true;
3788		} else {
3789			print_debugger_command_usage(argv[0]);
3790			return 0;
3791		}
3792	}
3793
3794	// We need at least one argument, the address. Optionally a thread ID can be
3795	// specified.
3796	if (argi >= argc || argi + 2 < argc) {
3797		print_debugger_command_usage(argv[0]);
3798		return 0;
3799	}
3800
3801	uint64 addressValue;
3802	if (!evaluate_debug_expression(argv[argi++], &addressValue, false))
3803		return 0;
3804
3805	Team* team = NULL;
3806	if (argi < argc) {
3807		uint64 threadID;
3808		if (!evaluate_debug_expression(argv[argi++], &threadID, false))
3809			return 0;
3810
3811		Thread* thread = Thread::GetDebug(threadID);
3812		if (thread == NULL) {
3813			kprintf("Invalid thread/team ID \"%s\"\n", argv[argi - 1]);
3814			return 0;
3815		}
3816
3817		team = thread->team;
3818	}
3819
3820	if (reverseLookup) {
3821		phys_addr_t physicalAddress;
3822		if (pageLookup) {
3823			vm_page* page = (vm_page*)(addr_t)addressValue;
3824			physicalAddress = page->physical_page_number * B_PAGE_SIZE;
3825		} else {
3826			physicalAddress = (phys_addr_t)addressValue;
3827			physicalAddress -= physicalAddress % B_PAGE_SIZE;
3828		}
3829
3830		kprintf("    Team     Virtual Address      Area\n");
3831		kprintf("--------------------------------------\n");
3832
3833		struct Callback : VMTranslationMap::ReverseMappingInfoCallback {
3834			Callback()
3835				:
3836				fAddressSpace(NULL)
3837			{
3838			}
3839
3840			void SetAddressSpace(VMAddressSpace* addressSpace)
3841			{
3842				fAddressSpace = addressSpace;
3843			}
3844
3845			virtual bool HandleVirtualAddress(addr_t virtualAddress)
3846			{
3847				kprintf("%8" B_PRId32 "  %#18" B_PRIxADDR, fAddressSpace->ID(),
3848					virtualAddress);
3849				if (VMArea* area = fAddressSpace->LookupArea(virtualAddress))
3850					kprintf("  %8" B_PRId32 " %s\n", area->id, area->name);
3851				else
3852					kprintf("\n");
3853				return false;
3854			}
3855
3856		private:
3857			VMAddressSpace*	fAddressSpace;
3858		} callback;
3859
3860		if (team != NULL) {
3861			// team specified -- get its address space
3862			VMAddressSpace* addressSpace = team->address_space;
3863			if (addressSpace == NULL) {
3864				kprintf("Failed to get address space!\n");
3865				return 0;
3866			}
3867
3868			callback.SetAddressSpace(addressSpace);
3869			addressSpace->TranslationMap()->DebugGetReverseMappingInfo(
3870				physicalAddress, callback);
3871		} else {
3872			// no team specified -- iterate through all address spaces
3873			for (VMAddressSpace* addressSpace = VMAddressSpace::DebugFirst();
3874				addressSpace != NULL;
3875				addressSpace = VMAddressSpace::DebugNext(addressSpace)) {
3876				callback.SetAddressSpace(addressSpace);
3877				addressSpace->TranslationMap()->DebugGetReverseMappingInfo(
3878					physicalAddress, callback);
3879			}
3880		}
3881	} else {
3882		// get the address space
3883		addr_t virtualAddress = (addr_t)addressValue;
3884		virtualAddress -= virtualAddress % B_PAGE_SIZE;
3885		VMAddressSpace* addressSpace;
3886		if (IS_KERNEL_ADDRESS(virtualAddress)) {
3887			addressSpace = VMAddressSpace::Kernel();
3888		} else if (team != NULL) {
3889			addressSpace = team->address_space;
3890		} else {
3891			Thread* thread = debug_get_debugged_thread();
3892			if (thread == NULL || thread->team == NULL) {
3893				kprintf("Failed to get team!\n");
3894				return 0;
3895			}
3896
3897			addressSpace = thread->team->address_space;
3898		}
3899
3900		if (addressSpace == NULL) {
3901			kprintf("Failed to get address space!\n");
3902			return 0;
3903		}
3904
3905		// let the translation map implementation do the job
3906		addressSpace->TranslationMap()->DebugPrintMappingInfo(virtualAddress);
3907	}
3908
3909	return 0;
3910}
3911
3912
3913/*!	Deletes all areas and reserved regions in the given address space.
3914
3915	The caller must ensure that none of the areas has any wired ranges.
3916
3917	\param addressSpace The address space.
3918	\param deletingAddressSpace \c true, if the address space is in the process
3919		of being deleted.
3920*/
3921void
3922vm_delete_areas(struct VMAddressSpace* addressSpace, bool deletingAddressSpace)
3923{
3924	TRACE(("vm_delete_areas: called on address space 0x%" B_PRIx32 "\n",
3925		addressSpace->ID()));
3926
3927	addressSpace->WriteLock();
3928
3929	// remove all reserved areas in this address space
3930	addressSpace->UnreserveAllAddressRanges(0);
3931
3932	// delete all the areas in this address space
3933	while (VMArea* area = addressSpace->FirstArea()) {
3934		ASSERT(!area->IsWired());
3935		delete_area(addressSpace, area, deletingAddressSpace);
3936	}
3937
3938	addressSpace->WriteUnlock();
3939}
3940
3941
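/*!	Returns the ID of the area containing \a address in the current team's
	address space (or in the kernel address space for kernel addresses).
	Returns an error if there is no such area, or if it is a kernel-only area
	and the lookup was requested on behalf of userland.
*/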
3942static area_id
3943vm_area_for(addr_t address, bool kernel)
3944{
3945	team_id team;
3946	if (IS_USER_ADDRESS(address)) {
3947		// we try the user team address space, if any
3948		team = VMAddressSpace::CurrentID();
3949		if (team < 0)
3950			return team;
3951	} else
3952		team = VMAddressSpace::KernelID();
3953
3954	AddressSpaceReadLocker locker(team);
3955	if (!locker.IsLocked())
3956		return B_BAD_TEAM_ID;
3957
3958	VMArea* area = locker.AddressSpace()->LookupArea(address);
3959	if (area != NULL) {
3960		if (!kernel && (area->protection & (B_READ_AREA | B_WRITE_AREA)) == 0
3961				&& (area->protection & B_KERNEL_AREA) != 0)
3962			return B_ERROR;
3963
3964		return area->id;
3965	}
3966
3967	return B_ERROR;
3968}
3969
3970
3971/*!	Frees physical pages that were used during the boot process.
3972	\a end is inclusive.
3973*/
3974static void
3975unmap_and_free_physical_pages(VMTranslationMap* map, addr_t start, addr_t end)
3976{
3977	// free all physical pages in the specified range
3978
3979	for (addr_t current = start; current < end; current += B_PAGE_SIZE) {
3980		phys_addr_t physicalAddress;
3981		uint32 flags;
3982
3983		if (map->Query(current, &physicalAddress, &flags) == B_OK
3984			&& (flags & PAGE_PRESENT) != 0) {
3985			vm_page* page = vm_lookup_page(physicalAddress / B_PAGE_SIZE);
3986			if (page != NULL && page->State() != PAGE_STATE_FREE
3987					&& page->State() != PAGE_STATE_CLEAR
3988					&& page->State() != PAGE_STATE_UNUSED) {
3989				DEBUG_PAGE_ACCESS_START(page);
3990				vm_page_set_state(page, PAGE_STATE_FREE);
3991			}
3992		}
3993	}
3994
3995	// unmap the memory
3996	map->Unmap(start, end);
3997}
3998
3999
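/*!	Frees and unmaps the parts of the given kernel address range that are not
	covered by any area, returning the physical pages the boot loader used
	there to the free list.
*/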
4000void
4001vm_free_unused_boot_loader_range(addr_t start, addr_t size)
4002{
4003	VMTranslationMap* map = VMAddressSpace::Kernel()->TranslationMap();
4004	addr_t end = start + (size - 1);
4005	addr_t lastEnd = start;
4006
4007	TRACE(("vm_free_unused_boot_loader_range(): asked to free %p - %p\n",
4008		(void*)start, (void*)end));
4009
4010	// The areas are sorted in virtual address space order, so
4011	// we just have to find the holes between them that fall
4012	// into the range we are supposed to dispose of.
4013
4014	map->Lock();
4015
4016	for (VMAddressSpace::AreaIterator it
4017				= VMAddressSpace::Kernel()->GetAreaIterator();
4018			VMArea* area = it.Next();) {
4019		addr_t areaStart = area->Base();
4020		addr_t areaEnd = areaStart + (area->Size() - 1);
4021
4022		if (areaEnd < start)
4023			continue;
4024
4025		if (areaStart > end) {
4026			// we are done, the area is already beyond what we have to free
4027			break;
4028		}
4029
4030		if (areaStart > lastEnd) {
4031			// this is something we can free
4032			TRACE(("free boot range: get rid of %p - %p\n", (void*)lastEnd,
4033				(void*)areaStart));
4034			unmap_and_free_physical_pages(map, lastEnd, areaStart - 1);
4035		}
4036
4037		if (areaEnd >= end) {
4038			lastEnd = areaEnd;
4039				// no +1 to prevent potential overflow
4040			break;
4041		}
4042
4043		lastEnd = areaEnd + 1;
4044	}
4045
4046	if (lastEnd < end) {
4047		// we can also get rid of some space at the end of the area
4048		TRACE(("free boot range: also remove %p - %p\n", (void*)lastEnd,
4049			(void*)end));
4050		unmap_and_free_physical_pages(map, lastEnd, end);
4051	}
4052
4053	map->Unlock();
4054}
4055
4056
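/*!	Creates "_text" and "_data" areas for the given preloaded image's already
	wired regions, naming them after the image's file name.
*/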
4057static void
4058create_preloaded_image_areas(struct preloaded_image* _image)
4059{
4060	preloaded_elf_image* image = static_cast<preloaded_elf_image*>(_image);
4061	char name[B_OS_NAME_LENGTH];
4062	void* address;
4063	int32 length;
4064
4065	// use file name to create a good area name
4066	char* fileName = strrchr(image->name, '/');
4067	if (fileName == NULL)
4068		fileName = image->name;
4069	else
4070		fileName++;
4071
4072	length = strlen(fileName);
4073	// make sure there is enough space for the suffix
4074	if (length > 25)
4075		length = 25;
4076
4077	memcpy(name, fileName, length);
4078	strcpy(name + length, "_text");
4079	address = (void*)ROUNDDOWN(image->text_region.start, B_PAGE_SIZE);
4080	image->text_region.id = create_area(name, &address, B_EXACT_ADDRESS,
4081		PAGE_ALIGN(image->text_region.size), B_ALREADY_WIRED,
4082		B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
4083		// this will later be remapped read-only/executable by the
4084		// ELF initialization code
4085
4086	strcpy(name + length, "_data");
4087	address = (void*)ROUNDDOWN(image->data_region.start, B_PAGE_SIZE);
4088	image->data_region.id = create_area(name, &address, B_EXACT_ADDRESS,
4089		PAGE_ALIGN(image->data_region.size), B_ALREADY_WIRED,
4090		B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
4091}
4092
4093
4094/*!	Frees all kernel args areas that were previously allocated from the
4095	kernel_args structure. Any boot loader resources contained in those
4096	arguments must not be accessed anymore past this point.
4097*/
4098void
4099vm_free_kernel_args(kernel_args* args)
4100{
4101	uint32 i;
4102
4103	TRACE(("vm_free_kernel_args()\n"));
4104
4105	for (i = 0; i < args->num_kernel_args_ranges; i++) {
4106		area_id area = area_for((void*)(addr_t)args->kernel_args_range[i].start);
4107		if (area >= B_OK)
4108			delete_area(area);
4109	}
4110}
4111
4112
4113static void
4114allocate_kernel_args(kernel_args* args)
4115{
4116	TRACE(("allocate_kernel_args()\n"));
4117
4118	for (uint32 i = 0; i < args->num_kernel_args_ranges; i++) {
4119		void* address = (void*)(addr_t)args->kernel_args_range[i].start;
4120
4121		create_area("_kernel args_", &address, B_EXACT_ADDRESS,
4122			args->kernel_args_range[i].size, B_ALREADY_WIRED,
4123			B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
4124	}
4125}
4126
4127
4128static void
4129unreserve_boot_loader_ranges(kernel_args* args)
4130{
4131	TRACE(("unreserve_boot_loader_ranges()\n"));
4132
4133	for (uint32 i = 0; i < args->num_virtual_allocated_ranges; i++) {
4134		vm_unreserve_address_range(VMAddressSpace::KernelID(),
4135			(void*)(addr_t)args->virtual_allocated_range[i].start,
4136			args->virtual_allocated_range[i].size);
4137	}
4138}
4139
4140
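/*!	Reserves the kernel address space ranges the boot loader has already
	allocated (as recorded in the kernel_args), so they won't be handed out
	again. Non-kernel addresses are skipped and left to the architecture
	specific code.
*/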
4141static void
4142reserve_boot_loader_ranges(kernel_args* args)
4143{
4144	TRACE(("reserve_boot_loader_ranges()\n"));
4145
4146	for (uint32 i = 0; i < args->num_virtual_allocated_ranges; i++) {
4147		void* address = (void*)(addr_t)args->virtual_allocated_range[i].start;
4148
4149		// If the address is not a kernel address, we just skip it. The
4150		// architecture specific code has to deal with it.
4151		if (!IS_KERNEL_ADDRESS(address)) {
4152			dprintf("reserve_boot_loader_ranges(): Skipping range: %p, %"
4153				B_PRIu64 "\n", address, args->virtual_allocated_range[i].size);
4154			continue;
4155		}
4156
4157		status_t status = vm_reserve_address_range(VMAddressSpace::KernelID(),
4158			&address, B_EXACT_ADDRESS, args->virtual_allocated_range[i].size, 0);
4159		if (status < B_OK)
4160			panic("could not reserve boot loader ranges\n");
4161	}
4162}
4163
4164
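/*!	Finds a free spot of the given size in the kernel's virtual address space
	by examining the gaps between (and around) the ranges recorded in
	\a args->virtual_allocated_range, and extends the respective range to
	cover the allocation. Returns the base address, or 0 on failure.
*/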
4165static addr_t
4166allocate_early_virtual(kernel_args* args, size_t size, addr_t alignment)
4167{
4168	size = PAGE_ALIGN(size);
4169
4170	// find a slot in the virtual allocation addr range
4171	for (uint32 i = 1; i < args->num_virtual_allocated_ranges; i++) {
4172		// check to see if the space between this one and the last is big enough
4173		addr_t rangeStart = args->virtual_allocated_range[i].start;
4174		addr_t previousRangeEnd = args->virtual_allocated_range[i - 1].start
4175			+ args->virtual_allocated_range[i - 1].size;
4176
4177		addr_t base = alignment > 0
4178			? ROUNDUP(previousRangeEnd, alignment) : previousRangeEnd;
4179
4180		if (base >= KERNEL_BASE && base < rangeStart
4181				&& rangeStart - base >= size) {
4182			args->virtual_allocated_range[i - 1].size
4183				+= base + size - previousRangeEnd;
4184			return base;
4185		}
4186	}
4187
4188	// We didn't find a gap between the allocation ranges. That's OK;
4189	// see if there's a gap after the last one.
4190	int lastEntryIndex = args->num_virtual_allocated_ranges - 1;
4191	addr_t lastRangeEnd = args->virtual_allocated_range[lastEntryIndex].start
4192		+ args->virtual_allocated_range[lastEntryIndex].size;
4193	addr_t base = alignment > 0
4194		? ROUNDUP(lastRangeEnd, alignment) : lastRangeEnd;
4195	if (KERNEL_BASE + (KERNEL_SIZE - 1) - base >= size) {
4196		args->virtual_allocated_range[lastEntryIndex].size
4197			+= base + size - lastRangeEnd;
4198		return base;
4199	}
4200
4201	// see if there's a gap before the first one
4202	addr_t rangeStart = args->virtual_allocated_range[0].start;
4203	if (rangeStart > KERNEL_BASE && rangeStart - KERNEL_BASE >= size) {
4204		base = rangeStart - size;
4205		if (alignment > 0)
4206			base = ROUNDDOWN(base, alignment);
4207
4208		if (base >= KERNEL_BASE) {
4209			args->virtual_allocated_range[0].start = base;
4210			args->virtual_allocated_range[0].size += rangeStart - base;
4211			return base;
4212		}
4213	}
4214
4215	return 0;
4216}
4217
4218
4219static bool
4220is_page_in_physical_memory_range(kernel_args* args, phys_addr_t address)
4221{
4222	// TODO: horrible brute-force method of determining if the page can be
4223	// allocated
4224	for (uint32 i = 0; i < args->num_physical_memory_ranges; i++) {
4225		if (address >= args->physical_memory_range[i].start
4226			&& address < args->physical_memory_range[i].start
4227				+ args->physical_memory_range[i].size)
4228			return true;
4229	}
4230	return false;
4231}
4232
4233
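/*!	Allocates one physical page during early boot by extending one of the
	kernel_args' physical_allocated_range entries, first upwards, then
	downwards. Returns the page number, or 0 if no page could be allocated.
*/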
4234page_num_t
4235vm_allocate_early_physical_page(kernel_args* args)
4236{
4237	for (uint32 i = 0; i < args->num_physical_allocated_ranges; i++) {
4238		phys_addr_t nextPage;
4239
4240		nextPage = args->physical_allocated_range[i].start
4241			+ args->physical_allocated_range[i].size;
4242		// see if the page right after this allocated paddr run can be allocated
4243		if (i + 1 < args->num_physical_allocated_ranges
4244			&& args->physical_allocated_range[i + 1].size != 0) {
4245			// see if the next page will collide with the next allocated range
4246			if (nextPage >= args->physical_allocated_range[i+1].start)
4247				continue;
4248		}
4249		// see if the next physical page fits in the memory block
4250		if (is_page_in_physical_memory_range(args, nextPage)) {
4251			// we got one!
4252			args->physical_allocated_range[i].size += B_PAGE_SIZE;
4253			return nextPage / B_PAGE_SIZE;
4254		}
4255	}
4256
4257	// Expanding upwards didn't work, try going downwards.
4258	for (uint32 i = 0; i < args->num_physical_allocated_ranges; i++) {
4259		phys_addr_t nextPage;
4260
4261		nextPage = args->physical_allocated_range[i].start - B_PAGE_SIZE;
4262		// see if the page right before this allocated paddr run can be allocated
4263		if (i > 0 && args->physical_allocated_range[i - 1].size != 0) {
4264			// see if the page would collide with the previous allocated range
4265			if (nextPage < args->physical_allocated_range[i-1].start
4266				+ args->physical_allocated_range[i-1].size)
4267				continue;
4268		}
4269		// see if the next physical page fits in the memory block
4270		if (is_page_in_physical_memory_range(args, nextPage)) {
4271			// we got one!
4272			args->physical_allocated_range[i].start -= B_PAGE_SIZE;
4273			args->physical_allocated_range[i].size += B_PAGE_SIZE;
4274			return nextPage / B_PAGE_SIZE;
4275		}
4276	}
4277
4278	return 0;
4279		// could not allocate a block
4280}
4281
4282
4283/*!	This one uses the kernel_args' physical and virtual memory ranges to
4284	allocate some pages before the VM is completely up.
4285*/
4286addr_t
4287vm_allocate_early(kernel_args* args, size_t virtualSize, size_t physicalSize,
4288	uint32 attributes, addr_t alignment)
4289{
4290	if (physicalSize > virtualSize)
4291		physicalSize = virtualSize;
4292
4293	// find the vaddr to allocate at
4294	addr_t virtualBase = allocate_early_virtual(args, virtualSize, alignment);
4295	//dprintf("vm_allocate_early: vaddr 0x%lx\n", virtualBase);
4296	if (virtualBase == 0) {
4297		panic("vm_allocate_early: could not allocate virtual address\n");
4298		return 0;
4299	}
4300
4301	// map the pages
4302	for (uint32 i = 0; i < PAGE_ALIGN(physicalSize) / B_PAGE_SIZE; i++) {
4303		page_num_t physicalAddress = vm_allocate_early_physical_page(args);
4304		if (physicalAddress == 0)
4305			panic("error allocating early page!\n");
4306
4307		//dprintf("vm_allocate_early: paddr 0x%lx\n", physicalAddress);
4308
4309		arch_vm_translation_map_early_map(args, virtualBase + i * B_PAGE_SIZE,
4310			physicalAddress * B_PAGE_SIZE, attributes,
4311			&vm_allocate_early_physical_page);
4312	}
4313
4314	return virtualBase;
4315}
4316
4317
4318/*!	The main entrance point to initialize the VM. */
4319status_t
4320vm_init(kernel_args* args)
4321{
4322	struct preloaded_image* image;
4323	void* address;
4324	status_t err = 0;
4325	uint32 i;
4326
4327	TRACE(("vm_init: entry\n"));
4328	err = arch_vm_translation_map_init(args, &sPhysicalPageMapper);
4329	err = arch_vm_init(args);
4330
4331	// initialize some globals
4332	vm_page_init_num_pages(args);
4333	sAvailableMemory = vm_page_num_pages() * B_PAGE_SIZE;
4334
4335	slab_init(args);
4336
4337#if USE_DEBUG_HEAP_FOR_MALLOC || USE_GUARDED_HEAP_FOR_MALLOC
4338	off_t heapSize = INITIAL_HEAP_SIZE;
4339	// try to accommodate low memory systems
4340	while (heapSize > sAvailableMemory / 8)
4341		heapSize /= 2;
4342	if (heapSize < 1024 * 1024)
4343		panic("vm_init: go buy some RAM please.");
4344
4345	// map in the new heap and initialize it
4346	addr_t heapBase = vm_allocate_early(args, heapSize, heapSize,
4347		B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA, 0);
4348	TRACE(("heap at 0x%lx\n", heapBase));
4349	heap_init(heapBase, heapSize);
4350#endif
4351
4352	// initialize the free page list and physical page mapper
4353	vm_page_init(args);
4354
4355	// initialize the cache allocators
4356	vm_cache_init(args);
4357
4358	{
4359		status_t error = VMAreas::Init();
4360		if (error != B_OK)
4361			panic("vm_init: error initializing areas map\n");
4362	}
4363
4364	VMAddressSpace::Init();
4365	reserve_boot_loader_ranges(args);
4366
4367#if USE_DEBUG_HEAP_FOR_MALLOC || USE_GUARDED_HEAP_FOR_MALLOC
4368	heap_init_post_area();
4369#endif
4370
4371	// Do any further initialization that the architecture dependent layers
4372	// may need now.
4373	arch_vm_translation_map_init_post_area(args);
4374	arch_vm_init_post_area(args);
4375	vm_page_init_post_area(args);
4376	slab_init_post_area();
4377
4378	// allocate areas to represent stuff that already exists
4379
4380#if USE_DEBUG_HEAP_FOR_MALLOC || USE_GUARDED_HEAP_FOR_MALLOC
4381	address = (void*)ROUNDDOWN(heapBase, B_PAGE_SIZE);
4382	create_area("kernel heap", &address, B_EXACT_ADDRESS, heapSize,
4383		B_ALREADY_WIRED, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
4384#endif
4385
4386	allocate_kernel_args(args);
4387
4388	create_preloaded_image_areas(args->kernel_image);
4389
4390	// allocate areas for preloaded images
4391	for (image = args->preloaded_images; image != NULL; image = image->next)
4392		create_preloaded_image_areas(image);
4393
4394	// allocate kernel stacks
4395	for (i = 0; i < args->num_cpus; i++) {
4396		char name[64];
4397
4398		sprintf(name, "idle thread %" B_PRIu32 " kstack", i + 1);
4399		address = (void*)args->cpu_kstack[i].start;
4400		create_area(name, &address, B_EXACT_ADDRESS, args->cpu_kstack[i].size,
4401			B_ALREADY_WIRED, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
4402	}
4403
4404	void* lastPage = (void*)ROUNDDOWN(~(addr_t)0, B_PAGE_SIZE);
4405	vm_block_address_range("overflow protection", lastPage, B_PAGE_SIZE);
4406
4407#if PARANOID_KERNEL_MALLOC
4408	vm_block_address_range("uninitialized heap memory",
4409		(void *)ROUNDDOWN(0xcccccccc, B_PAGE_SIZE), B_PAGE_SIZE * 64);
4410#endif
4411#if PARANOID_KERNEL_FREE
4412	vm_block_address_range("freed heap memory",
4413		(void *)ROUNDDOWN(0xdeadbeef, B_PAGE_SIZE), B_PAGE_SIZE * 64);
4414#endif
4415
4416	// create the object cache for the page mappings
4417	gPageMappingsObjectCache = create_object_cache_etc("page mappings",
4418		sizeof(vm_page_mapping), 0, 0, 64, 128, CACHE_LARGE_SLAB, NULL, NULL,
4419		NULL, NULL);
4420	if (gPageMappingsObjectCache == NULL)
4421		panic("failed to create page mappings object cache");
4422
4423	object_cache_set_minimum_reserve(gPageMappingsObjectCache, 1024);
4424
4425#if DEBUG_CACHE_LIST
4426	if (vm_page_num_free_pages() >= 200 * 1024 * 1024 / B_PAGE_SIZE) {
4427		virtual_address_restrictions virtualRestrictions = {};
4428		virtualRestrictions.address_specification = B_ANY_KERNEL_ADDRESS;
4429		physical_address_restrictions physicalRestrictions = {};
4430		create_area_etc(VMAddressSpace::KernelID(), "cache info table",
4431			ROUNDUP(kCacheInfoTableCount * sizeof(cache_info), B_PAGE_SIZE),
4432			B_FULL_LOCK, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA,
4433			CREATE_AREA_DONT_WAIT, 0, &virtualRestrictions,
4434			&physicalRestrictions, (void**)&sCacheInfoTable);
4435	}
4436#endif	// DEBUG_CACHE_LIST
4437
4438	// add some debugger commands
4439	add_debugger_command("areas", &dump_area_list, "Dump a list of all areas");
4440	add_debugger_command("area", &dump_area,
4441		"Dump info about a particular area");
4442	add_debugger_command("cache", &dump_cache, "Dump VMCache");
4443	add_debugger_command("cache_tree", &dump_cache_tree, "Dump VMCache tree");
4444#if DEBUG_CACHE_LIST
4445	if (sCacheInfoTable != NULL) {
4446		add_debugger_command_etc("caches", &dump_caches,
4447			"List all VMCache trees",
4448			"[ \"-c\" ]\n"
4449			"All cache trees are listed sorted in decreasing order by number "
4450				"of\n"
4451			"used pages or, if \"-c\" is specified, by size of committed "
4452				"memory.\n",
4453			0);
4454	}
4455#endif
4456	add_debugger_command("avail", &dump_available_memory,
4457		"Dump available memory");
4458	add_debugger_command("dl", &display_mem, "dump memory long words (64-bit)");
4459	add_debugger_command("dw", &display_mem, "dump memory words (32-bit)");
4460	add_debugger_command("ds", &display_mem, "dump memory shorts (16-bit)");
4461	add_debugger_command("db", &display_mem, "dump memory bytes (8-bit)");
4462	add_debugger_command("string", &display_mem, "dump strings");
4463
4464	add_debugger_command_etc("mapping", &dump_mapping_info,
4465		"Print address mapping information",
4466		"[ \"-r\" | \"-p\" ] <address> [ <thread ID> ]\n"
4467		"Prints low-level page mapping information for a given address. If\n"
4468		"neither \"-r\" nor \"-p\" are specified, <address> is a virtual\n"
4469		"address that is looked up in the translation map of the current\n"
4470		"team, respectively the team specified by thread ID <thread ID>. If\n"
4471		"\"-r\" is specified, <address> is a physical address that is\n"
4472		"searched in the translation map of all teams, respectively the team\n"
4473		"specified by thread ID <thread ID>. If \"-p\" is specified,\n"
4474		"<address> is the address of a vm_page structure. The behavior is\n"
4475		"equivalent to specifying \"-r\" with the physical address of that\n"
4476		"page.\n",
4477		0);
4478
4479	TRACE(("vm_init: exit\n"));
4480
4481	vm_cache_init_post_heap();
4482
4483	return err;
4484}
4485
4486
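/*!	Second VM initialization phase, called once semaphores are available.
	Releases the unused boot loader resources and gives the translation map,
	slab, and heap implementations a chance to create the locks they could
	not allocate earlier.
*/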
4487status_t
4488vm_init_post_sem(kernel_args* args)
4489{
4490	// This frees all unused boot loader resources and makes their space
4491	// available again.
4492	arch_vm_init_end(args);
4493	unreserve_boot_loader_ranges(args);
4494
4495	// Fill in all of the semaphores that were not allocated before.
4496	// Since we're still single threaded and only the kernel address space
4497	// exists, it isn't that hard to find all of the ones we need to create.
4498
4499	arch_vm_translation_map_init_post_sem(args);
4500
4501	slab_init_post_sem();
4502
4503#if USE_DEBUG_HEAP_FOR_MALLOC || USE_GUARDED_HEAP_FOR_MALLOC
4504	heap_init_post_sem();
4505#endif
4506
4507	return B_OK;
4508}
4509
4510
4511status_t
4512vm_init_post_thread(kernel_args* args)
4513{
4514	vm_page_init_post_thread(args);
4515	slab_init_post_thread();
4516	return heap_init_post_thread();
4517}
4518
4519
4520status_t
4521vm_init_post_modules(kernel_args* args)
4522{
4523	return arch_vm_init_post_modules(args);
4524}
4525
4526
4527void
4528permit_page_faults(void)
4529{
4530	Thread* thread = thread_get_current_thread();
4531	if (thread != NULL)
4532		atomic_add(&thread->page_faults_allowed, 1);
4533}
4534
4535
4536void
4537forbid_page_faults(void)
4538{
4539	Thread* thread = thread_get_current_thread();
4540	if (thread != NULL)
4541		atomic_add(&thread->page_faults_allowed, -1);
4542}
4543
4544
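/*!	Handles a page fault at \a address by resolving it via vm_soft_fault().
	On failure, a kernel fault either jumps to the thread's fault handler (if
	one is set) or panics, while a userland fault results in a SIGSEGV being
	sent to the faulting thread.
*/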
4545status_t
4546vm_page_fault(addr_t address, addr_t faultAddress, bool isWrite, bool isExecute,
4547	bool isUser, addr_t* newIP)
4548{
4549	FTRACE(("vm_page_fault: page fault at 0x%lx, ip 0x%lx\n", address,
4550		faultAddress));
4551
4552	TPF(PageFaultStart(address, isWrite, isUser, faultAddress));
4553
4554	addr_t pageAddress = ROUNDDOWN(address, B_PAGE_SIZE);
4555	VMAddressSpace* addressSpace = NULL;
4556
4557	status_t status = B_OK;
4558	*newIP = 0;
4559	atomic_add((int32*)&sPageFaults, 1);
4560
4561	if (IS_KERNEL_ADDRESS(pageAddress)) {
4562		addressSpace = VMAddressSpace::GetKernel();
4563	} else if (IS_USER_ADDRESS(pageAddress)) {
4564		addressSpace = VMAddressSpace::GetCurrent();
4565		if (addressSpace == NULL) {
4566			if (!isUser) {
4567				dprintf("vm_page_fault: kernel thread accessing invalid user "
4568					"memory!\n");
4569				status = B_BAD_ADDRESS;
4570				TPF(PageFaultError(-1,
4571					VMPageFaultTracing
4572						::PAGE_FAULT_ERROR_KERNEL_BAD_USER_MEMORY));
4573			} else {
4574				// XXX weird state.
4575				panic("vm_page_fault: non kernel thread accessing user memory "
4576					"that doesn't exist!\n");
4577				status = B_BAD_ADDRESS;
4578			}
4579		}
4580	} else {
4581		// The hit was probably in the 64k DMZ between kernel and user space.
4582		// This keeps a user space thread from passing a buffer that crosses
4583		// into kernel space.
4584		status = B_BAD_ADDRESS;
4585		TPF(PageFaultError(-1,
4586			VMPageFaultTracing::PAGE_FAULT_ERROR_NO_ADDRESS_SPACE));
4587	}
4588
4589	if (status == B_OK) {
4590		status = vm_soft_fault(addressSpace, pageAddress, isWrite, isExecute,
4591			isUser, NULL);
4592	}
4593
4594	if (status < B_OK) {
4595		dprintf("vm_page_fault: vm_soft_fault returned error '%s' on fault at "
4596			"0x%lx, ip 0x%lx, write %d, user %d, exec %d, thread 0x%" B_PRIx32 "\n",
4597			strerror(status), address, faultAddress, isWrite, isUser, isExecute,
4598			thread_get_current_thread_id());
4599		if (!isUser) {
4600			Thread* thread = thread_get_current_thread();
4601			if (thread != NULL && thread->fault_handler != 0) {
4602				// this will cause the arch dependent page fault handler to
4603				// modify the IP on the interrupt frame or whatever to return
4604				// to this address
4605				*newIP = reinterpret_cast<uintptr_t>(thread->fault_handler);
4606			} else {
4607				// unhandled page fault in the kernel
4608				panic("vm_page_fault: unhandled page fault in kernel space at "
4609					"0x%lx, ip 0x%lx\n", address, faultAddress);
4610			}
4611		} else {
4612			Thread* thread = thread_get_current_thread();
4613
4614#ifdef TRACE_FAULTS
4615			VMArea* area = NULL;
4616			if (addressSpace != NULL) {
4617				addressSpace->ReadLock();
4618				area = addressSpace->LookupArea(faultAddress);
4619			}
4620
4621			dprintf("vm_page_fault: thread \"%s\" (%" B_PRId32 ") in team "
4622				"\"%s\" (%" B_PRId32 ") tried to %s address %#lx, ip %#lx "
4623				"(\"%s\" +%#lx)\n", thread->name, thread->id,
4624				thread->team->Name(), thread->team->id,
4625				isWrite ? "write" : (isExecute ? "execute" : "read"), address,
4626				faultAddress, area ? area->name : "???", faultAddress - (area ?
4627					area->Base() : 0x0));
4628
4629			if (addressSpace != NULL)
4630				addressSpace->ReadUnlock();
4631#endif
4632
4633			// If the thread has a signal handler for SIGSEGV, we simply
4634			// send it the signal. Otherwise we notify the user debugger
4635			// first.
4636			struct sigaction action;
4637			if ((sigaction(SIGSEGV, NULL, &action) == 0
4638					&& action.sa_handler != SIG_DFL
4639					&& action.sa_handler != SIG_IGN)
4640				|| user_debug_exception_occurred(B_SEGMENT_VIOLATION,
4641					SIGSEGV)) {
4642				Signal signal(SIGSEGV,
4643					status == B_PERMISSION_DENIED
4644						? SEGV_ACCERR : SEGV_MAPERR,
4645					EFAULT, thread->team->id);
4646				signal.SetAddress((void*)address);
4647				send_signal_to_thread(thread, signal, 0);
4648			}
4649		}
4650	}
4651
4652	if (addressSpace != NULL)
4653		addressSpace->Put();
4654
4655	return B_HANDLED_INTERRUPT;
4656}
4657
4658
4659struct PageFaultContext {
4660	AddressSpaceReadLocker	addressSpaceLocker;
4661	VMCacheChainLocker		cacheChainLocker;
4662
4663	VMTranslationMap*		map;
4664	VMCache*				topCache;
4665	off_t					cacheOffset;
4666	vm_page_reservation		reservation;
4667	bool					isWrite;
4668
4669	// return values
4670	vm_page*				page;
4671	bool					restart;
4672	bool					pageAllocated;
4673
4674
4675	PageFaultContext(VMAddressSpace* addressSpace, bool isWrite)
4676		:
4677		addressSpaceLocker(addressSpace, true),
4678		map(addressSpace->TranslationMap()),
4679		isWrite(isWrite)
4680	{
4681	}
4682
4683	~PageFaultContext()
4684	{
4685		UnlockAll();
4686		vm_page_unreserve_pages(&reservation);
4687	}
4688
4689	void Prepare(VMCache* topCache, off_t cacheOffset)
4690	{
4691		this->topCache = topCache;
4692		this->cacheOffset = cacheOffset;
4693		page = NULL;
4694		restart = false;
4695		pageAllocated = false;
4696
4697		cacheChainLocker.SetTo(topCache);
4698	}
4699
4700	void UnlockAll(VMCache* exceptCache = NULL)
4701	{
4702		topCache = NULL;
4703		addressSpaceLocker.Unlock();
4704		cacheChainLocker.Unlock(exceptCache);
4705	}
4706};
4707
4708
4709/*!	Gets the page that should be mapped into the area.
4710	Returns an error code other than \c B_OK, if the page couldn't be found or
4711	paged in. The locking state of the address space and the caches is undefined
4712	in that case.
4713	Returns \c B_OK with \c context.restart set to \c true, if the function
4714	had to unlock the address space and all caches and is supposed to be called
4715	again.
4716	Returns \c B_OK with \c context.restart set to \c false, if the page was
4717	found. It is returned in \c context.page. The address space will still be
4718	locked as well as all caches starting from the top cache to at least the
4719	cache the page lives in.
4720*/
4721static status_t
4722fault_get_page(PageFaultContext& context)
4723{
4724	VMCache* cache = context.topCache;
4725	VMCache* lastCache = NULL;
4726	vm_page* page = NULL;
4727
4728	while (cache != NULL) {
4729		// We already hold the lock of the cache at this point.
4730
4731		lastCache = cache;
4732
4733		page = cache->LookupPage(context.cacheOffset);
4734		if (page != NULL && page->busy) {
4735			// the page is busy -- wait for it to become unbusy
4736			context.UnlockAll(cache);
4737			cache->ReleaseRefLocked();
4738			cache->WaitForPageEvents(page, PAGE_EVENT_NOT_BUSY, false);
4739
4740			// restart the whole process
4741			context.restart = true;
4742			return B_OK;
4743		}
4744
4745		if (page != NULL)
4746			break;
4747
4748		// The current cache does not contain the page we're looking for.
4749
4750		// see if the backing store has it
4751		if (cache->HasPage(context.cacheOffset)) {
4752			// insert a fresh page and mark it busy -- we're going to read it in
4753			page = vm_page_allocate_page(&context.reservation,
4754				PAGE_STATE_ACTIVE | VM_PAGE_ALLOC_BUSY);
4755			cache->InsertPage(page, context.cacheOffset);
4756
4757			// We need to unlock all caches and the address space while reading
4758			// the page in. Keep a reference to the cache around.
4759			cache->AcquireRefLocked();
4760			context.UnlockAll();
4761
4762			// read the page in
4763			generic_io_vec vec;
4764			vec.base = (phys_addr_t)page->physical_page_number * B_PAGE_SIZE;
4765			generic_size_t bytesRead = vec.length = B_PAGE_SIZE;
4766
4767			status_t status = cache->Read(context.cacheOffset, &vec, 1,
4768				B_PHYSICAL_IO_REQUEST, &bytesRead);
4769
4770			cache->Lock();
4771
4772			if (status < B_OK) {
4773				// on error remove and free the page
4774				dprintf("reading page from cache %p returned: %s!\n",
4775					cache, strerror(status));
4776
4777				cache->NotifyPageEvents(page, PAGE_EVENT_NOT_BUSY);
4778				cache->RemovePage(page);
4779				vm_page_set_state(page, PAGE_STATE_FREE);
4780
4781				cache->ReleaseRefAndUnlock();
4782				return status;
4783			}
4784
4785			// mark the page unbusy again
4786			cache->MarkPageUnbusy(page);
4787
4788			DEBUG_PAGE_ACCESS_END(page);
4789
4790			// Since we needed to unlock everything temporarily, the area
4791			// situation might have changed. So we need to restart the whole
4792			// process.
4793			cache->ReleaseRefAndUnlock();
4794			context.restart = true;
4795			return B_OK;
4796		}
4797
4798		cache = context.cacheChainLocker.LockSourceCache();
4799	}
4800
4801	if (page == NULL) {
4802		// There was no adequate page, determine the cache for a clean one.
4803		// Read-only pages go into the deepest cache; only the top most cache
4804		// may have direct write access.
4805		cache = context.isWrite ? context.topCache : lastCache;
4806
4807		// allocate a clean page
4808		page = vm_page_allocate_page(&context.reservation,
4809			PAGE_STATE_ACTIVE | VM_PAGE_ALLOC_CLEAR);
4810		FTRACE(("vm_soft_fault: just allocated page 0x%" B_PRIxPHYSADDR "\n",
4811			page->physical_page_number));
4812
4813		// insert the new page into our cache
4814		cache->InsertPage(page, context.cacheOffset);
4815		context.pageAllocated = true;
4816	} else if (page->Cache() != context.topCache && context.isWrite) {
4817		// We have a page that has the data we want, but in the wrong cache
4818		// object so we need to copy it and stick it into the top cache.
4819		vm_page* sourcePage = page;
4820
4821		// TODO: If memory is low, it might be a good idea to steal the page
4822		// from our source cache -- if possible, that is.
4823		FTRACE(("get new page, copy it, and put it into the topmost cache\n"));
4824		page = vm_page_allocate_page(&context.reservation, PAGE_STATE_ACTIVE);
4825
4826		// To not needlessly kill concurrency we unlock all caches but the top
4827		// one while copying the page. Lacking another mechanism to ensure that
4828		// the source page doesn't disappear, we mark it busy.
4829		sourcePage->busy = true;
4830		context.cacheChainLocker.UnlockKeepRefs(true);
4831
4832		// copy the page
4833		vm_memcpy_physical_page(page->physical_page_number * B_PAGE_SIZE,
4834			sourcePage->physical_page_number * B_PAGE_SIZE);
4835
4836		context.cacheChainLocker.RelockCaches(true);
4837		sourcePage->Cache()->MarkPageUnbusy(sourcePage);
4838
4839		// insert the new page into our cache
4840		context.topCache->InsertPage(page, context.cacheOffset);
4841		context.pageAllocated = true;
4842	} else
4843		DEBUG_PAGE_ACCESS_START(page);
4844
4845	context.page = page;
4846	return B_OK;
4847}
4848
4849
4850/*!	Makes sure the address in the given address space is mapped.
4851
4852	\param addressSpace The address space.
4853	\param originalAddress The address. Doesn't need to be page aligned.
4854	\param isWrite If \c true the address shall be write-accessible.
4855	\param isUser If \c true the access is requested by a userland team.
4856	\param wirePage On success, if non \c NULL, the wired count of the page
4857		mapped at the given address is incremented and the page is returned
4858		via this parameter.
4859	\return \c B_OK on success, another error code otherwise.
4860*/
4861static status_t
4862vm_soft_fault(VMAddressSpace* addressSpace, addr_t originalAddress,
4863	bool isWrite, bool isExecute, bool isUser, vm_page** wirePage)
4864{
4865	FTRACE(("vm_soft_fault: thid 0x%" B_PRIx32 " address 0x%" B_PRIxADDR ", "
4866		"isWrite %d, isUser %d\n", thread_get_current_thread_id(),
4867		originalAddress, isWrite, isUser));
4868
4869	PageFaultContext context(addressSpace, isWrite);
4870
4871	addr_t address = ROUNDDOWN(originalAddress, B_PAGE_SIZE);
4872	status_t status = B_OK;
4873
4874	addressSpace->IncrementFaultCount();
4875
4876	// We may need up to 2 pages plus pages needed for mapping them -- reserving
4877	// the pages upfront makes sure we don't have any cache locked, so that the
4878	// page daemon/thief can do their job without problems.
4879	size_t reservePages = 2 + context.map->MaxPagesNeededToMap(originalAddress,
4880		originalAddress);
4881	context.addressSpaceLocker.Unlock();
4882	vm_page_reserve_pages(&context.reservation, reservePages,
4883		addressSpace == VMAddressSpace::Kernel()
4884			? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
4885
4886	while (true) {
4887		context.addressSpaceLocker.Lock();
4888
4889		// get the area the fault was in
4890		VMArea* area = addressSpace->LookupArea(address);
4891		if (area == NULL) {
4892			dprintf("vm_soft_fault: va 0x%lx not covered by area in address "
4893				"space\n", originalAddress);
4894			TPF(PageFaultError(-1,
4895				VMPageFaultTracing::PAGE_FAULT_ERROR_NO_AREA));
4896			status = B_BAD_ADDRESS;
4897			break;
4898		}
4899
4900		// check permissions
4901		uint32 protection = get_area_page_protection(area, address);
4902		if (isUser && (protection & B_USER_PROTECTION) == 0
4903				&& (area->protection & B_KERNEL_AREA) != 0) {
4904			dprintf("user access on kernel area 0x%" B_PRIx32 " at %p\n",
4905				area->id, (void*)originalAddress);
4906			TPF(PageFaultError(area->id,
4907				VMPageFaultTracing::PAGE_FAULT_ERROR_KERNEL_ONLY));
4908			status = B_PERMISSION_DENIED;
4909			break;
4910		}
4911		if (isWrite && (protection
4912				& (B_WRITE_AREA | (isUser ? 0 : B_KERNEL_WRITE_AREA))) == 0) {
4913			dprintf("write access attempted on write-protected area 0x%"
4914				B_PRIx32 " at %p\n", area->id, (void*)originalAddress);
4915			TPF(PageFaultError(area->id,
4916				VMPageFaultTracing::PAGE_FAULT_ERROR_WRITE_PROTECTED));
4917			status = B_PERMISSION_DENIED;
4918			break;
4919		} else if (isExecute && (protection
4920				& (B_EXECUTE_AREA | (isUser ? 0 : B_KERNEL_EXECUTE_AREA))) == 0) {
4921			dprintf("instruction fetch attempted on execute-protected area 0x%"
4922				B_PRIx32 " at %p\n", area->id, (void*)originalAddress);
4923			TPF(PageFaultError(area->id,
4924				VMPageFaultTracing::PAGE_FAULT_ERROR_EXECUTE_PROTECTED));
4925			status = B_PERMISSION_DENIED;
4926			break;
4927		} else if (!isWrite && !isExecute && (protection
4928				& (B_READ_AREA | (isUser ? 0 : B_KERNEL_READ_AREA))) == 0) {
4929			dprintf("read access attempted on read-protected area 0x%" B_PRIx32
4930				" at %p\n", area->id, (void*)originalAddress);
4931			TPF(PageFaultError(area->id,
4932				VMPageFaultTracing::PAGE_FAULT_ERROR_READ_PROTECTED));
4933			status = B_PERMISSION_DENIED;
4934			break;
4935		}
4936
4937		// We have the area, it was a valid access, so let's try to resolve the
4938		// page fault now.
4939		// At first, the top most cache from the area is investigated.
4940
4941		context.Prepare(vm_area_get_locked_cache(area),
4942			address - area->Base() + area->cache_offset);
4943
4944		// See if this cache has a fault handler -- this will do all the work
4945		// for us.
4946		{
4947			// Note, since the page fault is resolved with interrupts enabled,
4948			// the fault handler could be called more than once for the same
4949			// reason -- the store must take this into account.
4950			status = context.topCache->Fault(addressSpace, context.cacheOffset);
4951			if (status != B_BAD_HANDLER)
4952				break;
4953		}
4954
4955		// The top most cache has no fault handler, so let's see if the cache or
4956		// its sources already have the page we're searching for (we're going
4957		// from top to bottom).
4958		status = fault_get_page(context);
4959		if (status != B_OK) {
4960			TPF(PageFaultError(area->id, status));
4961			break;
4962		}
4963
4964		if (context.restart)
4965			continue;
4966
4967		// All went fine, all there is left to do is to map the page into the
4968		// address space.
4969		TPF(PageFaultDone(area->id, context.topCache, context.page->Cache(),
4970			context.page));
4971
4972		// If the page doesn't reside in the area's cache, we need to make sure
4973		// it's mapped in read-only, so that we cannot overwrite someone else's
4974		// data (copy-on-write)
4975		uint32 newProtection = protection;
4976		if (context.page->Cache() != context.topCache && !isWrite)
4977			newProtection &= ~(B_WRITE_AREA | B_KERNEL_WRITE_AREA);
4978
4979		bool unmapPage = false;
4980		bool mapPage = true;
4981
4982		// check whether there's already a page mapped at the address
4983		context.map->Lock();
4984
4985		phys_addr_t physicalAddress;
4986		uint32 flags;
4987		vm_page* mappedPage = NULL;
4988		if (context.map->Query(address, &physicalAddress, &flags) == B_OK
4989			&& (flags & PAGE_PRESENT) != 0
4990			&& (mappedPage = vm_lookup_page(physicalAddress / B_PAGE_SIZE))
4991				!= NULL) {
4992			// Yep there's already a page. If it's ours, we can simply adjust
4993			// its protection. Otherwise we have to unmap it.
4994			if (mappedPage == context.page) {
4995				context.map->ProtectPage(area, address, newProtection);
4996					// Note: We assume that ProtectPage() is atomic (i.e.
4997					// the page isn't temporarily unmapped), otherwise we'd have
4998					// to make sure it isn't wired.
4999				mapPage = false;
5000			} else
5001				unmapPage = true;
5002		}
5003
5004		context.map->Unlock();
5005
5006		if (unmapPage) {
5007			// If the page is wired, we can't unmap it. Wait until it is unwired
5008			// again and restart. Note that the page cannot be wired for
5009			// writing, since it isn't in the topmost cache. So we can safely
5010			// ignore ranges wired for writing (our own and other concurrent
5011			// wiring attempts in progress) and in fact have to do that to avoid
5012			// a deadlock.
5013			VMAreaUnwiredWaiter waiter;
5014			if (area->AddWaiterIfWired(&waiter, address, B_PAGE_SIZE,
5015					VMArea::IGNORE_WRITE_WIRED_RANGES)) {
5016				// unlock everything and wait
5017				if (context.pageAllocated) {
5018					// ... but since we allocated a page and inserted it into
5019					// the top cache, remove and free it first. Otherwise we'd
5020					// have a page from a lower cache mapped while an upper
5021					// cache has a page that would shadow it.
5022					context.topCache->RemovePage(context.page);
5023					vm_page_free_etc(context.topCache, context.page,
5024						&context.reservation);
5025				} else
5026					DEBUG_PAGE_ACCESS_END(context.page);
5027
5028				context.UnlockAll();
5029				waiter.waitEntry.Wait();
5030				continue;
5031			}
5032
5033			// Note: The mapped page is a page of a lower cache. We are
5034			// guaranteed to have that cache locked, our new page is a copy of
5035			// that page, and the page is not busy. The logic for that guarantee
5036			// is as follows: Since the page is mapped, it must live in the top
5037			// cache (ruled out above) or any of its lower caches, and there is
5038			// (was before the new page was inserted) no other page in any
5039			// cache between the top cache and the page's cache (otherwise that
5040			// would be mapped instead). That in turn means that our algorithm
5041			// must have found it and therefore it cannot be busy either.
5042			DEBUG_PAGE_ACCESS_START(mappedPage);
5043			unmap_page(area, address);
5044			DEBUG_PAGE_ACCESS_END(mappedPage);
5045		}
5046
5047		if (mapPage) {
5048			if (map_page(area, context.page, address, newProtection,
5049					&context.reservation) != B_OK) {
5050				// Mapping can only fail when the page mapping object couldn't
5051				// be allocated. Save for the missing mapping, everything is
5052				// fine, though. If this was a regular page fault, we'll simply
5053				// leave and probably fault again. To make sure we'll have more
5054				// luck then, we ensure that the minimum object reserve is
5055				// available.
5056				DEBUG_PAGE_ACCESS_END(context.page);
5057
5058				context.UnlockAll();
5059
5060				if (object_cache_reserve(gPageMappingsObjectCache, 1, 0)
5061						!= B_OK) {
5062					// Apparently the situation is serious. Let's get ourselves
5063					// killed.
5064					status = B_NO_MEMORY;
5065				} else if (wirePage != NULL) {
5066					// The caller expects us to wire the page. Since
5067					// object_cache_reserve() succeeded, we should now be able
5068					// to allocate a mapping structure. Restart.
5069					continue;
5070				}
5071
5072				break;
5073			}
5074		} else if (context.page->State() == PAGE_STATE_INACTIVE)
5075			vm_page_set_state(context.page, PAGE_STATE_ACTIVE);
5076
5077		// also wire the page, if requested
5078		if (wirePage != NULL && status == B_OK) {
5079			increment_page_wired_count(context.page);
5080			*wirePage = context.page;
5081		}
5082
5083		DEBUG_PAGE_ACCESS_END(context.page);
5084
5085		break;
5086	}
5087
5088	return status;
5089}
5090
5091
5092status_t
5093vm_get_physical_page(phys_addr_t paddr, addr_t* _vaddr, void** _handle)
5094{
5095	return sPhysicalPageMapper->GetPage(paddr, _vaddr, _handle);
5096}
5097
5098status_t
5099vm_put_physical_page(addr_t vaddr, void* handle)
5100{
5101	return sPhysicalPageMapper->PutPage(vaddr, handle);
5102}
5103
5104
5105status_t
5106vm_get_physical_page_current_cpu(phys_addr_t paddr, addr_t* _vaddr,
5107	void** _handle)
5108{
5109	return sPhysicalPageMapper->GetPageCurrentCPU(paddr, _vaddr, _handle);
5110}
5111
5112status_t
5113vm_put_physical_page_current_cpu(addr_t vaddr, void* handle)
5114{
5115	return sPhysicalPageMapper->PutPageCurrentCPU(vaddr, handle);
5116}
5117
5118
5119status_t
5120vm_get_physical_page_debug(phys_addr_t paddr, addr_t* _vaddr, void** _handle)
5121{
5122	return sPhysicalPageMapper->GetPageDebug(paddr, _vaddr, _handle);
5123}
5124
5125status_t
5126vm_put_physical_page_debug(addr_t vaddr, void* handle)
5127{
5128	return sPhysicalPageMapper->PutPageDebug(vaddr, handle);
5129}
5130
5131
5132void
5133vm_get_info(system_info* info)
5134{
5135	swap_get_info(info);
5136
5137	MutexLocker locker(sAvailableMemoryLock);
5138	info->needed_memory = sNeededMemory;
5139	info->free_memory = sAvailableMemory;
5140}
5141
5142
5143uint32
5144vm_num_page_faults(void)
5145{
5146	return sPageFaults;
5147}
5148
5149
5150off_t
5151vm_available_memory(void)
5152{
5153	MutexLocker locker(sAvailableMemoryLock);
5154	return sAvailableMemory;
5155}
5156
5157
5158off_t
5159vm_available_not_needed_memory(void)
5160{
5161	MutexLocker locker(sAvailableMemoryLock);
5162	return sAvailableMemory - sNeededMemory;
5163}
5164
5165
5166/*!	Like vm_available_not_needed_memory(), but only for use in the kernel
5167	debugger.
5168*/
5169off_t
5170vm_available_not_needed_memory_debug(void)
5171{
5172	return sAvailableMemory - sNeededMemory;
5173}
5174
5175
5176size_t
5177vm_kernel_address_space_left(void)
5178{
5179	return VMAddressSpace::Kernel()->FreeSpace();
5180}
5181
5182
5183void
5184vm_unreserve_memory(size_t amount)
5185{
5186	mutex_lock(&sAvailableMemoryLock);
5187
5188	sAvailableMemory += amount;
5189
5190	mutex_unlock(&sAvailableMemoryLock);
5191}
5192
5193
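/*!	Tries to reserve \a amount bytes of memory while keeping the memory
	reserve for the given \a priority untouched. If not enough memory is
	available, the low resource manager is notified and the function waits up
	to \a timeout for memory to become available before giving up.
*/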
5194status_t
5195vm_try_reserve_memory(size_t amount, int priority, bigtime_t timeout)
5196{
5197	size_t reserve = kMemoryReserveForPriority[priority];
5198
5199	MutexLocker locker(sAvailableMemoryLock);
5200
5201	//dprintf("try to reserve %lu bytes, %Lu left\n", amount, sAvailableMemory);
5202
5203	if (sAvailableMemory >= (off_t)(amount + reserve)) {
5204		sAvailableMemory -= amount;
5205		return B_OK;
5206	}
5207
5208	if (amount >= (vm_page_num_pages() * B_PAGE_SIZE)) {
5209		// Do not wait for something that will never happen.
5210		return B_NO_MEMORY;
5211	}
5212
5213	if (timeout <= 0)
5214		return B_NO_MEMORY;
5215
5216	// turn timeout into an absolute timeout
5217	timeout += system_time();
5218
5219	// loop until we've got the memory or the timeout occurs
5220	do {
5221		sNeededMemory += amount;
5222
5223		// call the low resource manager
5224		locker.Unlock();
5225		low_resource(B_KERNEL_RESOURCE_MEMORY, sNeededMemory - sAvailableMemory,
5226			B_ABSOLUTE_TIMEOUT, timeout);
5227		locker.Lock();
5228
5229		sNeededMemory -= amount;
5230
5231		if (sAvailableMemory >= (off_t)(amount + reserve)) {
5232			sAvailableMemory -= amount;
5233			return B_OK;
5234		}
5235	} while (timeout > system_time());
5236
5237	return B_NO_MEMORY;
5238}
5239
5240
5241status_t
5242vm_set_area_memory_type(area_id id, phys_addr_t physicalBase, uint32 type)
5243{
5244	// NOTE: The caller is responsible for synchronizing calls to this function!
5245
5246	AddressSpaceReadLocker locker;
5247	VMArea* area;
5248	status_t status = locker.SetFromArea(id, area);
5249	if (status != B_OK)
5250		return status;
5251
5252	// nothing to do, if the type doesn't change
5253	uint32 oldType = area->MemoryType();
5254	if (type == oldType)
5255		return B_OK;
5256
5257	// set the memory type of the area and the mapped pages
5258	VMTranslationMap* map = area->address_space->TranslationMap();
5259	map->Lock();
5260	area->SetMemoryType(type);
5261	map->ProtectArea(area, area->protection);
5262	map->Unlock();
5263
5264	// set the physical memory type
5265	status_t error = arch_vm_set_memory_type(area, physicalBase, type);
5266	if (error != B_OK) {
5267		// reset the memory type of the area and the mapped pages
5268		map->Lock();
5269		area->SetMemoryType(oldType);
5270		map->ProtectArea(area, area->protection);
5271		map->Unlock();
5272		return error;
5273	}
5274
5275	return B_OK;
5276
5277}
5278
5279
5280/*!	This function enforces some protection properties:
5281	 - kernel areas must be W^X (after kernel startup)
5282	 - if B_WRITE_AREA is set, B_KERNEL_WRITE_AREA is set as well
5283	 - if B_READ_AREA has been set, B_KERNEL_READ_AREA is also set
5284*/
5285static void
5286fix_protection(uint32* protection)
5287{
5288	if ((*protection & B_KERNEL_EXECUTE_AREA) != 0
5289		&& ((*protection & B_KERNEL_WRITE_AREA) != 0
5290			|| (*protection & B_WRITE_AREA) != 0)
5291		&& !gKernelStartup)
5292		panic("kernel areas cannot be both writable and executable!");
5293
5294	if ((*protection & B_KERNEL_PROTECTION) == 0) {
5295		if ((*protection & B_WRITE_AREA) != 0)
5296			*protection |= B_KERNEL_WRITE_AREA;
5297		if ((*protection & B_READ_AREA) != 0)
5298			*protection |= B_KERNEL_READ_AREA;
5299	}
5300}
5301
5302
5303static void
5304fill_area_info(struct VMArea* area, area_info* info, size_t size)
5305{
5306	strlcpy(info->name, area->name, B_OS_NAME_LENGTH);
5307	info->area = area->id;
5308	info->address = (void*)area->Base();
5309	info->size = area->Size();
5310	info->protection = area->protection;
5311	info->lock = area->wiring;
5312	info->team = area->address_space->ID();
5313	info->copy_count = 0;
5314	info->in_count = 0;
5315	info->out_count = 0;
5316		// TODO: retrieve real values here!
5317
5318	VMCache* cache = vm_area_get_locked_cache(area);
5319
5320	// Note, this is a simplification; the cache could be larger than this area
5321	info->ram_size = cache->page_count * B_PAGE_SIZE;
5322
5323	vm_area_put_locked_cache(cache);
5324}
5325
5326
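/*!	Resizes the area with the given ID (and all other areas referring to the
	same cache) to \a newSize. Only areas backed by a RAM cache can be
	resized; when shrinking, the function waits for wired ranges in the
	removed part to become unwired first.
*/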
5327static status_t
5328vm_resize_area(area_id areaID, size_t newSize, bool kernel)
5329{
5330	// is newSize a multiple of B_PAGE_SIZE?
5331	if (newSize & (B_PAGE_SIZE - 1))
5332		return B_BAD_VALUE;
5333
5334	// lock all affected address spaces and the cache
5335	VMArea* area;
5336	VMCache* cache;
5337
5338	MultiAddressSpaceLocker locker;
5339	AreaCacheLocker cacheLocker;
5340
5341	status_t status;
5342	size_t oldSize;
5343	bool anyKernelArea;
5344	bool restart;
5345
5346	do {
5347		anyKernelArea = false;
5348		restart = false;
5349
5350		locker.Unset();
5351		status = locker.AddAreaCacheAndLock(areaID, true, true, area, &cache);
5352		if (status != B_OK)
5353			return status;
5354		cacheLocker.SetTo(cache, true);	// already locked
5355
5356		// enforce restrictions
5357		if (!kernel && (area->address_space == VMAddressSpace::Kernel()
5358				|| (area->protection & B_KERNEL_AREA) != 0)) {
5359			dprintf("vm_resize_area: team %" B_PRId32 " tried to "
5360				"resize kernel area %" B_PRId32 " (%s)\n",
5361				team_get_current_team_id(), areaID, area->name);
5362			return B_NOT_ALLOWED;
5363		}
5364		// TODO: Enforce all restrictions (team, etc.)!
5365
5366		oldSize = area->Size();
5367		if (newSize == oldSize)
5368			return B_OK;
5369
5370		if (cache->type != CACHE_TYPE_RAM)
5371			return B_NOT_ALLOWED;
5372
5373		if (oldSize < newSize) {
5374			// We need to check if all areas of this cache can be resized.
5375			for (VMArea* current = cache->areas; current != NULL;
5376					current = current->cache_next) {
5377				if (!current->address_space->CanResizeArea(current, newSize))
5378					return B_ERROR;
5379				anyKernelArea
5380					|= current->address_space == VMAddressSpace::Kernel();
5381			}
5382		} else {
5383			// We're shrinking the areas, so we must make sure the affected
5384			// ranges are not wired.
5385			for (VMArea* current = cache->areas; current != NULL;
5386					current = current->cache_next) {
5387				anyKernelArea
5388					|= current->address_space == VMAddressSpace::Kernel();
5389
5390				if (wait_if_area_range_is_wired(current,
5391						current->Base() + newSize, oldSize - newSize, &locker,
5392						&cacheLocker)) {
5393					restart = true;
5394					break;
5395				}
5396			}
5397		}
5398	} while (restart);
5399
5400	// Okay, looks good so far, so let's do it
5401
5402	int priority = kernel && anyKernelArea
5403		? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER;
5404	uint32 allocationFlags = kernel && anyKernelArea
5405		? HEAP_DONT_WAIT_FOR_MEMORY | HEAP_DONT_LOCK_KERNEL_SPACE : 0;
5406
5407	if (oldSize < newSize) {
5408		// Growing the cache can fail, so we do it first.
5409		status = cache->Resize(cache->virtual_base + newSize, priority);
5410		if (status != B_OK)
5411			return status;
5412	}
5413
5414	for (VMArea* current = cache->areas; current != NULL;
5415			current = current->cache_next) {
5416		status = current->address_space->ResizeArea(current, newSize,
5417			allocationFlags);
5418		if (status != B_OK)
5419			break;
5420
		// If the area has shrunk, we also need to unmap all pages beyond the
		// new size.
5423		if (newSize < oldSize) {
5424			VMCacheChainLocker cacheChainLocker(cache);
5425			cacheChainLocker.LockAllSourceCaches();
5426
5427			unmap_pages(current, current->Base() + newSize,
5428				oldSize - newSize);
5429
5430			cacheChainLocker.Unlock(cache);
5431		}
5432	}
5433
5434	if (status == B_OK) {
5435		// Shrink or grow individual page protections if in use.
5436		if (area->page_protections != NULL) {
5437			size_t bytes = area_page_protections_size(newSize);
5438			uint8* newProtections
5439				= (uint8*)realloc(area->page_protections, bytes);
5440			if (newProtections == NULL)
5441				status = B_NO_MEMORY;
5442			else {
5443				area->page_protections = newProtections;
5444
5445				if (oldSize < newSize) {
5446					// init the additional page protections to that of the area
5447					uint32 offset = area_page_protections_size(oldSize);
5448					uint32 areaProtection = area->protection
5449						& (B_READ_AREA | B_WRITE_AREA | B_EXECUTE_AREA);
5450					memset(area->page_protections + offset,
5451						areaProtection | (areaProtection << 4), bytes - offset);
					// Protections are stored as nibbles (4 bits per page). If
					// the old page count was odd, the upper nibble of the last
					// previously used byte belongs to the first added page and
					// must be initialized as well.
					if ((oldSize / B_PAGE_SIZE) % 2 != 0) {
						uint8& entry = area->page_protections[offset - 1];
						entry = (entry & 0x0f) | (areaProtection << 4);
					}
5456				}
5457			}
5458		}
5459	}
5460
5461	// shrinking the cache can't fail, so we do it now
5462	if (status == B_OK && newSize < oldSize)
5463		status = cache->Resize(cache->virtual_base + newSize, priority);
5464
5465	if (status != B_OK) {
5466		// Something failed -- resize the areas back to their original size.
5467		// This can fail, too, in which case we're seriously screwed.
5468		for (VMArea* current = cache->areas; current != NULL;
5469				current = current->cache_next) {
5470			if (current->address_space->ResizeArea(current, oldSize,
5471					allocationFlags) != B_OK) {
5472				panic("vm_resize_area(): Failed and not being able to restore "
5473					"original state.");
5474			}
5475		}
5476
5477		cache->Resize(cache->virtual_base + oldSize, priority);
5478	}
5479
5480	// TODO: we must honour the lock restrictions of this area
5481	return status;
5482}
5483
5484
5485status_t
5486vm_memset_physical(phys_addr_t address, int value, phys_size_t length)
5487{
5488	return sPhysicalPageMapper->MemsetPhysical(address, value, length);
5489}
5490
5491
5492status_t
5493vm_memcpy_from_physical(void* to, phys_addr_t from, size_t length, bool user)
5494{
5495	return sPhysicalPageMapper->MemcpyFromPhysical(to, from, length, user);
5496}
5497
5498
5499status_t
5500vm_memcpy_to_physical(phys_addr_t to, const void* _from, size_t length,
5501	bool user)
5502{
5503	return sPhysicalPageMapper->MemcpyToPhysical(to, _from, length, user);
5504}
5505
5506
5507void
5508vm_memcpy_physical_page(phys_addr_t to, phys_addr_t from)
5509{
5510	return sPhysicalPageMapper->MemcpyPhysicalPage(to, from);
5511}
5512
5513
5514/*!	Copies a range of memory directly from/to a page that might not be mapped
5515	at the moment.
5516
	For \a unsafeMemory the current mapping (if any) is ignored. The function
5518	walks through the respective area's cache chain to find the physical page
5519	and copies from/to it directly.
5520	The memory range starting at \a unsafeMemory with a length of \a size bytes
5521	must not cross a page boundary.
5522
5523	\param teamID The team ID identifying the address space \a unsafeMemory is
5524		to be interpreted in. Ignored, if \a unsafeMemory is a kernel address
5525		(the kernel address space is assumed in this case). If \c B_CURRENT_TEAM
5526		is passed, the address space of the thread returned by
5527		debug_get_debugged_thread() is used.
5528	\param unsafeMemory The start of the unsafe memory range to be copied
5529		from/to.
5530	\param buffer A safely accessible kernel buffer to be copied from/to.
5531	\param size The number of bytes to be copied.
5532	\param copyToUnsafe If \c true, memory is copied from \a buffer to
5533		\a unsafeMemory, the other way around otherwise.
5534*/
5535status_t
5536vm_debug_copy_page_memory(team_id teamID, void* unsafeMemory, void* buffer,
5537	size_t size, bool copyToUnsafe)
5538{
5539	if (size > B_PAGE_SIZE || ROUNDDOWN((addr_t)unsafeMemory, B_PAGE_SIZE)
5540			!= ROUNDDOWN((addr_t)unsafeMemory + size - 1, B_PAGE_SIZE)) {
5541		return B_BAD_VALUE;
5542	}
5543
5544	// get the address space for the debugged thread
5545	VMAddressSpace* addressSpace;
5546	if (IS_KERNEL_ADDRESS(unsafeMemory)) {
5547		addressSpace = VMAddressSpace::Kernel();
5548	} else if (teamID == B_CURRENT_TEAM) {
5549		Thread* thread = debug_get_debugged_thread();
5550		if (thread == NULL || thread->team == NULL)
5551			return B_BAD_ADDRESS;
5552
5553		addressSpace = thread->team->address_space;
5554	} else
5555		addressSpace = VMAddressSpace::DebugGet(teamID);
5556
5557	if (addressSpace == NULL)
5558		return B_BAD_ADDRESS;
5559
5560	// get the area
5561	VMArea* area = addressSpace->LookupArea((addr_t)unsafeMemory);
5562	if (area == NULL)
5563		return B_BAD_ADDRESS;
5564
5565	// search the page
5566	off_t cacheOffset = (addr_t)unsafeMemory - area->Base()
5567		+ area->cache_offset;
5568	VMCache* cache = area->cache;
5569	vm_page* page = NULL;
5570	while (cache != NULL) {
5571		page = cache->DebugLookupPage(cacheOffset);
5572		if (page != NULL)
5573			break;
5574
5575		// Page not found in this cache -- if it is paged out, we must not try
5576		// to get it from lower caches.
5577		if (cache->DebugHasPage(cacheOffset))
5578			break;
5579
5580		cache = cache->source;
5581	}
5582
5583	if (page == NULL)
5584		return B_UNSUPPORTED;
5585
5586	// copy from/to physical memory
5587	phys_addr_t physicalAddress = page->physical_page_number * B_PAGE_SIZE
5588		+ (addr_t)unsafeMemory % B_PAGE_SIZE;
5589
5590	if (copyToUnsafe) {
5591		if (page->Cache() != area->cache)
5592			return B_UNSUPPORTED;
5593
5594		return vm_memcpy_to_physical(physicalAddress, buffer, size, false);
5595	}
5596
5597	return vm_memcpy_from_physical(buffer, physicalAddress, size, false);
5598}
5599
5600
/*!	Validates that a memory range is either fully in kernel space, or fully in
	userspace.
*/
5603static inline bool
5604validate_memory_range(const void* addr, size_t size)
5605{
5606	addr_t address = (addr_t)addr;
5607
5608	// Check for overflows on all addresses.
5609	if ((address + size) < address)
5610		return false;
5611
5612	// Validate that the address range does not cross the kernel/user boundary.
5613	return IS_USER_ADDRESS(address) == IS_USER_ADDRESS(address + size - 1);
5614}
5615
5616
5617//	#pragma mark - kernel public API
5618
5619
5620status_t
5621user_memcpy(void* to, const void* from, size_t size)
5622{
5623	if (!validate_memory_range(to, size) || !validate_memory_range(from, size))
5624		return B_BAD_ADDRESS;
5625
5626	if (arch_cpu_user_memcpy(to, from, size) < B_OK)
5627		return B_BAD_ADDRESS;
5628
5629	return B_OK;
5630}
5631
5632
5633/*!	\brief Copies at most (\a size - 1) characters from the string in \a from to
5634	the string in \a to, NULL-terminating the result.
5635
5636	\param to Pointer to the destination C-string.
5637	\param from Pointer to the source C-string.
5638	\param size Size in bytes of the string buffer pointed to by \a to.
5639
	\return strlen(\a from), or a negative error code on failure.
5641*/
5642ssize_t
5643user_strlcpy(char* to, const char* from, size_t size)
5644{
5645	if (to == NULL && size != 0)
5646		return B_BAD_VALUE;
5647	if (from == NULL)
5648		return B_BAD_ADDRESS;
5649
	// Protect the source address from overflows: if (from + size) would wrap
	// around the top of the address space, clamp the size to the number of
	// bytes up to the wrap-around point.
5651	size_t maxSize = size;
5652	if ((addr_t)from + maxSize < (addr_t)from)
5653		maxSize -= (addr_t)from + maxSize;
5654	if (IS_USER_ADDRESS(from) && !IS_USER_ADDRESS((addr_t)from + maxSize))
5655		maxSize = USER_TOP - (addr_t)from;
5656
5657	if (!validate_memory_range(to, maxSize))
5658		return B_BAD_ADDRESS;
5659
5660	ssize_t result = arch_cpu_user_strlcpy(to, from, maxSize);
5661	if (result < 0)
5662		return result;
5663
5664	// If we hit the address overflow boundary, fail.
5665	if ((size_t)result >= maxSize && maxSize < size)
5666		return B_BAD_ADDRESS;
5667
5668	return result;
5669}
5670
5671
5672status_t
5673user_memset(void* s, char c, size_t count)
5674{
5675	if (!validate_memory_range(s, count))
5676		return B_BAD_ADDRESS;
5677
5678	if (arch_cpu_user_memset(s, c, count) < B_OK)
5679		return B_BAD_ADDRESS;
5680
5681	return B_OK;
5682}
5683
5684
5685/*!	Wires a single page at the given address.
5686
5687	\param team The team whose address space the address belongs to. Supports
5688		also \c B_CURRENT_TEAM. If the given address is a kernel address, the
5689		parameter is ignored.
	\param address The virtual address to wire down. Does not need to be page
		aligned.
5692	\param writable If \c true the page shall be writable.
5693	\param info On success the info is filled in, among other things
5694		containing the physical address the given virtual one translates to.
	\return \c B_OK if the page could be wired, another error code otherwise.
5696*/
5697status_t
5698vm_wire_page(team_id team, addr_t address, bool writable,
5699	VMPageWiringInfo* info)
5700{
5701	addr_t pageAddress = ROUNDDOWN((addr_t)address, B_PAGE_SIZE);
5702	info->range.SetTo(pageAddress, B_PAGE_SIZE, writable, false);
5703
5704	// compute the page protection that is required
5705	bool isUser = IS_USER_ADDRESS(address);
5706	uint32 requiredProtection = PAGE_PRESENT
5707		| B_KERNEL_READ_AREA | (isUser ? B_READ_AREA : 0);
5708	if (writable)
5709		requiredProtection |= B_KERNEL_WRITE_AREA | (isUser ? B_WRITE_AREA : 0);
5710
5711	// get and read lock the address space
5712	VMAddressSpace* addressSpace = NULL;
5713	if (isUser) {
5714		if (team == B_CURRENT_TEAM)
5715			addressSpace = VMAddressSpace::GetCurrent();
5716		else
5717			addressSpace = VMAddressSpace::Get(team);
5718	} else
5719		addressSpace = VMAddressSpace::GetKernel();
5720	if (addressSpace == NULL)
5721		return B_ERROR;
5722
5723	AddressSpaceReadLocker addressSpaceLocker(addressSpace, true);
5724
5725	VMTranslationMap* map = addressSpace->TranslationMap();
5726	status_t error = B_OK;
5727
5728	// get the area
5729	VMArea* area = addressSpace->LookupArea(pageAddress);
5730	if (area == NULL) {
5731		addressSpace->Put();
5732		return B_BAD_ADDRESS;
5733	}
5734
5735	// Lock the area's top cache. This is a requirement for VMArea::Wire().
5736	VMCacheChainLocker cacheChainLocker(vm_area_get_locked_cache(area));
5737
5738	// mark the area range wired
5739	area->Wire(&info->range);
5740
5741	// Lock the area's cache chain and the translation map. Needed to look
5742	// up the page and play with its wired count.
5743	cacheChainLocker.LockAllSourceCaches();
5744	map->Lock();
5745
5746	phys_addr_t physicalAddress;
5747	uint32 flags;
5748	vm_page* page;
5749	if (map->Query(pageAddress, &physicalAddress, &flags) == B_OK
5750		&& (flags & requiredProtection) == requiredProtection
5751		&& (page = vm_lookup_page(physicalAddress / B_PAGE_SIZE))
5752			!= NULL) {
5753		// Already mapped with the correct permissions -- just increment
5754		// the page's wired count.
5755		increment_page_wired_count(page);
5756
5757		map->Unlock();
5758		cacheChainLocker.Unlock();
5759		addressSpaceLocker.Unlock();
5760	} else {
5761		// Let vm_soft_fault() map the page for us, if possible. We need
5762		// to fully unlock to avoid deadlocks. Since we have already
5763		// wired the area itself, nothing disturbing will happen with it
5764		// in the meantime.
5765		map->Unlock();
5766		cacheChainLocker.Unlock();
5767		addressSpaceLocker.Unlock();
5768
5769		error = vm_soft_fault(addressSpace, pageAddress, writable, false,
5770			isUser, &page);
5771
5772		if (error != B_OK) {
5773			// The page could not be mapped -- clean up.
5774			VMCache* cache = vm_area_get_locked_cache(area);
5775			area->Unwire(&info->range);
5776			cache->ReleaseRefAndUnlock();
5777			addressSpace->Put();
5778			return error;
5779		}
5780	}
5781
5782	info->physicalAddress
5783		= (phys_addr_t)page->physical_page_number * B_PAGE_SIZE
5784			+ address % B_PAGE_SIZE;
5785	info->page = page;
5786
5787	return B_OK;
5788}
5789
5790
5791/*!	Unwires a single page previously wired via vm_wire_page().
5792
5793	\param info The same object passed to vm_wire_page() before.
5794*/
5795void
5796vm_unwire_page(VMPageWiringInfo* info)
5797{
5798	// lock the address space
5799	VMArea* area = info->range.area;
5800	AddressSpaceReadLocker addressSpaceLocker(area->address_space, false);
5801		// takes over our reference
5802
5803	// lock the top cache
5804	VMCache* cache = vm_area_get_locked_cache(area);
5805	VMCacheChainLocker cacheChainLocker(cache);
5806
5807	if (info->page->Cache() != cache) {
5808		// The page is not in the top cache, so we lock the whole cache chain
5809		// before touching the page's wired count.
5810		cacheChainLocker.LockAllSourceCaches();
5811	}
5812
5813	decrement_page_wired_count(info->page);
5814
5815	// remove the wired range from the range
5816	area->Unwire(&info->range);
5817
5818	cacheChainLocker.Unlock();
5819}
5820
5821
5822/*!	Wires down the given address range in the specified team's address space.
5823
5824	If successful the function
5825	- acquires a reference to the specified team's address space,
5826	- adds respective wired ranges to all areas that intersect with the given
5827	  address range,
5828	- makes sure all pages in the given address range are mapped with the
5829	  requested access permissions and increments their wired count.
5830
	It fails when \a team doesn't specify a valid address space, when any part
	of the specified address range is not covered by areas, when the concerned
	areas don't allow mapping with the requested permissions, or when mapping
	failed for another reason.

	When successful, the call must be balanced by an unlock_memory_etc() call
	with the exact same parameters.

	\param team Identifies the address space (via team ID). \c B_CURRENT_TEAM
		is supported.
5841	\param address The start of the address range to be wired.
5842	\param numBytes The size of the address range to be wired.
5843	\param flags Flags. Currently only \c B_READ_DEVICE is defined, which
5844		requests that the range must be wired writable ("read from device
5845		into memory").
5846	\return \c B_OK on success, another error code otherwise.
5847*/
5848status_t
5849lock_memory_etc(team_id team, void* address, size_t numBytes, uint32 flags)
5850{
5851	addr_t lockBaseAddress = ROUNDDOWN((addr_t)address, B_PAGE_SIZE);
5852	addr_t lockEndAddress = ROUNDUP((addr_t)address + numBytes, B_PAGE_SIZE);
5853
5854	// compute the page protection that is required
5855	bool isUser = IS_USER_ADDRESS(address);
5856	bool writable = (flags & B_READ_DEVICE) == 0;
5857	uint32 requiredProtection = PAGE_PRESENT
5858		| B_KERNEL_READ_AREA | (isUser ? B_READ_AREA : 0);
5859	if (writable)
5860		requiredProtection |= B_KERNEL_WRITE_AREA | (isUser ? B_WRITE_AREA : 0);
5861
5862	uint32 mallocFlags = isUser
5863		? 0 : HEAP_DONT_WAIT_FOR_MEMORY | HEAP_DONT_LOCK_KERNEL_SPACE;
5864
5865	// get and read lock the address space
5866	VMAddressSpace* addressSpace = NULL;
5867	if (isUser) {
5868		if (team == B_CURRENT_TEAM)
5869			addressSpace = VMAddressSpace::GetCurrent();
5870		else
5871			addressSpace = VMAddressSpace::Get(team);
5872	} else
5873		addressSpace = VMAddressSpace::GetKernel();
5874	if (addressSpace == NULL)
5875		return B_ERROR;
5876
5877	AddressSpaceReadLocker addressSpaceLocker(addressSpace, true);
5878		// We get a new address space reference here. The one we got above will
5879		// be freed by unlock_memory_etc().
5880
5881	VMTranslationMap* map = addressSpace->TranslationMap();
5882	status_t error = B_OK;
5883
5884	// iterate through all concerned areas
5885	addr_t nextAddress = lockBaseAddress;
5886	while (nextAddress != lockEndAddress) {
5887		// get the next area
5888		VMArea* area = addressSpace->LookupArea(nextAddress);
5889		if (area == NULL) {
5890			error = B_BAD_ADDRESS;
5891			break;
5892		}
5893
5894		addr_t areaStart = nextAddress;
5895		addr_t areaEnd = std::min(lockEndAddress, area->Base() + area->Size());
5896
5897		// allocate the wired range (do that before locking the cache to avoid
5898		// deadlocks)
5899		VMAreaWiredRange* range = new(malloc_flags(mallocFlags))
5900			VMAreaWiredRange(areaStart, areaEnd - areaStart, writable, true);
5901		if (range == NULL) {
5902			error = B_NO_MEMORY;
5903			break;
5904		}
5905
5906		// Lock the area's top cache. This is a requirement for VMArea::Wire().
5907		VMCacheChainLocker cacheChainLocker(vm_area_get_locked_cache(area));
5908
5909		// mark the area range wired
5910		area->Wire(range);
5911
5912		// Depending on the area cache type and the wiring, we may not need to
5913		// look at the individual pages.
5914		if (area->cache_type == CACHE_TYPE_NULL
5915			|| area->cache_type == CACHE_TYPE_DEVICE
5916			|| area->wiring == B_FULL_LOCK
5917			|| area->wiring == B_CONTIGUOUS) {
5918			nextAddress = areaEnd;
5919			continue;
5920		}
5921
5922		// Lock the area's cache chain and the translation map. Needed to look
5923		// up pages and play with their wired count.
5924		cacheChainLocker.LockAllSourceCaches();
5925		map->Lock();
5926
5927		// iterate through the pages and wire them
5928		for (; nextAddress != areaEnd; nextAddress += B_PAGE_SIZE) {
5929			phys_addr_t physicalAddress;
5930			uint32 flags;
5931
5932			vm_page* page;
5933			if (map->Query(nextAddress, &physicalAddress, &flags) == B_OK
5934				&& (flags & requiredProtection) == requiredProtection
5935				&& (page = vm_lookup_page(physicalAddress / B_PAGE_SIZE))
5936					!= NULL) {
5937				// Already mapped with the correct permissions -- just increment
5938				// the page's wired count.
5939				increment_page_wired_count(page);
5940			} else {
5941				// Let vm_soft_fault() map the page for us, if possible. We need
5942				// to fully unlock to avoid deadlocks. Since we have already
5943				// wired the area itself, nothing disturbing will happen with it
5944				// in the meantime.
5945				map->Unlock();
5946				cacheChainLocker.Unlock();
5947				addressSpaceLocker.Unlock();
5948
5949				error = vm_soft_fault(addressSpace, nextAddress, writable,
5950					false, isUser, &page);
5951
5952				addressSpaceLocker.Lock();
5953				cacheChainLocker.SetTo(vm_area_get_locked_cache(area));
5954				cacheChainLocker.LockAllSourceCaches();
5955				map->Lock();
5956			}
5957
5958			if (error != B_OK)
5959				break;
5960		}
5961
5962		map->Unlock();
5963
5964		if (error == B_OK) {
5965			cacheChainLocker.Unlock();
5966		} else {
5967			// An error occurred, so abort right here. If the current address
5968			// is the first in this area, unwire the area, since we won't get
5969			// to it when reverting what we've done so far.
5970			if (nextAddress == areaStart) {
5971				area->Unwire(range);
5972				cacheChainLocker.Unlock();
5973				range->~VMAreaWiredRange();
5974				free_etc(range, mallocFlags);
5975			} else
5976				cacheChainLocker.Unlock();
5977
5978			break;
5979		}
5980	}
5981
5982	if (error != B_OK) {
5983		// An error occurred, so unwire all that we've already wired. Note that
5984		// even if not a single page was wired, unlock_memory_etc() is called
5985		// to put the address space reference.
5986		addressSpaceLocker.Unlock();
5987		unlock_memory_etc(team, (void*)lockBaseAddress,
5988			nextAddress - lockBaseAddress, flags);
5989	}
5990
5991	return error;
5992}
5993
5994
5995status_t
5996lock_memory(void* address, size_t numBytes, uint32 flags)
5997{
5998	return lock_memory_etc(B_CURRENT_TEAM, address, numBytes, flags);
5999}
6000
6001
6002/*!	Unwires an address range previously wired with lock_memory_etc().
6003
6004	Note that a call to this function must balance a previous lock_memory_etc()
6005	call with exactly the same parameters.
6006*/
6007status_t
6008unlock_memory_etc(team_id team, void* address, size_t numBytes, uint32 flags)
6009{
6010	addr_t lockBaseAddress = ROUNDDOWN((addr_t)address, B_PAGE_SIZE);
6011	addr_t lockEndAddress = ROUNDUP((addr_t)address + numBytes, B_PAGE_SIZE);
6012
6013	// compute the page protection that is required
6014	bool isUser = IS_USER_ADDRESS(address);
6015	bool writable = (flags & B_READ_DEVICE) == 0;
6016	uint32 requiredProtection = PAGE_PRESENT
6017		| B_KERNEL_READ_AREA | (isUser ? B_READ_AREA : 0);
6018	if (writable)
6019		requiredProtection |= B_KERNEL_WRITE_AREA | (isUser ? B_WRITE_AREA : 0);
6020
6021	uint32 mallocFlags = isUser
6022		? 0 : HEAP_DONT_WAIT_FOR_MEMORY | HEAP_DONT_LOCK_KERNEL_SPACE;
6023
6024	// get and read lock the address space
6025	VMAddressSpace* addressSpace = NULL;
6026	if (isUser) {
6027		if (team == B_CURRENT_TEAM)
6028			addressSpace = VMAddressSpace::GetCurrent();
6029		else
6030			addressSpace = VMAddressSpace::Get(team);
6031	} else
6032		addressSpace = VMAddressSpace::GetKernel();
6033	if (addressSpace == NULL)
6034		return B_ERROR;
6035
6036	AddressSpaceReadLocker addressSpaceLocker(addressSpace, false);
6037		// Take over the address space reference. We don't unlock until we're
6038		// done.
6039
6040	VMTranslationMap* map = addressSpace->TranslationMap();
6041	status_t error = B_OK;
6042
6043	// iterate through all concerned areas
6044	addr_t nextAddress = lockBaseAddress;
6045	while (nextAddress != lockEndAddress) {
6046		// get the next area
6047		VMArea* area = addressSpace->LookupArea(nextAddress);
6048		if (area == NULL) {
6049			error = B_BAD_ADDRESS;
6050			break;
6051		}
6052
6053		addr_t areaStart = nextAddress;
6054		addr_t areaEnd = std::min(lockEndAddress, area->Base() + area->Size());
6055
6056		// Lock the area's top cache. This is a requirement for
6057		// VMArea::Unwire().
6058		VMCacheChainLocker cacheChainLocker(vm_area_get_locked_cache(area));
6059
6060		// Depending on the area cache type and the wiring, we may not need to
6061		// look at the individual pages.
6062		if (area->cache_type == CACHE_TYPE_NULL
6063			|| area->cache_type == CACHE_TYPE_DEVICE
6064			|| area->wiring == B_FULL_LOCK
6065			|| area->wiring == B_CONTIGUOUS) {
6066			// unwire the range (to avoid deadlocks we delete the range after
6067			// unlocking the cache)
6068			nextAddress = areaEnd;
6069			VMAreaWiredRange* range = area->Unwire(areaStart,
6070				areaEnd - areaStart, writable);
6071			cacheChainLocker.Unlock();
6072			if (range != NULL) {
6073				range->~VMAreaWiredRange();
6074				free_etc(range, mallocFlags);
6075			}
6076			continue;
6077		}
6078
6079		// Lock the area's cache chain and the translation map. Needed to look
6080		// up pages and play with their wired count.
6081		cacheChainLocker.LockAllSourceCaches();
6082		map->Lock();
6083
6084		// iterate through the pages and unwire them
6085		for (; nextAddress != areaEnd; nextAddress += B_PAGE_SIZE) {
6086			phys_addr_t physicalAddress;
6087			uint32 flags;
6088
6089			vm_page* page;
6090			if (map->Query(nextAddress, &physicalAddress, &flags) == B_OK
6091				&& (flags & PAGE_PRESENT) != 0
6092				&& (page = vm_lookup_page(physicalAddress / B_PAGE_SIZE))
6093					!= NULL) {
				// The page is still mapped -- just decrement its wired count.
				decrement_page_wired_count(page);
6097			} else {
6098				panic("unlock_memory_etc(): Failed to unwire page: address "
6099					"space %p, address: %#" B_PRIxADDR, addressSpace,
6100					nextAddress);
6101				error = B_BAD_VALUE;
6102				break;
6103			}
6104		}
6105
6106		map->Unlock();
6107
6108		// All pages are unwired. Remove the area's wired range as well (to
6109		// avoid deadlocks we delete the range after unlocking the cache).
6110		VMAreaWiredRange* range = area->Unwire(areaStart,
6111			areaEnd - areaStart, writable);
6112
6113		cacheChainLocker.Unlock();
6114
6115		if (range != NULL) {
6116			range->~VMAreaWiredRange();
6117			free_etc(range, mallocFlags);
6118		}
6119
6120		if (error != B_OK)
6121			break;
6122	}
6123
6124	// get rid of the address space reference lock_memory_etc() acquired
6125	addressSpace->Put();
6126
6127	return error;
6128}
6129
6130
6131status_t
6132unlock_memory(void* address, size_t numBytes, uint32 flags)
6133{
6134	return unlock_memory_etc(B_CURRENT_TEAM, address, numBytes, flags);
6135}
6136
6137
/*!	Similar to get_memory_map(), but also allows specifying the address space
	for the memory in question and has saner semantics.
6140	Returns \c B_OK when the complete range could be translated or
6141	\c B_BUFFER_OVERFLOW, if the provided array wasn't big enough. In either
6142	case the actual number of entries is written to \c *_numEntries. Any other
6143	error case indicates complete failure; \c *_numEntries will be set to \c 0
6144	in this case.
6145*/
6146status_t
6147get_memory_map_etc(team_id team, const void* address, size_t numBytes,
6148	physical_entry* table, uint32* _numEntries)
6149{
6150	uint32 numEntries = *_numEntries;
6151	*_numEntries = 0;
6152
6153	VMAddressSpace* addressSpace;
6154	addr_t virtualAddress = (addr_t)address;
6155	addr_t pageOffset = virtualAddress & (B_PAGE_SIZE - 1);
6156	phys_addr_t physicalAddress;
6157	status_t status = B_OK;
6158	int32 index = -1;
6159	addr_t offset = 0;
6160	bool interrupts = are_interrupts_enabled();
6161
6162	TRACE(("get_memory_map_etc(%" B_PRId32 ", %p, %lu bytes, %" B_PRIu32 " "
6163		"entries)\n", team, address, numBytes, numEntries));
6164
6165	if (numEntries == 0 || numBytes == 0)
6166		return B_BAD_VALUE;
6167
6168	// in which address space is the address to be found?
6169	if (IS_USER_ADDRESS(virtualAddress)) {
6170		if (team == B_CURRENT_TEAM)
6171			addressSpace = VMAddressSpace::GetCurrent();
6172		else
6173			addressSpace = VMAddressSpace::Get(team);
6174	} else
6175		addressSpace = VMAddressSpace::GetKernel();
6176
6177	if (addressSpace == NULL)
6178		return B_ERROR;
6179
6180	VMTranslationMap* map = addressSpace->TranslationMap();
6181
6182	if (interrupts)
6183		map->Lock();
6184
6185	while (offset < numBytes) {
6186		addr_t bytes = min_c(numBytes - offset, B_PAGE_SIZE);
6187		uint32 flags;
6188
6189		if (interrupts) {
6190			status = map->Query((addr_t)address + offset, &physicalAddress,
6191				&flags);
6192		} else {
6193			status = map->QueryInterrupt((addr_t)address + offset,
6194				&physicalAddress, &flags);
6195		}
6196		if (status < B_OK)
6197			break;
6198		if ((flags & PAGE_PRESENT) == 0) {
6199			panic("get_memory_map() called on unmapped memory!");
6200			return B_BAD_ADDRESS;
6201		}
6202
6203		if (index < 0 && pageOffset > 0) {
6204			physicalAddress += pageOffset;
6205			if (bytes > B_PAGE_SIZE - pageOffset)
6206				bytes = B_PAGE_SIZE - pageOffset;
6207		}
6208
6209		// need to switch to the next physical_entry?
6210		if (index < 0 || table[index].address
6211				!= physicalAddress - table[index].size) {
6212			if ((uint32)++index + 1 > numEntries) {
				// table too small
6214				break;
6215			}
6216			table[index].address = physicalAddress;
6217			table[index].size = bytes;
6218		} else {
			// the page fits into the current entry
6220			table[index].size += bytes;
6221		}
6222
6223		offset += bytes;
6224	}
6225
6226	if (interrupts)
6227		map->Unlock();
6228
6229	if (status != B_OK)
6230		return status;
6231
6232	if ((uint32)index + 1 > numEntries) {
6233		*_numEntries = index;
6234		return B_BUFFER_OVERFLOW;
6235	}
6236
6237	*_numEntries = index + 1;
6238	return B_OK;
6239}
6240
6241
6242/*!	According to the BeBook, this function should always succeed.
6243	This is no longer the case.
6244*/
6245extern "C" int32
6246__get_memory_map_haiku(const void* address, size_t numBytes,
6247	physical_entry* table, int32 numEntries)
6248{
6249	uint32 entriesRead = numEntries;
6250	status_t error = get_memory_map_etc(B_CURRENT_TEAM, address, numBytes,
6251		table, &entriesRead);
6252	if (error != B_OK)
6253		return error;
6254
6255	// close the entry list
6256
6257	// if it's only one entry, we will silently accept the missing ending
6258	if (numEntries == 1)
6259		return B_OK;
6260
6261	if (entriesRead + 1 > (uint32)numEntries)
6262		return B_BUFFER_OVERFLOW;
6263
6264	table[entriesRead].address = 0;
6265	table[entriesRead].size = 0;
6266
6267	return B_OK;
6268}
6269
6270
6271area_id
6272area_for(void* address)
6273{
6274	return vm_area_for((addr_t)address, true);
6275}
6276
6277
6278area_id
6279find_area(const char* name)
6280{
6281	return VMAreas::Find(name);
6282}
6283
6284
6285status_t
6286_get_area_info(area_id id, area_info* info, size_t size)
6287{
6288	if (size != sizeof(area_info) || info == NULL)
6289		return B_BAD_VALUE;
6290
6291	AddressSpaceReadLocker locker;
6292	VMArea* area;
6293	status_t status = locker.SetFromArea(id, area);
6294	if (status != B_OK)
6295		return status;
6296
6297	fill_area_info(area, info, size);
6298	return B_OK;
6299}
6300
6301
6302status_t
6303_get_next_area_info(team_id team, ssize_t* cookie, area_info* info, size_t size)
6304{
6305	addr_t nextBase = *(addr_t*)cookie;
6306
6307	// we're already through the list
6308	if (nextBase == (addr_t)-1)
6309		return B_ENTRY_NOT_FOUND;
6310
6311	if (team == B_CURRENT_TEAM)
6312		team = team_get_current_team_id();
6313
6314	AddressSpaceReadLocker locker(team);
6315	if (!locker.IsLocked())
6316		return B_BAD_TEAM_ID;
6317
6318	VMArea* area = locker.AddressSpace()->FindClosestArea(nextBase, false);
6319	if (area == NULL) {
6320		nextBase = (addr_t)-1;
6321		return B_ENTRY_NOT_FOUND;
6322	}
6323
6324	fill_area_info(area, info, size);
6325	*cookie = (ssize_t)(area->Base() + 1);
6326
6327	return B_OK;
6328}
6329
6330
6331status_t
6332set_area_protection(area_id area, uint32 newProtection)
6333{
6334	return vm_set_area_protection(VMAddressSpace::KernelID(), area,
6335		newProtection, true);
6336}
6337
6338
6339status_t
6340resize_area(area_id areaID, size_t newSize)
6341{
6342	return vm_resize_area(areaID, newSize, true);
6343}
6344
6345
6346/*!	Transfers the specified area to a new team. The caller must be the owner
6347	of the area.
6348*/
6349area_id
6350transfer_area(area_id id, void** _address, uint32 addressSpec, team_id target,
6351	bool kernel)
6352{
6353	area_info info;
6354	status_t status = get_area_info(id, &info);
6355	if (status != B_OK)
6356		return status;
6357
6358	if (info.team != thread_get_current_thread()->team->id)
6359		return B_PERMISSION_DENIED;
6360
6361	// We need to mark the area cloneable so the following operations work.
6362	status = set_area_protection(id, info.protection | B_CLONEABLE_AREA);
6363	if (status != B_OK)
6364		return status;
6365
6366	area_id clonedArea = vm_clone_area(target, info.name, _address,
6367		addressSpec, info.protection, REGION_NO_PRIVATE_MAP, id, kernel);
6368	if (clonedArea < 0)
6369		return clonedArea;
6370
6371	status = vm_delete_area(info.team, id, kernel);
6372	if (status != B_OK) {
6373		vm_delete_area(target, clonedArea, kernel);
6374		return status;
6375	}
6376
6377	// Now we can reset the protection to whatever it was before.
6378	set_area_protection(clonedArea, info.protection);
6379
6380	// TODO: The clonedArea is B_SHARED_AREA, which is not really desired.
6381
6382	return clonedArea;
6383}
6384
6385
6386extern "C" area_id
6387__map_physical_memory_haiku(const char* name, phys_addr_t physicalAddress,
6388	size_t numBytes, uint32 addressSpec, uint32 protection,
6389	void** _virtualAddress)
6390{
6391	if (!arch_vm_supports_protection(protection))
6392		return B_NOT_SUPPORTED;
6393
6394	fix_protection(&protection);
6395
6396	return vm_map_physical_memory(VMAddressSpace::KernelID(), name,
6397		_virtualAddress, addressSpec, numBytes, protection, physicalAddress,
6398		false);
6399}
6400
6401
6402area_id
6403clone_area(const char* name, void** _address, uint32 addressSpec,
6404	uint32 protection, area_id source)
6405{
6406	if ((protection & B_KERNEL_PROTECTION) == 0)
6407		protection |= B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA;
6408
6409	return vm_clone_area(VMAddressSpace::KernelID(), name, _address,
6410		addressSpec, protection, REGION_NO_PRIVATE_MAP, source, true);
6411}
6412
6413
6414area_id
6415create_area_etc(team_id team, const char* name, size_t size, uint32 lock,
6416	uint32 protection, uint32 flags, uint32 guardSize,
6417	const virtual_address_restrictions* virtualAddressRestrictions,
6418	const physical_address_restrictions* physicalAddressRestrictions,
6419	void** _address)
6420{
6421	fix_protection(&protection);
6422
6423	return vm_create_anonymous_area(team, name, size, lock, protection, flags,
6424		guardSize, virtualAddressRestrictions, physicalAddressRestrictions,
6425		true, _address);
6426}
6427
6428
6429extern "C" area_id
6430__create_area_haiku(const char* name, void** _address, uint32 addressSpec,
6431	size_t size, uint32 lock, uint32 protection)
6432{
6433	fix_protection(&protection);
6434
6435	virtual_address_restrictions virtualRestrictions = {};
6436	virtualRestrictions.address = *_address;
6437	virtualRestrictions.address_specification = addressSpec;
6438	physical_address_restrictions physicalRestrictions = {};
6439	return vm_create_anonymous_area(VMAddressSpace::KernelID(), name, size,
6440		lock, protection, 0, 0, &virtualRestrictions, &physicalRestrictions,
6441		true, _address);
6442}
6443
6444
6445status_t
6446delete_area(area_id area)
6447{
6448	return vm_delete_area(VMAddressSpace::KernelID(), area, true);
6449}
6450
6451
6452//	#pragma mark - Userland syscalls
6453
6454
6455status_t
6456_user_reserve_address_range(addr_t* userAddress, uint32 addressSpec,
6457	addr_t size)
6458{
6459	// filter out some unavailable values (for userland)
6460	switch (addressSpec) {
6461		case B_ANY_KERNEL_ADDRESS:
6462		case B_ANY_KERNEL_BLOCK_ADDRESS:
6463			return B_BAD_VALUE;
6464	}
6465
6466	addr_t address;
6467
6468	if (!IS_USER_ADDRESS(userAddress)
6469		|| user_memcpy(&address, userAddress, sizeof(address)) != B_OK)
6470		return B_BAD_ADDRESS;
6471
6472	status_t status = vm_reserve_address_range(
6473		VMAddressSpace::CurrentID(), (void**)&address, addressSpec, size,
6474		RESERVED_AVOID_BASE);
6475	if (status != B_OK)
6476		return status;
6477
6478	if (user_memcpy(userAddress, &address, sizeof(address)) != B_OK) {
6479		vm_unreserve_address_range(VMAddressSpace::CurrentID(),
6480			(void*)address, size);
6481		return B_BAD_ADDRESS;
6482	}
6483
6484	return B_OK;
6485}
6486
6487
6488status_t
6489_user_unreserve_address_range(addr_t address, addr_t size)
6490{
6491	return vm_unreserve_address_range(VMAddressSpace::CurrentID(),
6492		(void*)address, size);
6493}
6494
6495
6496area_id
6497_user_area_for(void* address)
6498{
6499	return vm_area_for((addr_t)address, false);
6500}
6501
6502
6503area_id
6504_user_find_area(const char* userName)
6505{
6506	char name[B_OS_NAME_LENGTH];
6507
6508	if (!IS_USER_ADDRESS(userName)
6509		|| user_strlcpy(name, userName, B_OS_NAME_LENGTH) < B_OK)
6510		return B_BAD_ADDRESS;
6511
6512	return find_area(name);
6513}
6514
6515
6516status_t
6517_user_get_area_info(area_id area, area_info* userInfo)
6518{
6519	if (!IS_USER_ADDRESS(userInfo))
6520		return B_BAD_ADDRESS;
6521
6522	area_info info;
6523	status_t status = get_area_info(area, &info);
6524	if (status < B_OK)
6525		return status;
6526
6527	// TODO: do we want to prevent userland from seeing kernel protections?
6528	//info.protection &= B_USER_PROTECTION;
6529
6530	if (user_memcpy(userInfo, &info, sizeof(area_info)) < B_OK)
6531		return B_BAD_ADDRESS;
6532
6533	return status;
6534}
6535
6536
6537status_t
6538_user_get_next_area_info(team_id team, ssize_t* userCookie, area_info* userInfo)
6539{
6540	ssize_t cookie;
6541
6542	if (!IS_USER_ADDRESS(userCookie)
6543		|| !IS_USER_ADDRESS(userInfo)
6544		|| user_memcpy(&cookie, userCookie, sizeof(ssize_t)) < B_OK)
6545		return B_BAD_ADDRESS;
6546
6547	area_info info;
6548	status_t status = _get_next_area_info(team, &cookie, &info,
6549		sizeof(area_info));
6550	if (status != B_OK)
6551		return status;
6552
6553	//info.protection &= B_USER_PROTECTION;
6554
6555	if (user_memcpy(userCookie, &cookie, sizeof(ssize_t)) < B_OK
6556		|| user_memcpy(userInfo, &info, sizeof(area_info)) < B_OK)
6557		return B_BAD_ADDRESS;
6558
6559	return status;
6560}
6561
6562
6563status_t
6564_user_set_area_protection(area_id area, uint32 newProtection)
6565{
6566	if ((newProtection & ~(B_USER_PROTECTION | B_CLONEABLE_AREA)) != 0)
6567		return B_BAD_VALUE;
6568
6569	return vm_set_area_protection(VMAddressSpace::CurrentID(), area,
6570		newProtection, false);
6571}
6572
6573
6574status_t
6575_user_resize_area(area_id area, size_t newSize)
6576{
6577	// TODO: Since we restrict deleting of areas to those owned by the team,
6578	// we should also do that for resizing (check other functions, too).
6579	return vm_resize_area(area, newSize, false);
6580}
6581
6582
6583area_id
6584_user_transfer_area(area_id area, void** userAddress, uint32 addressSpec,
6585	team_id target)
6586{
6587	// filter out some unavailable values (for userland)
6588	switch (addressSpec) {
6589		case B_ANY_KERNEL_ADDRESS:
6590		case B_ANY_KERNEL_BLOCK_ADDRESS:
6591			return B_BAD_VALUE;
6592	}
6593
6594	void* address;
6595	if (!IS_USER_ADDRESS(userAddress)
6596		|| user_memcpy(&address, userAddress, sizeof(address)) < B_OK)
6597		return B_BAD_ADDRESS;
6598
6599	area_id newArea = transfer_area(area, &address, addressSpec, target, false);
6600	if (newArea < B_OK)
6601		return newArea;
6602
6603	if (user_memcpy(userAddress, &address, sizeof(address)) < B_OK)
6604		return B_BAD_ADDRESS;
6605
6606	return newArea;
6607}
6608
6609
6610area_id
6611_user_clone_area(const char* userName, void** userAddress, uint32 addressSpec,
6612	uint32 protection, area_id sourceArea)
6613{
6614	char name[B_OS_NAME_LENGTH];
6615	void* address;
6616
6617	// filter out some unavailable values (for userland)
6618	switch (addressSpec) {
6619		case B_ANY_KERNEL_ADDRESS:
6620		case B_ANY_KERNEL_BLOCK_ADDRESS:
6621			return B_BAD_VALUE;
6622	}
6623	if ((protection & ~B_USER_AREA_FLAGS) != 0)
6624		return B_BAD_VALUE;
6625
6626	if (!IS_USER_ADDRESS(userName)
6627		|| !IS_USER_ADDRESS(userAddress)
6628		|| user_strlcpy(name, userName, sizeof(name)) < B_OK
6629		|| user_memcpy(&address, userAddress, sizeof(address)) < B_OK)
6630		return B_BAD_ADDRESS;
6631
6632	fix_protection(&protection);
6633
6634	area_id clonedArea = vm_clone_area(VMAddressSpace::CurrentID(), name,
6635		&address, addressSpec, protection, REGION_NO_PRIVATE_MAP, sourceArea,
6636		false);
6637	if (clonedArea < B_OK)
6638		return clonedArea;
6639
6640	if (user_memcpy(userAddress, &address, sizeof(address)) < B_OK) {
6641		delete_area(clonedArea);
6642		return B_BAD_ADDRESS;
6643	}
6644
6645	return clonedArea;
6646}
6647
6648
6649area_id
6650_user_create_area(const char* userName, void** userAddress, uint32 addressSpec,
6651	size_t size, uint32 lock, uint32 protection)
6652{
6653	char name[B_OS_NAME_LENGTH];
6654	void* address;
6655
6656	// filter out some unavailable values (for userland)
6657	switch (addressSpec) {
6658		case B_ANY_KERNEL_ADDRESS:
6659		case B_ANY_KERNEL_BLOCK_ADDRESS:
6660			return B_BAD_VALUE;
6661	}
6662	if ((protection & ~B_USER_AREA_FLAGS) != 0)
6663		return B_BAD_VALUE;
6664
6665	if (!IS_USER_ADDRESS(userName)
6666		|| !IS_USER_ADDRESS(userAddress)
6667		|| user_strlcpy(name, userName, sizeof(name)) < B_OK
6668		|| user_memcpy(&address, userAddress, sizeof(address)) < B_OK)
6669		return B_BAD_ADDRESS;
6670
6671	if (addressSpec == B_EXACT_ADDRESS
6672		&& IS_KERNEL_ADDRESS(address))
6673		return B_BAD_VALUE;
6674
6675	if (addressSpec == B_ANY_ADDRESS)
6676		addressSpec = B_RANDOMIZED_ANY_ADDRESS;
6677	if (addressSpec == B_BASE_ADDRESS)
6678		addressSpec = B_RANDOMIZED_BASE_ADDRESS;
6679
6680	fix_protection(&protection);
6681
6682	virtual_address_restrictions virtualRestrictions = {};
6683	virtualRestrictions.address = address;
6684	virtualRestrictions.address_specification = addressSpec;
6685	physical_address_restrictions physicalRestrictions = {};
6686	area_id area = vm_create_anonymous_area(VMAddressSpace::CurrentID(), name,
6687		size, lock, protection, 0, 0, &virtualRestrictions,
6688		&physicalRestrictions, false, &address);
6689
6690	if (area >= B_OK
6691		&& user_memcpy(userAddress, &address, sizeof(address)) < B_OK) {
6692		delete_area(area);
6693		return B_BAD_ADDRESS;
6694	}
6695
6696	return area;
6697}
6698
6699
6700status_t
6701_user_delete_area(area_id area)
6702{
6703	// Unlike the BeOS implementation, you can now only delete areas
6704	// that you have created yourself from userland.
6705	// The documentation to delete_area() explicitly states that this
6706	// will be restricted in the future, and so it will.
6707	return vm_delete_area(VMAddressSpace::CurrentID(), area, false);
6708}
6709
6710
6711// TODO: create a BeOS style call for this!
6712
6713area_id
6714_user_map_file(const char* userName, void** userAddress, uint32 addressSpec,
6715	size_t size, uint32 protection, uint32 mapping, bool unmapAddressRange,
6716	int fd, off_t offset)
6717{
6718	char name[B_OS_NAME_LENGTH];
6719	void* address;
6720	area_id area;
6721
6722	if ((protection & ~B_USER_AREA_FLAGS) != 0)
6723		return B_BAD_VALUE;
6724
6725	fix_protection(&protection);
6726
6727	if (!IS_USER_ADDRESS(userName) || !IS_USER_ADDRESS(userAddress)
6728		|| user_strlcpy(name, userName, B_OS_NAME_LENGTH) < B_OK
6729		|| user_memcpy(&address, userAddress, sizeof(address)) < B_OK)
6730		return B_BAD_ADDRESS;
6731
6732	if (addressSpec == B_EXACT_ADDRESS) {
6733		if ((addr_t)address + size < (addr_t)address
6734				|| (addr_t)address % B_PAGE_SIZE != 0) {
6735			return B_BAD_VALUE;
6736		}
6737		if (!IS_USER_ADDRESS(address)
6738				|| !IS_USER_ADDRESS((addr_t)address + size - 1)) {
6739			return B_BAD_ADDRESS;
6740		}
6741	}
6742
6743	area = _vm_map_file(VMAddressSpace::CurrentID(), name, &address,
6744		addressSpec, size, protection, mapping, unmapAddressRange, fd, offset,
6745		false);
6746	if (area < B_OK)
6747		return area;
6748
6749	if (user_memcpy(userAddress, &address, sizeof(address)) < B_OK)
6750		return B_BAD_ADDRESS;
6751
6752	return area;
6753}
6754
6755
6756status_t
6757_user_unmap_memory(void* _address, size_t size)
6758{
6759	addr_t address = (addr_t)_address;
6760
6761	// check params
6762	if (size == 0 || (addr_t)address + size < (addr_t)address
6763		|| (addr_t)address % B_PAGE_SIZE != 0) {
6764		return B_BAD_VALUE;
6765	}
6766
6767	if (!IS_USER_ADDRESS(address)
6768		|| !IS_USER_ADDRESS((addr_t)address + size - 1)) {
6769		return B_BAD_ADDRESS;
6770	}
6771
6772	// Write lock the address space and ensure the address range is not wired.
6773	AddressSpaceWriteLocker locker;
6774	do {
6775		status_t status = locker.SetTo(team_get_current_team_id());
6776		if (status != B_OK)
6777			return status;
6778	} while (wait_if_address_range_is_wired(locker.AddressSpace(), address,
6779			size, &locker));
6780
6781	// unmap
6782	return unmap_address_range(locker.AddressSpace(), address, size, false);
6783}
6784
6785
6786status_t
6787_user_set_memory_protection(void* _address, size_t size, uint32 protection)
6788{
6789	// check address range
6790	addr_t address = (addr_t)_address;
6791	size = PAGE_ALIGN(size);
6792
6793	if ((address % B_PAGE_SIZE) != 0)
6794		return B_BAD_VALUE;
6795	if (!is_user_address_range(_address, size)) {
6796		// weird error code required by POSIX
6797		return ENOMEM;
6798	}
6799
6800	// extend and check protection
6801	if ((protection & ~B_USER_PROTECTION) != 0)
6802		return B_BAD_VALUE;
6803
6804	fix_protection(&protection);
6805
6806	// We need to write lock the address space, since we're going to play with
6807	// the areas. Also make sure that none of the areas is wired and that we're
6808	// actually allowed to change the protection.
6809	AddressSpaceWriteLocker locker;
6810
6811	bool restart;
6812	do {
6813		restart = false;
6814
6815		status_t status = locker.SetTo(team_get_current_team_id());
6816		if (status != B_OK)
6817			return status;
6818
6819		// First round: Check whether the whole range is covered by areas and we
6820		// are allowed to modify them.
6821		addr_t currentAddress = address;
6822		size_t sizeLeft = size;
6823		while (sizeLeft > 0) {
6824			VMArea* area = locker.AddressSpace()->LookupArea(currentAddress);
6825			if (area == NULL)
6826				return B_NO_MEMORY;
6827
6828			if ((area->protection & B_KERNEL_AREA) != 0)
6829				return B_NOT_ALLOWED;
6830			if (area->protection_max != 0
6831				&& (protection & area->protection_max) != (protection & B_USER_PROTECTION)) {
6832				return B_NOT_ALLOWED;
6833			}
6834
6835			addr_t offset = currentAddress - area->Base();
6836			size_t rangeSize = min_c(area->Size() - offset, sizeLeft);
6837
6838			AreaCacheLocker cacheLocker(area);
6839
6840			if (wait_if_area_range_is_wired(area, currentAddress, rangeSize,
6841					&locker, &cacheLocker)) {
6842				restart = true;
6843				break;
6844			}
6845
6846			cacheLocker.Unlock();
6847
6848			currentAddress += rangeSize;
6849			sizeLeft -= rangeSize;
6850		}
6851	} while (restart);
6852
6853	// Second round: If the protections differ from that of the area, create a
6854	// page protection array and re-map mapped pages.
6855	VMTranslationMap* map = locker.AddressSpace()->TranslationMap();
6856	addr_t currentAddress = address;
6857	size_t sizeLeft = size;
6858	while (sizeLeft > 0) {
6859		VMArea* area = locker.AddressSpace()->LookupArea(currentAddress);
6860		if (area == NULL)
6861			return B_NO_MEMORY;
6862
6863		addr_t offset = currentAddress - area->Base();
6864		size_t rangeSize = min_c(area->Size() - offset, sizeLeft);
6865
6866		currentAddress += rangeSize;
6867		sizeLeft -= rangeSize;
6868
6869		if (area->page_protections == NULL) {
6870			if (area->protection == protection)
6871				continue;
6872			if (offset == 0 && rangeSize == area->Size()) {
6873				// The whole area is covered: let set_area_protection handle it.
6874				status_t status = vm_set_area_protection(area->address_space->ID(),
6875					area->id, protection, false);
6876				if (status != B_OK)
6877					return status;
6878				continue;
6879			}
6880
6881			status_t status = allocate_area_page_protections(area);
6882			if (status != B_OK)
6883				return status;
6884		}
6885
6886		// We need to lock the complete cache chain, since we potentially unmap
6887		// pages of lower caches.
6888		VMCache* topCache = vm_area_get_locked_cache(area);
6889		VMCacheChainLocker cacheChainLocker(topCache);
6890		cacheChainLocker.LockAllSourceCaches();
6891
		// Adjust the committed size, if necessary: in a temporary cache with a
		// source, pages that become writable will need a private copy on write
		// and must be accounted for in the commitment, while pages that become
		// read-only no longer need to be.
6893		if (topCache->source != NULL && topCache->temporary) {
6894			const bool becomesWritable = (protection & B_WRITE_AREA) != 0;
6895			ssize_t commitmentChange = 0;
6896			for (addr_t pageAddress = area->Base() + offset;
6897					pageAddress < currentAddress; pageAddress += B_PAGE_SIZE) {
6898				if (topCache->LookupPage(pageAddress) != NULL) {
6899					// This page should already be accounted for in the commitment.
6900					continue;
6901				}
6902
6903				const bool isWritable
6904					= (get_area_page_protection(area, pageAddress) & B_WRITE_AREA) != 0;
6905
6906				if (becomesWritable && !isWritable)
6907					commitmentChange += B_PAGE_SIZE;
6908				else if (!becomesWritable && isWritable)
6909					commitmentChange -= B_PAGE_SIZE;
6910			}
6911
6912			if (commitmentChange != 0) {
6913				const off_t newCommitment = topCache->committed_size + commitmentChange;
6914				ASSERT(newCommitment <= (topCache->virtual_end - topCache->virtual_base));
6915				status_t status = topCache->Commit(newCommitment, VM_PRIORITY_USER);
6916				if (status != B_OK)
6917					return status;
6918			}
6919		}
6920
6921		for (addr_t pageAddress = area->Base() + offset;
6922				pageAddress < currentAddress; pageAddress += B_PAGE_SIZE) {
6923			map->Lock();
6924
6925			set_area_page_protection(area, pageAddress, protection);
6926
6927			phys_addr_t physicalAddress;
6928			uint32 flags;
6929
6930			status_t error = map->Query(pageAddress, &physicalAddress, &flags);
6931			if (error != B_OK || (flags & PAGE_PRESENT) == 0) {
6932				map->Unlock();
6933				continue;
6934			}
6935
6936			vm_page* page = vm_lookup_page(physicalAddress / B_PAGE_SIZE);
6937			if (page == NULL) {
6938				panic("area %p looking up page failed for pa %#" B_PRIxPHYSADDR
6939					"\n", area, physicalAddress);
6940				map->Unlock();
6941				return B_ERROR;
6942			}
6943
6944			// If the page is not in the topmost cache and write access is
6945			// requested, we have to unmap it. Otherwise we can re-map it with
6946			// the new protection.
6947			bool unmapPage = page->Cache() != topCache
6948				&& (protection & B_WRITE_AREA) != 0;
6949
6950			if (!unmapPage)
6951				map->ProtectPage(area, pageAddress, protection);
6952
6953			map->Unlock();
6954
6955			if (unmapPage) {
6956				DEBUG_PAGE_ACCESS_START(page);
6957				unmap_page(area, pageAddress);
6958				DEBUG_PAGE_ACCESS_END(page);
6959			}
6960		}
6961	}
6962
6963	return B_OK;
6964}
6965
6966
6967status_t
6968_user_sync_memory(void* _address, size_t size, uint32 flags)
6969{
6970	addr_t address = (addr_t)_address;
6971	size = PAGE_ALIGN(size);
6972
6973	// check params
6974	if ((address % B_PAGE_SIZE) != 0)
6975		return B_BAD_VALUE;
6976	if (!is_user_address_range(_address, size)) {
6977		// weird error code required by POSIX
6978		return ENOMEM;
6979	}
6980
6981	bool writeSync = (flags & MS_SYNC) != 0;
6982	bool writeAsync = (flags & MS_ASYNC) != 0;
6983	if (writeSync && writeAsync)
6984		return B_BAD_VALUE;
6985
6986	if (size == 0 || (!writeSync && !writeAsync))
6987		return B_OK;
6988
6989	// iterate through the range and sync all concerned areas
6990	while (size > 0) {
6991		// read lock the address space
6992		AddressSpaceReadLocker locker;
6993		status_t error = locker.SetTo(team_get_current_team_id());
6994		if (error != B_OK)
6995			return error;
6996
6997		// get the first area
6998		VMArea* area = locker.AddressSpace()->LookupArea(address);
6999		if (area == NULL)
7000			return B_NO_MEMORY;
7001
7002		uint32 offset = address - area->Base();
7003		size_t rangeSize = min_c(area->Size() - offset, size);
7004		offset += area->cache_offset;
7005
7006		// lock the cache
7007		AreaCacheLocker cacheLocker(area);
7008		if (!cacheLocker)
7009			return B_BAD_VALUE;
7010		VMCache* cache = area->cache;
7011
7012		locker.Unlock();
7013
7014		uint32 firstPage = offset >> PAGE_SHIFT;
7015		uint32 endPage = firstPage + (rangeSize >> PAGE_SHIFT);
7016
7017		// write the pages
7018		if (cache->type == CACHE_TYPE_VNODE) {
7019			if (writeSync) {
7020				// synchronous
7021				error = vm_page_write_modified_page_range(cache, firstPage,
7022					endPage);
7023				if (error != B_OK)
7024					return error;
7025			} else {
7026				// asynchronous
7027				vm_page_schedule_write_page_range(cache, firstPage, endPage);
7028				// TODO: This is probably not quite what is supposed to happen.
7029				// Especially when a lot has to be written, it might take ages
7030				// until it really hits the disk.
7031			}
7032		}
7033
7034		address += rangeSize;
7035		size -= rangeSize;
7036	}
7037
7038	// NOTE: If I understand it correctly the purpose of MS_INVALIDATE is to
7039	// synchronize multiple mappings of the same file. In our VM they never get
7040	// out of sync, though, so we don't have to do anything.
7041
7042	return B_OK;
7043}
7044
7045
7046status_t
7047_user_memory_advice(void* _address, size_t size, uint32 advice)
7048{
7049	addr_t address = (addr_t)_address;
7050	if ((address % B_PAGE_SIZE) != 0)
7051		return B_BAD_VALUE;
7052
7053	size = PAGE_ALIGN(size);
7054	if (!is_user_address_range(_address, size)) {
7055		// weird error code required by POSIX
7056		return B_NO_MEMORY;
7057	}
7058
7059	switch (advice) {
7060		case MADV_NORMAL:
7061		case MADV_SEQUENTIAL:
7062		case MADV_RANDOM:
7063		case MADV_WILLNEED:
7064		case MADV_DONTNEED:
7065			// TODO: Implement!
7066			break;
7067
7068		case MADV_FREE:
7069		{
7070			AddressSpaceWriteLocker locker;
7071			do {
7072				status_t status = locker.SetTo(team_get_current_team_id());
7073				if (status != B_OK)
7074					return status;
7075			} while (wait_if_address_range_is_wired(locker.AddressSpace(),
7076					address, size, &locker));
7077
7078			discard_address_range(locker.AddressSpace(), address, size, false);
7079			break;
7080		}
7081
7082		default:
7083			return B_BAD_VALUE;
7084	}
7085
7086	return B_OK;
7087}
7088
7089
7090status_t
7091_user_get_memory_properties(team_id teamID, const void* address,
7092	uint32* _protected, uint32* _lock)
7093{
7094	if (!IS_USER_ADDRESS(_protected) || !IS_USER_ADDRESS(_lock))
7095		return B_BAD_ADDRESS;
7096
7097	AddressSpaceReadLocker locker;
7098	status_t error = locker.SetTo(teamID);
7099	if (error != B_OK)
7100		return error;
7101
7102	VMArea* area = locker.AddressSpace()->LookupArea((addr_t)address);
7103	if (area == NULL)
7104		return B_NO_MEMORY;
7105
7106	uint32 protection = get_area_page_protection(area, (addr_t)address);
7107	uint32 wiring = area->wiring;
7108
7109	locker.Unlock();
7110
7111	error = user_memcpy(_protected, &protection, sizeof(protection));
7112	if (error != B_OK)
7113		return error;
7114
7115	error = user_memcpy(_lock, &wiring, sizeof(wiring));
7116
7117	return error;
7118}
7119
7120
static status_t
user_set_memory_swappable(const void* _address, size_t size, bool swappable)
{
#if ENABLE_SWAP_SUPPORT
	// check address range
	addr_t address = (addr_t)_address;
	size = PAGE_ALIGN(size);

	if ((address % B_PAGE_SIZE) != 0)
		return EINVAL;
	if (!is_user_address_range(_address, size))
		return EINVAL;

	const addr_t endAddress = address + size;

	AddressSpaceReadLocker addressSpaceLocker;
	status_t error = addressSpaceLocker.SetTo(team_get_current_team_id());
	if (error != B_OK)
		return error;
	VMAddressSpace* addressSpace = addressSpaceLocker.AddressSpace();

	// iterate through all concerned areas
	addr_t nextAddress = address;
	while (nextAddress != endAddress) {
		// get the next area
		VMArea* area = addressSpace->LookupArea(nextAddress);
		if (area == NULL) {
			error = B_BAD_ADDRESS;
			break;
		}

		const addr_t areaStart = nextAddress;
		const addr_t areaEnd = std::min(endAddress, area->Base() + area->Size());
		nextAddress = areaEnd;

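		// Fault in and temporarily wire this sub-range while its swappability
		// is being changed.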
		error = lock_memory_etc(addressSpace->ID(), (void*)areaStart, areaEnd - areaStart, 0);
		if (error != B_OK) {
			// We don't need to unset or reset things on failure.
			break;
		}

		VMCacheChainLocker cacheChainLocker(vm_area_get_locked_cache(area));
		VMAnonymousCache* anonCache = NULL;
		if (dynamic_cast<VMAnonymousNoSwapCache*>(area->cache) != NULL) {
			// This memory can never be swapped anyway. Nothing to do.
		} else if ((anonCache = dynamic_cast<VMAnonymousCache*>(area->cache)) != NULL) {
			error = anonCache->SetCanSwapPages(areaStart - area->Base(),
				areaEnd - areaStart, swappable);
		} else {
			// Some other cache type? We cannot affect anything here.
			error = EINVAL;
		}

		cacheChainLocker.Unlock();

		unlock_memory_etc(addressSpace->ID(), (void*)areaStart, areaEnd - areaStart, 0);
		if (error != B_OK)
			break;
	}

	return error;
#else
	// No swap support? Nothing to do.
	return B_OK;
#endif
}


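/*!	Syscall backing mlock(): prevents the pages of the given range from being
	swapped out.
*/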
status_t
_user_mlock(const void* _address, size_t size)
{
	return user_set_memory_swappable(_address, size, false);
}


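/*!	Syscall backing munlock(): allows the pages of the given range to be
	swapped out again.
*/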
status_t
_user_munlock(const void* _address, size_t size)
{
	// TODO: B_SHARED_AREAs need to be handled a bit differently:
	// if multiple clones of an area had mlock() called on them,
	// munlock() must also be called on all of them to actually unlock.
	// (At present, the first munlock() will unlock all.)
	// TODO: fork() should automatically unlock memory in the child.
	return user_set_memory_swappable(_address, size, true);
}


// #pragma mark -- compatibility


#if defined(__i386__) && B_HAIKU_PHYSICAL_BITS > 32


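// The BeOS-era layout of physical_entry, with 32 bit address and size fields.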
struct physical_entry_beos {
	uint32	address;
	uint32	size;
};


/*!	The physical_entry structure has changed. We need to translate it to the
	old one.
*/
extern "C" int32
__get_memory_map_beos(const void* _address, size_t numBytes,
	physical_entry_beos* table, int32 numEntries)
{
	if (numEntries <= 0)
		return B_BAD_VALUE;

	const uint8* address = (const uint8*)_address;

	int32 count = 0;
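	// Query one physical run at a time and copy it into the old 32 bit
	// entry layout.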
	while (numBytes > 0 && count < numEntries) {
		physical_entry entry;
		status_t result = __get_memory_map_haiku(address, numBytes, &entry, 1);
		// B_BUFFER_OVERFLOW only means that the range needs more entries than
		// we asked for; the one entry we did get is still valid.
		if (result < 0 && result != B_BUFFER_OVERFLOW)
			return result;

		if (entry.address >= (phys_addr_t)1 << 32) {
			panic("get_memory_map(): Address is greater than 4 GB!");
			return B_ERROR;
		}

		table[count].address = entry.address;
		table[count++].size = entry.size;

		address += entry.size;
		numBytes -= entry.size;
	}

	// null-terminate the table, if possible
	if (count < numEntries) {
		table[count].address = 0;
		table[count].size = 0;
	}

	return B_OK;
}


/*!	The type of the \a physicalAddress parameter has changed from void* to
	phys_addr_t.
*/
extern "C" area_id
__map_physical_memory_beos(const char* name, void* physicalAddress,
	size_t numBytes, uint32 addressSpec, uint32 protection,
	void** _virtualAddress)
{
	return __map_physical_memory_haiku(name, (addr_t)physicalAddress, numBytes,
		addressSpec, protection, _virtualAddress);
}


/*!	The caller might not be able to deal with physical addresses >= 4 GB, so
	we meddle with the \a lock parameter to force 32 bit.
*/
extern "C" area_id
__create_area_beos(const char* name, void** _address, uint32 addressSpec,
	size_t size, uint32 lock, uint32 protection)
{
	switch (lock) {
		case B_NO_LOCK:
			break;
		case B_FULL_LOCK:
		case B_LAZY_LOCK:
			lock = B_32_BIT_FULL_LOCK;
			break;
		case B_CONTIGUOUS:
			lock = B_32_BIT_CONTIGUOUS;
			break;
	}

	return __create_area_haiku(name, _address, addressSpec, size, lock,
		protection);
}


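// The BeOS-compatible wrappers above are exported under the old "BASE" symbol
// version, while the current implementations are exported as the default
// "1_ALPHA3" version.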
DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__get_memory_map_beos", "get_memory_map@",
	"BASE");
DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__map_physical_memory_beos",
	"map_physical_memory@", "BASE");
DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__create_area_beos", "create_area@",
	"BASE");

DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__get_memory_map_haiku",
	"get_memory_map@@", "1_ALPHA3");
DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__map_physical_memory_haiku",
	"map_physical_memory@@", "1_ALPHA3");
DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__create_area_haiku", "create_area@@",
	"1_ALPHA3");


#else


DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__get_memory_map_haiku",
	"get_memory_map@@", "BASE");
DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__map_physical_memory_haiku",
	"map_physical_memory@@", "BASE");
DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__create_area_haiku", "create_area@@",
	"BASE");


#endif	// defined(__i386__) && B_HAIKU_PHYSICAL_BITS > 32
