1/*
2 * Copyright 2009-2011, Ingo Weinhold, ingo_weinhold@gmx.de.
3 * Copyright 2002-2010, Axel Dörfler, axeld@pinc-software.de.
4 * Distributed under the terms of the MIT License.
5 *
6 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
7 * Distributed under the terms of the NewOS License.
8 */
9
10
11#include <vm/vm.h>
12
13#include <ctype.h>
14#include <stdlib.h>
15#include <stdio.h>
16#include <string.h>
17#include <sys/mman.h>
18
19#include <algorithm>
20
21#include <OS.h>
22#include <KernelExport.h>
23
24#include <AutoDeleter.h>
25
26#include <symbol_versioning.h>
27
28#include <arch/cpu.h>
29#include <arch/vm.h>
30#include <boot/elf.h>
31#include <boot/stage2.h>
32#include <condition_variable.h>
33#include <console.h>
34#include <debug.h>
35#include <file_cache.h>
36#include <fs/fd.h>
37#include <heap.h>
38#include <kernel.h>
39#include <int.h>
40#include <lock.h>
41#include <low_resource_manager.h>
42#include <slab/Slab.h>
43#include <smp.h>
44#include <system_info.h>
45#include <thread.h>
46#include <team.h>
47#include <tracing.h>
48#include <util/AutoLock.h>
49#include <util/khash.h>
50#include <vm/vm_page.h>
51#include <vm/vm_priv.h>
52#include <vm/VMAddressSpace.h>
53#include <vm/VMArea.h>
54#include <vm/VMCache.h>
55
56#include "VMAddressSpaceLocking.h"
57#include "VMAnonymousCache.h"
58#include "VMAnonymousNoSwapCache.h"
59#include "IORequest.h"
60
61
62//#define TRACE_VM
63//#define TRACE_FAULTS
64#ifdef TRACE_VM
65#	define TRACE(x) dprintf x
66#else
67#	define TRACE(x) ;
68#endif
69#ifdef TRACE_FAULTS
70#	define FTRACE(x) dprintf x
71#else
72#	define FTRACE(x) ;
73#endif
74
75
76class AreaCacheLocking {
77public:
78	inline bool Lock(VMCache* lockable)
79	{
80		return false;
81	}
82
83	inline void Unlock(VMCache* lockable)
84	{
85		vm_area_put_locked_cache(lockable);
86	}
87};
88
89class AreaCacheLocker : public AutoLocker<VMCache, AreaCacheLocking> {
90public:
91	inline AreaCacheLocker(VMCache* cache = NULL)
92		: AutoLocker<VMCache, AreaCacheLocking>(cache, true)
93	{
94	}
95
96	inline AreaCacheLocker(VMArea* area)
97		: AutoLocker<VMCache, AreaCacheLocking>()
98	{
99		SetTo(area);
100	}
101
102	inline void SetTo(VMCache* cache, bool alreadyLocked)
103	{
104		AutoLocker<VMCache, AreaCacheLocking>::SetTo(cache, alreadyLocked);
105	}
106
107	inline void SetTo(VMArea* area)
108	{
109		return AutoLocker<VMCache, AreaCacheLocking>::SetTo(
110			area != NULL ? vm_area_get_locked_cache(area) : NULL, true, true);
111	}
112};
113
114
115class VMCacheChainLocker {
116public:
117	VMCacheChainLocker()
118		:
119		fTopCache(NULL),
120		fBottomCache(NULL)
121	{
122	}
123
124	VMCacheChainLocker(VMCache* topCache)
125		:
126		fTopCache(topCache),
127		fBottomCache(topCache)
128	{
129	}
130
131	~VMCacheChainLocker()
132	{
133		Unlock();
134	}
135
136	void SetTo(VMCache* topCache)
137	{
138		fTopCache = topCache;
139		fBottomCache = topCache;
140
141		if (topCache != NULL)
142			topCache->SetUserData(NULL);
143	}
144
145	VMCache* LockSourceCache()
146	{
147		if (fBottomCache == NULL || fBottomCache->source == NULL)
148			return NULL;
149
150		VMCache* previousCache = fBottomCache;
151
152		fBottomCache = fBottomCache->source;
153		fBottomCache->Lock();
154		fBottomCache->AcquireRefLocked();
155		fBottomCache->SetUserData(previousCache);
156
157		return fBottomCache;
158	}
159
160	void LockAllSourceCaches()
161	{
162		while (LockSourceCache() != NULL) {
163		}
164	}
165
166	void Unlock(VMCache* exceptCache = NULL)
167	{
168		if (fTopCache == NULL)
169			return;
170
171		// Unlock caches in source -> consumer direction. This is important to
172		// avoid double-locking and a reversal of locking order in case a cache
173		// is eligable for merging.
174		VMCache* cache = fBottomCache;
175		while (cache != NULL) {
176			VMCache* nextCache = (VMCache*)cache->UserData();
177			if (cache != exceptCache)
178				cache->ReleaseRefAndUnlock(cache != fTopCache);
179
180			if (cache == fTopCache)
181				break;
182
183			cache = nextCache;
184		}
185
186		fTopCache = NULL;
187		fBottomCache = NULL;
188	}
189
190	void UnlockKeepRefs(bool keepTopCacheLocked)
191	{
192		if (fTopCache == NULL)
193			return;
194
195		VMCache* nextCache = fBottomCache;
196		VMCache* cache = NULL;
197
198		while (keepTopCacheLocked
199				? nextCache != fTopCache : cache != fTopCache) {
200			cache = nextCache;
201			nextCache = (VMCache*)cache->UserData();
202			cache->Unlock(cache != fTopCache);
203		}
204	}
205
206	void RelockCaches(bool topCacheLocked)
207	{
208		if (fTopCache == NULL)
209			return;
210
211		VMCache* nextCache = fTopCache;
212		VMCache* cache = NULL;
213		if (topCacheLocked) {
214			cache = nextCache;
215			nextCache = cache->source;
216		}
217
218		while (cache != fBottomCache && nextCache != NULL) {
219			VMCache* consumer = cache;
220			cache = nextCache;
221			nextCache = cache->source;
222			cache->Lock();
223			cache->SetUserData(consumer);
224		}
225	}
226
227private:
228	VMCache*	fTopCache;
229	VMCache*	fBottomCache;
230};
231
232
233// The memory reserve an allocation of the certain priority must not touch.
234static const size_t kMemoryReserveForPriority[] = {
235	VM_MEMORY_RESERVE_USER,		// user
236	VM_MEMORY_RESERVE_SYSTEM,	// system
237	0							// VIP
238};
239
240
241ObjectCache* gPageMappingsObjectCache;
242
243static rw_lock sAreaCacheLock = RW_LOCK_INITIALIZER("area->cache");
244
245static off_t sAvailableMemory;
246static off_t sNeededMemory;
247static mutex sAvailableMemoryLock = MUTEX_INITIALIZER("available memory lock");
248static uint32 sPageFaults;
249
250static VMPhysicalPageMapper* sPhysicalPageMapper;
251
252#if DEBUG_CACHE_LIST
253
254struct cache_info {
255	VMCache*	cache;
256	addr_t		page_count;
257	addr_t		committed;
258};
259
260static const int kCacheInfoTableCount = 100 * 1024;
261static cache_info* sCacheInfoTable;
262
263#endif	// DEBUG_CACHE_LIST
264
265
266// function declarations
267static void delete_area(VMAddressSpace* addressSpace, VMArea* area,
268	bool addressSpaceCleanup);
269static status_t vm_soft_fault(VMAddressSpace* addressSpace, addr_t address,
270	bool isWrite, bool isUser, vm_page** wirePage,
271	VMAreaWiredRange* wiredRange = NULL);
272static status_t map_backing_store(VMAddressSpace* addressSpace,
273	VMCache* cache, off_t offset, const char* areaName, addr_t size, int wiring,
274	int protection, int mapping, uint32 flags,
275	const virtual_address_restrictions* addressRestrictions, bool kernel,
276	VMArea** _area, void** _virtualAddress);
277
278
279//	#pragma mark -
280
281
282#if VM_PAGE_FAULT_TRACING
283
284namespace VMPageFaultTracing {
285
286class PageFaultStart : public AbstractTraceEntry {
287public:
288	PageFaultStart(addr_t address, bool write, bool user, addr_t pc)
289		:
290		fAddress(address),
291		fPC(pc),
292		fWrite(write),
293		fUser(user)
294	{
295		Initialized();
296	}
297
298	virtual void AddDump(TraceOutput& out)
299	{
300		out.Print("page fault %#lx %s %s, pc: %#lx", fAddress,
301			fWrite ? "write" : "read", fUser ? "user" : "kernel", fPC);
302	}
303
304private:
305	addr_t	fAddress;
306	addr_t	fPC;
307	bool	fWrite;
308	bool	fUser;
309};
310
311
312// page fault errors
313enum {
314	PAGE_FAULT_ERROR_NO_AREA		= 0,
315	PAGE_FAULT_ERROR_KERNEL_ONLY,
316	PAGE_FAULT_ERROR_WRITE_PROTECTED,
317	PAGE_FAULT_ERROR_READ_PROTECTED,
318	PAGE_FAULT_ERROR_KERNEL_BAD_USER_MEMORY,
319	PAGE_FAULT_ERROR_NO_ADDRESS_SPACE
320};
321
322
323class PageFaultError : public AbstractTraceEntry {
324public:
325	PageFaultError(area_id area, status_t error)
326		:
327		fArea(area),
328		fError(error)
329	{
330		Initialized();
331	}
332
333	virtual void AddDump(TraceOutput& out)
334	{
335		switch (fError) {
336			case PAGE_FAULT_ERROR_NO_AREA:
337				out.Print("page fault error: no area");
338				break;
339			case PAGE_FAULT_ERROR_KERNEL_ONLY:
340				out.Print("page fault error: area: %ld, kernel only", fArea);
341				break;
342			case PAGE_FAULT_ERROR_WRITE_PROTECTED:
343				out.Print("page fault error: area: %ld, write protected",
344					fArea);
345				break;
346			case PAGE_FAULT_ERROR_READ_PROTECTED:
347				out.Print("page fault error: area: %ld, read protected", fArea);
348				break;
349			case PAGE_FAULT_ERROR_KERNEL_BAD_USER_MEMORY:
350				out.Print("page fault error: kernel touching bad user memory");
351				break;
352			case PAGE_FAULT_ERROR_NO_ADDRESS_SPACE:
353				out.Print("page fault error: no address space");
354				break;
355			default:
356				out.Print("page fault error: area: %ld, error: %s", fArea,
357					strerror(fError));
358				break;
359		}
360	}
361
362private:
363	area_id		fArea;
364	status_t	fError;
365};
366
367
368class PageFaultDone : public AbstractTraceEntry {
369public:
370	PageFaultDone(area_id area, VMCache* topCache, VMCache* cache,
371			vm_page* page)
372		:
373		fArea(area),
374		fTopCache(topCache),
375		fCache(cache),
376		fPage(page)
377	{
378		Initialized();
379	}
380
381	virtual void AddDump(TraceOutput& out)
382	{
383		out.Print("page fault done: area: %ld, top cache: %p, cache: %p, "
384			"page: %p", fArea, fTopCache, fCache, fPage);
385	}
386
387private:
388	area_id		fArea;
389	VMCache*	fTopCache;
390	VMCache*	fCache;
391	vm_page*	fPage;
392};
393
394}	// namespace VMPageFaultTracing
395
396#	define TPF(x) new(std::nothrow) VMPageFaultTracing::x;
397#else
398#	define TPF(x) ;
399#endif	// VM_PAGE_FAULT_TRACING
400
401
402//	#pragma mark -
403
404
405/*!	The page's cache must be locked.
406*/
407static inline void
408increment_page_wired_count(vm_page* page)
409{
410	if (!page->IsMapped())
411		atomic_add(&gMappedPagesCount, 1);
412	page->IncrementWiredCount();
413}
414
415
416/*!	The page's cache must be locked.
417*/
418static inline void
419decrement_page_wired_count(vm_page* page)
420{
421	page->DecrementWiredCount();
422	if (!page->IsMapped())
423		atomic_add(&gMappedPagesCount, -1);
424}
425
426
427static inline addr_t
428virtual_page_address(VMArea* area, vm_page* page)
429{
430	return area->Base()
431		+ ((page->cache_offset << PAGE_SHIFT) - area->cache_offset);
432}
433
434
435//! You need to have the address space locked when calling this function
436static VMArea*
437lookup_area(VMAddressSpace* addressSpace, area_id id)
438{
439	VMAreaHash::ReadLock();
440
441	VMArea* area = VMAreaHash::LookupLocked(id);
442	if (area != NULL && area->address_space != addressSpace)
443		area = NULL;
444
445	VMAreaHash::ReadUnlock();
446
447	return area;
448}
449
450
451static status_t
452allocate_area_page_protections(VMArea* area)
453{
454	// In the page protections we store only the three user protections,
455	// so we use 4 bits per page.
456	uint32 bytes = (area->Size() / B_PAGE_SIZE + 1) / 2;
457	area->page_protections = (uint8*)malloc_etc(bytes,
458		HEAP_DONT_LOCK_KERNEL_SPACE);
459	if (area->page_protections == NULL)
460		return B_NO_MEMORY;
461
462	// init the page protections for all pages to that of the area
463	uint32 areaProtection = area->protection
464		& (B_READ_AREA | B_WRITE_AREA | B_EXECUTE_AREA);
465	memset(area->page_protections, areaProtection | (areaProtection << 4),
466		bytes);
467	return B_OK;
468}
469
470
471static inline void
472set_area_page_protection(VMArea* area, addr_t pageAddress, uint32 protection)
473{
474	protection &= B_READ_AREA | B_WRITE_AREA | B_EXECUTE_AREA;
475	uint32 pageIndex = (pageAddress - area->Base()) / B_PAGE_SIZE;
476	uint8& entry = area->page_protections[pageIndex / 2];
477	if (pageIndex % 2 == 0)
478		entry = (entry & 0xf0) | protection;
479	else
480		entry = (entry & 0x0f) | (protection << 4);
481}
482
483
484static inline uint32
485get_area_page_protection(VMArea* area, addr_t pageAddress)
486{
487	if (area->page_protections == NULL)
488		return area->protection;
489
490	uint32 pageIndex = (pageAddress - area->Base()) / B_PAGE_SIZE;
491	uint32 protection = area->page_protections[pageIndex / 2];
492	if (pageIndex % 2 == 0)
493		protection &= 0x0f;
494	else
495		protection >>= 4;
496
497	// If this is a kernel area we translate the user flags to kernel flags.
498	if (area->address_space == VMAddressSpace::Kernel()) {
499		uint32 kernelProtection = 0;
500		if ((protection & B_READ_AREA) != 0)
501			kernelProtection |= B_KERNEL_READ_AREA;
502		if ((protection & B_WRITE_AREA) != 0)
503			kernelProtection |= B_KERNEL_WRITE_AREA;
504
505		return kernelProtection;
506	}
507
508	return protection | B_KERNEL_READ_AREA
509		| (protection & B_WRITE_AREA ? B_KERNEL_WRITE_AREA : 0);
510}
511
512
513/*!	The caller must have reserved enough pages the translation map
514	implementation might need to map this page.
515	The page's cache must be locked.
516*/
517static status_t
518map_page(VMArea* area, vm_page* page, addr_t address, uint32 protection,
519	vm_page_reservation* reservation)
520{
521	VMTranslationMap* map = area->address_space->TranslationMap();
522
523	bool wasMapped = page->IsMapped();
524
525	if (area->wiring == B_NO_LOCK) {
526		DEBUG_PAGE_ACCESS_CHECK(page);
527
528		bool isKernelSpace = area->address_space == VMAddressSpace::Kernel();
529		vm_page_mapping* mapping = (vm_page_mapping*)object_cache_alloc(
530			gPageMappingsObjectCache,
531			CACHE_DONT_WAIT_FOR_MEMORY
532				| (isKernelSpace ? CACHE_DONT_LOCK_KERNEL_SPACE : 0));
533		if (mapping == NULL)
534			return B_NO_MEMORY;
535
536		mapping->page = page;
537		mapping->area = area;
538
539		map->Lock();
540
541		map->Map(address, page->physical_page_number * B_PAGE_SIZE, protection,
542			area->MemoryType(), reservation);
543
544		// insert mapping into lists
545		if (!page->IsMapped())
546			atomic_add(&gMappedPagesCount, 1);
547
548		page->mappings.Add(mapping);
549		area->mappings.Add(mapping);
550
551		map->Unlock();
552	} else {
553		DEBUG_PAGE_ACCESS_CHECK(page);
554
555		map->Lock();
556		map->Map(address, page->physical_page_number * B_PAGE_SIZE, protection,
557			area->MemoryType(), reservation);
558		map->Unlock();
559
560		increment_page_wired_count(page);
561	}
562
563	if (!wasMapped) {
564		// The page is mapped now, so we must not remain in the cached queue.
565		// It also makes sense to move it from the inactive to the active, since
566		// otherwise the page daemon wouldn't come to keep track of it (in idle
567		// mode) -- if the page isn't touched, it will be deactivated after a
568		// full iteration through the queue at the latest.
569		if (page->State() == PAGE_STATE_CACHED
570				|| page->State() == PAGE_STATE_INACTIVE) {
571			vm_page_set_state(page, PAGE_STATE_ACTIVE);
572		}
573	}
574
575	return B_OK;
576}
577
578
579/*!	If \a preserveModified is \c true, the caller must hold the lock of the
580	page's cache.
581*/
582static inline bool
583unmap_page(VMArea* area, addr_t virtualAddress)
584{
585	return area->address_space->TranslationMap()->UnmapPage(area,
586		virtualAddress, true);
587}
588
589
590/*!	If \a preserveModified is \c true, the caller must hold the lock of all
591	mapped pages' caches.
592*/
593static inline void
594unmap_pages(VMArea* area, addr_t base, size_t size)
595{
596	area->address_space->TranslationMap()->UnmapPages(area, base, size, true);
597}
598
599
600/*!	Cuts a piece out of an area. If the given cut range covers the complete
601	area, it is deleted. If it covers the beginning or the end, the area is
602	resized accordingly. If the range covers some part in the middle of the
603	area, it is split in two; in this case the second area is returned via
604	\a _secondArea (the variable is left untouched in the other cases).
605	The address space must be write locked.
606	The caller must ensure that no part of the given range is wired.
607*/
608static status_t
609cut_area(VMAddressSpace* addressSpace, VMArea* area, addr_t address,
610	addr_t lastAddress, VMArea** _secondArea, bool kernel)
611{
612	// Does the cut range intersect with the area at all?
613	addr_t areaLast = area->Base() + (area->Size() - 1);
614	if (area->Base() > lastAddress || areaLast < address)
615		return B_OK;
616
617	// Is the area fully covered?
618	if (area->Base() >= address && areaLast <= lastAddress) {
619		delete_area(addressSpace, area, false);
620		return B_OK;
621	}
622
623	int priority;
624	uint32 allocationFlags;
625	if (addressSpace == VMAddressSpace::Kernel()) {
626		priority = VM_PRIORITY_SYSTEM;
627		allocationFlags = HEAP_DONT_WAIT_FOR_MEMORY
628			| HEAP_DONT_LOCK_KERNEL_SPACE;
629	} else {
630		priority = VM_PRIORITY_USER;
631		allocationFlags = 0;
632	}
633
634	VMCache* cache = vm_area_get_locked_cache(area);
635	VMCacheChainLocker cacheChainLocker(cache);
636	cacheChainLocker.LockAllSourceCaches();
637
638	// Cut the end only?
639	if (areaLast <= lastAddress) {
640		size_t oldSize = area->Size();
641		size_t newSize = address - area->Base();
642
643		status_t error = addressSpace->ShrinkAreaTail(area, newSize,
644			allocationFlags);
645		if (error != B_OK)
646			return error;
647
648		// unmap pages
649		unmap_pages(area, address, oldSize - newSize);
650
651		// If no one else uses the area's cache, we can resize it, too.
652		if (cache->areas == area && area->cache_next == NULL
653			&& cache->consumers.IsEmpty()
654			&& cache->type == CACHE_TYPE_RAM) {
655			// Since VMCache::Resize() can temporarily drop the lock, we must
656			// unlock all lower caches to prevent locking order inversion.
657			cacheChainLocker.Unlock(cache);
658			cache->Resize(cache->virtual_base + newSize, priority);
659			cache->ReleaseRefAndUnlock();
660		}
661
662		return B_OK;
663	}
664
665	// Cut the beginning only?
666	if (area->Base() >= address) {
667		addr_t oldBase = area->Base();
668		addr_t newBase = lastAddress + 1;
669		size_t newSize = areaLast - lastAddress;
670
671		// unmap pages
672		unmap_pages(area, oldBase, newBase - oldBase);
673
674		// resize the area
675		status_t error = addressSpace->ShrinkAreaHead(area, newSize,
676			allocationFlags);
677		if (error != B_OK)
678			return error;
679
680		// TODO: If no one else uses the area's cache, we should resize it, too!
681
682		area->cache_offset += newBase - oldBase;
683
684		return B_OK;
685	}
686
687	// The tough part -- cut a piece out of the middle of the area.
688	// We do that by shrinking the area to the begin section and creating a
689	// new area for the end section.
690
691	addr_t firstNewSize = address - area->Base();
692	addr_t secondBase = lastAddress + 1;
693	addr_t secondSize = areaLast - lastAddress;
694
695	// unmap pages
696	unmap_pages(area, address, area->Size() - firstNewSize);
697
698	// resize the area
699	addr_t oldSize = area->Size();
700	status_t error = addressSpace->ShrinkAreaTail(area, firstNewSize,
701		allocationFlags);
702	if (error != B_OK)
703		return error;
704
705	// TODO: If no one else uses the area's cache, we might want to create a
706	// new cache for the second area, transfer the concerned pages from the
707	// first cache to it and resize the first cache.
708
709	// map the second area
710	virtual_address_restrictions addressRestrictions = {};
711	addressRestrictions.address = (void*)secondBase;
712	addressRestrictions.address_specification = B_EXACT_ADDRESS;
713	VMArea* secondArea;
714	error = map_backing_store(addressSpace, cache,
715		area->cache_offset + (secondBase - area->Base()), area->name,
716		secondSize, area->wiring, area->protection, REGION_NO_PRIVATE_MAP, 0,
717		&addressRestrictions, kernel, &secondArea, NULL);
718	if (error != B_OK) {
719		addressSpace->ShrinkAreaTail(area, oldSize, allocationFlags);
720		return error;
721	}
722
723	// We need a cache reference for the new area.
724	cache->AcquireRefLocked();
725
726	if (_secondArea != NULL)
727		*_secondArea = secondArea;
728
729	return B_OK;
730}
731
732
733/*!	Deletes all areas in the given address range.
734	The address space must be write-locked.
735	The caller must ensure that no part of the given range is wired.
736*/
737static status_t
738unmap_address_range(VMAddressSpace* addressSpace, addr_t address, addr_t size,
739	bool kernel)
740{
741	size = PAGE_ALIGN(size);
742	addr_t lastAddress = address + (size - 1);
743
744	// Check, whether the caller is allowed to modify the concerned areas.
745	if (!kernel) {
746		for (VMAddressSpace::AreaIterator it = addressSpace->GetAreaIterator();
747				VMArea* area = it.Next();) {
748			addr_t areaLast = area->Base() + (area->Size() - 1);
749			if (area->Base() < lastAddress && address < areaLast) {
750				if ((area->protection & B_KERNEL_AREA) != 0)
751					return B_NOT_ALLOWED;
752			}
753		}
754	}
755
756	for (VMAddressSpace::AreaIterator it = addressSpace->GetAreaIterator();
757			VMArea* area = it.Next();) {
758		addr_t areaLast = area->Base() + (area->Size() - 1);
759		if (area->Base() < lastAddress && address < areaLast) {
760			status_t error = cut_area(addressSpace, area, address,
761				lastAddress, NULL, kernel);
762			if (error != B_OK)
763				return error;
764				// Failing after already messing with areas is ugly, but we
765				// can't do anything about it.
766		}
767	}
768
769	return B_OK;
770}
771
772
773/*! You need to hold the lock of the cache and the write lock of the address
774	space when calling this function.
775	Note, that in case of error your cache will be temporarily unlocked.
776	If \a addressSpec is \c B_EXACT_ADDRESS and the
777	\c CREATE_AREA_UNMAP_ADDRESS_RANGE flag is specified, the caller must ensure
778	that no part of the specified address range (base \c *_virtualAddress, size
779	\a size) is wired.
780*/
781static status_t
782map_backing_store(VMAddressSpace* addressSpace, VMCache* cache, off_t offset,
783	const char* areaName, addr_t size, int wiring, int protection, int mapping,
784	uint32 flags, const virtual_address_restrictions* addressRestrictions,
785	bool kernel, VMArea** _area, void** _virtualAddress)
786{
787	TRACE(("map_backing_store: aspace %p, cache %p, virtual %p, offset 0x%"
788		B_PRIx64 ", size %" B_PRIuADDR ", addressSpec %" B_PRIu32 ", wiring %d"
789		", protection %d, area %p, areaName '%s'\n", addressSpace, cache,
790		addressRestrictions->address, offset, size,
791		addressRestrictions->address_specification, wiring, protection,
792		_area, areaName));
793	cache->AssertLocked();
794
795	uint32 allocationFlags = HEAP_DONT_WAIT_FOR_MEMORY
796		| HEAP_DONT_LOCK_KERNEL_SPACE;
797	int priority;
798	if (addressSpace != VMAddressSpace::Kernel()) {
799		priority = VM_PRIORITY_USER;
800	} else if ((flags & CREATE_AREA_PRIORITY_VIP) != 0) {
801		priority = VM_PRIORITY_VIP;
802		allocationFlags |= HEAP_PRIORITY_VIP;
803	} else
804		priority = VM_PRIORITY_SYSTEM;
805
806	VMArea* area = addressSpace->CreateArea(areaName, wiring, protection,
807		allocationFlags);
808	if (area == NULL)
809		return B_NO_MEMORY;
810
811	status_t status;
812
813	// if this is a private map, we need to create a new cache
814	// to handle the private copies of pages as they are written to
815	VMCache* sourceCache = cache;
816	if (mapping == REGION_PRIVATE_MAP) {
817		VMCache* newCache;
818
819		// create an anonymous cache
820		status = VMCacheFactory::CreateAnonymousCache(newCache,
821			(protection & B_STACK_AREA) != 0
822				|| (protection & B_OVERCOMMITTING_AREA) != 0, 0,
823			cache->GuardSize() / B_PAGE_SIZE, true, VM_PRIORITY_USER);
824		if (status != B_OK)
825			goto err1;
826
827		newCache->Lock();
828		newCache->temporary = 1;
829		newCache->virtual_base = offset;
830		newCache->virtual_end = offset + size;
831
832		cache->AddConsumer(newCache);
833
834		cache = newCache;
835	}
836
837	if ((flags & CREATE_AREA_DONT_COMMIT_MEMORY) == 0) {
838		status = cache->SetMinimalCommitment(size, priority);
839		if (status != B_OK)
840			goto err2;
841	}
842
843	// check to see if this address space has entered DELETE state
844	if (addressSpace->IsBeingDeleted()) {
845		// okay, someone is trying to delete this address space now, so we can't
846		// insert the area, so back out
847		status = B_BAD_TEAM_ID;
848		goto err2;
849	}
850
851	if (addressRestrictions->address_specification == B_EXACT_ADDRESS
852			&& (flags & CREATE_AREA_UNMAP_ADDRESS_RANGE) != 0) {
853		status = unmap_address_range(addressSpace,
854			(addr_t)addressRestrictions->address, size, kernel);
855		if (status != B_OK)
856			goto err2;
857	}
858
859	status = addressSpace->InsertArea(area, size, addressRestrictions,
860		allocationFlags, _virtualAddress);
861	if (status != B_OK) {
862		// TODO: wait and try again once this is working in the backend
863#if 0
864		if (status == B_NO_MEMORY && addressSpec == B_ANY_KERNEL_ADDRESS) {
865			low_resource(B_KERNEL_RESOURCE_ADDRESS_SPACE, size,
866				0, 0);
867		}
868#endif
869		goto err2;
870	}
871
872	// attach the cache to the area
873	area->cache = cache;
874	area->cache_offset = offset;
875
876	// point the cache back to the area
877	cache->InsertAreaLocked(area);
878	if (mapping == REGION_PRIVATE_MAP)
879		cache->Unlock();
880
881	// insert the area in the global area hash table
882	VMAreaHash::Insert(area);
883
884	// grab a ref to the address space (the area holds this)
885	addressSpace->Get();
886
887//	ktrace_printf("map_backing_store: cache: %p (source: %p), \"%s\" -> %p",
888//		cache, sourceCache, areaName, area);
889
890	*_area = area;
891	return B_OK;
892
893err2:
894	if (mapping == REGION_PRIVATE_MAP) {
895		// We created this cache, so we must delete it again. Note, that we
896		// need to temporarily unlock the source cache or we'll otherwise
897		// deadlock, since VMCache::_RemoveConsumer() will try to lock it, too.
898		sourceCache->Unlock();
899		cache->ReleaseRefAndUnlock();
900		sourceCache->Lock();
901	}
902err1:
903	addressSpace->DeleteArea(area, allocationFlags);
904	return status;
905}
906
907
908/*!	Equivalent to wait_if_area_range_is_wired(area, area->Base(), area->Size(),
909	  locker1, locker2).
910*/
911template<typename LockerType1, typename LockerType2>
912static inline bool
913wait_if_area_is_wired(VMArea* area, LockerType1* locker1, LockerType2* locker2)
914{
915	area->cache->AssertLocked();
916
917	VMAreaUnwiredWaiter waiter;
918	if (!area->AddWaiterIfWired(&waiter))
919		return false;
920
921	// unlock everything and wait
922	if (locker1 != NULL)
923		locker1->Unlock();
924	if (locker2 != NULL)
925		locker2->Unlock();
926
927	waiter.waitEntry.Wait();
928
929	return true;
930}
931
932
933/*!	Checks whether the given area has any wired ranges intersecting with the
934	specified range and waits, if so.
935
936	When it has to wait, the function calls \c Unlock() on both \a locker1
937	and \a locker2, if given.
938	The area's top cache must be locked and must be unlocked as a side effect
939	of calling \c Unlock() on either \a locker1 or \a locker2.
940
941	If the function does not have to wait it does not modify or unlock any
942	object.
943
944	\param area The area to be checked.
945	\param base The base address of the range to check.
946	\param size The size of the address range to check.
947	\param locker1 An object to be unlocked when before starting to wait (may
948		be \c NULL).
949	\param locker2 An object to be unlocked when before starting to wait (may
950		be \c NULL).
951	\return \c true, if the function had to wait, \c false otherwise.
952*/
953template<typename LockerType1, typename LockerType2>
954static inline bool
955wait_if_area_range_is_wired(VMArea* area, addr_t base, size_t size,
956	LockerType1* locker1, LockerType2* locker2)
957{
958	area->cache->AssertLocked();
959
960	VMAreaUnwiredWaiter waiter;
961	if (!area->AddWaiterIfWired(&waiter, base, size))
962		return false;
963
964	// unlock everything and wait
965	if (locker1 != NULL)
966		locker1->Unlock();
967	if (locker2 != NULL)
968		locker2->Unlock();
969
970	waiter.waitEntry.Wait();
971
972	return true;
973}
974
975
976/*!	Checks whether the given address space has any wired ranges intersecting
977	with the specified range and waits, if so.
978
979	Similar to wait_if_area_range_is_wired(), with the following differences:
980	- All areas intersecting with the range are checked (respectively all until
981	  one is found that contains a wired range intersecting with the given
982	  range).
983	- The given address space must at least be read-locked and must be unlocked
984	  when \c Unlock() is called on \a locker.
985	- None of the areas' caches are allowed to be locked.
986*/
987template<typename LockerType>
988static inline bool
989wait_if_address_range_is_wired(VMAddressSpace* addressSpace, addr_t base,
990	size_t size, LockerType* locker)
991{
992	addr_t end = base + size - 1;
993	for (VMAddressSpace::AreaIterator it = addressSpace->GetAreaIterator();
994			VMArea* area = it.Next();) {
995		// TODO: Introduce a VMAddressSpace method to get a close iterator!
996		if (area->Base() > end)
997			return false;
998
999		if (base >= area->Base() + area->Size() - 1)
1000			continue;
1001
1002		AreaCacheLocker cacheLocker(vm_area_get_locked_cache(area));
1003
1004		if (wait_if_area_range_is_wired(area, base, size, locker, &cacheLocker))
1005			return true;
1006	}
1007
1008	return false;
1009}
1010
1011
1012/*!	Prepares an area to be used for vm_set_kernel_area_debug_protection().
1013	It must be called in a situation where the kernel address space may be
1014	locked.
1015*/
1016status_t
1017vm_prepare_kernel_area_debug_protection(area_id id, void** cookie)
1018{
1019	AddressSpaceReadLocker locker;
1020	VMArea* area;
1021	status_t status = locker.SetFromArea(id, area);
1022	if (status != B_OK)
1023		return status;
1024
1025	if (area->page_protections == NULL) {
1026		status = allocate_area_page_protections(area);
1027		if (status != B_OK)
1028			return status;
1029	}
1030
1031	*cookie = (void*)area;
1032	return B_OK;
1033}
1034
1035
1036/*!	This is a debug helper function that can only be used with very specific
1037	use cases.
1038	Sets protection for the given address range to the protection specified.
1039	If \a protection is 0 then the involved pages will be marked non-present
1040	in the translation map to cause a fault on access. The pages aren't
1041	actually unmapped however so that they can be marked present again with
1042	additional calls to this function. For this to work the area must be
1043	fully locked in memory so that the pages aren't otherwise touched.
1044	This function does not lock the kernel address space and needs to be
1045	supplied with a \a cookie retrieved from a successful call to
1046	vm_prepare_kernel_area_debug_protection().
1047*/
1048status_t
1049vm_set_kernel_area_debug_protection(void* cookie, void* _address, size_t size,
1050	uint32 protection)
1051{
1052	// check address range
1053	addr_t address = (addr_t)_address;
1054	size = PAGE_ALIGN(size);
1055
1056	if ((address % B_PAGE_SIZE) != 0
1057		|| (addr_t)address + size < (addr_t)address
1058		|| !IS_KERNEL_ADDRESS(address)
1059		|| !IS_KERNEL_ADDRESS((addr_t)address + size)) {
1060		return B_BAD_VALUE;
1061	}
1062
1063	// Translate the kernel protection to user protection as we only store that.
1064	if ((protection & B_KERNEL_READ_AREA) != 0)
1065		protection |= B_READ_AREA;
1066	if ((protection & B_KERNEL_WRITE_AREA) != 0)
1067		protection |= B_WRITE_AREA;
1068
1069	VMAddressSpace* addressSpace = VMAddressSpace::GetKernel();
1070	VMTranslationMap* map = addressSpace->TranslationMap();
1071	VMArea* area = (VMArea*)cookie;
1072
1073	addr_t offset = address - area->Base();
1074	if (area->Size() - offset < size) {
1075		panic("protect range not fully within supplied area");
1076		return B_BAD_VALUE;
1077	}
1078
1079	if (area->page_protections == NULL) {
1080		panic("area has no page protections");
1081		return B_BAD_VALUE;
1082	}
1083
1084	// Invalidate the mapping entries so any access to them will fault or
1085	// restore the mapping entries unchanged so that lookup will success again.
1086	map->Lock();
1087	map->DebugMarkRangePresent(address, address + size, protection != 0);
1088	map->Unlock();
1089
1090	// And set the proper page protections so that the fault case will actually
1091	// fail and not simply try to map a new page.
1092	for (addr_t pageAddress = address; pageAddress < address + size;
1093			pageAddress += B_PAGE_SIZE) {
1094		set_area_page_protection(area, pageAddress, protection);
1095	}
1096
1097	return B_OK;
1098}
1099
1100
1101status_t
1102vm_block_address_range(const char* name, void* address, addr_t size)
1103{
1104	if (!arch_vm_supports_protection(0))
1105		return B_NOT_SUPPORTED;
1106
1107	AddressSpaceWriteLocker locker;
1108	status_t status = locker.SetTo(VMAddressSpace::KernelID());
1109	if (status != B_OK)
1110		return status;
1111
1112	VMAddressSpace* addressSpace = locker.AddressSpace();
1113
1114	// create an anonymous cache
1115	VMCache* cache;
1116	status = VMCacheFactory::CreateAnonymousCache(cache, false, 0, 0, false,
1117		VM_PRIORITY_SYSTEM);
1118	if (status != B_OK)
1119		return status;
1120
1121	cache->temporary = 1;
1122	cache->virtual_end = size;
1123	cache->Lock();
1124
1125	VMArea* area;
1126	virtual_address_restrictions addressRestrictions = {};
1127	addressRestrictions.address = address;
1128	addressRestrictions.address_specification = B_EXACT_ADDRESS;
1129	status = map_backing_store(addressSpace, cache, 0, name, size,
1130		B_ALREADY_WIRED, B_ALREADY_WIRED, REGION_NO_PRIVATE_MAP, 0,
1131		&addressRestrictions, true, &area, NULL);
1132	if (status != B_OK) {
1133		cache->ReleaseRefAndUnlock();
1134		return status;
1135	}
1136
1137	cache->Unlock();
1138	area->cache_type = CACHE_TYPE_RAM;
1139	return area->id;
1140}
1141
1142
1143status_t
1144vm_unreserve_address_range(team_id team, void* address, addr_t size)
1145{
1146	AddressSpaceWriteLocker locker(team);
1147	if (!locker.IsLocked())
1148		return B_BAD_TEAM_ID;
1149
1150	VMAddressSpace* addressSpace = locker.AddressSpace();
1151	return addressSpace->UnreserveAddressRange((addr_t)address, size,
1152		addressSpace == VMAddressSpace::Kernel()
1153			? HEAP_DONT_WAIT_FOR_MEMORY | HEAP_DONT_LOCK_KERNEL_SPACE : 0);
1154}
1155
1156
1157status_t
1158vm_reserve_address_range(team_id team, void** _address, uint32 addressSpec,
1159	addr_t size, uint32 flags)
1160{
1161	if (size == 0)
1162		return B_BAD_VALUE;
1163
1164	AddressSpaceWriteLocker locker(team);
1165	if (!locker.IsLocked())
1166		return B_BAD_TEAM_ID;
1167
1168	virtual_address_restrictions addressRestrictions = {};
1169	addressRestrictions.address = *_address;
1170	addressRestrictions.address_specification = addressSpec;
1171	VMAddressSpace* addressSpace = locker.AddressSpace();
1172	return addressSpace->ReserveAddressRange(size, &addressRestrictions, flags,
1173		addressSpace == VMAddressSpace::Kernel()
1174			? HEAP_DONT_WAIT_FOR_MEMORY | HEAP_DONT_LOCK_KERNEL_SPACE : 0,
1175		_address);
1176}
1177
1178
1179area_id
1180vm_create_anonymous_area(team_id team, const char *name, addr_t size,
1181	uint32 wiring, uint32 protection, uint32 flags, addr_t guardSize,
1182	const virtual_address_restrictions* virtualAddressRestrictions,
1183	const physical_address_restrictions* physicalAddressRestrictions,
1184	bool kernel, void** _address)
1185{
1186	VMArea* area;
1187	VMCache* cache;
1188	vm_page* page = NULL;
1189	bool isStack = (protection & B_STACK_AREA) != 0;
1190	page_num_t guardPages;
1191	bool canOvercommit = false;
1192	uint32 pageAllocFlags = (flags & CREATE_AREA_DONT_CLEAR) == 0
1193		? VM_PAGE_ALLOC_CLEAR : 0;
1194
1195	TRACE(("create_anonymous_area [%" B_PRId32 "] %s: size 0x%" B_PRIxADDR "\n",
1196		team, name, size));
1197
1198	size = PAGE_ALIGN(size);
1199	guardSize = PAGE_ALIGN(guardSize);
1200	guardPages = guardSize / B_PAGE_SIZE;
1201
1202	if (size == 0 || size < guardSize)
1203		return B_BAD_VALUE;
1204	if (!arch_vm_supports_protection(protection))
1205		return B_NOT_SUPPORTED;
1206
1207	if (isStack || (protection & B_OVERCOMMITTING_AREA) != 0)
1208		canOvercommit = true;
1209
1210#ifdef DEBUG_KERNEL_STACKS
1211	if ((protection & B_KERNEL_STACK_AREA) != 0)
1212		isStack = true;
1213#endif
1214
1215	// check parameters
1216	switch (virtualAddressRestrictions->address_specification) {
1217		case B_ANY_ADDRESS:
1218		case B_EXACT_ADDRESS:
1219		case B_BASE_ADDRESS:
1220		case B_ANY_KERNEL_ADDRESS:
1221		case B_ANY_KERNEL_BLOCK_ADDRESS:
1222			break;
1223
1224		default:
1225			return B_BAD_VALUE;
1226	}
1227
1228	// If low or high physical address restrictions are given, we force
1229	// B_CONTIGUOUS wiring, since only then we'll use
1230	// vm_page_allocate_page_run() which deals with those restrictions.
1231	if (physicalAddressRestrictions->low_address != 0
1232		|| physicalAddressRestrictions->high_address != 0) {
1233		wiring = B_CONTIGUOUS;
1234	}
1235
1236	physical_address_restrictions stackPhysicalRestrictions;
1237	bool doReserveMemory = false;
1238	switch (wiring) {
1239		case B_NO_LOCK:
1240			break;
1241		case B_FULL_LOCK:
1242		case B_LAZY_LOCK:
1243		case B_CONTIGUOUS:
1244			doReserveMemory = true;
1245			break;
1246		case B_ALREADY_WIRED:
1247			break;
1248		case B_LOMEM:
1249			stackPhysicalRestrictions = *physicalAddressRestrictions;
1250			stackPhysicalRestrictions.high_address = 16 * 1024 * 1024;
1251			physicalAddressRestrictions = &stackPhysicalRestrictions;
1252			wiring = B_CONTIGUOUS;
1253			doReserveMemory = true;
1254			break;
1255		case B_32_BIT_FULL_LOCK:
1256			if (B_HAIKU_PHYSICAL_BITS <= 32
1257				|| (uint64)vm_page_max_address() < (uint64)1 << 32) {
1258				wiring = B_FULL_LOCK;
1259				doReserveMemory = true;
1260				break;
1261			}
1262			// TODO: We don't really support this mode efficiently. Just fall
1263			// through for now ...
1264		case B_32_BIT_CONTIGUOUS:
1265			#if B_HAIKU_PHYSICAL_BITS > 32
1266				if (vm_page_max_address() >= (phys_addr_t)1 << 32) {
1267					stackPhysicalRestrictions = *physicalAddressRestrictions;
1268					stackPhysicalRestrictions.high_address
1269						= (phys_addr_t)1 << 32;
1270					physicalAddressRestrictions = &stackPhysicalRestrictions;
1271				}
1272			#endif
1273			wiring = B_CONTIGUOUS;
1274			doReserveMemory = true;
1275			break;
1276		default:
1277			return B_BAD_VALUE;
1278	}
1279
1280	// Optimization: For a single-page contiguous allocation without low/high
1281	// memory restriction B_FULL_LOCK wiring suffices.
1282	if (wiring == B_CONTIGUOUS && size == B_PAGE_SIZE
1283		&& physicalAddressRestrictions->low_address == 0
1284		&& physicalAddressRestrictions->high_address == 0) {
1285		wiring = B_FULL_LOCK;
1286	}
1287
1288	// For full lock or contiguous areas we're also going to map the pages and
1289	// thus need to reserve pages for the mapping backend upfront.
1290	addr_t reservedMapPages = 0;
1291	if (wiring == B_FULL_LOCK || wiring == B_CONTIGUOUS) {
1292		AddressSpaceWriteLocker locker;
1293		status_t status = locker.SetTo(team);
1294		if (status != B_OK)
1295			return status;
1296
1297		VMTranslationMap* map = locker.AddressSpace()->TranslationMap();
1298		reservedMapPages = map->MaxPagesNeededToMap(0, size - 1);
1299	}
1300
1301	int priority;
1302	if (team != VMAddressSpace::KernelID())
1303		priority = VM_PRIORITY_USER;
1304	else if ((flags & CREATE_AREA_PRIORITY_VIP) != 0)
1305		priority = VM_PRIORITY_VIP;
1306	else
1307		priority = VM_PRIORITY_SYSTEM;
1308
1309	// Reserve memory before acquiring the address space lock. This reduces the
1310	// chances of failure, since while holding the write lock to the address
1311	// space (if it is the kernel address space that is), the low memory handler
1312	// won't be able to free anything for us.
1313	addr_t reservedMemory = 0;
1314	if (doReserveMemory) {
1315		bigtime_t timeout = (flags & CREATE_AREA_DONT_WAIT) != 0 ? 0 : 1000000;
1316		if (vm_try_reserve_memory(size, priority, timeout) != B_OK)
1317			return B_NO_MEMORY;
1318		reservedMemory = size;
1319		// TODO: We don't reserve the memory for the pages for the page
1320		// directories/tables. We actually need to do since we currently don't
1321		// reclaim them (and probably can't reclaim all of them anyway). Thus
1322		// there are actually less physical pages than there should be, which
1323		// can get the VM into trouble in low memory situations.
1324	}
1325
1326	AddressSpaceWriteLocker locker;
1327	VMAddressSpace* addressSpace;
1328	status_t status;
1329
1330	// For full lock areas reserve the pages before locking the address
1331	// space. E.g. block caches can't release their memory while we hold the
1332	// address space lock.
1333	page_num_t reservedPages = reservedMapPages;
1334	if (wiring == B_FULL_LOCK)
1335		reservedPages += size / B_PAGE_SIZE;
1336
1337	vm_page_reservation reservation;
1338	if (reservedPages > 0) {
1339		if ((flags & CREATE_AREA_DONT_WAIT) != 0) {
1340			if (!vm_page_try_reserve_pages(&reservation, reservedPages,
1341					priority)) {
1342				reservedPages = 0;
1343				status = B_WOULD_BLOCK;
1344				goto err0;
1345			}
1346		} else
1347			vm_page_reserve_pages(&reservation, reservedPages, priority);
1348	}
1349
1350	if (wiring == B_CONTIGUOUS) {
1351		// we try to allocate the page run here upfront as this may easily
1352		// fail for obvious reasons
1353		page = vm_page_allocate_page_run(PAGE_STATE_WIRED | pageAllocFlags,
1354			size / B_PAGE_SIZE, physicalAddressRestrictions, priority);
1355		if (page == NULL) {
1356			status = B_NO_MEMORY;
1357			goto err0;
1358		}
1359	}
1360
1361	// Lock the address space and, if B_EXACT_ADDRESS and
1362	// CREATE_AREA_UNMAP_ADDRESS_RANGE were specified, ensure the address range
1363	// is not wired.
1364	do {
1365		status = locker.SetTo(team);
1366		if (status != B_OK)
1367			goto err1;
1368
1369		addressSpace = locker.AddressSpace();
1370	} while (virtualAddressRestrictions->address_specification
1371			== B_EXACT_ADDRESS
1372		&& (flags & CREATE_AREA_UNMAP_ADDRESS_RANGE) != 0
1373		&& wait_if_address_range_is_wired(addressSpace,
1374			(addr_t)virtualAddressRestrictions->address, size, &locker));
1375
1376	// create an anonymous cache
1377	// if it's a stack, make sure that two pages are available at least
1378	status = VMCacheFactory::CreateAnonymousCache(cache, canOvercommit,
1379		isStack ? (min_c(2, size / B_PAGE_SIZE - guardPages)) : 0, guardPages,
1380		wiring == B_NO_LOCK, priority);
1381	if (status != B_OK)
1382		goto err1;
1383
1384	cache->temporary = 1;
1385	cache->virtual_end = size;
1386	cache->committed_size = reservedMemory;
1387		// TODO: This should be done via a method.
1388	reservedMemory = 0;
1389
1390	cache->Lock();
1391
1392	status = map_backing_store(addressSpace, cache, 0, name, size, wiring,
1393		protection, REGION_NO_PRIVATE_MAP, flags, virtualAddressRestrictions,
1394		kernel, &area, _address);
1395
1396	if (status != B_OK) {
1397		cache->ReleaseRefAndUnlock();
1398		goto err1;
1399	}
1400
1401	locker.DegradeToReadLock();
1402
1403	switch (wiring) {
1404		case B_NO_LOCK:
1405		case B_LAZY_LOCK:
1406			// do nothing - the pages are mapped in as needed
1407			break;
1408
1409		case B_FULL_LOCK:
1410		{
1411			// Allocate and map all pages for this area
1412
1413			off_t offset = 0;
1414			for (addr_t address = area->Base();
1415					address < area->Base() + (area->Size() - 1);
1416					address += B_PAGE_SIZE, offset += B_PAGE_SIZE) {
1417#ifdef DEBUG_KERNEL_STACKS
1418#	ifdef STACK_GROWS_DOWNWARDS
1419				if (isStack && address < area->Base()
1420						+ KERNEL_STACK_GUARD_PAGES * B_PAGE_SIZE)
1421#	else
1422				if (isStack && address >= area->Base() + area->Size()
1423						- KERNEL_STACK_GUARD_PAGES * B_PAGE_SIZE)
1424#	endif
1425					continue;
1426#endif
1427				vm_page* page = vm_page_allocate_page(&reservation,
1428					PAGE_STATE_WIRED | pageAllocFlags);
1429				cache->InsertPage(page, offset);
1430				map_page(area, page, address, protection, &reservation);
1431
1432				DEBUG_PAGE_ACCESS_END(page);
1433			}
1434
1435			break;
1436		}
1437
1438		case B_ALREADY_WIRED:
1439		{
1440			// The pages should already be mapped. This is only really useful
1441			// during boot time. Find the appropriate vm_page objects and stick
1442			// them in the cache object.
1443			VMTranslationMap* map = addressSpace->TranslationMap();
1444			off_t offset = 0;
1445
1446			if (!gKernelStartup)
1447				panic("ALREADY_WIRED flag used outside kernel startup\n");
1448
1449			map->Lock();
1450
1451			for (addr_t virtualAddress = area->Base();
1452					virtualAddress < area->Base() + (area->Size() - 1);
1453					virtualAddress += B_PAGE_SIZE, offset += B_PAGE_SIZE) {
1454				phys_addr_t physicalAddress;
1455				uint32 flags;
1456				status = map->Query(virtualAddress, &physicalAddress, &flags);
1457				if (status < B_OK) {
1458					panic("looking up mapping failed for va 0x%lx\n",
1459						virtualAddress);
1460				}
1461				page = vm_lookup_page(physicalAddress / B_PAGE_SIZE);
1462				if (page == NULL) {
1463					panic("looking up page failed for pa %#" B_PRIxPHYSADDR
1464						"\n", physicalAddress);
1465				}
1466
1467				DEBUG_PAGE_ACCESS_START(page);
1468
1469				cache->InsertPage(page, offset);
1470				increment_page_wired_count(page);
1471				vm_page_set_state(page, PAGE_STATE_WIRED);
1472				page->busy = false;
1473
1474				DEBUG_PAGE_ACCESS_END(page);
1475			}
1476
1477			map->Unlock();
1478			break;
1479		}
1480
1481		case B_CONTIGUOUS:
1482		{
1483			// We have already allocated our continuous pages run, so we can now
1484			// just map them in the address space
1485			VMTranslationMap* map = addressSpace->TranslationMap();
1486			phys_addr_t physicalAddress
1487				= (phys_addr_t)page->physical_page_number * B_PAGE_SIZE;
1488			addr_t virtualAddress = area->Base();
1489			off_t offset = 0;
1490
1491			map->Lock();
1492
1493			for (virtualAddress = area->Base(); virtualAddress < area->Base()
1494					+ (area->Size() - 1); virtualAddress += B_PAGE_SIZE,
1495					offset += B_PAGE_SIZE, physicalAddress += B_PAGE_SIZE) {
1496				page = vm_lookup_page(physicalAddress / B_PAGE_SIZE);
1497				if (page == NULL)
1498					panic("couldn't lookup physical page just allocated\n");
1499
1500				status = map->Map(virtualAddress, physicalAddress, protection,
1501					area->MemoryType(), &reservation);
1502				if (status < B_OK)
1503					panic("couldn't map physical page in page run\n");
1504
1505				cache->InsertPage(page, offset);
1506				increment_page_wired_count(page);
1507
1508				DEBUG_PAGE_ACCESS_END(page);
1509			}
1510
1511			map->Unlock();
1512			break;
1513		}
1514
1515		default:
1516			break;
1517	}
1518
1519	cache->Unlock();
1520
1521	if (reservedPages > 0)
1522		vm_page_unreserve_pages(&reservation);
1523
1524	TRACE(("vm_create_anonymous_area: done\n"));
1525
1526	area->cache_type = CACHE_TYPE_RAM;
1527	return area->id;
1528
1529err1:
1530	if (wiring == B_CONTIGUOUS) {
1531		// we had reserved the area space upfront...
1532		phys_addr_t pageNumber = page->physical_page_number;
1533		int32 i;
1534		for (i = size / B_PAGE_SIZE; i-- > 0; pageNumber++) {
1535			page = vm_lookup_page(pageNumber);
1536			if (page == NULL)
1537				panic("couldn't lookup physical page just allocated\n");
1538
1539			vm_page_set_state(page, PAGE_STATE_FREE);
1540		}
1541	}
1542
1543err0:
1544	if (reservedPages > 0)
1545		vm_page_unreserve_pages(&reservation);
1546	if (reservedMemory > 0)
1547		vm_unreserve_memory(reservedMemory);
1548
1549	return status;
1550}
1551
1552
1553area_id
1554vm_map_physical_memory(team_id team, const char* name, void** _address,
1555	uint32 addressSpec, addr_t size, uint32 protection,
1556	phys_addr_t physicalAddress, bool alreadyWired)
1557{
1558	VMArea* area;
1559	VMCache* cache;
1560	addr_t mapOffset;
1561
1562	TRACE(("vm_map_physical_memory(aspace = %" B_PRId32 ", \"%s\", virtual = %p"
1563		", spec = %" B_PRIu32 ", size = %" B_PRIxADDR ", protection = %"
1564		B_PRIu32 ", phys = %#" B_PRIxPHYSADDR ")\n", team, name, *_address,
1565		addressSpec, size, protection, physicalAddress));
1566
1567	if (!arch_vm_supports_protection(protection))
1568		return B_NOT_SUPPORTED;
1569
1570	AddressSpaceWriteLocker locker(team);
1571	if (!locker.IsLocked())
1572		return B_BAD_TEAM_ID;
1573
1574	// if the physical address is somewhat inside a page,
1575	// move the actual area down to align on a page boundary
1576	mapOffset = physicalAddress % B_PAGE_SIZE;
1577	size += mapOffset;
1578	physicalAddress -= mapOffset;
1579
1580	size = PAGE_ALIGN(size);
1581
1582	// create a device cache
1583	status_t status = VMCacheFactory::CreateDeviceCache(cache, physicalAddress);
1584	if (status != B_OK)
1585		return status;
1586
1587	cache->virtual_end = size;
1588
1589	cache->Lock();
1590
1591	virtual_address_restrictions addressRestrictions = {};
1592	addressRestrictions.address = *_address;
1593	addressRestrictions.address_specification = addressSpec & ~B_MTR_MASK;
1594	status = map_backing_store(locker.AddressSpace(), cache, 0, name, size,
1595		B_FULL_LOCK, protection, REGION_NO_PRIVATE_MAP, 0, &addressRestrictions,
1596		true, &area, _address);
1597
1598	if (status < B_OK)
1599		cache->ReleaseRefLocked();
1600
1601	cache->Unlock();
1602
1603	if (status == B_OK) {
1604		// set requested memory type -- use uncached, if not given
1605		uint32 memoryType = addressSpec & B_MTR_MASK;
1606		if (memoryType == 0)
1607			memoryType = B_MTR_UC;
1608
1609		area->SetMemoryType(memoryType);
1610
1611		status = arch_vm_set_memory_type(area, physicalAddress, memoryType);
1612		if (status != B_OK)
1613			delete_area(locker.AddressSpace(), area, false);
1614	}
1615
1616	if (status != B_OK)
1617		return status;
1618
1619	VMTranslationMap* map = locker.AddressSpace()->TranslationMap();
1620
1621	if (alreadyWired) {
1622		// The area is already mapped, but possibly not with the right
1623		// memory type.
1624		map->Lock();
1625		map->ProtectArea(area, area->protection);
1626		map->Unlock();
1627	} else {
1628		// Map the area completely.
1629
1630		// reserve pages needed for the mapping
1631		size_t reservePages = map->MaxPagesNeededToMap(area->Base(),
1632			area->Base() + (size - 1));
1633		vm_page_reservation reservation;
1634		vm_page_reserve_pages(&reservation, reservePages,
1635			team == VMAddressSpace::KernelID()
1636				? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
1637
1638		map->Lock();
1639
1640		for (addr_t offset = 0; offset < size; offset += B_PAGE_SIZE) {
1641			map->Map(area->Base() + offset, physicalAddress + offset,
1642				protection, area->MemoryType(), &reservation);
1643		}
1644
1645		map->Unlock();
1646
1647		vm_page_unreserve_pages(&reservation);
1648	}
1649
1650	// modify the pointer returned to be offset back into the new area
1651	// the same way the physical address in was offset
1652	*_address = (void*)((addr_t)*_address + mapOffset);
1653
1654	area->cache_type = CACHE_TYPE_DEVICE;
1655	return area->id;
1656}
1657
1658
1659/*!	Don't use!
1660	TODO: This function was introduced to map physical page vecs to
1661	contiguous virtual memory in IOBuffer::GetNextVirtualVec(). It does
1662	use a device cache and does not track vm_page::wired_count!
1663*/
1664area_id
1665vm_map_physical_memory_vecs(team_id team, const char* name, void** _address,
1666	uint32 addressSpec, addr_t* _size, uint32 protection,
1667	struct generic_io_vec* vecs, uint32 vecCount)
1668{
1669	TRACE(("vm_map_physical_memory_vecs(team = %" B_PRId32 ", \"%s\", virtual "
1670		"= %p, spec = %" B_PRIu32 ", _size = %p, protection = %" B_PRIu32 ", "
1671		"vecs = %p, vecCount = %" B_PRIu32 ")\n", team, name, *_address,
1672		addressSpec, _size, protection, vecs, vecCount));
1673
1674	if (!arch_vm_supports_protection(protection)
1675		|| (addressSpec & B_MTR_MASK) != 0) {
1676		return B_NOT_SUPPORTED;
1677	}
1678
1679	AddressSpaceWriteLocker locker(team);
1680	if (!locker.IsLocked())
1681		return B_BAD_TEAM_ID;
1682
1683	if (vecCount == 0)
1684		return B_BAD_VALUE;
1685
1686	addr_t size = 0;
1687	for (uint32 i = 0; i < vecCount; i++) {
1688		if (vecs[i].base % B_PAGE_SIZE != 0
1689			|| vecs[i].length % B_PAGE_SIZE != 0) {
1690			return B_BAD_VALUE;
1691		}
1692
1693		size += vecs[i].length;
1694	}
1695
1696	// create a device cache
1697	VMCache* cache;
1698	status_t result = VMCacheFactory::CreateDeviceCache(cache, vecs[0].base);
1699	if (result != B_OK)
1700		return result;
1701
1702	cache->virtual_end = size;
1703
1704	cache->Lock();
1705
1706	VMArea* area;
1707	virtual_address_restrictions addressRestrictions = {};
1708	addressRestrictions.address = *_address;
1709	addressRestrictions.address_specification = addressSpec & ~B_MTR_MASK;
1710	result = map_backing_store(locker.AddressSpace(), cache, 0, name,
1711		size, B_FULL_LOCK, protection, REGION_NO_PRIVATE_MAP, 0,
1712		&addressRestrictions, true, &area, _address);
1713
1714	if (result != B_OK)
1715		cache->ReleaseRefLocked();
1716
1717	cache->Unlock();
1718
1719	if (result != B_OK)
1720		return result;
1721
1722	VMTranslationMap* map = locker.AddressSpace()->TranslationMap();
1723	size_t reservePages = map->MaxPagesNeededToMap(area->Base(),
1724		area->Base() + (size - 1));
1725
1726	vm_page_reservation reservation;
1727	vm_page_reserve_pages(&reservation, reservePages,
1728			team == VMAddressSpace::KernelID()
1729				? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
1730	map->Lock();
1731
1732	uint32 vecIndex = 0;
1733	size_t vecOffset = 0;
1734	for (addr_t offset = 0; offset < size; offset += B_PAGE_SIZE) {
1735		while (vecOffset >= vecs[vecIndex].length && vecIndex < vecCount) {
1736			vecOffset = 0;
1737			vecIndex++;
1738		}
1739
1740		if (vecIndex >= vecCount)
1741			break;
1742
1743		map->Map(area->Base() + offset, vecs[vecIndex].base + vecOffset,
1744			protection, area->MemoryType(), &reservation);
1745
1746		vecOffset += B_PAGE_SIZE;
1747	}
1748
1749	map->Unlock();
1750	vm_page_unreserve_pages(&reservation);
1751
1752	if (_size != NULL)
1753		*_size = size;
1754
1755	area->cache_type = CACHE_TYPE_DEVICE;
1756	return area->id;
1757}
1758
1759
1760area_id
1761vm_create_null_area(team_id team, const char* name, void** address,
1762	uint32 addressSpec, addr_t size, uint32 flags)
1763{
1764	size = PAGE_ALIGN(size);
1765
1766	// Lock the address space and, if B_EXACT_ADDRESS and
1767	// CREATE_AREA_UNMAP_ADDRESS_RANGE were specified, ensure the address range
1768	// is not wired.
1769	AddressSpaceWriteLocker locker;
1770	do {
1771		if (locker.SetTo(team) != B_OK)
1772			return B_BAD_TEAM_ID;
1773	} while (addressSpec == B_EXACT_ADDRESS
1774		&& (flags & CREATE_AREA_UNMAP_ADDRESS_RANGE) != 0
1775		&& wait_if_address_range_is_wired(locker.AddressSpace(),
1776			(addr_t)*address, size, &locker));
1777
1778	// create a null cache
1779	int priority = (flags & CREATE_AREA_PRIORITY_VIP) != 0
1780		? VM_PRIORITY_VIP : VM_PRIORITY_SYSTEM;
1781	VMCache* cache;
1782	status_t status = VMCacheFactory::CreateNullCache(priority, cache);
1783	if (status != B_OK)
1784		return status;
1785
1786	cache->temporary = 1;
1787	cache->virtual_end = size;
1788
1789	cache->Lock();
1790
1791	VMArea* area;
1792	virtual_address_restrictions addressRestrictions = {};
1793	addressRestrictions.address = *address;
1794	addressRestrictions.address_specification = addressSpec;
1795	status = map_backing_store(locker.AddressSpace(), cache, 0, name, size,
1796		B_LAZY_LOCK, B_KERNEL_READ_AREA, REGION_NO_PRIVATE_MAP, flags,
1797		&addressRestrictions, true, &area, address);
1798
1799	if (status < B_OK) {
1800		cache->ReleaseRefAndUnlock();
1801		return status;
1802	}
1803
1804	cache->Unlock();
1805
1806	area->cache_type = CACHE_TYPE_NULL;
1807	return area->id;
1808}
1809
1810
1811/*!	Creates the vnode cache for the specified \a vnode.
1812	The vnode has to be marked busy when calling this function.
1813*/
1814status_t
1815vm_create_vnode_cache(struct vnode* vnode, struct VMCache** cache)
1816{
1817	return VMCacheFactory::CreateVnodeCache(*cache, vnode);
1818}
1819
1820
1821/*!	\a cache must be locked. The area's address space must be read-locked.
1822*/
1823static void
1824pre_map_area_pages(VMArea* area, VMCache* cache,
1825	vm_page_reservation* reservation)
1826{
1827	addr_t baseAddress = area->Base();
1828	addr_t cacheOffset = area->cache_offset;
1829	page_num_t firstPage = cacheOffset / B_PAGE_SIZE;
1830	page_num_t endPage = firstPage + area->Size() / B_PAGE_SIZE;
1831
1832	for (VMCachePagesTree::Iterator it
1833				= cache->pages.GetIterator(firstPage, true, true);
1834			vm_page* page = it.Next();) {
1835		if (page->cache_offset >= endPage)
1836			break;
1837
1838		// skip busy and inactive pages
1839		if (page->busy || page->usage_count == 0)
1840			continue;
1841
1842		DEBUG_PAGE_ACCESS_START(page);
1843		map_page(area, page,
1844			baseAddress + (page->cache_offset * B_PAGE_SIZE - cacheOffset),
1845			B_READ_AREA | B_KERNEL_READ_AREA, reservation);
1846		DEBUG_PAGE_ACCESS_END(page);
1847	}
1848}
1849
1850
1851/*!	Will map the file specified by \a fd to an area in memory.
1852	The file will be mirrored beginning at the specified \a offset. The
1853	\a offset and \a size arguments have to be page aligned.
1854*/
1855static area_id
1856_vm_map_file(team_id team, const char* name, void** _address,
1857	uint32 addressSpec, size_t size, uint32 protection, uint32 mapping,
1858	bool unmapAddressRange, int fd, off_t offset, bool kernel)
1859{
1860	// TODO: for binary files, we want to make sure that they map the
1861	//	file's contents at a given point in time, i.e. later changes should
1862	//	not make it into the mapped copy -- this will need quite some
1863	//	changes to be done in a nice way
1864	TRACE(("_vm_map_file(fd = %d, offset = %" B_PRIdOFF ", size = %lu, mapping "
1865		"%" B_PRIu32 ")\n", fd, offset, size, mapping));
1866
1867	offset = ROUNDDOWN(offset, B_PAGE_SIZE);
1868	size = PAGE_ALIGN(size);
1869
1870	if (mapping == REGION_NO_PRIVATE_MAP)
1871		protection |= B_SHARED_AREA;
1872	if (addressSpec != B_EXACT_ADDRESS)
1873		unmapAddressRange = false;
1874
1875	if (fd < 0) {
1876		uint32 flags = unmapAddressRange ? CREATE_AREA_UNMAP_ADDRESS_RANGE : 0;
1877		virtual_address_restrictions virtualRestrictions = {};
1878		virtualRestrictions.address = *_address;
1879		virtualRestrictions.address_specification = addressSpec;
1880		physical_address_restrictions physicalRestrictions = {};
1881		return vm_create_anonymous_area(team, name, size, B_NO_LOCK, protection,
1882			flags, 0, &virtualRestrictions, &physicalRestrictions, kernel,
1883			_address);
1884	}
1885
1886	// get the open flags of the FD
1887	file_descriptor* descriptor = get_fd(get_current_io_context(kernel), fd);
1888	if (descriptor == NULL)
1889		return EBADF;
1890	int32 openMode = descriptor->open_mode;
1891	put_fd(descriptor);
1892
1893	// The FD must be open for reading in any case. For a shared mapping with
1894	// write access, the FD must additionally be open for writing.
1895	if ((openMode & O_ACCMODE) == O_WRONLY
1896		|| (mapping == REGION_NO_PRIVATE_MAP
1897			&& (protection & (B_WRITE_AREA | B_KERNEL_WRITE_AREA)) != 0
1898			&& (openMode & O_ACCMODE) == O_RDONLY)) {
1899		return EACCES;
1900	}
1901
1902	// get the vnode for the object; this also grabs a ref to it
1903	struct vnode* vnode = NULL;
1904	status_t status = vfs_get_vnode_from_fd(fd, kernel, &vnode);
1905	if (status < B_OK)
1906		return status;
1907	CObjectDeleter<struct vnode> vnodePutter(vnode, vfs_put_vnode);
1908
1909	// If we're going to pre-map pages, we need to reserve the pages needed by
1910	// the mapping backend upfront.
1911	page_num_t reservedPreMapPages = 0;
1912	vm_page_reservation reservation;
1913	if ((protection & B_READ_AREA) != 0) {
1914		AddressSpaceWriteLocker locker;
1915		status = locker.SetTo(team);
1916		if (status != B_OK)
1917			return status;
1918
1919		VMTranslationMap* map = locker.AddressSpace()->TranslationMap();
1920		reservedPreMapPages = map->MaxPagesNeededToMap(0, size - 1);
1921
1922		locker.Unlock();
1923
1924		vm_page_reserve_pages(&reservation, reservedPreMapPages,
1925			team == VMAddressSpace::KernelID()
1926				? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
1927	}
1928
1929	struct PageUnreserver {
1930		PageUnreserver(vm_page_reservation* reservation)
1931			:
1932			fReservation(reservation)
1933		{
1934		}
1935
1936		~PageUnreserver()
1937		{
1938			if (fReservation != NULL)
1939				vm_page_unreserve_pages(fReservation);
1940		}
1941
1942		vm_page_reservation* fReservation;
1943	} pageUnreserver(reservedPreMapPages > 0 ? &reservation : NULL);
1944
1945	// Lock the address space and, if the specified address range shall be
1946	// unmapped, ensure it is not wired.
1947	AddressSpaceWriteLocker locker;
1948	do {
1949		if (locker.SetTo(team) != B_OK)
1950			return B_BAD_TEAM_ID;
1951	} while (unmapAddressRange
1952		&& wait_if_address_range_is_wired(locker.AddressSpace(),
1953			(addr_t)*_address, size, &locker));
1954
1955	// TODO: this only works for file systems that use the file cache
1956	VMCache* cache;
1957	status = vfs_get_vnode_cache(vnode, &cache, false);
1958	if (status < B_OK)
1959		return status;
1960
1961	cache->Lock();
1962
1963	VMArea* area;
1964	virtual_address_restrictions addressRestrictions = {};
1965	addressRestrictions.address = *_address;
1966	addressRestrictions.address_specification = addressSpec;
1967	status = map_backing_store(locker.AddressSpace(), cache, offset, name, size,
1968		0, protection, mapping,
1969		unmapAddressRange ? CREATE_AREA_UNMAP_ADDRESS_RANGE : 0,
1970		&addressRestrictions, kernel, &area, _address);
1971
1972	if (status != B_OK || mapping == REGION_PRIVATE_MAP) {
1973		// map_backing_store() cannot know we no longer need the ref
1974		cache->ReleaseRefLocked();
1975	}
1976
1977	if (status == B_OK && (protection & B_READ_AREA) != 0)
1978		pre_map_area_pages(area, cache, &reservation);
1979
1980	cache->Unlock();
1981
1982	if (status == B_OK) {
1983		// TODO: this probably deserves a smarter solution, i.e. don't always
1984		// prefetch stuff, and also probably don't trigger it at this place.
1985		cache_prefetch_vnode(vnode, offset, min_c(size, 10LL * 1024 * 1024));
1986			// prefetches at max 10 MB starting from "offset"
1987	}
1988
1989	if (status != B_OK)
1990		return status;
1991
1992	area->cache_type = CACHE_TYPE_VNODE;
1993	return area->id;
1994}
1995
1996
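/*!	Kernel entry point for mapping the file referred to by \a fd into the
	given team's address space. Checks that the requested \a protection is
	supported by the architecture and forwards to _vm_map_file() with
	\c kernel set to \c true.

	Illustrative use only; \c fd and \c fileSize stand for caller-provided
	values:
	\code
	void* address = NULL;
	area_id area = vm_map_file(VMAddressSpace::KernelID(), "mapped file",
		&address, B_ANY_KERNEL_ADDRESS, fileSize, B_KERNEL_READ_AREA,
		REGION_PRIVATE_MAP, false, fd, 0);
	\endcode
*/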
1997area_id
1998vm_map_file(team_id aid, const char* name, void** address, uint32 addressSpec,
1999	addr_t size, uint32 protection, uint32 mapping, bool unmapAddressRange,
2000	int fd, off_t offset)
2001{
2002	if (!arch_vm_supports_protection(protection))
2003		return B_NOT_SUPPORTED;
2004
2005	return _vm_map_file(aid, name, address, addressSpec, size, protection,
2006		mapping, unmapAddressRange, fd, offset, true);
2007}
2008
2009
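/*!	Returns the given area's cache, locked and with a reference acquired.
	Since the area's cache may be exchanged or deleted while we wait for
	the cache lock, the lookup is retried until the locked cache is still
	the one the area refers to.
	The cache has to be released via vm_area_put_locked_cache().
*/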
2010VMCache*
2011vm_area_get_locked_cache(VMArea* area)
2012{
2013	rw_lock_read_lock(&sAreaCacheLock);
2014
2015	while (true) {
2016		VMCache* cache = area->cache;
2017
2018		if (!cache->SwitchFromReadLock(&sAreaCacheLock)) {
2019			// cache has been deleted
2020			rw_lock_read_lock(&sAreaCacheLock);
2021			continue;
2022		}
2023
2024		rw_lock_read_lock(&sAreaCacheLock);
2025
2026		if (cache == area->cache) {
2027			cache->AcquireRefLocked();
2028			rw_lock_read_unlock(&sAreaCacheLock);
2029			return cache;
2030		}
2031
2032		// the cache changed in the meantime
2033		cache->Unlock();
2034	}
2035}
2036
2037
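/*!	Releases the reference and the lock acquired by
	vm_area_get_locked_cache().
*/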
2038void
2039vm_area_put_locked_cache(VMCache* cache)
2040{
2041	cache->ReleaseRefAndUnlock();
2042}
2043
2044
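/*!	Clones the area with ID \a sourceID into the given team's address
	space. Both the source area and the clone are marked \c B_SHARED_AREA,
	so the underlying cache is shared instead of being copied on write.
	Cloning a null area is refused; for \c B_FULL_LOCK areas all pages are
	mapped in right away.
	\return The ID of the new area, or an error code.
*/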
2045area_id
2046vm_clone_area(team_id team, const char* name, void** address,
2047	uint32 addressSpec, uint32 protection, uint32 mapping, area_id sourceID,
2048	bool kernel)
2049{
2050	VMArea* newArea = NULL;
2051	VMArea* sourceArea;
2052
2053	// Check whether the source area exists and is cloneable. If so, mark it
2054	// B_SHARED_AREA, so that we don't get problems with copy-on-write.
2055	{
2056		AddressSpaceWriteLocker locker;
2057		status_t status = locker.SetFromArea(sourceID, sourceArea);
2058		if (status != B_OK)
2059			return status;
2060
2061		if (!kernel && (sourceArea->protection & B_KERNEL_AREA) != 0)
2062			return B_NOT_ALLOWED;
2063
2064		sourceArea->protection |= B_SHARED_AREA;
2065		protection |= B_SHARED_AREA;
2066	}
2067
2068	// Now lock both address spaces and actually do the cloning.
2069
2070	MultiAddressSpaceLocker locker;
2071	VMAddressSpace* sourceAddressSpace;
2072	status_t status = locker.AddArea(sourceID, false, &sourceAddressSpace);
2073	if (status != B_OK)
2074		return status;
2075
2076	VMAddressSpace* targetAddressSpace;
2077	status = locker.AddTeam(team, true, &targetAddressSpace);
2078	if (status != B_OK)
2079		return status;
2080
2081	status = locker.Lock();
2082	if (status != B_OK)
2083		return status;
2084
2085	sourceArea = lookup_area(sourceAddressSpace, sourceID);
2086	if (sourceArea == NULL)
2087		return B_BAD_VALUE;
2088
2089	if (!kernel && (sourceArea->protection & B_KERNEL_AREA) != 0)
2090		return B_NOT_ALLOWED;
2091
2092	VMCache* cache = vm_area_get_locked_cache(sourceArea);
2093
2094	// TODO: for now, B_USER_CLONEABLE is disabled, until all drivers
2095	//	have been adapted. Maybe it should be part of the kernel settings,
2096	//	anyway (so that old drivers can always work).
2097#if 0
2098	if (sourceArea->aspace == VMAddressSpace::Kernel()
2099		&& addressSpace != VMAddressSpace::Kernel()
2100		&& !(sourceArea->protection & B_USER_CLONEABLE_AREA)) {
2101		// kernel areas must not be cloned in userland, unless explicitly
2102		// declared user-cloneable upon construction
2103		status = B_NOT_ALLOWED;
2104	} else
2105#endif
2106	if (sourceArea->cache_type == CACHE_TYPE_NULL)
2107		status = B_NOT_ALLOWED;
2108	else {
2109		virtual_address_restrictions addressRestrictions = {};
2110		addressRestrictions.address = *address;
2111		addressRestrictions.address_specification = addressSpec;
2112		status = map_backing_store(targetAddressSpace, cache,
2113			sourceArea->cache_offset, name, sourceArea->Size(),
2114			sourceArea->wiring, protection, mapping, 0, &addressRestrictions,
2115			kernel, &newArea, address);
2116	}
2117	if (status == B_OK && mapping != REGION_PRIVATE_MAP) {
2118		// If the mapping is REGION_PRIVATE_MAP, map_backing_store() needed
2119		// to create a new cache, and has therefore already acquired a reference
2120		// to the source cache - but otherwise it has no idea that we need
2121		// one.
2122		cache->AcquireRefLocked();
2123	}
2124	if (status == B_OK && newArea->wiring == B_FULL_LOCK) {
2125		// we need to map in everything at this point
2126		if (sourceArea->cache_type == CACHE_TYPE_DEVICE) {
2127			// we don't have actual pages to map but a physical area
2128			VMTranslationMap* map
2129				= sourceArea->address_space->TranslationMap();
2130			map->Lock();
2131
2132			phys_addr_t physicalAddress;
2133			uint32 oldProtection;
2134			map->Query(sourceArea->Base(), &physicalAddress, &oldProtection);
2135
2136			map->Unlock();
2137
2138			map = targetAddressSpace->TranslationMap();
2139			size_t reservePages = map->MaxPagesNeededToMap(newArea->Base(),
2140				newArea->Base() + (newArea->Size() - 1));
2141
2142			vm_page_reservation reservation;
2143			vm_page_reserve_pages(&reservation, reservePages,
2144				targetAddressSpace == VMAddressSpace::Kernel()
2145					? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
2146			map->Lock();
2147
2148			for (addr_t offset = 0; offset < newArea->Size();
2149					offset += B_PAGE_SIZE) {
2150				map->Map(newArea->Base() + offset, physicalAddress + offset,
2151					protection, newArea->MemoryType(), &reservation);
2152			}
2153
2154			map->Unlock();
2155			vm_page_unreserve_pages(&reservation);
2156		} else {
2157			VMTranslationMap* map = targetAddressSpace->TranslationMap();
2158			size_t reservePages = map->MaxPagesNeededToMap(
2159				newArea->Base(), newArea->Base() + (newArea->Size() - 1));
2160			vm_page_reservation reservation;
2161			vm_page_reserve_pages(&reservation, reservePages,
2162				targetAddressSpace == VMAddressSpace::Kernel()
2163					? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
2164
2165			// map in all pages from source
2166			for (VMCachePagesTree::Iterator it = cache->pages.GetIterator();
2167					vm_page* page = it.Next();) {
2168				if (!page->busy) {
2169					DEBUG_PAGE_ACCESS_START(page);
2170					map_page(newArea, page,
2171						newArea->Base() + ((page->cache_offset << PAGE_SHIFT)
2172							- newArea->cache_offset),
2173						protection, &reservation);
2174					DEBUG_PAGE_ACCESS_END(page);
2175				}
2176			}
2177			// TODO: B_FULL_LOCK means that all pages are locked. We are not
2178			// ensuring that!
2179
2180			vm_page_unreserve_pages(&reservation);
2181		}
2182	}
2183	if (status == B_OK)
2184		newArea->cache_type = sourceArea->cache_type;
2185
2186	vm_area_put_locked_cache(cache);
2187
2188	if (status < B_OK)
2189		return status;
2190
2191	return newArea->id;
2192}
2193
2194
2195/*!	Deletes the specified area of the given address space.
2196
2197	The address space must be write-locked.
2198	The caller must ensure that the area does not have any wired ranges.
2199
2200	\param addressSpace The address space containing the area.
2201	\param area The area to be deleted.
2202	\param deletingAddressSpace \c true, if the address space is in the process
2203		of being deleted.
2204*/
2205static void
2206delete_area(VMAddressSpace* addressSpace, VMArea* area,
2207	bool deletingAddressSpace)
2208{
2209	ASSERT(!area->IsWired());
2210
2211	VMAreaHash::Remove(area);
2212
2213	// At this point the area is removed from the global hash table, but
2214	// still exists in the area list.
2215
2216	// Unmap the virtual address space the area occupied.
2217	{
2218		// We need to lock the complete cache chain.
2219		VMCache* topCache = vm_area_get_locked_cache(area);
2220		VMCacheChainLocker cacheChainLocker(topCache);
2221		cacheChainLocker.LockAllSourceCaches();
2222
2223		// If the area's top cache is a temporary cache and the area is the only
2224		// one referencing it (besides us currently holding a second reference),
2225		// the unmapping code doesn't need to care about preserving the accessed
2226		// and dirty flags of the top cache page mappings.
2227		bool ignoreTopCachePageFlags
2228			= topCache->temporary && topCache->RefCount() == 2;
2229
2230		area->address_space->TranslationMap()->UnmapArea(area,
2231			deletingAddressSpace, ignoreTopCachePageFlags);
2232	}
2233
2234	if (!area->cache->temporary)
2235		area->cache->WriteModified();
2236
2237	uint32 allocationFlags = addressSpace == VMAddressSpace::Kernel()
2238		? HEAP_DONT_WAIT_FOR_MEMORY | HEAP_DONT_LOCK_KERNEL_SPACE : 0;
2239
2240	arch_vm_unset_memory_type(area);
2241	addressSpace->RemoveArea(area, allocationFlags);
2242	addressSpace->Put();
2243
2244	area->cache->RemoveArea(area);
2245	area->cache->ReleaseRef();
2246
2247	addressSpace->DeleteArea(area, allocationFlags);
2248}
2249
2250
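/*!	Deletes the area with the given ID from the given team's address space,
	waiting for any wired ranges in the area to disappear first. Userland
	callers (\a kernel == \c false) may not delete areas protected with
	\c B_KERNEL_AREA.
*/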
2251status_t
2252vm_delete_area(team_id team, area_id id, bool kernel)
2253{
2254	TRACE(("vm_delete_area(team = 0x%" B_PRIx32 ", area = 0x%" B_PRIx32 ")\n",
2255		team, id));
2256
2257	// lock the address space and make sure the area isn't wired
2258	AddressSpaceWriteLocker locker;
2259	VMArea* area;
2260	AreaCacheLocker cacheLocker;
2261
2262	do {
2263		status_t status = locker.SetFromArea(team, id, area);
2264		if (status != B_OK)
2265			return status;
2266
2267		cacheLocker.SetTo(area);
2268	} while (wait_if_area_is_wired(area, &locker, &cacheLocker));
2269
2270	cacheLocker.Unlock();
2271
2272	if (!kernel && (area->protection & B_KERNEL_AREA) != 0)
2273		return B_NOT_ALLOWED;
2274
2275	delete_area(locker.AddressSpace(), area, false);
2276	return B_OK;
2277}
2278
2279
2280/*!	Creates a new cache on top of the given cache, moves all areas from
2281	the old cache to the new one, and changes the protection of all affected
2282	areas' pages to read-only. If requested, wired pages are moved up to the
2283	new cache and copies are added to the old cache in their place.
2284	Preconditions:
2285	- The given cache must be locked.
2286	- All of the cache's areas' address spaces must be read locked.
2287	- Either the cache must not have any wired ranges or a page reservation for
2288	  all wired pages must be provided, so they can be copied.
2289
2290	\param lowerCache The cache on top of which a new cache shall be created.
2291	\param wiredPagesReservation If \c NULL there must not be any wired pages
2292		in \a lowerCache. Otherwise as many pages must be reserved as the cache
2293		has wired pages. The wired pages are copied in this case.
2294*/
2295static status_t
2296vm_copy_on_write_area(VMCache* lowerCache,
2297	vm_page_reservation* wiredPagesReservation)
2298{
2299	VMCache* upperCache;
2300
2301	TRACE(("vm_copy_on_write_area(cache = %p)\n", lowerCache));
2302
2303	// We need to separate the cache from its areas. The cache goes one level
2304	// deeper and we create a new cache in between.
2305
2306	// create an anonymous cache
2307	status_t status = VMCacheFactory::CreateAnonymousCache(upperCache, false, 0,
2308		lowerCache->GuardSize() / B_PAGE_SIZE,
2309		dynamic_cast<VMAnonymousNoSwapCache*>(lowerCache) == NULL,
2310		VM_PRIORITY_USER);
2311	if (status != B_OK)
2312		return status;
2313
2314	upperCache->Lock();
2315
2316	upperCache->temporary = 1;
2317	upperCache->virtual_base = lowerCache->virtual_base;
2318	upperCache->virtual_end = lowerCache->virtual_end;
2319
2320	// transfer the lower cache areas to the upper cache
2321	rw_lock_write_lock(&sAreaCacheLock);
2322	upperCache->TransferAreas(lowerCache);
2323	rw_lock_write_unlock(&sAreaCacheLock);
2324
2325	lowerCache->AddConsumer(upperCache);
2326
2327	// We now need to remap all pages from all of the cache's areas read-only,
2328	// so that a copy will be created on next write access. If there are wired
2329	// pages, we keep their protection, move them to the upper cache and create
2330	// copies for the lower cache.
2331	if (wiredPagesReservation != NULL) {
2332		// We need to handle wired pages -- iterate through the cache's pages.
2333		for (VMCachePagesTree::Iterator it = lowerCache->pages.GetIterator();
2334				vm_page* page = it.Next();) {
2335			if (page->WiredCount() > 0) {
2336				// allocate a new page and copy the wired one
2337				vm_page* copiedPage = vm_page_allocate_page(
2338					wiredPagesReservation, PAGE_STATE_ACTIVE);
2339
2340				vm_memcpy_physical_page(
2341					copiedPage->physical_page_number * B_PAGE_SIZE,
2342					page->physical_page_number * B_PAGE_SIZE);
2343
2344				// move the wired page to the upper cache (note: removing is OK
2345				// with the SplayTree iterator) and insert the copy
2346				upperCache->MovePage(page);
2347				lowerCache->InsertPage(copiedPage,
2348					page->cache_offset * B_PAGE_SIZE);
2349
2350				DEBUG_PAGE_ACCESS_END(copiedPage);
2351			} else {
2352				// Change the protection of this page in all areas.
2353				for (VMArea* tempArea = upperCache->areas; tempArea != NULL;
2354						tempArea = tempArea->cache_next) {
2355					// The area must be readable in the same way it was
2356					// previously writable.
2357					uint32 protection = B_KERNEL_READ_AREA;
2358					if ((tempArea->protection & B_READ_AREA) != 0)
2359						protection |= B_READ_AREA;
2360
2361					VMTranslationMap* map
2362						= tempArea->address_space->TranslationMap();
2363					map->Lock();
2364					map->ProtectPage(tempArea,
2365						virtual_page_address(tempArea, page), protection);
2366					map->Unlock();
2367				}
2368			}
2369		}
2370	} else {
2371		ASSERT(lowerCache->WiredPagesCount() == 0);
2372
2373		// just change the protection of all areas
2374		for (VMArea* tempArea = upperCache->areas; tempArea != NULL;
2375				tempArea = tempArea->cache_next) {
2376			// The area must be readable in the same way it was previously
2377			// writable.
2378			uint32 protection = B_KERNEL_READ_AREA;
2379			if ((tempArea->protection & B_READ_AREA) != 0)
2380				protection |= B_READ_AREA;
2381
2382			VMTranslationMap* map = tempArea->address_space->TranslationMap();
2383			map->Lock();
2384			map->ProtectArea(tempArea, protection);
2385			map->Unlock();
2386		}
2387	}
2388
2389	vm_area_put_locked_cache(upperCache);
2390
2391	return B_OK;
2392}
2393
2394
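/*!	Creates a copy of the area with ID \a sourceID in the given team's
	address space. If the source area is shared, the new area simply maps
	the same cache; otherwise the source is turned into a copy-on-write
	setup via vm_copy_on_write_area(), with wired pages copied eagerly.
	\return The ID of the new area, or an error code.
*/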
2395area_id
2396vm_copy_area(team_id team, const char* name, void** _address,
2397	uint32 addressSpec, uint32 protection, area_id sourceID)
2398{
2399	bool writableCopy = (protection & (B_KERNEL_WRITE_AREA | B_WRITE_AREA)) != 0;
2400
2401	if ((protection & B_KERNEL_PROTECTION) == 0) {
2402		// set the same protection for the kernel as for userland
2403		protection |= B_KERNEL_READ_AREA;
2404		if (writableCopy)
2405			protection |= B_KERNEL_WRITE_AREA;
2406	}
2407
2408	// Do the locking: target address space, all address spaces associated with
2409	// the source cache, and the cache itself.
2410	MultiAddressSpaceLocker locker;
2411	VMAddressSpace* targetAddressSpace;
2412	VMCache* cache;
2413	VMArea* source;
2414	AreaCacheLocker cacheLocker;
2415	status_t status;
2416	bool sharedArea;
2417
2418	page_num_t wiredPages = 0;
2419	vm_page_reservation wiredPagesReservation;
2420
2421	bool restart;
2422	do {
2423		restart = false;
2424
2425		locker.Unset();
2426		status = locker.AddTeam(team, true, &targetAddressSpace);
2427		if (status == B_OK) {
2428			status = locker.AddAreaCacheAndLock(sourceID, false, false, source,
2429				&cache);
2430		}
2431		if (status != B_OK)
2432			return status;
2433
2434		cacheLocker.SetTo(cache, true);	// already locked
2435
2436		sharedArea = (source->protection & B_SHARED_AREA) != 0;
2437
2438		page_num_t oldWiredPages = wiredPages;
2439		wiredPages = 0;
2440
2441		// If the source area isn't shared, count the number of wired pages in
2442		// the cache and reserve as many pages.
2443		if (!sharedArea) {
2444			wiredPages = cache->WiredPagesCount();
2445
2446			if (wiredPages > oldWiredPages) {
2447				cacheLocker.Unlock();
2448				locker.Unlock();
2449
2450				if (oldWiredPages > 0)
2451					vm_page_unreserve_pages(&wiredPagesReservation);
2452
2453				vm_page_reserve_pages(&wiredPagesReservation, wiredPages,
2454					VM_PRIORITY_USER);
2455
2456				restart = true;
2457			}
2458		} else if (oldWiredPages > 0)
2459			vm_page_unreserve_pages(&wiredPagesReservation);
2460	} while (restart);
2461
2462	// unreserve pages later
2463	struct PagesUnreserver {
2464		PagesUnreserver(vm_page_reservation* reservation)
2465			:
2466			fReservation(reservation)
2467		{
2468		}
2469
2470		~PagesUnreserver()
2471		{
2472			if (fReservation != NULL)
2473				vm_page_unreserve_pages(fReservation);
2474		}
2475
2476	private:
2477		vm_page_reservation*	fReservation;
2478	} pagesUnreserver(wiredPages > 0 ? &wiredPagesReservation : NULL);
2479
2480	if (addressSpec == B_CLONE_ADDRESS) {
2481		addressSpec = B_EXACT_ADDRESS;
2482		*_address = (void*)source->Base();
2483	}
2484
2485	// First, create a cache on top of the source area, or use the existing
2486	// one, if this is a shared area.
2487
2488	VMArea* target;
2489	virtual_address_restrictions addressRestrictions = {};
2490	addressRestrictions.address = *_address;
2491	addressRestrictions.address_specification = addressSpec;
2492	status = map_backing_store(targetAddressSpace, cache, source->cache_offset,
2493		name, source->Size(), source->wiring, protection,
2494		sharedArea ? REGION_NO_PRIVATE_MAP : REGION_PRIVATE_MAP,
2495		writableCopy ? 0 : CREATE_AREA_DONT_COMMIT_MEMORY,
2496		&addressRestrictions, true, &target, _address);
2497	if (status < B_OK)
2498		return status;
2499
2500	if (sharedArea) {
2501		// The new area uses the old area's cache, but map_backing_store()
2502		// hasn't acquired a ref. So we have to do that now.
2503		cache->AcquireRefLocked();
2504	}
2505
2506	// If the source area is writable, we need to move it one layer up as well
2507
2508	if (!sharedArea) {
2509		if ((source->protection & (B_KERNEL_WRITE_AREA | B_WRITE_AREA)) != 0) {
2510			// TODO: do something more useful if this fails!
2511			if (vm_copy_on_write_area(cache,
2512					wiredPages > 0 ? &wiredPagesReservation : NULL) < B_OK) {
2513				panic("vm_copy_on_write_area() failed!\n");
2514			}
2515		}
2516	}
2517
2518	// we return the ID of the newly created area
2519	return target->id;
2520}
2521
2522
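/*!	Changes the protection of the area with the given ID. Userland callers
	may only change areas of their own team that are not \c B_KERNEL_AREA
	protected. Depending on whether write access is gained or lost, the
	cache commitment is adjusted and, if necessary, a copy-on-write layer
	is inserted before the existing page mappings are re-protected.
*/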
2523static status_t
2524vm_set_area_protection(team_id team, area_id areaID, uint32 newProtection,
2525	bool kernel)
2526{
2527	TRACE(("vm_set_area_protection(team = %#" B_PRIx32 ", area = %#" B_PRIx32
2528		", protection = %#" B_PRIx32 ")\n", team, areaID, newProtection));
2529
2530	if (!arch_vm_supports_protection(newProtection))
2531		return B_NOT_SUPPORTED;
2532
2533	bool becomesWritable
2534		= (newProtection & (B_WRITE_AREA | B_KERNEL_WRITE_AREA)) != 0;
2535
2536	// lock address spaces and cache
2537	MultiAddressSpaceLocker locker;
2538	VMCache* cache;
2539	VMArea* area;
2540	status_t status;
2541	AreaCacheLocker cacheLocker;
2542	bool isWritable;
2543
2544	bool restart;
2545	do {
2546		restart = false;
2547
2548		locker.Unset();
2549		status = locker.AddAreaCacheAndLock(areaID, true, false, area, &cache);
2550		if (status != B_OK)
2551			return status;
2552
2553		cacheLocker.SetTo(cache, true);	// already locked
2554
2555		if (!kernel && (area->protection & B_KERNEL_AREA) != 0)
2556			return B_NOT_ALLOWED;
2557
2558		if (area->protection == newProtection)
2559			return B_OK;
2560
2561		if (team != VMAddressSpace::KernelID()
2562			&& area->address_space->ID() != team) {
2563			// unless you're the kernel, you are only allowed to set
2564			// the protection of your own areas
2565			return B_NOT_ALLOWED;
2566		}
2567
2568		isWritable
2569			= (area->protection & (B_WRITE_AREA | B_KERNEL_WRITE_AREA)) != 0;
2570
2571		// Make sure the area (or, if we're going to call
2572		// vm_copy_on_write_area(), all areas of the cache) doesn't have any
2573		// wired ranges.
2574		if (!isWritable && becomesWritable && !cache->consumers.IsEmpty()) {
2575			for (VMArea* otherArea = cache->areas; otherArea != NULL;
2576					otherArea = otherArea->cache_next) {
2577				if (wait_if_area_is_wired(otherArea, &locker, &cacheLocker)) {
2578					restart = true;
2579					break;
2580				}
2581			}
2582		} else {
2583			if (wait_if_area_is_wired(area, &locker, &cacheLocker))
2584				restart = true;
2585		}
2586	} while (restart);
2587
2588	bool changePageProtection = true;
2589	bool changeTopCachePagesOnly = false;
2590
2591	if (isWritable && !becomesWritable) {
2592		// writable -> !writable
2593
2594		if (cache->source != NULL && cache->temporary) {
2595			if (cache->CountWritableAreas(area) == 0) {
2596				// Since this cache is now backed by the pages in its source
2597				// cache, we can change its commitment to cover only the pages
2598				// that really are in this cache.
2599
2600				status = cache->Commit(cache->page_count * B_PAGE_SIZE,
2601					team == VMAddressSpace::KernelID()
2602						? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
2603
2604				// TODO: we may be able to join with our source cache, if
2605				// count == 0
2606			}
2607		}
2608
2609		// If only the writability changes, we can just remap the pages of the
2610		// top cache, since the pages of lower caches are mapped read-only
2611		// anyway. That's advantageous only if the number of pages in the cache
2612		// is significantly smaller than the number of pages in the area,
2613		// though.
2614		if (newProtection
2615				== (area->protection & ~(B_WRITE_AREA | B_KERNEL_WRITE_AREA))
2616			&& cache->page_count * 2 < area->Size() / B_PAGE_SIZE) {
2617			changeTopCachePagesOnly = true;
2618		}
2619	} else if (!isWritable && becomesWritable) {
2620		// !writable -> writable
2621
2622		if (!cache->consumers.IsEmpty()) {
2623			// There are consumers -- we have to insert a new cache. Fortunately
2624			// vm_copy_on_write_area() does everything that's needed.
2625			changePageProtection = false;
2626			status = vm_copy_on_write_area(cache, NULL);
2627		} else {
2628			// No consumers, so we don't need to insert a new one.
2629			if (cache->source != NULL && cache->temporary) {
2630				// the cache's commitment must contain all possible pages
2631				status = cache->Commit(cache->virtual_end - cache->virtual_base,
2632					team == VMAddressSpace::KernelID()
2633						? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
2634			}
2635
2636			if (status == B_OK && cache->source != NULL) {
2637				// There's a source cache, hence we can't just change all pages'
2638				// protection or we might allow writing into pages belonging to
2639				// a lower cache.
2640				changeTopCachePagesOnly = true;
2641			}
2642		}
2643	} else {
2644		// we don't have anything special to do in all other cases
2645	}
2646
2647	if (status == B_OK) {
2648		// remap existing pages in this cache
2649		if (changePageProtection) {
2650			VMTranslationMap* map = area->address_space->TranslationMap();
2651			map->Lock();
2652
2653			if (changeTopCachePagesOnly) {
2654				page_num_t firstPageOffset = area->cache_offset / B_PAGE_SIZE;
2655				page_num_t lastPageOffset
2656					= firstPageOffset + area->Size() / B_PAGE_SIZE;
2657				for (VMCachePagesTree::Iterator it = cache->pages.GetIterator();
2658						vm_page* page = it.Next();) {
2659					if (page->cache_offset >= firstPageOffset
2660						&& page->cache_offset <= lastPageOffset) {
2661						addr_t address = virtual_page_address(area, page);
2662						map->ProtectPage(area, address, newProtection);
2663					}
2664				}
2665			} else
2666				map->ProtectArea(area, newProtection);
2667
2668			map->Unlock();
2669		}
2670
2671		area->protection = newProtection;
2672	}
2673
2674	return status;
2675}
2676
2677
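/*!	Looks up the physical address that \a vaddr is mapped to in the given
	team's address space and stores it in \a paddr.
*/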
2678status_t
2679vm_get_page_mapping(team_id team, addr_t vaddr, phys_addr_t* paddr)
2680{
2681	VMAddressSpace* addressSpace = VMAddressSpace::Get(team);
2682	if (addressSpace == NULL)
2683		return B_BAD_TEAM_ID;
2684
2685	VMTranslationMap* map = addressSpace->TranslationMap();
2686
2687	map->Lock();
2688	uint32 dummyFlags;
2689	status_t status = map->Query(vaddr, paddr, &dummyFlags);
2690	map->Unlock();
2691
2692	addressSpace->Put();
2693	return status;
2694}
2695
2696
2697/*!	The page's cache must be locked.
2698*/
2699bool
2700vm_test_map_modification(vm_page* page)
2701{
2702	if (page->modified)
2703		return true;
2704
2705	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
2706	vm_page_mapping* mapping;
2707	while ((mapping = iterator.Next()) != NULL) {
2708		VMArea* area = mapping->area;
2709		VMTranslationMap* map = area->address_space->TranslationMap();
2710
2711		phys_addr_t physicalAddress;
2712		uint32 flags;
2713		map->Lock();
2714		map->Query(virtual_page_address(area, page), &physicalAddress, &flags);
2715		map->Unlock();
2716
2717		if ((flags & PAGE_MODIFIED) != 0)
2718			return true;
2719	}
2720
2721	return false;
2722}
2723
2724
2725/*!	The page's cache must be locked.
2726*/
2727void
2728vm_clear_map_flags(vm_page* page, uint32 flags)
2729{
2730	if ((flags & PAGE_ACCESSED) != 0)
2731		page->accessed = false;
2732	if ((flags & PAGE_MODIFIED) != 0)
2733		page->modified = false;
2734
2735	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
2736	vm_page_mapping* mapping;
2737	while ((mapping = iterator.Next()) != NULL) {
2738		VMArea* area = mapping->area;
2739		VMTranslationMap* map = area->address_space->TranslationMap();
2740
2741		map->Lock();
2742		map->ClearFlags(virtual_page_address(area, page), flags);
2743		map->Unlock();
2744	}
2745}
2746
2747
2748/*!	Removes all mappings from a page.
2749	After you've called this function, the page is unmapped from memory and
2750	the page's \c accessed and \c modified flags have been updated according
2751	to the state of the mappings.
2752	The page's cache must be locked.
2753*/
2754void
2755vm_remove_all_page_mappings(vm_page* page)
2756{
2757	while (vm_page_mapping* mapping = page->mappings.Head()) {
2758		VMArea* area = mapping->area;
2759		VMTranslationMap* map = area->address_space->TranslationMap();
2760		addr_t address = virtual_page_address(area, page);
2761		map->UnmapPage(area, address, false);
2762	}
2763}
2764
2765
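/*!	Clears the accessed flag of all of the page's mappings and of the page
	itself, transferring the mappings' modified flags to the page.
	The page's cache must be locked.
	\return The number of accessed flags that were set, including the
		page's own \c accessed flag.
*/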
2766int32
2767vm_clear_page_mapping_accessed_flags(struct vm_page *page)
2768{
2769	int32 count = 0;
2770
2771	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
2772	vm_page_mapping* mapping;
2773	while ((mapping = iterator.Next()) != NULL) {
2774		VMArea* area = mapping->area;
2775		VMTranslationMap* map = area->address_space->TranslationMap();
2776
2777		bool modified;
2778		if (map->ClearAccessedAndModified(area,
2779				virtual_page_address(area, page), false, modified)) {
2780			count++;
2781		}
2782
2783		page->modified |= modified;
2784	}
2785
2786
2787	if (page->accessed) {
2788		count++;
2789		page->accessed = false;
2790	}
2791
2792	return count;
2793}
2794
2795
2796/*!	Removes all mappings of a page and/or clears the accessed bits of the
2797	mappings.
2798	The function iterates through the page mappings and removes them until
2799	encountering one that has been accessed. From then on it will continue to
2800	iterate, but only clear the accessed flag of the mapping. The page's
2801	\c modified bit will be updated accordingly, the \c accessed bit will be
2802	cleared.
2803	\return The number of mapping accessed bits encountered, including the
2804		\c accessed bit of the page itself. If \c 0 is returned, all mappings
2805		of the page have been removed.
2806*/
2807int32
2808vm_remove_all_page_mappings_if_unaccessed(struct vm_page *page)
2809{
2810	ASSERT(page->WiredCount() == 0);
2811
2812	if (page->accessed)
2813		return vm_clear_page_mapping_accessed_flags(page);
2814
2815	while (vm_page_mapping* mapping = page->mappings.Head()) {
2816		VMArea* area = mapping->area;
2817		VMTranslationMap* map = area->address_space->TranslationMap();
2818		addr_t address = virtual_page_address(area, page);
2819		bool modified = false;
2820		if (map->ClearAccessedAndModified(area, address, true, modified)) {
2821			page->accessed = true;
2822			page->modified |= modified;
2823			return vm_clear_page_mapping_accessed_flags(page);
2824		}
2825		page->modified |= modified;
2826	}
2827
2828	return 0;
2829}
2830
2831
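/*!	KDL command backing "dl", "dw", "ds", "db", and "string": dumps memory
	at the given address in the respective item size. With -p/--physical
	the address is interpreted as a physical address and only memory within
	a single page can be displayed.
*/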
2832static int
2833display_mem(int argc, char** argv)
2834{
2835	bool physical = false;
2836	addr_t copyAddress;
2837	int32 displayWidth;
2838	int32 itemSize;
2839	int32 num = -1;
2840	addr_t address;
2841	int i = 1, j;
2842
2843	if (argc > 1 && argv[1][0] == '-') {
2844		if (!strcmp(argv[1], "-p") || !strcmp(argv[1], "--physical")) {
2845			physical = true;
2846			i++;
2847		} else
2848			i = 99;
2849	}
2850
2851	if (argc < i + 1 || argc > i + 2) {
2852		kprintf("usage: dl/dw/ds/db/string [-p|--physical] <address> [num]\n"
2853			"\tdl - 8 bytes\n"
2854			"\tdw - 4 bytes\n"
2855			"\tds - 2 bytes\n"
2856			"\tdb - 1 byte\n"
2857			"\tstring - a whole string\n"
2858			"  -p or --physical only allows memory from a single page to be "
2859			"displayed.\n");
2860		return 0;
2861	}
2862
2863	address = parse_expression(argv[i]);
2864
2865	if (argc > i + 1)
2866		num = parse_expression(argv[i + 1]);
2867
2868	// build the format string
2869	if (strcmp(argv[0], "db") == 0) {
2870		itemSize = 1;
2871		displayWidth = 16;
2872	} else if (strcmp(argv[0], "ds") == 0) {
2873		itemSize = 2;
2874		displayWidth = 8;
2875	} else if (strcmp(argv[0], "dw") == 0) {
2876		itemSize = 4;
2877		displayWidth = 4;
2878	} else if (strcmp(argv[0], "dl") == 0) {
2879		itemSize = 8;
2880		displayWidth = 2;
2881	} else if (strcmp(argv[0], "string") == 0) {
2882		itemSize = 1;
2883		displayWidth = -1;
2884	} else {
2885		kprintf("display_mem called in an invalid way!\n");
2886		return 0;
2887	}
2888
2889	if (num <= 0)
2890		num = displayWidth;
2891
2892	void* physicalPageHandle = NULL;
2893
2894	if (physical) {
2895		int32 offset = address & (B_PAGE_SIZE - 1);
2896		if (num * itemSize + offset > B_PAGE_SIZE) {
2897			num = (B_PAGE_SIZE - offset) / itemSize;
2898			kprintf("NOTE: number of bytes has been cut to page size\n");
2899		}
2900
2901		address = ROUNDDOWN(address, B_PAGE_SIZE);
2902
2903		if (vm_get_physical_page_debug(address, &copyAddress,
2904				&physicalPageHandle) != B_OK) {
2905			kprintf("getting the hardware page failed.\n");
2906			return 0;
2907		}
2908
2909		address += offset;
2910		copyAddress += offset;
2911	} else
2912		copyAddress = address;
2913
2914	if (!strcmp(argv[0], "string")) {
2915		kprintf("%p \"", (char*)copyAddress);
2916
2917		// string mode
2918		for (i = 0; true; i++) {
2919			char c;
2920			if (debug_memcpy(B_CURRENT_TEAM, &c, (char*)copyAddress + i, 1)
2921					!= B_OK
2922				|| c == '\0') {
2923				break;
2924			}
2925
2926			if (c == '\n')
2927				kprintf("\\n");
2928			else if (c == '\t')
2929				kprintf("\\t");
2930			else {
2931				if (!isprint(c))
2932					c = '.';
2933
2934				kprintf("%c", c);
2935			}
2936		}
2937
2938		kprintf("\"\n");
2939	} else {
2940		// number mode
2941		for (i = 0; i < num; i++) {
2942			uint32 value;
2943
2944			if ((i % displayWidth) == 0) {
2945				int32 displayed = min_c(displayWidth, (num-i)) * itemSize;
2946				if (i != 0)
2947					kprintf("\n");
2948
2949				kprintf("[0x%lx]  ", address + i * itemSize);
2950
2951				for (j = 0; j < displayed; j++) {
2952					char c;
2953					if (debug_memcpy(B_CURRENT_TEAM, &c,
2954							(char*)copyAddress + i * itemSize + j, 1) != B_OK) {
2955						displayed = j;
2956						break;
2957					}
2958					if (!isprint(c))
2959						c = '.';
2960
2961					kprintf("%c", c);
2962				}
2963				if (num > displayWidth) {
2964					// make sure the spacing in the last line is correct
2965					for (j = displayed; j < displayWidth * itemSize; j++)
2966						kprintf(" ");
2967				}
2968				kprintf("  ");
2969			}
2970
2971			if (debug_memcpy(B_CURRENT_TEAM, &value,
2972					(uint8*)copyAddress + i * itemSize, itemSize) != B_OK) {
2973				kprintf("read fault");
2974				break;
2975			}
2976
2977			switch (itemSize) {
2978				case 1:
2979					kprintf(" %02" B_PRIx8, *(uint8*)&value);
2980					break;
2981				case 2:
2982					kprintf(" %04" B_PRIx16, *(uint16*)&value);
2983					break;
2984				case 4:
2985					kprintf(" %08" B_PRIx32, *(uint32*)&value);
2986					break;
2987				case 8:
2988					kprintf(" %016" B_PRIx64, *(uint64*)&value);
2989					break;
2990			}
2991		}
2992
2993		kprintf("\n");
2994	}
2995
2996	if (physical) {
2997		copyAddress = ROUNDDOWN(copyAddress, B_PAGE_SIZE);
2998		vm_put_physical_page_debug(copyAddress, physicalPageHandle);
2999	}
3000	return 0;
3001}
3002
3003
3004static void
3005dump_cache_tree_recursively(VMCache* cache, int level,
3006	VMCache* highlightCache)
3007{
3008	// print this cache
3009	for (int i = 0; i < level; i++)
3010		kprintf("  ");
3011	if (cache == highlightCache)
3012		kprintf("%p <--\n", cache);
3013	else
3014		kprintf("%p\n", cache);
3015
3016	// recursively print its consumers
3017	for (VMCache::ConsumerList::Iterator it = cache->consumers.GetIterator();
3018			VMCache* consumer = it.Next();) {
3019		dump_cache_tree_recursively(consumer, level + 1, highlightCache);
3020	}
3021}
3022
3023
3024static int
3025dump_cache_tree(int argc, char** argv)
3026{
3027	if (argc != 2 || !strcmp(argv[1], "--help")) {
3028		kprintf("usage: %s <address>\n", argv[0]);
3029		return 0;
3030	}
3031
3032	addr_t address = parse_expression(argv[1]);
3033	if (address == 0)
3034		return 0;
3035
3036	VMCache* cache = (VMCache*)address;
3037	VMCache* root = cache;
3038
3039	// find the root cache (the transitive source)
3040	while (root->source != NULL)
3041		root = root->source;
3042
3043	dump_cache_tree_recursively(root, 0, cache);
3044
3045	return 0;
3046}
3047
3048
3049const char*
3050vm_cache_type_to_string(int32 type)
3051{
3052	switch (type) {
3053		case CACHE_TYPE_RAM:
3054			return "RAM";
3055		case CACHE_TYPE_DEVICE:
3056			return "device";
3057		case CACHE_TYPE_VNODE:
3058			return "vnode";
3059		case CACHE_TYPE_NULL:
3060			return "null";
3061
3062		default:
3063			return "unknown";
3064	}
3065}
3066
3067
3068#if DEBUG_CACHE_LIST
3069
3070static void
3071update_cache_info_recursively(VMCache* cache, cache_info& info)
3072{
3073	info.page_count += cache->page_count;
3074	if (cache->type == CACHE_TYPE_RAM)
3075		info.committed += cache->committed_size;
3076
3077	// recurse
3078	for (VMCache::ConsumerList::Iterator it = cache->consumers.GetIterator();
3079			VMCache* consumer = it.Next();) {
3080		update_cache_info_recursively(consumer, info);
3081	}
3082}
3083
3084
3085static int
3086cache_info_compare_page_count(const void* _a, const void* _b)
3087{
3088	const cache_info* a = (const cache_info*)_a;
3089	const cache_info* b = (const cache_info*)_b;
3090	if (a->page_count == b->page_count)
3091		return 0;
3092	return a->page_count < b->page_count ? 1 : -1;
3093}
3094
3095
3096static int
3097cache_info_compare_committed(const void* _a, const void* _b)
3098{
3099	const cache_info* a = (const cache_info*)_a;
3100	const cache_info* b = (const cache_info*)_b;
3101	if (a->committed == b->committed)
3102		return 0;
3103	return a->committed < b->committed ? 1 : -1;
3104}
3105
3106
3107static void
3108dump_caches_recursively(VMCache* cache, cache_info& info, int level)
3109{
3110	for (int i = 0; i < level; i++)
3111		kprintf("  ");
3112
3113	kprintf("%p: type: %s, base: %" B_PRIdOFF ", size: %" B_PRIdOFF ", "
3114		"pages: %" B_PRIu32, cache, vm_cache_type_to_string(cache->type),
3115		cache->virtual_base, cache->virtual_end, cache->page_count);
3116
3117	if (level == 0)
3118		kprintf("/%lu", info.page_count);
3119
3120	if (cache->type == CACHE_TYPE_RAM || (level == 0 && info.committed > 0)) {
3121		kprintf(", committed: %" B_PRIdOFF, cache->committed_size);
3122
3123		if (level == 0)
3124			kprintf("/%lu", info.committed);
3125	}
3126
3127	// areas
3128	if (cache->areas != NULL) {
3129		VMArea* area = cache->areas;
3130		kprintf(", areas: %" B_PRId32 " (%s, team: %" B_PRId32 ")", area->id,
3131			area->name, area->address_space->ID());
3132
3133		while (area->cache_next != NULL) {
3134			area = area->cache_next;
3135			kprintf(", %" B_PRId32, area->id);
3136		}
3137	}
3138
3139	kputs("\n");
3140
3141	// recurse
3142	for (VMCache::ConsumerList::Iterator it = cache->consumers.GetIterator();
3143			VMCache* consumer = it.Next();) {
3144		dump_caches_recursively(consumer, info, level + 1);
3145	}
3146}
3147
3148
3149static int
3150dump_caches(int argc, char** argv)
3151{
3152	if (sCacheInfoTable == NULL) {
3153		kprintf("No cache info table!\n");
3154		return 0;
3155	}
3156
3157	bool sortByPageCount = true;
3158
3159	for (int32 i = 1; i < argc; i++) {
3160		if (strcmp(argv[i], "-c") == 0) {
3161			sortByPageCount = false;
3162		} else {
3163			print_debugger_command_usage(argv[0]);
3164			return 0;
3165		}
3166	}
3167
3168	uint32 totalCount = 0;
3169	uint32 rootCount = 0;
3170	off_t totalCommitted = 0;
3171	page_num_t totalPages = 0;
3172
3173	VMCache* cache = gDebugCacheList;
3174	while (cache) {
3175		totalCount++;
3176		if (cache->source == NULL) {
3177			cache_info stackInfo;
3178			cache_info& info = rootCount < (uint32)kCacheInfoTableCount
3179				? sCacheInfoTable[rootCount] : stackInfo;
3180			rootCount++;
3181			info.cache = cache;
3182			info.page_count = 0;
3183			info.committed = 0;
3184			update_cache_info_recursively(cache, info);
3185			totalCommitted += info.committed;
3186			totalPages += info.page_count;
3187		}
3188
3189		cache = cache->debug_next;
3190	}
3191
3192	if (rootCount <= (uint32)kCacheInfoTableCount) {
3193		qsort(sCacheInfoTable, rootCount, sizeof(cache_info),
3194			sortByPageCount
3195				? &cache_info_compare_page_count
3196				: &cache_info_compare_committed);
3197	}
3198
3199	kprintf("total committed memory: %" B_PRIdOFF ", total used pages: %"
3200		B_PRIuPHYSADDR "\n", totalCommitted, totalPages);
3201	kprintf("%" B_PRIu32 " caches (%" B_PRIu32 " root caches), sorted by %s "
3202		"per cache tree...\n\n", totalCount, rootCount, sortByPageCount ?
3203			"page count" : "committed size");
3204
3205	if (rootCount <= (uint32)kCacheInfoTableCount) {
3206		for (uint32 i = 0; i < rootCount; i++) {
3207			cache_info& info = sCacheInfoTable[i];
3208			dump_caches_recursively(info.cache, info, 0);
3209		}
3210	} else
3211		kprintf("Cache info table too small! Can't sort and print caches!\n");
3212
3213	return 0;
3214}
3215
3216#endif	// DEBUG_CACHE_LIST
3217
3218
3219static int
3220dump_cache(int argc, char** argv)
3221{
3222	VMCache* cache;
3223	bool showPages = false;
3224	int i = 1;
3225
3226	if (argc < 2 || !strcmp(argv[1], "--help")) {
3227		kprintf("usage: %s [-ps] <address>\n"
3228			"  if -p is specified, all pages are shown; if -s is used,\n"
3229			"  only the cache info is shown.\n", argv[0]);
3230		return 0;
3231	}
3232	while (argv[i][0] == '-') {
3233		char* arg = argv[i] + 1;
3234		while (arg[0]) {
3235			if (arg[0] == 'p')
3236				showPages = true;
3237			arg++;
3238		}
3239		i++;
3240	}
3241	if (argv[i] == NULL) {
3242		kprintf("%s: invalid argument, pass address\n", argv[0]);
3243		return 0;
3244	}
3245
3246	addr_t address = parse_expression(argv[i]);
3247	if (address == 0)
3248		return 0;
3249
3250	cache = (VMCache*)address;
3251
3252	cache->Dump(showPages);
3253
3254	set_debug_variable("_sourceCache", (addr_t)cache->source);
3255
3256	return 0;
3257}
3258
3259
3260static void
3261dump_area_struct(VMArea* area, bool mappings)
3262{
3263	kprintf("AREA: %p\n", area);
3264	kprintf("name:\t\t'%s'\n", area->name);
3265	kprintf("owner:\t\t0x%" B_PRIx32 "\n", area->address_space->ID());
3266	kprintf("id:\t\t0x%" B_PRIx32 "\n", area->id);
3267	kprintf("base:\t\t0x%lx\n", area->Base());
3268	kprintf("size:\t\t0x%lx\n", area->Size());
3269	kprintf("protection:\t0x%" B_PRIx32 "\n", area->protection);
3270	kprintf("wiring:\t\t0x%x\n", area->wiring);
3271	kprintf("memory_type:\t%#" B_PRIx32 "\n", area->MemoryType());
3272	kprintf("cache:\t\t%p\n", area->cache);
3273	kprintf("cache_type:\t%s\n", vm_cache_type_to_string(area->cache_type));
3274	kprintf("cache_offset:\t0x%" B_PRIx64 "\n", area->cache_offset);
3275	kprintf("cache_next:\t%p\n", area->cache_next);
3276	kprintf("cache_prev:\t%p\n", area->cache_prev);
3277
3278	VMAreaMappings::Iterator iterator = area->mappings.GetIterator();
3279	if (mappings) {
3280		kprintf("page mappings:\n");
3281		while (iterator.HasNext()) {
3282			vm_page_mapping* mapping = iterator.Next();
3283			kprintf("  %p", mapping->page);
3284		}
3285		kprintf("\n");
3286	} else {
3287		uint32 count = 0;
3288		while (iterator.Next() != NULL) {
3289			count++;
3290		}
3291		kprintf("page mappings:\t%" B_PRIu32 "\n", count);
3292	}
3293}
3294
3295
3296static int
3297dump_area(int argc, char** argv)
3298{
3299	bool mappings = false;
3300	bool found = false;
3301	int32 index = 1;
3302	VMArea* area;
3303	addr_t num;
3304
3305	if (argc < 2 || !strcmp(argv[1], "--help")) {
3306		kprintf("usage: area [-m] [id|contains|address|name] <id|address|name>\n"
3307			"All areas matching either id/address/name are listed. You can\n"
3308			"restrict the check to a specific attribute by prefixing the\n"
3309			"specifier with one of the id/contains/address/name keywords.\n"
3310			"-m shows the area's mappings as well.\n");
3311		return 0;
3312	}
3313
3314	if (!strcmp(argv[1], "-m")) {
3315		mappings = true;
3316		index++;
3317	}
3318
3319	int32 mode = 0xf;
3320	if (!strcmp(argv[index], "id"))
3321		mode = 1;
3322	else if (!strcmp(argv[index], "contains"))
3323		mode = 2;
3324	else if (!strcmp(argv[index], "name"))
3325		mode = 4;
3326	else if (!strcmp(argv[index], "address"))
3327		mode = 0;
3328	if (mode != 0xf)
3329		index++;
3330
3331	if (index >= argc) {
3332		kprintf("No area specifier given.\n");
3333		return 0;
3334	}
3335
3336	num = parse_expression(argv[index]);
3337
3338	if (mode == 0) {
3339		dump_area_struct((struct VMArea*)num, mappings);
3340	} else {
3341		// walk through the area list, looking for the arguments as a name
3342
3343		VMAreaHashTable::Iterator it = VMAreaHash::GetIterator();
3344		while ((area = it.Next()) != NULL) {
3345			if (((mode & 4) != 0 && area->name != NULL
3346					&& !strcmp(argv[index], area->name))
3347				|| (num != 0 && (((mode & 1) != 0 && (addr_t)area->id == num)
3348					|| (((mode & 2) != 0 && area->Base() <= num
3349						&& area->Base() + area->Size() > num))))) {
3350				dump_area_struct(area, mappings);
3351				found = true;
3352			}
3353		}
3354
3355		if (!found)
3356			kprintf("could not find area %s (%ld)\n", argv[index], num);
3357	}
3358
3359	return 0;
3360}
3361
3362
3363static int
3364dump_area_list(int argc, char** argv)
3365{
3366	VMArea* area;
3367	const char* name = NULL;
3368	int32 id = 0;
3369
3370	if (argc > 1) {
3371		id = parse_expression(argv[1]);
3372		if (id == 0)
3373			name = argv[1];
3374	}
3375
3376	kprintf("%-*s      id  %-*s    %-*sprotect lock  name\n",
3377		B_PRINTF_POINTER_WIDTH, "addr", B_PRINTF_POINTER_WIDTH, "base",
3378		B_PRINTF_POINTER_WIDTH, "size");
3379
3380	VMAreaHashTable::Iterator it = VMAreaHash::GetIterator();
3381	while ((area = it.Next()) != NULL) {
3382		if ((id != 0 && area->address_space->ID() != id)
3383			|| (name != NULL && strstr(area->name, name) == NULL))
3384			continue;
3385
3386		kprintf("%p %5" B_PRIx32 "  %p  %p %4" B_PRIx32 " %4d  %s\n", area,
3387			area->id, (void*)area->Base(), (void*)area->Size(),
3388			area->protection, area->wiring, area->name);
3389	}
3390	return 0;
3391}
3392
3393
3394static int
3395dump_available_memory(int argc, char** argv)
3396{
3397	kprintf("Available memory: %" B_PRIdOFF "/%" B_PRIuPHYSADDR " bytes\n",
3398		sAvailableMemory, (phys_addr_t)vm_page_num_pages() * B_PAGE_SIZE);
3399	return 0;
3400}
3401
3402
3403/*!	Deletes all areas and reserved regions in the given address space.
3404
3405	The caller must ensure that none of the areas has any wired ranges.
3406
3407	\param addressSpace The address space.
3408	\param deletingAddressSpace \c true, if the address space is in the process
3409		of being deleted.
3410*/
3411void
3412vm_delete_areas(struct VMAddressSpace* addressSpace, bool deletingAddressSpace)
3413{
3414	TRACE(("vm_delete_areas: called on address space 0x%" B_PRIx32 "\n",
3415		addressSpace->ID()));
3416
3417	addressSpace->WriteLock();
3418
3419	// remove all reserved areas in this address space
3420	addressSpace->UnreserveAllAddressRanges(0);
3421
3422	// delete all the areas in this address space
3423	while (VMArea* area = addressSpace->FirstArea()) {
3424		ASSERT(!area->IsWired());
3425		delete_area(addressSpace, area, deletingAddressSpace);
3426	}
3427
3428	addressSpace->WriteUnlock();
3429}
3430
3431
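/*!	Returns the ID of the area containing the given address, looked up in
	the current team's address space for userland addresses and in the
	kernel's address space otherwise. For userland callers (\a kernel ==
	\c false), areas that are neither user-readable nor user-writable are
	not reported.
*/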
3432static area_id
3433vm_area_for(addr_t address, bool kernel)
3434{
3435	team_id team;
3436	if (IS_USER_ADDRESS(address)) {
3437		// we try the user team address space, if any
3438		team = VMAddressSpace::CurrentID();
3439		if (team < 0)
3440			return team;
3441	} else
3442		team = VMAddressSpace::KernelID();
3443
3444	AddressSpaceReadLocker locker(team);
3445	if (!locker.IsLocked())
3446		return B_BAD_TEAM_ID;
3447
3448	VMArea* area = locker.AddressSpace()->LookupArea(address);
3449	if (area != NULL) {
3450		if (!kernel && (area->protection & (B_READ_AREA | B_WRITE_AREA)) == 0)
3451			return B_ERROR;
3452
3453		return area->id;
3454	}
3455
3456	return B_ERROR;
3457}
3458
3459
3460/*!	Frees physical pages that were used during the boot process.
3461	\a end is inclusive.
3462*/
3463static void
3464unmap_and_free_physical_pages(VMTranslationMap* map, addr_t start, addr_t end)
3465{
3466	// free all physical pages in the specified range
3467
3468	for (addr_t current = start; current < end; current += B_PAGE_SIZE) {
3469		phys_addr_t physicalAddress;
3470		uint32 flags;
3471
3472		if (map->Query(current, &physicalAddress, &flags) == B_OK
3473			&& (flags & PAGE_PRESENT) != 0) {
3474			vm_page* page = vm_lookup_page(physicalAddress / B_PAGE_SIZE);
3475			if (page != NULL && page->State() != PAGE_STATE_FREE
3476					 && page->State() != PAGE_STATE_CLEAR
3477					 && page->State() != PAGE_STATE_UNUSED) {
3478				DEBUG_PAGE_ACCESS_START(page);
3479				vm_page_set_state(page, PAGE_STATE_FREE);
3480			}
3481		}
3482	}
3483
3484	// unmap the memory
3485	map->Unmap(start, end);
3486}
3487
3488
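/*!	Frees and unmaps the physical pages in the given kernel virtual range
	that are not covered by any kernel area, i.e. the holes between the
	areas, which are no longer needed after boot.
*/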
3489void
3490vm_free_unused_boot_loader_range(addr_t start, addr_t size)
3491{
3492	VMTranslationMap* map = VMAddressSpace::Kernel()->TranslationMap();
3493	addr_t end = start + (size - 1);
3494	addr_t lastEnd = start;
3495
3496	TRACE(("vm_free_unused_boot_loader_range(): asked to free %p - %p\n",
3497		(void*)start, (void*)end));
3498
3499	// The areas are sorted in virtual address space order, so
3500	// we just have to find the holes between them that fall
3501	// into the area we should dispose
3502
3503	map->Lock();
3504
3505	for (VMAddressSpace::AreaIterator it
3506				= VMAddressSpace::Kernel()->GetAreaIterator();
3507			VMArea* area = it.Next();) {
3508		addr_t areaStart = area->Base();
3509		addr_t areaEnd = areaStart + (area->Size() - 1);
3510
3511		if (areaEnd < start)
3512			continue;
3513
3514		if (areaStart > end) {
3515			// we are done, the area is already beyond what we have to free
3516			break;
3517		}
3518
3519		if (areaStart > lastEnd) {
3520			// this is something we can free
3521			TRACE(("free boot range: get rid of %p - %p\n", (void*)lastEnd,
3522				(void*)areaStart));
3523			unmap_and_free_physical_pages(map, lastEnd, areaStart - 1);
3524		}
3525
3526		if (areaEnd >= end) {
3527			lastEnd = areaEnd;
3528				// no +1 to prevent potential overflow
3529			break;
3530		}
3531
3532		lastEnd = areaEnd + 1;
3533	}
3534
3535	if (lastEnd < end) {
3536		// we can also get rid of some space at the end of the area
3537		TRACE(("free boot range: also remove %p - %p\n", (void*)lastEnd,
3538			(void*)end));
3539		unmap_and_free_physical_pages(map, lastEnd, end);
3540	}
3541
3542	map->Unlock();
3543}
3544
3545
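/*!	Creates "<image>_text" and "<image>_data" areas covering the text and
	data regions of the given preloaded image, which the boot loader has
	already placed in memory (hence \c B_ALREADY_WIRED).
*/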
3546static void
3547create_preloaded_image_areas(struct preloaded_image* _image)
3548{
3549	preloaded_elf_image* image = static_cast<preloaded_elf_image*>(_image);
3550	char name[B_OS_NAME_LENGTH];
3551	void* address;
3552	int32 length;
3553
3554	// use file name to create a good area name
3555	char* fileName = strrchr(image->name, '/');
3556	if (fileName == NULL)
3557		fileName = image->name;
3558	else
3559		fileName++;
3560
3561	length = strlen(fileName);
3562	// make sure there is enough space for the suffix
3563	if (length > 25)
3564		length = 25;
3565
3566	memcpy(name, fileName, length);
3567	strcpy(name + length, "_text");
3568	address = (void*)ROUNDDOWN(image->text_region.start, B_PAGE_SIZE);
3569	image->text_region.id = create_area(name, &address, B_EXACT_ADDRESS,
3570		PAGE_ALIGN(image->text_region.size), B_ALREADY_WIRED,
3571		B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
3572		// this will later be remapped read-only/executable by the
3573		// ELF initialization code
3574
3575	strcpy(name + length, "_data");
3576	address = (void*)ROUNDDOWN(image->data_region.start, B_PAGE_SIZE);
3577	image->data_region.id = create_area(name, &address, B_EXACT_ADDRESS,
3578		PAGE_ALIGN(image->data_region.size), B_ALREADY_WIRED,
3579		B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
3580}
3581
3582
3583/*!	Frees all previously created kernel args areas from the kernel_args
3584	structure. Any boot loader resources contained in these arguments must not
3585	be accessed anymore past this point.
3586*/
3587void
3588vm_free_kernel_args(kernel_args* args)
3589{
3590	uint32 i;
3591
3592	TRACE(("vm_free_kernel_args()\n"));
3593
3594	for (i = 0; i < args->num_kernel_args_ranges; i++) {
3595		area_id area = area_for((void*)(addr_t)args->kernel_args_range[i].start);
3596		if (area >= B_OK)
3597			delete_area(area);
3598	}
3599}
3600
3601
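/*!	Creates areas wrapping the kernel args ranges, so the memory holding
	the kernel_args data is reserved and accounted for.
*/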
3602static void
3603allocate_kernel_args(kernel_args* args)
3604{
3605	TRACE(("allocate_kernel_args()\n"));
3606
3607	for (uint32 i = 0; i < args->num_kernel_args_ranges; i++) {
3608		void* address = (void*)(addr_t)args->kernel_args_range[i].start;
3609
3610		create_area("_kernel args_", &address, B_EXACT_ADDRESS,
3611			args->kernel_args_range[i].size, B_ALREADY_WIRED,
3612			B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
3613	}
3614}
3615
3616
3617static void
3618unreserve_boot_loader_ranges(kernel_args* args)
3619{
3620	TRACE(("unreserve_boot_loader_ranges()\n"));
3621
3622	for (uint32 i = 0; i < args->num_virtual_allocated_ranges; i++) {
3623		vm_unreserve_address_range(VMAddressSpace::KernelID(),
3624			(void*)(addr_t)args->virtual_allocated_range[i].start,
3625			args->virtual_allocated_range[i].size);
3626	}
3627}
3628
3629
3630static void
3631reserve_boot_loader_ranges(kernel_args* args)
3632{
3633	TRACE(("reserve_boot_loader_ranges()\n"));
3634
3635	for (uint32 i = 0; i < args->num_virtual_allocated_ranges; i++) {
3636		void* address = (void*)(addr_t)args->virtual_allocated_range[i].start;
3637
3638		// If the address is not a kernel address, we just skip it. The
3639		// architecture-specific code has to deal with it.
3640		if (!IS_KERNEL_ADDRESS(address)) {
3641			dprintf("reserve_boot_loader_ranges(): Skipping range: %p, %"
3642				B_PRIu64 "\n", address, args->virtual_allocated_range[i].size);
3643			continue;
3644		}
3645
3646		status_t status = vm_reserve_address_range(VMAddressSpace::KernelID(),
3647			&address, B_EXACT_ADDRESS, args->virtual_allocated_range[i].size, 0);
3648		if (status < B_OK)
3649			panic("could not reserve boot loader ranges\n");
3650	}
3651}
3652
3653
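/*!	Finds a gap of at least \a size bytes (respecting \a alignment) in or
	next to the kernel_args' virtual allocated ranges, extends the
	neighboring range to cover it, and returns its base address.
	Returns 0 if no suitable gap is found.
*/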
3654static addr_t
3655allocate_early_virtual(kernel_args* args, size_t size, addr_t alignment)
3656{
3657	size = PAGE_ALIGN(size);
3658
3659	// find a slot in the virtual allocation addr range
3660	for (uint32 i = 1; i < args->num_virtual_allocated_ranges; i++) {
3661		// check to see if the space between this one and the last is big enough
3662		addr_t rangeStart = args->virtual_allocated_range[i].start;
3663		addr_t previousRangeEnd = args->virtual_allocated_range[i - 1].start
3664			+ args->virtual_allocated_range[i - 1].size;
3665
3666		addr_t base = alignment > 0
3667			? ROUNDUP(previousRangeEnd, alignment) : previousRangeEnd;
3668
3669		if (base >= KERNEL_BASE && base < rangeStart
3670				&& rangeStart - base >= size) {
3671			args->virtual_allocated_range[i - 1].size
3672				+= base + size - previousRangeEnd;
3673			return base;
3674		}
3675	}
3676
3677	// we didn't find a slot between the allocated ranges; this is ok.
3678	// see if there's a gap after the last one
3679	int lastEntryIndex = args->num_virtual_allocated_ranges - 1;
3680	addr_t lastRangeEnd = args->virtual_allocated_range[lastEntryIndex].start
3681		+ args->virtual_allocated_range[lastEntryIndex].size;
3682	addr_t base = alignment > 0
3683		? ROUNDUP(lastRangeEnd, alignment) : lastRangeEnd;
3684	if (KERNEL_BASE + (KERNEL_SIZE - 1) - base >= size) {
3685		args->virtual_allocated_range[lastEntryIndex].size
3686			+= base + size - lastRangeEnd;
3687		return base;
3688	}
3689
3690	// see if there's a gap before the first one
3691	addr_t rangeStart = args->virtual_allocated_range[0].start;
3692	if (rangeStart > KERNEL_BASE && rangeStart - KERNEL_BASE >= size) {
3693		base = rangeStart - size;
3694		if (alignment > 0)
3695			base = ROUNDDOWN(base, alignment);
3696
3697		if (base >= KERNEL_BASE) {
3698			args->virtual_allocated_range[0].start = base;
3699			args->virtual_allocated_range[0].size += rangeStart - base;
3700			return base;
3701		}
3702	}
3703
3704	return 0;
3705}
3706
3707
3708static bool
3709is_page_in_physical_memory_range(kernel_args* args, phys_addr_t address)
3710{
3711	// TODO: horrible brute-force method of determining if the page can be
3712	// allocated
3713	for (uint32 i = 0; i < args->num_physical_memory_ranges; i++) {
3714		if (address >= args->physical_memory_range[i].start
3715			&& address < args->physical_memory_range[i].start
3716				+ args->physical_memory_range[i].size)
3717			return true;
3718	}
3719	return false;
3720}
3721
3722
3723page_num_t
3724vm_allocate_early_physical_page(kernel_args* args)
3725{
3726	for (uint32 i = 0; i < args->num_physical_allocated_ranges; i++) {
3727		phys_addr_t nextPage;
3728
3729		nextPage = args->physical_allocated_range[i].start
3730			+ args->physical_allocated_range[i].size;
3731		// see if the page after the next allocated paddr run can be allocated
3732		if (i + 1 < args->num_physical_allocated_ranges
3733			&& args->physical_allocated_range[i + 1].size != 0) {
3734			// see if the next page will collide with the next allocated range
3735			if (nextPage >= args->physical_allocated_range[i+1].start)
3736				continue;
3737		}
3738		// see if the next physical page fits in the memory block
3739		if (is_page_in_physical_memory_range(args, nextPage)) {
3740			// we got one!
3741			args->physical_allocated_range[i].size += B_PAGE_SIZE;
3742			return nextPage / B_PAGE_SIZE;
3743		}
3744	}
3745
3746	return 0;
3747		// could not allocate a block
3748}
3749
3750
3751/*!	This one uses the kernel_args' physical and virtual memory ranges to
3752	allocate some pages before the VM is completely up.
3753*/
3754addr_t
3755vm_allocate_early(kernel_args* args, size_t virtualSize, size_t physicalSize,
3756	uint32 attributes, addr_t alignment)
3757{
3758	if (physicalSize > virtualSize)
3759		physicalSize = virtualSize;
3760
3761	// find the vaddr to allocate at
3762	addr_t virtualBase = allocate_early_virtual(args, virtualSize, alignment);
3763	//dprintf("vm_allocate_early: vaddr 0x%lx\n", virtualBase);
3764
3765	// map the pages
3766	for (uint32 i = 0; i < PAGE_ALIGN(physicalSize) / B_PAGE_SIZE; i++) {
3767		page_num_t physicalAddress = vm_allocate_early_physical_page(args);
3768		if (physicalAddress == 0)
3769			panic("error allocating early page!\n");
3770
3771		//dprintf("vm_allocate_early: paddr 0x%lx\n", physicalAddress);
3772
3773		arch_vm_translation_map_early_map(args, virtualBase + i * B_PAGE_SIZE,
3774			physicalAddress * B_PAGE_SIZE, attributes,
3775			&vm_allocate_early_physical_page);
3776	}
3777
3778	return virtualBase;
3779}
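
/*	Usage sketch (editor's illustration, not part of the original code): an
	early-boot consumer would typically request a wired, mapped chunk like
	this -- the size, attributes, and alignment below are placeholders:

		addr_t table = vm_allocate_early(args, B_PAGE_SIZE, B_PAGE_SIZE,
			B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA, B_PAGE_SIZE);
		if (table == 0)
			panic("out of early kernel address space");
*/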
3780
3781
/*!	The main entry point for initializing the VM. */
3783status_t
3784vm_init(kernel_args* args)
3785{
3786	struct preloaded_image* image;
3787	void* address;
3788	status_t err = 0;
3789	uint32 i;
3790
3791	TRACE(("vm_init: entry\n"));
3792	err = arch_vm_translation_map_init(args, &sPhysicalPageMapper);
3793	err = arch_vm_init(args);
3794
3795	// initialize some globals
3796	vm_page_init_num_pages(args);
3797	sAvailableMemory = vm_page_num_pages() * B_PAGE_SIZE;
3798
3799	slab_init(args);
3800
3801#if USE_DEBUG_HEAP_FOR_MALLOC || USE_GUARDED_HEAP_FOR_MALLOC
3802	size_t heapSize = INITIAL_HEAP_SIZE;
	// try to accommodate low memory systems
3804	while (heapSize > sAvailableMemory / 8)
3805		heapSize /= 2;
3806	if (heapSize < 1024 * 1024)
3807		panic("vm_init: go buy some RAM please.");
3808
3809	// map in the new heap and initialize it
3810	addr_t heapBase = vm_allocate_early(args, heapSize, heapSize,
3811		B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA, 0);
3812	TRACE(("heap at 0x%lx\n", heapBase));
3813	heap_init(heapBase, heapSize);
3814#endif
3815
3816	// initialize the free page list and physical page mapper
3817	vm_page_init(args);
3818
3819	// initialize the cache allocators
3820	vm_cache_init(args);
3821
3822	{
3823		status_t error = VMAreaHash::Init();
3824		if (error != B_OK)
3825			panic("vm_init: error initializing area hash table\n");
3826	}
3827
3828	VMAddressSpace::Init();
3829	reserve_boot_loader_ranges(args);
3830
3831#if USE_DEBUG_HEAP_FOR_MALLOC || USE_GUARDED_HEAP_FOR_MALLOC
3832	heap_init_post_area();
3833#endif
3834
	// Do any further initialization that the architecture-dependent layers may
3836	// need now
3837	arch_vm_translation_map_init_post_area(args);
3838	arch_vm_init_post_area(args);
3839	vm_page_init_post_area(args);
3840	slab_init_post_area();
3841
3842	// allocate areas to represent stuff that already exists
3843
3844#if USE_DEBUG_HEAP_FOR_MALLOC || USE_GUARDED_HEAP_FOR_MALLOC
3845	address = (void*)ROUNDDOWN(heapBase, B_PAGE_SIZE);
3846	create_area("kernel heap", &address, B_EXACT_ADDRESS, heapSize,
3847		B_ALREADY_WIRED, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
3848#endif
3849
3850	allocate_kernel_args(args);
3851
3852	create_preloaded_image_areas(args->kernel_image);
3853
3854	// allocate areas for preloaded images
3855	for (image = args->preloaded_images; image != NULL; image = image->next)
3856		create_preloaded_image_areas(image);
3857
3858	// allocate kernel stacks
3859	for (i = 0; i < args->num_cpus; i++) {
3860		char name[64];
3861
3862		sprintf(name, "idle thread %" B_PRIu32 " kstack", i + 1);
3863		address = (void*)args->cpu_kstack[i].start;
3864		create_area(name, &address, B_EXACT_ADDRESS, args->cpu_kstack[i].size,
3865			B_ALREADY_WIRED, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
3866	}
3867
3868	void* lastPage = (void*)ROUNDDOWN(~(addr_t)0, B_PAGE_SIZE);
3869	vm_block_address_range("overflow protection", lastPage, B_PAGE_SIZE);
3870
3871#if PARANOID_KERNEL_MALLOC
3872	vm_block_address_range("uninitialized heap memory",
3873		(void *)ROUNDDOWN(0xcccccccc, B_PAGE_SIZE), B_PAGE_SIZE * 64);
3874#endif
3875#if PARANOID_KERNEL_FREE
3876	vm_block_address_range("freed heap memory",
3877		(void *)ROUNDDOWN(0xdeadbeef, B_PAGE_SIZE), B_PAGE_SIZE * 64);
3878#endif
3879
3880	// create the object cache for the page mappings
3881	gPageMappingsObjectCache = create_object_cache_etc("page mappings",
3882		sizeof(vm_page_mapping), 0, 0, 64, 128, CACHE_LARGE_SLAB, NULL, NULL,
3883		NULL, NULL);
3884	if (gPageMappingsObjectCache == NULL)
3885		panic("failed to create page mappings object cache");
3886
3887	object_cache_set_minimum_reserve(gPageMappingsObjectCache, 1024);
3888
3889#if DEBUG_CACHE_LIST
3890	if (vm_page_num_free_pages() >= 200 * 1024 * 1024 / B_PAGE_SIZE) {
3891		virtual_address_restrictions virtualRestrictions = {};
3892		virtualRestrictions.address_specification = B_ANY_KERNEL_ADDRESS;
3893		physical_address_restrictions physicalRestrictions = {};
3894		create_area_etc(VMAddressSpace::KernelID(), "cache info table",
3895			ROUNDUP(kCacheInfoTableCount * sizeof(cache_info), B_PAGE_SIZE),
3896			B_FULL_LOCK, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA,
3897			CREATE_AREA_DONT_WAIT, 0, &virtualRestrictions,
3898			&physicalRestrictions, (void**)&sCacheInfoTable);
3899	}
3900#endif	// DEBUG_CACHE_LIST
3901
3902	// add some debugger commands
3903	add_debugger_command("areas", &dump_area_list, "Dump a list of all areas");
3904	add_debugger_command("area", &dump_area,
3905		"Dump info about a particular area");
3906	add_debugger_command("cache", &dump_cache, "Dump VMCache");
3907	add_debugger_command("cache_tree", &dump_cache_tree, "Dump VMCache tree");
3908#if DEBUG_CACHE_LIST
3909	if (sCacheInfoTable != NULL) {
3910		add_debugger_command_etc("caches", &dump_caches,
3911			"List all VMCache trees",
3912			"[ \"-c\" ]\n"
3913			"All cache trees are listed sorted in decreasing order by number "
3914				"of\n"
3915			"used pages or, if \"-c\" is specified, by size of committed "
3916				"memory.\n",
3917			0);
3918	}
3919#endif
3920	add_debugger_command("avail", &dump_available_memory,
3921		"Dump available memory");
3922	add_debugger_command("dl", &display_mem, "dump memory long words (64-bit)");
3923	add_debugger_command("dw", &display_mem, "dump memory words (32-bit)");
3924	add_debugger_command("ds", &display_mem, "dump memory shorts (16-bit)");
3925	add_debugger_command("db", &display_mem, "dump memory bytes (8-bit)");
3926	add_debugger_command("string", &display_mem, "dump strings");
3927
3928	TRACE(("vm_init: exit\n"));
3929
3930	vm_cache_init_post_heap();
3931
3932	return err;
3933}
3934
3935
3936status_t
3937vm_init_post_sem(kernel_args* args)
3938{
	// This frees all unused boot loader resources and makes their space
	// available again
3941	arch_vm_init_end(args);
3942	unreserve_boot_loader_ranges(args);
3943
	// Fill in all of the semaphores that were not allocated before. Since
	// we're still single threaded and only the kernel address space exists,
	// it isn't that hard to find all of the ones we need to create.
3947
3948	arch_vm_translation_map_init_post_sem(args);
3949
3950	slab_init_post_sem();
3951
3952#if USE_DEBUG_HEAP_FOR_MALLOC || USE_GUARDED_HEAP_FOR_MALLOC
3953	heap_init_post_sem();
3954#endif
3955
3956	return B_OK;
3957}
3958
3959
3960status_t
3961vm_init_post_thread(kernel_args* args)
3962{
3963	vm_page_init_post_thread(args);
3964	slab_init_post_thread();
3965	return heap_init_post_thread();
3966}
3967
3968
3969status_t
3970vm_init_post_modules(kernel_args* args)
3971{
3972	return arch_vm_init_post_modules(args);
3973}
3974
3975
3976void
3977permit_page_faults(void)
3978{
3979	Thread* thread = thread_get_current_thread();
3980	if (thread != NULL)
3981		atomic_add(&thread->page_faults_allowed, 1);
3982}
3983
3984
3985void
3986forbid_page_faults(void)
3987{
3988	Thread* thread = thread_get_current_thread();
3989	if (thread != NULL)
3990		atomic_add(&thread->page_faults_allowed, -1);
3991}
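
/*	Usage sketch (editor's illustration): the two functions above adjust a
	per-thread counter and are meant to be used as a balanced pair around a
	region of code, e.g.:

		forbid_page_faults();
		// ... code during which page faults must not be resolved ...
		permit_page_faults();
*/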
3992
3993
3994status_t
3995vm_page_fault(addr_t address, addr_t faultAddress, bool isWrite, bool isUser,
3996	addr_t* newIP)
3997{
3998	FTRACE(("vm_page_fault: page fault at 0x%lx, ip 0x%lx\n", address,
3999		faultAddress));
4000
4001	TPF(PageFaultStart(address, isWrite, isUser, faultAddress));
4002
4003	addr_t pageAddress = ROUNDDOWN(address, B_PAGE_SIZE);
4004	VMAddressSpace* addressSpace = NULL;
4005
4006	status_t status = B_OK;
4007	*newIP = 0;
4008	atomic_add((int32*)&sPageFaults, 1);
4009
4010	if (IS_KERNEL_ADDRESS(pageAddress)) {
4011		addressSpace = VMAddressSpace::GetKernel();
4012	} else if (IS_USER_ADDRESS(pageAddress)) {
4013		addressSpace = VMAddressSpace::GetCurrent();
4014		if (addressSpace == NULL) {
4015			if (!isUser) {
4016				dprintf("vm_page_fault: kernel thread accessing invalid user "
4017					"memory!\n");
4018				status = B_BAD_ADDRESS;
4019				TPF(PageFaultError(-1,
4020					VMPageFaultTracing
4021						::PAGE_FAULT_ERROR_KERNEL_BAD_USER_MEMORY));
4022			} else {
4023				// XXX weird state.
4024				panic("vm_page_fault: non kernel thread accessing user memory "
4025					"that doesn't exist!\n");
4026				status = B_BAD_ADDRESS;
4027			}
4028		}
4029	} else {
		// The hit was probably in the 64k DMZ between kernel and user space.
		// This keeps a user space thread from passing a buffer that crosses
		// into kernel space.
4033		status = B_BAD_ADDRESS;
4034		TPF(PageFaultError(-1,
4035			VMPageFaultTracing::PAGE_FAULT_ERROR_NO_ADDRESS_SPACE));
4036	}
4037
4038	if (status == B_OK) {
4039		status = vm_soft_fault(addressSpace, pageAddress, isWrite, isUser,
4040			NULL);
4041	}
4042
4043	if (status < B_OK) {
4044		dprintf("vm_page_fault: vm_soft_fault returned error '%s' on fault at "
4045			"0x%lx, ip 0x%lx, write %d, user %d, thread 0x%" B_PRIx32 "\n",
4046			strerror(status), address, faultAddress, isWrite, isUser,
4047			thread_get_current_thread_id());
4048		if (!isUser) {
4049			Thread* thread = thread_get_current_thread();
4050			if (thread != NULL && thread->fault_handler != 0) {
				// this will cause the arch-dependent page fault handler to
4052				// modify the IP on the interrupt frame or whatever to return
4053				// to this address
4054				*newIP = thread->fault_handler;
4055			} else {
4056				// unhandled page fault in the kernel
4057				panic("vm_page_fault: unhandled page fault in kernel space at "
4058					"0x%lx, ip 0x%lx\n", address, faultAddress);
4059			}
4060		} else {
4061#if 1
4062			// TODO: remove me once we have proper userland debugging support
4063			// (and tools)
4064			VMArea* area = NULL;
4065			if (addressSpace != NULL) {
4066				addressSpace->ReadLock();
4067				area = addressSpace->LookupArea(faultAddress);
4068			}
4069
4070			Thread* thread = thread_get_current_thread();
4071			dprintf("vm_page_fault: thread \"%s\" (%" B_PRId32 ") in team "
4072				"\"%s\" (%" B_PRId32 ") tried to %s address %#lx, ip %#lx "
4073				"(\"%s\" +%#lx)\n", thread->name, thread->id,
4074				thread->team->Name(), thread->team->id,
4075				isWrite ? "write" : "read", address, faultAddress,
4076				area ? area->name : "???", faultAddress - (area ?
4077					area->Base() : 0x0));
4078
4079			// We can print a stack trace of the userland thread here.
4080// TODO: The user_memcpy() below can cause a deadlock, if it causes a page
4081// fault and someone is already waiting for a write lock on the same address
4082// space. This thread will then try to acquire the lock again and will
4083// be queued after the writer.
4084#	if 0
4085			if (area) {
4086				struct stack_frame {
4087					#if defined(__INTEL__) || defined(__POWERPC__) || defined(__M68K__)
4088						struct stack_frame*	previous;
4089						void*				return_address;
4090					#else
4091						// ...
4092					#warning writeme
4093					#endif
4094				} frame;
4095#		ifdef __INTEL__
4096				struct iframe* iframe = x86_get_user_iframe();
4097				if (iframe == NULL)
4098					panic("iframe is NULL!");
4099
4100				status_t status = user_memcpy(&frame, (void*)iframe->ebp,
4101					sizeof(struct stack_frame));
4102#		elif defined(__POWERPC__)
4103				struct iframe* iframe = ppc_get_user_iframe();
4104				if (iframe == NULL)
4105					panic("iframe is NULL!");
4106
4107				status_t status = user_memcpy(&frame, (void*)iframe->r1,
4108					sizeof(struct stack_frame));
4109#		else
4110#			warning "vm_page_fault() stack trace won't work"
4111				status = B_ERROR;
4112#		endif
4113
4114				dprintf("stack trace:\n");
4115				int32 maxFrames = 50;
4116				while (status == B_OK && --maxFrames >= 0
4117						&& frame.return_address != NULL) {
4118					dprintf("  %p", frame.return_address);
4119					area = addressSpace->LookupArea(
4120						(addr_t)frame.return_address);
4121					if (area) {
4122						dprintf(" (%s + %#lx)", area->name,
4123							(addr_t)frame.return_address - area->Base());
4124					}
4125					dprintf("\n");
4126
4127					status = user_memcpy(&frame, frame.previous,
4128						sizeof(struct stack_frame));
4129				}
4130			}
4131#	endif	// 0 (stack trace)
4132
4133			if (addressSpace != NULL)
4134				addressSpace->ReadUnlock();
4135#endif
4136
4137			// If the thread has a signal handler for SIGSEGV, we simply
4138			// send it the signal. Otherwise we notify the user debugger
4139			// first.
4140			struct sigaction action;
4141			if ((sigaction(SIGSEGV, NULL, &action) == 0
4142					&& action.sa_handler != SIG_DFL
4143					&& action.sa_handler != SIG_IGN)
4144				|| user_debug_exception_occurred(B_SEGMENT_VIOLATION,
4145					SIGSEGV)) {
4146				Signal signal(SIGSEGV,
4147					status == B_PERMISSION_DENIED
4148						? SEGV_ACCERR : SEGV_MAPERR,
4149					EFAULT, thread->team->id);
4150				signal.SetAddress((void*)address);
4151				send_signal_to_thread(thread, signal, 0);
4152			}
4153		}
4154	}
4155
4156	if (addressSpace != NULL)
4157		addressSpace->Put();
4158
4159	return B_HANDLED_INTERRUPT;
4160}
4161
4162
4163struct PageFaultContext {
4164	AddressSpaceReadLocker	addressSpaceLocker;
4165	VMCacheChainLocker		cacheChainLocker;
4166
4167	VMTranslationMap*		map;
4168	VMCache*				topCache;
4169	off_t					cacheOffset;
4170	vm_page_reservation		reservation;
4171	bool					isWrite;
4172
4173	// return values
4174	vm_page*				page;
4175	bool					restart;
4176
4177
4178	PageFaultContext(VMAddressSpace* addressSpace, bool isWrite)
4179		:
4180		addressSpaceLocker(addressSpace, true),
4181		map(addressSpace->TranslationMap()),
4182		isWrite(isWrite)
4183	{
4184	}
4185
4186	~PageFaultContext()
4187	{
4188		UnlockAll();
4189		vm_page_unreserve_pages(&reservation);
4190	}
4191
4192	void Prepare(VMCache* topCache, off_t cacheOffset)
4193	{
4194		this->topCache = topCache;
4195		this->cacheOffset = cacheOffset;
4196		page = NULL;
4197		restart = false;
4198
4199		cacheChainLocker.SetTo(topCache);
4200	}
4201
4202	void UnlockAll(VMCache* exceptCache = NULL)
4203	{
4204		topCache = NULL;
4205		addressSpaceLocker.Unlock();
4206		cacheChainLocker.Unlock(exceptCache);
4207	}
4208};
4209
4210
4211/*!	Gets the page that should be mapped into the area.
4212	Returns an error code other than \c B_OK, if the page couldn't be found or
4213	paged in. The locking state of the address space and the caches is undefined
4214	in that case.
	Returns \c B_OK with \c context.restart set to \c true, if the function
	had to unlock the address space and all caches and is supposed to be
	called again.
4218	Returns \c B_OK with \c context.restart set to \c false, if the page was
4219	found. It is returned in \c context.page. The address space will still be
4220	locked as well as all caches starting from the top cache to at least the
4221	cache the page lives in.
4222*/
4223static status_t
4224fault_get_page(PageFaultContext& context)
4225{
4226	VMCache* cache = context.topCache;
4227	VMCache* lastCache = NULL;
4228	vm_page* page = NULL;
4229
4230	while (cache != NULL) {
4231		// We already hold the lock of the cache at this point.
4232
4233		lastCache = cache;
4234
4235		page = cache->LookupPage(context.cacheOffset);
4236		if (page != NULL && page->busy) {
			// the page is busy -- wait for it to become unbusy
4238			context.UnlockAll(cache);
4239			cache->ReleaseRefLocked();
4240			cache->WaitForPageEvents(page, PAGE_EVENT_NOT_BUSY, false);
4241
4242			// restart the whole process
4243			context.restart = true;
4244			return B_OK;
4245		}
4246
4247		if (page != NULL)
4248			break;
4249
4250		// The current cache does not contain the page we're looking for.
4251
4252		// see if the backing store has it
4253		if (cache->HasPage(context.cacheOffset)) {
4254			// insert a fresh page and mark it busy -- we're going to read it in
4255			page = vm_page_allocate_page(&context.reservation,
4256				PAGE_STATE_ACTIVE | VM_PAGE_ALLOC_BUSY);
4257			cache->InsertPage(page, context.cacheOffset);
4258
4259			// We need to unlock all caches and the address space while reading
4260			// the page in. Keep a reference to the cache around.
4261			cache->AcquireRefLocked();
4262			context.UnlockAll();
4263
4264			// read the page in
4265			generic_io_vec vec;
4266			vec.base = (phys_addr_t)page->physical_page_number * B_PAGE_SIZE;
4267			generic_size_t bytesRead = vec.length = B_PAGE_SIZE;
4268
4269			status_t status = cache->Read(context.cacheOffset, &vec, 1,
4270				B_PHYSICAL_IO_REQUEST, &bytesRead);
4271
4272			cache->Lock();
4273
4274			if (status < B_OK) {
4275				// on error remove and free the page
4276				dprintf("reading page from cache %p returned: %s!\n",
4277					cache, strerror(status));
4278
4279				cache->NotifyPageEvents(page, PAGE_EVENT_NOT_BUSY);
4280				cache->RemovePage(page);
4281				vm_page_set_state(page, PAGE_STATE_FREE);
4282
4283				cache->ReleaseRefAndUnlock();
4284				return status;
4285			}
4286
4287			// mark the page unbusy again
4288			cache->MarkPageUnbusy(page);
4289
4290			DEBUG_PAGE_ACCESS_END(page);
4291
4292			// Since we needed to unlock everything temporarily, the area
4293			// situation might have changed. So we need to restart the whole
4294			// process.
4295			cache->ReleaseRefAndUnlock();
4296			context.restart = true;
4297			return B_OK;
4298		}
4299
4300		cache = context.cacheChainLocker.LockSourceCache();
4301	}
4302
4303	if (page == NULL) {
		// There was no adequate page; determine the cache for a clean one.
		// Read-only pages come in the deepest cache; only the topmost cache
		// may have direct write access.
4307		cache = context.isWrite ? context.topCache : lastCache;
4308
4309		// allocate a clean page
4310		page = vm_page_allocate_page(&context.reservation,
4311			PAGE_STATE_ACTIVE | VM_PAGE_ALLOC_CLEAR);
4312		FTRACE(("vm_soft_fault: just allocated page 0x%" B_PRIxPHYSADDR "\n",
4313			page->physical_page_number));
4314
4315		// insert the new page into our cache
4316		cache->InsertPage(page, context.cacheOffset);
4317	} else if (page->Cache() != context.topCache && context.isWrite) {
4318		// We have a page that has the data we want, but in the wrong cache
4319		// object so we need to copy it and stick it into the top cache.
4320		vm_page* sourcePage = page;
4321
4322		// TODO: If memory is low, it might be a good idea to steal the page
4323		// from our source cache -- if possible, that is.
4324		FTRACE(("get new page, copy it, and put it into the topmost cache\n"));
4325		page = vm_page_allocate_page(&context.reservation, PAGE_STATE_ACTIVE);
4326
4327		// To not needlessly kill concurrency we unlock all caches but the top
4328		// one while copying the page. Lacking another mechanism to ensure that
4329		// the source page doesn't disappear, we mark it busy.
4330		sourcePage->busy = true;
4331		context.cacheChainLocker.UnlockKeepRefs(true);
4332
4333		// copy the page
4334		vm_memcpy_physical_page(page->physical_page_number * B_PAGE_SIZE,
4335			sourcePage->physical_page_number * B_PAGE_SIZE);
4336
4337		context.cacheChainLocker.RelockCaches(true);
4338		sourcePage->Cache()->MarkPageUnbusy(sourcePage);
4339
4340		// insert the new page into our cache
4341		context.topCache->InsertPage(page, context.cacheOffset);
4342	} else
4343		DEBUG_PAGE_ACCESS_START(page);
4344
4345	context.page = page;
4346	return B_OK;
4347}
4348
4349
4350/*!	Makes sure the address in the given address space is mapped.
4351
4352	\param addressSpace The address space.
4353	\param originalAddress The address. Doesn't need to be page aligned.
4354	\param isWrite If \c true the address shall be write-accessible.
4355	\param isUser If \c true the access is requested by a userland team.
4356	\param wirePage On success, if non \c NULL, the wired count of the page
4357		mapped at the given address is incremented and the page is returned
4358		via this parameter.
4359	\param wiredRange If given, this wiredRange is ignored when checking whether
4360		an already mapped page at the virtual address can be unmapped.
4361	\return \c B_OK on success, another error code otherwise.
4362*/
4363static status_t
4364vm_soft_fault(VMAddressSpace* addressSpace, addr_t originalAddress,
4365	bool isWrite, bool isUser, vm_page** wirePage, VMAreaWiredRange* wiredRange)
4366{
4367	FTRACE(("vm_soft_fault: thid 0x%" B_PRIx32 " address 0x%" B_PRIxADDR ", "
4368		"isWrite %d, isUser %d\n", thread_get_current_thread_id(),
4369		originalAddress, isWrite, isUser));
4370
4371	PageFaultContext context(addressSpace, isWrite);
4372
4373	addr_t address = ROUNDDOWN(originalAddress, B_PAGE_SIZE);
4374	status_t status = B_OK;
4375
4376	addressSpace->IncrementFaultCount();
4377
4378	// We may need up to 2 pages plus pages needed for mapping them -- reserving
4379	// the pages upfront makes sure we don't have any cache locked, so that the
4380	// page daemon/thief can do their job without problems.
4381	size_t reservePages = 2 + context.map->MaxPagesNeededToMap(originalAddress,
4382		originalAddress);
4383	context.addressSpaceLocker.Unlock();
4384	vm_page_reserve_pages(&context.reservation, reservePages,
4385		addressSpace == VMAddressSpace::Kernel()
4386			? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
4387
4388	while (true) {
4389		context.addressSpaceLocker.Lock();
4390
4391		// get the area the fault was in
4392		VMArea* area = addressSpace->LookupArea(address);
4393		if (area == NULL) {
4394			dprintf("vm_soft_fault: va 0x%lx not covered by area in address "
4395				"space\n", originalAddress);
4396			TPF(PageFaultError(-1,
4397				VMPageFaultTracing::PAGE_FAULT_ERROR_NO_AREA));
4398			status = B_BAD_ADDRESS;
4399			break;
4400		}
4401
4402		// check permissions
4403		uint32 protection = get_area_page_protection(area, address);
4404		if (isUser && (protection & B_USER_PROTECTION) == 0) {
4405			dprintf("user access on kernel area 0x%" B_PRIx32 " at %p\n",
4406				area->id, (void*)originalAddress);
4407			TPF(PageFaultError(area->id,
4408				VMPageFaultTracing::PAGE_FAULT_ERROR_KERNEL_ONLY));
4409			status = B_PERMISSION_DENIED;
4410			break;
4411		}
4412		if (isWrite && (protection
4413				& (B_WRITE_AREA | (isUser ? 0 : B_KERNEL_WRITE_AREA))) == 0) {
4414			dprintf("write access attempted on write-protected area 0x%"
4415				B_PRIx32 " at %p\n", area->id, (void*)originalAddress);
4416			TPF(PageFaultError(area->id,
4417				VMPageFaultTracing::PAGE_FAULT_ERROR_WRITE_PROTECTED));
4418			status = B_PERMISSION_DENIED;
4419			break;
4420		} else if (!isWrite && (protection
4421				& (B_READ_AREA | (isUser ? 0 : B_KERNEL_READ_AREA))) == 0) {
4422			dprintf("read access attempted on read-protected area 0x%" B_PRIx32
4423				" at %p\n", area->id, (void*)originalAddress);
4424			TPF(PageFaultError(area->id,
4425				VMPageFaultTracing::PAGE_FAULT_ERROR_READ_PROTECTED));
4426			status = B_PERMISSION_DENIED;
4427			break;
4428		}
4429
4430		// We have the area, it was a valid access, so let's try to resolve the
4431		// page fault now.
		// First, the topmost cache of the area is investigated.
4433
4434		context.Prepare(vm_area_get_locked_cache(area),
4435			address - area->Base() + area->cache_offset);
4436
4437		// See if this cache has a fault handler -- this will do all the work
4438		// for us.
4439		{
4440			// Note, since the page fault is resolved with interrupts enabled,
4441			// the fault handler could be called more than once for the same
4442			// reason -- the store must take this into account.
4443			status = context.topCache->Fault(addressSpace, context.cacheOffset);
4444			if (status != B_BAD_HANDLER)
4445				break;
4446		}
4447
		// The topmost cache has no fault handler, so let's see if the cache or
4449		// its sources already have the page we're searching for (we're going
4450		// from top to bottom).
4451		status = fault_get_page(context);
4452		if (status != B_OK) {
4453			TPF(PageFaultError(area->id, status));
4454			break;
4455		}
4456
4457		if (context.restart)
4458			continue;
4459
4460		// All went fine, all there is left to do is to map the page into the
4461		// address space.
4462		TPF(PageFaultDone(area->id, context.topCache, context.page->Cache(),
4463			context.page));
4464
4465		// If the page doesn't reside in the area's cache, we need to make sure
4466		// it's mapped in read-only, so that we cannot overwrite someone else's
4467		// data (copy-on-write)
4468		uint32 newProtection = protection;
4469		if (context.page->Cache() != context.topCache && !isWrite)
4470			newProtection &= ~(B_WRITE_AREA | B_KERNEL_WRITE_AREA);
4471
4472		bool unmapPage = false;
4473		bool mapPage = true;
4474
4475		// check whether there's already a page mapped at the address
4476		context.map->Lock();
4477
4478		phys_addr_t physicalAddress;
4479		uint32 flags;
4480		vm_page* mappedPage = NULL;
4481		if (context.map->Query(address, &physicalAddress, &flags) == B_OK
4482			&& (flags & PAGE_PRESENT) != 0
4483			&& (mappedPage = vm_lookup_page(physicalAddress / B_PAGE_SIZE))
4484				!= NULL) {
4485			// Yep there's already a page. If it's ours, we can simply adjust
4486			// its protection. Otherwise we have to unmap it.
4487			if (mappedPage == context.page) {
4488				context.map->ProtectPage(area, address, newProtection);
4489					// Note: We assume that ProtectPage() is atomic (i.e.
4490					// the page isn't temporarily unmapped), otherwise we'd have
4491					// to make sure it isn't wired.
4492				mapPage = false;
4493			} else
4494				unmapPage = true;
4495		}
4496
4497		context.map->Unlock();
4498
4499		if (unmapPage) {
4500			// If the page is wired, we can't unmap it. Wait until it is unwired
4501			// again and restart.
4502			VMAreaUnwiredWaiter waiter;
4503			if (area->AddWaiterIfWired(&waiter, address, B_PAGE_SIZE,
4504					wiredRange)) {
4505				// unlock everything and wait
4506				context.UnlockAll();
4507				waiter.waitEntry.Wait();
4508				continue;
4509			}
4510
4511			// Note: The mapped page is a page of a lower cache. We are
			// guaranteed to have that cache locked, our new page is a copy of
4513			// that page, and the page is not busy. The logic for that guarantee
4514			// is as follows: Since the page is mapped, it must live in the top
4515			// cache (ruled out above) or any of its lower caches, and there is
4516			// (was before the new page was inserted) no other page in any
4517			// cache between the top cache and the page's cache (otherwise that
4518			// would be mapped instead). That in turn means that our algorithm
4519			// must have found it and therefore it cannot be busy either.
4520			DEBUG_PAGE_ACCESS_START(mappedPage);
4521			unmap_page(area, address);
4522			DEBUG_PAGE_ACCESS_END(mappedPage);
4523		}
4524
4525		if (mapPage) {
4526			if (map_page(area, context.page, address, newProtection,
4527					&context.reservation) != B_OK) {
				// Mapping can only fail when the page mapping object couldn't
				// be allocated. Save for the missing mapping, everything is
4530				// fine, though. If this was a regular page fault, we'll simply
4531				// leave and probably fault again. To make sure we'll have more
4532				// luck then, we ensure that the minimum object reserve is
4533				// available.
4534				DEBUG_PAGE_ACCESS_END(context.page);
4535
4536				context.UnlockAll();
4537
4538				if (object_cache_reserve(gPageMappingsObjectCache, 1, 0)
4539						!= B_OK) {
4540					// Apparently the situation is serious. Let's get ourselves
4541					// killed.
4542					status = B_NO_MEMORY;
4543				} else if (wirePage != NULL) {
4544					// The caller expects us to wire the page. Since
4545					// object_cache_reserve() succeeded, we should now be able
4546					// to allocate a mapping structure. Restart.
4547					continue;
4548				}
4549
4550				break;
4551			}
4552		} else if (context.page->State() == PAGE_STATE_INACTIVE)
4553			vm_page_set_state(context.page, PAGE_STATE_ACTIVE);
4554
4555		// also wire the page, if requested
4556		if (wirePage != NULL && status == B_OK) {
4557			increment_page_wired_count(context.page);
4558			*wirePage = context.page;
4559		}
4560
4561		DEBUG_PAGE_ACCESS_END(context.page);
4562
4563		break;
4564	}
4565
4566	return status;
4567}
4568
4569
4570status_t
4571vm_get_physical_page(phys_addr_t paddr, addr_t* _vaddr, void** _handle)
4572{
4573	return sPhysicalPageMapper->GetPage(paddr, _vaddr, _handle);
4574}
4575
4576status_t
4577vm_put_physical_page(addr_t vaddr, void* handle)
4578{
4579	return sPhysicalPageMapper->PutPage(vaddr, handle);
4580}
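
/*	Usage sketch (editor's illustration): a temporary mapping obtained via
	vm_get_physical_page() is always released again with the handle it
	returned -- physicalAddress below is a placeholder:

		addr_t virtualAddress;
		void* handle;
		if (vm_get_physical_page(physicalAddress, &virtualAddress, &handle)
				== B_OK) {
			// ... access the page through virtualAddress ...
			vm_put_physical_page(virtualAddress, handle);
		}
*/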
4581
4582
4583status_t
4584vm_get_physical_page_current_cpu(phys_addr_t paddr, addr_t* _vaddr,
4585	void** _handle)
4586{
4587	return sPhysicalPageMapper->GetPageCurrentCPU(paddr, _vaddr, _handle);
4588}
4589
4590status_t
4591vm_put_physical_page_current_cpu(addr_t vaddr, void* handle)
4592{
4593	return sPhysicalPageMapper->PutPageCurrentCPU(vaddr, handle);
4594}
4595
4596
4597status_t
4598vm_get_physical_page_debug(phys_addr_t paddr, addr_t* _vaddr, void** _handle)
4599{
4600	return sPhysicalPageMapper->GetPageDebug(paddr, _vaddr, _handle);
4601}
4602
4603status_t
4604vm_put_physical_page_debug(addr_t vaddr, void* handle)
4605{
4606	return sPhysicalPageMapper->PutPageDebug(vaddr, handle);
4607}
4608
4609
4610void
4611vm_get_info(system_memory_info* info)
4612{
4613	swap_get_info(info);
4614
4615	info->max_memory = vm_page_num_pages() * B_PAGE_SIZE;
4616	info->page_faults = sPageFaults;
4617
4618	MutexLocker locker(sAvailableMemoryLock);
4619	info->free_memory = sAvailableMemory;
4620	info->needed_memory = sNeededMemory;
4621}
4622
4623
4624uint32
4625vm_num_page_faults(void)
4626{
4627	return sPageFaults;
4628}
4629
4630
4631off_t
4632vm_available_memory(void)
4633{
4634	MutexLocker locker(sAvailableMemoryLock);
4635	return sAvailableMemory;
4636}
4637
4638
4639off_t
4640vm_available_not_needed_memory(void)
4641{
4642	MutexLocker locker(sAvailableMemoryLock);
4643	return sAvailableMemory - sNeededMemory;
4644}
4645
4646
4647/*!	Like vm_available_not_needed_memory(), but only for use in the kernel
4648	debugger.
4649*/
4650off_t
4651vm_available_not_needed_memory_debug(void)
4652{
4653	return sAvailableMemory - sNeededMemory;
4654}
4655
4656
4657size_t
4658vm_kernel_address_space_left(void)
4659{
4660	return VMAddressSpace::Kernel()->FreeSpace();
4661}
4662
4663
4664void
4665vm_unreserve_memory(size_t amount)
4666{
4667	mutex_lock(&sAvailableMemoryLock);
4668
4669	sAvailableMemory += amount;
4670
4671	mutex_unlock(&sAvailableMemoryLock);
4672}
4673
4674
4675status_t
4676vm_try_reserve_memory(size_t amount, int priority, bigtime_t timeout)
4677{
4678	size_t reserve = kMemoryReserveForPriority[priority];
4679
4680	MutexLocker locker(sAvailableMemoryLock);
4681
4682	//dprintf("try to reserve %lu bytes, %Lu left\n", amount, sAvailableMemory);
4683
4684	if (sAvailableMemory >= (off_t)(amount + reserve)) {
4685		sAvailableMemory -= amount;
4686		return B_OK;
4687	}
4688
4689	if (timeout <= 0)
4690		return B_NO_MEMORY;
4691
4692	// turn timeout into an absolute timeout
4693	timeout += system_time();
4694
4695	// loop until we've got the memory or the timeout occurs
4696	do {
4697		sNeededMemory += amount;
4698
4699		// call the low resource manager
4700		locker.Unlock();
4701		low_resource(B_KERNEL_RESOURCE_MEMORY, sNeededMemory - sAvailableMemory,
4702			B_ABSOLUTE_TIMEOUT, timeout);
4703		locker.Lock();
4704
4705		sNeededMemory -= amount;
4706
4707		if (sAvailableMemory >= (off_t)(amount + reserve)) {
4708			sAvailableMemory -= amount;
4709			return B_OK;
4710		}
4711	} while (timeout > system_time());
4712
4713	return B_NO_MEMORY;
4714}
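
/*	Usage sketch (editor's illustration): a caller that commits memory
	reserves it first (optionally waiting up to a timeout) and releases the
	same amount via vm_unreserve_memory() when the commitment is dropped --
	the size and timeout below are placeholders:

		if (vm_try_reserve_memory(size, VM_PRIORITY_USER, 1000000) != B_OK)
			return B_NO_MEMORY;
		// ... use the committed memory ...
		vm_unreserve_memory(size);
*/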
4715
4716
4717status_t
4718vm_set_area_memory_type(area_id id, phys_addr_t physicalBase, uint32 type)
4719{
4720	// NOTE: The caller is responsible for synchronizing calls to this function!
4721
4722	AddressSpaceReadLocker locker;
4723	VMArea* area;
4724	status_t status = locker.SetFromArea(id, area);
4725	if (status != B_OK)
4726		return status;
4727
4728	// nothing to do, if the type doesn't change
4729	uint32 oldType = area->MemoryType();
4730	if (type == oldType)
4731		return B_OK;
4732
4733	// set the memory type of the area and the mapped pages
4734	VMTranslationMap* map = area->address_space->TranslationMap();
4735	map->Lock();
4736	area->SetMemoryType(type);
4737	map->ProtectArea(area, area->protection);
4738	map->Unlock();
4739
4740	// set the physical memory type
4741	status_t error = arch_vm_set_memory_type(area, physicalBase, type);
4742	if (error != B_OK) {
4743		// reset the memory type of the area and the mapped pages
4744		map->Lock();
4745		area->SetMemoryType(oldType);
4746		map->ProtectArea(area, area->protection);
4747		map->Unlock();
4748		return error;
4749	}
4750
	return B_OK;
}
4754
4755
4756/*!	This function enforces some protection properties:
	 - if B_WRITE_AREA is set, B_KERNEL_READ_AREA and B_KERNEL_WRITE_AREA are
	   set as well
4758	 - if only B_READ_AREA has been set, B_KERNEL_READ_AREA is also set
4759	 - if no protection is specified, it defaults to B_KERNEL_READ_AREA
4760	   and B_KERNEL_WRITE_AREA.
4761*/
4762static void
4763fix_protection(uint32* protection)
4764{
4765	if ((*protection & B_KERNEL_PROTECTION) == 0) {
4766		if ((*protection & B_USER_PROTECTION) == 0
4767			|| (*protection & B_WRITE_AREA) != 0)
4768			*protection |= B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA;
4769		else
4770			*protection |= B_KERNEL_READ_AREA;
4771	}
4772}
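
/*	Worked examples of the normalization above (editor's illustration):

		B_READ_AREA                -> B_READ_AREA | B_KERNEL_READ_AREA
		B_READ_AREA | B_WRITE_AREA -> both user bits | B_KERNEL_READ_AREA
		                              | B_KERNEL_WRITE_AREA
		0 (nothing specified)      -> B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA
*/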
4773
4774
4775static void
4776fill_area_info(struct VMArea* area, area_info* info, size_t size)
4777{
4778	strlcpy(info->name, area->name, B_OS_NAME_LENGTH);
4779	info->area = area->id;
4780	info->address = (void*)area->Base();
4781	info->size = area->Size();
4782	info->protection = area->protection;
4783	info->lock = B_FULL_LOCK;
4784	info->team = area->address_space->ID();
4785	info->copy_count = 0;
4786	info->in_count = 0;
4787	info->out_count = 0;
4788		// TODO: retrieve real values here!
4789
4790	VMCache* cache = vm_area_get_locked_cache(area);
4791
4792	// Note, this is a simplification; the cache could be larger than this area
4793	info->ram_size = cache->page_count * B_PAGE_SIZE;
4794
4795	vm_area_put_locked_cache(cache);
4796}
4797
4798
4799static status_t
4800vm_resize_area(area_id areaID, size_t newSize, bool kernel)
4801{
4802	// is newSize a multiple of B_PAGE_SIZE?
4803	if (newSize & (B_PAGE_SIZE - 1))
4804		return B_BAD_VALUE;
4805
4806	// lock all affected address spaces and the cache
4807	VMArea* area;
4808	VMCache* cache;
4809
4810	MultiAddressSpaceLocker locker;
4811	AreaCacheLocker cacheLocker;
4812
4813	status_t status;
4814	size_t oldSize;
4815	bool anyKernelArea;
4816	bool restart;
4817
4818	do {
4819		anyKernelArea = false;
4820		restart = false;
4821
4822		locker.Unset();
4823		status = locker.AddAreaCacheAndLock(areaID, true, true, area, &cache);
4824		if (status != B_OK)
4825			return status;
4826		cacheLocker.SetTo(cache, true);	// already locked
4827
4828		// enforce restrictions
4829		if (!kernel) {
4830			if ((area->protection & B_KERNEL_AREA) != 0)
4831				return B_NOT_ALLOWED;
4832			// TODO: Enforce all restrictions (team, etc.)!
4833		}
4834
4835		oldSize = area->Size();
4836		if (newSize == oldSize)
4837			return B_OK;
4838
4839		if (cache->type != CACHE_TYPE_RAM)
4840			return B_NOT_ALLOWED;
4841
4842		if (oldSize < newSize) {
4843			// We need to check if all areas of this cache can be resized.
4844			for (VMArea* current = cache->areas; current != NULL;
4845					current = current->cache_next) {
4846				if (!current->address_space->CanResizeArea(current, newSize))
4847					return B_ERROR;
4848				anyKernelArea
4849					|= current->address_space == VMAddressSpace::Kernel();
4850			}
4851		} else {
4852			// We're shrinking the areas, so we must make sure the affected
4853			// ranges are not wired.
4854			for (VMArea* current = cache->areas; current != NULL;
4855					current = current->cache_next) {
4856				anyKernelArea
4857					|= current->address_space == VMAddressSpace::Kernel();
4858
4859				if (wait_if_area_range_is_wired(current,
4860						current->Base() + newSize, oldSize - newSize, &locker,
4861						&cacheLocker)) {
4862					restart = true;
4863					break;
4864				}
4865			}
4866		}
4867	} while (restart);
4868
4869	// Okay, looks good so far, so let's do it
4870
4871	int priority = kernel && anyKernelArea
4872		? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER;
4873	uint32 allocationFlags = kernel && anyKernelArea
4874		? HEAP_DONT_WAIT_FOR_MEMORY | HEAP_DONT_LOCK_KERNEL_SPACE : 0;
4875
4876	if (oldSize < newSize) {
4877		// Growing the cache can fail, so we do it first.
4878		status = cache->Resize(cache->virtual_base + newSize, priority);
4879		if (status != B_OK)
4880			return status;
4881	}
4882
4883	for (VMArea* current = cache->areas; current != NULL;
4884			current = current->cache_next) {
4885		status = current->address_space->ResizeArea(current, newSize,
4886			allocationFlags);
4887		if (status != B_OK)
4888			break;
4889
4890		// We also need to unmap all pages beyond the new size, if the area has
4891		// shrunk
4892		if (newSize < oldSize) {
4893			VMCacheChainLocker cacheChainLocker(cache);
4894			cacheChainLocker.LockAllSourceCaches();
4895
4896			unmap_pages(current, current->Base() + newSize,
4897				oldSize - newSize);
4898
4899			cacheChainLocker.Unlock(cache);
4900		}
4901	}
4902
4903	if (status == B_OK) {
4904		// Shrink or grow individual page protections if in use.
4905		if (area->page_protections != NULL) {
4906			uint32 bytes = (newSize / B_PAGE_SIZE + 1) / 2;
4907			uint8* newProtections
4908				= (uint8*)realloc(area->page_protections, bytes);
4909			if (newProtections == NULL)
4910				status = B_NO_MEMORY;
4911			else {
4912				area->page_protections = newProtections;
4913
4914				if (oldSize < newSize) {
4915					// init the additional page protections to that of the area
4916					uint32 offset = (oldSize / B_PAGE_SIZE + 1) / 2;
4917					uint32 areaProtection = area->protection
4918						& (B_READ_AREA | B_WRITE_AREA | B_EXECUTE_AREA);
4919					memset(area->page_protections + offset,
4920						areaProtection | (areaProtection << 4), bytes - offset);
4921					if ((oldSize / B_PAGE_SIZE) % 2 != 0) {
4922						uint8& entry = area->page_protections[offset - 1];
4923						entry = (entry & 0x0f) | (areaProtection << 4);
4924					}
4925				}
4926			}
4927		}
4928	}
4929
4930	// shrinking the cache can't fail, so we do it now
4931	if (status == B_OK && newSize < oldSize)
4932		status = cache->Resize(cache->virtual_base + newSize, priority);
4933
4934	if (status != B_OK) {
4935		// Something failed -- resize the areas back to their original size.
4936		// This can fail, too, in which case we're seriously screwed.
4937		for (VMArea* current = cache->areas; current != NULL;
4938				current = current->cache_next) {
4939			if (current->address_space->ResizeArea(current, oldSize,
4940					allocationFlags) != B_OK) {
				panic("vm_resize_area(): Failed and unable to restore the "
					"original state.");
4943			}
4944		}
4945
4946		cache->Resize(cache->virtual_base + oldSize, priority);
4947	}
4948
4949	// TODO: we must honour the lock restrictions of this area
4950	return status;
4951}
4952
4953
4954status_t
4955vm_memset_physical(phys_addr_t address, int value, size_t length)
4956{
4957	return sPhysicalPageMapper->MemsetPhysical(address, value, length);
4958}
4959
4960
4961status_t
4962vm_memcpy_from_physical(void* to, phys_addr_t from, size_t length, bool user)
4963{
4964	return sPhysicalPageMapper->MemcpyFromPhysical(to, from, length, user);
4965}
4966
4967
4968status_t
4969vm_memcpy_to_physical(phys_addr_t to, const void* _from, size_t length,
4970	bool user)
4971{
4972	return sPhysicalPageMapper->MemcpyToPhysical(to, _from, length, user);
4973}
4974
4975
4976void
4977vm_memcpy_physical_page(phys_addr_t to, phys_addr_t from)
4978{
4979	return sPhysicalPageMapper->MemcpyPhysicalPage(to, from);
4980}
4981
4982
4983/*!	Copies a range of memory directly from/to a page that might not be mapped
4984	at the moment.
4985
	For \a unsafeMemory the current mapping (if any) is ignored. The function
4987	walks through the respective area's cache chain to find the physical page
4988	and copies from/to it directly.
4989	The memory range starting at \a unsafeMemory with a length of \a size bytes
4990	must not cross a page boundary.
4991
4992	\param teamID The team ID identifying the address space \a unsafeMemory is
4993		to be interpreted in. Ignored, if \a unsafeMemory is a kernel address
4994		(the kernel address space is assumed in this case). If \c B_CURRENT_TEAM
4995		is passed, the address space of the thread returned by
4996		debug_get_debugged_thread() is used.
4997	\param unsafeMemory The start of the unsafe memory range to be copied
4998		from/to.
4999	\param buffer A safely accessible kernel buffer to be copied from/to.
5000	\param size The number of bytes to be copied.
5001	\param copyToUnsafe If \c true, memory is copied from \a buffer to
5002		\a unsafeMemory, the other way around otherwise.
5003*/
5004status_t
5005vm_debug_copy_page_memory(team_id teamID, void* unsafeMemory, void* buffer,
5006	size_t size, bool copyToUnsafe)
5007{
5008	if (size > B_PAGE_SIZE || ROUNDDOWN((addr_t)unsafeMemory, B_PAGE_SIZE)
5009			!= ROUNDDOWN((addr_t)unsafeMemory + size - 1, B_PAGE_SIZE)) {
5010		return B_BAD_VALUE;
5011	}
5012
5013	// get the address space for the debugged thread
5014	VMAddressSpace* addressSpace;
5015	if (IS_KERNEL_ADDRESS(unsafeMemory)) {
5016		addressSpace = VMAddressSpace::Kernel();
5017	} else if (teamID == B_CURRENT_TEAM) {
5018		Thread* thread = debug_get_debugged_thread();
5019		if (thread == NULL || thread->team == NULL)
5020			return B_BAD_ADDRESS;
5021
5022		addressSpace = thread->team->address_space;
5023	} else
5024		addressSpace = VMAddressSpace::DebugGet(teamID);
5025
5026	if (addressSpace == NULL)
5027		return B_BAD_ADDRESS;
5028
5029	// get the area
5030	VMArea* area = addressSpace->LookupArea((addr_t)unsafeMemory);
5031	if (area == NULL)
5032		return B_BAD_ADDRESS;
5033
5034	// search the page
5035	off_t cacheOffset = (addr_t)unsafeMemory - area->Base()
5036		+ area->cache_offset;
5037	VMCache* cache = area->cache;
5038	vm_page* page = NULL;
5039	while (cache != NULL) {
5040		page = cache->DebugLookupPage(cacheOffset);
5041		if (page != NULL)
5042			break;
5043
5044		// Page not found in this cache -- if it is paged out, we must not try
5045		// to get it from lower caches.
5046		if (cache->DebugHasPage(cacheOffset))
5047			break;
5048
5049		cache = cache->source;
5050	}
5051
5052	if (page == NULL)
5053		return B_UNSUPPORTED;
5054
5055	// copy from/to physical memory
5056	phys_addr_t physicalAddress = page->physical_page_number * B_PAGE_SIZE
5057		+ (addr_t)unsafeMemory % B_PAGE_SIZE;
5058
5059	if (copyToUnsafe) {
5060		if (page->Cache() != area->cache)
5061			return B_UNSUPPORTED;
5062
5063		return vm_memcpy_to_physical(physicalAddress, buffer, size, false);
5064	}
5065
5066	return vm_memcpy_from_physical(buffer, physicalAddress, size, false);
5067}
5068
5069
5070//	#pragma mark - kernel public API
5071
5072
5073status_t
5074user_memcpy(void* to, const void* from, size_t size)
5075{
5076	// don't allow address overflows
5077	if ((addr_t)from + size < (addr_t)from || (addr_t)to + size < (addr_t)to)
5078		return B_BAD_ADDRESS;
5079
5080	if (arch_cpu_user_memcpy(to, from, size,
5081			&thread_get_current_thread()->fault_handler) < B_OK)
5082		return B_BAD_ADDRESS;
5083
5084	return B_OK;
5085}
5086
5087
5088/*!	\brief Copies at most (\a size - 1) characters from the string in \a from to
5089	the string in \a to, NULL-terminating the result.
5090
5091	\param to Pointer to the destination C-string.
5092	\param from Pointer to the source C-string.
5093	\param size Size in bytes of the string buffer pointed to by \a to.
5094
5095	\return strlen(\a from).
5096*/
5097ssize_t
5098user_strlcpy(char* to, const char* from, size_t size)
5099{
5100	if (to == NULL && size != 0)
5101		return B_BAD_VALUE;
5102	if (from == NULL)
5103		return B_BAD_ADDRESS;
5104
5105	// limit size to avoid address overflows
5106	size_t maxSize = std::min(size,
5107		~(addr_t)0 - std::max((addr_t)from, (addr_t)to) + 1);
5108		// NOTE: Since arch_cpu_user_strlcpy() determines the length of \a from,
5109		// the source address might still overflow.
5110
5111	ssize_t result = arch_cpu_user_strlcpy(to, from, maxSize,
5112		&thread_get_current_thread()->fault_handler);
5113
5114	// If we hit the address overflow boundary, fail.
5115	if (result < 0 || (result >= 0 && (size_t)result >= maxSize
5116			&& maxSize < size)) {
5117		return B_BAD_ADDRESS;
5118	}
5119
5120	return result;
5121}
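
/*	Usage sketch (editor's illustration): as with strlcpy(), the return value
	is the length of the source string, so truncation is detected by comparing
	it against the buffer size -- userString and the chosen error codes are
	placeholders:

		char name[B_OS_NAME_LENGTH];
		ssize_t length = user_strlcpy(name, userString, sizeof(name));
		if (length < 0)
			return B_BAD_ADDRESS;
		if ((size_t)length >= sizeof(name))
			return B_NAME_TOO_LONG;
*/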
5122
5123
5124status_t
5125user_memset(void* s, char c, size_t count)
5126{
5127	// don't allow address overflows
5128	if ((addr_t)s + count < (addr_t)s)
5129		return B_BAD_ADDRESS;
5130
5131	if (arch_cpu_user_memset(s, c, count,
5132			&thread_get_current_thread()->fault_handler) < B_OK)
5133		return B_BAD_ADDRESS;
5134
5135	return B_OK;
5136}
5137
5138
5139/*!	Wires a single page at the given address.
5140
5141	\param team The team whose address space the address belongs to. Supports
5142		also \c B_CURRENT_TEAM. If the given address is a kernel address, the
5143		parameter is ignored.
	\param address The virtual address to wire down. Does not need to
5145		be page aligned.
5146	\param writable If \c true the page shall be writable.
5147	\param info On success the info is filled in, among other things
5148		containing the physical address the given virtual one translates to.
5149	\return \c B_OK, when the page could be wired, another error code otherwise.
5150*/
5151status_t
5152vm_wire_page(team_id team, addr_t address, bool writable,
5153	VMPageWiringInfo* info)
5154{
5155	addr_t pageAddress = ROUNDDOWN((addr_t)address, B_PAGE_SIZE);
5156	info->range.SetTo(pageAddress, B_PAGE_SIZE, writable, false);
5157
5158	// compute the page protection that is required
5159	bool isUser = IS_USER_ADDRESS(address);
5160	uint32 requiredProtection = PAGE_PRESENT
5161		| B_KERNEL_READ_AREA | (isUser ? B_READ_AREA : 0);
5162	if (writable)
5163		requiredProtection |= B_KERNEL_WRITE_AREA | (isUser ? B_WRITE_AREA : 0);
5164
5165	// get and read lock the address space
5166	VMAddressSpace* addressSpace = NULL;
5167	if (isUser) {
5168		if (team == B_CURRENT_TEAM)
5169			addressSpace = VMAddressSpace::GetCurrent();
5170		else
5171			addressSpace = VMAddressSpace::Get(team);
5172	} else
5173		addressSpace = VMAddressSpace::GetKernel();
5174	if (addressSpace == NULL)
5175		return B_ERROR;
5176
5177	AddressSpaceReadLocker addressSpaceLocker(addressSpace, true);
5178
5179	VMTranslationMap* map = addressSpace->TranslationMap();
5180	status_t error = B_OK;
5181
5182	// get the area
5183	VMArea* area = addressSpace->LookupArea(pageAddress);
5184	if (area == NULL) {
5185		addressSpace->Put();
5186		return B_BAD_ADDRESS;
5187	}
5188
5189	// Lock the area's top cache. This is a requirement for VMArea::Wire().
5190	VMCacheChainLocker cacheChainLocker(vm_area_get_locked_cache(area));
5191
5192	// mark the area range wired
5193	area->Wire(&info->range);
5194
5195	// Lock the area's cache chain and the translation map. Needed to look
5196	// up the page and play with its wired count.
5197	cacheChainLocker.LockAllSourceCaches();
5198	map->Lock();
5199
5200	phys_addr_t physicalAddress;
5201	uint32 flags;
5202	vm_page* page;
5203	if (map->Query(pageAddress, &physicalAddress, &flags) == B_OK
5204		&& (flags & requiredProtection) == requiredProtection
5205		&& (page = vm_lookup_page(physicalAddress / B_PAGE_SIZE))
5206			!= NULL) {
5207		// Already mapped with the correct permissions -- just increment
5208		// the page's wired count.
5209		increment_page_wired_count(page);
5210
5211		map->Unlock();
5212		cacheChainLocker.Unlock();
5213		addressSpaceLocker.Unlock();
5214	} else {
5215		// Let vm_soft_fault() map the page for us, if possible. We need
5216		// to fully unlock to avoid deadlocks. Since we have already
5217		// wired the area itself, nothing disturbing will happen with it
5218		// in the meantime.
5219		map->Unlock();
5220		cacheChainLocker.Unlock();
5221		addressSpaceLocker.Unlock();
5222
5223		error = vm_soft_fault(addressSpace, pageAddress, writable, isUser,
5224			&page, &info->range);
5225
5226		if (error != B_OK) {
5227			// The page could not be mapped -- clean up.
5228			VMCache* cache = vm_area_get_locked_cache(area);
5229			area->Unwire(&info->range);
5230			cache->ReleaseRefAndUnlock();
5231			addressSpace->Put();
5232			return error;
5233		}
5234	}
5235
5236	info->physicalAddress
5237		= (phys_addr_t)page->physical_page_number * B_PAGE_SIZE
5238			+ address % B_PAGE_SIZE;
5239	info->page = page;
5240
5241	return B_OK;
5242}
5243
5244
5245/*!	Unwires a single page previously wired via vm_wire_page().
5246
5247	\param info The same object passed to vm_wire_page() before.
5248*/
5249void
5250vm_unwire_page(VMPageWiringInfo* info)
5251{
5252	// lock the address space
5253	VMArea* area = info->range.area;
5254	AddressSpaceReadLocker addressSpaceLocker(area->address_space, false);
5255		// takes over our reference
5256
5257	// lock the top cache
5258	VMCache* cache = vm_area_get_locked_cache(area);
5259	VMCacheChainLocker cacheChainLocker(cache);
5260
5261	if (info->page->Cache() != cache) {
5262		// The page is not in the top cache, so we lock the whole cache chain
5263		// before touching the page's wired count.
5264		cacheChainLocker.LockAllSourceCaches();
5265	}
5266
5267	decrement_page_wired_count(info->page);
5268
	// remove the wired range from the area
5270	area->Unwire(&info->range);
5271
5272	cacheChainLocker.Unlock();
5273}
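
/*	Usage sketch (editor's illustration): vm_wire_page() and vm_unwire_page()
	are used as a pair on the same VMPageWiringInfo object -- address is a
	placeholder:

		VMPageWiringInfo info;
		if (vm_wire_page(B_CURRENT_TEAM, address, true, &info) == B_OK) {
			// ... use info.physicalAddress while the page stays wired ...
			vm_unwire_page(&info);
		}
*/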
5274
5275
5276/*!	Wires down the given address range in the specified team's address space.
5277
5278	If successful the function
5279	- acquires a reference to the specified team's address space,
5280	- adds respective wired ranges to all areas that intersect with the given
5281	  address range,
5282	- makes sure all pages in the given address range are mapped with the
5283	  requested access permissions and increments their wired count.
5284
5285	It fails, when \a team doesn't specify a valid address space, when any part
5286	of the specified address range is not covered by areas, when the concerned
5287	areas don't allow mapping with the requested permissions, or when mapping
5288	failed for another reason.
5289
	When successful the call must be balanced by an unlock_memory_etc() call
	with the exact same parameters.
5292
	\param team Identifies the address space (via team ID). \c B_CURRENT_TEAM
		is supported.
5295	\param address The start of the address range to be wired.
5296	\param numBytes The size of the address range to be wired.
5297	\param flags Flags. Currently only \c B_READ_DEVICE is defined, which
5298		requests that the range must be wired writable ("read from device
5299		into memory").
5300	\return \c B_OK on success, another error code otherwise.
5301*/
5302status_t
5303lock_memory_etc(team_id team, void* address, size_t numBytes, uint32 flags)
5304{
5305	addr_t lockBaseAddress = ROUNDDOWN((addr_t)address, B_PAGE_SIZE);
5306	addr_t lockEndAddress = ROUNDUP((addr_t)address + numBytes, B_PAGE_SIZE);
5307
5308	// compute the page protection that is required
5309	bool isUser = IS_USER_ADDRESS(address);
5310	bool writable = (flags & B_READ_DEVICE) == 0;
5311	uint32 requiredProtection = PAGE_PRESENT
5312		| B_KERNEL_READ_AREA | (isUser ? B_READ_AREA : 0);
5313	if (writable)
5314		requiredProtection |= B_KERNEL_WRITE_AREA | (isUser ? B_WRITE_AREA : 0);
5315
5316	uint32 mallocFlags = isUser
5317		? 0 : HEAP_DONT_WAIT_FOR_MEMORY | HEAP_DONT_LOCK_KERNEL_SPACE;
5318
5319	// get and read lock the address space
5320	VMAddressSpace* addressSpace = NULL;
5321	if (isUser) {
5322		if (team == B_CURRENT_TEAM)
5323			addressSpace = VMAddressSpace::GetCurrent();
5324		else
5325			addressSpace = VMAddressSpace::Get(team);
5326	} else
5327		addressSpace = VMAddressSpace::GetKernel();
5328	if (addressSpace == NULL)
5329		return B_ERROR;
5330
5331	AddressSpaceReadLocker addressSpaceLocker(addressSpace, true);
5332
5333	VMTranslationMap* map = addressSpace->TranslationMap();
5334	status_t error = B_OK;
5335
5336	// iterate through all concerned areas
5337	addr_t nextAddress = lockBaseAddress;
5338	while (nextAddress != lockEndAddress) {
5339		// get the next area
5340		VMArea* area = addressSpace->LookupArea(nextAddress);
5341		if (area == NULL) {
5342			error = B_BAD_ADDRESS;
5343			break;
5344		}
5345
5346		addr_t areaStart = nextAddress;
5347		addr_t areaEnd = std::min(lockEndAddress, area->Base() + area->Size());
5348
5349		// allocate the wired range (do that before locking the cache to avoid
5350		// deadlocks)
5351		VMAreaWiredRange* range = new(malloc_flags(mallocFlags))
5352			VMAreaWiredRange(areaStart, areaEnd - areaStart, writable, true);
5353		if (range == NULL) {
5354			error = B_NO_MEMORY;
5355			break;
5356		}
5357
5358		// Lock the area's top cache. This is a requirement for VMArea::Wire().
5359		VMCacheChainLocker cacheChainLocker(vm_area_get_locked_cache(area));
5360
5361		// mark the area range wired
5362		area->Wire(range);
5363
5364		// Depending on the area cache type and the wiring, we may not need to
5365		// look at the individual pages.
5366		if (area->cache_type == CACHE_TYPE_NULL
5367			|| area->cache_type == CACHE_TYPE_DEVICE
5368			|| area->wiring == B_FULL_LOCK
5369			|| area->wiring == B_CONTIGUOUS) {
5370			nextAddress = areaEnd;
5371			continue;
5372		}
5373
5374		// Lock the area's cache chain and the translation map. Needed to look
5375		// up pages and play with their wired count.
5376		cacheChainLocker.LockAllSourceCaches();
5377		map->Lock();
5378
5379		// iterate through the pages and wire them
5380		for (; nextAddress != areaEnd; nextAddress += B_PAGE_SIZE) {
5381			phys_addr_t physicalAddress;
5382			uint32 flags;
5383
5384			vm_page* page;
5385			if (map->Query(nextAddress, &physicalAddress, &flags) == B_OK
5386				&& (flags & requiredProtection) == requiredProtection
5387				&& (page = vm_lookup_page(physicalAddress / B_PAGE_SIZE))
5388					!= NULL) {
5389				// Already mapped with the correct permissions -- just increment
5390				// the page's wired count.
5391				increment_page_wired_count(page);
5392			} else {
5393				// Let vm_soft_fault() map the page for us, if possible. We need
5394				// to fully unlock to avoid deadlocks. Since we have already
5395				// wired the area itself, nothing disturbing will happen with it
5396				// in the meantime.
5397				map->Unlock();
5398				cacheChainLocker.Unlock();
5399				addressSpaceLocker.Unlock();
5400
5401				error = vm_soft_fault(addressSpace, nextAddress, writable,
5402					isUser, &page, range);
5403
5404				addressSpaceLocker.Lock();
5405				cacheChainLocker.SetTo(vm_area_get_locked_cache(area));
5406				cacheChainLocker.LockAllSourceCaches();
5407				map->Lock();
5408			}
5409
5410			if (error != B_OK)
5411				break;
5412		}
5413
5414		map->Unlock();
5415
5416		if (error == B_OK) {
5417			cacheChainLocker.Unlock();
5418		} else {
5419			// An error occurred, so abort right here. If the current address
5420			// is the first in this area, unwire the area, since we won't get
5421			// to it when reverting what we've done so far.
5422			if (nextAddress == areaStart) {
5423				area->Unwire(range);
5424				cacheChainLocker.Unlock();
5425				range->~VMAreaWiredRange();
5426				free_etc(range, mallocFlags);
5427			} else
5428				cacheChainLocker.Unlock();
5429
5430			break;
5431		}
5432	}
5433
5434	if (error != B_OK) {
5435		// An error occurred, so unwire all that we've already wired. Note that
5436		// even if not a single page was wired, unlock_memory_etc() is called
5437		// to put the address space reference.
5438		addressSpaceLocker.Unlock();
5439		unlock_memory_etc(team, (void*)address, nextAddress - lockBaseAddress,
5440			flags);
5441	}
5442
5443	return error;
5444}
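

/*	Usage sketch (illustrative only): wiring a buffer around some operation and
	releasing it again with the exact same parameters, as the documentation
	above requires. The helper name and its arguments are assumptions of the
	example, not part of the kernel API.

		static status_t
		with_wired_buffer(void* buffer, size_t length, uint32 lockFlags)
		{
			status_t status = lock_memory_etc(B_CURRENT_TEAM, buffer, length,
				lockFlags);
			if (status != B_OK)
				return status;

			// ... operate on the wired range ...

			// balance the lock with the exact same parameters
			return unlock_memory_etc(B_CURRENT_TEAM, buffer, length,
				lockFlags);
		}
*/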
5445
5446
5447status_t
5448lock_memory(void* address, size_t numBytes, uint32 flags)
5449{
5450	return lock_memory_etc(B_CURRENT_TEAM, address, numBytes, flags);
5451}
5452
5453
5454/*!	Unwires an address range previously wired with lock_memory_etc().
5455
5456	Note that a call to this function must balance a previous lock_memory_etc()
5457	call with exactly the same parameters.
5458*/
5459status_t
5460unlock_memory_etc(team_id team, void* address, size_t numBytes, uint32 flags)
5461{
5462	addr_t lockBaseAddress = ROUNDDOWN((addr_t)address, B_PAGE_SIZE);
5463	addr_t lockEndAddress = ROUNDUP((addr_t)address + numBytes, B_PAGE_SIZE);
5464
5465	// compute the page protection that is required
5466	bool isUser = IS_USER_ADDRESS(address);
5467	bool writable = (flags & B_READ_DEVICE) == 0;
5468	uint32 requiredProtection = PAGE_PRESENT
5469		| B_KERNEL_READ_AREA | (isUser ? B_READ_AREA : 0);
5470	if (writable)
5471		requiredProtection |= B_KERNEL_WRITE_AREA | (isUser ? B_WRITE_AREA : 0);
5472
5473	uint32 mallocFlags = isUser
5474		? 0 : HEAP_DONT_WAIT_FOR_MEMORY | HEAP_DONT_LOCK_KERNEL_SPACE;
5475
5476	// get and read lock the address space
5477	VMAddressSpace* addressSpace = NULL;
5478	if (isUser) {
5479		if (team == B_CURRENT_TEAM)
5480			addressSpace = VMAddressSpace::GetCurrent();
5481		else
5482			addressSpace = VMAddressSpace::Get(team);
5483	} else
5484		addressSpace = VMAddressSpace::GetKernel();
5485	if (addressSpace == NULL)
5486		return B_ERROR;
5487
5488	AddressSpaceReadLocker addressSpaceLocker(addressSpace, true);
5489
5490	VMTranslationMap* map = addressSpace->TranslationMap();
5491	status_t error = B_OK;
5492
5493	// iterate through all concerned areas
5494	addr_t nextAddress = lockBaseAddress;
5495	while (nextAddress != lockEndAddress) {
5496		// get the next area
5497		VMArea* area = addressSpace->LookupArea(nextAddress);
5498		if (area == NULL) {
5499			error = B_BAD_ADDRESS;
5500			break;
5501		}
5502
5503		addr_t areaStart = nextAddress;
5504		addr_t areaEnd = std::min(lockEndAddress, area->Base() + area->Size());
5505
5506		// Lock the area's top cache. This is a requirement for
5507		// VMArea::Unwire().
5508		VMCacheChainLocker cacheChainLocker(vm_area_get_locked_cache(area));
5509
5510		// Depending on the area cache type and the wiring, we may not need to
5511		// look at the individual pages.
5512		if (area->cache_type == CACHE_TYPE_NULL
5513			|| area->cache_type == CACHE_TYPE_DEVICE
5514			|| area->wiring == B_FULL_LOCK
5515			|| area->wiring == B_CONTIGUOUS) {
5516			// unwire the range (to avoid deadlocks we delete the range after
5517			// unlocking the cache)
5518			nextAddress = areaEnd;
5519			VMAreaWiredRange* range = area->Unwire(areaStart,
5520				areaEnd - areaStart, writable);
5521			cacheChainLocker.Unlock();
5522			if (range != NULL) {
5523				range->~VMAreaWiredRange();
5524				free_etc(range, mallocFlags);
5525			}
5526			continue;
5527		}
5528
5529		// Lock the area's cache chain and the translation map. Needed to look
5530		// up pages and play with their wired count.
5531		cacheChainLocker.LockAllSourceCaches();
5532		map->Lock();
5533
5534		// iterate through the pages and unwire them
5535		for (; nextAddress != areaEnd; nextAddress += B_PAGE_SIZE) {
5536			phys_addr_t physicalAddress;
5537			uint32 flags;
5538
5539			vm_page* page;
5540			if (map->Query(nextAddress, &physicalAddress, &flags) == B_OK
5541				&& (flags & PAGE_PRESENT) != 0
5542				&& (page = vm_lookup_page(physicalAddress / B_PAGE_SIZE))
5543					!= NULL) {
5544				// The page is still mapped -- just decrement its wired
5545				// count.
5546				decrement_page_wired_count(page);
5547			} else {
5548				panic("unlock_memory_etc(): Failed to unwire page: address "
5549					"space %p, address: %#" B_PRIxADDR, addressSpace,
5550					nextAddress);
5551				error = B_BAD_VALUE;
5552				break;
5553			}
5554		}
5555
5556		map->Unlock();
5557
5558		// All pages are unwired. Remove the area's wired range as well (to
5559		// avoid deadlocks we delete the range after unlocking the cache).
5560		VMAreaWiredRange* range = area->Unwire(areaStart,
5561			areaEnd - areaStart, writable);
5562
5563		cacheChainLocker.Unlock();
5564
5565		if (range != NULL) {
5566			range->~VMAreaWiredRange();
5567			free_etc(range, mallocFlags);
5568		}
5569
5570		if (error != B_OK)
5571			break;
5572	}
5573
5574	// get rid of the address space reference
5575	addressSpace->Put();
5576
5577	return error;
5578}
5579
5580
5581status_t
5582unlock_memory(void* address, size_t numBytes, uint32 flags)
5583{
5584	return unlock_memory_etc(B_CURRENT_TEAM, address, numBytes, flags);
5585}
5586
5587
5588/*!	Similar to get_memory_map(), but also allows specifying the address space
5589	for the memory in question and has saner semantics.
5590	Returns \c B_OK when the complete range could be translated or
5591	\c B_BUFFER_OVERFLOW, if the provided array wasn't big enough. In either
5592	case the actual number of entries is written to \c *_numEntries. Any other
5593	error case indicates complete failure; \c *_numEntries will be set to \c 0
5594	in this case.
5595*/
5596status_t
5597get_memory_map_etc(team_id team, const void* address, size_t numBytes,
5598	physical_entry* table, uint32* _numEntries)
5599{
5600	uint32 numEntries = *_numEntries;
5601	*_numEntries = 0;
5602
5603	VMAddressSpace* addressSpace;
5604	addr_t virtualAddress = (addr_t)address;
5605	addr_t pageOffset = virtualAddress & (B_PAGE_SIZE - 1);
5606	phys_addr_t physicalAddress;
5607	status_t status = B_OK;
5608	int32 index = -1;
5609	addr_t offset = 0;
5610	bool interrupts = are_interrupts_enabled();
5611
5612	TRACE(("get_memory_map_etc(%" B_PRId32 ", %p, %lu bytes, %" B_PRIu32 " "
5613		"entries)\n", team, address, numBytes, numEntries));
5614
5615	if (numEntries == 0 || numBytes == 0)
5616		return B_BAD_VALUE;
5617
5618	// in which address space is the address to be found?
5619	if (IS_USER_ADDRESS(virtualAddress)) {
5620		if (team == B_CURRENT_TEAM)
5621			addressSpace = VMAddressSpace::GetCurrent();
5622		else
5623			addressSpace = VMAddressSpace::Get(team);
5624	} else
5625		addressSpace = VMAddressSpace::GetKernel();
5626
5627	if (addressSpace == NULL)
5628		return B_ERROR;
5629
5630	VMTranslationMap* map = addressSpace->TranslationMap();
5631
5632	if (interrupts)
5633		map->Lock();
5634
5635	while (offset < numBytes) {
5636		addr_t bytes = min_c(numBytes - offset, B_PAGE_SIZE);
5637		uint32 flags;
5638
5639		if (interrupts) {
5640			status = map->Query((addr_t)address + offset, &physicalAddress,
5641				&flags);
5642		} else {
5643			status = map->QueryInterrupt((addr_t)address + offset,
5644				&physicalAddress, &flags);
5645		}
5646		if (status < B_OK)
5647			break;
5648		if ((flags & PAGE_PRESENT) == 0) {
5649			panic("get_memory_map() called on unmapped memory!");
5650			return B_BAD_ADDRESS;
5651		}
5652
5653		if (index < 0 && pageOffset > 0) {
5654			physicalAddress += pageOffset;
5655			if (bytes > B_PAGE_SIZE - pageOffset)
5656				bytes = B_PAGE_SIZE - pageOffset;
5657		}
5658
5659		// need to switch to the next physical_entry?
5660		if (index < 0 || table[index].address
5661				!= physicalAddress - table[index].size) {
5662			if ((uint32)++index + 1 > numEntries) {
5663				// table too small
5664				break;
5665			}
5666			table[index].address = physicalAddress;
5667			table[index].size = bytes;
5668		} else {
5669			// page fits into the current entry
5670			table[index].size += bytes;
5671		}
5672
5673		offset += bytes;
5674	}
5675
5676	if (interrupts)
5677		map->Unlock();
5678
5679	if (status != B_OK)
5680		return status;
5681
5682	if ((uint32)index + 1 > numEntries) {
5683		*_numEntries = index;
5684		return B_BUFFER_OVERFLOW;
5685	}
5686
5687	*_numEntries = index + 1;
5688	return B_OK;
5689}
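

/*	Usage sketch (illustrative only): translating a virtual range of the
	current team into physical runs and coping with a table that may be too
	small. The helper name and the fixed table size are assumptions of the
	example.

		static void
		dump_physical_runs(const void* buffer, size_t length)
		{
			physical_entry table[8];
			uint32 numEntries = 8;
			status_t status = get_memory_map_etc(B_CURRENT_TEAM, buffer,
				length, table, &numEntries);
			if (status != B_OK && status != B_BUFFER_OVERFLOW)
				return;

			// numEntries now holds the number of valid entries; on
			// B_BUFFER_OVERFLOW only a prefix of the range was translated.
			for (uint32 i = 0; i < numEntries; i++) {
				dprintf("run %" B_PRIu32 ": %#" B_PRIx64 " + %" B_PRIu64
					" bytes\n", i, (uint64)table[i].address,
					(uint64)table[i].size);
			}
		}
*/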
5690
5691
5692/*!	According to the BeBook, this function should always succeed.
5693	This is no longer the case.
5694*/
5695extern "C" int32
5696__get_memory_map_haiku(const void* address, size_t numBytes,
5697	physical_entry* table, int32 numEntries)
5698{
5699	uint32 entriesRead = numEntries;
5700	status_t error = get_memory_map_etc(B_CURRENT_TEAM, address, numBytes,
5701		table, &entriesRead);
5702	if (error != B_OK)
5703		return error;
5704
5705	// close the entry list
5706
5707	// if it's only one entry, we will silently accept the missing ending
5708	if (numEntries == 1)
5709		return B_OK;
5710
5711	if (entriesRead + 1 > (uint32)numEntries)
5712		return B_BUFFER_OVERFLOW;
5713
5714	table[entriesRead].address = 0;
5715	table[entriesRead].size = 0;
5716
5717	return B_OK;
5718}
5719
5720
5721area_id
5722area_for(void* address)
5723{
5724	return vm_area_for((addr_t)address, true);
5725}
5726
5727
5728area_id
5729find_area(const char* name)
5730{
5731	return VMAreaHash::Find(name);
5732}
5733
5734
5735status_t
5736_get_area_info(area_id id, area_info* info, size_t size)
5737{
5738	if (size != sizeof(area_info) || info == NULL)
5739		return B_BAD_VALUE;
5740
5741	AddressSpaceReadLocker locker;
5742	VMArea* area;
5743	status_t status = locker.SetFromArea(id, area);
5744	if (status != B_OK)
5745		return status;
5746
5747	fill_area_info(area, info, size);
5748	return B_OK;
5749}
5750
5751
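/*!	Returns info for the next area of the given team's address space whose
	base address is greater than \a *cookie. The caller initializes
	\a *cookie to \c 0; on success the base address of the returned area is
	stored back into it, so that successive calls iterate through all areas.
	Returns \c B_ENTRY_NOT_FOUND once there are no further areas.
*/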
5752status_t
5753_get_next_area_info(team_id team, ssize_t* cookie, area_info* info, size_t size)
5754{
5755	addr_t nextBase = *(addr_t*)cookie;
5756
5757	// we're already through the list
5758	if (nextBase == (addr_t)-1)
5759		return B_ENTRY_NOT_FOUND;
5760
5761	if (team == B_CURRENT_TEAM)
5762		team = team_get_current_team_id();
5763
5764	AddressSpaceReadLocker locker(team);
5765	if (!locker.IsLocked())
5766		return B_BAD_TEAM_ID;
5767
5768	VMArea* area;
5769	for (VMAddressSpace::AreaIterator it
5770				= locker.AddressSpace()->GetAreaIterator();
5771			(area = it.Next()) != NULL;) {
5772		if (area->Base() > nextBase)
5773			break;
5774	}
5775
5776	if (area == NULL) {
5777		nextBase = (addr_t)-1;
5778		return B_ENTRY_NOT_FOUND;
5779	}
5780
5781	fill_area_info(area, info, size);
5782	*cookie = (ssize_t)(area->Base());
5783
5784	return B_OK;
5785}
5786
5787
5788status_t
5789set_area_protection(area_id area, uint32 newProtection)
5790{
5791	fix_protection(&newProtection);
5792
5793	return vm_set_area_protection(VMAddressSpace::KernelID(), area,
5794		newProtection, true);
5795}
5796
5797
5798status_t
5799resize_area(area_id areaID, size_t newSize)
5800{
5801	return vm_resize_area(areaID, newSize, true);
5802}
5803
5804
5805/*!	Transfers the specified area to a new team. The caller must be the owner
5806	of the area.
5807*/
5808area_id
5809transfer_area(area_id id, void** _address, uint32 addressSpec, team_id target,
5810	bool kernel)
5811{
5812	area_info info;
5813	status_t status = get_area_info(id, &info);
5814	if (status != B_OK)
5815		return status;
5816
5817	if (info.team != thread_get_current_thread()->team->id)
5818		return B_PERMISSION_DENIED;
5819
5820	area_id clonedArea = vm_clone_area(target, info.name, _address,
5821		addressSpec, info.protection, REGION_NO_PRIVATE_MAP, id, kernel);
5822	if (clonedArea < 0)
5823		return clonedArea;
5824
5825	status = vm_delete_area(info.team, id, kernel);
5826	if (status != B_OK) {
5827		vm_delete_area(target, clonedArea, kernel);
5828		return status;
5829	}
5830
5831	// TODO: The clonedArea is B_SHARED_AREA, which is not really desired.
5832
5833	return clonedArea;
5834}
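

/*	Usage sketch (illustrative only): a kernel caller handing one of the
	current team's areas over to another team, letting the kernel choose the
	address in the target address space. The helper name is an assumption of
	the example.

		static area_id
		give_area_to_team(area_id areaID, team_id targetTeam)
		{
			void* address = NULL;
			return transfer_area(areaID, &address, B_ANY_ADDRESS, targetTeam,
				true);
		}
*/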
5835
5836
5837extern "C" area_id
5838__map_physical_memory_haiku(const char* name, phys_addr_t physicalAddress,
5839	size_t numBytes, uint32 addressSpec, uint32 protection,
5840	void** _virtualAddress)
5841{
5842	if (!arch_vm_supports_protection(protection))
5843		return B_NOT_SUPPORTED;
5844
5845	fix_protection(&protection);
5846
5847	return vm_map_physical_memory(VMAddressSpace::KernelID(), name,
5848		_virtualAddress, addressSpec, numBytes, protection, physicalAddress,
5849		false);
5850}
5851
5852
5853area_id
5854clone_area(const char* name, void** _address, uint32 addressSpec,
5855	uint32 protection, area_id source)
5856{
5857	if ((protection & B_KERNEL_PROTECTION) == 0)
5858		protection |= B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA;
5859
5860	return vm_clone_area(VMAddressSpace::KernelID(), name, _address,
5861		addressSpec, protection, REGION_NO_PRIVATE_MAP, source, true);
5862}
5863
5864
5865area_id
5866create_area_etc(team_id team, const char* name, uint32 size, uint32 lock,
5867	uint32 protection, uint32 flags, uint32 guardSize,
5868	const virtual_address_restrictions* virtualAddressRestrictions,
5869	const physical_address_restrictions* physicalAddressRestrictions,
5870	void** _address)
5871{
5872	fix_protection(&protection);
5873
5874	return vm_create_anonymous_area(team, name, size, lock, protection, flags,
5875		guardSize, virtualAddressRestrictions, physicalAddressRestrictions,
5876		true, _address);
5877}
5878
5879
5880extern "C" area_id
5881__create_area_haiku(const char* name, void** _address, uint32 addressSpec,
5882	size_t size, uint32 lock, uint32 protection)
5883{
5884	fix_protection(&protection);
5885
5886	virtual_address_restrictions virtualRestrictions = {};
5887	virtualRestrictions.address = *_address;
5888	virtualRestrictions.address_specification = addressSpec;
5889	physical_address_restrictions physicalRestrictions = {};
5890	return vm_create_anonymous_area(VMAddressSpace::KernelID(), name, size,
5891		lock, protection, 0, 0, &virtualRestrictions, &physicalRestrictions,
5892		true, _address);
5893}
5894
5895
5896status_t
5897delete_area(area_id area)
5898{
5899	return vm_delete_area(VMAddressSpace::KernelID(), area, true);
5900}
5901
5902
5903//	#pragma mark - Userland syscalls
5904
5905
5906status_t
5907_user_reserve_address_range(addr_t* userAddress, uint32 addressSpec,
5908	addr_t size)
5909{
5910	// filter out some unavailable values (for userland)
5911	switch (addressSpec) {
5912		case B_ANY_KERNEL_ADDRESS:
5913		case B_ANY_KERNEL_BLOCK_ADDRESS:
5914			return B_BAD_VALUE;
5915	}
5916
5917	addr_t address;
5918
5919	if (!IS_USER_ADDRESS(userAddress)
5920		|| user_memcpy(&address, userAddress, sizeof(address)) != B_OK)
5921		return B_BAD_ADDRESS;
5922
5923	status_t status = vm_reserve_address_range(
5924		VMAddressSpace::CurrentID(), (void**)&address, addressSpec, size,
5925		RESERVED_AVOID_BASE);
5926	if (status != B_OK)
5927		return status;
5928
5929	if (user_memcpy(userAddress, &address, sizeof(address)) != B_OK) {
5930		vm_unreserve_address_range(VMAddressSpace::CurrentID(),
5931			(void*)address, size);
5932		return B_BAD_ADDRESS;
5933	}
5934
5935	return B_OK;
5936}
5937
5938
5939status_t
5940_user_unreserve_address_range(addr_t address, addr_t size)
5941{
5942	return vm_unreserve_address_range(VMAddressSpace::CurrentID(),
5943		(void*)address, size);
5944}
5945
5946
5947area_id
5948_user_area_for(void* address)
5949{
5950	return vm_area_for((addr_t)address, false);
5951}
5952
5953
5954area_id
5955_user_find_area(const char* userName)
5956{
5957	char name[B_OS_NAME_LENGTH];
5958
5959	if (!IS_USER_ADDRESS(userName)
5960		|| user_strlcpy(name, userName, B_OS_NAME_LENGTH) < B_OK)
5961		return B_BAD_ADDRESS;
5962
5963	return find_area(name);
5964}
5965
5966
5967status_t
5968_user_get_area_info(area_id area, area_info* userInfo)
5969{
5970	if (!IS_USER_ADDRESS(userInfo))
5971		return B_BAD_ADDRESS;
5972
5973	area_info info;
5974	status_t status = get_area_info(area, &info);
5975	if (status < B_OK)
5976		return status;
5977
5978	// TODO: do we want to prevent userland from seeing kernel protections?
5979	//info.protection &= B_USER_PROTECTION;
5980
5981	if (user_memcpy(userInfo, &info, sizeof(area_info)) < B_OK)
5982		return B_BAD_ADDRESS;
5983
5984	return status;
5985}
5986
5987
5988status_t
5989_user_get_next_area_info(team_id team, ssize_t* userCookie, area_info* userInfo)
5990{
5991	ssize_t cookie;
5992
5993	if (!IS_USER_ADDRESS(userCookie)
5994		|| !IS_USER_ADDRESS(userInfo)
5995		|| user_memcpy(&cookie, userCookie, sizeof(ssize_t)) < B_OK)
5996		return B_BAD_ADDRESS;
5997
5998	area_info info;
5999	status_t status = _get_next_area_info(team, &cookie, &info,
6000		sizeof(area_info));
6001	if (status != B_OK)
6002		return status;
6003
6004	//info.protection &= B_USER_PROTECTION;
6005
6006	if (user_memcpy(userCookie, &cookie, sizeof(ssize_t)) < B_OK
6007		|| user_memcpy(userInfo, &info, sizeof(area_info)) < B_OK)
6008		return B_BAD_ADDRESS;
6009
6010	return status;
6011}
6012
6013
6014status_t
6015_user_set_area_protection(area_id area, uint32 newProtection)
6016{
6017	if ((newProtection & ~B_USER_PROTECTION) != 0)
6018		return B_BAD_VALUE;
6019
6020	fix_protection(&newProtection);
6021
6022	return vm_set_area_protection(VMAddressSpace::CurrentID(), area,
6023		newProtection, false);
6024}
6025
6026
6027status_t
6028_user_resize_area(area_id area, size_t newSize)
6029{
6030	// TODO: Since we restrict deleting areas to those owned by the team,
6031	// we should also do that for resizing (check other functions, too).
6032	return vm_resize_area(area, newSize, false);
6033}
6034
6035
6036area_id
6037_user_transfer_area(area_id area, void** userAddress, uint32 addressSpec,
6038	team_id target)
6039{
6040	// filter out some unavailable values (for userland)
6041	switch (addressSpec) {
6042		case B_ANY_KERNEL_ADDRESS:
6043		case B_ANY_KERNEL_BLOCK_ADDRESS:
6044			return B_BAD_VALUE;
6045	}
6046
6047	void* address;
6048	if (!IS_USER_ADDRESS(userAddress)
6049		|| user_memcpy(&address, userAddress, sizeof(address)) < B_OK)
6050		return B_BAD_ADDRESS;
6051
6052	area_id newArea = transfer_area(area, &address, addressSpec, target, false);
6053	if (newArea < B_OK)
6054		return newArea;
6055
6056	if (user_memcpy(userAddress, &address, sizeof(address)) < B_OK)
6057		return B_BAD_ADDRESS;
6058
6059	return newArea;
6060}
6061
6062
6063area_id
6064_user_clone_area(const char* userName, void** userAddress, uint32 addressSpec,
6065	uint32 protection, area_id sourceArea)
6066{
6067	char name[B_OS_NAME_LENGTH];
6068	void* address;
6069
6070	// filter out some unavailable values (for userland)
6071	switch (addressSpec) {
6072		case B_ANY_KERNEL_ADDRESS:
6073		case B_ANY_KERNEL_BLOCK_ADDRESS:
6074			return B_BAD_VALUE;
6075	}
6076	if ((protection & ~B_USER_AREA_FLAGS) != 0)
6077		return B_BAD_VALUE;
6078
6079	if (!IS_USER_ADDRESS(userName)
6080		|| !IS_USER_ADDRESS(userAddress)
6081		|| user_strlcpy(name, userName, sizeof(name)) < B_OK
6082		|| user_memcpy(&address, userAddress, sizeof(address)) < B_OK)
6083		return B_BAD_ADDRESS;
6084
6085	fix_protection(&protection);
6086
6087	area_id clonedArea = vm_clone_area(VMAddressSpace::CurrentID(), name,
6088		&address, addressSpec, protection, REGION_NO_PRIVATE_MAP, sourceArea,
6089		false);
6090	if (clonedArea < B_OK)
6091		return clonedArea;
6092
6093	if (user_memcpy(userAddress, &address, sizeof(address)) < B_OK) {
6094		delete_area(clonedArea);
6095		return B_BAD_ADDRESS;
6096	}
6097
6098	return clonedArea;
6099}
6100
6101
6102area_id
6103_user_create_area(const char* userName, void** userAddress, uint32 addressSpec,
6104	size_t size, uint32 lock, uint32 protection)
6105{
6106	char name[B_OS_NAME_LENGTH];
6107	void* address;
6108
6109	// filter out some unavailable values (for userland)
6110	switch (addressSpec) {
6111		case B_ANY_KERNEL_ADDRESS:
6112		case B_ANY_KERNEL_BLOCK_ADDRESS:
6113			return B_BAD_VALUE;
6114	}
6115	if ((protection & ~B_USER_AREA_FLAGS) != 0)
6116		return B_BAD_VALUE;
6117
6118	if (!IS_USER_ADDRESS(userName)
6119		|| !IS_USER_ADDRESS(userAddress)
6120		|| user_strlcpy(name, userName, sizeof(name)) < B_OK
6121		|| user_memcpy(&address, userAddress, sizeof(address)) < B_OK)
6122		return B_BAD_ADDRESS;
6123
6124	if (addressSpec == B_EXACT_ADDRESS
6125		&& IS_KERNEL_ADDRESS(address))
6126		return B_BAD_VALUE;
6127
6128	fix_protection(&protection);
6129
6130	virtual_address_restrictions virtualRestrictions = {};
6131	virtualRestrictions.address = address;
6132	virtualRestrictions.address_specification = addressSpec;
6133	physical_address_restrictions physicalRestrictions = {};
6134	area_id area = vm_create_anonymous_area(VMAddressSpace::CurrentID(), name,
6135		size, lock, protection, 0, 0, &virtualRestrictions,
6136		&physicalRestrictions, false, &address);
6137
6138	if (area >= B_OK
6139		&& user_memcpy(userAddress, &address, sizeof(address)) < B_OK) {
6140		delete_area(area);
6141		return B_BAD_ADDRESS;
6142	}
6143
6144	return area;
6145}
6146
6147
6148status_t
6149_user_delete_area(area_id area)
6150{
6151	// Unlike the BeOS implementation, you can now only delete areas
6152	// that you have created yourself from userland.
6153	// The documentation for delete_area() explicitly states that this
6154	// will be restricted in the future, and so it now is.
6155	return vm_delete_area(VMAddressSpace::CurrentID(), area, false);
6156}
6157
6158
6159// TODO: create a BeOS style call for this!
6160
6161area_id
6162_user_map_file(const char* userName, void** userAddress, uint32 addressSpec,
6163	size_t size, uint32 protection, uint32 mapping, bool unmapAddressRange,
6164	int fd, off_t offset)
6165{
6166	char name[B_OS_NAME_LENGTH];
6167	void* address;
6168	area_id area;
6169
6170	if ((protection & ~B_USER_AREA_FLAGS) != 0)
6171		return B_BAD_VALUE;
6172
6173	fix_protection(&protection);
6174
6175	if (!IS_USER_ADDRESS(userName) || !IS_USER_ADDRESS(userAddress)
6176		|| user_strlcpy(name, userName, B_OS_NAME_LENGTH) < B_OK
6177		|| user_memcpy(&address, userAddress, sizeof(address)) < B_OK)
6178		return B_BAD_ADDRESS;
6179
6180	if (addressSpec == B_EXACT_ADDRESS) {
6181		if ((addr_t)address + size < (addr_t)address
6182				|| (addr_t)address % B_PAGE_SIZE != 0) {
6183			return B_BAD_VALUE;
6184		}
6185		if (!IS_USER_ADDRESS(address)
6186				|| !IS_USER_ADDRESS((addr_t)address + size)) {
6187			return B_BAD_ADDRESS;
6188		}
6189	}
6190
6191	area = _vm_map_file(VMAddressSpace::CurrentID(), name, &address,
6192		addressSpec, size, protection, mapping, unmapAddressRange, fd, offset,
6193		false);
6194	if (area < B_OK)
6195		return area;
6196
6197	if (user_memcpy(userAddress, &address, sizeof(address)) < B_OK)
6198		return B_BAD_ADDRESS;
6199
6200	return area;
6201}
6202
6203
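/*!	Unmaps the given page-aligned range of the current team's address space.
	If parts of the range are wired, the call waits until they have been
	unwired before unmapping.
*/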
6204status_t
6205_user_unmap_memory(void* _address, size_t size)
6206{
6207	addr_t address = (addr_t)_address;
6208
6209	// check params
6210	if (size == 0 || (addr_t)address + size < (addr_t)address
6211		|| (addr_t)address % B_PAGE_SIZE != 0) {
6212		return B_BAD_VALUE;
6213	}
6214
6215	if (!IS_USER_ADDRESS(address) || !IS_USER_ADDRESS((addr_t)address + size))
6216		return B_BAD_ADDRESS;
6217
6218	// Write lock the address space and ensure the address range is not wired.
6219	AddressSpaceWriteLocker locker;
6220	do {
6221		status_t status = locker.SetTo(team_get_current_team_id());
6222		if (status != B_OK)
6223			return status;
6224	} while (wait_if_address_range_is_wired(locker.AddressSpace(), address,
6225			size, &locker));
6226
6227	// unmap
6228	return unmap_address_range(locker.AddressSpace(), address, size, false);
6229}
6230
6231
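/*!	Sets the protection of the given page-aligned range of the current team's
	address space to \a protection. The range must be fully covered by areas
	that don't have \c B_KERNEL_AREA set; for areas whose protection differs
	from the requested one, per-page protections are set up and mapped pages
	are re-mapped (or unmapped) accordingly.
*/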
6232status_t
6233_user_set_memory_protection(void* _address, size_t size, uint32 protection)
6234{
6235	// check address range
6236	addr_t address = (addr_t)_address;
6237	size = PAGE_ALIGN(size);
6238
6239	if ((address % B_PAGE_SIZE) != 0)
6240		return B_BAD_VALUE;
6241	if ((addr_t)address + size < (addr_t)address || !IS_USER_ADDRESS(address)
6242		|| !IS_USER_ADDRESS((addr_t)address + size)) {
6243		// weird error code required by POSIX
6244		return ENOMEM;
6245	}
6246
6247	// extend and check protection
6248	if ((protection & ~B_USER_PROTECTION) != 0)
6249		return B_BAD_VALUE;
6250
6251	fix_protection(&protection);
6252
6253	// We need to write lock the address space, since we're going to play with
6254	// the areas. Also make sure that none of the areas is wired and that we're
6255	// actually allowed to change the protection.
6256	AddressSpaceWriteLocker locker;
6257
6258	bool restart;
6259	do {
6260		restart = false;
6261
6262		status_t status = locker.SetTo(team_get_current_team_id());
6263		if (status != B_OK)
6264			return status;
6265
6266		// First round: Check whether the whole range is covered by areas and
6267		// that we're allowed to modify them.
6268		addr_t currentAddress = address;
6269		size_t sizeLeft = size;
6270		while (sizeLeft > 0) {
6271			VMArea* area = locker.AddressSpace()->LookupArea(currentAddress);
6272			if (area == NULL)
6273				return B_NO_MEMORY;
6274
6275			if ((area->protection & B_KERNEL_AREA) != 0)
6276				return B_NOT_ALLOWED;
6277
6278			// TODO: For (shared) mapped files we should check whether the new
6279			// protections are compatible with the file permissions. We don't
6280			// have a way to do that yet, though.
6281
6282			addr_t offset = currentAddress - area->Base();
6283			size_t rangeSize = min_c(area->Size() - offset, sizeLeft);
6284
6285			AreaCacheLocker cacheLocker(area);
6286
6287			if (wait_if_area_range_is_wired(area, currentAddress, rangeSize,
6288					&locker, &cacheLocker)) {
6289				restart = true;
6290				break;
6291			}
6292
6293			cacheLocker.Unlock();
6294
6295			currentAddress += rangeSize;
6296			sizeLeft -= rangeSize;
6297		}
6298	} while (restart);
6299
6300	// Second round: If the protections differ from that of the area, create a
6301	// page protection array and re-map mapped pages.
6302	VMTranslationMap* map = locker.AddressSpace()->TranslationMap();
6303	addr_t currentAddress = address;
6304	size_t sizeLeft = size;
6305	while (sizeLeft > 0) {
6306		VMArea* area = locker.AddressSpace()->LookupArea(currentAddress);
6307		if (area == NULL)
6308			return B_NO_MEMORY;
6309
6310		addr_t offset = currentAddress - area->Base();
6311		size_t rangeSize = min_c(area->Size() - offset, sizeLeft);
6312
6313		currentAddress += rangeSize;
6314		sizeLeft -= rangeSize;
6315
6316		if (area->page_protections == NULL) {
6317			if (area->protection == protection)
6318				continue;
6319
6320			status_t status = allocate_area_page_protections(area);
6321			if (status != B_OK)
6322				return status;
6323		}
6324
6325		// We need to lock the complete cache chain, since we potentially unmap
6326		// pages of lower caches.
6327		VMCache* topCache = vm_area_get_locked_cache(area);
6328		VMCacheChainLocker cacheChainLocker(topCache);
6329		cacheChainLocker.LockAllSourceCaches();
6330
6331		for (addr_t pageAddress = area->Base() + offset;
6332				pageAddress < currentAddress; pageAddress += B_PAGE_SIZE) {
6333			map->Lock();
6334
6335			set_area_page_protection(area, pageAddress, protection);
6336
6337			phys_addr_t physicalAddress;
6338			uint32 flags;
6339
6340			status_t error = map->Query(pageAddress, &physicalAddress, &flags);
6341			if (error != B_OK || (flags & PAGE_PRESENT) == 0) {
6342				map->Unlock();
6343				continue;
6344			}
6345
6346			vm_page* page = vm_lookup_page(physicalAddress / B_PAGE_SIZE);
6347			if (page == NULL) {
6348				panic("area %p looking up page failed for pa %#" B_PRIxPHYSADDR
6349					"\n", area, physicalAddress);
6350				map->Unlock();
6351				return B_ERROR;
6352			}
6353
6354			// If the page is not in the topmost cache and write access is
6355			// requested, we have to unmap it. Otherwise we can re-map it with
6356			// the new protection.
6357			bool unmapPage = page->Cache() != topCache
6358				&& (protection & B_WRITE_AREA) != 0;
6359
6360			if (!unmapPage)
6361				map->ProtectPage(area, pageAddress, protection);
6362
6363			map->Unlock();
6364
6365			if (unmapPage) {
6366				DEBUG_PAGE_ACCESS_START(page);
6367				unmap_page(area, pageAddress);
6368				DEBUG_PAGE_ACCESS_END(page);
6369			}
6370		}
6371	}
6372
6373	return B_OK;
6374}
6375
6376
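/*!	Writes back modified pages of file (vnode) mappings in the given
	page-aligned range, synchronously for \c MS_SYNC or asynchronously for
	\c MS_ASYNC. Ranges backed by other cache types are silently skipped.
*/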
6377status_t
6378_user_sync_memory(void* _address, size_t size, uint32 flags)
6379{
6380	addr_t address = (addr_t)_address;
6381	size = PAGE_ALIGN(size);
6382
6383	// check params
6384	if ((address % B_PAGE_SIZE) != 0)
6385		return B_BAD_VALUE;
6386	if ((addr_t)address + size < (addr_t)address || !IS_USER_ADDRESS(address)
6387		|| !IS_USER_ADDRESS((addr_t)address + size)) {
6388		// weird error code required by POSIX
6389		return ENOMEM;
6390	}
6391
6392	bool writeSync = (flags & MS_SYNC) != 0;
6393	bool writeAsync = (flags & MS_ASYNC) != 0;
6394	if (writeSync && writeAsync)
6395		return B_BAD_VALUE;
6396
6397	if (size == 0 || (!writeSync && !writeAsync))
6398		return B_OK;
6399
6400	// iterate through the range and sync all concerned areas
6401	while (size > 0) {
6402		// read lock the address space
6403		AddressSpaceReadLocker locker;
6404		status_t error = locker.SetTo(team_get_current_team_id());
6405		if (error != B_OK)
6406			return error;
6407
6408		// get the first area
6409		VMArea* area = locker.AddressSpace()->LookupArea(address);
6410		if (area == NULL)
6411			return B_NO_MEMORY;
6412
6413		uint32 offset = address - area->Base();
6414		size_t rangeSize = min_c(area->Size() - offset, size);
6415		offset += area->cache_offset;
6416
6417		// lock the cache
6418		AreaCacheLocker cacheLocker(area);
6419		if (!cacheLocker)
6420			return B_BAD_VALUE;
6421		VMCache* cache = area->cache;
6422
6423		locker.Unlock();
6424
6425		uint32 firstPage = offset >> PAGE_SHIFT;
6426		uint32 endPage = firstPage + (rangeSize >> PAGE_SHIFT);
6427
6428		// write the pages
6429		if (cache->type == CACHE_TYPE_VNODE) {
6430			if (writeSync) {
6431				// synchronous
6432				error = vm_page_write_modified_page_range(cache, firstPage,
6433					endPage);
6434				if (error != B_OK)
6435					return error;
6436			} else {
6437				// asynchronous
6438				vm_page_schedule_write_page_range(cache, firstPage, endPage);
6439				// TODO: This is probably not quite what is supposed to happen.
6440				// Especially when a lot has to be written, it might take ages
6441				// until it really hits the disk.
6442			}
6443		}
6444
6445		address += rangeSize;
6446		size -= rangeSize;
6447	}
6448
6449	// NOTE: If I understand it correctly the purpose of MS_INVALIDATE is to
6450	// synchronize multiple mappings of the same file. In our VM they never get
6451	// out of sync, though, so we don't have to do anything.
6452
6453	return B_OK;
6454}
6455
6456
6457status_t
6458_user_memory_advice(void* address, size_t size, uint32 advice)
6459{
6460	// TODO: Implement!
6461	return B_OK;
6462}
6463
6464
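/*!	Returns the effective protection and wiring mode of the memory at
	\a address in the given team's address space, taking per-page protections
	into account if the area has them.
*/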
6465status_t
6466_user_get_memory_properties(team_id teamID, const void* address,
6467	uint32* _protected, uint32* _lock)
6468{
6469	if (!IS_USER_ADDRESS(_protected) || !IS_USER_ADDRESS(_lock))
6470		return B_BAD_ADDRESS;
6471
6472	AddressSpaceReadLocker locker;
6473	status_t error = locker.SetTo(teamID);
6474	if (error != B_OK)
6475		return error;
6476
6477	VMArea* area = locker.AddressSpace()->LookupArea((addr_t)address);
6478	if (area == NULL)
6479		return B_NO_MEMORY;
6480
6481
6482	uint32 protection = area->protection;
6483	if (area->page_protections != NULL)
6484		protection = get_area_page_protection(area, (addr_t)address);
6485
6486	uint32 wiring = area->wiring;
6487
6488	locker.Unlock();
6489
6490	error = user_memcpy(_protected, &protection, sizeof(protection));
6491	if (error != B_OK)
6492		return error;
6493
6494	error = user_memcpy(_lock, &wiring, sizeof(wiring));
6495
6496	return error;
6497}
6498
6499
6500// #pragma mark -- compatibility
6501
6502
6503#if defined(__INTEL__) && B_HAIKU_PHYSICAL_BITS > 32
6504
6505
6506struct physical_entry_beos {
6507	uint32	address;
6508	uint32	size;
6509};
6510
6511
6512/*!	The physical_entry structure has changed. We need to translate it to the
6513	old one.
6514*/
6515extern "C" int32
6516__get_memory_map_beos(const void* _address, size_t numBytes,
6517	physical_entry_beos* table, int32 numEntries)
6518{
6519	if (numEntries <= 0)
6520		return B_BAD_VALUE;
6521
6522	const uint8* address = (const uint8*)_address;
6523
6524	int32 count = 0;
6525	while (numBytes > 0 && count < numEntries) {
6526		physical_entry entry;
6527		status_t result = __get_memory_map_haiku(address, numBytes, &entry, 1);
6528		if (result < 0) {
6529			if (result != B_BUFFER_OVERFLOW)
6530				return result;
6531		}
6532
6533		if (entry.address >= (phys_addr_t)1 << 32) {
6534			panic("get_memory_map(): Address is greater than 4 GB!");
6535			return B_ERROR;
6536		}
6537
6538		table[count].address = entry.address;
6539		table[count++].size = entry.size;
6540
6541		address += entry.size;
6542		numBytes -= entry.size;
6543	}
6544
6545	// null-terminate the table, if possible
6546	if (count < numEntries) {
6547		table[count].address = 0;
6548		table[count].size = 0;
6549	}
6550
6551	return B_OK;
6552}
6553
6554
6555/*!	The type of the \a physicalAddress parameter has changed from void* to
6556	phys_addr_t.
6557*/
6558extern "C" area_id
6559__map_physical_memory_beos(const char* name, void* physicalAddress,
6560	size_t numBytes, uint32 addressSpec, uint32 protection,
6561	void** _virtualAddress)
6562{
6563	return __map_physical_memory_haiku(name, (addr_t)physicalAddress, numBytes,
6564		addressSpec, protection, _virtualAddress);
6565}
6566
6567
6568/*! The caller might not be able to deal with physical addresses >= 4 GB, so
6569	we meddle with the \a lock parameter to force 32 bit.
6570*/
6571extern "C" area_id
6572__create_area_beos(const char* name, void** _address, uint32 addressSpec,
6573	size_t size, uint32 lock, uint32 protection)
6574{
6575	switch (lock) {
6576		case B_NO_LOCK:
6577			break;
6578		case B_FULL_LOCK:
6579		case B_LAZY_LOCK:
6580			lock = B_32_BIT_FULL_LOCK;
6581			break;
6582		case B_CONTIGUOUS:
6583			lock = B_32_BIT_CONTIGUOUS;
6584			break;
6585	}
6586
6587	return __create_area_haiku(name, _address, addressSpec, size, lock,
6588		protection);
6589}
6590
6591
6592DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__get_memory_map_beos", "get_memory_map@",
6593	"BASE");
6594DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__map_physical_memory_beos",
6595	"map_physical_memory@", "BASE");
6596DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__create_area_beos", "create_area@",
6597	"BASE");
6598
6599DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__get_memory_map_haiku",
6600	"get_memory_map@@", "1_ALPHA3");
6601DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__map_physical_memory_haiku",
6602	"map_physical_memory@@", "1_ALPHA3");
6603DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__create_area_haiku", "create_area@@",
6604	"1_ALPHA3");
6605
6606
6607#else
6608
6609
6610DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__get_memory_map_haiku",
6611	"get_memory_map@@", "BASE");
6612DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__map_physical_memory_haiku",
6613	"map_physical_memory@@", "BASE");
6614DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__create_area_haiku", "create_area@@",
6615	"BASE");
6616
6617
6618#endif	// defined(__INTEL__) && B_HAIKU_PHYSICAL_BITS > 32
6619