1/*
2 * Copyright 2010-2011, Ingo Weinhold, ingo_weinhold@gmx.de.
3 * Copyright 2002-2010, Axel D��rfler, axeld@pinc-software.de.
4 * Distributed under the terms of the MIT License.
5 *
6 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
7 * Distributed under the terms of the NewOS License.
8 */
9
10
11#include <string.h>
12#include <stdlib.h>
13
14#include <algorithm>
15
16#include <KernelExport.h>
17#include <OS.h>
18
19#include <AutoDeleter.h>
20
21#include <arch/cpu.h>
22#include <arch/vm_translation_map.h>
23#include <block_cache.h>
24#include <boot/kernel_args.h>
25#include <condition_variable.h>
26#include <elf.h>
27#include <heap.h>
28#include <kernel.h>
29#include <low_resource_manager.h>
30#include <thread.h>
31#include <tracing.h>
32#include <util/AutoLock.h>
33#include <vfs.h>
34#include <vm/vm.h>
35#include <vm/vm_priv.h>
36#include <vm/vm_page.h>
37#include <vm/VMAddressSpace.h>
38#include <vm/VMArea.h>
39#include <vm/VMCache.h>
40
41#include "IORequest.h"
42#include "PageCacheLocker.h"
43#include "VMAnonymousCache.h"
44#include "VMPageQueue.h"
45
46
47//#define TRACE_VM_PAGE
48#ifdef TRACE_VM_PAGE
49#	define TRACE(x) dprintf x
50#else
51#	define TRACE(x) ;
52#endif
53
54//#define TRACE_VM_DAEMONS
55#ifdef TRACE_VM_DAEMONS
56#define TRACE_DAEMON(x...) dprintf(x)
57#else
58#define TRACE_DAEMON(x...) do {} while (false)
59#endif
60
61//#define TRACK_PAGE_USAGE_STATS	1
62
63#define PAGE_ASSERT(page, condition)	\
64	ASSERT_PRINT((condition), "page: %p", (page))
65
66#define SCRUB_SIZE 16
67	// this many pages will be cleared at once in the page scrubber thread
68
69#define MAX_PAGE_WRITER_IO_PRIORITY				B_URGENT_DISPLAY_PRIORITY
70	// maximum I/O priority of the page writer
71#define MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD	10000
72	// the maximum I/O priority shall be reached when this many pages need to
73	// be written
74
75
76// The page reserve an allocation of the certain priority must not touch.
77static const size_t kPageReserveForPriority[] = {
78	VM_PAGE_RESERVE_USER,		// user
79	VM_PAGE_RESERVE_SYSTEM,		// system
80	0							// VIP
81};
82
83// Minimum number of free pages the page daemon will try to achieve.
84static uint32 sFreePagesTarget;
85static uint32 sFreeOrCachedPagesTarget;
86static uint32 sInactivePagesTarget;
87
88// Wait interval between page daemon runs.
89static const bigtime_t kIdleScanWaitInterval = 1000000LL;	// 1 sec
90static const bigtime_t kBusyScanWaitInterval = 500000LL;	// 0.5 sec
91
92// Number of idle runs after which we want to have processed the full active
93// queue.
94static const uint32 kIdleRunsForFullQueue = 20;
95
96// Maximum limit for the vm_page::usage_count.
97static const int32 kPageUsageMax = 64;
98// vm_page::usage_count buff an accessed page receives in a scan.
99static const int32 kPageUsageAdvance = 3;
100// vm_page::usage_count debuff an unaccessed page receives in a scan.
101static const int32 kPageUsageDecline = 1;
102
103int32 gMappedPagesCount;
104
105static VMPageQueue sPageQueues[PAGE_STATE_COUNT];
106
107static VMPageQueue& sFreePageQueue = sPageQueues[PAGE_STATE_FREE];
108static VMPageQueue& sClearPageQueue = sPageQueues[PAGE_STATE_CLEAR];
109static VMPageQueue& sModifiedPageQueue = sPageQueues[PAGE_STATE_MODIFIED];
110static VMPageQueue& sInactivePageQueue = sPageQueues[PAGE_STATE_INACTIVE];
111static VMPageQueue& sActivePageQueue = sPageQueues[PAGE_STATE_ACTIVE];
112static VMPageQueue& sCachedPageQueue = sPageQueues[PAGE_STATE_CACHED];
113
114static vm_page *sPages;
115static page_num_t sPhysicalPageOffset;
116static page_num_t sNumPages;
117static page_num_t sNonExistingPages;
118	// pages in the sPages array that aren't backed by physical memory
119static uint64 sIgnoredPages;
120	// pages of physical memory ignored by the boot loader (and thus not
121	// available here)
122static vint32 sUnreservedFreePages;
123static vint32 sUnsatisfiedPageReservations;
124static vint32 sModifiedTemporaryPages;
125
126static ConditionVariable sFreePageCondition;
127static mutex sPageDeficitLock = MUTEX_INITIALIZER("page deficit");
128
129// This lock must be used whenever the free or clear page queues are changed.
130// If you need to work on both queues at the same time, you need to hold a write
131// lock, otherwise, a read lock suffices (each queue still has a spinlock to
132// guard against concurrent changes).
133static rw_lock sFreePageQueuesLock
134	= RW_LOCK_INITIALIZER("free/clear page queues");
135
136#ifdef TRACK_PAGE_USAGE_STATS
137static page_num_t sPageUsageArrays[512];
138static page_num_t* sPageUsage = sPageUsageArrays;
139static page_num_t sPageUsagePageCount;
140static page_num_t* sNextPageUsage = sPageUsageArrays + 256;
141static page_num_t sNextPageUsagePageCount;
142#endif
143
144
145#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
146
147struct caller_info {
148	addr_t		caller;
149	size_t		count;
150};
151
152static const int32 kCallerInfoTableSize = 1024;
153static caller_info sCallerInfoTable[kCallerInfoTableSize];
154static int32 sCallerInfoCount = 0;
155
156static caller_info* get_caller_info(addr_t caller);
157
158
159RANGE_MARKER_FUNCTION_PROTOTYPES(vm_page)
160
161static const addr_t kVMPageCodeAddressRange[] = {
162	RANGE_MARKER_FUNCTION_ADDRESS_RANGE(vm_page)
163};
164
165#endif
166
167
168RANGE_MARKER_FUNCTION_BEGIN(vm_page)
169
170
171struct page_stats {
172	int32	totalFreePages;
173	int32	unsatisfiedReservations;
174	int32	cachedPages;
175};
176
177
178struct PageReservationWaiter
179		: public DoublyLinkedListLinkImpl<PageReservationWaiter> {
180	Thread*	thread;
181	uint32	dontTouch;		// reserve not to touch
182	uint32	missing;		// pages missing for the reservation
183	int32	threadPriority;
184
185	bool operator<(const PageReservationWaiter& other) const
186	{
187		// Implies an order by descending VM priority (ascending dontTouch)
188		// and (secondarily) descending thread priority.
189		if (dontTouch != other.dontTouch)
190			return dontTouch < other.dontTouch;
191		return threadPriority > other.threadPriority;
192	}
193};
194
195typedef DoublyLinkedList<PageReservationWaiter> PageReservationWaiterList;
196static PageReservationWaiterList sPageReservationWaiters;
197
198
199struct DaemonCondition {
200	void Init(const char* name)
201	{
202		mutex_init(&fLock, "daemon condition");
203		fCondition.Init(this, name);
204		fActivated = false;
205	}
206
207	bool Lock()
208	{
209		return mutex_lock(&fLock) == B_OK;
210	}
211
212	void Unlock()
213	{
214		mutex_unlock(&fLock);
215	}
216
217	bool Wait(bigtime_t timeout, bool clearActivated)
218	{
219		MutexLocker locker(fLock);
220		if (clearActivated)
221			fActivated = false;
222		else if (fActivated)
223			return true;
224
225		ConditionVariableEntry entry;
226		fCondition.Add(&entry);
227
228		locker.Unlock();
229
230		return entry.Wait(B_RELATIVE_TIMEOUT, timeout) == B_OK;
231	}
232
233	void WakeUp()
234	{
235		if (fActivated)
236			return;
237
238		MutexLocker locker(fLock);
239		fActivated = true;
240		fCondition.NotifyOne();
241	}
242
243	void ClearActivated()
244	{
245		MutexLocker locker(fLock);
246		fActivated = false;
247	}
248
249private:
250	mutex				fLock;
251	ConditionVariable	fCondition;
252	bool				fActivated;
253};
254
255
256static DaemonCondition sPageWriterCondition;
257static DaemonCondition sPageDaemonCondition;
258
259
260#if PAGE_ALLOCATION_TRACING
261
262namespace PageAllocationTracing {
263
264class ReservePages : public AbstractTraceEntry {
265public:
266	ReservePages(uint32 count)
267		:
268		fCount(count)
269	{
270		Initialized();
271	}
272
273	virtual void AddDump(TraceOutput& out)
274	{
275		out.Print("page reserve:   %lu", fCount);
276	}
277
278private:
279	uint32		fCount;
280};
281
282
283class UnreservePages : public AbstractTraceEntry {
284public:
285	UnreservePages(uint32 count)
286		:
287		fCount(count)
288	{
289		Initialized();
290	}
291
292	virtual void AddDump(TraceOutput& out)
293	{
294		out.Print("page unreserve: %lu", fCount);
295	}
296
297private:
298	uint32		fCount;
299};
300
301
302class AllocatePage
303	: public TRACE_ENTRY_SELECTOR(PAGE_ALLOCATION_TRACING_STACK_TRACE) {
304public:
305	AllocatePage(page_num_t pageNumber)
306		:
307		TraceEntryBase(PAGE_ALLOCATION_TRACING_STACK_TRACE, 0, true),
308		fPageNumber(pageNumber)
309	{
310		Initialized();
311	}
312
313	virtual void AddDump(TraceOutput& out)
314	{
315		out.Print("page alloc: %#" B_PRIxPHYSADDR, fPageNumber);
316	}
317
318private:
319	page_num_t	fPageNumber;
320};
321
322
323class AllocatePageRun
324	: public TRACE_ENTRY_SELECTOR(PAGE_ALLOCATION_TRACING_STACK_TRACE) {
325public:
326	AllocatePageRun(page_num_t startPage, uint32 length)
327		:
328		TraceEntryBase(PAGE_ALLOCATION_TRACING_STACK_TRACE, 0, true),
329		fStartPage(startPage),
330		fLength(length)
331	{
332		Initialized();
333	}
334
335	virtual void AddDump(TraceOutput& out)
336	{
337		out.Print("page alloc run: start %#" B_PRIxPHYSADDR " length: %"
338			B_PRIu32, fStartPage, fLength);
339	}
340
341private:
342	page_num_t	fStartPage;
343	uint32		fLength;
344};
345
346
347class FreePage
348	: public TRACE_ENTRY_SELECTOR(PAGE_ALLOCATION_TRACING_STACK_TRACE) {
349public:
350	FreePage(page_num_t pageNumber)
351		:
352		TraceEntryBase(PAGE_ALLOCATION_TRACING_STACK_TRACE, 0, true),
353		fPageNumber(pageNumber)
354	{
355		Initialized();
356	}
357
358	virtual void AddDump(TraceOutput& out)
359	{
360		out.Print("page free: %#" B_PRIxPHYSADDR, fPageNumber);
361	}
362
363private:
364	page_num_t	fPageNumber;
365};
366
367
368class ScrubbingPages : public AbstractTraceEntry {
369public:
370	ScrubbingPages(uint32 count)
371		:
372		fCount(count)
373	{
374		Initialized();
375	}
376
377	virtual void AddDump(TraceOutput& out)
378	{
379		out.Print("page scrubbing: %lu", fCount);
380	}
381
382private:
383	uint32		fCount;
384};
385
386
387class ScrubbedPages : public AbstractTraceEntry {
388public:
389	ScrubbedPages(uint32 count)
390		:
391		fCount(count)
392	{
393		Initialized();
394	}
395
396	virtual void AddDump(TraceOutput& out)
397	{
398		out.Print("page scrubbed:  %lu", fCount);
399	}
400
401private:
402	uint32		fCount;
403};
404
405
406class StolenPage : public AbstractTraceEntry {
407public:
408	StolenPage()
409	{
410		Initialized();
411	}
412
413	virtual void AddDump(TraceOutput& out)
414	{
415		out.Print("page stolen");
416	}
417};
418
419}	// namespace PageAllocationTracing
420
421#	define TA(x)	new(std::nothrow) PageAllocationTracing::x
422
423#else
424#	define TA(x)
425#endif	// PAGE_ALLOCATION_TRACING
426
427
428#if PAGE_DAEMON_TRACING
429
430namespace PageDaemonTracing {
431
432class ActivatePage : public AbstractTraceEntry {
433	public:
434		ActivatePage(vm_page* page)
435			:
436			fCache(page->cache),
437			fPage(page)
438		{
439			Initialized();
440		}
441
442		virtual void AddDump(TraceOutput& out)
443		{
444			out.Print("page activated:   %p, cache: %p", fPage, fCache);
445		}
446
447	private:
448		VMCache*	fCache;
449		vm_page*	fPage;
450};
451
452
453class DeactivatePage : public AbstractTraceEntry {
454	public:
455		DeactivatePage(vm_page* page)
456			:
457			fCache(page->cache),
458			fPage(page)
459		{
460			Initialized();
461		}
462
463		virtual void AddDump(TraceOutput& out)
464		{
465			out.Print("page deactivated: %p, cache: %p", fPage, fCache);
466		}
467
468	private:
469		VMCache*	fCache;
470		vm_page*	fPage;
471};
472
473
474class FreedPageSwap : public AbstractTraceEntry {
475	public:
476		FreedPageSwap(vm_page* page)
477			:
478			fCache(page->cache),
479			fPage(page)
480		{
481			Initialized();
482		}
483
484		virtual void AddDump(TraceOutput& out)
485		{
486			out.Print("page swap freed:  %p, cache: %p", fPage, fCache);
487		}
488
489	private:
490		VMCache*	fCache;
491		vm_page*	fPage;
492};
493
494}	// namespace PageDaemonTracing
495
496#	define TD(x)	new(std::nothrow) PageDaemonTracing::x
497
498#else
499#	define TD(x)
500#endif	// PAGE_DAEMON_TRACING
501
502
503#if PAGE_WRITER_TRACING
504
505namespace PageWriterTracing {
506
507class WritePage : public AbstractTraceEntry {
508	public:
509		WritePage(vm_page* page)
510			:
511			fCache(page->Cache()),
512			fPage(page)
513		{
514			Initialized();
515		}
516
517		virtual void AddDump(TraceOutput& out)
518		{
519			out.Print("page write: %p, cache: %p", fPage, fCache);
520		}
521
522	private:
523		VMCache*	fCache;
524		vm_page*	fPage;
525};
526
527}	// namespace PageWriterTracing
528
529#	define TPW(x)	new(std::nothrow) PageWriterTracing::x
530
531#else
532#	define TPW(x)
533#endif	// PAGE_WRITER_TRACING
534
535
536#if PAGE_STATE_TRACING
537
538namespace PageStateTracing {
539
540class SetPageState : public AbstractTraceEntry {
541	public:
542		SetPageState(vm_page* page, uint8 newState)
543			:
544			fPage(page),
545			fOldState(page->State()),
546			fNewState(newState),
547			fBusy(page->busy),
548			fWired(page->WiredCount() > 0),
549			fMapped(!page->mappings.IsEmpty()),
550			fAccessed(page->accessed),
551			fModified(page->modified)
552		{
553#if PAGE_STATE_TRACING_STACK_TRACE
554			fStackTrace = capture_tracing_stack_trace(
555				PAGE_STATE_TRACING_STACK_TRACE, 0, true);
556				// Don't capture userland stack trace to avoid potential
557				// deadlocks.
558#endif
559			Initialized();
560		}
561
562#if PAGE_STATE_TRACING_STACK_TRACE
563		virtual void DumpStackTrace(TraceOutput& out)
564		{
565			out.PrintStackTrace(fStackTrace);
566		}
567#endif
568
569		virtual void AddDump(TraceOutput& out)
570		{
571			out.Print("page set state: %p (%c%c%c%c%c): %s -> %s", fPage,
572				fBusy ? 'b' : '-',
573				fWired ? 'w' : '-',
574				fMapped ? 'm' : '-',
575				fAccessed ? 'a' : '-',
576				fModified ? 'm' : '-',
577				page_state_to_string(fOldState),
578				page_state_to_string(fNewState));
579		}
580
581	private:
582		vm_page*	fPage;
583#if PAGE_STATE_TRACING_STACK_TRACE
584		tracing_stack_trace* fStackTrace;
585#endif
586		uint8		fOldState;
587		uint8		fNewState;
588		bool		fBusy : 1;
589		bool		fWired : 1;
590		bool		fMapped : 1;
591		bool		fAccessed : 1;
592		bool		fModified : 1;
593};
594
595}	// namespace PageStateTracing
596
597#	define TPS(x)	new(std::nothrow) PageStateTracing::x
598
599#else
600#	define TPS(x)
601#endif	// PAGE_STATE_TRACING
602
603
604#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
605
606namespace BKernel {
607
608class AllocationTrackingCallback {
609public:
610	virtual						~AllocationTrackingCallback();
611
612	virtual	bool				ProcessTrackingInfo(
613									AllocationTrackingInfo* info,
614									page_num_t pageNumber) = 0;
615};
616
617}
618
619using BKernel::AllocationTrackingCallback;
620
621
622class AllocationCollectorCallback : public AllocationTrackingCallback {
623public:
624	AllocationCollectorCallback(bool resetInfos)
625		:
626		fResetInfos(resetInfos)
627	{
628	}
629
630	virtual bool ProcessTrackingInfo(AllocationTrackingInfo* info,
631		page_num_t pageNumber)
632	{
633		if (!info->IsInitialized())
634			return true;
635
636		addr_t caller = 0;
637		AbstractTraceEntryWithStackTrace* traceEntry = info->TraceEntry();
638
639		if (traceEntry != NULL && info->IsTraceEntryValid()) {
640			caller = tracing_find_caller_in_stack_trace(
641				traceEntry->StackTrace(), kVMPageCodeAddressRange, 1);
642		}
643
644		caller_info* callerInfo = get_caller_info(caller);
645		if (callerInfo == NULL) {
646			kprintf("out of space for caller infos\n");
647			return false;
648		}
649
650		callerInfo->count++;
651
652		if (fResetInfos)
653			info->Clear();
654
655		return true;
656	}
657
658private:
659	bool	fResetInfos;
660};
661
662
663class AllocationInfoPrinterCallback : public AllocationTrackingCallback {
664public:
665	AllocationInfoPrinterCallback(bool printStackTrace, page_num_t pageFilter,
666		team_id teamFilter, thread_id threadFilter)
667		:
668		fPrintStackTrace(printStackTrace),
669		fPageFilter(pageFilter),
670		fTeamFilter(teamFilter),
671		fThreadFilter(threadFilter)
672	{
673	}
674
675	virtual bool ProcessTrackingInfo(AllocationTrackingInfo* info,
676		page_num_t pageNumber)
677	{
678		if (!info->IsInitialized())
679			return true;
680
681		if (fPageFilter != 0 && pageNumber != fPageFilter)
682			return true;
683
684		AbstractTraceEntryWithStackTrace* traceEntry = info->TraceEntry();
685		if (traceEntry != NULL && !info->IsTraceEntryValid())
686			traceEntry = NULL;
687
688		if (traceEntry != NULL) {
689			if (fTeamFilter != -1 && traceEntry->TeamID() != fTeamFilter)
690				return true;
691			if (fThreadFilter != -1 && traceEntry->ThreadID() != fThreadFilter)
692				return true;
693		} else {
694			// we need the info if we have filters set
695			if (fTeamFilter != -1 || fThreadFilter != -1)
696				return true;
697		}
698
699		kprintf("page number %#" B_PRIxPHYSADDR, pageNumber);
700
701		if (traceEntry != NULL) {
702			kprintf(", team: %" B_PRId32 ", thread %" B_PRId32
703				", time %" B_PRId64 "\n", traceEntry->TeamID(),
704				traceEntry->ThreadID(), traceEntry->Time());
705
706			if (fPrintStackTrace)
707				tracing_print_stack_trace(traceEntry->StackTrace());
708		} else
709			kprintf("\n");
710
711		return true;
712	}
713
714private:
715	bool		fPrintStackTrace;
716	page_num_t	fPageFilter;
717	team_id		fTeamFilter;
718	thread_id	fThreadFilter;
719};
720
721
722class AllocationDetailPrinterCallback : public AllocationTrackingCallback {
723public:
724	AllocationDetailPrinterCallback(addr_t caller)
725		:
726		fCaller(caller)
727	{
728	}
729
730	virtual bool ProcessTrackingInfo(AllocationTrackingInfo* info,
731		page_num_t pageNumber)
732	{
733		if (!info->IsInitialized())
734			return true;
735
736		addr_t caller = 0;
737		AbstractTraceEntryWithStackTrace* traceEntry = info->TraceEntry();
738		if (traceEntry != NULL && !info->IsTraceEntryValid())
739			traceEntry = NULL;
740
741		if (traceEntry != NULL) {
742			caller = tracing_find_caller_in_stack_trace(
743				traceEntry->StackTrace(), kVMPageCodeAddressRange, 1);
744		}
745
746		if (caller != fCaller)
747			return true;
748
749		kprintf("page %#" B_PRIxPHYSADDR "\n", pageNumber);
750		if (traceEntry != NULL)
751			tracing_print_stack_trace(traceEntry->StackTrace());
752
753		return true;
754	}
755
756private:
757	addr_t	fCaller;
758};
759
760#endif	// VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
761
762
763static int
764find_page(int argc, char **argv)
765{
766	struct vm_page *page;
767	addr_t address;
768	int32 index = 1;
769	int i;
770
771	struct {
772		const char*	name;
773		VMPageQueue*	queue;
774	} pageQueueInfos[] = {
775		{ "free",		&sFreePageQueue },
776		{ "clear",		&sClearPageQueue },
777		{ "modified",	&sModifiedPageQueue },
778		{ "active",		&sActivePageQueue },
779		{ "inactive",	&sInactivePageQueue },
780		{ "cached",		&sCachedPageQueue },
781		{ NULL, NULL }
782	};
783
784	if (argc < 2
785		|| strlen(argv[index]) <= 2
786		|| argv[index][0] != '0'
787		|| argv[index][1] != 'x') {
788		kprintf("usage: find_page <address>\n");
789		return 0;
790	}
791
792	address = strtoul(argv[index], NULL, 0);
793	page = (vm_page*)address;
794
795	for (i = 0; pageQueueInfos[i].name; i++) {
796		VMPageQueue::Iterator it = pageQueueInfos[i].queue->GetIterator();
797		while (vm_page* p = it.Next()) {
798			if (p == page) {
799				kprintf("found page %p in queue %p (%s)\n", page,
800					pageQueueInfos[i].queue, pageQueueInfos[i].name);
801				return 0;
802			}
803		}
804	}
805
806	kprintf("page %p isn't in any queue\n", page);
807
808	return 0;
809}
810
811
812const char *
813page_state_to_string(int state)
814{
815	switch(state) {
816		case PAGE_STATE_ACTIVE:
817			return "active";
818		case PAGE_STATE_INACTIVE:
819			return "inactive";
820		case PAGE_STATE_MODIFIED:
821			return "modified";
822		case PAGE_STATE_CACHED:
823			return "cached";
824		case PAGE_STATE_FREE:
825			return "free";
826		case PAGE_STATE_CLEAR:
827			return "clear";
828		case PAGE_STATE_WIRED:
829			return "wired";
830		case PAGE_STATE_UNUSED:
831			return "unused";
832		default:
833			return "unknown";
834	}
835}
836
837
838static int
839dump_page(int argc, char **argv)
840{
841	bool addressIsPointer = true;
842	bool physical = false;
843	bool searchMappings = false;
844	int32 index = 1;
845
846	while (index < argc) {
847		if (argv[index][0] != '-')
848			break;
849
850		if (!strcmp(argv[index], "-p")) {
851			addressIsPointer = false;
852			physical = true;
853		} else if (!strcmp(argv[index], "-v")) {
854			addressIsPointer = false;
855		} else if (!strcmp(argv[index], "-m")) {
856			searchMappings = true;
857		} else {
858			print_debugger_command_usage(argv[0]);
859			return 0;
860		}
861
862		index++;
863	}
864
865	if (index + 1 != argc) {
866		print_debugger_command_usage(argv[0]);
867		return 0;
868	}
869
870	uint64 value;
871	if (!evaluate_debug_expression(argv[index], &value, false))
872		return 0;
873
874	uint64 pageAddress = value;
875	struct vm_page* page;
876
877	if (addressIsPointer) {
878		page = (struct vm_page *)(addr_t)pageAddress;
879	} else {
880		if (!physical) {
881			VMAddressSpace *addressSpace = VMAddressSpace::Kernel();
882
883			if (debug_get_debugged_thread()->team->address_space != NULL)
884				addressSpace = debug_get_debugged_thread()->team->address_space;
885
886			uint32 flags = 0;
887			phys_addr_t physicalAddress;
888			if (addressSpace->TranslationMap()->QueryInterrupt(pageAddress,
889					&physicalAddress, &flags) != B_OK
890				|| (flags & PAGE_PRESENT) == 0) {
891				kprintf("Virtual address not mapped to a physical page in this "
892					"address space.\n");
893				return 0;
894			}
895			pageAddress = physicalAddress;
896		}
897
898		page = vm_lookup_page(pageAddress / B_PAGE_SIZE);
899	}
900
901	kprintf("PAGE: %p\n", page);
902	kprintf("queue_next,prev: %p, %p\n", page->queue_link.next,
903		page->queue_link.previous);
904	kprintf("physical_number: %#" B_PRIxPHYSADDR "\n",
905		page->physical_page_number);
906	kprintf("cache:           %p\n", page->Cache());
907	kprintf("cache_offset:    %" B_PRIuPHYSADDR "\n", page->cache_offset);
908	kprintf("cache_next:      %p\n", page->cache_next);
909	kprintf("state:           %s\n", page_state_to_string(page->State()));
910	kprintf("wired_count:     %d\n", page->WiredCount());
911	kprintf("usage_count:     %d\n", page->usage_count);
912	kprintf("busy:            %d\n", page->busy);
913	kprintf("busy_writing:    %d\n", page->busy_writing);
914	kprintf("accessed:        %d\n", page->accessed);
915	kprintf("modified:        %d\n", page->modified);
916	#if DEBUG_PAGE_QUEUE
917		kprintf("queue:           %p\n", page->queue);
918	#endif
919	#if DEBUG_PAGE_ACCESS
920		kprintf("accessor:        %" B_PRId32 "\n", page->accessing_thread);
921	#endif
922	kprintf("area mappings:\n");
923
924	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
925	vm_page_mapping *mapping;
926	while ((mapping = iterator.Next()) != NULL) {
927		kprintf("  %p (%" B_PRId32 ")\n", mapping->area, mapping->area->id);
928		mapping = mapping->page_link.next;
929	}
930
931	if (searchMappings) {
932		kprintf("all mappings:\n");
933		VMAddressSpace* addressSpace = VMAddressSpace::DebugFirst();
934		while (addressSpace != NULL) {
935			size_t pageCount = addressSpace->Size() / B_PAGE_SIZE;
936			for (addr_t address = addressSpace->Base(); pageCount != 0;
937					address += B_PAGE_SIZE, pageCount--) {
938				phys_addr_t physicalAddress;
939				uint32 flags = 0;
940				if (addressSpace->TranslationMap()->QueryInterrupt(address,
941						&physicalAddress, &flags) == B_OK
942					&& (flags & PAGE_PRESENT) != 0
943					&& physicalAddress / B_PAGE_SIZE
944						== page->physical_page_number) {
945					VMArea* area = addressSpace->LookupArea(address);
946					kprintf("  aspace %" B_PRId32 ", area %" B_PRId32 ": %#"
947						B_PRIxADDR " (%c%c%s%s)\n", addressSpace->ID(),
948						area != NULL ? area->id : -1, address,
949						(flags & B_KERNEL_READ_AREA) != 0 ? 'r' : '-',
950						(flags & B_KERNEL_WRITE_AREA) != 0 ? 'w' : '-',
951						(flags & PAGE_MODIFIED) != 0 ? " modified" : "",
952						(flags & PAGE_ACCESSED) != 0 ? " accessed" : "");
953				}
954			}
955			addressSpace = VMAddressSpace::DebugNext(addressSpace);
956		}
957	}
958
959	set_debug_variable("_cache", (addr_t)page->Cache());
960	#if DEBUG_PAGE_ACCESS
961		set_debug_variable("_accessor", page->accessing_thread);
962	#endif
963
964	return 0;
965}
966
967
968static int
969dump_page_queue(int argc, char **argv)
970{
971	struct VMPageQueue *queue;
972
973	if (argc < 2) {
974		kprintf("usage: page_queue <address/name> [list]\n");
975		return 0;
976	}
977
978	if (strlen(argv[1]) >= 2 && argv[1][0] == '0' && argv[1][1] == 'x')
979		queue = (VMPageQueue*)strtoul(argv[1], NULL, 16);
980	if (!strcmp(argv[1], "free"))
981		queue = &sFreePageQueue;
982	else if (!strcmp(argv[1], "clear"))
983		queue = &sClearPageQueue;
984	else if (!strcmp(argv[1], "modified"))
985		queue = &sModifiedPageQueue;
986	else if (!strcmp(argv[1], "active"))
987		queue = &sActivePageQueue;
988	else if (!strcmp(argv[1], "inactive"))
989		queue = &sInactivePageQueue;
990	else if (!strcmp(argv[1], "cached"))
991		queue = &sCachedPageQueue;
992	else {
993		kprintf("page_queue: unknown queue \"%s\".\n", argv[1]);
994		return 0;
995	}
996
997	kprintf("queue = %p, queue->head = %p, queue->tail = %p, queue->count = %"
998		B_PRIuPHYSADDR "\n", queue, queue->Head(), queue->Tail(),
999		queue->Count());
1000
1001	if (argc == 3) {
1002		struct vm_page *page = queue->Head();
1003		const char *type = "none";
1004		int i;
1005
1006		if (page->Cache() != NULL) {
1007			switch (page->Cache()->type) {
1008				case CACHE_TYPE_RAM:
1009					type = "RAM";
1010					break;
1011				case CACHE_TYPE_DEVICE:
1012					type = "device";
1013					break;
1014				case CACHE_TYPE_VNODE:
1015					type = "vnode";
1016					break;
1017				case CACHE_TYPE_NULL:
1018					type = "null";
1019					break;
1020				default:
1021					type = "???";
1022					break;
1023			}
1024		}
1025
1026		kprintf("page        cache       type       state  wired  usage\n");
1027		for (i = 0; page; i++, page = queue->Next(page)) {
1028			kprintf("%p  %p  %-7s %8s  %5d  %5d\n", page, page->Cache(),
1029				type, page_state_to_string(page->State()),
1030				page->WiredCount(), page->usage_count);
1031		}
1032	}
1033	return 0;
1034}
1035
1036
1037static int
1038dump_page_stats(int argc, char **argv)
1039{
1040	page_num_t swappableModified = 0;
1041	page_num_t swappableModifiedInactive = 0;
1042
1043	size_t counter[8];
1044	size_t busyCounter[8];
1045	memset(counter, 0, sizeof(counter));
1046	memset(busyCounter, 0, sizeof(busyCounter));
1047
1048	struct page_run {
1049		page_num_t	start;
1050		page_num_t	end;
1051
1052		page_num_t Length() const	{ return end - start; }
1053	};
1054
1055	page_run currentFreeRun = { 0, 0 };
1056	page_run currentCachedRun = { 0, 0 };
1057	page_run longestFreeRun = { 0, 0 };
1058	page_run longestCachedRun = { 0, 0 };
1059
1060	for (page_num_t i = 0; i < sNumPages; i++) {
1061		if (sPages[i].State() > 7) {
1062			panic("page %" B_PRIuPHYSADDR " at %p has invalid state!\n", i,
1063				&sPages[i]);
1064		}
1065
1066		uint32 pageState = sPages[i].State();
1067
1068		counter[pageState]++;
1069		if (sPages[i].busy)
1070			busyCounter[pageState]++;
1071
1072		if (pageState == PAGE_STATE_MODIFIED
1073			&& sPages[i].Cache() != NULL
1074			&& sPages[i].Cache()->temporary && sPages[i].WiredCount() == 0) {
1075			swappableModified++;
1076			if (sPages[i].usage_count == 0)
1077				swappableModifiedInactive++;
1078		}
1079
1080		// track free and cached pages runs
1081		if (pageState == PAGE_STATE_FREE || pageState == PAGE_STATE_CLEAR) {
1082			currentFreeRun.end = i + 1;
1083			currentCachedRun.end = i + 1;
1084		} else {
1085			if (currentFreeRun.Length() > longestFreeRun.Length())
1086				longestFreeRun = currentFreeRun;
1087			currentFreeRun.start = currentFreeRun.end = i + 1;
1088
1089			if (pageState == PAGE_STATE_CACHED) {
1090				currentCachedRun.end = i + 1;
1091			} else {
1092				if (currentCachedRun.Length() > longestCachedRun.Length())
1093					longestCachedRun = currentCachedRun;
1094				currentCachedRun.start = currentCachedRun.end = i + 1;
1095			}
1096		}
1097	}
1098
1099	kprintf("page stats:\n");
1100	kprintf("total: %" B_PRIuPHYSADDR "\n", sNumPages);
1101
1102	kprintf("active: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1103		counter[PAGE_STATE_ACTIVE], busyCounter[PAGE_STATE_ACTIVE]);
1104	kprintf("inactive: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1105		counter[PAGE_STATE_INACTIVE], busyCounter[PAGE_STATE_INACTIVE]);
1106	kprintf("cached: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1107		counter[PAGE_STATE_CACHED], busyCounter[PAGE_STATE_CACHED]);
1108	kprintf("unused: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1109		counter[PAGE_STATE_UNUSED], busyCounter[PAGE_STATE_UNUSED]);
1110	kprintf("wired: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1111		counter[PAGE_STATE_WIRED], busyCounter[PAGE_STATE_WIRED]);
1112	kprintf("modified: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1113		counter[PAGE_STATE_MODIFIED], busyCounter[PAGE_STATE_MODIFIED]);
1114	kprintf("free: %" B_PRIuSIZE "\n", counter[PAGE_STATE_FREE]);
1115	kprintf("clear: %" B_PRIuSIZE "\n", counter[PAGE_STATE_CLEAR]);
1116
1117	kprintf("unreserved free pages: %" B_PRId32 "\n", sUnreservedFreePages);
1118	kprintf("unsatisfied page reservations: %" B_PRId32 "\n",
1119		sUnsatisfiedPageReservations);
1120	kprintf("mapped pages: %" B_PRId32 "\n", gMappedPagesCount);
1121	kprintf("longest free pages run: %" B_PRIuPHYSADDR " pages (at %"
1122		B_PRIuPHYSADDR ")\n", longestFreeRun.Length(),
1123		sPages[longestFreeRun.start].physical_page_number);
1124	kprintf("longest free/cached pages run: %" B_PRIuPHYSADDR " pages (at %"
1125		B_PRIuPHYSADDR ")\n", longestCachedRun.Length(),
1126		sPages[longestCachedRun.start].physical_page_number);
1127
1128	kprintf("waiting threads:\n");
1129	for (PageReservationWaiterList::Iterator it
1130			= sPageReservationWaiters.GetIterator();
1131		PageReservationWaiter* waiter = it.Next();) {
1132		kprintf("  %6" B_PRId32 ": missing: %6" B_PRIu32
1133			", don't touch: %6" B_PRIu32 "\n", waiter->thread->id,
1134			waiter->missing, waiter->dontTouch);
1135	}
1136
1137	kprintf("\nfree queue: %p, count = %" B_PRIuPHYSADDR "\n", &sFreePageQueue,
1138		sFreePageQueue.Count());
1139	kprintf("clear queue: %p, count = %" B_PRIuPHYSADDR "\n", &sClearPageQueue,
1140		sClearPageQueue.Count());
1141	kprintf("modified queue: %p, count = %" B_PRIuPHYSADDR " (%" B_PRId32
1142		" temporary, %" B_PRIuPHYSADDR " swappable, " "inactive: %"
1143		B_PRIuPHYSADDR ")\n", &sModifiedPageQueue, sModifiedPageQueue.Count(),
1144		sModifiedTemporaryPages, swappableModified, swappableModifiedInactive);
1145	kprintf("active queue: %p, count = %" B_PRIuPHYSADDR "\n",
1146		&sActivePageQueue, sActivePageQueue.Count());
1147	kprintf("inactive queue: %p, count = %" B_PRIuPHYSADDR "\n",
1148		&sInactivePageQueue, sInactivePageQueue.Count());
1149	kprintf("cached queue: %p, count = %" B_PRIuPHYSADDR "\n",
1150		&sCachedPageQueue, sCachedPageQueue.Count());
1151	return 0;
1152}
1153
1154
1155#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
1156
1157static caller_info*
1158get_caller_info(addr_t caller)
1159{
1160	// find the caller info
1161	for (int32 i = 0; i < sCallerInfoCount; i++) {
1162		if (caller == sCallerInfoTable[i].caller)
1163			return &sCallerInfoTable[i];
1164	}
1165
1166	// not found, add a new entry, if there are free slots
1167	if (sCallerInfoCount >= kCallerInfoTableSize)
1168		return NULL;
1169
1170	caller_info* info = &sCallerInfoTable[sCallerInfoCount++];
1171	info->caller = caller;
1172	info->count = 0;
1173
1174	return info;
1175}
1176
1177
1178static int
1179caller_info_compare_count(const void* _a, const void* _b)
1180{
1181	const caller_info* a = (const caller_info*)_a;
1182	const caller_info* b = (const caller_info*)_b;
1183	return (int)(b->count - a->count);
1184}
1185
1186
1187static int
1188dump_page_allocations_per_caller(int argc, char** argv)
1189{
1190	bool resetAllocationInfos = false;
1191	bool printDetails = false;
1192	addr_t caller = 0;
1193
1194	for (int32 i = 1; i < argc; i++) {
1195		if (strcmp(argv[i], "-d") == 0) {
1196			uint64 callerAddress;
1197			if (++i >= argc
1198				|| !evaluate_debug_expression(argv[i], &callerAddress, true)) {
1199				print_debugger_command_usage(argv[0]);
1200				return 0;
1201			}
1202
1203			caller = callerAddress;
1204			printDetails = true;
1205		} else if (strcmp(argv[i], "-r") == 0) {
1206			resetAllocationInfos = true;
1207		} else {
1208			print_debugger_command_usage(argv[0]);
1209			return 0;
1210		}
1211	}
1212
1213	sCallerInfoCount = 0;
1214
1215	AllocationCollectorCallback collectorCallback(resetAllocationInfos);
1216	AllocationDetailPrinterCallback detailsCallback(caller);
1217	AllocationTrackingCallback& callback = printDetails
1218		? (AllocationTrackingCallback&)detailsCallback
1219		: (AllocationTrackingCallback&)collectorCallback;
1220
1221	for (page_num_t i = 0; i < sNumPages; i++)
1222		callback.ProcessTrackingInfo(&sPages[i].allocation_tracking_info, i);
1223
1224	if (printDetails)
1225		return 0;
1226
1227	// sort the array
1228	qsort(sCallerInfoTable, sCallerInfoCount, sizeof(caller_info),
1229		&caller_info_compare_count);
1230
1231	kprintf("%" B_PRId32 " different callers\n\n", sCallerInfoCount);
1232
1233	size_t totalAllocationCount = 0;
1234
1235	kprintf("     count      caller\n");
1236	kprintf("----------------------------------\n");
1237	for (int32 i = 0; i < sCallerInfoCount; i++) {
1238		caller_info& info = sCallerInfoTable[i];
1239		kprintf("%10" B_PRIuSIZE "  %p", info.count, (void*)info.caller);
1240
1241		const char* symbol;
1242		const char* imageName;
1243		bool exactMatch;
1244		addr_t baseAddress;
1245
1246		if (elf_debug_lookup_symbol_address(info.caller, &baseAddress, &symbol,
1247				&imageName, &exactMatch) == B_OK) {
1248			kprintf("  %s + %#" B_PRIxADDR " (%s)%s\n", symbol,
1249				info.caller - baseAddress, imageName,
1250				exactMatch ? "" : " (nearest)");
1251		} else
1252			kprintf("\n");
1253
1254		totalAllocationCount += info.count;
1255	}
1256
1257	kprintf("\ntotal page allocations: %" B_PRIuSIZE "\n",
1258		totalAllocationCount);
1259
1260	return 0;
1261}
1262
1263
1264static int
1265dump_page_allocation_infos(int argc, char** argv)
1266{
1267	page_num_t pageFilter = 0;
1268	team_id teamFilter = -1;
1269	thread_id threadFilter = -1;
1270	bool printStackTraces = false;
1271
1272	for (int32 i = 1; i < argc; i++) {
1273		if (strcmp(argv[i], "--stacktrace") == 0)
1274			printStackTraces = true;
1275		else if (strcmp(argv[i], "-p") == 0) {
1276			uint64 pageNumber;
1277			if (++i >= argc
1278				|| !evaluate_debug_expression(argv[i], &pageNumber, true)) {
1279				print_debugger_command_usage(argv[0]);
1280				return 0;
1281			}
1282
1283			pageFilter = pageNumber;
1284		} else if (strcmp(argv[i], "--team") == 0) {
1285			uint64 team;
1286			if (++i >= argc
1287				|| !evaluate_debug_expression(argv[i], &team, true)) {
1288				print_debugger_command_usage(argv[0]);
1289				return 0;
1290			}
1291
1292			teamFilter = team;
1293		} else if (strcmp(argv[i], "--thread") == 0) {
1294			uint64 thread;
1295			if (++i >= argc
1296				|| !evaluate_debug_expression(argv[i], &thread, true)) {
1297				print_debugger_command_usage(argv[0]);
1298				return 0;
1299			}
1300
1301			threadFilter = thread;
1302		} else {
1303			print_debugger_command_usage(argv[0]);
1304			return 0;
1305		}
1306	}
1307
1308	AllocationInfoPrinterCallback callback(printStackTraces, pageFilter,
1309		teamFilter, threadFilter);
1310
1311	for (page_num_t i = 0; i < sNumPages; i++)
1312		callback.ProcessTrackingInfo(&sPages[i].allocation_tracking_info, i);
1313
1314	return 0;
1315}
1316
1317#endif	// VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
1318
1319
1320#ifdef TRACK_PAGE_USAGE_STATS
1321
1322static void
1323track_page_usage(vm_page* page)
1324{
1325	if (page->WiredCount() == 0) {
1326		sNextPageUsage[(int32)page->usage_count + 128]++;
1327		sNextPageUsagePageCount++;
1328	}
1329}
1330
1331
1332static void
1333update_page_usage_stats()
1334{
1335	std::swap(sPageUsage, sNextPageUsage);
1336	sPageUsagePageCount = sNextPageUsagePageCount;
1337
1338	memset(sNextPageUsage, 0, sizeof(page_num_t) * 256);
1339	sNextPageUsagePageCount = 0;
1340
1341	// compute average
1342	if (sPageUsagePageCount > 0) {
1343		int64 sum = 0;
1344		for (int32 i = 0; i < 256; i++)
1345			sum += (int64)sPageUsage[i] * (i - 128);
1346
1347		TRACE_DAEMON("average page usage: %f (%lu pages)\n",
1348			(float)sum / sPageUsagePageCount, sPageUsagePageCount);
1349	}
1350}
1351
1352
1353static int
1354dump_page_usage_stats(int argc, char** argv)
1355{
1356	kprintf("distribution of page usage counts (%lu pages):",
1357		sPageUsagePageCount);
1358
1359	int64 sum = 0;
1360	for (int32 i = 0; i < 256; i++) {
1361		if (i % 8 == 0)
1362			kprintf("\n%4ld:", i - 128);
1363
1364		int64 count = sPageUsage[i];
1365		sum += count * (i - 128);
1366
1367		kprintf("  %9llu", count);
1368	}
1369
1370	kprintf("\n\n");
1371
1372	kprintf("average usage count: %f\n",
1373		sPageUsagePageCount > 0 ? (float)sum / sPageUsagePageCount : 0);
1374
1375	return 0;
1376}
1377
1378#endif	// TRACK_PAGE_USAGE_STATS
1379
1380
1381// #pragma mark - vm_page
1382
1383
1384inline void
1385vm_page::InitState(uint8 newState)
1386{
1387	state = newState;
1388}
1389
1390
1391inline void
1392vm_page::SetState(uint8 newState)
1393{
1394	TPS(SetPageState(this, newState));
1395
1396	state = newState;
1397}
1398
1399
1400// #pragma mark -
1401
1402
1403static void
1404get_page_stats(page_stats& _pageStats)
1405{
1406	_pageStats.totalFreePages = sUnreservedFreePages;
1407	_pageStats.cachedPages = sCachedPageQueue.Count();
1408	_pageStats.unsatisfiedReservations = sUnsatisfiedPageReservations;
1409	// TODO: We don't get an actual snapshot here!
1410}
1411
1412
1413static bool
1414do_active_paging(const page_stats& pageStats)
1415{
1416	return pageStats.totalFreePages + pageStats.cachedPages
1417		< pageStats.unsatisfiedReservations
1418			+ (int32)sFreeOrCachedPagesTarget;
1419}
1420
1421
1422/*!	Reserves as many pages as possible from \c sUnreservedFreePages up to
1423	\a count. Doesn't touch the last \a dontTouch pages of
1424	\c sUnreservedFreePages, though.
1425	\return The number of actually reserved pages.
1426*/
1427static uint32
1428reserve_some_pages(uint32 count, uint32 dontTouch)
1429{
1430	while (true) {
1431		int32 freePages = sUnreservedFreePages;
1432		if (freePages <= (int32)dontTouch)
1433			return 0;
1434
1435		int32 toReserve = std::min(count, freePages - dontTouch);
1436		if (atomic_test_and_set(&sUnreservedFreePages,
1437					freePages - toReserve, freePages)
1438				== freePages) {
1439			return toReserve;
1440		}
1441
1442		// the count changed in the meantime -- retry
1443	}
1444}
1445
1446
1447static void
1448wake_up_page_reservation_waiters()
1449{
1450	MutexLocker pageDeficitLocker(sPageDeficitLock);
1451
1452	// TODO: If this is a low priority thread, we might want to disable
1453	// interrupts or otherwise ensure that we aren't unscheduled. Otherwise
1454	// high priority threads wait be kept waiting while a medium priority thread
1455	// prevents us from running.
1456
1457	while (PageReservationWaiter* waiter = sPageReservationWaiters.Head()) {
1458		int32 reserved = reserve_some_pages(waiter->missing,
1459			waiter->dontTouch);
1460		if (reserved == 0)
1461			return;
1462
1463		atomic_add(&sUnsatisfiedPageReservations, -reserved);
1464		waiter->missing -= reserved;
1465
1466		if (waiter->missing > 0)
1467			return;
1468
1469		sPageReservationWaiters.Remove(waiter);
1470
1471		InterruptsSpinLocker schedulerLocker(gSchedulerLock);
1472		thread_unblock_locked(waiter->thread, B_OK);
1473	}
1474}
1475
1476
1477static inline void
1478unreserve_pages(uint32 count)
1479{
1480	atomic_add(&sUnreservedFreePages, count);
1481	if (sUnsatisfiedPageReservations != 0)
1482		wake_up_page_reservation_waiters();
1483}
1484
1485
1486static void
1487free_page(vm_page* page, bool clear)
1488{
1489	DEBUG_PAGE_ACCESS_CHECK(page);
1490
1491	PAGE_ASSERT(page, !page->IsMapped());
1492
1493	VMPageQueue* fromQueue;
1494
1495	switch (page->State()) {
1496		case PAGE_STATE_ACTIVE:
1497			fromQueue = &sActivePageQueue;
1498			break;
1499		case PAGE_STATE_INACTIVE:
1500			fromQueue = &sInactivePageQueue;
1501			break;
1502		case PAGE_STATE_MODIFIED:
1503			fromQueue = &sModifiedPageQueue;
1504			break;
1505		case PAGE_STATE_CACHED:
1506			fromQueue = &sCachedPageQueue;
1507			break;
1508		case PAGE_STATE_FREE:
1509		case PAGE_STATE_CLEAR:
1510			panic("free_page(): page %p already free", page);
1511			return;
1512		case PAGE_STATE_WIRED:
1513		case PAGE_STATE_UNUSED:
1514			fromQueue = NULL;
1515			break;
1516		default:
1517			panic("free_page(): page %p in invalid state %d",
1518				page, page->State());
1519			return;
1520	}
1521
1522	if (page->CacheRef() != NULL)
1523		panic("to be freed page %p has cache", page);
1524	if (page->IsMapped())
1525		panic("to be freed page %p has mappings", page);
1526
1527	if (fromQueue != NULL)
1528		fromQueue->RemoveUnlocked(page);
1529
1530	TA(FreePage(page->physical_page_number));
1531
1532#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
1533	page->allocation_tracking_info.Clear();
1534#endif
1535
1536	ReadLocker locker(sFreePageQueuesLock);
1537
1538	DEBUG_PAGE_ACCESS_END(page);
1539
1540	if (clear) {
1541		page->SetState(PAGE_STATE_CLEAR);
1542		sClearPageQueue.PrependUnlocked(page);
1543	} else {
1544		page->SetState(PAGE_STATE_FREE);
1545		sFreePageQueue.PrependUnlocked(page);
1546	}
1547
1548	locker.Unlock();
1549
1550	unreserve_pages(1);
1551}
1552
1553
1554/*!	The caller must make sure that no-one else tries to change the page's state
1555	while the function is called. If the page has a cache, this can be done by
1556	locking the cache.
1557*/
1558static void
1559set_page_state(vm_page *page, int pageState)
1560{
1561	DEBUG_PAGE_ACCESS_CHECK(page);
1562
1563	if (pageState == page->State())
1564		return;
1565
1566	VMPageQueue* fromQueue;
1567
1568	switch (page->State()) {
1569		case PAGE_STATE_ACTIVE:
1570			fromQueue = &sActivePageQueue;
1571			break;
1572		case PAGE_STATE_INACTIVE:
1573			fromQueue = &sInactivePageQueue;
1574			break;
1575		case PAGE_STATE_MODIFIED:
1576			fromQueue = &sModifiedPageQueue;
1577			break;
1578		case PAGE_STATE_CACHED:
1579			fromQueue = &sCachedPageQueue;
1580			break;
1581		case PAGE_STATE_FREE:
1582		case PAGE_STATE_CLEAR:
1583			panic("set_page_state(): page %p is free/clear", page);
1584			return;
1585		case PAGE_STATE_WIRED:
1586		case PAGE_STATE_UNUSED:
1587			fromQueue = NULL;
1588			break;
1589		default:
1590			panic("set_page_state(): page %p in invalid state %d",
1591				page, page->State());
1592			return;
1593	}
1594
1595	VMPageQueue* toQueue;
1596
1597	switch (pageState) {
1598		case PAGE_STATE_ACTIVE:
1599			toQueue = &sActivePageQueue;
1600			break;
1601		case PAGE_STATE_INACTIVE:
1602			toQueue = &sInactivePageQueue;
1603			break;
1604		case PAGE_STATE_MODIFIED:
1605			toQueue = &sModifiedPageQueue;
1606			break;
1607		case PAGE_STATE_CACHED:
1608			PAGE_ASSERT(page, !page->IsMapped());
1609			PAGE_ASSERT(page, !page->modified);
1610			toQueue = &sCachedPageQueue;
1611			break;
1612		case PAGE_STATE_FREE:
1613		case PAGE_STATE_CLEAR:
1614			panic("set_page_state(): target state is free/clear");
1615			return;
1616		case PAGE_STATE_WIRED:
1617		case PAGE_STATE_UNUSED:
1618			toQueue = NULL;
1619			break;
1620		default:
1621			panic("set_page_state(): invalid target state %d", pageState);
1622			return;
1623	}
1624
1625	VMCache* cache = page->Cache();
1626	if (cache != NULL && cache->temporary) {
1627		if (pageState == PAGE_STATE_MODIFIED)
1628			atomic_add(&sModifiedTemporaryPages, 1);
1629		else if (page->State() == PAGE_STATE_MODIFIED)
1630			atomic_add(&sModifiedTemporaryPages, -1);
1631	}
1632
1633	// move the page
1634	if (toQueue == fromQueue) {
1635		// Note: Theoretically we are required to lock when changing the page
1636		// state, even if we don't change the queue. We actually don't have to
1637		// do this, though, since only for the active queue there are different
1638		// page states and active pages have a cache that must be locked at
1639		// this point. So we rely on the fact that everyone must lock the cache
1640		// before trying to change/interpret the page state.
1641		PAGE_ASSERT(page, cache != NULL);
1642		cache->AssertLocked();
1643		page->SetState(pageState);
1644	} else {
1645		if (fromQueue != NULL)
1646			fromQueue->RemoveUnlocked(page);
1647
1648		page->SetState(pageState);
1649
1650		if (toQueue != NULL)
1651			toQueue->AppendUnlocked(page);
1652	}
1653}
1654
1655
1656/*! Moves a previously modified page into a now appropriate queue.
1657	The page queues must not be locked.
1658*/
1659static void
1660move_page_to_appropriate_queue(vm_page *page)
1661{
1662	DEBUG_PAGE_ACCESS_CHECK(page);
1663
1664	// Note, this logic must be in sync with what the page daemon does.
1665	int32 state;
1666	if (page->IsMapped())
1667		state = PAGE_STATE_ACTIVE;
1668	else if (page->modified)
1669		state = PAGE_STATE_MODIFIED;
1670	else
1671		state = PAGE_STATE_CACHED;
1672
1673// TODO: If free + cached pages are low, we might directly want to free the
1674// page.
1675	set_page_state(page, state);
1676}
1677
1678
1679static void
1680clear_page(struct vm_page *page)
1681{
1682	vm_memset_physical(page->physical_page_number << PAGE_SHIFT, 0,
1683		B_PAGE_SIZE);
1684}
1685
1686
1687static status_t
1688mark_page_range_in_use(page_num_t startPage, page_num_t length, bool wired)
1689{
1690	TRACE(("mark_page_range_in_use: start %#" B_PRIxPHYSADDR ", len %#"
1691		B_PRIxPHYSADDR "\n", startPage, length));
1692
1693	if (sPhysicalPageOffset > startPage) {
1694		dprintf("mark_page_range_in_use(%#" B_PRIxPHYSADDR ", %#" B_PRIxPHYSADDR
1695			"): start page is before free list\n", startPage, length);
1696		if (sPhysicalPageOffset - startPage >= length)
1697			return B_OK;
1698		length -= sPhysicalPageOffset - startPage;
1699		startPage = sPhysicalPageOffset;
1700	}
1701
1702	startPage -= sPhysicalPageOffset;
1703
1704	if (startPage + length > sNumPages) {
1705		dprintf("mark_page_range_in_use(%#" B_PRIxPHYSADDR ", %#" B_PRIxPHYSADDR
1706			"): range would extend past free list\n", startPage, length);
1707		if (startPage >= sNumPages)
1708			return B_OK;
1709		length = sNumPages - startPage;
1710	}
1711
1712	WriteLocker locker(sFreePageQueuesLock);
1713
1714	for (page_num_t i = 0; i < length; i++) {
1715		vm_page *page = &sPages[startPage + i];
1716		switch (page->State()) {
1717			case PAGE_STATE_FREE:
1718			case PAGE_STATE_CLEAR:
1719			{
1720// TODO: This violates the page reservation policy, since we remove pages from
1721// the free/clear queues without having reserved them before. This should happen
1722// in the early boot process only, though.
1723				DEBUG_PAGE_ACCESS_START(page);
1724				VMPageQueue& queue = page->State() == PAGE_STATE_FREE
1725					? sFreePageQueue : sClearPageQueue;
1726				queue.Remove(page);
1727				page->SetState(wired ? PAGE_STATE_WIRED : PAGE_STATE_UNUSED);
1728				page->busy = false;
1729				atomic_add(&sUnreservedFreePages, -1);
1730				DEBUG_PAGE_ACCESS_END(page);
1731				break;
1732			}
1733			case PAGE_STATE_WIRED:
1734			case PAGE_STATE_UNUSED:
1735				break;
1736			case PAGE_STATE_ACTIVE:
1737			case PAGE_STATE_INACTIVE:
1738			case PAGE_STATE_MODIFIED:
1739			case PAGE_STATE_CACHED:
1740			default:
1741				// uh
1742				dprintf("mark_page_range_in_use: page %#" B_PRIxPHYSADDR
1743					" in non-free state %d!\n", startPage + i, page->State());
1744				break;
1745		}
1746	}
1747
1748	return B_OK;
1749}
1750
1751
1752/*!
1753	This is a background thread that wakes up every now and then (every 100ms)
1754	and moves some pages from the free queue over to the clear queue.
1755	Given enough time, it will clear out all pages from the free queue - we
1756	could probably slow it down after having reached a certain threshold.
1757*/
1758static int32
1759page_scrubber(void *unused)
1760{
1761	(void)(unused);
1762
1763	TRACE(("page_scrubber starting...\n"));
1764
1765	for (;;) {
1766		snooze(100000); // 100ms
1767
1768		if (sFreePageQueue.Count() == 0
1769				|| sUnreservedFreePages < (int32)sFreePagesTarget) {
1770			continue;
1771		}
1772
1773		// Since we temporarily remove pages from the free pages reserve,
1774		// we must make sure we don't cause a violation of the page
1775		// reservation warranty. The following is usually stricter than
1776		// necessary, because we don't have information on how many of the
1777		// reserved pages have already been allocated.
1778		int32 reserved = reserve_some_pages(SCRUB_SIZE,
1779			kPageReserveForPriority[VM_PRIORITY_USER]);
1780		if (reserved == 0)
1781			continue;
1782
1783		// get some pages from the free queue
1784		ReadLocker locker(sFreePageQueuesLock);
1785
1786		vm_page *page[SCRUB_SIZE];
1787		int32 scrubCount = 0;
1788		for (int32 i = 0; i < reserved; i++) {
1789			page[i] = sFreePageQueue.RemoveHeadUnlocked();
1790			if (page[i] == NULL)
1791				break;
1792
1793			DEBUG_PAGE_ACCESS_START(page[i]);
1794
1795			page[i]->SetState(PAGE_STATE_ACTIVE);
1796			page[i]->busy = true;
1797			scrubCount++;
1798		}
1799
1800		locker.Unlock();
1801
1802		if (scrubCount == 0) {
1803			unreserve_pages(reserved);
1804			continue;
1805		}
1806
1807		TA(ScrubbingPages(scrubCount));
1808
1809		// clear them
1810		for (int32 i = 0; i < scrubCount; i++)
1811			clear_page(page[i]);
1812
1813		locker.Lock();
1814
1815		// and put them into the clear queue
1816		for (int32 i = 0; i < scrubCount; i++) {
1817			page[i]->SetState(PAGE_STATE_CLEAR);
1818			page[i]->busy = false;
1819			DEBUG_PAGE_ACCESS_END(page[i]);
1820			sClearPageQueue.PrependUnlocked(page[i]);
1821		}
1822
1823		locker.Unlock();
1824
1825		unreserve_pages(reserved);
1826
1827		TA(ScrubbedPages(scrubCount));
1828	}
1829
1830	return 0;
1831}
1832
1833
1834static void
1835init_page_marker(vm_page &marker)
1836{
1837	marker.SetCacheRef(NULL);
1838	marker.InitState(PAGE_STATE_UNUSED);
1839	marker.busy = true;
1840#if DEBUG_PAGE_QUEUE
1841	marker.queue = NULL;
1842#endif
1843#if DEBUG_PAGE_ACCESS
1844	marker.accessing_thread = thread_get_current_thread_id();
1845#endif
1846}
1847
1848
1849static void
1850remove_page_marker(struct vm_page &marker)
1851{
1852	DEBUG_PAGE_ACCESS_CHECK(&marker);
1853
1854	if (marker.State() < PAGE_STATE_FIRST_UNQUEUED)
1855		sPageQueues[marker.State()].RemoveUnlocked(&marker);
1856
1857	marker.SetState(PAGE_STATE_UNUSED);
1858}
1859
1860
1861static vm_page*
1862next_modified_page(page_num_t& maxPagesToSee)
1863{
1864	InterruptsSpinLocker locker(sModifiedPageQueue.GetLock());
1865
1866	while (maxPagesToSee > 0) {
1867		vm_page* page = sModifiedPageQueue.Head();
1868		if (page == NULL)
1869			return NULL;
1870
1871		sModifiedPageQueue.Requeue(page, true);
1872
1873		maxPagesToSee--;
1874
1875		if (!page->busy)
1876			return page;
1877	}
1878
1879	return NULL;
1880}
1881
1882
1883// #pragma mark -
1884
1885
1886class PageWriteTransfer;
1887class PageWriteWrapper;
1888
1889
1890class PageWriterRun {
1891public:
1892	status_t Init(uint32 maxPages);
1893
1894	void PrepareNextRun();
1895	void AddPage(vm_page* page);
1896	uint32 Go();
1897
1898	void PageWritten(PageWriteTransfer* transfer, status_t status,
1899		bool partialTransfer, size_t bytesTransferred);
1900
1901private:
1902	uint32				fMaxPages;
1903	uint32				fWrapperCount;
1904	uint32				fTransferCount;
1905	vint32				fPendingTransfers;
1906	PageWriteWrapper*	fWrappers;
1907	PageWriteTransfer*	fTransfers;
1908	ConditionVariable	fAllFinishedCondition;
1909};
1910
1911
1912class PageWriteTransfer : public AsyncIOCallback {
1913public:
1914	void SetTo(PageWriterRun* run, vm_page* page, int32 maxPages);
1915	bool AddPage(vm_page* page);
1916
1917	status_t Schedule(uint32 flags);
1918
1919	void SetStatus(status_t status, size_t transferred);
1920
1921	status_t Status() const	{ return fStatus; }
1922	struct VMCache* Cache() const { return fCache; }
1923	uint32 PageCount() const { return fPageCount; }
1924
1925	virtual void IOFinished(status_t status, bool partialTransfer,
1926		generic_size_t bytesTransferred);
1927private:
1928	PageWriterRun*		fRun;
1929	struct VMCache*		fCache;
1930	off_t				fOffset;
1931	uint32				fPageCount;
1932	int32				fMaxPages;
1933	status_t			fStatus;
1934	uint32				fVecCount;
1935	generic_io_vec		fVecs[32]; // TODO: make dynamic/configurable
1936};
1937
1938
1939class PageWriteWrapper {
1940public:
1941	PageWriteWrapper();
1942	~PageWriteWrapper();
1943	void SetTo(vm_page* page);
1944	bool Done(status_t result);
1945
1946private:
1947	vm_page*			fPage;
1948	struct VMCache*		fCache;
1949	bool				fIsActive;
1950};
1951
1952
1953PageWriteWrapper::PageWriteWrapper()
1954	:
1955	fIsActive(false)
1956{
1957}
1958
1959
1960PageWriteWrapper::~PageWriteWrapper()
1961{
1962	if (fIsActive)
1963		panic("page write wrapper going out of scope but isn't completed");
1964}
1965
1966
1967/*!	The page's cache must be locked.
1968*/
1969void
1970PageWriteWrapper::SetTo(vm_page* page)
1971{
1972	DEBUG_PAGE_ACCESS_CHECK(page);
1973
1974	if (page->busy)
1975		panic("setting page write wrapper to busy page");
1976
1977	if (fIsActive)
1978		panic("re-setting page write wrapper that isn't completed");
1979
1980	fPage = page;
1981	fCache = page->Cache();
1982	fIsActive = true;
1983
1984	fPage->busy = true;
1985	fPage->busy_writing = true;
1986
1987	// We have a modified page -- however, while we're writing it back,
1988	// the page might still be mapped. In order not to lose any changes to the
1989	// page, we mark it clean before actually writing it back; if
1990	// writing the page fails for some reason, we'll just keep it in the
1991	// modified page list, but that should happen only rarely.
1992
1993	// If the page is changed after we cleared the dirty flag, but before we
1994	// had the chance to write it back, then we'll write it again later -- that
1995	// will probably not happen that often, though.
1996
1997	vm_clear_map_flags(fPage, PAGE_MODIFIED);
1998}
1999
2000
2001/*!	The page's cache must be locked.
2002	The page queues must not be locked.
2003	\return \c true if the page was written successfully respectively could be
2004		handled somehow, \c false otherwise.
2005*/
2006bool
2007PageWriteWrapper::Done(status_t result)
2008{
2009	if (!fIsActive)
2010		panic("completing page write wrapper that is not active");
2011
2012	DEBUG_PAGE_ACCESS_START(fPage);
2013
2014	fPage->busy = false;
2015		// Set unbusy and notify later by hand, since we might free the page.
2016
2017	bool success = true;
2018
2019	if (result == B_OK) {
2020		// put it into the active/inactive queue
2021		move_page_to_appropriate_queue(fPage);
2022		fPage->busy_writing = false;
2023		DEBUG_PAGE_ACCESS_END(fPage);
2024	} else {
2025		// Writing the page failed. One reason would be that the cache has been
2026		// shrunk and the page does no longer belong to the file. Otherwise the
2027		// actual I/O failed, in which case we'll simply keep the page modified.
2028
2029		if (!fPage->busy_writing) {
2030			// The busy_writing flag was cleared. That means the cache has been
2031			// shrunk while we were trying to write the page and we have to free
2032			// it now.
2033			vm_remove_all_page_mappings(fPage);
2034// TODO: Unmapping should already happen when resizing the cache!
2035			fCache->RemovePage(fPage);
2036			free_page(fPage, false);
2037		} else {
2038			// Writing the page failed -- mark the page modified and move it to
2039			// an appropriate queue other than the modified queue, so we don't
2040			// keep trying to write it over and over again. We keep
2041			// non-temporary pages in the modified queue, though, so they don't
2042			// get lost in the inactive queue.
2043			dprintf("PageWriteWrapper: Failed to write page %p: %s\n", fPage,
2044				strerror(result));
2045
2046			fPage->modified = true;
2047			if (!fCache->temporary)
2048				set_page_state(fPage, PAGE_STATE_MODIFIED);
2049			else if (fPage->IsMapped())
2050				set_page_state(fPage, PAGE_STATE_ACTIVE);
2051			else
2052				set_page_state(fPage, PAGE_STATE_INACTIVE);
2053
2054			fPage->busy_writing = false;
2055			DEBUG_PAGE_ACCESS_END(fPage);
2056
2057			success = false;
2058		}
2059	}
2060
2061	fCache->NotifyPageEvents(fPage, PAGE_EVENT_NOT_BUSY);
2062	fIsActive = false;
2063
2064	return success;
2065}
2066
2067
2068/*!	The page's cache must be locked.
2069*/
2070void
2071PageWriteTransfer::SetTo(PageWriterRun* run, vm_page* page, int32 maxPages)
2072{
2073	fRun = run;
2074	fCache = page->Cache();
2075	fOffset = page->cache_offset;
2076	fPageCount = 1;
2077	fMaxPages = maxPages;
2078	fStatus = B_OK;
2079
2080	fVecs[0].base = (phys_addr_t)page->physical_page_number << PAGE_SHIFT;
2081	fVecs[0].length = B_PAGE_SIZE;
2082	fVecCount = 1;
2083}
2084
2085
2086/*!	The page's cache must be locked.
2087*/
2088bool
2089PageWriteTransfer::AddPage(vm_page* page)
2090{
2091	if (page->Cache() != fCache
2092		|| (fMaxPages >= 0 && fPageCount >= (uint32)fMaxPages))
2093		return false;
2094
2095	phys_addr_t nextBase = fVecs[fVecCount - 1].base
2096		+ fVecs[fVecCount - 1].length;
2097
2098	if ((phys_addr_t)page->physical_page_number << PAGE_SHIFT == nextBase
2099		&& (off_t)page->cache_offset == fOffset + fPageCount) {
2100		// append to last iovec
2101		fVecs[fVecCount - 1].length += B_PAGE_SIZE;
2102		fPageCount++;
2103		return true;
2104	}
2105
2106	nextBase = fVecs[0].base - B_PAGE_SIZE;
2107	if ((phys_addr_t)page->physical_page_number << PAGE_SHIFT == nextBase
2108		&& (off_t)page->cache_offset == fOffset - 1) {
2109		// prepend to first iovec and adjust offset
2110		fVecs[0].base = nextBase;
2111		fVecs[0].length += B_PAGE_SIZE;
2112		fOffset = page->cache_offset;
2113		fPageCount++;
2114		return true;
2115	}
2116
2117	if (((off_t)page->cache_offset == fOffset + fPageCount
2118			|| (off_t)page->cache_offset == fOffset - 1)
2119		&& fVecCount < sizeof(fVecs) / sizeof(fVecs[0])) {
2120		// not physically contiguous or not in the right order
2121		uint32 vectorIndex;
2122		if ((off_t)page->cache_offset < fOffset) {
2123			// we are pre-pending another vector, move the other vecs
2124			for (uint32 i = fVecCount; i > 0; i--)
2125				fVecs[i] = fVecs[i - 1];
2126
2127			fOffset = page->cache_offset;
2128			vectorIndex = 0;
2129		} else
2130			vectorIndex = fVecCount;
2131
2132		fVecs[vectorIndex].base
2133			= (phys_addr_t)page->physical_page_number << PAGE_SHIFT;
2134		fVecs[vectorIndex].length = B_PAGE_SIZE;
2135
2136		fVecCount++;
2137		fPageCount++;
2138		return true;
2139	}
2140
2141	return false;
2142}
2143
2144
2145status_t
2146PageWriteTransfer::Schedule(uint32 flags)
2147{
2148	off_t writeOffset = (off_t)fOffset << PAGE_SHIFT;
2149	generic_size_t writeLength = (phys_size_t)fPageCount << PAGE_SHIFT;
2150
2151	if (fRun != NULL) {
2152		return fCache->WriteAsync(writeOffset, fVecs, fVecCount, writeLength,
2153			flags | B_PHYSICAL_IO_REQUEST, this);
2154	}
2155
2156	status_t status = fCache->Write(writeOffset, fVecs, fVecCount,
2157		flags | B_PHYSICAL_IO_REQUEST, &writeLength);
2158
2159	SetStatus(status, writeLength);
2160	return fStatus;
2161}
2162
2163
2164void
2165PageWriteTransfer::SetStatus(status_t status, size_t transferred)
2166{
2167	// only succeed if all pages up to the last one have been written fully
2168	// and the last page has at least been written partially
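	// (E.g. with fPageCount == 3, at least 2 * B_PAGE_SIZE + 1 bytes must
	// have been transferred for the transfer to be considered successful.)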
2169	if (status == B_OK && transferred <= (fPageCount - 1) * B_PAGE_SIZE)
2170		status = B_ERROR;
2171
2172	fStatus = status;
2173}
2174
2175
2176void
2177PageWriteTransfer::IOFinished(status_t status, bool partialTransfer,
2178	generic_size_t bytesTransferred)
2179{
2180	SetStatus(status, bytesTransferred);
2181	fRun->PageWritten(this, fStatus, partialTransfer, bytesTransferred);
2182}
2183
2184
2185status_t
2186PageWriterRun::Init(uint32 maxPages)
2187{
2188	fMaxPages = maxPages;
2189	fWrapperCount = 0;
2190	fTransferCount = 0;
2191	fPendingTransfers = 0;
2192
2193	fWrappers = new(std::nothrow) PageWriteWrapper[maxPages];
2194	fTransfers = new(std::nothrow) PageWriteTransfer[maxPages];
2195	if (fWrappers == NULL || fTransfers == NULL)
2196		return B_NO_MEMORY;
2197
2198	return B_OK;
2199}
2200
2201
2202void
2203PageWriterRun::PrepareNextRun()
2204{
2205	fWrapperCount = 0;
2206	fTransferCount = 0;
2207	fPendingTransfers = 0;
2208}
2209
2210
2211/*!	The page's cache must be locked.
2212*/
2213void
2214PageWriterRun::AddPage(vm_page* page)
2215{
2216	fWrappers[fWrapperCount++].SetTo(page);
2217
2218	if (fTransferCount == 0 || !fTransfers[fTransferCount - 1].AddPage(page)) {
2219		fTransfers[fTransferCount++].SetTo(this, page,
2220			page->Cache()->MaxPagesPerAsyncWrite());
2221	}
2222}
2223
2224
2225/*!	Writes all pages previously added.
	\return The number of pages that could neither be written nor otherwise
		handled.
2227*/
2228uint32
2229PageWriterRun::Go()
2230{
2231	fPendingTransfers = fTransferCount;
2232
2233	fAllFinishedCondition.Init(this, "page writer wait for I/O");
2234	ConditionVariableEntry waitEntry;
2235	fAllFinishedCondition.Add(&waitEntry);
2236
2237	// schedule writes
2238	for (uint32 i = 0; i < fTransferCount; i++)
2239		fTransfers[i].Schedule(B_VIP_IO_REQUEST);
2240
2241	// wait until all pages have been written
2242	waitEntry.Wait();
2243
2244	// mark pages depending on whether they could be written or not
2245
2246	uint32 failedPages = 0;
2247	uint32 wrapperIndex = 0;
2248	for (uint32 i = 0; i < fTransferCount; i++) {
2249		PageWriteTransfer& transfer = fTransfers[i];
2250		transfer.Cache()->Lock();
2251
2252		for (uint32 j = 0; j < transfer.PageCount(); j++) {
2253			if (!fWrappers[wrapperIndex++].Done(transfer.Status()))
2254				failedPages++;
2255		}
2256
2257		transfer.Cache()->Unlock();
2258	}
2259
2260	ASSERT(wrapperIndex == fWrapperCount);
2261
2262	for (uint32 i = 0; i < fTransferCount; i++) {
2263		PageWriteTransfer& transfer = fTransfers[i];
2264		struct VMCache* cache = transfer.Cache();
2265
		// We've acquired a reference for each page.
2267		for (uint32 j = 0; j < transfer.PageCount(); j++) {
2268			// We release the cache references after all pages were made
2269			// unbusy again - otherwise releasing a vnode could deadlock.
2270			cache->ReleaseStoreRef();
2271			cache->ReleaseRef();
2272		}
2273	}
2274
2275	return failedPages;
2276}
2277
2278
2279void
2280PageWriterRun::PageWritten(PageWriteTransfer* transfer, status_t status,
2281	bool partialTransfer, size_t bytesTransferred)
2282{
2283	if (atomic_add(&fPendingTransfers, -1) == 1)
2284		fAllFinishedCondition.NotifyAll();
2285}
2286
2287
2288/*!	The page writer continuously takes some pages from the modified
2289	queue, writes them back, and moves them back to the active queue.
2290	It runs in its own thread, and is only there to keep the number
	of modified pages low, so that more pages can be reused at a lower cost.
2293*/
2294status_t
2295page_writer(void* /*unused*/)
2296{
2297	const uint32 kNumPages = 256;
2298#ifdef TRACE_VM_PAGE
2299	uint32 writtenPages = 0;
2300	bigtime_t lastWrittenTime = 0;
2301	bigtime_t pageCollectionTime = 0;
2302	bigtime_t pageWritingTime = 0;
2303#endif
2304
2305	PageWriterRun run;
2306	if (run.Init(kNumPages) != B_OK) {
2307		panic("page writer: Failed to init PageWriterRun!");
2308		return B_ERROR;
2309	}
2310
2311	page_num_t pagesSinceLastSuccessfulWrite = 0;
2312
2313	while (true) {
2314// TODO: Maybe wait shorter when memory is low!
2315		if (sModifiedPageQueue.Count() < kNumPages) {
2316			sPageWriterCondition.Wait(3000000, true);
				// i.e. check every 3 seconds when no one triggers us
2318		}
2319
2320		page_num_t modifiedPages = sModifiedPageQueue.Count();
2321		if (modifiedPages == 0)
2322			continue;
2323
2324		if (modifiedPages <= pagesSinceLastSuccessfulWrite) {
2325			// We ran through the whole queue without being able to write a
2326			// single page. Take a break.
2327			snooze(500000);
2328			pagesSinceLastSuccessfulWrite = 0;
2329		}
2330
2331#if ENABLE_SWAP_SUPPORT
2332		page_stats pageStats;
2333		get_page_stats(pageStats);
2334		bool activePaging = do_active_paging(pageStats);
2335#endif
2336
2337		// depending on how urgent it becomes to get pages to disk, we adjust
2338		// our I/O priority
2339		uint32 lowPagesState = low_resource_state(B_KERNEL_RESOURCE_PAGES);
2340		int32 ioPriority = B_IDLE_PRIORITY;
2341		if (lowPagesState >= B_LOW_RESOURCE_CRITICAL
2342			|| modifiedPages > MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD) {
2343			ioPriority = MAX_PAGE_WRITER_IO_PRIORITY;
2344		} else {
2345			ioPriority = (uint64)MAX_PAGE_WRITER_IO_PRIORITY * modifiedPages
2346				/ MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD;
2347		}
2348
2349		thread_set_io_priority(ioPriority);
2350
2351		uint32 numPages = 0;
2352		run.PrepareNextRun();
2353
2354		// TODO: make this laptop friendly, too (ie. only start doing
2355		// something if someone else did something or there is really
2356		// enough to do).
2357
2358		// collect pages to be written
2359#ifdef TRACE_VM_PAGE
2360		pageCollectionTime -= system_time();
2361#endif
2362
2363		page_num_t maxPagesToSee = modifiedPages;
2364
2365		while (numPages < kNumPages && maxPagesToSee > 0) {
2366			vm_page *page = next_modified_page(maxPagesToSee);
2367			if (page == NULL)
2368				break;
2369
2370			PageCacheLocker cacheLocker(page, false);
2371			if (!cacheLocker.IsLocked())
2372				continue;
2373
2374			VMCache *cache = page->Cache();
2375
2376			// If the page is busy or its state has changed while we were
2377			// locking the cache, just ignore it.
2378			if (page->busy || page->State() != PAGE_STATE_MODIFIED)
2379				continue;
2380
2381			DEBUG_PAGE_ACCESS_START(page);
2382
2383			// Don't write back wired (locked) pages.
2384			if (page->WiredCount() > 0) {
2385				set_page_state(page, PAGE_STATE_ACTIVE);
2386				DEBUG_PAGE_ACCESS_END(page);
2387				continue;
2388			}
2389
2390			// Write back temporary pages only when we're actively paging.
2391			if (cache->temporary
2392#if ENABLE_SWAP_SUPPORT
2393				&& (!activePaging
2394					|| !cache->CanWritePage(
2395							(off_t)page->cache_offset << PAGE_SHIFT))
2396#endif
2397				) {
2398				// We can't/don't want to do anything with this page, so move it
2399				// to one of the other queues.
2400				if (page->mappings.IsEmpty())
2401					set_page_state(page, PAGE_STATE_INACTIVE);
2402				else
2403					set_page_state(page, PAGE_STATE_ACTIVE);
2404
2405				DEBUG_PAGE_ACCESS_END(page);
2406				continue;
2407			}
2408
2409			// We need our own reference to the store, as it might currently be
2410			// destroyed.
2411			if (cache->AcquireUnreferencedStoreRef() != B_OK) {
2412				DEBUG_PAGE_ACCESS_END(page);
2413				cacheLocker.Unlock();
2414				thread_yield(true);
2415				continue;
2416			}
2417
2418			run.AddPage(page);
2419				// TODO: We're possibly adding pages of different caches and
2420				// thus maybe of different underlying file systems here. This
2421				// is a potential problem for loop file systems/devices, since
2422				// we could mark a page busy that would need to be accessed
2423				// when writing back another page, thus causing a deadlock.
2424
2425			DEBUG_PAGE_ACCESS_END(page);
2426
2427			//dprintf("write page %p, cache %p (%ld)\n", page, page->cache, page->cache->ref_count);
2428			TPW(WritePage(page));
2429
2430			cache->AcquireRefLocked();
2431			numPages++;
2432		}
2433
2434#ifdef TRACE_VM_PAGE
2435		pageCollectionTime += system_time();
2436#endif
2437		if (numPages == 0)
2438			continue;
2439
2440		// write pages to disk and do all the cleanup
2441#ifdef TRACE_VM_PAGE
2442		pageWritingTime -= system_time();
2443#endif
2444		uint32 failedPages = run.Go();
2445#ifdef TRACE_VM_PAGE
2446		pageWritingTime += system_time();
2447
2448		// debug output only...
2449		writtenPages += numPages;
2450		if (writtenPages >= 1024) {
2451			bigtime_t now = system_time();
2452			TRACE(("page writer: wrote 1024 pages (total: %" B_PRIu64 " ms, "
2453				"collect: %" B_PRIu64 " ms, write: %" B_PRIu64 " ms)\n",
2454				(now - lastWrittenTime) / 1000,
2455				pageCollectionTime / 1000, pageWritingTime / 1000));
2456			lastWrittenTime = now;
2457
2458			writtenPages -= 1024;
2459			pageCollectionTime = 0;
2460			pageWritingTime = 0;
2461		}
2462#endif
2463
2464		if (failedPages == numPages)
2465			pagesSinceLastSuccessfulWrite += modifiedPages - maxPagesToSee;
2466		else
2467			pagesSinceLastSuccessfulWrite = 0;
2468	}
2469
2470	return B_OK;
2471}
2472
2473
2474// #pragma mark -
2475
2476
2477// TODO: This should be done in the page daemon!
2478#if 0
2479#if ENABLE_SWAP_SUPPORT
2480static bool
2481free_page_swap_space(int32 index)
2482{
2483	vm_page *page = vm_page_at_index(index);
2484	PageCacheLocker locker(page);
2485	if (!locker.IsLocked())
2486		return false;
2487
2488	DEBUG_PAGE_ACCESS_START(page);
2489
2490	VMCache* cache = page->Cache();
2491	if (cache->temporary && page->WiredCount() == 0
2492			&& cache->HasPage(page->cache_offset << PAGE_SHIFT)
2493			&& page->usage_count > 0) {
2494		// TODO: how to judge a page is highly active?
2495		if (swap_free_page_swap_space(page)) {
2496			// We need to mark the page modified, since otherwise it could be
2497			// stolen and we'd lose its data.
2498			vm_page_set_state(page, PAGE_STATE_MODIFIED);
2499			TD(FreedPageSwap(page));
2500			DEBUG_PAGE_ACCESS_END(page);
2501			return true;
2502		}
2503	}
2504	DEBUG_PAGE_ACCESS_END(page);
2505	return false;
2506}
2507#endif
2508#endif	// 0
2509
2510
2511static vm_page *
2512find_cached_page_candidate(struct vm_page &marker)
2513{
2514	DEBUG_PAGE_ACCESS_CHECK(&marker);
2515
2516	InterruptsSpinLocker locker(sCachedPageQueue.GetLock());
2517	vm_page *page;
2518
2519	if (marker.State() == PAGE_STATE_UNUSED) {
		// Get the first page of the cached queue
2521		page = sCachedPageQueue.Head();
2522	} else {
2523		// Get the next page of the current queue
2524		if (marker.State() != PAGE_STATE_CACHED) {
2525			panic("invalid marker %p state", &marker);
2526			return NULL;
2527		}
2528
2529		page = sCachedPageQueue.Next(&marker);
2530		sCachedPageQueue.Remove(&marker);
2531		marker.SetState(PAGE_STATE_UNUSED);
2532	}
2533
2534	while (page != NULL) {
2535		if (!page->busy) {
2536			// we found a candidate, insert marker
2537			marker.SetState(PAGE_STATE_CACHED);
2538			sCachedPageQueue.InsertAfter(page, &marker);
2539			return page;
2540		}
2541
2542		page = sCachedPageQueue.Next(page);
2543	}
2544
2545	return NULL;
2546}
2547
2548
2549static bool
2550free_cached_page(vm_page *page, bool dontWait)
2551{
2552	// try to lock the page's cache
2553	if (vm_cache_acquire_locked_page_cache(page, dontWait) == NULL)
2554		return false;
2555	VMCache* cache = page->Cache();
2556
2557	AutoLocker<VMCache> cacheLocker(cache, true);
2558	MethodDeleter<VMCache> _2(cache, &VMCache::ReleaseRefLocked);
2559
2560	// check again if that page is still a candidate
2561	if (page->busy || page->State() != PAGE_STATE_CACHED)
2562		return false;
2563
2564	DEBUG_PAGE_ACCESS_START(page);
2565
2566	PAGE_ASSERT(page, !page->IsMapped());
2567	PAGE_ASSERT(page, !page->modified);
2568
2569	// we can now steal this page
2570
2571	cache->RemovePage(page);
		// Now the page doesn't have a cache anymore, so no one else (e.g.
		// vm_page_allocate_page_run()) can pick it up, since they would be
		// required to lock the cache first, which would fail.
2575
2576	sCachedPageQueue.RemoveUnlocked(page);
2577	return true;
2578}
2579
2580
2581static uint32
2582free_cached_pages(uint32 pagesToFree, bool dontWait)
2583{
2584	vm_page marker;
2585	init_page_marker(marker);
2586
2587	uint32 pagesFreed = 0;
2588
2589	while (pagesFreed < pagesToFree) {
2590		vm_page *page = find_cached_page_candidate(marker);
2591		if (page == NULL)
2592			break;
2593
2594		if (free_cached_page(page, dontWait)) {
2595			ReadLocker locker(sFreePageQueuesLock);
2596			page->SetState(PAGE_STATE_FREE);
2597			DEBUG_PAGE_ACCESS_END(page);
2598			sFreePageQueue.PrependUnlocked(page);
2599			locker.Unlock();
2600
2601			TA(StolenPage());
2602
2603			pagesFreed++;
2604		}
2605	}
2606
2607	remove_page_marker(marker);
2608
2609	return pagesFreed;
2610}
2611
2612
2613static void
2614idle_scan_active_pages(page_stats& pageStats)
2615{
2616	VMPageQueue& queue = sActivePageQueue;
2617
2618	// We want to scan the whole queue in roughly kIdleRunsForFullQueue runs.
2619	uint32 maxToScan = queue.Count() / kIdleRunsForFullQueue + 1;
2620
2621	while (maxToScan > 0) {
2622		maxToScan--;
2623
2624		// Get the next page. Note that we don't bother to lock here. We go with
2625		// the assumption that on all architectures reading/writing pointers is
2626		// atomic. Beyond that it doesn't really matter. We have to unlock the
2627		// queue anyway to lock the page's cache, and we'll recheck afterwards.
2628		vm_page* page = queue.Head();
2629		if (page == NULL)
2630			break;
2631
2632		// lock the page's cache
2633		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2634		if (cache == NULL)
2635			continue;
2636
2637		if (page->State() != PAGE_STATE_ACTIVE) {
2638			// page is no longer in the cache or in this queue
2639			cache->ReleaseRefAndUnlock();
2640			continue;
2641		}
2642
2643		if (page->busy) {
2644			// page is busy -- requeue at the end
2645			vm_page_requeue(page, true);
2646			cache->ReleaseRefAndUnlock();
2647			continue;
2648		}
2649
2650		DEBUG_PAGE_ACCESS_START(page);
2651
2652		// Get the page active/modified flags and update the page's usage count.
		// We completely unmap inactive temporary pages. This saves us from
		// having to iterate through the inactive list as well, since we'll
		// be notified via page fault whenever such an inactive page is used
		// again.
2656		// We don't remove the mappings of non-temporary pages, since we
2657		// wouldn't notice when those would become unused and could thus be
2658		// moved to the cached list.
2659		int32 usageCount;
2660		if (page->WiredCount() > 0 || page->usage_count > 0
2661			|| !cache->temporary) {
2662			usageCount = vm_clear_page_mapping_accessed_flags(page);
2663		} else
2664			usageCount = vm_remove_all_page_mappings_if_unaccessed(page);
2665
2666		if (usageCount > 0) {
2667			usageCount += page->usage_count + kPageUsageAdvance;
2668			if (usageCount > kPageUsageMax)
2669				usageCount = kPageUsageMax;
2670// TODO: This would probably also be the place to reclaim swap space.
2671		} else {
2672			usageCount += page->usage_count - (int32)kPageUsageDecline;
2673			if (usageCount < 0) {
2674				usageCount = 0;
2675				set_page_state(page, PAGE_STATE_INACTIVE);
2676			}
2677		}
2678
2679		page->usage_count = usageCount;
2680
2681		DEBUG_PAGE_ACCESS_END(page);
2682
2683		cache->ReleaseRefAndUnlock();
2684	}
2685}
2686
2687
2688static void
2689full_scan_inactive_pages(page_stats& pageStats, int32 despairLevel)
2690{
2691	int32 pagesToFree = pageStats.unsatisfiedReservations
2692		+ sFreeOrCachedPagesTarget
2693		- (pageStats.totalFreePages + pageStats.cachedPages);
2694	if (pagesToFree <= 0)
2695		return;
2696
2697	bigtime_t time = system_time();
2698	uint32 pagesScanned = 0;
2699	uint32 pagesToCached = 0;
2700	uint32 pagesToModified = 0;
2701	uint32 pagesToActive = 0;
2702
2703	// Determine how many pages at maximum to send to the modified queue. Since
2704	// it is relatively expensive to page out pages, we do that on a grander
2705	// scale only when things get desperate.
2706	uint32 maxToFlush = despairLevel <= 1 ? 32 : 10000;
2707
2708	vm_page marker;
2709	init_page_marker(marker);
2710
2711	VMPageQueue& queue = sInactivePageQueue;
2712	InterruptsSpinLocker queueLocker(queue.GetLock());
2713	uint32 maxToScan = queue.Count();
2714
2715	vm_page* nextPage = queue.Head();
2716
2717	while (pagesToFree > 0 && maxToScan > 0) {
2718		maxToScan--;
2719
2720		// get the next page
2721		vm_page* page = nextPage;
2722		if (page == NULL)
2723			break;
2724		nextPage = queue.Next(page);
2725
2726		if (page->busy)
2727			continue;
2728
2729		// mark the position
2730		queue.InsertAfter(page, &marker);
2731		queueLocker.Unlock();
2732
2733		// lock the page's cache
2734		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2735		if (cache == NULL || page->busy
2736				|| page->State() != PAGE_STATE_INACTIVE) {
2737			if (cache != NULL)
2738				cache->ReleaseRefAndUnlock();
2739			queueLocker.Lock();
2740			nextPage = queue.Next(&marker);
2741			queue.Remove(&marker);
2742			continue;
2743		}
2744
2745		pagesScanned++;
2746
2747		DEBUG_PAGE_ACCESS_START(page);
2748
2749		// Get the accessed count, clear the accessed/modified flags and
2750		// unmap the page, if it hasn't been accessed.
2751		int32 usageCount;
2752		if (page->WiredCount() > 0)
2753			usageCount = vm_clear_page_mapping_accessed_flags(page);
2754		else
2755			usageCount = vm_remove_all_page_mappings_if_unaccessed(page);
2756
2757		// update usage count
2758		if (usageCount > 0) {
2759			usageCount += page->usage_count + kPageUsageAdvance;
2760			if (usageCount > kPageUsageMax)
2761				usageCount = kPageUsageMax;
2762		} else {
2763			usageCount += page->usage_count - (int32)kPageUsageDecline;
2764			if (usageCount < 0)
2765				usageCount = 0;
2766		}
2767
2768		page->usage_count = usageCount;
2769
2770		// Move to fitting queue or requeue:
2771		// * Active mapped pages go to the active queue.
2772		// * Inactive mapped (i.e. wired) pages are requeued.
2773		// * The remaining pages are cachable. Thus, if unmodified they go to
2774		//   the cached queue, otherwise to the modified queue (up to a limit).
		//   Note that unlike in the idle scan we don't exempt pages of
		//   temporary caches here. Apparently we really need memory, so we'd
		//   better page out memory as well.
2778		bool isMapped = page->IsMapped();
2779		if (usageCount > 0) {
2780			if (isMapped) {
2781				set_page_state(page, PAGE_STATE_ACTIVE);
2782				pagesToActive++;
2783			} else
2784				vm_page_requeue(page, true);
2785		} else if (isMapped) {
2786			vm_page_requeue(page, true);
2787		} else if (!page->modified) {
2788			set_page_state(page, PAGE_STATE_CACHED);
2789			pagesToFree--;
2790			pagesToCached++;
2791		} else if (maxToFlush > 0) {
2792			set_page_state(page, PAGE_STATE_MODIFIED);
2793			maxToFlush--;
2794			pagesToModified++;
2795		} else
2796			vm_page_requeue(page, true);
2797
2798		DEBUG_PAGE_ACCESS_END(page);
2799
2800		cache->ReleaseRefAndUnlock();
2801
2802		// remove the marker
2803		queueLocker.Lock();
2804		nextPage = queue.Next(&marker);
2805		queue.Remove(&marker);
2806	}
2807
2808	queueLocker.Unlock();
2809
2810	time = system_time() - time;
2811	TRACE_DAEMON("  -> inactive scan (%7" B_PRId64 " us): scanned: %7" B_PRIu32
2812		", moved: %" B_PRIu32 " -> cached, %" B_PRIu32 " -> modified, %"
2813		B_PRIu32 " -> active\n", time, pagesScanned, pagesToCached,
2814		pagesToModified, pagesToActive);
2815
2816	// wake up the page writer, if we tossed it some pages
2817	if (pagesToModified > 0)
2818		sPageWriterCondition.WakeUp();
2819}
2820
2821
2822static void
2823full_scan_active_pages(page_stats& pageStats, int32 despairLevel)
2824{
2825	vm_page marker;
2826	init_page_marker(marker);
2827
2828	VMPageQueue& queue = sActivePageQueue;
2829	InterruptsSpinLocker queueLocker(queue.GetLock());
2830	uint32 maxToScan = queue.Count();
2831
2832	int32 pagesToDeactivate = pageStats.unsatisfiedReservations
2833		+ sFreeOrCachedPagesTarget
2834		- (pageStats.totalFreePages + pageStats.cachedPages)
2835		+ std::max((int32)sInactivePagesTarget - (int32)maxToScan, (int32)0);
2836	if (pagesToDeactivate <= 0)
2837		return;
2838
2839	bigtime_t time = system_time();
2840	uint32 pagesAccessed = 0;
2841	uint32 pagesToInactive = 0;
2842	uint32 pagesScanned = 0;
2843
2844	vm_page* nextPage = queue.Head();
2845
2846	while (pagesToDeactivate > 0 && maxToScan > 0) {
2847		maxToScan--;
2848
2849		// get the next page
2850		vm_page* page = nextPage;
2851		if (page == NULL)
2852			break;
2853		nextPage = queue.Next(page);
2854
2855		if (page->busy)
2856			continue;
2857
2858		// mark the position
2859		queue.InsertAfter(page, &marker);
2860		queueLocker.Unlock();
2861
2862		// lock the page's cache
2863		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2864		if (cache == NULL || page->busy || page->State() != PAGE_STATE_ACTIVE) {
2865			if (cache != NULL)
2866				cache->ReleaseRefAndUnlock();
2867			queueLocker.Lock();
2868			nextPage = queue.Next(&marker);
2869			queue.Remove(&marker);
2870			continue;
2871		}
2872
2873		pagesScanned++;
2874
2875		DEBUG_PAGE_ACCESS_START(page);
2876
2877		// Get the page active/modified flags and update the page's usage count.
2878		int32 usageCount = vm_clear_page_mapping_accessed_flags(page);
2879
2880		if (usageCount > 0) {
2881			usageCount += page->usage_count + kPageUsageAdvance;
2882			if (usageCount > kPageUsageMax)
2883				usageCount = kPageUsageMax;
2884			pagesAccessed++;
2885// TODO: This would probably also be the place to reclaim swap space.
2886		} else {
2887			usageCount += page->usage_count - (int32)kPageUsageDecline;
2888			if (usageCount <= 0) {
2889				usageCount = 0;
2890				set_page_state(page, PAGE_STATE_INACTIVE);
2891				pagesToInactive++;
2892			}
2893		}
2894
2895		page->usage_count = usageCount;
2896
2897		DEBUG_PAGE_ACCESS_END(page);
2898
2899		cache->ReleaseRefAndUnlock();
2900
2901		// remove the marker
2902		queueLocker.Lock();
2903		nextPage = queue.Next(&marker);
2904		queue.Remove(&marker);
2905	}
2906
2907	time = system_time() - time;
2908	TRACE_DAEMON("  ->   active scan (%7" B_PRId64 " us): scanned: %7" B_PRIu32
2909		", moved: %" B_PRIu32 " -> inactive, encountered %" B_PRIu32 " accessed"
2910		" ones\n", time, pagesScanned, pagesToInactive, pagesAccessed);
2911}
2912
2913
2914static void
2915page_daemon_idle_scan(page_stats& pageStats)
2916{
2917	TRACE_DAEMON("page daemon: idle run\n");
2918
2919	if (pageStats.totalFreePages < (int32)sFreePagesTarget) {
2920		// We want more actually free pages, so free some from the cached
2921		// ones.
2922		uint32 freed = free_cached_pages(
2923			sFreePagesTarget - pageStats.totalFreePages, false);
2924		if (freed > 0)
2925			unreserve_pages(freed);
2926		get_page_stats(pageStats);
2927	}
2928
2929	// Walk the active list and move pages to the inactive queue.
2930	get_page_stats(pageStats);
2931	idle_scan_active_pages(pageStats);
2932}
2933
2934
2935static void
2936page_daemon_full_scan(page_stats& pageStats, int32 despairLevel)
2937{
2938	TRACE_DAEMON("page daemon: full run: free: %" B_PRIu32 ", cached: %"
2939		B_PRIu32 ", to free: %" B_PRIu32 "\n", pageStats.totalFreePages,
2940		pageStats.cachedPages, pageStats.unsatisfiedReservations
2941			+ sFreeOrCachedPagesTarget
2942			- (pageStats.totalFreePages + pageStats.cachedPages));
2943
2944	// Walk the inactive list and transfer pages to the cached and modified
2945	// queues.
2946	full_scan_inactive_pages(pageStats, despairLevel);
2947
2948	// Free cached pages. Also wake up reservation waiters.
2949	get_page_stats(pageStats);
2950	int32 pagesToFree = pageStats.unsatisfiedReservations + sFreePagesTarget
2951		- (pageStats.totalFreePages);
2952	if (pagesToFree > 0) {
2953		uint32 freed = free_cached_pages(pagesToFree, true);
2954		if (freed > 0)
2955			unreserve_pages(freed);
2956	}
2957
2958	// Walk the active list and move pages to the inactive queue.
2959	get_page_stats(pageStats);
2960	full_scan_active_pages(pageStats, despairLevel);
2961}
2962
2963
2964static status_t
2965page_daemon(void* /*unused*/)
2966{
2967	int32 despairLevel = 0;
2968
2969	while (true) {
2970		sPageDaemonCondition.ClearActivated();
2971
2972		// evaluate the free pages situation
2973		page_stats pageStats;
2974		get_page_stats(pageStats);
2975
2976		if (!do_active_paging(pageStats)) {
2977			// Things look good -- just maintain statistics and keep the pool
2978			// of actually free pages full enough.
2979			despairLevel = 0;
2980			page_daemon_idle_scan(pageStats);
2981			sPageDaemonCondition.Wait(kIdleScanWaitInterval, false);
2982		} else {
2983			// Not enough free pages. We need to do some real work.
			despairLevel = std::min(despairLevel + 1, (int32)3);
				// raise the despair level, but cap it at 3
2985			page_daemon_full_scan(pageStats, despairLevel);
2986
2987			// Don't wait after the first full scan, but rather immediately
2988			// check whether we were successful in freeing enough pages and
2989			// re-run with increased despair level. The first scan is
2990			// conservative with respect to moving inactive modified pages to
2991			// the modified list to avoid thrashing. The second scan, however,
2992			// will not hold back.
2993			if (despairLevel > 1)
2994				snooze(kBusyScanWaitInterval);
2995		}
2996	}
2997
2998	return B_OK;
2999}
3000
3001
3002/*!	Returns how many pages could *not* be reserved.
3003*/
3004static uint32
3005reserve_pages(uint32 count, int priority, bool dontWait)
3006{
3007	int32 dontTouch = kPageReserveForPriority[priority];
3008
3009	while (true) {
3010		count -= reserve_some_pages(count, dontTouch);
3011		if (count == 0)
3012			return 0;
3013
3014		if (sUnsatisfiedPageReservations == 0) {
3015			count -= free_cached_pages(count, dontWait);
3016			if (count == 0)
3017				return count;
3018		}
3019
3020		if (dontWait)
3021			return count;
3022
3023		// we need to wait for pages to become available
3024
3025		MutexLocker pageDeficitLocker(sPageDeficitLock);
3026
3027		bool notifyDaemon = sUnsatisfiedPageReservations == 0;
3028		sUnsatisfiedPageReservations += count;
3029
3030		if (sUnreservedFreePages > dontTouch) {
3031			// the situation changed
3032			sUnsatisfiedPageReservations -= count;
3033			continue;
3034		}
3035
3036		PageReservationWaiter waiter;
3037		waiter.dontTouch = dontTouch;
3038		waiter.missing = count;
3039		waiter.thread = thread_get_current_thread();
3040		waiter.threadPriority = waiter.thread->priority;
3041
3042		// insert ordered (i.e. after all waiters with higher or equal priority)
3043		PageReservationWaiter* otherWaiter = NULL;
3044		for (PageReservationWaiterList::Iterator it
3045				= sPageReservationWaiters.GetIterator();
3046			(otherWaiter = it.Next()) != NULL;) {
3047			if (waiter < *otherWaiter)
3048				break;
3049		}
3050
3051		sPageReservationWaiters.InsertBefore(otherWaiter, &waiter);
3052
3053		thread_prepare_to_block(waiter.thread, 0, THREAD_BLOCK_TYPE_OTHER,
3054			"waiting for pages");
3055
3056		if (notifyDaemon)
3057			sPageDaemonCondition.WakeUp();
3058
3059		pageDeficitLocker.Unlock();
3060
3061		low_resource(B_KERNEL_RESOURCE_PAGES, count, B_RELATIVE_TIMEOUT, 0);
3062		thread_block();
3063
3064		pageDeficitLocker.Lock();
3065
3066		return 0;
3067	}
3068}
3069
3070
3071//	#pragma mark - private kernel API
3072
3073
3074/*!	Writes a range of modified pages of a cache to disk.
3075	You need to hold the VMCache lock when calling this function.
3076	Note that the cache lock is released in this function.
3077	\param cache The cache.
3078	\param firstPage Offset (in page size units) of the first page in the range.
3079	\param endPage End offset (in page size units) of the page range. The page
3080		at this offset is not included.
3081*/
3082status_t
3083vm_page_write_modified_page_range(struct VMCache* cache, uint32 firstPage,
3084	uint32 endPage)
3085{
3086	static const int32 kMaxPages = 256;
3087	int32 maxPages = cache->MaxPagesPerWrite();
3088	if (maxPages < 0 || maxPages > kMaxPages)
3089		maxPages = kMaxPages;
3090
3091	const uint32 allocationFlags = HEAP_DONT_WAIT_FOR_MEMORY
3092		| HEAP_DONT_LOCK_KERNEL_SPACE;
3093
3094	PageWriteWrapper stackWrappers[2];
3095	PageWriteWrapper* wrapperPool
3096		= new(malloc_flags(allocationFlags)) PageWriteWrapper[maxPages + 1];
3097	if (wrapperPool == NULL) {
3098		// don't fail, just limit our capabilities
3099		wrapperPool = stackWrappers;
3100		maxPages = 1;
3101	}
3102
3103	int32 nextWrapper = 0;
3104
3105	PageWriteWrapper* wrappers[maxPages];
3106	int32 usedWrappers = 0;
3107
3108	PageWriteTransfer transfer;
3109	bool transferEmpty = true;
3110
3111	VMCachePagesTree::Iterator it
3112		= cache->pages.GetIterator(firstPage, true, true);
3113
3114	while (true) {
3115		vm_page* page = it.Next();
3116		if (page == NULL || page->cache_offset >= endPage) {
3117			if (transferEmpty)
3118				break;
3119
3120			page = NULL;
3121		}
3122
3123		if (page != NULL) {
3124			if (page->busy
3125				|| (page->State() != PAGE_STATE_MODIFIED
3126					&& !vm_test_map_modification(page))) {
3127				page = NULL;
3128			}
3129		}
3130
3131		PageWriteWrapper* wrapper = NULL;
3132		if (page != NULL) {
3133			wrapper = &wrapperPool[nextWrapper++];
3134			if (nextWrapper > maxPages)
3135				nextWrapper = 0;
3136
3137			DEBUG_PAGE_ACCESS_START(page);
3138
3139			wrapper->SetTo(page);
3140
3141			if (transferEmpty || transfer.AddPage(page)) {
3142				if (transferEmpty) {
3143					transfer.SetTo(NULL, page, maxPages);
3144					transferEmpty = false;
3145				}
3146
3147				DEBUG_PAGE_ACCESS_END(page);
3148
3149				wrappers[usedWrappers++] = wrapper;
3150				continue;
3151			}
3152
3153			DEBUG_PAGE_ACCESS_END(page);
3154		}
3155
3156		if (transferEmpty)
3157			continue;
3158
3159		cache->Unlock();
3160		status_t status = transfer.Schedule(0);
3161		cache->Lock();
3162
3163		for (int32 i = 0; i < usedWrappers; i++)
3164			wrappers[i]->Done(status);
3165
3166		usedWrappers = 0;
3167
3168		if (page != NULL) {
3169			transfer.SetTo(NULL, page, maxPages);
3170			wrappers[usedWrappers++] = wrapper;
3171		} else
3172			transferEmpty = true;
3173	}
3174
3175	if (wrapperPool != stackWrappers)
3176		delete[] wrapperPool;
3177
3178	return B_OK;
3179}
3180
3181
3182/*!	You need to hold the VMCache lock when calling this function.
3183	Note that the cache lock is released in this function.
3184*/
3185status_t
3186vm_page_write_modified_pages(VMCache *cache)
3187{
3188	return vm_page_write_modified_page_range(cache, 0,
3189		(cache->virtual_end + B_PAGE_SIZE - 1) >> PAGE_SHIFT);
3190}
3191
3192
3193/*!	Schedules the page writer to write back the specified \a page.
3194	Note, however, that it might not do this immediately, and it can well
3195	take several seconds until the page is actually written out.
3196*/
3197void
3198vm_page_schedule_write_page(vm_page *page)
3199{
3200	PAGE_ASSERT(page, page->State() == PAGE_STATE_MODIFIED);
3201
3202	vm_page_requeue(page, false);
3203
3204	sPageWriterCondition.WakeUp();
3205}
3206
3207
3208/*!	Cache must be locked.
3209*/
3210void
3211vm_page_schedule_write_page_range(struct VMCache *cache, uint32 firstPage,
3212	uint32 endPage)
3213{
3214	uint32 modified = 0;
3215	for (VMCachePagesTree::Iterator it
3216				= cache->pages.GetIterator(firstPage, true, true);
3217			vm_page *page = it.Next();) {
3218		if (page->cache_offset >= endPage)
3219			break;
3220
3221		if (!page->busy && page->State() == PAGE_STATE_MODIFIED) {
3222			DEBUG_PAGE_ACCESS_START(page);
3223			vm_page_requeue(page, false);
3224			modified++;
3225			DEBUG_PAGE_ACCESS_END(page);
3226		}
3227	}
3228
3229	if (modified > 0)
3230		sPageWriterCondition.WakeUp();
3231}
3232
3233
3234void
3235vm_page_init_num_pages(kernel_args *args)
3236{
3237	// calculate the size of memory by looking at the physical_memory_range array
3238	sPhysicalPageOffset = args->physical_memory_range[0].start / B_PAGE_SIZE;
3239	page_num_t physicalPagesEnd = sPhysicalPageOffset
3240		+ args->physical_memory_range[0].size / B_PAGE_SIZE;
3241
3242	sNonExistingPages = 0;
3243	sIgnoredPages = args->ignored_physical_memory / B_PAGE_SIZE;
3244
3245	for (uint32 i = 1; i < args->num_physical_memory_ranges; i++) {
3246		page_num_t start = args->physical_memory_range[i].start / B_PAGE_SIZE;
3247		if (start > physicalPagesEnd)
3248			sNonExistingPages += start - physicalPagesEnd;
3249		physicalPagesEnd = start
3250			+ args->physical_memory_range[i].size / B_PAGE_SIZE;
3251
3252#ifdef LIMIT_AVAILABLE_MEMORY
3253		page_num_t available
3254			= physicalPagesEnd - sPhysicalPageOffset - sNonExistingPages;
3255		if (available > LIMIT_AVAILABLE_MEMORY * (1024 * 1024 / B_PAGE_SIZE)) {
3256			physicalPagesEnd = sPhysicalPageOffset + sNonExistingPages
3257				+ LIMIT_AVAILABLE_MEMORY * (1024 * 1024 / B_PAGE_SIZE);
3258			break;
3259		}
3260#endif
3261	}
3262
3263	TRACE(("first phys page = %#" B_PRIxPHYSADDR ", end %#" B_PRIxPHYSADDR "\n",
3264		sPhysicalPageOffset, physicalPagesEnd));
3265
3266	sNumPages = physicalPagesEnd - sPhysicalPageOffset;
3267}
3268
3269
3270status_t
3271vm_page_init(kernel_args *args)
3272{
3273	TRACE(("vm_page_init: entry\n"));
3274
3275	// init page queues
3276	sModifiedPageQueue.Init("modified pages queue");
3277	sInactivePageQueue.Init("inactive pages queue");
3278	sActivePageQueue.Init("active pages queue");
3279	sCachedPageQueue.Init("cached pages queue");
3280	sFreePageQueue.Init("free pages queue");
3281	sClearPageQueue.Init("clear pages queue");
3282
3283	new (&sPageReservationWaiters) PageReservationWaiterList;
3284
3285	// map in the new free page table
3286	sPages = (vm_page *)vm_allocate_early(args, sNumPages * sizeof(vm_page),
3287		~0L, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA, 0);
3288
3289	TRACE(("vm_init: putting free_page_table @ %p, # ents %" B_PRIuPHYSADDR
3290		" (size %#" B_PRIxPHYSADDR ")\n", sPages, sNumPages,
3291		(phys_addr_t)(sNumPages * sizeof(vm_page))));
3292
3293	// initialize the free page table
3294	for (uint32 i = 0; i < sNumPages; i++) {
3295		sPages[i].Init(sPhysicalPageOffset + i);
3296		sFreePageQueue.Append(&sPages[i]);
3297
3298#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3299		sPages[i].allocation_tracking_info.Clear();
3300#endif
3301	}
3302
3303	sUnreservedFreePages = sNumPages;
3304
3305	TRACE(("initialized table\n"));
3306
	// mark the gaps between the usable physical memory ranges as unused
3308	phys_addr_t previousEnd = 0;
3309	for (uint32 i = 0; i < args->num_physical_memory_ranges; i++) {
3310		phys_addr_t base = args->physical_memory_range[i].start;
3311		phys_size_t size = args->physical_memory_range[i].size;
3312		if (base > previousEnd) {
3313			mark_page_range_in_use(previousEnd / B_PAGE_SIZE,
3314				(base - previousEnd) / B_PAGE_SIZE, false);
3315		}
3316		previousEnd = base + size;
3317	}
3318
3319	// mark the allocated physical page ranges wired
3320	for (uint32 i = 0; i < args->num_physical_allocated_ranges; i++) {
3321		mark_page_range_in_use(
3322			args->physical_allocated_range[i].start / B_PAGE_SIZE,
3323			args->physical_allocated_range[i].size / B_PAGE_SIZE, true);
3324	}
3325
3326	// The target of actually free pages. This must be at least the system
3327	// reserve, but should be a few more pages, so we don't have to extract
3328	// a cached page with each allocation.
3329	sFreePagesTarget = VM_PAGE_RESERVE_USER
3330		+ std::max((page_num_t)32, (sNumPages - sNonExistingPages) / 1024);
3331
3332	// The target of free + cached and inactive pages. On low-memory machines
3333	// keep things tight. free + cached is the pool of immediately allocatable
3334	// pages. We want a few inactive pages, so when we're actually paging, we
3335	// have a reasonably large set of pages to work with.
3336	if (sUnreservedFreePages < 16 * 1024) {
3337		sFreeOrCachedPagesTarget = sFreePagesTarget + 128;
3338		sInactivePagesTarget = sFreePagesTarget / 3;
3339	} else {
3340		sFreeOrCachedPagesTarget = 2 * sFreePagesTarget;
3341		sInactivePagesTarget = sFreePagesTarget / 2;
3342	}
3343
3344	TRACE(("vm_page_init: exit\n"));
3345
3346	return B_OK;
3347}
3348
3349
3350status_t
3351vm_page_init_post_area(kernel_args *args)
3352{
3353	void *dummy;
3354
3355	dummy = sPages;
3356	create_area("page structures", &dummy, B_EXACT_ADDRESS,
3357		PAGE_ALIGN(sNumPages * sizeof(vm_page)), B_ALREADY_WIRED,
3358		B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
3359
3360	add_debugger_command("page_stats", &dump_page_stats,
3361		"Dump statistics about page usage");
3362	add_debugger_command_etc("page", &dump_page,
3363		"Dump page info",
3364		"[ \"-p\" | \"-v\" ] [ \"-m\" ] <address>\n"
3365		"Prints information for the physical page. If neither \"-p\" nor\n"
3366		"\"-v\" are given, the provided address is interpreted as address of\n"
3367		"the vm_page data structure for the page in question. If \"-p\" is\n"
3368		"given, the address is the physical address of the page. If \"-v\" is\n"
3369		"given, the address is interpreted as virtual address in the current\n"
3370		"thread's address space and for the page it is mapped to (if any)\n"
3371		"information are printed. If \"-m\" is specified, the command will\n"
3372		"search all known address spaces for mappings to that page and print\n"
3373		"them.\n", 0);
3374	add_debugger_command("page_queue", &dump_page_queue, "Dump page queue");
3375	add_debugger_command("find_page", &find_page,
3376		"Find out which queue a page is actually in");
3377
3378#ifdef TRACK_PAGE_USAGE_STATS
3379	add_debugger_command_etc("page_usage", &dump_page_usage_stats,
3380		"Dumps statistics about page usage counts",
3381		"\n"
3382		"Dumps statistics about page usage counts.\n",
3383		B_KDEBUG_DONT_PARSE_ARGUMENTS);
3384#endif
3385
3386#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3387	add_debugger_command_etc("page_allocations_per_caller",
3388		&dump_page_allocations_per_caller,
3389		"Dump current page allocations summed up per caller",
3390		"[ -d <caller> ] [ -r ]\n"
3391		"The current allocations will by summed up by caller (their count)\n"
3392		"printed in decreasing order by count.\n"
3393		"If \"-d\" is given, each allocation for caller <caller> is printed\n"
3394		"including the respective stack trace.\n"
3395		"If \"-r\" is given, the allocation infos are reset after gathering\n"
3396		"the information, so the next command invocation will only show the\n"
3397		"allocations made after the reset.\n", 0);
3398	add_debugger_command_etc("page_allocation_infos",
3399		&dump_page_allocation_infos,
3400		"Dump current page allocations",
3401		"[ --stacktrace ] [ -p <page number> ] [ --team <team ID> ] "
3402		"[ --thread <thread ID> ]\n"
3403		"The current allocations filtered by optional values will be printed.\n"
3404		"The optional \"-p\" page number filters for a specific page,\n"
3405		"with \"--team\" and \"--thread\" allocations by specific teams\n"
3406		"and/or threads can be filtered (these only work if a corresponding\n"
3407		"tracing entry is still available).\n"
3408		"If \"--stacktrace\" is given, then stack traces of the allocation\n"
3409		"callers are printed, where available\n", 0);
3410#endif
3411
3412	return B_OK;
3413}
3414
3415
3416status_t
3417vm_page_init_post_thread(kernel_args *args)
3418{
3419	new (&sFreePageCondition) ConditionVariable;
3420	sFreePageCondition.Publish(&sFreePageQueue, "free page");
3421
3422	// create a kernel thread to clear out pages
3423
3424	thread_id thread = spawn_kernel_thread(&page_scrubber, "page scrubber",
3425		B_LOWEST_ACTIVE_PRIORITY, NULL);
3426	resume_thread(thread);
3427
3428	// start page writer
3429
3430	sPageWriterCondition.Init("page writer");
3431
3432	thread = spawn_kernel_thread(&page_writer, "page writer",
3433		B_NORMAL_PRIORITY + 1, NULL);
3434	resume_thread(thread);
3435
3436	// start page daemon
3437
3438	sPageDaemonCondition.Init("page daemon");
3439
3440	thread = spawn_kernel_thread(&page_daemon, "page daemon",
3441		B_NORMAL_PRIORITY, NULL);
3442	resume_thread(thread);
3443
3444	return B_OK;
3445}
3446
3447
3448status_t
3449vm_mark_page_inuse(page_num_t page)
3450{
3451	return vm_mark_page_range_inuse(page, 1);
3452}
3453
3454
3455status_t
3456vm_mark_page_range_inuse(page_num_t startPage, page_num_t length)
3457{
3458	return mark_page_range_in_use(startPage, length, false);
3459}
3460
3461
3462/*!	Unreserve pages previously reserved with vm_page_reserve_pages().
3463*/
3464void
3465vm_page_unreserve_pages(vm_page_reservation* reservation)
3466{
3467	uint32 count = reservation->count;
3468	reservation->count = 0;
3469
3470	if (count == 0)
3471		return;
3472
3473	TA(UnreservePages(count));
3474
3475	unreserve_pages(count);
3476}
3477
3478
3479/*!	With this call, you can reserve a number of free pages in the system.
3480	They will only be handed out to someone who has actually reserved them.
3481	This call returns as soon as the number of requested pages has been
3482	reached.
3483	The caller must not hold any cache lock or the function might deadlock.
3484*/
3485void
3486vm_page_reserve_pages(vm_page_reservation* reservation, uint32 count,
3487	int priority)
3488{
3489	reservation->count = count;
3490
3491	if (count == 0)
3492		return;
3493
3494	TA(ReservePages(count));
3495
3496	reserve_pages(count, priority, false);
3497}
3498
3499
3500bool
3501vm_page_try_reserve_pages(vm_page_reservation* reservation, uint32 count,
3502	int priority)
3503{
3504	if (count == 0) {
3505		reservation->count = count;
3506		return true;
3507	}
3508
3509	uint32 remaining = reserve_pages(count, priority, true);
3510	if (remaining == 0) {
3511		TA(ReservePages(count));
3512		reservation->count = count;
3513		return true;
3514	}
3515
3516	unreserve_pages(count - remaining);
3517
3518	return false;
3519}
3520
3521
3522vm_page *
3523vm_page_allocate_page(vm_page_reservation* reservation, uint32 flags)
3524{
3525	uint32 pageState = flags & VM_PAGE_ALLOC_STATE;
3526	ASSERT(pageState != PAGE_STATE_FREE);
3527	ASSERT(pageState != PAGE_STATE_CLEAR);
3528
3529	ASSERT(reservation->count > 0);
3530	reservation->count--;
3531
3532	VMPageQueue* queue;
3533	VMPageQueue* otherQueue;
3534
3535	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0) {
3536		queue = &sClearPageQueue;
3537		otherQueue = &sFreePageQueue;
3538	} else {
3539		queue = &sFreePageQueue;
3540		otherQueue = &sClearPageQueue;
3541	}
3542
3543	ReadLocker locker(sFreePageQueuesLock);
3544
3545	vm_page* page = queue->RemoveHeadUnlocked();
3546	if (page == NULL) {
3547		// if the primary queue was empty, grab the page from the
3548		// secondary queue
3549		page = otherQueue->RemoveHeadUnlocked();
3550
3551		if (page == NULL) {
3552			// Unlikely, but possible: the page we have reserved has moved
3553			// between the queues after we checked the first queue. Grab the
3554			// write locker to make sure this doesn't happen again.
3555			locker.Unlock();
3556			WriteLocker writeLocker(sFreePageQueuesLock);
3557
3558			page = queue->RemoveHead();
3559			if (page == NULL)
				page = otherQueue->RemoveHead();
3561
3562			if (page == NULL) {
3563				panic("Had reserved page, but there is none!");
3564				return NULL;
3565			}
3566
3567			// downgrade to read lock
3568			locker.Lock();
3569		}
3570	}
3571
3572	if (page->CacheRef() != NULL)
3573		panic("supposed to be free page %p has cache\n", page);
3574
3575	DEBUG_PAGE_ACCESS_START(page);
3576
3577	int oldPageState = page->State();
3578	page->SetState(pageState);
3579	page->busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3580	page->usage_count = 0;
3581	page->accessed = false;
3582	page->modified = false;
3583
3584	locker.Unlock();
3585
3586	if (pageState < PAGE_STATE_FIRST_UNQUEUED)
3587		sPageQueues[pageState].AppendUnlocked(page);
3588
3589	// clear the page, if we had to take it from the free queue and a clear
3590	// page was requested
3591	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0 && oldPageState != PAGE_STATE_CLEAR)
3592		clear_page(page);
3593
3594#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3595	page->allocation_tracking_info.Init(
3596		TA(AllocatePage(page->physical_page_number)));
3597#else
3598	TA(AllocatePage(page->physical_page_number));
3599#endif
3600
3601	return page;
3602}
3603
3604
3605static void
3606allocate_page_run_cleanup(VMPageQueue::PageList& freePages,
3607	VMPageQueue::PageList& clearPages)
3608{
3609	while (vm_page* page = freePages.RemoveHead()) {
3610		page->busy = false;
3611		page->SetState(PAGE_STATE_FREE);
3612		DEBUG_PAGE_ACCESS_END(page);
3613		sFreePageQueue.PrependUnlocked(page);
3614	}
3615
3616	while (vm_page* page = clearPages.RemoveHead()) {
3617		page->busy = false;
3618		page->SetState(PAGE_STATE_CLEAR);
3619		DEBUG_PAGE_ACCESS_END(page);
3620		sClearPageQueue.PrependUnlocked(page);
3621	}
3622}
3623
3624
3625/*!	Tries to allocate the a contiguous run of \a length pages starting at
3626	index \a start.
3627
3628	The caller must have write-locked the free/clear page queues. The function
3629	will unlock regardless of whether it succeeds or fails.
3630
3631	If the function fails, it cleans up after itself, i.e. it will free all
3632	pages it managed to allocate.
3633
3634	\param start The start index (into \c sPages) of the run.
3635	\param length The number of pages to allocate.
3636	\param flags Page allocation flags. Encodes the state the function shall
3637		set the allocated pages to, whether the pages shall be marked busy
3638		(VM_PAGE_ALLOC_BUSY), and whether the pages shall be cleared
3639		(VM_PAGE_ALLOC_CLEAR).
	\param freeClearQueueLocker WriteLocker for the free/clear page queues,
		in locked state. Will be unlocked by the function.
3642	\return The index of the first page that could not be allocated. \a length
3643		is returned when the function was successful.
3644*/
3645static page_num_t
3646allocate_page_run(page_num_t start, page_num_t length, uint32 flags,
3647	WriteLocker& freeClearQueueLocker)
3648{
3649	uint32 pageState = flags & VM_PAGE_ALLOC_STATE;
3650	ASSERT(pageState != PAGE_STATE_FREE);
3651	ASSERT(pageState != PAGE_STATE_CLEAR);
3652	ASSERT(start + length <= sNumPages);
3653
3654	// Pull the free/clear pages out of their respective queues. Cached pages
3655	// are allocated later.
3656	page_num_t cachedPages = 0;
3657	VMPageQueue::PageList freePages;
3658	VMPageQueue::PageList clearPages;
3659	page_num_t i = 0;
3660	for (; i < length; i++) {
3661		bool pageAllocated = true;
3662		bool noPage = false;
3663		vm_page& page = sPages[start + i];
3664		switch (page.State()) {
3665			case PAGE_STATE_CLEAR:
3666				DEBUG_PAGE_ACCESS_START(&page);
3667				sClearPageQueue.Remove(&page);
3668				clearPages.Add(&page);
3669				break;
3670			case PAGE_STATE_FREE:
3671				DEBUG_PAGE_ACCESS_START(&page);
3672				sFreePageQueue.Remove(&page);
3673				freePages.Add(&page);
3674				break;
3675			case PAGE_STATE_CACHED:
3676				// We allocate cached pages later.
3677				cachedPages++;
3678				pageAllocated = false;
3679				break;
3680
3681			default:
3682				// Probably a page was cached when our caller checked. Now it's
3683				// gone and we have to abort.
3684				noPage = true;
3685				break;
3686		}
3687
3688		if (noPage)
3689			break;
3690
3691		if (pageAllocated) {
3692			page.SetState(flags & VM_PAGE_ALLOC_STATE);
3693			page.busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3694			page.usage_count = 0;
3695			page.accessed = false;
3696			page.modified = false;
3697		}
3698	}
3699
3700	if (i < length) {
3701		// failed to allocate a page -- free all that we've got
3702		allocate_page_run_cleanup(freePages, clearPages);
3703		return i;
3704	}
3705
3706	freeClearQueueLocker.Unlock();
3707
3708	if (cachedPages > 0) {
3709		// allocate the pages that weren't free but cached
3710		page_num_t freedCachedPages = 0;
3711		page_num_t nextIndex = start;
3712		vm_page* freePage = freePages.Head();
3713		vm_page* clearPage = clearPages.Head();
3714		while (cachedPages > 0) {
3715			// skip, if we've already got the page
3716			if (freePage != NULL && size_t(freePage - sPages) == nextIndex) {
3717				freePage = freePages.GetNext(freePage);
3718				nextIndex++;
3719				continue;
3720			}
3721			if (clearPage != NULL && size_t(clearPage - sPages) == nextIndex) {
3722				clearPage = clearPages.GetNext(clearPage);
3723				nextIndex++;
3724				continue;
3725			}
3726
3727			// free the page, if it is still cached
3728			vm_page& page = sPages[nextIndex];
3729			if (!free_cached_page(&page, false)) {
3730				// TODO: if the page turns out to have been freed already,
3731				// there would be no need to fail
3732				break;
3733			}
3734
3735			page.SetState(flags & VM_PAGE_ALLOC_STATE);
3736			page.busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3737			page.usage_count = 0;
3738			page.accessed = false;
3739			page.modified = false;
3740
3741			freePages.InsertBefore(freePage, &page);
3742			freedCachedPages++;
3743			cachedPages--;
3744			nextIndex++;
3745		}
3746
3747		// If we have freed cached pages, we need to balance things.
3748		if (freedCachedPages > 0)
3749			unreserve_pages(freedCachedPages);
3750
3751		if (nextIndex - start < length) {
3752			// failed to allocate all cached pages -- free all that we've got
3753			freeClearQueueLocker.Lock();
3754			allocate_page_run_cleanup(freePages, clearPages);
3755			freeClearQueueLocker.Unlock();
3756
3757			return nextIndex - start;
3758		}
3759	}
3760
3761	// clear pages, if requested
3762	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0) {
3763		for (VMPageQueue::PageList::Iterator it = freePages.GetIterator();
3764				vm_page* page = it.Next();) {
3765 			clear_page(page);
3766		}
3767	}
3768
3769	// add pages to target queue
3770	if (pageState < PAGE_STATE_FIRST_UNQUEUED) {
3771		freePages.MoveFrom(&clearPages);
3772		sPageQueues[pageState].AppendUnlocked(freePages, length);
3773	}
3774
3775	// Note: We don't unreserve the pages since we pulled them out of the
3776	// free/clear queues without adjusting sUnreservedFreePages.
3777
3778#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3779	AbstractTraceEntryWithStackTrace* traceEntry
3780		= TA(AllocatePageRun(start, length));
3781
3782	for (page_num_t i = start; i < start + length; i++)
3783		sPages[i].allocation_tracking_info.Init(traceEntry);
3784#else
3785	TA(AllocatePageRun(start, length));
3786#endif
3787
3788	return length;
3789}
3790
3791
3792/*! Allocate a physically contiguous range of pages.
3793
3794	\param flags Page allocation flags. Encodes the state the function shall
3795		set the allocated pages to, whether the pages shall be marked busy
3796		(VM_PAGE_ALLOC_BUSY), and whether the pages shall be cleared
3797		(VM_PAGE_ALLOC_CLEAR).
3798	\param length The number of contiguous pages to allocate.
3799	\param restrictions Restrictions to the physical addresses of the page run
3800		to allocate, including \c low_address, the first acceptable physical
3801		address where the page run may start, \c high_address, the last
3802		acceptable physical address where the page run may end (i.e. it must
3803		hold \code runStartAddress + length <= high_address \endcode),
3804		\c alignment, the alignment of the page run start address, and
3805		\c boundary, multiples of which the page run must not cross.
3806		Values set to \c 0 are ignored.
3807	\param priority The page reservation priority (as passed to
3808		vm_page_reserve_pages()).
3809	\return The first page of the allocated page run on success; \c NULL
3810		when the allocation failed.
3811*/
3812vm_page*
3813vm_page_allocate_page_run(uint32 flags, page_num_t length,
3814	const physical_address_restrictions* restrictions, int priority)
3815{
3816	// compute start and end page index
3817	page_num_t requestedStart
3818		= std::max(restrictions->low_address / B_PAGE_SIZE, sPhysicalPageOffset)
3819			- sPhysicalPageOffset;
3820	page_num_t start = requestedStart;
3821	page_num_t end;
3822	if (restrictions->high_address > 0) {
3823		end = std::max(restrictions->high_address / B_PAGE_SIZE,
3824				sPhysicalPageOffset)
3825			- sPhysicalPageOffset;
3826		end = std::min(end, sNumPages);
3827	} else
3828		end = sNumPages;
3829
3830	// compute alignment mask
3831	page_num_t alignmentMask
3832		= std::max(restrictions->alignment / B_PAGE_SIZE, (phys_addr_t)1) - 1;
3833	ASSERT(((alignmentMask + 1) & alignmentMask) == 0);
3834		// alignment must be a power of 2
3835
3836	// compute the boundary mask
3837	uint32 boundaryMask = 0;
3838	if (restrictions->boundary != 0) {
3839		page_num_t boundary = restrictions->boundary / B_PAGE_SIZE;
3840		// boundary must be a power of two and not less than alignment and
3841		// length
3842		ASSERT(((boundary - 1) & boundary) == 0);
3843		ASSERT(boundary >= alignmentMask + 1);
3844		ASSERT(boundary >= length);
3845
3846		boundaryMask = -boundary;
3847	}
3848
3849	vm_page_reservation reservation;
3850	vm_page_reserve_pages(&reservation, length, priority);
3851
3852	WriteLocker freeClearQueueLocker(sFreePageQueuesLock);
3853
3854	// First we try to get a run with free pages only. If that fails, we also
3855	// consider cached pages. If there are only few free pages and many cached
3856	// ones, the odds are that we won't find enough contiguous ones, so we skip
3857	// the first iteration in this case.
3858	int32 freePages = sUnreservedFreePages;
3859	int useCached = freePages > 0 && (page_num_t)freePages > 2 * length ? 0 : 1;
3860
3861	for (;;) {
3862		if (alignmentMask != 0 || boundaryMask != 0) {
3863			page_num_t offsetStart = start + sPhysicalPageOffset;
3864
3865			// enforce alignment
3866			if ((offsetStart & alignmentMask) != 0)
3867				offsetStart = (offsetStart + alignmentMask) & ~alignmentMask;
3868
3869			// enforce boundary
3870			if (boundaryMask != 0 && ((offsetStart ^ (offsetStart
3871				+ length - 1)) & boundaryMask) != 0) {
3872				offsetStart = (offsetStart + length - 1) & boundaryMask;
3873			}
3874
3875			start = offsetStart - sPhysicalPageOffset;
3876		}
3877
3878		if (start + length > end) {
3879			if (useCached == 0) {
3880				// The first iteration with free pages only was unsuccessful.
3881				// Try again also considering cached pages.
3882				useCached = 1;
3883				start = requestedStart;
3884				continue;
3885			}
3886
3887			dprintf("vm_page_allocate_page_run(): Failed to allocate run of "
3888				"length %" B_PRIuPHYSADDR " (%" B_PRIuPHYSADDR " %"
3889				B_PRIuPHYSADDR ") in second iteration (align: %" B_PRIuPHYSADDR
3890				" boundary: %" B_PRIuPHYSADDR ") !", length, requestedStart,
3891				end, restrictions->alignment, restrictions->boundary);
3892
3893			freeClearQueueLocker.Unlock();
3894			vm_page_unreserve_pages(&reservation);
3895			return NULL;
3896		}
3897
3898		bool foundRun = true;
3899		page_num_t i;
3900		for (i = 0; i < length; i++) {
3901			uint32 pageState = sPages[start + i].State();
3902			if (pageState != PAGE_STATE_FREE
3903				&& pageState != PAGE_STATE_CLEAR
3904				&& (pageState != PAGE_STATE_CACHED || useCached == 0)) {
3905				foundRun = false;
3906				break;
3907			}
3908		}
3909
3910		if (foundRun) {
3911			i = allocate_page_run(start, length, flags, freeClearQueueLocker);
3912			if (i == length)
3913				return &sPages[start];
3914
3915			// apparently a cached page couldn't be allocated -- skip it and
3916			// continue
3917			freeClearQueueLocker.Lock();
3918		}
3919
3920		start += i + 1;
3921	}
3922}
3923
3924
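/*!	Returns the page at the given index of the physical page array.
	The index is not range checked and must be smaller than the total number
	of pages.
*/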
3925vm_page *
3926vm_page_at_index(int32 index)
3927{
3928	return &sPages[index];
3929}
3930
3931
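/*!	Returns the vm_page for the given physical page number, or \c NULL if the
	number lies outside the range covered by the physical page array.
*/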
3932vm_page *
3933vm_lookup_page(page_num_t pageNumber)
3934{
3935	if (pageNumber < sPhysicalPageOffset)
3936		return NULL;
3937
3938	pageNumber -= sPhysicalPageOffset;
3939	if (pageNumber >= sNumPages)
3940		return NULL;
3941
3942	return &sPages[pageNumber];
3943}
3944
3945
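/*!	Returns whether the given page does not refer to an entry of the physical
	page array, i.e. is a dummy page not managed by the page allocator.
*/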
3946bool
3947vm_page_is_dummy(struct vm_page *page)
3948{
3949	return page < sPages || page >= sPages + sNumPages;
3950}
3951
3952
3953/*!	Free the page that belonged to a certain cache.
3954	You can use vm_page_set_state() manually if you prefer, but only
3955	if the page does not equal PAGE_STATE_MODIFIED.
3956*/
3957void
3958vm_page_free(VMCache *cache, vm_page *page)
3959{
3960	PAGE_ASSERT(page, page->State() != PAGE_STATE_FREE
3961		&& page->State() != PAGE_STATE_CLEAR);
3962
3963	if (page->State() == PAGE_STATE_MODIFIED && cache->temporary)
3964		atomic_add(&sModifiedTemporaryPages, -1);
3965
3966	free_page(page, false);
3967}
3968
3969
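/*!	Sets the page's new state, freeing the page if the new state is
	PAGE_STATE_FREE or PAGE_STATE_CLEAR.
	The page must not currently be free or clear.
*/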
3970void
3971vm_page_set_state(vm_page *page, int pageState)
3972{
3973	PAGE_ASSERT(page, page->State() != PAGE_STATE_FREE
3974		&& page->State() != PAGE_STATE_CLEAR);
3975
3976	if (pageState == PAGE_STATE_FREE || pageState == PAGE_STATE_CLEAR)
3977		free_page(page, pageState == PAGE_STATE_CLEAR);
3978	else
3979		set_page_state(page, pageState);
3980}
3981
3982
3983/*!	Moves a page to either the tail of the head of its current queue,
3984	depending on \a tail.
3985	The page must have a cache and the cache must be locked!
3986*/
3987void
3988vm_page_requeue(struct vm_page *page, bool tail)
3989{
3990	PAGE_ASSERT(page, page->Cache() != NULL);
3991	page->Cache()->AssertLocked();
3992	// DEBUG_PAGE_ACCESS_CHECK(page);
3993		// TODO: This assertion cannot be satisfied by idle_scan_active_pages()
3994		// when it requeues busy pages. The reason is that vm_soft_fault()
3995		// (respectively fault_get_page()) and the file cache keep newly
3996		// allocated pages accessed while they are reading them from disk. It
3997		// would probably be better to change that code and reenable this
3998		// check.
3999
4000	VMPageQueue *queue = NULL;
4001
4002	switch (page->State()) {
4003		case PAGE_STATE_ACTIVE:
4004			queue = &sActivePageQueue;
4005			break;
4006		case PAGE_STATE_INACTIVE:
4007			queue = &sInactivePageQueue;
4008			break;
4009		case PAGE_STATE_MODIFIED:
4010			queue = &sModifiedPageQueue;
4011			break;
4012		case PAGE_STATE_CACHED:
4013			queue = &sCachedPageQueue;
4014			break;
4015		case PAGE_STATE_FREE:
4016		case PAGE_STATE_CLEAR:
4017			panic("vm_page_requeue() called for free/clear page %p", page);
4018			return;
4019		case PAGE_STATE_WIRED:
4020		case PAGE_STATE_UNUSED:
4021			return;
4022		default:
4023			panic("vm_page_touch: vm_page %p in invalid state %d\n",
4024				page, page->State());
4025			break;
4026	}
4027
4028	queue->RequeueUnlocked(page, tail);
4029}
4030
4031
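/*!	Returns the total number of pages covered by the physical page array,
	not counting the entries marked as non-existing.
*/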
4032page_num_t
4033vm_page_num_pages(void)
4034{
4035	return sNumPages - sNonExistingPages;
4036}
4037
4038
4039/*! There is a subtle distinction between the page counts returned by
4040	this function and vm_page_num_free_pages():
4041	The latter returns the number of pages that are completely uncommitted,
	whereas this one returns the number of pages that are available for use,
	including those that can be reclaimed (IOW it factors in things like
	cached pages as available).
4045*/
4046page_num_t
4047vm_page_num_available_pages(void)
4048{
4049	return vm_available_memory() / B_PAGE_SIZE;
4050}
4051
4052
4053page_num_t
4054vm_page_num_free_pages(void)
4055{
4056	int32 count = sUnreservedFreePages + sCachedPageQueue.Count();
4057	return count > 0 ? count : 0;
4058}
4059
4060
4061page_num_t
4062vm_page_num_unused_pages(void)
4063{
4064	int32 count = sUnreservedFreePages;
4065	return count > 0 ? count : 0;
4066}
4067
4068
4069void
4070vm_page_get_stats(system_info *info)
4071{
4072	// Note: there's no locking protecting any of the queues or counters here,
4073	// so we run the risk of getting bogus values when evaluating them
4074	// throughout this function. As these stats are for informational purposes
4075	// only, it is not really worth introducing such locking. Therefore we just
4076	// ensure that we don't under- or overflow any of the values.
4077
4078	// The pages used for the block cache buffers. Those should not be counted
4079	// as used but as cached pages.
4080	// TODO: We should subtract the blocks that are in use ATM, since those
4081	// can't really be freed in a low memory situation.
4082	page_num_t blockCachePages = block_cache_used_memory() / B_PAGE_SIZE;
4083
	// Non-temporary modified pages are special: they represent pages that
	// can be written back and thus freed if necessary, which basically makes
	// them cached pages with a higher freeing overhead. The modified queue
	// count is therefore split into temporary and non-temporary counts, which
	// are added to the corresponding numbers.
4089	page_num_t modifiedNonTemporaryPages
4090		= (sModifiedPageQueue.Count() - sModifiedTemporaryPages);
4091
4092	info->max_pages = vm_page_num_pages();
4093	info->cached_pages = sCachedPageQueue.Count() + modifiedNonTemporaryPages
4094		+ blockCachePages;
4095
4096	// max_pages is composed of:
4097	//	active + inactive + unused + wired + modified + cached + free + clear
4098	// So taking out the cached (including modified non-temporary), free and
4099	// clear ones leaves us with all used pages.
4100	int32 subtractPages = info->cached_pages + sFreePageQueue.Count()
4101		+ sClearPageQueue.Count();
4102	info->used_pages = subtractPages > info->max_pages
4103		? 0 : info->max_pages - subtractPages;
4104
4105	if (info->used_pages + info->cached_pages > info->max_pages) {
4106		// Something was shuffled around while we were summing up the counts.
4107		// Make the values sane, preferring the worse case of more used pages.
4108		info->cached_pages = info->max_pages - info->used_pages;
4109	}
4110
4111	info->page_faults = vm_num_page_faults();
4112	info->ignored_pages = sIgnoredPages;
4113
4114	// TODO: We don't consider pages used for page directories/tables yet.
4115}
4116
4117
4118/*!	Returns the greatest address within the last page of accessible physical
4119	memory.
4120	The value is inclusive, i.e. in case of a 32 bit phys_addr_t 0xffffffff
	means that the last page ends at exactly 4 GB.
4122*/
4123phys_addr_t
4124vm_page_max_address()
4125{
4126	return ((phys_addr_t)sPhysicalPageOffset + sNumPages) * B_PAGE_SIZE - 1;
4127}
4128
4129
4130RANGE_MARKER_FUNCTION_END(vm_page)
4131